diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,291104 @@ +{ + "best_global_step": 90900, + "best_metric": 0.060428909957408905, + "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_winogrande_101112_1760638073/checkpoint-90900", + "epoch": 20.0, + "eval_steps": 9090, + "global_step": 181800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00055005500550055, + "grad_norm": 10.741941452026367, + "learning_rate": 1.1001100110011e-08, + "loss": 7.4226, + "num_input_tokens_seen": 1056, + "step": 5 + }, + { + "epoch": 0.0011001100110011, + "grad_norm": 12.295313835144043, + "learning_rate": 2.4752475247524754e-08, + "loss": 7.157, + "num_input_tokens_seen": 2208, + "step": 10 + }, + { + "epoch": 0.0016501650165016502, + "grad_norm": 12.259510040283203, + "learning_rate": 3.850385038503851e-08, + "loss": 7.1223, + "num_input_tokens_seen": 3264, + "step": 15 + }, + { + "epoch": 0.0022002200220022, + "grad_norm": 12.49266529083252, + "learning_rate": 5.225522552255226e-08, + "loss": 7.3395, + "num_input_tokens_seen": 4288, + "step": 20 + }, + { + "epoch": 0.0027502750275027505, + "grad_norm": 11.910026550292969, + "learning_rate": 6.600660066006601e-08, + "loss": 7.4389, + "num_input_tokens_seen": 5408, + "step": 25 + }, + { + "epoch": 0.0033003300330033004, + "grad_norm": 11.222013473510742, + "learning_rate": 7.975797579757976e-08, + "loss": 7.5022, + "num_input_tokens_seen": 6464, + "step": 30 + }, + { + "epoch": 0.0038503850385038503, + "grad_norm": 10.56160831451416, + "learning_rate": 9.350935093509352e-08, + "loss": 7.2997, + "num_input_tokens_seen": 7488, + "step": 35 + }, + { + "epoch": 0.0044004400440044, + "grad_norm": 10.864595413208008, + "learning_rate": 1.0726072607260726e-07, + "loss": 7.299, + "num_input_tokens_seen": 8576, + "step": 40 + }, + { + "epoch": 0.0049504950495049506, + "grad_norm": 11.4581298828125, + "learning_rate": 1.2101210121012102e-07, + "loss": 7.2658, + "num_input_tokens_seen": 9664, + "step": 45 + }, + { + "epoch": 0.005500550055005501, + "grad_norm": 11.87352180480957, + "learning_rate": 1.3476347634763477e-07, + "loss": 7.5561, + "num_input_tokens_seen": 10752, + "step": 50 + }, + { + "epoch": 0.00605060506050605, + "grad_norm": 10.361844062805176, + "learning_rate": 1.4851485148514852e-07, + "loss": 7.1142, + "num_input_tokens_seen": 11744, + "step": 55 + }, + { + "epoch": 0.006600660066006601, + "grad_norm": 10.947933197021484, + "learning_rate": 1.622662266226623e-07, + "loss": 7.455, + "num_input_tokens_seen": 12800, + "step": 60 + }, + { + "epoch": 0.007150715071507151, + "grad_norm": 11.392529487609863, + "learning_rate": 1.76017601760176e-07, + "loss": 7.3046, + "num_input_tokens_seen": 13888, + "step": 65 + }, + { + "epoch": 0.007700770077007701, + "grad_norm": 11.557997703552246, + "learning_rate": 1.8976897689768978e-07, + "loss": 7.4093, + "num_input_tokens_seen": 14944, + "step": 70 + }, + { + "epoch": 0.00825082508250825, + "grad_norm": 11.454574584960938, + "learning_rate": 2.0352035203520353e-07, + "loss": 7.4127, + "num_input_tokens_seen": 16032, + "step": 75 + }, + { + "epoch": 0.0088008800880088, + "grad_norm": 11.222530364990234, + "learning_rate": 2.1727172717271727e-07, + "loss": 7.3939, + "num_input_tokens_seen": 17088, + "step": 80 + }, + { + "epoch": 0.00935093509350935, + "grad_norm": 10.300202369689941, + "learning_rate": 2.3102310231023105e-07, + "loss": 7.3738, + "num_input_tokens_seen": 18112, + "step": 85 + }, + { + "epoch": 0.009900990099009901, + "grad_norm": 11.372662544250488, + "learning_rate": 2.447744774477448e-07, + "loss": 7.6104, + "num_input_tokens_seen": 19168, + "step": 90 + }, + { + "epoch": 0.010451045104510451, + "grad_norm": 12.149177551269531, + "learning_rate": 2.5852585258525854e-07, + "loss": 7.2874, + "num_input_tokens_seen": 20192, + "step": 95 + }, + { + "epoch": 0.011001100110011002, + "grad_norm": 10.775299072265625, + "learning_rate": 2.7227722772277226e-07, + "loss": 7.3848, + "num_input_tokens_seen": 21216, + "step": 100 + }, + { + "epoch": 0.01155115511551155, + "grad_norm": 11.925307273864746, + "learning_rate": 2.8602860286028603e-07, + "loss": 7.3297, + "num_input_tokens_seen": 22272, + "step": 105 + }, + { + "epoch": 0.0121012101210121, + "grad_norm": 12.769255638122559, + "learning_rate": 2.997799779977998e-07, + "loss": 7.523, + "num_input_tokens_seen": 23264, + "step": 110 + }, + { + "epoch": 0.012651265126512651, + "grad_norm": 11.654729843139648, + "learning_rate": 3.1353135313531353e-07, + "loss": 7.4726, + "num_input_tokens_seen": 24288, + "step": 115 + }, + { + "epoch": 0.013201320132013201, + "grad_norm": 10.634231567382812, + "learning_rate": 3.272827282728273e-07, + "loss": 7.5043, + "num_input_tokens_seen": 25408, + "step": 120 + }, + { + "epoch": 0.013751375137513752, + "grad_norm": 11.823368072509766, + "learning_rate": 3.4103410341034107e-07, + "loss": 7.3024, + "num_input_tokens_seen": 26464, + "step": 125 + }, + { + "epoch": 0.014301430143014302, + "grad_norm": 11.656071662902832, + "learning_rate": 3.547854785478548e-07, + "loss": 6.9611, + "num_input_tokens_seen": 27552, + "step": 130 + }, + { + "epoch": 0.01485148514851485, + "grad_norm": 10.185606002807617, + "learning_rate": 3.6853685368536857e-07, + "loss": 7.2915, + "num_input_tokens_seen": 28608, + "step": 135 + }, + { + "epoch": 0.015401540154015401, + "grad_norm": 12.010210037231445, + "learning_rate": 3.822882288228823e-07, + "loss": 7.5443, + "num_input_tokens_seen": 29664, + "step": 140 + }, + { + "epoch": 0.01595159515951595, + "grad_norm": 10.740604400634766, + "learning_rate": 3.9603960396039606e-07, + "loss": 7.2221, + "num_input_tokens_seen": 30720, + "step": 145 + }, + { + "epoch": 0.0165016501650165, + "grad_norm": 11.278823852539062, + "learning_rate": 4.097909790979098e-07, + "loss": 7.426, + "num_input_tokens_seen": 31712, + "step": 150 + }, + { + "epoch": 0.017051705170517052, + "grad_norm": 10.991938591003418, + "learning_rate": 4.235423542354236e-07, + "loss": 7.3271, + "num_input_tokens_seen": 32704, + "step": 155 + }, + { + "epoch": 0.0176017601760176, + "grad_norm": 11.357768058776855, + "learning_rate": 4.3729372937293727e-07, + "loss": 7.163, + "num_input_tokens_seen": 33696, + "step": 160 + }, + { + "epoch": 0.018151815181518153, + "grad_norm": 10.993764877319336, + "learning_rate": 4.510451045104511e-07, + "loss": 7.5884, + "num_input_tokens_seen": 34752, + "step": 165 + }, + { + "epoch": 0.0187018701870187, + "grad_norm": 10.236475944519043, + "learning_rate": 4.647964796479648e-07, + "loss": 7.3003, + "num_input_tokens_seen": 35872, + "step": 170 + }, + { + "epoch": 0.019251925192519254, + "grad_norm": 11.853199005126953, + "learning_rate": 4.785478547854785e-07, + "loss": 7.6545, + "num_input_tokens_seen": 36928, + "step": 175 + }, + { + "epoch": 0.019801980198019802, + "grad_norm": 11.525308609008789, + "learning_rate": 4.922992299229923e-07, + "loss": 7.3781, + "num_input_tokens_seen": 37952, + "step": 180 + }, + { + "epoch": 0.02035203520352035, + "grad_norm": 10.885004043579102, + "learning_rate": 5.060506050605061e-07, + "loss": 7.1911, + "num_input_tokens_seen": 38944, + "step": 185 + }, + { + "epoch": 0.020902090209020903, + "grad_norm": 11.33720588684082, + "learning_rate": 5.198019801980199e-07, + "loss": 7.3379, + "num_input_tokens_seen": 40000, + "step": 190 + }, + { + "epoch": 0.02145214521452145, + "grad_norm": 10.853523254394531, + "learning_rate": 5.335533553355335e-07, + "loss": 7.194, + "num_input_tokens_seen": 41056, + "step": 195 + }, + { + "epoch": 0.022002200220022004, + "grad_norm": 12.064167976379395, + "learning_rate": 5.473047304730474e-07, + "loss": 6.9885, + "num_input_tokens_seen": 42080, + "step": 200 + }, + { + "epoch": 0.022552255225522552, + "grad_norm": 12.169781684875488, + "learning_rate": 5.610561056105611e-07, + "loss": 7.5386, + "num_input_tokens_seen": 43136, + "step": 205 + }, + { + "epoch": 0.0231023102310231, + "grad_norm": 11.237638473510742, + "learning_rate": 5.748074807480748e-07, + "loss": 7.0949, + "num_input_tokens_seen": 44224, + "step": 210 + }, + { + "epoch": 0.023652365236523653, + "grad_norm": 11.141739845275879, + "learning_rate": 5.885588558855886e-07, + "loss": 7.4771, + "num_input_tokens_seen": 45312, + "step": 215 + }, + { + "epoch": 0.0242024202420242, + "grad_norm": 11.414908409118652, + "learning_rate": 6.023102310231023e-07, + "loss": 7.2583, + "num_input_tokens_seen": 46368, + "step": 220 + }, + { + "epoch": 0.024752475247524754, + "grad_norm": 11.60216236114502, + "learning_rate": 6.160616061606162e-07, + "loss": 7.2495, + "num_input_tokens_seen": 47488, + "step": 225 + }, + { + "epoch": 0.025302530253025302, + "grad_norm": 11.20028305053711, + "learning_rate": 6.298129812981298e-07, + "loss": 7.4122, + "num_input_tokens_seen": 48512, + "step": 230 + }, + { + "epoch": 0.02585258525852585, + "grad_norm": 11.412261962890625, + "learning_rate": 6.435643564356436e-07, + "loss": 7.2682, + "num_input_tokens_seen": 49568, + "step": 235 + }, + { + "epoch": 0.026402640264026403, + "grad_norm": 10.795794486999512, + "learning_rate": 6.573157315731574e-07, + "loss": 7.282, + "num_input_tokens_seen": 50656, + "step": 240 + }, + { + "epoch": 0.02695269526952695, + "grad_norm": 11.476829528808594, + "learning_rate": 6.71067106710671e-07, + "loss": 7.4635, + "num_input_tokens_seen": 51712, + "step": 245 + }, + { + "epoch": 0.027502750275027504, + "grad_norm": 11.400288581848145, + "learning_rate": 6.848184818481849e-07, + "loss": 7.7111, + "num_input_tokens_seen": 52736, + "step": 250 + }, + { + "epoch": 0.028052805280528052, + "grad_norm": 10.808517456054688, + "learning_rate": 6.985698569856986e-07, + "loss": 7.1912, + "num_input_tokens_seen": 53824, + "step": 255 + }, + { + "epoch": 0.028602860286028604, + "grad_norm": 12.136430740356445, + "learning_rate": 7.123212321232124e-07, + "loss": 7.4596, + "num_input_tokens_seen": 54880, + "step": 260 + }, + { + "epoch": 0.029152915291529153, + "grad_norm": 12.087676048278809, + "learning_rate": 7.260726072607261e-07, + "loss": 7.5449, + "num_input_tokens_seen": 56000, + "step": 265 + }, + { + "epoch": 0.0297029702970297, + "grad_norm": 10.88523006439209, + "learning_rate": 7.398239823982398e-07, + "loss": 7.2367, + "num_input_tokens_seen": 56992, + "step": 270 + }, + { + "epoch": 0.030253025302530254, + "grad_norm": 11.200333595275879, + "learning_rate": 7.535753575357537e-07, + "loss": 7.5558, + "num_input_tokens_seen": 58080, + "step": 275 + }, + { + "epoch": 0.030803080308030802, + "grad_norm": 11.815566062927246, + "learning_rate": 7.673267326732673e-07, + "loss": 7.1495, + "num_input_tokens_seen": 59168, + "step": 280 + }, + { + "epoch": 0.03135313531353135, + "grad_norm": 11.195645332336426, + "learning_rate": 7.810781078107811e-07, + "loss": 7.343, + "num_input_tokens_seen": 60288, + "step": 285 + }, + { + "epoch": 0.0319031903190319, + "grad_norm": 13.227083206176758, + "learning_rate": 7.948294829482948e-07, + "loss": 7.4844, + "num_input_tokens_seen": 61280, + "step": 290 + }, + { + "epoch": 0.032453245324532455, + "grad_norm": 11.555545806884766, + "learning_rate": 8.085808580858086e-07, + "loss": 7.3166, + "num_input_tokens_seen": 62272, + "step": 295 + }, + { + "epoch": 0.033003300330033, + "grad_norm": 11.572136878967285, + "learning_rate": 8.223322332233224e-07, + "loss": 7.2489, + "num_input_tokens_seen": 63328, + "step": 300 + }, + { + "epoch": 0.03355335533553355, + "grad_norm": 10.627071380615234, + "learning_rate": 8.360836083608362e-07, + "loss": 7.5482, + "num_input_tokens_seen": 64352, + "step": 305 + }, + { + "epoch": 0.034103410341034104, + "grad_norm": 11.227778434753418, + "learning_rate": 8.498349834983498e-07, + "loss": 7.3467, + "num_input_tokens_seen": 65440, + "step": 310 + }, + { + "epoch": 0.034653465346534656, + "grad_norm": 11.409594535827637, + "learning_rate": 8.635863586358637e-07, + "loss": 7.214, + "num_input_tokens_seen": 66528, + "step": 315 + }, + { + "epoch": 0.0352035203520352, + "grad_norm": 11.50815486907959, + "learning_rate": 8.773377337733774e-07, + "loss": 7.5557, + "num_input_tokens_seen": 67552, + "step": 320 + }, + { + "epoch": 0.035753575357535754, + "grad_norm": 12.635124206542969, + "learning_rate": 8.910891089108911e-07, + "loss": 7.1758, + "num_input_tokens_seen": 68576, + "step": 325 + }, + { + "epoch": 0.036303630363036306, + "grad_norm": 11.633588790893555, + "learning_rate": 9.048404840484049e-07, + "loss": 7.3479, + "num_input_tokens_seen": 69600, + "step": 330 + }, + { + "epoch": 0.03685368536853685, + "grad_norm": 11.707019805908203, + "learning_rate": 9.185918591859186e-07, + "loss": 7.3781, + "num_input_tokens_seen": 70624, + "step": 335 + }, + { + "epoch": 0.0374037403740374, + "grad_norm": 11.153525352478027, + "learning_rate": 9.323432343234324e-07, + "loss": 6.924, + "num_input_tokens_seen": 71680, + "step": 340 + }, + { + "epoch": 0.037953795379537955, + "grad_norm": 11.654695510864258, + "learning_rate": 9.460946094609461e-07, + "loss": 7.2749, + "num_input_tokens_seen": 72672, + "step": 345 + }, + { + "epoch": 0.03850385038503851, + "grad_norm": 11.28149700164795, + "learning_rate": 9.5984598459846e-07, + "loss": 7.1013, + "num_input_tokens_seen": 73696, + "step": 350 + }, + { + "epoch": 0.03905390539053905, + "grad_norm": 12.163405418395996, + "learning_rate": 9.735973597359737e-07, + "loss": 7.6288, + "num_input_tokens_seen": 74752, + "step": 355 + }, + { + "epoch": 0.039603960396039604, + "grad_norm": 11.214391708374023, + "learning_rate": 9.873487348734873e-07, + "loss": 7.2786, + "num_input_tokens_seen": 75776, + "step": 360 + }, + { + "epoch": 0.040154015401540157, + "grad_norm": 11.947404861450195, + "learning_rate": 1.001100110011001e-06, + "loss": 7.156, + "num_input_tokens_seen": 76864, + "step": 365 + }, + { + "epoch": 0.0407040704070407, + "grad_norm": 11.016962051391602, + "learning_rate": 1.0148514851485148e-06, + "loss": 7.1695, + "num_input_tokens_seen": 77952, + "step": 370 + }, + { + "epoch": 0.041254125412541254, + "grad_norm": 10.449760437011719, + "learning_rate": 1.0286028602860288e-06, + "loss": 7.151, + "num_input_tokens_seen": 79040, + "step": 375 + }, + { + "epoch": 0.041804180418041806, + "grad_norm": 10.884154319763184, + "learning_rate": 1.0423542354235424e-06, + "loss": 7.5046, + "num_input_tokens_seen": 80096, + "step": 380 + }, + { + "epoch": 0.04235423542354235, + "grad_norm": 12.355956077575684, + "learning_rate": 1.0561056105610562e-06, + "loss": 7.1479, + "num_input_tokens_seen": 81184, + "step": 385 + }, + { + "epoch": 0.0429042904290429, + "grad_norm": 10.552356719970703, + "learning_rate": 1.06985698569857e-06, + "loss": 7.3582, + "num_input_tokens_seen": 82240, + "step": 390 + }, + { + "epoch": 0.043454345434543455, + "grad_norm": 11.367505073547363, + "learning_rate": 1.0836083608360837e-06, + "loss": 7.447, + "num_input_tokens_seen": 83328, + "step": 395 + }, + { + "epoch": 0.04400440044004401, + "grad_norm": 11.656962394714355, + "learning_rate": 1.0973597359735975e-06, + "loss": 7.3037, + "num_input_tokens_seen": 84352, + "step": 400 + }, + { + "epoch": 0.04455445544554455, + "grad_norm": 11.577590942382812, + "learning_rate": 1.1111111111111112e-06, + "loss": 7.6311, + "num_input_tokens_seen": 85408, + "step": 405 + }, + { + "epoch": 0.045104510451045104, + "grad_norm": 10.74866008758545, + "learning_rate": 1.1248624862486248e-06, + "loss": 7.3793, + "num_input_tokens_seen": 86432, + "step": 410 + }, + { + "epoch": 0.04565456545654566, + "grad_norm": 11.503448486328125, + "learning_rate": 1.1386138613861386e-06, + "loss": 7.4169, + "num_input_tokens_seen": 87488, + "step": 415 + }, + { + "epoch": 0.0462046204620462, + "grad_norm": 11.128928184509277, + "learning_rate": 1.1523652365236524e-06, + "loss": 7.4043, + "num_input_tokens_seen": 88480, + "step": 420 + }, + { + "epoch": 0.046754675467546754, + "grad_norm": 11.671806335449219, + "learning_rate": 1.1661166116611663e-06, + "loss": 7.3328, + "num_input_tokens_seen": 89504, + "step": 425 + }, + { + "epoch": 0.047304730473047306, + "grad_norm": 12.683539390563965, + "learning_rate": 1.17986798679868e-06, + "loss": 7.3182, + "num_input_tokens_seen": 90560, + "step": 430 + }, + { + "epoch": 0.04785478547854786, + "grad_norm": 11.621917724609375, + "learning_rate": 1.1936193619361937e-06, + "loss": 7.4648, + "num_input_tokens_seen": 91584, + "step": 435 + }, + { + "epoch": 0.0484048404840484, + "grad_norm": 11.877111434936523, + "learning_rate": 1.2073707370737074e-06, + "loss": 7.4546, + "num_input_tokens_seen": 92640, + "step": 440 + }, + { + "epoch": 0.048954895489548955, + "grad_norm": 11.824024200439453, + "learning_rate": 1.221122112211221e-06, + "loss": 7.5078, + "num_input_tokens_seen": 93664, + "step": 445 + }, + { + "epoch": 0.04950495049504951, + "grad_norm": 11.120875358581543, + "learning_rate": 1.234873487348735e-06, + "loss": 7.5724, + "num_input_tokens_seen": 94752, + "step": 450 + }, + { + "epoch": 0.05005500550055005, + "grad_norm": 11.456937789916992, + "learning_rate": 1.2486248624862488e-06, + "loss": 7.4563, + "num_input_tokens_seen": 95808, + "step": 455 + }, + { + "epoch": 0.050605060506050605, + "grad_norm": 11.365700721740723, + "learning_rate": 1.2623762376237625e-06, + "loss": 7.3836, + "num_input_tokens_seen": 96800, + "step": 460 + }, + { + "epoch": 0.05115511551155116, + "grad_norm": 10.336440086364746, + "learning_rate": 1.276127612761276e-06, + "loss": 7.2817, + "num_input_tokens_seen": 97888, + "step": 465 + }, + { + "epoch": 0.0517051705170517, + "grad_norm": 11.254975318908691, + "learning_rate": 1.2898789878987899e-06, + "loss": 7.2693, + "num_input_tokens_seen": 99008, + "step": 470 + }, + { + "epoch": 0.052255225522552254, + "grad_norm": 11.327144622802734, + "learning_rate": 1.3036303630363036e-06, + "loss": 7.3842, + "num_input_tokens_seen": 100096, + "step": 475 + }, + { + "epoch": 0.052805280528052806, + "grad_norm": 10.288135528564453, + "learning_rate": 1.3173817381738174e-06, + "loss": 7.094, + "num_input_tokens_seen": 101152, + "step": 480 + }, + { + "epoch": 0.05335533553355336, + "grad_norm": 11.842524528503418, + "learning_rate": 1.3311331133113312e-06, + "loss": 7.3928, + "num_input_tokens_seen": 102208, + "step": 485 + }, + { + "epoch": 0.0539053905390539, + "grad_norm": 12.20440673828125, + "learning_rate": 1.344884488448845e-06, + "loss": 7.4333, + "num_input_tokens_seen": 103232, + "step": 490 + }, + { + "epoch": 0.054455445544554455, + "grad_norm": 12.185340881347656, + "learning_rate": 1.3586358635863585e-06, + "loss": 7.3685, + "num_input_tokens_seen": 104320, + "step": 495 + }, + { + "epoch": 0.05500550055005501, + "grad_norm": 11.026677131652832, + "learning_rate": 1.3723872387238725e-06, + "loss": 7.2974, + "num_input_tokens_seen": 105376, + "step": 500 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 11.185867309570312, + "learning_rate": 1.3861386138613863e-06, + "loss": 7.3134, + "num_input_tokens_seen": 106432, + "step": 505 + }, + { + "epoch": 0.056105610561056105, + "grad_norm": 12.593390464782715, + "learning_rate": 1.3998899889989e-06, + "loss": 7.302, + "num_input_tokens_seen": 107392, + "step": 510 + }, + { + "epoch": 0.05665566556655666, + "grad_norm": 11.427498817443848, + "learning_rate": 1.4136413641364136e-06, + "loss": 7.445, + "num_input_tokens_seen": 108416, + "step": 515 + }, + { + "epoch": 0.05720572057205721, + "grad_norm": 11.559636116027832, + "learning_rate": 1.4273927392739274e-06, + "loss": 7.3436, + "num_input_tokens_seen": 109504, + "step": 520 + }, + { + "epoch": 0.057755775577557754, + "grad_norm": 11.08027172088623, + "learning_rate": 1.4411441144114412e-06, + "loss": 7.318, + "num_input_tokens_seen": 110528, + "step": 525 + }, + { + "epoch": 0.058305830583058306, + "grad_norm": 12.184982299804688, + "learning_rate": 1.454895489548955e-06, + "loss": 7.2152, + "num_input_tokens_seen": 111520, + "step": 530 + }, + { + "epoch": 0.05885588558855886, + "grad_norm": 11.27355670928955, + "learning_rate": 1.4686468646864687e-06, + "loss": 7.4502, + "num_input_tokens_seen": 112512, + "step": 535 + }, + { + "epoch": 0.0594059405940594, + "grad_norm": 11.227193832397461, + "learning_rate": 1.4823982398239825e-06, + "loss": 7.0791, + "num_input_tokens_seen": 113536, + "step": 540 + }, + { + "epoch": 0.059955995599559955, + "grad_norm": 11.05555248260498, + "learning_rate": 1.4961496149614963e-06, + "loss": 7.1853, + "num_input_tokens_seen": 114592, + "step": 545 + }, + { + "epoch": 0.06050605060506051, + "grad_norm": 10.776226997375488, + "learning_rate": 1.50990099009901e-06, + "loss": 7.7481, + "num_input_tokens_seen": 115680, + "step": 550 + }, + { + "epoch": 0.06105610561056106, + "grad_norm": 10.466924667358398, + "learning_rate": 1.5236523652365238e-06, + "loss": 7.0417, + "num_input_tokens_seen": 116672, + "step": 555 + }, + { + "epoch": 0.061606160616061605, + "grad_norm": 12.49642276763916, + "learning_rate": 1.5374037403740376e-06, + "loss": 7.6014, + "num_input_tokens_seen": 117696, + "step": 560 + }, + { + "epoch": 0.06215621562156216, + "grad_norm": 11.011415481567383, + "learning_rate": 1.5511551155115511e-06, + "loss": 7.4048, + "num_input_tokens_seen": 118720, + "step": 565 + }, + { + "epoch": 0.0627062706270627, + "grad_norm": 10.65047550201416, + "learning_rate": 1.5649064906490651e-06, + "loss": 7.2783, + "num_input_tokens_seen": 119808, + "step": 570 + }, + { + "epoch": 0.06325632563256325, + "grad_norm": 10.948373794555664, + "learning_rate": 1.5786578657865787e-06, + "loss": 7.5366, + "num_input_tokens_seen": 120864, + "step": 575 + }, + { + "epoch": 0.0638063806380638, + "grad_norm": 10.930785179138184, + "learning_rate": 1.5924092409240925e-06, + "loss": 7.451, + "num_input_tokens_seen": 121952, + "step": 580 + }, + { + "epoch": 0.06435643564356436, + "grad_norm": 11.543394088745117, + "learning_rate": 1.6061606160616062e-06, + "loss": 7.4218, + "num_input_tokens_seen": 122976, + "step": 585 + }, + { + "epoch": 0.06490649064906491, + "grad_norm": 11.450920104980469, + "learning_rate": 1.61991199119912e-06, + "loss": 7.2462, + "num_input_tokens_seen": 123968, + "step": 590 + }, + { + "epoch": 0.06545654565456546, + "grad_norm": 11.658074378967285, + "learning_rate": 1.6336633663366338e-06, + "loss": 7.4717, + "num_input_tokens_seen": 124992, + "step": 595 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 11.074122428894043, + "learning_rate": 1.6474147414741475e-06, + "loss": 7.5661, + "num_input_tokens_seen": 126048, + "step": 600 + }, + { + "epoch": 0.06655665566556655, + "grad_norm": 11.119942665100098, + "learning_rate": 1.6611661166116613e-06, + "loss": 7.1996, + "num_input_tokens_seen": 127168, + "step": 605 + }, + { + "epoch": 0.0671067106710671, + "grad_norm": 10.796300888061523, + "learning_rate": 1.6749174917491749e-06, + "loss": 7.4235, + "num_input_tokens_seen": 128224, + "step": 610 + }, + { + "epoch": 0.06765676567656766, + "grad_norm": 10.946305274963379, + "learning_rate": 1.6886688668866887e-06, + "loss": 7.3712, + "num_input_tokens_seen": 129280, + "step": 615 + }, + { + "epoch": 0.06820682068206821, + "grad_norm": 13.31139850616455, + "learning_rate": 1.7024202420242024e-06, + "loss": 7.425, + "num_input_tokens_seen": 130336, + "step": 620 + }, + { + "epoch": 0.06875687568756876, + "grad_norm": 10.686441421508789, + "learning_rate": 1.7161716171617162e-06, + "loss": 7.3354, + "num_input_tokens_seen": 131424, + "step": 625 + }, + { + "epoch": 0.06930693069306931, + "grad_norm": 10.386049270629883, + "learning_rate": 1.72992299229923e-06, + "loss": 7.3039, + "num_input_tokens_seen": 132480, + "step": 630 + }, + { + "epoch": 0.06985698569856985, + "grad_norm": 11.20623779296875, + "learning_rate": 1.743674367436744e-06, + "loss": 7.3981, + "num_input_tokens_seen": 133536, + "step": 635 + }, + { + "epoch": 0.0704070407040704, + "grad_norm": 11.641103744506836, + "learning_rate": 1.7574257425742577e-06, + "loss": 7.075, + "num_input_tokens_seen": 134592, + "step": 640 + }, + { + "epoch": 0.07095709570957096, + "grad_norm": 11.937508583068848, + "learning_rate": 1.771177117711771e-06, + "loss": 7.1862, + "num_input_tokens_seen": 135648, + "step": 645 + }, + { + "epoch": 0.07150715071507151, + "grad_norm": 11.87981128692627, + "learning_rate": 1.7849284928492849e-06, + "loss": 7.5197, + "num_input_tokens_seen": 136768, + "step": 650 + }, + { + "epoch": 0.07205720572057206, + "grad_norm": 11.198189735412598, + "learning_rate": 1.7986798679867988e-06, + "loss": 7.4261, + "num_input_tokens_seen": 137888, + "step": 655 + }, + { + "epoch": 0.07260726072607261, + "grad_norm": 11.361278533935547, + "learning_rate": 1.8124312431243126e-06, + "loss": 7.33, + "num_input_tokens_seen": 138944, + "step": 660 + }, + { + "epoch": 0.07315731573157316, + "grad_norm": 11.910014152526855, + "learning_rate": 1.8261826182618264e-06, + "loss": 7.3168, + "num_input_tokens_seen": 140000, + "step": 665 + }, + { + "epoch": 0.0737073707370737, + "grad_norm": 11.251646995544434, + "learning_rate": 1.8399339933993402e-06, + "loss": 7.4884, + "num_input_tokens_seen": 141024, + "step": 670 + }, + { + "epoch": 0.07425742574257425, + "grad_norm": 11.918956756591797, + "learning_rate": 1.8536853685368535e-06, + "loss": 7.3331, + "num_input_tokens_seen": 142016, + "step": 675 + }, + { + "epoch": 0.0748074807480748, + "grad_norm": 11.105975151062012, + "learning_rate": 1.8674367436743675e-06, + "loss": 7.4635, + "num_input_tokens_seen": 143040, + "step": 680 + }, + { + "epoch": 0.07535753575357536, + "grad_norm": 10.877959251403809, + "learning_rate": 1.8811881188118813e-06, + "loss": 7.1036, + "num_input_tokens_seen": 144096, + "step": 685 + }, + { + "epoch": 0.07590759075907591, + "grad_norm": 11.300917625427246, + "learning_rate": 1.894939493949395e-06, + "loss": 7.5572, + "num_input_tokens_seen": 145120, + "step": 690 + }, + { + "epoch": 0.07645764576457646, + "grad_norm": 10.847497940063477, + "learning_rate": 1.908690869086909e-06, + "loss": 7.3631, + "num_input_tokens_seen": 146144, + "step": 695 + }, + { + "epoch": 0.07700770077007701, + "grad_norm": 10.927061080932617, + "learning_rate": 1.9224422442244226e-06, + "loss": 7.3748, + "num_input_tokens_seen": 147264, + "step": 700 + }, + { + "epoch": 0.07755775577557755, + "grad_norm": 11.806998252868652, + "learning_rate": 1.9361936193619364e-06, + "loss": 7.4472, + "num_input_tokens_seen": 148384, + "step": 705 + }, + { + "epoch": 0.0781078107810781, + "grad_norm": 10.555063247680664, + "learning_rate": 1.9499449944994497e-06, + "loss": 7.1466, + "num_input_tokens_seen": 149376, + "step": 710 + }, + { + "epoch": 0.07865786578657866, + "grad_norm": 11.434652328491211, + "learning_rate": 1.9636963696369635e-06, + "loss": 7.1537, + "num_input_tokens_seen": 150560, + "step": 715 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 10.094926834106445, + "learning_rate": 1.9774477447744777e-06, + "loss": 7.4277, + "num_input_tokens_seen": 151648, + "step": 720 + }, + { + "epoch": 0.07975797579757976, + "grad_norm": 11.191442489624023, + "learning_rate": 1.9911991199119915e-06, + "loss": 7.2583, + "num_input_tokens_seen": 152704, + "step": 725 + }, + { + "epoch": 0.08030803080308031, + "grad_norm": 10.968435287475586, + "learning_rate": 2.0049504950495052e-06, + "loss": 7.2974, + "num_input_tokens_seen": 153760, + "step": 730 + }, + { + "epoch": 0.08085808580858085, + "grad_norm": 11.469771385192871, + "learning_rate": 2.018701870187019e-06, + "loss": 7.5154, + "num_input_tokens_seen": 154752, + "step": 735 + }, + { + "epoch": 0.0814081408140814, + "grad_norm": 12.342435836791992, + "learning_rate": 2.0324532453245328e-06, + "loss": 7.3393, + "num_input_tokens_seen": 155840, + "step": 740 + }, + { + "epoch": 0.08195819581958196, + "grad_norm": 12.586079597473145, + "learning_rate": 2.046204620462046e-06, + "loss": 7.1618, + "num_input_tokens_seen": 156896, + "step": 745 + }, + { + "epoch": 0.08250825082508251, + "grad_norm": 10.191243171691895, + "learning_rate": 2.05995599559956e-06, + "loss": 7.1599, + "num_input_tokens_seen": 157984, + "step": 750 + }, + { + "epoch": 0.08305830583058306, + "grad_norm": 11.787452697753906, + "learning_rate": 2.0737073707370737e-06, + "loss": 7.2782, + "num_input_tokens_seen": 159072, + "step": 755 + }, + { + "epoch": 0.08360836083608361, + "grad_norm": 10.401532173156738, + "learning_rate": 2.0874587458745874e-06, + "loss": 7.1634, + "num_input_tokens_seen": 160160, + "step": 760 + }, + { + "epoch": 0.08415841584158416, + "grad_norm": 10.867810249328613, + "learning_rate": 2.101210121012101e-06, + "loss": 7.2706, + "num_input_tokens_seen": 161184, + "step": 765 + }, + { + "epoch": 0.0847084708470847, + "grad_norm": 13.580577850341797, + "learning_rate": 2.114961496149615e-06, + "loss": 7.3434, + "num_input_tokens_seen": 162208, + "step": 770 + }, + { + "epoch": 0.08525852585258525, + "grad_norm": 10.685111045837402, + "learning_rate": 2.128712871287129e-06, + "loss": 7.4966, + "num_input_tokens_seen": 163296, + "step": 775 + }, + { + "epoch": 0.0858085808580858, + "grad_norm": 10.696796417236328, + "learning_rate": 2.1424642464246425e-06, + "loss": 7.2157, + "num_input_tokens_seen": 164416, + "step": 780 + }, + { + "epoch": 0.08635863586358636, + "grad_norm": 12.616762161254883, + "learning_rate": 2.1562156215621563e-06, + "loss": 7.4092, + "num_input_tokens_seen": 165440, + "step": 785 + }, + { + "epoch": 0.08690869086908691, + "grad_norm": 11.648545265197754, + "learning_rate": 2.16996699669967e-06, + "loss": 7.6004, + "num_input_tokens_seen": 166528, + "step": 790 + }, + { + "epoch": 0.08745874587458746, + "grad_norm": 11.213480949401855, + "learning_rate": 2.183718371837184e-06, + "loss": 7.3321, + "num_input_tokens_seen": 167616, + "step": 795 + }, + { + "epoch": 0.08800880088008801, + "grad_norm": 10.488607406616211, + "learning_rate": 2.1974697469746976e-06, + "loss": 7.1959, + "num_input_tokens_seen": 168640, + "step": 800 + }, + { + "epoch": 0.08855885588558855, + "grad_norm": 11.942822456359863, + "learning_rate": 2.2112211221122114e-06, + "loss": 7.3233, + "num_input_tokens_seen": 169696, + "step": 805 + }, + { + "epoch": 0.0891089108910891, + "grad_norm": 10.280301094055176, + "learning_rate": 2.224972497249725e-06, + "loss": 7.0411, + "num_input_tokens_seen": 170720, + "step": 810 + }, + { + "epoch": 0.08965896589658966, + "grad_norm": 11.20819091796875, + "learning_rate": 2.2387238723872385e-06, + "loss": 7.2513, + "num_input_tokens_seen": 171680, + "step": 815 + }, + { + "epoch": 0.09020902090209021, + "grad_norm": 11.538114547729492, + "learning_rate": 2.2524752475247523e-06, + "loss": 7.315, + "num_input_tokens_seen": 172800, + "step": 820 + }, + { + "epoch": 0.09075907590759076, + "grad_norm": 11.021841049194336, + "learning_rate": 2.2662266226622665e-06, + "loss": 7.4849, + "num_input_tokens_seen": 173856, + "step": 825 + }, + { + "epoch": 0.09130913091309131, + "grad_norm": 12.483748435974121, + "learning_rate": 2.2799779977997803e-06, + "loss": 7.5908, + "num_input_tokens_seen": 174848, + "step": 830 + }, + { + "epoch": 0.09185918591859187, + "grad_norm": 10.731793403625488, + "learning_rate": 2.293729372937294e-06, + "loss": 7.2742, + "num_input_tokens_seen": 175936, + "step": 835 + }, + { + "epoch": 0.0924092409240924, + "grad_norm": 11.334877014160156, + "learning_rate": 2.307480748074808e-06, + "loss": 7.1664, + "num_input_tokens_seen": 176992, + "step": 840 + }, + { + "epoch": 0.09295929592959296, + "grad_norm": 11.005636215209961, + "learning_rate": 2.3212321232123216e-06, + "loss": 7.2468, + "num_input_tokens_seen": 178048, + "step": 845 + }, + { + "epoch": 0.09350935093509351, + "grad_norm": 11.330809593200684, + "learning_rate": 2.334983498349835e-06, + "loss": 7.3711, + "num_input_tokens_seen": 179072, + "step": 850 + }, + { + "epoch": 0.09405940594059406, + "grad_norm": 11.528253555297852, + "learning_rate": 2.3487348734873487e-06, + "loss": 7.4862, + "num_input_tokens_seen": 180096, + "step": 855 + }, + { + "epoch": 0.09460946094609461, + "grad_norm": 11.469449996948242, + "learning_rate": 2.3624862486248625e-06, + "loss": 7.0273, + "num_input_tokens_seen": 181152, + "step": 860 + }, + { + "epoch": 0.09515951595159516, + "grad_norm": 11.731900215148926, + "learning_rate": 2.3762376237623762e-06, + "loss": 7.4026, + "num_input_tokens_seen": 182208, + "step": 865 + }, + { + "epoch": 0.09570957095709572, + "grad_norm": 11.776248931884766, + "learning_rate": 2.38998899889989e-06, + "loss": 7.0618, + "num_input_tokens_seen": 183232, + "step": 870 + }, + { + "epoch": 0.09625962596259625, + "grad_norm": 11.427351951599121, + "learning_rate": 2.403740374037404e-06, + "loss": 7.3767, + "num_input_tokens_seen": 184256, + "step": 875 + }, + { + "epoch": 0.0968096809680968, + "grad_norm": 10.974466323852539, + "learning_rate": 2.4174917491749176e-06, + "loss": 7.2033, + "num_input_tokens_seen": 185280, + "step": 880 + }, + { + "epoch": 0.09735973597359736, + "grad_norm": 11.466033935546875, + "learning_rate": 2.4312431243124313e-06, + "loss": 7.6114, + "num_input_tokens_seen": 186336, + "step": 885 + }, + { + "epoch": 0.09790979097909791, + "grad_norm": 10.428910255432129, + "learning_rate": 2.444994499449945e-06, + "loss": 7.6624, + "num_input_tokens_seen": 187392, + "step": 890 + }, + { + "epoch": 0.09845984598459846, + "grad_norm": 12.967662811279297, + "learning_rate": 2.458745874587459e-06, + "loss": 7.2998, + "num_input_tokens_seen": 188416, + "step": 895 + }, + { + "epoch": 0.09900990099009901, + "grad_norm": 10.717248916625977, + "learning_rate": 2.4724972497249727e-06, + "loss": 7.3177, + "num_input_tokens_seen": 189472, + "step": 900 + }, + { + "epoch": 0.09955995599559957, + "grad_norm": 11.492487907409668, + "learning_rate": 2.4862486248624864e-06, + "loss": 7.3102, + "num_input_tokens_seen": 190528, + "step": 905 + }, + { + "epoch": 0.1001100110011001, + "grad_norm": 11.811433792114258, + "learning_rate": 2.5e-06, + "loss": 7.3003, + "num_input_tokens_seen": 191648, + "step": 910 + }, + { + "epoch": 0.10066006600660066, + "grad_norm": 10.682319641113281, + "learning_rate": 2.5137513751375136e-06, + "loss": 7.4943, + "num_input_tokens_seen": 192672, + "step": 915 + }, + { + "epoch": 0.10121012101210121, + "grad_norm": 11.319896697998047, + "learning_rate": 2.5275027502750273e-06, + "loss": 7.2136, + "num_input_tokens_seen": 193760, + "step": 920 + }, + { + "epoch": 0.10176017601760176, + "grad_norm": 11.154298782348633, + "learning_rate": 2.5412541254125415e-06, + "loss": 7.5807, + "num_input_tokens_seen": 194848, + "step": 925 + }, + { + "epoch": 0.10231023102310231, + "grad_norm": 10.869307518005371, + "learning_rate": 2.5550055005500553e-06, + "loss": 7.4138, + "num_input_tokens_seen": 195968, + "step": 930 + }, + { + "epoch": 0.10286028602860287, + "grad_norm": 11.188004493713379, + "learning_rate": 2.568756875687569e-06, + "loss": 7.3796, + "num_input_tokens_seen": 196992, + "step": 935 + }, + { + "epoch": 0.1034103410341034, + "grad_norm": 12.301100730895996, + "learning_rate": 2.582508250825083e-06, + "loss": 7.561, + "num_input_tokens_seen": 198112, + "step": 940 + }, + { + "epoch": 0.10396039603960396, + "grad_norm": 12.280782699584961, + "learning_rate": 2.5962596259625966e-06, + "loss": 7.3158, + "num_input_tokens_seen": 199136, + "step": 945 + }, + { + "epoch": 0.10451045104510451, + "grad_norm": 10.418272018432617, + "learning_rate": 2.61001100110011e-06, + "loss": 7.1969, + "num_input_tokens_seen": 200224, + "step": 950 + }, + { + "epoch": 0.10506050605060506, + "grad_norm": 10.995795249938965, + "learning_rate": 2.6237623762376237e-06, + "loss": 7.3743, + "num_input_tokens_seen": 201312, + "step": 955 + }, + { + "epoch": 0.10561056105610561, + "grad_norm": 12.303739547729492, + "learning_rate": 2.6375137513751375e-06, + "loss": 7.3584, + "num_input_tokens_seen": 202400, + "step": 960 + }, + { + "epoch": 0.10616061606160616, + "grad_norm": 11.089427947998047, + "learning_rate": 2.6512651265126513e-06, + "loss": 7.352, + "num_input_tokens_seen": 203360, + "step": 965 + }, + { + "epoch": 0.10671067106710672, + "grad_norm": 11.547595024108887, + "learning_rate": 2.665016501650165e-06, + "loss": 7.4999, + "num_input_tokens_seen": 204352, + "step": 970 + }, + { + "epoch": 0.10726072607260725, + "grad_norm": 11.005946159362793, + "learning_rate": 2.678767876787679e-06, + "loss": 7.5039, + "num_input_tokens_seen": 205376, + "step": 975 + }, + { + "epoch": 0.1078107810781078, + "grad_norm": 11.939065933227539, + "learning_rate": 2.6925192519251926e-06, + "loss": 7.3011, + "num_input_tokens_seen": 206400, + "step": 980 + }, + { + "epoch": 0.10836083608360836, + "grad_norm": 11.397724151611328, + "learning_rate": 2.7062706270627064e-06, + "loss": 6.9919, + "num_input_tokens_seen": 207456, + "step": 985 + }, + { + "epoch": 0.10891089108910891, + "grad_norm": 11.611028671264648, + "learning_rate": 2.72002200220022e-06, + "loss": 7.2102, + "num_input_tokens_seen": 208480, + "step": 990 + }, + { + "epoch": 0.10946094609460946, + "grad_norm": 11.51713752746582, + "learning_rate": 2.733773377337734e-06, + "loss": 7.7986, + "num_input_tokens_seen": 209536, + "step": 995 + }, + { + "epoch": 0.11001100110011001, + "grad_norm": 11.813901901245117, + "learning_rate": 2.7475247524752477e-06, + "loss": 7.5316, + "num_input_tokens_seen": 210560, + "step": 1000 + }, + { + "epoch": 0.11056105610561057, + "grad_norm": 10.897917747497559, + "learning_rate": 2.7612761276127615e-06, + "loss": 7.3361, + "num_input_tokens_seen": 211584, + "step": 1005 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 11.2900390625, + "learning_rate": 2.7750275027502752e-06, + "loss": 7.4872, + "num_input_tokens_seen": 212672, + "step": 1010 + }, + { + "epoch": 0.11166116611661166, + "grad_norm": 11.312933921813965, + "learning_rate": 2.788778877887789e-06, + "loss": 7.5535, + "num_input_tokens_seen": 213760, + "step": 1015 + }, + { + "epoch": 0.11221122112211221, + "grad_norm": 11.957337379455566, + "learning_rate": 2.8025302530253024e-06, + "loss": 7.4467, + "num_input_tokens_seen": 214816, + "step": 1020 + }, + { + "epoch": 0.11276127612761276, + "grad_norm": 11.119544982910156, + "learning_rate": 2.816281628162816e-06, + "loss": 7.4353, + "num_input_tokens_seen": 215904, + "step": 1025 + }, + { + "epoch": 0.11331133113311331, + "grad_norm": 11.655132293701172, + "learning_rate": 2.8300330033003303e-06, + "loss": 7.4279, + "num_input_tokens_seen": 216992, + "step": 1030 + }, + { + "epoch": 0.11386138613861387, + "grad_norm": 12.46743392944336, + "learning_rate": 2.843784378437844e-06, + "loss": 7.3641, + "num_input_tokens_seen": 218048, + "step": 1035 + }, + { + "epoch": 0.11441144114411442, + "grad_norm": 12.177011489868164, + "learning_rate": 2.857535753575358e-06, + "loss": 7.3208, + "num_input_tokens_seen": 219136, + "step": 1040 + }, + { + "epoch": 0.11496149614961496, + "grad_norm": 11.731202125549316, + "learning_rate": 2.8712871287128717e-06, + "loss": 7.2384, + "num_input_tokens_seen": 220128, + "step": 1045 + }, + { + "epoch": 0.11551155115511551, + "grad_norm": 10.863177299499512, + "learning_rate": 2.8850385038503854e-06, + "loss": 7.2621, + "num_input_tokens_seen": 221184, + "step": 1050 + }, + { + "epoch": 0.11606160616061606, + "grad_norm": 11.993623733520508, + "learning_rate": 2.8987898789878988e-06, + "loss": 7.3663, + "num_input_tokens_seen": 222272, + "step": 1055 + }, + { + "epoch": 0.11661166116611661, + "grad_norm": 10.901973724365234, + "learning_rate": 2.9125412541254125e-06, + "loss": 7.3285, + "num_input_tokens_seen": 223296, + "step": 1060 + }, + { + "epoch": 0.11716171617161716, + "grad_norm": 11.960739135742188, + "learning_rate": 2.9262926292629263e-06, + "loss": 7.3486, + "num_input_tokens_seen": 224352, + "step": 1065 + }, + { + "epoch": 0.11771177117711772, + "grad_norm": 12.052661895751953, + "learning_rate": 2.94004400440044e-06, + "loss": 7.4935, + "num_input_tokens_seen": 225344, + "step": 1070 + }, + { + "epoch": 0.11826182618261827, + "grad_norm": 10.475512504577637, + "learning_rate": 2.953795379537954e-06, + "loss": 7.4266, + "num_input_tokens_seen": 226400, + "step": 1075 + }, + { + "epoch": 0.1188118811881188, + "grad_norm": 12.013936996459961, + "learning_rate": 2.9675467546754676e-06, + "loss": 7.3714, + "num_input_tokens_seen": 227520, + "step": 1080 + }, + { + "epoch": 0.11936193619361936, + "grad_norm": 11.07952880859375, + "learning_rate": 2.9812981298129814e-06, + "loss": 7.3534, + "num_input_tokens_seen": 228608, + "step": 1085 + }, + { + "epoch": 0.11991199119911991, + "grad_norm": 11.51778507232666, + "learning_rate": 2.995049504950495e-06, + "loss": 7.3666, + "num_input_tokens_seen": 229664, + "step": 1090 + }, + { + "epoch": 0.12046204620462046, + "grad_norm": 11.586313247680664, + "learning_rate": 3.008800880088009e-06, + "loss": 7.1996, + "num_input_tokens_seen": 230688, + "step": 1095 + }, + { + "epoch": 0.12101210121012101, + "grad_norm": 12.936184883117676, + "learning_rate": 3.0225522552255227e-06, + "loss": 7.3358, + "num_input_tokens_seen": 231776, + "step": 1100 + }, + { + "epoch": 0.12156215621562157, + "grad_norm": 12.408177375793457, + "learning_rate": 3.0363036303630365e-06, + "loss": 7.5826, + "num_input_tokens_seen": 232864, + "step": 1105 + }, + { + "epoch": 0.12211221122112212, + "grad_norm": 11.052671432495117, + "learning_rate": 3.0500550055005503e-06, + "loss": 7.6291, + "num_input_tokens_seen": 233920, + "step": 1110 + }, + { + "epoch": 0.12266226622662266, + "grad_norm": 10.6170072555542, + "learning_rate": 3.063806380638064e-06, + "loss": 7.5185, + "num_input_tokens_seen": 235040, + "step": 1115 + }, + { + "epoch": 0.12321232123212321, + "grad_norm": 11.851432800292969, + "learning_rate": 3.0775577557755774e-06, + "loss": 7.7357, + "num_input_tokens_seen": 236128, + "step": 1120 + }, + { + "epoch": 0.12376237623762376, + "grad_norm": 10.72952938079834, + "learning_rate": 3.091309130913091e-06, + "loss": 7.2506, + "num_input_tokens_seen": 237120, + "step": 1125 + }, + { + "epoch": 0.12431243124312431, + "grad_norm": 11.687102317810059, + "learning_rate": 3.105060506050605e-06, + "loss": 7.3958, + "num_input_tokens_seen": 238112, + "step": 1130 + }, + { + "epoch": 0.12486248624862487, + "grad_norm": 11.129799842834473, + "learning_rate": 3.118811881188119e-06, + "loss": 7.4532, + "num_input_tokens_seen": 239168, + "step": 1135 + }, + { + "epoch": 0.1254125412541254, + "grad_norm": 11.664200782775879, + "learning_rate": 3.1325632563256325e-06, + "loss": 7.2466, + "num_input_tokens_seen": 240256, + "step": 1140 + }, + { + "epoch": 0.12596259625962597, + "grad_norm": 11.457294464111328, + "learning_rate": 3.1463146314631467e-06, + "loss": 7.3145, + "num_input_tokens_seen": 241280, + "step": 1145 + }, + { + "epoch": 0.1265126512651265, + "grad_norm": 11.928433418273926, + "learning_rate": 3.16006600660066e-06, + "loss": 7.2879, + "num_input_tokens_seen": 242336, + "step": 1150 + }, + { + "epoch": 0.12706270627062707, + "grad_norm": 11.008979797363281, + "learning_rate": 3.1738173817381742e-06, + "loss": 7.4092, + "num_input_tokens_seen": 243456, + "step": 1155 + }, + { + "epoch": 0.1276127612761276, + "grad_norm": 11.400548934936523, + "learning_rate": 3.1875687568756876e-06, + "loss": 7.4895, + "num_input_tokens_seen": 244576, + "step": 1160 + }, + { + "epoch": 0.12816281628162815, + "grad_norm": 11.351271629333496, + "learning_rate": 3.2013201320132018e-06, + "loss": 7.2792, + "num_input_tokens_seen": 245664, + "step": 1165 + }, + { + "epoch": 0.12871287128712872, + "grad_norm": 10.434694290161133, + "learning_rate": 3.215071507150715e-06, + "loss": 7.4002, + "num_input_tokens_seen": 246752, + "step": 1170 + }, + { + "epoch": 0.12926292629262925, + "grad_norm": 11.709702491760254, + "learning_rate": 3.2288228822882285e-06, + "loss": 7.6029, + "num_input_tokens_seen": 247744, + "step": 1175 + }, + { + "epoch": 0.12981298129812982, + "grad_norm": 11.055807113647461, + "learning_rate": 3.2425742574257427e-06, + "loss": 7.3566, + "num_input_tokens_seen": 248768, + "step": 1180 + }, + { + "epoch": 0.13036303630363036, + "grad_norm": 13.30364990234375, + "learning_rate": 3.2563256325632564e-06, + "loss": 7.2982, + "num_input_tokens_seen": 249824, + "step": 1185 + }, + { + "epoch": 0.13091309130913092, + "grad_norm": 10.925260543823242, + "learning_rate": 3.2700770077007706e-06, + "loss": 7.459, + "num_input_tokens_seen": 250816, + "step": 1190 + }, + { + "epoch": 0.13146314631463146, + "grad_norm": 10.732199668884277, + "learning_rate": 3.283828382838284e-06, + "loss": 7.405, + "num_input_tokens_seen": 251872, + "step": 1195 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 12.277381896972656, + "learning_rate": 3.297579757975798e-06, + "loss": 7.6441, + "num_input_tokens_seen": 252960, + "step": 1200 + }, + { + "epoch": 0.13256325632563257, + "grad_norm": 11.807660102844238, + "learning_rate": 3.3113311331133115e-06, + "loss": 7.5805, + "num_input_tokens_seen": 254048, + "step": 1205 + }, + { + "epoch": 0.1331133113311331, + "grad_norm": 10.976350784301758, + "learning_rate": 3.325082508250825e-06, + "loss": 7.4084, + "num_input_tokens_seen": 255168, + "step": 1210 + }, + { + "epoch": 0.13366336633663367, + "grad_norm": 11.678393363952637, + "learning_rate": 3.338833883388339e-06, + "loss": 7.2141, + "num_input_tokens_seen": 256224, + "step": 1215 + }, + { + "epoch": 0.1342134213421342, + "grad_norm": 11.138566970825195, + "learning_rate": 3.3525852585258524e-06, + "loss": 7.4799, + "num_input_tokens_seen": 257248, + "step": 1220 + }, + { + "epoch": 0.13476347634763478, + "grad_norm": 11.128775596618652, + "learning_rate": 3.3663366336633666e-06, + "loss": 7.0325, + "num_input_tokens_seen": 258336, + "step": 1225 + }, + { + "epoch": 0.1353135313531353, + "grad_norm": 11.193687438964844, + "learning_rate": 3.38008800880088e-06, + "loss": 7.2038, + "num_input_tokens_seen": 259360, + "step": 1230 + }, + { + "epoch": 0.13586358635863585, + "grad_norm": 11.366571426391602, + "learning_rate": 3.393839383938394e-06, + "loss": 7.3499, + "num_input_tokens_seen": 260448, + "step": 1235 + }, + { + "epoch": 0.13641364136413642, + "grad_norm": 11.160608291625977, + "learning_rate": 3.407590759075908e-06, + "loss": 7.343, + "num_input_tokens_seen": 261472, + "step": 1240 + }, + { + "epoch": 0.13696369636963696, + "grad_norm": 10.800458908081055, + "learning_rate": 3.4213421342134213e-06, + "loss": 6.8779, + "num_input_tokens_seen": 262528, + "step": 1245 + }, + { + "epoch": 0.13751375137513752, + "grad_norm": 11.23745059967041, + "learning_rate": 3.4350935093509355e-06, + "loss": 7.0583, + "num_input_tokens_seen": 263616, + "step": 1250 + }, + { + "epoch": 0.13806380638063806, + "grad_norm": 10.29115104675293, + "learning_rate": 3.448844884488449e-06, + "loss": 6.7024, + "num_input_tokens_seen": 264640, + "step": 1255 + }, + { + "epoch": 0.13861386138613863, + "grad_norm": 11.139581680297852, + "learning_rate": 3.462596259625963e-06, + "loss": 6.8789, + "num_input_tokens_seen": 265696, + "step": 1260 + }, + { + "epoch": 0.13916391639163916, + "grad_norm": 10.320818901062012, + "learning_rate": 3.4763476347634764e-06, + "loss": 6.789, + "num_input_tokens_seen": 266720, + "step": 1265 + }, + { + "epoch": 0.1397139713971397, + "grad_norm": 10.117521286010742, + "learning_rate": 3.4900990099009906e-06, + "loss": 6.8071, + "num_input_tokens_seen": 267776, + "step": 1270 + }, + { + "epoch": 0.14026402640264027, + "grad_norm": 10.553966522216797, + "learning_rate": 3.503850385038504e-06, + "loss": 6.5955, + "num_input_tokens_seen": 268832, + "step": 1275 + }, + { + "epoch": 0.1408140814081408, + "grad_norm": 11.549513816833496, + "learning_rate": 3.5176017601760177e-06, + "loss": 6.929, + "num_input_tokens_seen": 269952, + "step": 1280 + }, + { + "epoch": 0.14136413641364137, + "grad_norm": 11.762697219848633, + "learning_rate": 3.5313531353135315e-06, + "loss": 6.6871, + "num_input_tokens_seen": 271008, + "step": 1285 + }, + { + "epoch": 0.1419141914191419, + "grad_norm": 9.937234878540039, + "learning_rate": 3.5451045104510453e-06, + "loss": 6.6865, + "num_input_tokens_seen": 272096, + "step": 1290 + }, + { + "epoch": 0.14246424642464248, + "grad_norm": 9.53396224975586, + "learning_rate": 3.5588558855885595e-06, + "loss": 6.4958, + "num_input_tokens_seen": 273120, + "step": 1295 + }, + { + "epoch": 0.14301430143014301, + "grad_norm": 10.204097747802734, + "learning_rate": 3.572607260726073e-06, + "loss": 6.3162, + "num_input_tokens_seen": 274208, + "step": 1300 + }, + { + "epoch": 0.14356435643564355, + "grad_norm": 8.983169555664062, + "learning_rate": 3.586358635863587e-06, + "loss": 6.3615, + "num_input_tokens_seen": 275264, + "step": 1305 + }, + { + "epoch": 0.14411441144114412, + "grad_norm": 9.10477352142334, + "learning_rate": 3.6001100110011004e-06, + "loss": 6.4205, + "num_input_tokens_seen": 276256, + "step": 1310 + }, + { + "epoch": 0.14466446644664466, + "grad_norm": 10.08437728881836, + "learning_rate": 3.6138613861386137e-06, + "loss": 6.4182, + "num_input_tokens_seen": 277280, + "step": 1315 + }, + { + "epoch": 0.14521452145214522, + "grad_norm": 10.224925994873047, + "learning_rate": 3.627612761276128e-06, + "loss": 6.2854, + "num_input_tokens_seen": 278432, + "step": 1320 + }, + { + "epoch": 0.14576457645764576, + "grad_norm": 9.166878700256348, + "learning_rate": 3.6413641364136412e-06, + "loss": 6.032, + "num_input_tokens_seen": 279488, + "step": 1325 + }, + { + "epoch": 0.14631463146314633, + "grad_norm": 10.171526908874512, + "learning_rate": 3.6551155115511554e-06, + "loss": 6.0312, + "num_input_tokens_seen": 280576, + "step": 1330 + }, + { + "epoch": 0.14686468646864687, + "grad_norm": 8.355018615722656, + "learning_rate": 3.668866886688669e-06, + "loss": 5.9493, + "num_input_tokens_seen": 281664, + "step": 1335 + }, + { + "epoch": 0.1474147414741474, + "grad_norm": 8.82347297668457, + "learning_rate": 3.682618261826183e-06, + "loss": 6.0968, + "num_input_tokens_seen": 282720, + "step": 1340 + }, + { + "epoch": 0.14796479647964797, + "grad_norm": 10.870708465576172, + "learning_rate": 3.6963696369636968e-06, + "loss": 6.142, + "num_input_tokens_seen": 283776, + "step": 1345 + }, + { + "epoch": 0.1485148514851485, + "grad_norm": 8.199283599853516, + "learning_rate": 3.71012101210121e-06, + "loss": 5.9042, + "num_input_tokens_seen": 284864, + "step": 1350 + }, + { + "epoch": 0.14906490649064907, + "grad_norm": 8.007168769836426, + "learning_rate": 3.7238723872387243e-06, + "loss": 5.8249, + "num_input_tokens_seen": 285984, + "step": 1355 + }, + { + "epoch": 0.1496149614961496, + "grad_norm": 8.273394584655762, + "learning_rate": 3.7376237623762377e-06, + "loss": 5.8849, + "num_input_tokens_seen": 287040, + "step": 1360 + }, + { + "epoch": 0.15016501650165018, + "grad_norm": 9.926811218261719, + "learning_rate": 3.751375137513752e-06, + "loss": 5.9874, + "num_input_tokens_seen": 288064, + "step": 1365 + }, + { + "epoch": 0.15071507150715072, + "grad_norm": 7.653804779052734, + "learning_rate": 3.765126512651265e-06, + "loss": 5.5594, + "num_input_tokens_seen": 289088, + "step": 1370 + }, + { + "epoch": 0.15126512651265125, + "grad_norm": 7.675756454467773, + "learning_rate": 3.7788778877887786e-06, + "loss": 5.6341, + "num_input_tokens_seen": 290048, + "step": 1375 + }, + { + "epoch": 0.15181518151815182, + "grad_norm": 7.4562883377075195, + "learning_rate": 3.7926292629262927e-06, + "loss": 5.6814, + "num_input_tokens_seen": 291136, + "step": 1380 + }, + { + "epoch": 0.15236523652365236, + "grad_norm": 7.725661277770996, + "learning_rate": 3.8063806380638065e-06, + "loss": 5.5651, + "num_input_tokens_seen": 292192, + "step": 1385 + }, + { + "epoch": 0.15291529152915292, + "grad_norm": 7.3569536209106445, + "learning_rate": 3.82013201320132e-06, + "loss": 5.4306, + "num_input_tokens_seen": 293184, + "step": 1390 + }, + { + "epoch": 0.15346534653465346, + "grad_norm": 7.886046409606934, + "learning_rate": 3.833883388338834e-06, + "loss": 5.5772, + "num_input_tokens_seen": 294272, + "step": 1395 + }, + { + "epoch": 0.15401540154015403, + "grad_norm": 7.383751392364502, + "learning_rate": 3.847634763476348e-06, + "loss": 5.3978, + "num_input_tokens_seen": 295328, + "step": 1400 + }, + { + "epoch": 0.15456545654565457, + "grad_norm": 7.830593109130859, + "learning_rate": 3.861386138613861e-06, + "loss": 5.2761, + "num_input_tokens_seen": 296416, + "step": 1405 + }, + { + "epoch": 0.1551155115511551, + "grad_norm": 7.612607955932617, + "learning_rate": 3.875137513751375e-06, + "loss": 5.4516, + "num_input_tokens_seen": 297504, + "step": 1410 + }, + { + "epoch": 0.15566556655665567, + "grad_norm": 6.895577907562256, + "learning_rate": 3.888888888888889e-06, + "loss": 5.2172, + "num_input_tokens_seen": 298496, + "step": 1415 + }, + { + "epoch": 0.1562156215621562, + "grad_norm": 7.139936447143555, + "learning_rate": 3.902640264026403e-06, + "loss": 5.2467, + "num_input_tokens_seen": 299616, + "step": 1420 + }, + { + "epoch": 0.15676567656765678, + "grad_norm": 5.872285842895508, + "learning_rate": 3.916391639163917e-06, + "loss": 5.1721, + "num_input_tokens_seen": 300672, + "step": 1425 + }, + { + "epoch": 0.1573157315731573, + "grad_norm": 7.008058071136475, + "learning_rate": 3.9301430143014305e-06, + "loss": 5.0744, + "num_input_tokens_seen": 301664, + "step": 1430 + }, + { + "epoch": 0.15786578657865788, + "grad_norm": 7.406260013580322, + "learning_rate": 3.943894389438945e-06, + "loss": 5.1807, + "num_input_tokens_seen": 302656, + "step": 1435 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 5.640195369720459, + "learning_rate": 3.957645764576458e-06, + "loss": 5.0229, + "num_input_tokens_seen": 303744, + "step": 1440 + }, + { + "epoch": 0.15896589658965896, + "grad_norm": 6.699004173278809, + "learning_rate": 3.971397139713971e-06, + "loss": 4.9634, + "num_input_tokens_seen": 304768, + "step": 1445 + }, + { + "epoch": 0.15951595159515952, + "grad_norm": 5.446451187133789, + "learning_rate": 3.9851485148514856e-06, + "loss": 4.7743, + "num_input_tokens_seen": 305792, + "step": 1450 + }, + { + "epoch": 0.16006600660066006, + "grad_norm": 5.967568397521973, + "learning_rate": 3.998899889988999e-06, + "loss": 4.9524, + "num_input_tokens_seen": 306848, + "step": 1455 + }, + { + "epoch": 0.16061606160616063, + "grad_norm": 5.910089492797852, + "learning_rate": 4.012651265126513e-06, + "loss": 4.8292, + "num_input_tokens_seen": 307936, + "step": 1460 + }, + { + "epoch": 0.16116611661166116, + "grad_norm": 6.050641059875488, + "learning_rate": 4.0264026402640265e-06, + "loss": 4.7939, + "num_input_tokens_seen": 308960, + "step": 1465 + }, + { + "epoch": 0.1617161716171617, + "grad_norm": 5.760583400726318, + "learning_rate": 4.040154015401541e-06, + "loss": 4.8823, + "num_input_tokens_seen": 310080, + "step": 1470 + }, + { + "epoch": 0.16226622662266227, + "grad_norm": 6.183698654174805, + "learning_rate": 4.053905390539054e-06, + "loss": 4.7777, + "num_input_tokens_seen": 311104, + "step": 1475 + }, + { + "epoch": 0.1628162816281628, + "grad_norm": 6.81343936920166, + "learning_rate": 4.067656765676567e-06, + "loss": 4.7416, + "num_input_tokens_seen": 312192, + "step": 1480 + }, + { + "epoch": 0.16336633663366337, + "grad_norm": 6.228099822998047, + "learning_rate": 4.0814081408140816e-06, + "loss": 4.7181, + "num_input_tokens_seen": 313248, + "step": 1485 + }, + { + "epoch": 0.1639163916391639, + "grad_norm": 5.493618965148926, + "learning_rate": 4.095159515951595e-06, + "loss": 4.7193, + "num_input_tokens_seen": 314368, + "step": 1490 + }, + { + "epoch": 0.16446644664466448, + "grad_norm": 5.437508583068848, + "learning_rate": 4.108910891089109e-06, + "loss": 4.6657, + "num_input_tokens_seen": 315488, + "step": 1495 + }, + { + "epoch": 0.16501650165016502, + "grad_norm": 5.667723655700684, + "learning_rate": 4.1226622662266225e-06, + "loss": 4.5255, + "num_input_tokens_seen": 316608, + "step": 1500 + }, + { + "epoch": 0.16556655665566555, + "grad_norm": 4.631077766418457, + "learning_rate": 4.136413641364137e-06, + "loss": 4.5752, + "num_input_tokens_seen": 317664, + "step": 1505 + }, + { + "epoch": 0.16611661166116612, + "grad_norm": 5.0706562995910645, + "learning_rate": 4.15016501650165e-06, + "loss": 4.4356, + "num_input_tokens_seen": 318784, + "step": 1510 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 5.3430986404418945, + "learning_rate": 4.163916391639164e-06, + "loss": 4.4716, + "num_input_tokens_seen": 319808, + "step": 1515 + }, + { + "epoch": 0.16721672167216722, + "grad_norm": 3.7246921062469482, + "learning_rate": 4.1776677667766775e-06, + "loss": 4.3935, + "num_input_tokens_seen": 320864, + "step": 1520 + }, + { + "epoch": 0.16776677667766776, + "grad_norm": 4.783636093139648, + "learning_rate": 4.191419141914192e-06, + "loss": 4.3528, + "num_input_tokens_seen": 321856, + "step": 1525 + }, + { + "epoch": 0.16831683168316833, + "grad_norm": 4.079864501953125, + "learning_rate": 4.205170517051706e-06, + "loss": 4.3166, + "num_input_tokens_seen": 323008, + "step": 1530 + }, + { + "epoch": 0.16886688668866887, + "grad_norm": 4.757389068603516, + "learning_rate": 4.218921892189219e-06, + "loss": 4.261, + "num_input_tokens_seen": 324096, + "step": 1535 + }, + { + "epoch": 0.1694169416941694, + "grad_norm": 4.436746597290039, + "learning_rate": 4.2326732673267335e-06, + "loss": 4.2335, + "num_input_tokens_seen": 325152, + "step": 1540 + }, + { + "epoch": 0.16996699669966997, + "grad_norm": 3.6814873218536377, + "learning_rate": 4.246424642464247e-06, + "loss": 4.1291, + "num_input_tokens_seen": 326272, + "step": 1545 + }, + { + "epoch": 0.1705170517051705, + "grad_norm": 4.179168701171875, + "learning_rate": 4.26017601760176e-06, + "loss": 4.2598, + "num_input_tokens_seen": 327296, + "step": 1550 + }, + { + "epoch": 0.17106710671067107, + "grad_norm": 5.282823085784912, + "learning_rate": 4.273927392739274e-06, + "loss": 4.1707, + "num_input_tokens_seen": 328288, + "step": 1555 + }, + { + "epoch": 0.1716171617161716, + "grad_norm": 3.733018398284912, + "learning_rate": 4.287678767876788e-06, + "loss": 4.0399, + "num_input_tokens_seen": 329408, + "step": 1560 + }, + { + "epoch": 0.17216721672167218, + "grad_norm": 4.257375240325928, + "learning_rate": 4.301430143014302e-06, + "loss": 4.1585, + "num_input_tokens_seen": 330432, + "step": 1565 + }, + { + "epoch": 0.17271727172717272, + "grad_norm": 3.809572219848633, + "learning_rate": 4.315181518151815e-06, + "loss": 4.0821, + "num_input_tokens_seen": 331488, + "step": 1570 + }, + { + "epoch": 0.17326732673267325, + "grad_norm": 3.8351471424102783, + "learning_rate": 4.3289328932893295e-06, + "loss": 4.0488, + "num_input_tokens_seen": 332480, + "step": 1575 + }, + { + "epoch": 0.17381738173817382, + "grad_norm": 3.961068868637085, + "learning_rate": 4.342684268426843e-06, + "loss": 4.0445, + "num_input_tokens_seen": 333536, + "step": 1580 + }, + { + "epoch": 0.17436743674367436, + "grad_norm": 3.650266170501709, + "learning_rate": 4.356435643564356e-06, + "loss": 3.9162, + "num_input_tokens_seen": 334560, + "step": 1585 + }, + { + "epoch": 0.17491749174917492, + "grad_norm": 4.302954196929932, + "learning_rate": 4.37018701870187e-06, + "loss": 3.9284, + "num_input_tokens_seen": 335616, + "step": 1590 + }, + { + "epoch": 0.17546754675467546, + "grad_norm": 4.15805721282959, + "learning_rate": 4.383938393839384e-06, + "loss": 4.0068, + "num_input_tokens_seen": 336640, + "step": 1595 + }, + { + "epoch": 0.17601760176017603, + "grad_norm": 4.319240570068359, + "learning_rate": 4.397689768976898e-06, + "loss": 3.9552, + "num_input_tokens_seen": 337728, + "step": 1600 + }, + { + "epoch": 0.17656765676567657, + "grad_norm": 3.69417667388916, + "learning_rate": 4.411441144114411e-06, + "loss": 3.955, + "num_input_tokens_seen": 338784, + "step": 1605 + }, + { + "epoch": 0.1771177117711771, + "grad_norm": 3.9184911251068115, + "learning_rate": 4.4251925192519255e-06, + "loss": 3.8774, + "num_input_tokens_seen": 339776, + "step": 1610 + }, + { + "epoch": 0.17766776677667767, + "grad_norm": 3.853987216949463, + "learning_rate": 4.438943894389439e-06, + "loss": 3.8536, + "num_input_tokens_seen": 340768, + "step": 1615 + }, + { + "epoch": 0.1782178217821782, + "grad_norm": 3.62998104095459, + "learning_rate": 4.452695269526953e-06, + "loss": 3.8006, + "num_input_tokens_seen": 341760, + "step": 1620 + }, + { + "epoch": 0.17876787678767878, + "grad_norm": 3.9395482540130615, + "learning_rate": 4.466446644664466e-06, + "loss": 3.8889, + "num_input_tokens_seen": 342944, + "step": 1625 + }, + { + "epoch": 0.1793179317931793, + "grad_norm": 4.035778522491455, + "learning_rate": 4.4801980198019806e-06, + "loss": 3.8046, + "num_input_tokens_seen": 344000, + "step": 1630 + }, + { + "epoch": 0.17986798679867988, + "grad_norm": 3.1587066650390625, + "learning_rate": 4.493949394939495e-06, + "loss": 3.7182, + "num_input_tokens_seen": 345120, + "step": 1635 + }, + { + "epoch": 0.18041804180418042, + "grad_norm": 3.6732699871063232, + "learning_rate": 4.507700770077008e-06, + "loss": 3.803, + "num_input_tokens_seen": 346240, + "step": 1640 + }, + { + "epoch": 0.18096809680968096, + "grad_norm": 2.832618474960327, + "learning_rate": 4.521452145214522e-06, + "loss": 3.7695, + "num_input_tokens_seen": 347328, + "step": 1645 + }, + { + "epoch": 0.18151815181518152, + "grad_norm": 3.264007091522217, + "learning_rate": 4.535203520352036e-06, + "loss": 3.6966, + "num_input_tokens_seen": 348416, + "step": 1650 + }, + { + "epoch": 0.18206820682068206, + "grad_norm": 2.8927741050720215, + "learning_rate": 4.548954895489549e-06, + "loss": 3.7095, + "num_input_tokens_seen": 349568, + "step": 1655 + }, + { + "epoch": 0.18261826182618263, + "grad_norm": 2.995042085647583, + "learning_rate": 4.562706270627063e-06, + "loss": 3.6894, + "num_input_tokens_seen": 350688, + "step": 1660 + }, + { + "epoch": 0.18316831683168316, + "grad_norm": 3.416428327560425, + "learning_rate": 4.5764576457645765e-06, + "loss": 3.6925, + "num_input_tokens_seen": 351712, + "step": 1665 + }, + { + "epoch": 0.18371837183718373, + "grad_norm": 3.3007304668426514, + "learning_rate": 4.590209020902091e-06, + "loss": 3.6686, + "num_input_tokens_seen": 352800, + "step": 1670 + }, + { + "epoch": 0.18426842684268427, + "grad_norm": 2.862489700317383, + "learning_rate": 4.603960396039604e-06, + "loss": 3.608, + "num_input_tokens_seen": 353856, + "step": 1675 + }, + { + "epoch": 0.1848184818481848, + "grad_norm": 3.6244101524353027, + "learning_rate": 4.617711771177118e-06, + "loss": 3.6166, + "num_input_tokens_seen": 354912, + "step": 1680 + }, + { + "epoch": 0.18536853685368537, + "grad_norm": 3.5140771865844727, + "learning_rate": 4.631463146314632e-06, + "loss": 3.5695, + "num_input_tokens_seen": 355968, + "step": 1685 + }, + { + "epoch": 0.1859185918591859, + "grad_norm": 3.3093371391296387, + "learning_rate": 4.645214521452145e-06, + "loss": 3.599, + "num_input_tokens_seen": 357056, + "step": 1690 + }, + { + "epoch": 0.18646864686468648, + "grad_norm": 3.114076614379883, + "learning_rate": 4.658965896589659e-06, + "loss": 3.6938, + "num_input_tokens_seen": 358144, + "step": 1695 + }, + { + "epoch": 0.18701870187018702, + "grad_norm": 2.9009907245635986, + "learning_rate": 4.6727172717271725e-06, + "loss": 3.5958, + "num_input_tokens_seen": 359168, + "step": 1700 + }, + { + "epoch": 0.18756875687568758, + "grad_norm": 3.2580549716949463, + "learning_rate": 4.686468646864687e-06, + "loss": 3.6366, + "num_input_tokens_seen": 360224, + "step": 1705 + }, + { + "epoch": 0.18811881188118812, + "grad_norm": 3.078411817550659, + "learning_rate": 4.7002200220022e-06, + "loss": 3.5002, + "num_input_tokens_seen": 361248, + "step": 1710 + }, + { + "epoch": 0.18866886688668866, + "grad_norm": 3.1779232025146484, + "learning_rate": 4.713971397139714e-06, + "loss": 3.4902, + "num_input_tokens_seen": 362272, + "step": 1715 + }, + { + "epoch": 0.18921892189218922, + "grad_norm": 3.404461622238159, + "learning_rate": 4.727722772277228e-06, + "loss": 3.5858, + "num_input_tokens_seen": 363392, + "step": 1720 + }, + { + "epoch": 0.18976897689768976, + "grad_norm": 2.8948357105255127, + "learning_rate": 4.741474147414742e-06, + "loss": 3.5347, + "num_input_tokens_seen": 364512, + "step": 1725 + }, + { + "epoch": 0.19031903190319033, + "grad_norm": 2.9640021324157715, + "learning_rate": 4.755225522552256e-06, + "loss": 3.5659, + "num_input_tokens_seen": 365568, + "step": 1730 + }, + { + "epoch": 0.19086908690869087, + "grad_norm": 3.2155416011810303, + "learning_rate": 4.768976897689769e-06, + "loss": 3.5397, + "num_input_tokens_seen": 366720, + "step": 1735 + }, + { + "epoch": 0.19141914191419143, + "grad_norm": 3.0280978679656982, + "learning_rate": 4.7827282728272836e-06, + "loss": 3.5176, + "num_input_tokens_seen": 367776, + "step": 1740 + }, + { + "epoch": 0.19196919691969197, + "grad_norm": 3.1733155250549316, + "learning_rate": 4.796479647964797e-06, + "loss": 3.5718, + "num_input_tokens_seen": 368832, + "step": 1745 + }, + { + "epoch": 0.1925192519251925, + "grad_norm": 3.5336174964904785, + "learning_rate": 4.810231023102311e-06, + "loss": 3.4523, + "num_input_tokens_seen": 369888, + "step": 1750 + }, + { + "epoch": 0.19306930693069307, + "grad_norm": 2.6414990425109863, + "learning_rate": 4.8239823982398245e-06, + "loss": 3.5159, + "num_input_tokens_seen": 370880, + "step": 1755 + }, + { + "epoch": 0.1936193619361936, + "grad_norm": 3.0199828147888184, + "learning_rate": 4.837733773377338e-06, + "loss": 3.41, + "num_input_tokens_seen": 371904, + "step": 1760 + }, + { + "epoch": 0.19416941694169418, + "grad_norm": 3.136460065841675, + "learning_rate": 4.851485148514852e-06, + "loss": 3.482, + "num_input_tokens_seen": 372960, + "step": 1765 + }, + { + "epoch": 0.19471947194719472, + "grad_norm": 3.0736169815063477, + "learning_rate": 4.865236523652365e-06, + "loss": 3.4855, + "num_input_tokens_seen": 374048, + "step": 1770 + }, + { + "epoch": 0.19526952695269528, + "grad_norm": 2.8709728717803955, + "learning_rate": 4.8789878987898795e-06, + "loss": 3.4263, + "num_input_tokens_seen": 375104, + "step": 1775 + }, + { + "epoch": 0.19581958195819582, + "grad_norm": 2.8393361568450928, + "learning_rate": 4.892739273927393e-06, + "loss": 3.4421, + "num_input_tokens_seen": 376224, + "step": 1780 + }, + { + "epoch": 0.19636963696369636, + "grad_norm": 3.1292672157287598, + "learning_rate": 4.906490649064906e-06, + "loss": 3.4474, + "num_input_tokens_seen": 377344, + "step": 1785 + }, + { + "epoch": 0.19691969196919692, + "grad_norm": 3.19037127494812, + "learning_rate": 4.9202420242024204e-06, + "loss": 3.3859, + "num_input_tokens_seen": 378368, + "step": 1790 + }, + { + "epoch": 0.19746974697469746, + "grad_norm": 3.009014844894409, + "learning_rate": 4.933993399339934e-06, + "loss": 3.4431, + "num_input_tokens_seen": 379424, + "step": 1795 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 2.856912851333618, + "learning_rate": 4.947744774477448e-06, + "loss": 3.4122, + "num_input_tokens_seen": 380448, + "step": 1800 + }, + { + "epoch": 0.19856985698569857, + "grad_norm": 2.5798611640930176, + "learning_rate": 4.961496149614961e-06, + "loss": 3.4009, + "num_input_tokens_seen": 381568, + "step": 1805 + }, + { + "epoch": 0.19911991199119913, + "grad_norm": 3.2060422897338867, + "learning_rate": 4.9752475247524755e-06, + "loss": 3.3777, + "num_input_tokens_seen": 382688, + "step": 1810 + }, + { + "epoch": 0.19966996699669967, + "grad_norm": 2.902026653289795, + "learning_rate": 4.988998899889989e-06, + "loss": 3.4004, + "num_input_tokens_seen": 383808, + "step": 1815 + }, + { + "epoch": 0.2002200220022002, + "grad_norm": 2.886639356613159, + "learning_rate": 5.002750275027502e-06, + "loss": 3.3696, + "num_input_tokens_seen": 384928, + "step": 1820 + }, + { + "epoch": 0.20077007700770078, + "grad_norm": 2.79699444770813, + "learning_rate": 5.0165016501650164e-06, + "loss": 3.3638, + "num_input_tokens_seen": 385920, + "step": 1825 + }, + { + "epoch": 0.20132013201320131, + "grad_norm": 2.6614878177642822, + "learning_rate": 5.030253025302531e-06, + "loss": 3.4051, + "num_input_tokens_seen": 386912, + "step": 1830 + }, + { + "epoch": 0.20187018701870188, + "grad_norm": 2.785876512527466, + "learning_rate": 5.044004400440045e-06, + "loss": 3.3815, + "num_input_tokens_seen": 387968, + "step": 1835 + }, + { + "epoch": 0.20242024202420242, + "grad_norm": 3.074368476867676, + "learning_rate": 5.057755775577558e-06, + "loss": 3.3589, + "num_input_tokens_seen": 389024, + "step": 1840 + }, + { + "epoch": 0.20297029702970298, + "grad_norm": 2.677023410797119, + "learning_rate": 5.071507150715072e-06, + "loss": 3.3309, + "num_input_tokens_seen": 390048, + "step": 1845 + }, + { + "epoch": 0.20352035203520352, + "grad_norm": 2.9605660438537598, + "learning_rate": 5.085258525852586e-06, + "loss": 3.3234, + "num_input_tokens_seen": 391008, + "step": 1850 + }, + { + "epoch": 0.20407040704070406, + "grad_norm": 3.4253077507019043, + "learning_rate": 5.099009900990099e-06, + "loss": 3.3752, + "num_input_tokens_seen": 392096, + "step": 1855 + }, + { + "epoch": 0.20462046204620463, + "grad_norm": 3.143693208694458, + "learning_rate": 5.112761276127613e-06, + "loss": 3.303, + "num_input_tokens_seen": 393184, + "step": 1860 + }, + { + "epoch": 0.20517051705170516, + "grad_norm": 3.180712938308716, + "learning_rate": 5.126512651265127e-06, + "loss": 3.3045, + "num_input_tokens_seen": 394272, + "step": 1865 + }, + { + "epoch": 0.20572057205720573, + "grad_norm": 2.762424945831299, + "learning_rate": 5.140264026402641e-06, + "loss": 3.2605, + "num_input_tokens_seen": 395328, + "step": 1870 + }, + { + "epoch": 0.20627062706270627, + "grad_norm": 3.1639606952667236, + "learning_rate": 5.154015401540154e-06, + "loss": 3.3112, + "num_input_tokens_seen": 396384, + "step": 1875 + }, + { + "epoch": 0.2068206820682068, + "grad_norm": 3.173875093460083, + "learning_rate": 5.167766776677668e-06, + "loss": 3.2632, + "num_input_tokens_seen": 397472, + "step": 1880 + }, + { + "epoch": 0.20737073707370737, + "grad_norm": 3.05094838142395, + "learning_rate": 5.181518151815182e-06, + "loss": 3.2373, + "num_input_tokens_seen": 398528, + "step": 1885 + }, + { + "epoch": 0.2079207920792079, + "grad_norm": 3.093417167663574, + "learning_rate": 5.195269526952695e-06, + "loss": 3.2612, + "num_input_tokens_seen": 399552, + "step": 1890 + }, + { + "epoch": 0.20847084708470848, + "grad_norm": 2.9526560306549072, + "learning_rate": 5.209020902090209e-06, + "loss": 3.2957, + "num_input_tokens_seen": 400576, + "step": 1895 + }, + { + "epoch": 0.20902090209020902, + "grad_norm": 2.9314141273498535, + "learning_rate": 5.222772277227723e-06, + "loss": 3.29, + "num_input_tokens_seen": 401600, + "step": 1900 + }, + { + "epoch": 0.20957095709570958, + "grad_norm": 2.9281320571899414, + "learning_rate": 5.236523652365237e-06, + "loss": 3.1994, + "num_input_tokens_seen": 402656, + "step": 1905 + }, + { + "epoch": 0.21012101210121012, + "grad_norm": 2.844087600708008, + "learning_rate": 5.25027502750275e-06, + "loss": 3.2617, + "num_input_tokens_seen": 403648, + "step": 1910 + }, + { + "epoch": 0.21067106710671066, + "grad_norm": 2.9412171840667725, + "learning_rate": 5.264026402640264e-06, + "loss": 3.1986, + "num_input_tokens_seen": 404672, + "step": 1915 + }, + { + "epoch": 0.21122112211221122, + "grad_norm": 3.0529162883758545, + "learning_rate": 5.277777777777778e-06, + "loss": 3.1861, + "num_input_tokens_seen": 405664, + "step": 1920 + }, + { + "epoch": 0.21177117711771176, + "grad_norm": 2.775385618209839, + "learning_rate": 5.291529152915292e-06, + "loss": 3.1647, + "num_input_tokens_seen": 406720, + "step": 1925 + }, + { + "epoch": 0.21232123212321233, + "grad_norm": 3.1120247840881348, + "learning_rate": 5.305280528052805e-06, + "loss": 3.245, + "num_input_tokens_seen": 407776, + "step": 1930 + }, + { + "epoch": 0.21287128712871287, + "grad_norm": 2.6890735626220703, + "learning_rate": 5.3190319031903194e-06, + "loss": 3.2043, + "num_input_tokens_seen": 408832, + "step": 1935 + }, + { + "epoch": 0.21342134213421343, + "grad_norm": 2.90826416015625, + "learning_rate": 5.332783278327834e-06, + "loss": 3.1628, + "num_input_tokens_seen": 409888, + "step": 1940 + }, + { + "epoch": 0.21397139713971397, + "grad_norm": 2.9674291610717773, + "learning_rate": 5.346534653465347e-06, + "loss": 3.2212, + "num_input_tokens_seen": 410944, + "step": 1945 + }, + { + "epoch": 0.2145214521452145, + "grad_norm": 2.85540771484375, + "learning_rate": 5.360286028602861e-06, + "loss": 3.1492, + "num_input_tokens_seen": 412096, + "step": 1950 + }, + { + "epoch": 0.21507150715071507, + "grad_norm": 3.395864725112915, + "learning_rate": 5.3740374037403745e-06, + "loss": 3.1062, + "num_input_tokens_seen": 413184, + "step": 1955 + }, + { + "epoch": 0.2156215621562156, + "grad_norm": 3.062901258468628, + "learning_rate": 5.387788778877888e-06, + "loss": 3.1718, + "num_input_tokens_seen": 414176, + "step": 1960 + }, + { + "epoch": 0.21617161716171618, + "grad_norm": 3.2230634689331055, + "learning_rate": 5.401540154015402e-06, + "loss": 3.1644, + "num_input_tokens_seen": 415200, + "step": 1965 + }, + { + "epoch": 0.21672167216721672, + "grad_norm": 3.374772310256958, + "learning_rate": 5.415291529152915e-06, + "loss": 3.1487, + "num_input_tokens_seen": 416256, + "step": 1970 + }, + { + "epoch": 0.21727172717271728, + "grad_norm": 2.7261111736297607, + "learning_rate": 5.42904290429043e-06, + "loss": 3.0632, + "num_input_tokens_seen": 417280, + "step": 1975 + }, + { + "epoch": 0.21782178217821782, + "grad_norm": 3.5547120571136475, + "learning_rate": 5.442794279427943e-06, + "loss": 3.1184, + "num_input_tokens_seen": 418304, + "step": 1980 + }, + { + "epoch": 0.21837183718371836, + "grad_norm": 2.9222660064697266, + "learning_rate": 5.456545654565457e-06, + "loss": 3.0706, + "num_input_tokens_seen": 419456, + "step": 1985 + }, + { + "epoch": 0.21892189218921893, + "grad_norm": 3.357764720916748, + "learning_rate": 5.4702970297029705e-06, + "loss": 3.1104, + "num_input_tokens_seen": 420576, + "step": 1990 + }, + { + "epoch": 0.21947194719471946, + "grad_norm": 3.191843032836914, + "learning_rate": 5.484048404840484e-06, + "loss": 3.1684, + "num_input_tokens_seen": 421664, + "step": 1995 + }, + { + "epoch": 0.22002200220022003, + "grad_norm": 3.178685426712036, + "learning_rate": 5.497799779977998e-06, + "loss": 3.139, + "num_input_tokens_seen": 422688, + "step": 2000 + }, + { + "epoch": 0.22057205720572057, + "grad_norm": 3.302856206893921, + "learning_rate": 5.511551155115511e-06, + "loss": 2.959, + "num_input_tokens_seen": 423712, + "step": 2005 + }, + { + "epoch": 0.22112211221122113, + "grad_norm": 3.0574631690979004, + "learning_rate": 5.525302530253026e-06, + "loss": 2.9825, + "num_input_tokens_seen": 424800, + "step": 2010 + }, + { + "epoch": 0.22167216721672167, + "grad_norm": 3.0044031143188477, + "learning_rate": 5.539053905390539e-06, + "loss": 2.9748, + "num_input_tokens_seen": 425920, + "step": 2015 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.3145334720611572, + "learning_rate": 5.552805280528053e-06, + "loss": 3.0094, + "num_input_tokens_seen": 427040, + "step": 2020 + }, + { + "epoch": 0.22277227722772278, + "grad_norm": 2.906985282897949, + "learning_rate": 5.5665566556655665e-06, + "loss": 3.0871, + "num_input_tokens_seen": 428128, + "step": 2025 + }, + { + "epoch": 0.22332233223322331, + "grad_norm": 2.782726287841797, + "learning_rate": 5.580308030803081e-06, + "loss": 2.9765, + "num_input_tokens_seen": 429216, + "step": 2030 + }, + { + "epoch": 0.22387238723872388, + "grad_norm": 3.1587326526641846, + "learning_rate": 5.594059405940594e-06, + "loss": 3.0701, + "num_input_tokens_seen": 430336, + "step": 2035 + }, + { + "epoch": 0.22442244224422442, + "grad_norm": 3.2497663497924805, + "learning_rate": 5.607810781078108e-06, + "loss": 3.0517, + "num_input_tokens_seen": 431424, + "step": 2040 + }, + { + "epoch": 0.22497249724972498, + "grad_norm": 3.4295263290405273, + "learning_rate": 5.6215621562156224e-06, + "loss": 3.0606, + "num_input_tokens_seen": 432448, + "step": 2045 + }, + { + "epoch": 0.22552255225522552, + "grad_norm": 3.033081531524658, + "learning_rate": 5.635313531353136e-06, + "loss": 3.0122, + "num_input_tokens_seen": 433504, + "step": 2050 + }, + { + "epoch": 0.22607260726072606, + "grad_norm": 2.905406951904297, + "learning_rate": 5.64906490649065e-06, + "loss": 2.9753, + "num_input_tokens_seen": 434592, + "step": 2055 + }, + { + "epoch": 0.22662266226622663, + "grad_norm": 3.0226874351501465, + "learning_rate": 5.662816281628163e-06, + "loss": 3.0, + "num_input_tokens_seen": 435712, + "step": 2060 + }, + { + "epoch": 0.22717271727172716, + "grad_norm": 2.7063353061676025, + "learning_rate": 5.676567656765677e-06, + "loss": 2.9661, + "num_input_tokens_seen": 436800, + "step": 2065 + }, + { + "epoch": 0.22772277227722773, + "grad_norm": 3.0009968280792236, + "learning_rate": 5.690319031903191e-06, + "loss": 2.9543, + "num_input_tokens_seen": 437824, + "step": 2070 + }, + { + "epoch": 0.22827282728272827, + "grad_norm": 3.071255683898926, + "learning_rate": 5.704070407040704e-06, + "loss": 2.968, + "num_input_tokens_seen": 438880, + "step": 2075 + }, + { + "epoch": 0.22882288228822883, + "grad_norm": 3.4683051109313965, + "learning_rate": 5.7178217821782184e-06, + "loss": 2.9104, + "num_input_tokens_seen": 439968, + "step": 2080 + }, + { + "epoch": 0.22937293729372937, + "grad_norm": 2.881765365600586, + "learning_rate": 5.731573157315732e-06, + "loss": 2.9433, + "num_input_tokens_seen": 440992, + "step": 2085 + }, + { + "epoch": 0.2299229922992299, + "grad_norm": 2.613255500793457, + "learning_rate": 5.745324532453246e-06, + "loss": 2.9571, + "num_input_tokens_seen": 442080, + "step": 2090 + }, + { + "epoch": 0.23047304730473048, + "grad_norm": 2.8134140968322754, + "learning_rate": 5.759075907590759e-06, + "loss": 2.9133, + "num_input_tokens_seen": 443136, + "step": 2095 + }, + { + "epoch": 0.23102310231023102, + "grad_norm": 3.1498286724090576, + "learning_rate": 5.772827282728273e-06, + "loss": 2.8893, + "num_input_tokens_seen": 444192, + "step": 2100 + }, + { + "epoch": 0.23157315731573158, + "grad_norm": 2.927776336669922, + "learning_rate": 5.786578657865787e-06, + "loss": 2.8613, + "num_input_tokens_seen": 445216, + "step": 2105 + }, + { + "epoch": 0.23212321232123212, + "grad_norm": 2.834317207336426, + "learning_rate": 5.8003300330033e-06, + "loss": 2.8623, + "num_input_tokens_seen": 446176, + "step": 2110 + }, + { + "epoch": 0.23267326732673269, + "grad_norm": 2.7958178520202637, + "learning_rate": 5.814081408140814e-06, + "loss": 2.8415, + "num_input_tokens_seen": 447296, + "step": 2115 + }, + { + "epoch": 0.23322332233223322, + "grad_norm": 2.8665106296539307, + "learning_rate": 5.827832783278328e-06, + "loss": 2.7846, + "num_input_tokens_seen": 448320, + "step": 2120 + }, + { + "epoch": 0.23377337733773376, + "grad_norm": 3.023843765258789, + "learning_rate": 5.841584158415842e-06, + "loss": 2.8654, + "num_input_tokens_seen": 449376, + "step": 2125 + }, + { + "epoch": 0.23432343234323433, + "grad_norm": 2.8402607440948486, + "learning_rate": 5.855335533553355e-06, + "loss": 2.7838, + "num_input_tokens_seen": 450432, + "step": 2130 + }, + { + "epoch": 0.23487348734873487, + "grad_norm": 2.902569532394409, + "learning_rate": 5.8690869086908695e-06, + "loss": 2.8624, + "num_input_tokens_seen": 451520, + "step": 2135 + }, + { + "epoch": 0.23542354235423543, + "grad_norm": 2.902777671813965, + "learning_rate": 5.882838283828383e-06, + "loss": 2.839, + "num_input_tokens_seen": 452608, + "step": 2140 + }, + { + "epoch": 0.23597359735973597, + "grad_norm": 3.0359301567077637, + "learning_rate": 5.896589658965897e-06, + "loss": 2.7613, + "num_input_tokens_seen": 453600, + "step": 2145 + }, + { + "epoch": 0.23652365236523654, + "grad_norm": 2.8809375762939453, + "learning_rate": 5.910341034103411e-06, + "loss": 2.852, + "num_input_tokens_seen": 454656, + "step": 2150 + }, + { + "epoch": 0.23707370737073707, + "grad_norm": 2.6971781253814697, + "learning_rate": 5.924092409240925e-06, + "loss": 2.7968, + "num_input_tokens_seen": 455712, + "step": 2155 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 2.671851634979248, + "learning_rate": 5.937843784378439e-06, + "loss": 2.7158, + "num_input_tokens_seen": 456768, + "step": 2160 + }, + { + "epoch": 0.23817381738173818, + "grad_norm": 2.8054347038269043, + "learning_rate": 5.951595159515952e-06, + "loss": 2.7288, + "num_input_tokens_seen": 457760, + "step": 2165 + }, + { + "epoch": 0.23872387238723872, + "grad_norm": 3.091519355773926, + "learning_rate": 5.9653465346534655e-06, + "loss": 2.7407, + "num_input_tokens_seen": 458784, + "step": 2170 + }, + { + "epoch": 0.23927392739273928, + "grad_norm": 3.107264518737793, + "learning_rate": 5.97909790979098e-06, + "loss": 2.687, + "num_input_tokens_seen": 459840, + "step": 2175 + }, + { + "epoch": 0.23982398239823982, + "grad_norm": 2.9590842723846436, + "learning_rate": 5.992849284928493e-06, + "loss": 2.6808, + "num_input_tokens_seen": 460896, + "step": 2180 + }, + { + "epoch": 0.2403740374037404, + "grad_norm": 3.09645414352417, + "learning_rate": 6.006600660066007e-06, + "loss": 2.7467, + "num_input_tokens_seen": 461952, + "step": 2185 + }, + { + "epoch": 0.24092409240924093, + "grad_norm": 2.984297275543213, + "learning_rate": 6.020352035203521e-06, + "loss": 2.6846, + "num_input_tokens_seen": 463040, + "step": 2190 + }, + { + "epoch": 0.24147414741474146, + "grad_norm": 2.8919663429260254, + "learning_rate": 6.034103410341034e-06, + "loss": 2.7228, + "num_input_tokens_seen": 464064, + "step": 2195 + }, + { + "epoch": 0.24202420242024203, + "grad_norm": 3.472463846206665, + "learning_rate": 6.047854785478548e-06, + "loss": 2.5919, + "num_input_tokens_seen": 465184, + "step": 2200 + }, + { + "epoch": 0.24257425742574257, + "grad_norm": 2.8606150150299072, + "learning_rate": 6.0616061606160615e-06, + "loss": 2.6287, + "num_input_tokens_seen": 466240, + "step": 2205 + }, + { + "epoch": 0.24312431243124313, + "grad_norm": 3.1581246852874756, + "learning_rate": 6.075357535753576e-06, + "loss": 2.6169, + "num_input_tokens_seen": 467264, + "step": 2210 + }, + { + "epoch": 0.24367436743674367, + "grad_norm": 3.19645619392395, + "learning_rate": 6.089108910891089e-06, + "loss": 2.6332, + "num_input_tokens_seen": 468320, + "step": 2215 + }, + { + "epoch": 0.24422442244224424, + "grad_norm": 2.9265575408935547, + "learning_rate": 6.102860286028603e-06, + "loss": 2.6706, + "num_input_tokens_seen": 469376, + "step": 2220 + }, + { + "epoch": 0.24477447744774478, + "grad_norm": 3.3517985343933105, + "learning_rate": 6.1166116611661166e-06, + "loss": 2.6522, + "num_input_tokens_seen": 470464, + "step": 2225 + }, + { + "epoch": 0.24532453245324531, + "grad_norm": 3.0687789916992188, + "learning_rate": 6.13036303630363e-06, + "loss": 2.6172, + "num_input_tokens_seen": 471552, + "step": 2230 + }, + { + "epoch": 0.24587458745874588, + "grad_norm": 2.536503314971924, + "learning_rate": 6.144114411441144e-06, + "loss": 2.6658, + "num_input_tokens_seen": 472672, + "step": 2235 + }, + { + "epoch": 0.24642464246424642, + "grad_norm": 3.258240222930908, + "learning_rate": 6.157865786578658e-06, + "loss": 2.6149, + "num_input_tokens_seen": 473664, + "step": 2240 + }, + { + "epoch": 0.24697469746974698, + "grad_norm": 3.178908586502075, + "learning_rate": 6.171617161716172e-06, + "loss": 2.5302, + "num_input_tokens_seen": 474784, + "step": 2245 + }, + { + "epoch": 0.24752475247524752, + "grad_norm": 2.9465949535369873, + "learning_rate": 6.185368536853686e-06, + "loss": 2.5069, + "num_input_tokens_seen": 475808, + "step": 2250 + }, + { + "epoch": 0.2480748074807481, + "grad_norm": 3.0862748622894287, + "learning_rate": 6.1991199119912e-06, + "loss": 2.5469, + "num_input_tokens_seen": 476928, + "step": 2255 + }, + { + "epoch": 0.24862486248624863, + "grad_norm": 3.463338613510132, + "learning_rate": 6.212871287128713e-06, + "loss": 2.5927, + "num_input_tokens_seen": 477984, + "step": 2260 + }, + { + "epoch": 0.24917491749174916, + "grad_norm": 2.6893062591552734, + "learning_rate": 6.226622662266227e-06, + "loss": 2.5135, + "num_input_tokens_seen": 479040, + "step": 2265 + }, + { + "epoch": 0.24972497249724973, + "grad_norm": 3.124992847442627, + "learning_rate": 6.240374037403741e-06, + "loss": 2.4856, + "num_input_tokens_seen": 480160, + "step": 2270 + }, + { + "epoch": 0.25027502750275027, + "grad_norm": 3.4490697383880615, + "learning_rate": 6.254125412541255e-06, + "loss": 2.4731, + "num_input_tokens_seen": 481216, + "step": 2275 + }, + { + "epoch": 0.2508250825082508, + "grad_norm": 2.883293867111206, + "learning_rate": 6.2678767876787685e-06, + "loss": 2.4549, + "num_input_tokens_seen": 482272, + "step": 2280 + }, + { + "epoch": 0.2513751375137514, + "grad_norm": 2.8643085956573486, + "learning_rate": 6.281628162816282e-06, + "loss": 2.4699, + "num_input_tokens_seen": 483328, + "step": 2285 + }, + { + "epoch": 0.25192519251925194, + "grad_norm": 2.903043270111084, + "learning_rate": 6.295379537953795e-06, + "loss": 2.4472, + "num_input_tokens_seen": 484480, + "step": 2290 + }, + { + "epoch": 0.2524752475247525, + "grad_norm": 3.243638753890991, + "learning_rate": 6.30913091309131e-06, + "loss": 2.4857, + "num_input_tokens_seen": 485600, + "step": 2295 + }, + { + "epoch": 0.253025302530253, + "grad_norm": 3.023465633392334, + "learning_rate": 6.322882288228824e-06, + "loss": 2.3885, + "num_input_tokens_seen": 486624, + "step": 2300 + }, + { + "epoch": 0.25357535753575355, + "grad_norm": 2.876929521560669, + "learning_rate": 6.336633663366337e-06, + "loss": 2.3375, + "num_input_tokens_seen": 487712, + "step": 2305 + }, + { + "epoch": 0.25412541254125415, + "grad_norm": 2.959702968597412, + "learning_rate": 6.35038503850385e-06, + "loss": 2.3505, + "num_input_tokens_seen": 488768, + "step": 2310 + }, + { + "epoch": 0.2546754675467547, + "grad_norm": 3.189366102218628, + "learning_rate": 6.364136413641364e-06, + "loss": 2.3711, + "num_input_tokens_seen": 489856, + "step": 2315 + }, + { + "epoch": 0.2552255225522552, + "grad_norm": 3.2420411109924316, + "learning_rate": 6.377887788778879e-06, + "loss": 2.3953, + "num_input_tokens_seen": 490848, + "step": 2320 + }, + { + "epoch": 0.25577557755775576, + "grad_norm": 3.133049964904785, + "learning_rate": 6.391639163916392e-06, + "loss": 2.3881, + "num_input_tokens_seen": 491904, + "step": 2325 + }, + { + "epoch": 0.2563256325632563, + "grad_norm": 3.1346378326416016, + "learning_rate": 6.405390539053905e-06, + "loss": 2.3054, + "num_input_tokens_seen": 493056, + "step": 2330 + }, + { + "epoch": 0.2568756875687569, + "grad_norm": 2.9671900272369385, + "learning_rate": 6.419141914191419e-06, + "loss": 2.2839, + "num_input_tokens_seen": 494112, + "step": 2335 + }, + { + "epoch": 0.25742574257425743, + "grad_norm": 2.8681812286376953, + "learning_rate": 6.432893289328933e-06, + "loss": 2.2671, + "num_input_tokens_seen": 495168, + "step": 2340 + }, + { + "epoch": 0.25797579757975797, + "grad_norm": 2.867429733276367, + "learning_rate": 6.446644664466447e-06, + "loss": 2.2827, + "num_input_tokens_seen": 496224, + "step": 2345 + }, + { + "epoch": 0.2585258525852585, + "grad_norm": 2.878242254257202, + "learning_rate": 6.4603960396039605e-06, + "loss": 2.2421, + "num_input_tokens_seen": 497248, + "step": 2350 + }, + { + "epoch": 0.2590759075907591, + "grad_norm": 2.9429125785827637, + "learning_rate": 6.474147414741475e-06, + "loss": 2.229, + "num_input_tokens_seen": 498240, + "step": 2355 + }, + { + "epoch": 0.25962596259625964, + "grad_norm": 2.918020725250244, + "learning_rate": 6.487898789878988e-06, + "loss": 2.2353, + "num_input_tokens_seen": 499296, + "step": 2360 + }, + { + "epoch": 0.2601760176017602, + "grad_norm": 3.0526156425476074, + "learning_rate": 6.501650165016503e-06, + "loss": 2.2008, + "num_input_tokens_seen": 500320, + "step": 2365 + }, + { + "epoch": 0.2607260726072607, + "grad_norm": 3.0505754947662354, + "learning_rate": 6.515401540154016e-06, + "loss": 2.0867, + "num_input_tokens_seen": 501376, + "step": 2370 + }, + { + "epoch": 0.26127612761276126, + "grad_norm": 2.7154290676116943, + "learning_rate": 6.52915291529153e-06, + "loss": 2.225, + "num_input_tokens_seen": 502400, + "step": 2375 + }, + { + "epoch": 0.26182618261826185, + "grad_norm": 3.1390199661254883, + "learning_rate": 6.542904290429043e-06, + "loss": 2.2374, + "num_input_tokens_seen": 503552, + "step": 2380 + }, + { + "epoch": 0.2623762376237624, + "grad_norm": 3.1363956928253174, + "learning_rate": 6.5566556655665565e-06, + "loss": 2.196, + "num_input_tokens_seen": 504640, + "step": 2385 + }, + { + "epoch": 0.2629262926292629, + "grad_norm": 2.8504273891448975, + "learning_rate": 6.5704070407040715e-06, + "loss": 2.1668, + "num_input_tokens_seen": 505696, + "step": 2390 + }, + { + "epoch": 0.26347634763476346, + "grad_norm": 3.520231246948242, + "learning_rate": 6.584158415841585e-06, + "loss": 2.2206, + "num_input_tokens_seen": 506816, + "step": 2395 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 2.9290804862976074, + "learning_rate": 6.597909790979098e-06, + "loss": 2.1025, + "num_input_tokens_seen": 507808, + "step": 2400 + }, + { + "epoch": 0.2645764576457646, + "grad_norm": 3.565763473510742, + "learning_rate": 6.6116611661166116e-06, + "loss": 2.1657, + "num_input_tokens_seen": 508864, + "step": 2405 + }, + { + "epoch": 0.26512651265126513, + "grad_norm": 2.903932571411133, + "learning_rate": 6.625412541254125e-06, + "loss": 2.0244, + "num_input_tokens_seen": 509952, + "step": 2410 + }, + { + "epoch": 0.26567656765676567, + "grad_norm": 3.105470657348633, + "learning_rate": 6.63916391639164e-06, + "loss": 2.0603, + "num_input_tokens_seen": 511008, + "step": 2415 + }, + { + "epoch": 0.2662266226622662, + "grad_norm": 2.82852840423584, + "learning_rate": 6.652915291529153e-06, + "loss": 1.989, + "num_input_tokens_seen": 512032, + "step": 2420 + }, + { + "epoch": 0.2667766776677668, + "grad_norm": 3.1962361335754395, + "learning_rate": 6.666666666666667e-06, + "loss": 2.0054, + "num_input_tokens_seen": 513088, + "step": 2425 + }, + { + "epoch": 0.26732673267326734, + "grad_norm": 2.853318452835083, + "learning_rate": 6.68041804180418e-06, + "loss": 1.9166, + "num_input_tokens_seen": 514176, + "step": 2430 + }, + { + "epoch": 0.2678767876787679, + "grad_norm": 2.844691276550293, + "learning_rate": 6.694169416941694e-06, + "loss": 1.9873, + "num_input_tokens_seen": 515200, + "step": 2435 + }, + { + "epoch": 0.2684268426842684, + "grad_norm": 2.9607155323028564, + "learning_rate": 6.707920792079208e-06, + "loss": 1.9823, + "num_input_tokens_seen": 516256, + "step": 2440 + }, + { + "epoch": 0.26897689768976896, + "grad_norm": 3.0345458984375, + "learning_rate": 6.721672167216722e-06, + "loss": 1.8981, + "num_input_tokens_seen": 517312, + "step": 2445 + }, + { + "epoch": 0.26952695269526955, + "grad_norm": 3.3760290145874023, + "learning_rate": 6.735423542354236e-06, + "loss": 2.0438, + "num_input_tokens_seen": 518368, + "step": 2450 + }, + { + "epoch": 0.2700770077007701, + "grad_norm": 2.902359962463379, + "learning_rate": 6.749174917491749e-06, + "loss": 1.91, + "num_input_tokens_seen": 519424, + "step": 2455 + }, + { + "epoch": 0.2706270627062706, + "grad_norm": 3.145023822784424, + "learning_rate": 6.7629262926292635e-06, + "loss": 1.8832, + "num_input_tokens_seen": 520512, + "step": 2460 + }, + { + "epoch": 0.27117711771177117, + "grad_norm": 3.1985929012298584, + "learning_rate": 6.776677667766778e-06, + "loss": 1.8601, + "num_input_tokens_seen": 521504, + "step": 2465 + }, + { + "epoch": 0.2717271727172717, + "grad_norm": 2.9308528900146484, + "learning_rate": 6.790429042904291e-06, + "loss": 1.8649, + "num_input_tokens_seen": 522528, + "step": 2470 + }, + { + "epoch": 0.2722772277227723, + "grad_norm": 3.2469921112060547, + "learning_rate": 6.804180418041804e-06, + "loss": 1.9403, + "num_input_tokens_seen": 523616, + "step": 2475 + }, + { + "epoch": 0.27282728272827284, + "grad_norm": 3.0033719539642334, + "learning_rate": 6.817931793179318e-06, + "loss": 1.8821, + "num_input_tokens_seen": 524672, + "step": 2480 + }, + { + "epoch": 0.2733773377337734, + "grad_norm": 2.8818747997283936, + "learning_rate": 6.831683168316833e-06, + "loss": 1.796, + "num_input_tokens_seen": 525696, + "step": 2485 + }, + { + "epoch": 0.2739273927392739, + "grad_norm": 2.9688663482666016, + "learning_rate": 6.845434543454346e-06, + "loss": 1.7677, + "num_input_tokens_seen": 526784, + "step": 2490 + }, + { + "epoch": 0.27447744774477445, + "grad_norm": 2.824671983718872, + "learning_rate": 6.8591859185918595e-06, + "loss": 1.8677, + "num_input_tokens_seen": 527776, + "step": 2495 + }, + { + "epoch": 0.27502750275027504, + "grad_norm": 3.0770938396453857, + "learning_rate": 6.872937293729373e-06, + "loss": 1.8275, + "num_input_tokens_seen": 528832, + "step": 2500 + }, + { + "epoch": 0.2755775577557756, + "grad_norm": 2.464324951171875, + "learning_rate": 6.886688668866886e-06, + "loss": 1.7764, + "num_input_tokens_seen": 529920, + "step": 2505 + }, + { + "epoch": 0.2761276127612761, + "grad_norm": 3.3467352390289307, + "learning_rate": 6.900440044004401e-06, + "loss": 1.7844, + "num_input_tokens_seen": 530944, + "step": 2510 + }, + { + "epoch": 0.27667766776677666, + "grad_norm": 3.408874273300171, + "learning_rate": 6.9141914191419146e-06, + "loss": 1.6817, + "num_input_tokens_seen": 532032, + "step": 2515 + }, + { + "epoch": 0.27722772277227725, + "grad_norm": 2.7968804836273193, + "learning_rate": 6.927942794279428e-06, + "loss": 1.7105, + "num_input_tokens_seen": 533120, + "step": 2520 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 3.208728551864624, + "learning_rate": 6.941694169416941e-06, + "loss": 1.6749, + "num_input_tokens_seen": 534208, + "step": 2525 + }, + { + "epoch": 0.27832783278327833, + "grad_norm": 3.1156399250030518, + "learning_rate": 6.955445544554456e-06, + "loss": 1.7482, + "num_input_tokens_seen": 535232, + "step": 2530 + }, + { + "epoch": 0.27887788778877887, + "grad_norm": 2.8717846870422363, + "learning_rate": 6.96919691969197e-06, + "loss": 1.6794, + "num_input_tokens_seen": 536288, + "step": 2535 + }, + { + "epoch": 0.2794279427942794, + "grad_norm": 3.3600635528564453, + "learning_rate": 6.982948294829483e-06, + "loss": 1.7517, + "num_input_tokens_seen": 537376, + "step": 2540 + }, + { + "epoch": 0.27997799779978, + "grad_norm": 3.0480382442474365, + "learning_rate": 6.996699669966996e-06, + "loss": 1.6997, + "num_input_tokens_seen": 538432, + "step": 2545 + }, + { + "epoch": 0.28052805280528054, + "grad_norm": 2.981518030166626, + "learning_rate": 7.0104510451045105e-06, + "loss": 1.652, + "num_input_tokens_seen": 539520, + "step": 2550 + }, + { + "epoch": 0.2810781078107811, + "grad_norm": 2.8259880542755127, + "learning_rate": 7.024202420242025e-06, + "loss": 1.629, + "num_input_tokens_seen": 540576, + "step": 2555 + }, + { + "epoch": 0.2816281628162816, + "grad_norm": 2.95198392868042, + "learning_rate": 7.037953795379539e-06, + "loss": 1.5845, + "num_input_tokens_seen": 541632, + "step": 2560 + }, + { + "epoch": 0.28217821782178215, + "grad_norm": 2.904588460922241, + "learning_rate": 7.051705170517052e-06, + "loss": 1.5555, + "num_input_tokens_seen": 542688, + "step": 2565 + }, + { + "epoch": 0.28272827282728275, + "grad_norm": 2.9085209369659424, + "learning_rate": 7.065456545654566e-06, + "loss": 1.6028, + "num_input_tokens_seen": 543744, + "step": 2570 + }, + { + "epoch": 0.2832783278327833, + "grad_norm": 2.719245672225952, + "learning_rate": 7.079207920792079e-06, + "loss": 1.4914, + "num_input_tokens_seen": 544736, + "step": 2575 + }, + { + "epoch": 0.2838283828382838, + "grad_norm": 2.8909668922424316, + "learning_rate": 7.092959295929594e-06, + "loss": 1.5389, + "num_input_tokens_seen": 545824, + "step": 2580 + }, + { + "epoch": 0.28437843784378436, + "grad_norm": 2.945420742034912, + "learning_rate": 7.106710671067107e-06, + "loss": 1.5166, + "num_input_tokens_seen": 546880, + "step": 2585 + }, + { + "epoch": 0.28492849284928495, + "grad_norm": 3.152568817138672, + "learning_rate": 7.120462046204621e-06, + "loss": 1.5438, + "num_input_tokens_seen": 548000, + "step": 2590 + }, + { + "epoch": 0.2854785478547855, + "grad_norm": 3.0932319164276123, + "learning_rate": 7.134213421342134e-06, + "loss": 1.4628, + "num_input_tokens_seen": 549120, + "step": 2595 + }, + { + "epoch": 0.28602860286028603, + "grad_norm": 3.026383399963379, + "learning_rate": 7.147964796479649e-06, + "loss": 1.4825, + "num_input_tokens_seen": 550208, + "step": 2600 + }, + { + "epoch": 0.28657865786578657, + "grad_norm": 2.9746031761169434, + "learning_rate": 7.1617161716171625e-06, + "loss": 1.4593, + "num_input_tokens_seen": 551264, + "step": 2605 + }, + { + "epoch": 0.2871287128712871, + "grad_norm": 2.943903923034668, + "learning_rate": 7.175467546754676e-06, + "loss": 1.587, + "num_input_tokens_seen": 552320, + "step": 2610 + }, + { + "epoch": 0.2876787678767877, + "grad_norm": 2.8447115421295166, + "learning_rate": 7.189218921892189e-06, + "loss": 1.4495, + "num_input_tokens_seen": 553312, + "step": 2615 + }, + { + "epoch": 0.28822882288228824, + "grad_norm": 3.095100164413452, + "learning_rate": 7.2029702970297025e-06, + "loss": 1.5797, + "num_input_tokens_seen": 554368, + "step": 2620 + }, + { + "epoch": 0.2887788778877888, + "grad_norm": 3.3841874599456787, + "learning_rate": 7.2167216721672176e-06, + "loss": 1.412, + "num_input_tokens_seen": 555392, + "step": 2625 + }, + { + "epoch": 0.2893289328932893, + "grad_norm": 2.949648380279541, + "learning_rate": 7.230473047304731e-06, + "loss": 1.4025, + "num_input_tokens_seen": 556384, + "step": 2630 + }, + { + "epoch": 0.28987898789878985, + "grad_norm": 2.845747709274292, + "learning_rate": 7.244224422442244e-06, + "loss": 1.2972, + "num_input_tokens_seen": 557472, + "step": 2635 + }, + { + "epoch": 0.29042904290429045, + "grad_norm": 3.237241506576538, + "learning_rate": 7.257975797579758e-06, + "loss": 1.372, + "num_input_tokens_seen": 558528, + "step": 2640 + }, + { + "epoch": 0.290979097909791, + "grad_norm": 2.76768159866333, + "learning_rate": 7.271727172717272e-06, + "loss": 1.3402, + "num_input_tokens_seen": 559520, + "step": 2645 + }, + { + "epoch": 0.2915291529152915, + "grad_norm": 2.807544469833374, + "learning_rate": 7.285478547854786e-06, + "loss": 1.3813, + "num_input_tokens_seen": 560512, + "step": 2650 + }, + { + "epoch": 0.29207920792079206, + "grad_norm": 2.8720791339874268, + "learning_rate": 7.299229922992299e-06, + "loss": 1.3375, + "num_input_tokens_seen": 561568, + "step": 2655 + }, + { + "epoch": 0.29262926292629265, + "grad_norm": 2.716076374053955, + "learning_rate": 7.3129812981298136e-06, + "loss": 1.2308, + "num_input_tokens_seen": 562528, + "step": 2660 + }, + { + "epoch": 0.2931793179317932, + "grad_norm": 2.905975341796875, + "learning_rate": 7.326732673267327e-06, + "loss": 1.3154, + "num_input_tokens_seen": 563552, + "step": 2665 + }, + { + "epoch": 0.29372937293729373, + "grad_norm": 2.739940643310547, + "learning_rate": 7.340484048404841e-06, + "loss": 1.2789, + "num_input_tokens_seen": 564576, + "step": 2670 + }, + { + "epoch": 0.29427942794279427, + "grad_norm": 2.8067896366119385, + "learning_rate": 7.354235423542355e-06, + "loss": 1.2558, + "num_input_tokens_seen": 565696, + "step": 2675 + }, + { + "epoch": 0.2948294829482948, + "grad_norm": 2.9605917930603027, + "learning_rate": 7.367986798679869e-06, + "loss": 1.2033, + "num_input_tokens_seen": 566688, + "step": 2680 + }, + { + "epoch": 0.2953795379537954, + "grad_norm": 2.829892635345459, + "learning_rate": 7.381738173817382e-06, + "loss": 1.1678, + "num_input_tokens_seen": 567712, + "step": 2685 + }, + { + "epoch": 0.29592959295929594, + "grad_norm": 2.7876510620117188, + "learning_rate": 7.395489548954895e-06, + "loss": 1.1604, + "num_input_tokens_seen": 568864, + "step": 2690 + }, + { + "epoch": 0.2964796479647965, + "grad_norm": 2.892385482788086, + "learning_rate": 7.40924092409241e-06, + "loss": 1.1496, + "num_input_tokens_seen": 569888, + "step": 2695 + }, + { + "epoch": 0.297029702970297, + "grad_norm": 3.0789384841918945, + "learning_rate": 7.422992299229924e-06, + "loss": 1.2195, + "num_input_tokens_seen": 570944, + "step": 2700 + }, + { + "epoch": 0.29757975797579755, + "grad_norm": 2.784221887588501, + "learning_rate": 7.436743674367437e-06, + "loss": 1.1571, + "num_input_tokens_seen": 572000, + "step": 2705 + }, + { + "epoch": 0.29812981298129815, + "grad_norm": 2.879195213317871, + "learning_rate": 7.4504950495049504e-06, + "loss": 1.1751, + "num_input_tokens_seen": 572992, + "step": 2710 + }, + { + "epoch": 0.2986798679867987, + "grad_norm": 2.7749152183532715, + "learning_rate": 7.464246424642464e-06, + "loss": 1.0753, + "num_input_tokens_seen": 574048, + "step": 2715 + }, + { + "epoch": 0.2992299229922992, + "grad_norm": 2.8498470783233643, + "learning_rate": 7.477997799779979e-06, + "loss": 1.108, + "num_input_tokens_seen": 575136, + "step": 2720 + }, + { + "epoch": 0.29977997799779976, + "grad_norm": 2.70365047454834, + "learning_rate": 7.491749174917492e-06, + "loss": 1.0868, + "num_input_tokens_seen": 576160, + "step": 2725 + }, + { + "epoch": 0.30033003300330036, + "grad_norm": 2.744832992553711, + "learning_rate": 7.5055005500550055e-06, + "loss": 1.1336, + "num_input_tokens_seen": 577184, + "step": 2730 + }, + { + "epoch": 0.3008800880088009, + "grad_norm": 2.8724160194396973, + "learning_rate": 7.519251925192519e-06, + "loss": 1.1232, + "num_input_tokens_seen": 578208, + "step": 2735 + }, + { + "epoch": 0.30143014301430143, + "grad_norm": 3.026425361633301, + "learning_rate": 7.533003300330034e-06, + "loss": 1.0396, + "num_input_tokens_seen": 579296, + "step": 2740 + }, + { + "epoch": 0.30198019801980197, + "grad_norm": 2.6834828853607178, + "learning_rate": 7.546754675467547e-06, + "loss": 1.0207, + "num_input_tokens_seen": 580320, + "step": 2745 + }, + { + "epoch": 0.3025302530253025, + "grad_norm": 2.792711019515991, + "learning_rate": 7.560506050605061e-06, + "loss": 1.0337, + "num_input_tokens_seen": 581344, + "step": 2750 + }, + { + "epoch": 0.3030803080308031, + "grad_norm": 2.9808197021484375, + "learning_rate": 7.574257425742575e-06, + "loss": 1.0657, + "num_input_tokens_seen": 582368, + "step": 2755 + }, + { + "epoch": 0.30363036303630364, + "grad_norm": 2.7683804035186768, + "learning_rate": 7.588008800880088e-06, + "loss": 0.9606, + "num_input_tokens_seen": 583424, + "step": 2760 + }, + { + "epoch": 0.3041804180418042, + "grad_norm": 2.763310432434082, + "learning_rate": 7.601760176017602e-06, + "loss": 0.9683, + "num_input_tokens_seen": 584512, + "step": 2765 + }, + { + "epoch": 0.3047304730473047, + "grad_norm": 2.7956044673919678, + "learning_rate": 7.6155115511551166e-06, + "loss": 0.9246, + "num_input_tokens_seen": 585536, + "step": 2770 + }, + { + "epoch": 0.30528052805280526, + "grad_norm": 2.5235464572906494, + "learning_rate": 7.62926292629263e-06, + "loss": 0.9631, + "num_input_tokens_seen": 586624, + "step": 2775 + }, + { + "epoch": 0.30583058305830585, + "grad_norm": 2.9556288719177246, + "learning_rate": 7.643014301430143e-06, + "loss": 0.9781, + "num_input_tokens_seen": 587712, + "step": 2780 + }, + { + "epoch": 0.3063806380638064, + "grad_norm": 2.8574459552764893, + "learning_rate": 7.656765676567657e-06, + "loss": 0.9028, + "num_input_tokens_seen": 588736, + "step": 2785 + }, + { + "epoch": 0.3069306930693069, + "grad_norm": 2.2778213024139404, + "learning_rate": 7.670517051705172e-06, + "loss": 0.7857, + "num_input_tokens_seen": 589792, + "step": 2790 + }, + { + "epoch": 0.30748074807480746, + "grad_norm": 2.5853662490844727, + "learning_rate": 7.684268426842685e-06, + "loss": 0.9243, + "num_input_tokens_seen": 590912, + "step": 2795 + }, + { + "epoch": 0.30803080308030806, + "grad_norm": 2.3381218910217285, + "learning_rate": 7.698019801980198e-06, + "loss": 0.8511, + "num_input_tokens_seen": 591968, + "step": 2800 + }, + { + "epoch": 0.3085808580858086, + "grad_norm": 2.864701509475708, + "learning_rate": 7.711771177117712e-06, + "loss": 0.8741, + "num_input_tokens_seen": 593024, + "step": 2805 + }, + { + "epoch": 0.30913091309130913, + "grad_norm": 2.491154193878174, + "learning_rate": 7.725522552255227e-06, + "loss": 0.7625, + "num_input_tokens_seen": 594048, + "step": 2810 + }, + { + "epoch": 0.3096809680968097, + "grad_norm": 2.7249228954315186, + "learning_rate": 7.73927392739274e-06, + "loss": 0.8602, + "num_input_tokens_seen": 595072, + "step": 2815 + }, + { + "epoch": 0.3102310231023102, + "grad_norm": 2.677046775817871, + "learning_rate": 7.753025302530253e-06, + "loss": 0.8144, + "num_input_tokens_seen": 596096, + "step": 2820 + }, + { + "epoch": 0.3107810781078108, + "grad_norm": 2.5760302543640137, + "learning_rate": 7.766776677667767e-06, + "loss": 0.8549, + "num_input_tokens_seen": 597184, + "step": 2825 + }, + { + "epoch": 0.31133113311331134, + "grad_norm": 2.4198074340820312, + "learning_rate": 7.78052805280528e-06, + "loss": 0.7722, + "num_input_tokens_seen": 598240, + "step": 2830 + }, + { + "epoch": 0.3118811881188119, + "grad_norm": 2.584496021270752, + "learning_rate": 7.794279427942795e-06, + "loss": 0.7661, + "num_input_tokens_seen": 599328, + "step": 2835 + }, + { + "epoch": 0.3124312431243124, + "grad_norm": 2.327914237976074, + "learning_rate": 7.808030803080309e-06, + "loss": 0.7026, + "num_input_tokens_seen": 600384, + "step": 2840 + }, + { + "epoch": 0.31298129812981296, + "grad_norm": 2.382534980773926, + "learning_rate": 7.821782178217822e-06, + "loss": 0.7485, + "num_input_tokens_seen": 601472, + "step": 2845 + }, + { + "epoch": 0.31353135313531355, + "grad_norm": 2.552241086959839, + "learning_rate": 7.835533553355335e-06, + "loss": 0.7902, + "num_input_tokens_seen": 602560, + "step": 2850 + }, + { + "epoch": 0.3140814081408141, + "grad_norm": 2.658682107925415, + "learning_rate": 7.849284928492849e-06, + "loss": 0.6858, + "num_input_tokens_seen": 603584, + "step": 2855 + }, + { + "epoch": 0.3146314631463146, + "grad_norm": 2.3638100624084473, + "learning_rate": 7.863036303630364e-06, + "loss": 0.7011, + "num_input_tokens_seen": 604640, + "step": 2860 + }, + { + "epoch": 0.31518151815181517, + "grad_norm": 2.374424457550049, + "learning_rate": 7.876787678767877e-06, + "loss": 0.6309, + "num_input_tokens_seen": 605760, + "step": 2865 + }, + { + "epoch": 0.31573157315731576, + "grad_norm": 2.218489408493042, + "learning_rate": 7.89053905390539e-06, + "loss": 0.6317, + "num_input_tokens_seen": 606848, + "step": 2870 + }, + { + "epoch": 0.3162816281628163, + "grad_norm": 2.118344306945801, + "learning_rate": 7.904290429042904e-06, + "loss": 0.6261, + "num_input_tokens_seen": 607904, + "step": 2875 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 2.0242421627044678, + "learning_rate": 7.918041804180417e-06, + "loss": 0.6798, + "num_input_tokens_seen": 608992, + "step": 2880 + }, + { + "epoch": 0.3173817381738174, + "grad_norm": 1.8742400407791138, + "learning_rate": 7.931793179317932e-06, + "loss": 0.6284, + "num_input_tokens_seen": 610016, + "step": 2885 + }, + { + "epoch": 0.3179317931793179, + "grad_norm": 1.9068659543991089, + "learning_rate": 7.945544554455445e-06, + "loss": 0.6523, + "num_input_tokens_seen": 611072, + "step": 2890 + }, + { + "epoch": 0.3184818481848185, + "grad_norm": 2.5069642066955566, + "learning_rate": 7.959295929592959e-06, + "loss": 0.6137, + "num_input_tokens_seen": 612160, + "step": 2895 + }, + { + "epoch": 0.31903190319031904, + "grad_norm": 2.464470386505127, + "learning_rate": 7.973047304730472e-06, + "loss": 0.8761, + "num_input_tokens_seen": 613216, + "step": 2900 + }, + { + "epoch": 0.3195819581958196, + "grad_norm": 2.5502116680145264, + "learning_rate": 7.986798679867987e-06, + "loss": 0.6466, + "num_input_tokens_seen": 614304, + "step": 2905 + }, + { + "epoch": 0.3201320132013201, + "grad_norm": 2.306168556213379, + "learning_rate": 8.0005500550055e-06, + "loss": 0.6578, + "num_input_tokens_seen": 615360, + "step": 2910 + }, + { + "epoch": 0.32068206820682066, + "grad_norm": 1.6901966333389282, + "learning_rate": 8.014301430143016e-06, + "loss": 0.555, + "num_input_tokens_seen": 616416, + "step": 2915 + }, + { + "epoch": 0.32123212321232125, + "grad_norm": 2.665722131729126, + "learning_rate": 8.028052805280529e-06, + "loss": 0.6429, + "num_input_tokens_seen": 617472, + "step": 2920 + }, + { + "epoch": 0.3217821782178218, + "grad_norm": 1.5735477209091187, + "learning_rate": 8.041804180418042e-06, + "loss": 0.563, + "num_input_tokens_seen": 618528, + "step": 2925 + }, + { + "epoch": 0.32233223322332233, + "grad_norm": 2.0944437980651855, + "learning_rate": 8.055555555555557e-06, + "loss": 0.5662, + "num_input_tokens_seen": 619648, + "step": 2930 + }, + { + "epoch": 0.32288228822882287, + "grad_norm": 2.4431891441345215, + "learning_rate": 8.06930693069307e-06, + "loss": 0.6746, + "num_input_tokens_seen": 620704, + "step": 2935 + }, + { + "epoch": 0.3234323432343234, + "grad_norm": 1.4499530792236328, + "learning_rate": 8.083058305830584e-06, + "loss": 0.5746, + "num_input_tokens_seen": 621760, + "step": 2940 + }, + { + "epoch": 0.323982398239824, + "grad_norm": 1.985046148300171, + "learning_rate": 8.096809680968097e-06, + "loss": 0.4847, + "num_input_tokens_seen": 622880, + "step": 2945 + }, + { + "epoch": 0.32453245324532454, + "grad_norm": 2.098906993865967, + "learning_rate": 8.11056105610561e-06, + "loss": 0.578, + "num_input_tokens_seen": 623904, + "step": 2950 + }, + { + "epoch": 0.3250825082508251, + "grad_norm": 1.8467786312103271, + "learning_rate": 8.124312431243126e-06, + "loss": 0.4722, + "num_input_tokens_seen": 625056, + "step": 2955 + }, + { + "epoch": 0.3256325632563256, + "grad_norm": 1.4393550157546997, + "learning_rate": 8.138063806380639e-06, + "loss": 0.4901, + "num_input_tokens_seen": 626176, + "step": 2960 + }, + { + "epoch": 0.3261826182618262, + "grad_norm": 1.4720516204833984, + "learning_rate": 8.151815181518152e-06, + "loss": 0.437, + "num_input_tokens_seen": 627200, + "step": 2965 + }, + { + "epoch": 0.32673267326732675, + "grad_norm": 1.5977435111999512, + "learning_rate": 8.165566556655666e-06, + "loss": 0.4396, + "num_input_tokens_seen": 628256, + "step": 2970 + }, + { + "epoch": 0.3272827282728273, + "grad_norm": 1.8251304626464844, + "learning_rate": 8.17931793179318e-06, + "loss": 0.4583, + "num_input_tokens_seen": 629376, + "step": 2975 + }, + { + "epoch": 0.3278327832783278, + "grad_norm": 1.1934912204742432, + "learning_rate": 8.193069306930694e-06, + "loss": 0.4615, + "num_input_tokens_seen": 630432, + "step": 2980 + }, + { + "epoch": 0.32838283828382836, + "grad_norm": 1.7355691194534302, + "learning_rate": 8.206820682068208e-06, + "loss": 0.429, + "num_input_tokens_seen": 631520, + "step": 2985 + }, + { + "epoch": 0.32893289328932895, + "grad_norm": 1.440503478050232, + "learning_rate": 8.220572057205721e-06, + "loss": 0.5213, + "num_input_tokens_seen": 632608, + "step": 2990 + }, + { + "epoch": 0.3294829482948295, + "grad_norm": 1.5559970140457153, + "learning_rate": 8.234323432343234e-06, + "loss": 0.4118, + "num_input_tokens_seen": 633664, + "step": 2995 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 1.817106008529663, + "learning_rate": 8.24807480748075e-06, + "loss": 0.4487, + "num_input_tokens_seen": 634688, + "step": 3000 + }, + { + "epoch": 0.33058305830583057, + "grad_norm": 1.522566795349121, + "learning_rate": 8.261826182618263e-06, + "loss": 0.3846, + "num_input_tokens_seen": 635744, + "step": 3005 + }, + { + "epoch": 0.3311331133113311, + "grad_norm": 1.557636022567749, + "learning_rate": 8.275577557755776e-06, + "loss": 0.4454, + "num_input_tokens_seen": 636864, + "step": 3010 + }, + { + "epoch": 0.3316831683168317, + "grad_norm": 1.4612187147140503, + "learning_rate": 8.28932893289329e-06, + "loss": 0.4078, + "num_input_tokens_seen": 637888, + "step": 3015 + }, + { + "epoch": 0.33223322332233224, + "grad_norm": 1.1782701015472412, + "learning_rate": 8.303080308030803e-06, + "loss": 0.4164, + "num_input_tokens_seen": 638944, + "step": 3020 + }, + { + "epoch": 0.3327832783278328, + "grad_norm": 1.0793310403823853, + "learning_rate": 8.316831683168318e-06, + "loss": 0.3808, + "num_input_tokens_seen": 640000, + "step": 3025 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.3698266744613647, + "learning_rate": 8.330583058305831e-06, + "loss": 0.3876, + "num_input_tokens_seen": 641024, + "step": 3030 + }, + { + "epoch": 0.3338833883388339, + "grad_norm": 1.2257530689239502, + "learning_rate": 8.344334433443344e-06, + "loss": 0.3917, + "num_input_tokens_seen": 642080, + "step": 3035 + }, + { + "epoch": 0.33443344334433445, + "grad_norm": 1.3421980142593384, + "learning_rate": 8.358085808580858e-06, + "loss": 0.4059, + "num_input_tokens_seen": 643104, + "step": 3040 + }, + { + "epoch": 0.334983498349835, + "grad_norm": 1.0498912334442139, + "learning_rate": 8.371837183718373e-06, + "loss": 0.3461, + "num_input_tokens_seen": 644128, + "step": 3045 + }, + { + "epoch": 0.3355335533553355, + "grad_norm": 0.9342377185821533, + "learning_rate": 8.385588558855886e-06, + "loss": 0.3527, + "num_input_tokens_seen": 645216, + "step": 3050 + }, + { + "epoch": 0.33608360836083606, + "grad_norm": 0.936076819896698, + "learning_rate": 8.3993399339934e-06, + "loss": 0.3504, + "num_input_tokens_seen": 646272, + "step": 3055 + }, + { + "epoch": 0.33663366336633666, + "grad_norm": 1.2344871759414673, + "learning_rate": 8.413091309130913e-06, + "loss": 0.3562, + "num_input_tokens_seen": 647392, + "step": 3060 + }, + { + "epoch": 0.3371837183718372, + "grad_norm": 0.9957095384597778, + "learning_rate": 8.426842684268426e-06, + "loss": 0.3597, + "num_input_tokens_seen": 648448, + "step": 3065 + }, + { + "epoch": 0.33773377337733773, + "grad_norm": 1.392258882522583, + "learning_rate": 8.440594059405941e-06, + "loss": 0.3851, + "num_input_tokens_seen": 649472, + "step": 3070 + }, + { + "epoch": 0.33828382838283827, + "grad_norm": 0.7497342228889465, + "learning_rate": 8.454345434543455e-06, + "loss": 0.3497, + "num_input_tokens_seen": 650464, + "step": 3075 + }, + { + "epoch": 0.3388338833883388, + "grad_norm": 0.8875479698181152, + "learning_rate": 8.468096809680968e-06, + "loss": 0.3589, + "num_input_tokens_seen": 651520, + "step": 3080 + }, + { + "epoch": 0.3393839383938394, + "grad_norm": 1.4427595138549805, + "learning_rate": 8.481848184818481e-06, + "loss": 0.3994, + "num_input_tokens_seen": 652608, + "step": 3085 + }, + { + "epoch": 0.33993399339933994, + "grad_norm": 0.9727327823638916, + "learning_rate": 8.495599559955995e-06, + "loss": 0.2968, + "num_input_tokens_seen": 653600, + "step": 3090 + }, + { + "epoch": 0.3404840484048405, + "grad_norm": 0.9477338194847107, + "learning_rate": 8.50935093509351e-06, + "loss": 0.2758, + "num_input_tokens_seen": 654656, + "step": 3095 + }, + { + "epoch": 0.341034103410341, + "grad_norm": 0.759564220905304, + "learning_rate": 8.523102310231023e-06, + "loss": 0.3051, + "num_input_tokens_seen": 655712, + "step": 3100 + }, + { + "epoch": 0.3415841584158416, + "grad_norm": 1.2964798212051392, + "learning_rate": 8.536853685368536e-06, + "loss": 0.3585, + "num_input_tokens_seen": 656736, + "step": 3105 + }, + { + "epoch": 0.34213421342134215, + "grad_norm": 1.1023167371749878, + "learning_rate": 8.550605060506051e-06, + "loss": 0.4152, + "num_input_tokens_seen": 657824, + "step": 3110 + }, + { + "epoch": 0.3426842684268427, + "grad_norm": 1.4697165489196777, + "learning_rate": 8.564356435643565e-06, + "loss": 0.3339, + "num_input_tokens_seen": 658944, + "step": 3115 + }, + { + "epoch": 0.3432343234323432, + "grad_norm": 0.8792997002601624, + "learning_rate": 8.578107810781078e-06, + "loss": 0.3278, + "num_input_tokens_seen": 660032, + "step": 3120 + }, + { + "epoch": 0.34378437843784376, + "grad_norm": 0.9508538842201233, + "learning_rate": 8.591859185918593e-06, + "loss": 0.3231, + "num_input_tokens_seen": 661088, + "step": 3125 + }, + { + "epoch": 0.34433443344334436, + "grad_norm": 0.811078667640686, + "learning_rate": 8.605610561056107e-06, + "loss": 0.3111, + "num_input_tokens_seen": 662112, + "step": 3130 + }, + { + "epoch": 0.3448844884488449, + "grad_norm": 0.7508942484855652, + "learning_rate": 8.61936193619362e-06, + "loss": 0.3227, + "num_input_tokens_seen": 663072, + "step": 3135 + }, + { + "epoch": 0.34543454345434543, + "grad_norm": 0.975106954574585, + "learning_rate": 8.633113311331135e-06, + "loss": 0.3327, + "num_input_tokens_seen": 664128, + "step": 3140 + }, + { + "epoch": 0.34598459845984597, + "grad_norm": 0.9923734664916992, + "learning_rate": 8.646864686468648e-06, + "loss": 0.3382, + "num_input_tokens_seen": 665184, + "step": 3145 + }, + { + "epoch": 0.3465346534653465, + "grad_norm": 0.8705222606658936, + "learning_rate": 8.660616061606162e-06, + "loss": 0.3481, + "num_input_tokens_seen": 666240, + "step": 3150 + }, + { + "epoch": 0.3470847084708471, + "grad_norm": 0.6796005964279175, + "learning_rate": 8.674367436743675e-06, + "loss": 0.2863, + "num_input_tokens_seen": 667232, + "step": 3155 + }, + { + "epoch": 0.34763476347634764, + "grad_norm": 0.7266517281532288, + "learning_rate": 8.688118811881188e-06, + "loss": 0.3084, + "num_input_tokens_seen": 668320, + "step": 3160 + }, + { + "epoch": 0.3481848184818482, + "grad_norm": 1.011371374130249, + "learning_rate": 8.701870187018703e-06, + "loss": 0.3105, + "num_input_tokens_seen": 669344, + "step": 3165 + }, + { + "epoch": 0.3487348734873487, + "grad_norm": 0.5996674299240112, + "learning_rate": 8.715621562156217e-06, + "loss": 0.3099, + "num_input_tokens_seen": 670400, + "step": 3170 + }, + { + "epoch": 0.3492849284928493, + "grad_norm": 0.6309585571289062, + "learning_rate": 8.72937293729373e-06, + "loss": 0.2605, + "num_input_tokens_seen": 671488, + "step": 3175 + }, + { + "epoch": 0.34983498349834985, + "grad_norm": 0.714022696018219, + "learning_rate": 8.743124312431243e-06, + "loss": 0.3088, + "num_input_tokens_seen": 672576, + "step": 3180 + }, + { + "epoch": 0.3503850385038504, + "grad_norm": 1.0381557941436768, + "learning_rate": 8.756875687568758e-06, + "loss": 0.3299, + "num_input_tokens_seen": 673696, + "step": 3185 + }, + { + "epoch": 0.3509350935093509, + "grad_norm": 0.9933156967163086, + "learning_rate": 8.770627062706272e-06, + "loss": 0.3005, + "num_input_tokens_seen": 674688, + "step": 3190 + }, + { + "epoch": 0.35148514851485146, + "grad_norm": 0.4539233446121216, + "learning_rate": 8.784378437843785e-06, + "loss": 0.3118, + "num_input_tokens_seen": 675744, + "step": 3195 + }, + { + "epoch": 0.35203520352035206, + "grad_norm": 1.0576452016830444, + "learning_rate": 8.798129812981298e-06, + "loss": 0.287, + "num_input_tokens_seen": 676800, + "step": 3200 + }, + { + "epoch": 0.3525852585258526, + "grad_norm": 0.5846837162971497, + "learning_rate": 8.811881188118812e-06, + "loss": 0.3063, + "num_input_tokens_seen": 677856, + "step": 3205 + }, + { + "epoch": 0.35313531353135313, + "grad_norm": 0.8671239018440247, + "learning_rate": 8.825632563256327e-06, + "loss": 0.2879, + "num_input_tokens_seen": 678912, + "step": 3210 + }, + { + "epoch": 0.3536853685368537, + "grad_norm": 1.74848473072052, + "learning_rate": 8.83938393839384e-06, + "loss": 0.3203, + "num_input_tokens_seen": 679904, + "step": 3215 + }, + { + "epoch": 0.3542354235423542, + "grad_norm": 0.6596723794937134, + "learning_rate": 8.853135313531354e-06, + "loss": 0.238, + "num_input_tokens_seen": 681024, + "step": 3220 + }, + { + "epoch": 0.3547854785478548, + "grad_norm": 0.5039423704147339, + "learning_rate": 8.866886688668867e-06, + "loss": 0.2871, + "num_input_tokens_seen": 682080, + "step": 3225 + }, + { + "epoch": 0.35533553355335534, + "grad_norm": 1.05155611038208, + "learning_rate": 8.88063806380638e-06, + "loss": 0.2781, + "num_input_tokens_seen": 683136, + "step": 3230 + }, + { + "epoch": 0.3558855885588559, + "grad_norm": 0.5778391361236572, + "learning_rate": 8.894389438943895e-06, + "loss": 0.2966, + "num_input_tokens_seen": 684256, + "step": 3235 + }, + { + "epoch": 0.3564356435643564, + "grad_norm": 0.8128246068954468, + "learning_rate": 8.908140814081409e-06, + "loss": 0.3019, + "num_input_tokens_seen": 685344, + "step": 3240 + }, + { + "epoch": 0.356985698569857, + "grad_norm": 0.8676334619522095, + "learning_rate": 8.921892189218922e-06, + "loss": 0.3064, + "num_input_tokens_seen": 686304, + "step": 3245 + }, + { + "epoch": 0.35753575357535755, + "grad_norm": 0.6959512829780579, + "learning_rate": 8.935643564356435e-06, + "loss": 0.2754, + "num_input_tokens_seen": 687328, + "step": 3250 + }, + { + "epoch": 0.3580858085808581, + "grad_norm": 0.360361248254776, + "learning_rate": 8.949394939493949e-06, + "loss": 0.3464, + "num_input_tokens_seen": 688320, + "step": 3255 + }, + { + "epoch": 0.3586358635863586, + "grad_norm": 0.7437602281570435, + "learning_rate": 8.963146314631464e-06, + "loss": 0.2833, + "num_input_tokens_seen": 689376, + "step": 3260 + }, + { + "epoch": 0.35918591859185917, + "grad_norm": 0.8676345348358154, + "learning_rate": 8.976897689768977e-06, + "loss": 0.2739, + "num_input_tokens_seen": 690336, + "step": 3265 + }, + { + "epoch": 0.35973597359735976, + "grad_norm": 1.4704917669296265, + "learning_rate": 8.99064906490649e-06, + "loss": 0.3136, + "num_input_tokens_seen": 691456, + "step": 3270 + }, + { + "epoch": 0.3602860286028603, + "grad_norm": 0.9338775277137756, + "learning_rate": 9.004400440044004e-06, + "loss": 0.2613, + "num_input_tokens_seen": 692512, + "step": 3275 + }, + { + "epoch": 0.36083608360836084, + "grad_norm": 0.8867287039756775, + "learning_rate": 9.018151815181519e-06, + "loss": 0.2624, + "num_input_tokens_seen": 693536, + "step": 3280 + }, + { + "epoch": 0.3613861386138614, + "grad_norm": 0.4297780394554138, + "learning_rate": 9.031903190319032e-06, + "loss": 0.2534, + "num_input_tokens_seen": 694592, + "step": 3285 + }, + { + "epoch": 0.3619361936193619, + "grad_norm": 0.5285094976425171, + "learning_rate": 9.045654565456546e-06, + "loss": 0.2832, + "num_input_tokens_seen": 695648, + "step": 3290 + }, + { + "epoch": 0.3624862486248625, + "grad_norm": 1.3438043594360352, + "learning_rate": 9.059405940594059e-06, + "loss": 0.278, + "num_input_tokens_seen": 696768, + "step": 3295 + }, + { + "epoch": 0.36303630363036304, + "grad_norm": 0.7408185601234436, + "learning_rate": 9.073157315731572e-06, + "loss": 0.2616, + "num_input_tokens_seen": 697824, + "step": 3300 + }, + { + "epoch": 0.3635863586358636, + "grad_norm": 0.9235795736312866, + "learning_rate": 9.086908690869087e-06, + "loss": 0.2972, + "num_input_tokens_seen": 698912, + "step": 3305 + }, + { + "epoch": 0.3641364136413641, + "grad_norm": 0.3422309160232544, + "learning_rate": 9.1006600660066e-06, + "loss": 0.2441, + "num_input_tokens_seen": 699904, + "step": 3310 + }, + { + "epoch": 0.36468646864686466, + "grad_norm": 0.6246069669723511, + "learning_rate": 9.114411441144114e-06, + "loss": 0.2372, + "num_input_tokens_seen": 700928, + "step": 3315 + }, + { + "epoch": 0.36523652365236525, + "grad_norm": 0.7337440252304077, + "learning_rate": 9.128162816281629e-06, + "loss": 0.2682, + "num_input_tokens_seen": 701952, + "step": 3320 + }, + { + "epoch": 0.3657865786578658, + "grad_norm": 0.4953889548778534, + "learning_rate": 9.141914191419142e-06, + "loss": 0.2493, + "num_input_tokens_seen": 702976, + "step": 3325 + }, + { + "epoch": 0.36633663366336633, + "grad_norm": 0.42726799845695496, + "learning_rate": 9.155665566556656e-06, + "loss": 0.2437, + "num_input_tokens_seen": 704064, + "step": 3330 + }, + { + "epoch": 0.36688668866886687, + "grad_norm": 0.5664966106414795, + "learning_rate": 9.16941694169417e-06, + "loss": 0.2285, + "num_input_tokens_seen": 705152, + "step": 3335 + }, + { + "epoch": 0.36743674367436746, + "grad_norm": 0.5903539061546326, + "learning_rate": 9.183168316831684e-06, + "loss": 0.2799, + "num_input_tokens_seen": 706240, + "step": 3340 + }, + { + "epoch": 0.367986798679868, + "grad_norm": 0.4776208698749542, + "learning_rate": 9.196919691969197e-06, + "loss": 0.2595, + "num_input_tokens_seen": 707232, + "step": 3345 + }, + { + "epoch": 0.36853685368536854, + "grad_norm": 0.4362874925136566, + "learning_rate": 9.210671067106713e-06, + "loss": 0.2558, + "num_input_tokens_seen": 708352, + "step": 3350 + }, + { + "epoch": 0.3690869086908691, + "grad_norm": 0.5988675355911255, + "learning_rate": 9.224422442244226e-06, + "loss": 0.2555, + "num_input_tokens_seen": 709408, + "step": 3355 + }, + { + "epoch": 0.3696369636963696, + "grad_norm": 0.9253873825073242, + "learning_rate": 9.23817381738174e-06, + "loss": 0.2582, + "num_input_tokens_seen": 710496, + "step": 3360 + }, + { + "epoch": 0.3701870187018702, + "grad_norm": 0.8863744139671326, + "learning_rate": 9.251925192519253e-06, + "loss": 0.2983, + "num_input_tokens_seen": 711616, + "step": 3365 + }, + { + "epoch": 0.37073707370737075, + "grad_norm": 0.7594967484474182, + "learning_rate": 9.265676567656766e-06, + "loss": 0.2643, + "num_input_tokens_seen": 712672, + "step": 3370 + }, + { + "epoch": 0.3712871287128713, + "grad_norm": 0.5092934370040894, + "learning_rate": 9.279427942794281e-06, + "loss": 0.2558, + "num_input_tokens_seen": 713760, + "step": 3375 + }, + { + "epoch": 0.3718371837183718, + "grad_norm": 0.46110618114471436, + "learning_rate": 9.293179317931794e-06, + "loss": 0.2366, + "num_input_tokens_seen": 714912, + "step": 3380 + }, + { + "epoch": 0.37238723872387236, + "grad_norm": 0.5218667387962341, + "learning_rate": 9.306930693069308e-06, + "loss": 0.2606, + "num_input_tokens_seen": 715968, + "step": 3385 + }, + { + "epoch": 0.37293729372937295, + "grad_norm": 0.4680542051792145, + "learning_rate": 9.320682068206821e-06, + "loss": 0.2598, + "num_input_tokens_seen": 717024, + "step": 3390 + }, + { + "epoch": 0.3734873487348735, + "grad_norm": 0.47221505641937256, + "learning_rate": 9.334433443344334e-06, + "loss": 0.2566, + "num_input_tokens_seen": 718112, + "step": 3395 + }, + { + "epoch": 0.37403740374037403, + "grad_norm": 0.4797643721103668, + "learning_rate": 9.34818481848185e-06, + "loss": 0.2206, + "num_input_tokens_seen": 719232, + "step": 3400 + }, + { + "epoch": 0.37458745874587457, + "grad_norm": 0.41305145621299744, + "learning_rate": 9.361936193619363e-06, + "loss": 0.2264, + "num_input_tokens_seen": 720288, + "step": 3405 + }, + { + "epoch": 0.37513751375137516, + "grad_norm": 1.0616832971572876, + "learning_rate": 9.375687568756876e-06, + "loss": 0.2654, + "num_input_tokens_seen": 721312, + "step": 3410 + }, + { + "epoch": 0.3756875687568757, + "grad_norm": 0.9035098552703857, + "learning_rate": 9.38943894389439e-06, + "loss": 0.2792, + "num_input_tokens_seen": 722400, + "step": 3415 + }, + { + "epoch": 0.37623762376237624, + "grad_norm": 0.5334762930870056, + "learning_rate": 9.403190319031905e-06, + "loss": 0.2869, + "num_input_tokens_seen": 723392, + "step": 3420 + }, + { + "epoch": 0.3767876787678768, + "grad_norm": 0.36659204959869385, + "learning_rate": 9.416941694169418e-06, + "loss": 0.2778, + "num_input_tokens_seen": 724448, + "step": 3425 + }, + { + "epoch": 0.3773377337733773, + "grad_norm": 0.5996098518371582, + "learning_rate": 9.430693069306931e-06, + "loss": 0.3045, + "num_input_tokens_seen": 725440, + "step": 3430 + }, + { + "epoch": 0.3778877887788779, + "grad_norm": 1.1974785327911377, + "learning_rate": 9.444444444444445e-06, + "loss": 0.2424, + "num_input_tokens_seen": 726496, + "step": 3435 + }, + { + "epoch": 0.37843784378437845, + "grad_norm": 0.44359850883483887, + "learning_rate": 9.458195819581958e-06, + "loss": 0.2361, + "num_input_tokens_seen": 727488, + "step": 3440 + }, + { + "epoch": 0.378987898789879, + "grad_norm": 0.5568664073944092, + "learning_rate": 9.471947194719473e-06, + "loss": 0.2561, + "num_input_tokens_seen": 728544, + "step": 3445 + }, + { + "epoch": 0.3795379537953795, + "grad_norm": 1.4852111339569092, + "learning_rate": 9.485698569856986e-06, + "loss": 0.3059, + "num_input_tokens_seen": 729568, + "step": 3450 + }, + { + "epoch": 0.38008800880088006, + "grad_norm": 0.5329061150550842, + "learning_rate": 9.4994499449945e-06, + "loss": 0.2235, + "num_input_tokens_seen": 730528, + "step": 3455 + }, + { + "epoch": 0.38063806380638066, + "grad_norm": 0.8120942711830139, + "learning_rate": 9.513201320132013e-06, + "loss": 0.2373, + "num_input_tokens_seen": 731616, + "step": 3460 + }, + { + "epoch": 0.3811881188118812, + "grad_norm": 0.8152030110359192, + "learning_rate": 9.526952695269526e-06, + "loss": 0.3002, + "num_input_tokens_seen": 732800, + "step": 3465 + }, + { + "epoch": 0.38173817381738173, + "grad_norm": 0.4277978539466858, + "learning_rate": 9.540704070407041e-06, + "loss": 0.2555, + "num_input_tokens_seen": 733792, + "step": 3470 + }, + { + "epoch": 0.38228822882288227, + "grad_norm": 0.6172922849655151, + "learning_rate": 9.554455445544555e-06, + "loss": 0.2151, + "num_input_tokens_seen": 734784, + "step": 3475 + }, + { + "epoch": 0.38283828382838286, + "grad_norm": 0.6915404796600342, + "learning_rate": 9.568206820682068e-06, + "loss": 0.2492, + "num_input_tokens_seen": 735968, + "step": 3480 + }, + { + "epoch": 0.3833883388338834, + "grad_norm": 1.2461515665054321, + "learning_rate": 9.581958195819581e-06, + "loss": 0.3092, + "num_input_tokens_seen": 736928, + "step": 3485 + }, + { + "epoch": 0.38393839383938394, + "grad_norm": 0.5217957496643066, + "learning_rate": 9.595709570957096e-06, + "loss": 0.2751, + "num_input_tokens_seen": 737920, + "step": 3490 + }, + { + "epoch": 0.3844884488448845, + "grad_norm": 0.5860112309455872, + "learning_rate": 9.60946094609461e-06, + "loss": 0.2719, + "num_input_tokens_seen": 739008, + "step": 3495 + }, + { + "epoch": 0.385038503850385, + "grad_norm": 0.5828173756599426, + "learning_rate": 9.623212321232123e-06, + "loss": 0.268, + "num_input_tokens_seen": 740128, + "step": 3500 + }, + { + "epoch": 0.3855885588558856, + "grad_norm": 0.6237096190452576, + "learning_rate": 9.636963696369637e-06, + "loss": 0.2663, + "num_input_tokens_seen": 741248, + "step": 3505 + }, + { + "epoch": 0.38613861386138615, + "grad_norm": 0.764178991317749, + "learning_rate": 9.65071507150715e-06, + "loss": 0.2475, + "num_input_tokens_seen": 742304, + "step": 3510 + }, + { + "epoch": 0.3866886688668867, + "grad_norm": 0.745681643486023, + "learning_rate": 9.664466446644665e-06, + "loss": 0.2469, + "num_input_tokens_seen": 743296, + "step": 3515 + }, + { + "epoch": 0.3872387238723872, + "grad_norm": 0.6016656160354614, + "learning_rate": 9.678217821782178e-06, + "loss": 0.2194, + "num_input_tokens_seen": 744288, + "step": 3520 + }, + { + "epoch": 0.38778877887788776, + "grad_norm": 0.988448977470398, + "learning_rate": 9.691969196919692e-06, + "loss": 0.2816, + "num_input_tokens_seen": 745376, + "step": 3525 + }, + { + "epoch": 0.38833883388338836, + "grad_norm": 0.5001853108406067, + "learning_rate": 9.705720572057207e-06, + "loss": 0.2383, + "num_input_tokens_seen": 746464, + "step": 3530 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.5847832560539246, + "learning_rate": 9.71947194719472e-06, + "loss": 0.2614, + "num_input_tokens_seen": 747520, + "step": 3535 + }, + { + "epoch": 0.38943894389438943, + "grad_norm": 0.4966396987438202, + "learning_rate": 9.733223322332233e-06, + "loss": 0.2176, + "num_input_tokens_seen": 748544, + "step": 3540 + }, + { + "epoch": 0.38998899889988997, + "grad_norm": 0.2270030379295349, + "learning_rate": 9.746974697469748e-06, + "loss": 0.2199, + "num_input_tokens_seen": 749600, + "step": 3545 + }, + { + "epoch": 0.39053905390539057, + "grad_norm": 0.4919526278972626, + "learning_rate": 9.760726072607262e-06, + "loss": 0.2181, + "num_input_tokens_seen": 750688, + "step": 3550 + }, + { + "epoch": 0.3910891089108911, + "grad_norm": 0.5474798679351807, + "learning_rate": 9.774477447744775e-06, + "loss": 0.2457, + "num_input_tokens_seen": 751712, + "step": 3555 + }, + { + "epoch": 0.39163916391639164, + "grad_norm": 0.7215396165847778, + "learning_rate": 9.78822882288229e-06, + "loss": 0.2148, + "num_input_tokens_seen": 752800, + "step": 3560 + }, + { + "epoch": 0.3921892189218922, + "grad_norm": 0.5637994408607483, + "learning_rate": 9.801980198019804e-06, + "loss": 0.2632, + "num_input_tokens_seen": 753856, + "step": 3565 + }, + { + "epoch": 0.3927392739273927, + "grad_norm": 0.30161863565444946, + "learning_rate": 9.815731573157317e-06, + "loss": 0.2203, + "num_input_tokens_seen": 754848, + "step": 3570 + }, + { + "epoch": 0.3932893289328933, + "grad_norm": 0.4055563509464264, + "learning_rate": 9.82948294829483e-06, + "loss": 0.2416, + "num_input_tokens_seen": 755872, + "step": 3575 + }, + { + "epoch": 0.39383938393839385, + "grad_norm": 0.49437710642814636, + "learning_rate": 9.843234323432344e-06, + "loss": 0.2858, + "num_input_tokens_seen": 756896, + "step": 3580 + }, + { + "epoch": 0.3943894389438944, + "grad_norm": 1.0006651878356934, + "learning_rate": 9.856985698569859e-06, + "loss": 0.279, + "num_input_tokens_seen": 758016, + "step": 3585 + }, + { + "epoch": 0.3949394939493949, + "grad_norm": 0.44126418232917786, + "learning_rate": 9.870737073707372e-06, + "loss": 0.2435, + "num_input_tokens_seen": 759008, + "step": 3590 + }, + { + "epoch": 0.39548954895489546, + "grad_norm": 0.5207707285881042, + "learning_rate": 9.884488448844885e-06, + "loss": 0.2565, + "num_input_tokens_seen": 760032, + "step": 3595 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.4455429017543793, + "learning_rate": 9.898239823982399e-06, + "loss": 0.2333, + "num_input_tokens_seen": 761152, + "step": 3600 + }, + { + "epoch": 0.3965896589658966, + "grad_norm": 0.31419238448143005, + "learning_rate": 9.911991199119912e-06, + "loss": 0.2394, + "num_input_tokens_seen": 762144, + "step": 3605 + }, + { + "epoch": 0.39713971397139713, + "grad_norm": 0.3674376904964447, + "learning_rate": 9.925742574257427e-06, + "loss": 0.2184, + "num_input_tokens_seen": 763232, + "step": 3610 + }, + { + "epoch": 0.3976897689768977, + "grad_norm": 0.9971721172332764, + "learning_rate": 9.93949394939494e-06, + "loss": 0.2512, + "num_input_tokens_seen": 764256, + "step": 3615 + }, + { + "epoch": 0.39823982398239827, + "grad_norm": 0.3970390558242798, + "learning_rate": 9.953245324532454e-06, + "loss": 0.2473, + "num_input_tokens_seen": 765280, + "step": 3620 + }, + { + "epoch": 0.3987898789878988, + "grad_norm": 0.7598722577095032, + "learning_rate": 9.966996699669967e-06, + "loss": 0.2785, + "num_input_tokens_seen": 766368, + "step": 3625 + }, + { + "epoch": 0.39933993399339934, + "grad_norm": 0.8783516883850098, + "learning_rate": 9.980748074807482e-06, + "loss": 0.2432, + "num_input_tokens_seen": 767424, + "step": 3630 + }, + { + "epoch": 0.3998899889988999, + "grad_norm": 0.7164722084999084, + "learning_rate": 9.994499449944995e-06, + "loss": 0.3108, + "num_input_tokens_seen": 768544, + "step": 3635 + }, + { + "epoch": 0.4004400440044004, + "grad_norm": 1.0851954221725464, + "learning_rate": 1.0008250825082509e-05, + "loss": 0.2457, + "num_input_tokens_seen": 769632, + "step": 3640 + }, + { + "epoch": 0.400990099009901, + "grad_norm": 0.35745131969451904, + "learning_rate": 1.0022002200220022e-05, + "loss": 0.2309, + "num_input_tokens_seen": 770784, + "step": 3645 + }, + { + "epoch": 0.40154015401540155, + "grad_norm": 0.5768214464187622, + "learning_rate": 1.0035753575357536e-05, + "loss": 0.2474, + "num_input_tokens_seen": 771840, + "step": 3650 + }, + { + "epoch": 0.4020902090209021, + "grad_norm": 0.4144318699836731, + "learning_rate": 1.004950495049505e-05, + "loss": 0.2457, + "num_input_tokens_seen": 772928, + "step": 3655 + }, + { + "epoch": 0.40264026402640263, + "grad_norm": 1.81864595413208, + "learning_rate": 1.0063256325632564e-05, + "loss": 0.2548, + "num_input_tokens_seen": 774048, + "step": 3660 + }, + { + "epoch": 0.40319031903190317, + "grad_norm": 0.6506831049919128, + "learning_rate": 1.0077007700770077e-05, + "loss": 0.3075, + "num_input_tokens_seen": 775104, + "step": 3665 + }, + { + "epoch": 0.40374037403740376, + "grad_norm": 0.3442818820476532, + "learning_rate": 1.009075907590759e-05, + "loss": 0.2344, + "num_input_tokens_seen": 776160, + "step": 3670 + }, + { + "epoch": 0.4042904290429043, + "grad_norm": 0.612693190574646, + "learning_rate": 1.0104510451045104e-05, + "loss": 0.2683, + "num_input_tokens_seen": 777216, + "step": 3675 + }, + { + "epoch": 0.40484048404840484, + "grad_norm": 0.617304265499115, + "learning_rate": 1.0118261826182619e-05, + "loss": 0.3293, + "num_input_tokens_seen": 778272, + "step": 3680 + }, + { + "epoch": 0.4053905390539054, + "grad_norm": 0.47308504581451416, + "learning_rate": 1.0132013201320132e-05, + "loss": 0.2476, + "num_input_tokens_seen": 779328, + "step": 3685 + }, + { + "epoch": 0.40594059405940597, + "grad_norm": 0.4086260497570038, + "learning_rate": 1.0145764576457646e-05, + "loss": 0.2516, + "num_input_tokens_seen": 780480, + "step": 3690 + }, + { + "epoch": 0.4064906490649065, + "grad_norm": 0.4599429965019226, + "learning_rate": 1.0159515951595159e-05, + "loss": 0.2229, + "num_input_tokens_seen": 781632, + "step": 3695 + }, + { + "epoch": 0.40704070407040704, + "grad_norm": 1.300039529800415, + "learning_rate": 1.0173267326732672e-05, + "loss": 0.2129, + "num_input_tokens_seen": 782720, + "step": 3700 + }, + { + "epoch": 0.4075907590759076, + "grad_norm": 0.39277157187461853, + "learning_rate": 1.0187018701870187e-05, + "loss": 0.2281, + "num_input_tokens_seen": 783712, + "step": 3705 + }, + { + "epoch": 0.4081408140814081, + "grad_norm": 0.27802276611328125, + "learning_rate": 1.02007700770077e-05, + "loss": 0.2085, + "num_input_tokens_seen": 784768, + "step": 3710 + }, + { + "epoch": 0.4086908690869087, + "grad_norm": 1.0022845268249512, + "learning_rate": 1.0214521452145214e-05, + "loss": 0.2457, + "num_input_tokens_seen": 785888, + "step": 3715 + }, + { + "epoch": 0.40924092409240925, + "grad_norm": 0.5994954705238342, + "learning_rate": 1.0228272827282728e-05, + "loss": 0.2548, + "num_input_tokens_seen": 786944, + "step": 3720 + }, + { + "epoch": 0.4097909790979098, + "grad_norm": 0.8083770871162415, + "learning_rate": 1.0242024202420243e-05, + "loss": 0.2006, + "num_input_tokens_seen": 788064, + "step": 3725 + }, + { + "epoch": 0.41034103410341033, + "grad_norm": 0.434576153755188, + "learning_rate": 1.0255775577557756e-05, + "loss": 0.2259, + "num_input_tokens_seen": 789088, + "step": 3730 + }, + { + "epoch": 0.41089108910891087, + "grad_norm": 0.6899390816688538, + "learning_rate": 1.026952695269527e-05, + "loss": 0.273, + "num_input_tokens_seen": 790144, + "step": 3735 + }, + { + "epoch": 0.41144114411441146, + "grad_norm": 0.5455084443092346, + "learning_rate": 1.0283278327832784e-05, + "loss": 0.2234, + "num_input_tokens_seen": 791136, + "step": 3740 + }, + { + "epoch": 0.411991199119912, + "grad_norm": 0.5510545372962952, + "learning_rate": 1.0297029702970298e-05, + "loss": 0.2679, + "num_input_tokens_seen": 792160, + "step": 3745 + }, + { + "epoch": 0.41254125412541254, + "grad_norm": 0.6588642001152039, + "learning_rate": 1.0310781078107811e-05, + "loss": 0.2831, + "num_input_tokens_seen": 793216, + "step": 3750 + }, + { + "epoch": 0.4130913091309131, + "grad_norm": 0.5573844909667969, + "learning_rate": 1.0324532453245326e-05, + "loss": 0.2515, + "num_input_tokens_seen": 794272, + "step": 3755 + }, + { + "epoch": 0.4136413641364136, + "grad_norm": 0.29668334126472473, + "learning_rate": 1.033828382838284e-05, + "loss": 0.2445, + "num_input_tokens_seen": 795328, + "step": 3760 + }, + { + "epoch": 0.4141914191419142, + "grad_norm": 0.6981882452964783, + "learning_rate": 1.0352035203520353e-05, + "loss": 0.2439, + "num_input_tokens_seen": 796352, + "step": 3765 + }, + { + "epoch": 0.41474147414741475, + "grad_norm": 0.5658084750175476, + "learning_rate": 1.0365786578657866e-05, + "loss": 0.2324, + "num_input_tokens_seen": 797376, + "step": 3770 + }, + { + "epoch": 0.4152915291529153, + "grad_norm": 0.8745054602622986, + "learning_rate": 1.0379537953795381e-05, + "loss": 0.2032, + "num_input_tokens_seen": 798432, + "step": 3775 + }, + { + "epoch": 0.4158415841584158, + "grad_norm": 0.38257360458374023, + "learning_rate": 1.0393289328932894e-05, + "loss": 0.1799, + "num_input_tokens_seen": 799552, + "step": 3780 + }, + { + "epoch": 0.4163916391639164, + "grad_norm": 0.5346426367759705, + "learning_rate": 1.0407040704070408e-05, + "loss": 0.2504, + "num_input_tokens_seen": 800576, + "step": 3785 + }, + { + "epoch": 0.41694169416941695, + "grad_norm": 0.6553177237510681, + "learning_rate": 1.0420792079207921e-05, + "loss": 0.2381, + "num_input_tokens_seen": 801664, + "step": 3790 + }, + { + "epoch": 0.4174917491749175, + "grad_norm": 0.7171846628189087, + "learning_rate": 1.0434543454345436e-05, + "loss": 0.2609, + "num_input_tokens_seen": 802624, + "step": 3795 + }, + { + "epoch": 0.41804180418041803, + "grad_norm": 0.34808042645454407, + "learning_rate": 1.044829482948295e-05, + "loss": 0.2648, + "num_input_tokens_seen": 803680, + "step": 3800 + }, + { + "epoch": 0.41859185918591857, + "grad_norm": 0.8169082999229431, + "learning_rate": 1.0462046204620463e-05, + "loss": 0.202, + "num_input_tokens_seen": 804768, + "step": 3805 + }, + { + "epoch": 0.41914191419141916, + "grad_norm": 0.6269155144691467, + "learning_rate": 1.0475797579757976e-05, + "loss": 0.2314, + "num_input_tokens_seen": 805760, + "step": 3810 + }, + { + "epoch": 0.4196919691969197, + "grad_norm": 0.8006105422973633, + "learning_rate": 1.048954895489549e-05, + "loss": 0.2481, + "num_input_tokens_seen": 806816, + "step": 3815 + }, + { + "epoch": 0.42024202420242024, + "grad_norm": 0.4005471169948578, + "learning_rate": 1.0503300330033005e-05, + "loss": 0.2521, + "num_input_tokens_seen": 807840, + "step": 3820 + }, + { + "epoch": 0.4207920792079208, + "grad_norm": 0.36119344830513, + "learning_rate": 1.0517051705170518e-05, + "loss": 0.2173, + "num_input_tokens_seen": 808864, + "step": 3825 + }, + { + "epoch": 0.4213421342134213, + "grad_norm": 0.356886625289917, + "learning_rate": 1.0530803080308031e-05, + "loss": 0.2431, + "num_input_tokens_seen": 809920, + "step": 3830 + }, + { + "epoch": 0.4218921892189219, + "grad_norm": 0.31030550599098206, + "learning_rate": 1.0544554455445545e-05, + "loss": 0.2187, + "num_input_tokens_seen": 811008, + "step": 3835 + }, + { + "epoch": 0.42244224422442245, + "grad_norm": 0.7037194967269897, + "learning_rate": 1.0558305830583058e-05, + "loss": 0.2357, + "num_input_tokens_seen": 812000, + "step": 3840 + }, + { + "epoch": 0.422992299229923, + "grad_norm": 0.9833235740661621, + "learning_rate": 1.0572057205720573e-05, + "loss": 0.2413, + "num_input_tokens_seen": 813088, + "step": 3845 + }, + { + "epoch": 0.4235423542354235, + "grad_norm": 0.29603686928749084, + "learning_rate": 1.0585808580858086e-05, + "loss": 0.2116, + "num_input_tokens_seen": 814080, + "step": 3850 + }, + { + "epoch": 0.4240924092409241, + "grad_norm": 0.7636271715164185, + "learning_rate": 1.05995599559956e-05, + "loss": 0.2363, + "num_input_tokens_seen": 815104, + "step": 3855 + }, + { + "epoch": 0.42464246424642466, + "grad_norm": 0.5221062302589417, + "learning_rate": 1.0613311331133113e-05, + "loss": 0.2086, + "num_input_tokens_seen": 816096, + "step": 3860 + }, + { + "epoch": 0.4251925192519252, + "grad_norm": 0.6338353753089905, + "learning_rate": 1.0627062706270628e-05, + "loss": 0.2407, + "num_input_tokens_seen": 817120, + "step": 3865 + }, + { + "epoch": 0.42574257425742573, + "grad_norm": 0.49614399671554565, + "learning_rate": 1.0640814081408142e-05, + "loss": 0.24, + "num_input_tokens_seen": 818176, + "step": 3870 + }, + { + "epoch": 0.42629262926292627, + "grad_norm": 0.6181225180625916, + "learning_rate": 1.0654565456545655e-05, + "loss": 0.2316, + "num_input_tokens_seen": 819168, + "step": 3875 + }, + { + "epoch": 0.42684268426842686, + "grad_norm": 1.2620793581008911, + "learning_rate": 1.0668316831683168e-05, + "loss": 0.2362, + "num_input_tokens_seen": 820256, + "step": 3880 + }, + { + "epoch": 0.4273927392739274, + "grad_norm": 0.5360103249549866, + "learning_rate": 1.0682068206820682e-05, + "loss": 0.2349, + "num_input_tokens_seen": 821312, + "step": 3885 + }, + { + "epoch": 0.42794279427942794, + "grad_norm": 0.43257737159729004, + "learning_rate": 1.0695819581958197e-05, + "loss": 0.209, + "num_input_tokens_seen": 822336, + "step": 3890 + }, + { + "epoch": 0.4284928492849285, + "grad_norm": 1.0576199293136597, + "learning_rate": 1.070957095709571e-05, + "loss": 0.2418, + "num_input_tokens_seen": 823360, + "step": 3895 + }, + { + "epoch": 0.429042904290429, + "grad_norm": 0.47751593589782715, + "learning_rate": 1.0723322332233223e-05, + "loss": 0.2313, + "num_input_tokens_seen": 824416, + "step": 3900 + }, + { + "epoch": 0.4295929592959296, + "grad_norm": 0.4437640905380249, + "learning_rate": 1.0737073707370737e-05, + "loss": 0.3061, + "num_input_tokens_seen": 825408, + "step": 3905 + }, + { + "epoch": 0.43014301430143015, + "grad_norm": 0.4536973834037781, + "learning_rate": 1.075082508250825e-05, + "loss": 0.187, + "num_input_tokens_seen": 826464, + "step": 3910 + }, + { + "epoch": 0.4306930693069307, + "grad_norm": 1.0003392696380615, + "learning_rate": 1.0764576457645765e-05, + "loss": 0.2162, + "num_input_tokens_seen": 827584, + "step": 3915 + }, + { + "epoch": 0.4312431243124312, + "grad_norm": 0.7140839099884033, + "learning_rate": 1.0778327832783278e-05, + "loss": 0.2495, + "num_input_tokens_seen": 828640, + "step": 3920 + }, + { + "epoch": 0.4317931793179318, + "grad_norm": 0.5209322571754456, + "learning_rate": 1.0792079207920792e-05, + "loss": 0.2102, + "num_input_tokens_seen": 829728, + "step": 3925 + }, + { + "epoch": 0.43234323432343236, + "grad_norm": 0.37099489569664, + "learning_rate": 1.0805830583058305e-05, + "loss": 0.2115, + "num_input_tokens_seen": 830752, + "step": 3930 + }, + { + "epoch": 0.4328932893289329, + "grad_norm": 0.8514182567596436, + "learning_rate": 1.081958195819582e-05, + "loss": 0.2477, + "num_input_tokens_seen": 831808, + "step": 3935 + }, + { + "epoch": 0.43344334433443343, + "grad_norm": 0.6210549473762512, + "learning_rate": 1.0833333333333334e-05, + "loss": 0.2068, + "num_input_tokens_seen": 832928, + "step": 3940 + }, + { + "epoch": 0.43399339933993397, + "grad_norm": 0.6441908478736877, + "learning_rate": 1.0847084708470847e-05, + "loss": 0.2205, + "num_input_tokens_seen": 833984, + "step": 3945 + }, + { + "epoch": 0.43454345434543457, + "grad_norm": 0.42362600564956665, + "learning_rate": 1.0860836083608362e-05, + "loss": 0.2831, + "num_input_tokens_seen": 835008, + "step": 3950 + }, + { + "epoch": 0.4350935093509351, + "grad_norm": 0.8848764896392822, + "learning_rate": 1.0874587458745875e-05, + "loss": 0.2293, + "num_input_tokens_seen": 836032, + "step": 3955 + }, + { + "epoch": 0.43564356435643564, + "grad_norm": 0.31082943081855774, + "learning_rate": 1.0888338833883389e-05, + "loss": 0.2203, + "num_input_tokens_seen": 837056, + "step": 3960 + }, + { + "epoch": 0.4361936193619362, + "grad_norm": 0.37851306796073914, + "learning_rate": 1.0902090209020904e-05, + "loss": 0.1866, + "num_input_tokens_seen": 838112, + "step": 3965 + }, + { + "epoch": 0.4367436743674367, + "grad_norm": 0.4801458418369293, + "learning_rate": 1.0915841584158417e-05, + "loss": 0.2025, + "num_input_tokens_seen": 839104, + "step": 3970 + }, + { + "epoch": 0.4372937293729373, + "grad_norm": 0.5236366987228394, + "learning_rate": 1.092959295929593e-05, + "loss": 0.2611, + "num_input_tokens_seen": 840192, + "step": 3975 + }, + { + "epoch": 0.43784378437843785, + "grad_norm": 0.7935670614242554, + "learning_rate": 1.0943344334433444e-05, + "loss": 0.2008, + "num_input_tokens_seen": 841184, + "step": 3980 + }, + { + "epoch": 0.4383938393839384, + "grad_norm": 0.5363150238990784, + "learning_rate": 1.0957095709570959e-05, + "loss": 0.2103, + "num_input_tokens_seen": 842176, + "step": 3985 + }, + { + "epoch": 0.4389438943894389, + "grad_norm": 0.912334144115448, + "learning_rate": 1.0970847084708472e-05, + "loss": 0.2208, + "num_input_tokens_seen": 843232, + "step": 3990 + }, + { + "epoch": 0.4394939493949395, + "grad_norm": 0.33256790041923523, + "learning_rate": 1.0984598459845985e-05, + "loss": 0.2542, + "num_input_tokens_seen": 844288, + "step": 3995 + }, + { + "epoch": 0.44004400440044006, + "grad_norm": 0.356049120426178, + "learning_rate": 1.0998349834983499e-05, + "loss": 0.2269, + "num_input_tokens_seen": 845408, + "step": 4000 + }, + { + "epoch": 0.4405940594059406, + "grad_norm": 0.3958306610584259, + "learning_rate": 1.1012101210121014e-05, + "loss": 0.2126, + "num_input_tokens_seen": 846400, + "step": 4005 + }, + { + "epoch": 0.44114411441144114, + "grad_norm": 0.5905954241752625, + "learning_rate": 1.1025852585258527e-05, + "loss": 0.2281, + "num_input_tokens_seen": 847392, + "step": 4010 + }, + { + "epoch": 0.4416941694169417, + "grad_norm": 0.5037126541137695, + "learning_rate": 1.103960396039604e-05, + "loss": 0.2436, + "num_input_tokens_seen": 848448, + "step": 4015 + }, + { + "epoch": 0.44224422442244227, + "grad_norm": 0.722978949546814, + "learning_rate": 1.1053355335533554e-05, + "loss": 0.276, + "num_input_tokens_seen": 849504, + "step": 4020 + }, + { + "epoch": 0.4427942794279428, + "grad_norm": 0.6246247887611389, + "learning_rate": 1.1067106710671067e-05, + "loss": 0.1978, + "num_input_tokens_seen": 850496, + "step": 4025 + }, + { + "epoch": 0.44334433443344334, + "grad_norm": 0.9263583421707153, + "learning_rate": 1.1080858085808582e-05, + "loss": 0.2405, + "num_input_tokens_seen": 851520, + "step": 4030 + }, + { + "epoch": 0.4438943894389439, + "grad_norm": 0.5414137840270996, + "learning_rate": 1.1094609460946096e-05, + "loss": 0.2121, + "num_input_tokens_seen": 852544, + "step": 4035 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.3793150782585144, + "learning_rate": 1.1108360836083609e-05, + "loss": 0.2277, + "num_input_tokens_seen": 853600, + "step": 4040 + }, + { + "epoch": 0.444994499449945, + "grad_norm": 0.6140618920326233, + "learning_rate": 1.1122112211221122e-05, + "loss": 0.1839, + "num_input_tokens_seen": 854656, + "step": 4045 + }, + { + "epoch": 0.44554455445544555, + "grad_norm": 0.606401264667511, + "learning_rate": 1.1135863586358636e-05, + "loss": 0.3542, + "num_input_tokens_seen": 855712, + "step": 4050 + }, + { + "epoch": 0.4460946094609461, + "grad_norm": 0.31231123208999634, + "learning_rate": 1.114961496149615e-05, + "loss": 0.1862, + "num_input_tokens_seen": 856704, + "step": 4055 + }, + { + "epoch": 0.44664466446644663, + "grad_norm": 0.43438246846199036, + "learning_rate": 1.1163366336633664e-05, + "loss": 0.2158, + "num_input_tokens_seen": 857760, + "step": 4060 + }, + { + "epoch": 0.4471947194719472, + "grad_norm": 0.603003203868866, + "learning_rate": 1.1177117711771177e-05, + "loss": 0.2148, + "num_input_tokens_seen": 858848, + "step": 4065 + }, + { + "epoch": 0.44774477447744776, + "grad_norm": 0.7553208470344543, + "learning_rate": 1.119086908690869e-05, + "loss": 0.2392, + "num_input_tokens_seen": 859872, + "step": 4070 + }, + { + "epoch": 0.4482948294829483, + "grad_norm": 0.6201753616333008, + "learning_rate": 1.1204620462046204e-05, + "loss": 0.2771, + "num_input_tokens_seen": 860896, + "step": 4075 + }, + { + "epoch": 0.44884488448844884, + "grad_norm": 0.33558714389801025, + "learning_rate": 1.121837183718372e-05, + "loss": 0.2145, + "num_input_tokens_seen": 861984, + "step": 4080 + }, + { + "epoch": 0.4493949394939494, + "grad_norm": 0.3617570102214813, + "learning_rate": 1.1232123212321233e-05, + "loss": 0.2065, + "num_input_tokens_seen": 862944, + "step": 4085 + }, + { + "epoch": 0.44994499449944997, + "grad_norm": 0.3541041612625122, + "learning_rate": 1.1245874587458746e-05, + "loss": 0.2342, + "num_input_tokens_seen": 863968, + "step": 4090 + }, + { + "epoch": 0.4504950495049505, + "grad_norm": 0.3693390190601349, + "learning_rate": 1.125962596259626e-05, + "loss": 0.2033, + "num_input_tokens_seen": 865120, + "step": 4095 + }, + { + "epoch": 0.45104510451045104, + "grad_norm": 0.42433181405067444, + "learning_rate": 1.1273377337733774e-05, + "loss": 0.2179, + "num_input_tokens_seen": 866112, + "step": 4100 + }, + { + "epoch": 0.4515951595159516, + "grad_norm": 0.4231221377849579, + "learning_rate": 1.1287128712871288e-05, + "loss": 0.2468, + "num_input_tokens_seen": 867136, + "step": 4105 + }, + { + "epoch": 0.4521452145214521, + "grad_norm": 0.5164830088615417, + "learning_rate": 1.1300880088008801e-05, + "loss": 0.2422, + "num_input_tokens_seen": 868160, + "step": 4110 + }, + { + "epoch": 0.4526952695269527, + "grad_norm": 0.9441322088241577, + "learning_rate": 1.1314631463146314e-05, + "loss": 0.2071, + "num_input_tokens_seen": 869280, + "step": 4115 + }, + { + "epoch": 0.45324532453245325, + "grad_norm": 0.3645877540111542, + "learning_rate": 1.1328382838283828e-05, + "loss": 0.2336, + "num_input_tokens_seen": 870368, + "step": 4120 + }, + { + "epoch": 0.4537953795379538, + "grad_norm": 0.9174268245697021, + "learning_rate": 1.1342134213421343e-05, + "loss": 0.2135, + "num_input_tokens_seen": 871360, + "step": 4125 + }, + { + "epoch": 0.45434543454345433, + "grad_norm": 0.6135095357894897, + "learning_rate": 1.1355885588558856e-05, + "loss": 0.2163, + "num_input_tokens_seen": 872448, + "step": 4130 + }, + { + "epoch": 0.45489548954895487, + "grad_norm": 0.42431172728538513, + "learning_rate": 1.136963696369637e-05, + "loss": 0.2369, + "num_input_tokens_seen": 873536, + "step": 4135 + }, + { + "epoch": 0.45544554455445546, + "grad_norm": 0.3074084222316742, + "learning_rate": 1.1383388338833883e-05, + "loss": 0.1754, + "num_input_tokens_seen": 874592, + "step": 4140 + }, + { + "epoch": 0.455995599559956, + "grad_norm": 0.8659998178482056, + "learning_rate": 1.1397139713971398e-05, + "loss": 0.2005, + "num_input_tokens_seen": 875584, + "step": 4145 + }, + { + "epoch": 0.45654565456545654, + "grad_norm": 0.38182568550109863, + "learning_rate": 1.1410891089108911e-05, + "loss": 0.2407, + "num_input_tokens_seen": 876640, + "step": 4150 + }, + { + "epoch": 0.4570957095709571, + "grad_norm": 0.38993626832962036, + "learning_rate": 1.1424642464246424e-05, + "loss": 0.1975, + "num_input_tokens_seen": 877728, + "step": 4155 + }, + { + "epoch": 0.45764576457645767, + "grad_norm": 0.8000648617744446, + "learning_rate": 1.143839383938394e-05, + "loss": 0.2219, + "num_input_tokens_seen": 878752, + "step": 4160 + }, + { + "epoch": 0.4581958195819582, + "grad_norm": 0.3405664563179016, + "learning_rate": 1.1452145214521453e-05, + "loss": 0.1903, + "num_input_tokens_seen": 879744, + "step": 4165 + }, + { + "epoch": 0.45874587458745875, + "grad_norm": 0.4955825209617615, + "learning_rate": 1.1465896589658966e-05, + "loss": 0.2411, + "num_input_tokens_seen": 880800, + "step": 4170 + }, + { + "epoch": 0.4592959295929593, + "grad_norm": 0.2879650592803955, + "learning_rate": 1.1479647964796481e-05, + "loss": 0.2671, + "num_input_tokens_seen": 881824, + "step": 4175 + }, + { + "epoch": 0.4598459845984598, + "grad_norm": 0.7289828062057495, + "learning_rate": 1.1493399339933995e-05, + "loss": 0.2186, + "num_input_tokens_seen": 882848, + "step": 4180 + }, + { + "epoch": 0.4603960396039604, + "grad_norm": 0.5498496294021606, + "learning_rate": 1.1507150715071508e-05, + "loss": 0.1855, + "num_input_tokens_seen": 883808, + "step": 4185 + }, + { + "epoch": 0.46094609460946095, + "grad_norm": 0.4253113269805908, + "learning_rate": 1.1520902090209021e-05, + "loss": 0.2064, + "num_input_tokens_seen": 884832, + "step": 4190 + }, + { + "epoch": 0.4614961496149615, + "grad_norm": 0.8539285063743591, + "learning_rate": 1.1534653465346536e-05, + "loss": 0.2675, + "num_input_tokens_seen": 885856, + "step": 4195 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 4.157858371734619, + "learning_rate": 1.154840484048405e-05, + "loss": 0.215, + "num_input_tokens_seen": 886880, + "step": 4200 + }, + { + "epoch": 0.46259625962596257, + "grad_norm": 0.7206153273582458, + "learning_rate": 1.1562156215621563e-05, + "loss": 0.235, + "num_input_tokens_seen": 887936, + "step": 4205 + }, + { + "epoch": 0.46314631463146316, + "grad_norm": 0.5606929659843445, + "learning_rate": 1.1575907590759076e-05, + "loss": 0.2039, + "num_input_tokens_seen": 888992, + "step": 4210 + }, + { + "epoch": 0.4636963696369637, + "grad_norm": 0.47352078557014465, + "learning_rate": 1.158965896589659e-05, + "loss": 0.2165, + "num_input_tokens_seen": 890048, + "step": 4215 + }, + { + "epoch": 0.46424642464246424, + "grad_norm": 0.5706644654273987, + "learning_rate": 1.1603410341034105e-05, + "loss": 0.2522, + "num_input_tokens_seen": 891104, + "step": 4220 + }, + { + "epoch": 0.4647964796479648, + "grad_norm": 0.5674324035644531, + "learning_rate": 1.1617161716171618e-05, + "loss": 0.2161, + "num_input_tokens_seen": 892192, + "step": 4225 + }, + { + "epoch": 0.46534653465346537, + "grad_norm": 0.5791674256324768, + "learning_rate": 1.1630913091309132e-05, + "loss": 0.1669, + "num_input_tokens_seen": 893216, + "step": 4230 + }, + { + "epoch": 0.4658965896589659, + "grad_norm": 0.75510573387146, + "learning_rate": 1.1644664466446645e-05, + "loss": 0.2088, + "num_input_tokens_seen": 894272, + "step": 4235 + }, + { + "epoch": 0.46644664466446645, + "grad_norm": 0.5669931173324585, + "learning_rate": 1.165841584158416e-05, + "loss": 0.2242, + "num_input_tokens_seen": 895360, + "step": 4240 + }, + { + "epoch": 0.466996699669967, + "grad_norm": 0.4095533788204193, + "learning_rate": 1.1672167216721673e-05, + "loss": 0.2163, + "num_input_tokens_seen": 896416, + "step": 4245 + }, + { + "epoch": 0.4675467546754675, + "grad_norm": 0.4218509793281555, + "learning_rate": 1.1685918591859187e-05, + "loss": 0.1936, + "num_input_tokens_seen": 897440, + "step": 4250 + }, + { + "epoch": 0.4680968096809681, + "grad_norm": 1.0954713821411133, + "learning_rate": 1.16996699669967e-05, + "loss": 0.2392, + "num_input_tokens_seen": 898528, + "step": 4255 + }, + { + "epoch": 0.46864686468646866, + "grad_norm": 0.6423611640930176, + "learning_rate": 1.1713421342134213e-05, + "loss": 0.2466, + "num_input_tokens_seen": 899552, + "step": 4260 + }, + { + "epoch": 0.4691969196919692, + "grad_norm": 0.44633346796035767, + "learning_rate": 1.1727172717271728e-05, + "loss": 0.2901, + "num_input_tokens_seen": 900544, + "step": 4265 + }, + { + "epoch": 0.46974697469746973, + "grad_norm": 0.6120641231536865, + "learning_rate": 1.1740924092409242e-05, + "loss": 0.2134, + "num_input_tokens_seen": 901632, + "step": 4270 + }, + { + "epoch": 0.47029702970297027, + "grad_norm": 0.8596353530883789, + "learning_rate": 1.1754675467546755e-05, + "loss": 0.2207, + "num_input_tokens_seen": 902688, + "step": 4275 + }, + { + "epoch": 0.47084708470847086, + "grad_norm": 0.6146655678749084, + "learning_rate": 1.1768426842684268e-05, + "loss": 0.2599, + "num_input_tokens_seen": 903744, + "step": 4280 + }, + { + "epoch": 0.4713971397139714, + "grad_norm": 0.9494661688804626, + "learning_rate": 1.1782178217821782e-05, + "loss": 0.2212, + "num_input_tokens_seen": 904800, + "step": 4285 + }, + { + "epoch": 0.47194719471947194, + "grad_norm": 0.41292113065719604, + "learning_rate": 1.1795929592959297e-05, + "loss": 0.2426, + "num_input_tokens_seen": 905824, + "step": 4290 + }, + { + "epoch": 0.4724972497249725, + "grad_norm": 0.4103822410106659, + "learning_rate": 1.180968096809681e-05, + "loss": 0.215, + "num_input_tokens_seen": 906848, + "step": 4295 + }, + { + "epoch": 0.4730473047304731, + "grad_norm": 0.3828677535057068, + "learning_rate": 1.1823432343234323e-05, + "loss": 0.2093, + "num_input_tokens_seen": 907904, + "step": 4300 + }, + { + "epoch": 0.4735973597359736, + "grad_norm": 0.5197587013244629, + "learning_rate": 1.1837183718371837e-05, + "loss": 0.2748, + "num_input_tokens_seen": 909024, + "step": 4305 + }, + { + "epoch": 0.47414741474147415, + "grad_norm": 0.5296345353126526, + "learning_rate": 1.1850935093509352e-05, + "loss": 0.2302, + "num_input_tokens_seen": 910176, + "step": 4310 + }, + { + "epoch": 0.4746974697469747, + "grad_norm": 0.5253889560699463, + "learning_rate": 1.1864686468646865e-05, + "loss": 0.2406, + "num_input_tokens_seen": 911296, + "step": 4315 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 0.4044598340988159, + "learning_rate": 1.1878437843784379e-05, + "loss": 0.2514, + "num_input_tokens_seen": 912352, + "step": 4320 + }, + { + "epoch": 0.4757975797579758, + "grad_norm": 0.566603422164917, + "learning_rate": 1.1892189218921892e-05, + "loss": 0.2239, + "num_input_tokens_seen": 913408, + "step": 4325 + }, + { + "epoch": 0.47634763476347636, + "grad_norm": 0.5360569953918457, + "learning_rate": 1.1905940594059405e-05, + "loss": 0.2466, + "num_input_tokens_seen": 914464, + "step": 4330 + }, + { + "epoch": 0.4768976897689769, + "grad_norm": 0.4880426228046417, + "learning_rate": 1.191969196919692e-05, + "loss": 0.2207, + "num_input_tokens_seen": 915552, + "step": 4335 + }, + { + "epoch": 0.47744774477447743, + "grad_norm": 0.7653363347053528, + "learning_rate": 1.1933443344334434e-05, + "loss": 0.2219, + "num_input_tokens_seen": 916640, + "step": 4340 + }, + { + "epoch": 0.47799779977997797, + "grad_norm": 0.8758842349052429, + "learning_rate": 1.1947194719471947e-05, + "loss": 0.2235, + "num_input_tokens_seen": 917696, + "step": 4345 + }, + { + "epoch": 0.47854785478547857, + "grad_norm": 0.7966024875640869, + "learning_rate": 1.196094609460946e-05, + "loss": 0.1864, + "num_input_tokens_seen": 918784, + "step": 4350 + }, + { + "epoch": 0.4790979097909791, + "grad_norm": 0.43938517570495605, + "learning_rate": 1.1974697469746975e-05, + "loss": 0.219, + "num_input_tokens_seen": 919776, + "step": 4355 + }, + { + "epoch": 0.47964796479647964, + "grad_norm": 0.9431685209274292, + "learning_rate": 1.1988448844884489e-05, + "loss": 0.2206, + "num_input_tokens_seen": 920896, + "step": 4360 + }, + { + "epoch": 0.4801980198019802, + "grad_norm": 0.37980636954307556, + "learning_rate": 1.2002200220022004e-05, + "loss": 0.1736, + "num_input_tokens_seen": 921856, + "step": 4365 + }, + { + "epoch": 0.4807480748074808, + "grad_norm": 0.6447860598564148, + "learning_rate": 1.2015951595159517e-05, + "loss": 0.2634, + "num_input_tokens_seen": 922912, + "step": 4370 + }, + { + "epoch": 0.4812981298129813, + "grad_norm": 0.6388132572174072, + "learning_rate": 1.202970297029703e-05, + "loss": 0.2215, + "num_input_tokens_seen": 923968, + "step": 4375 + }, + { + "epoch": 0.48184818481848185, + "grad_norm": 0.4481780529022217, + "learning_rate": 1.2043454345434546e-05, + "loss": 0.2437, + "num_input_tokens_seen": 925024, + "step": 4380 + }, + { + "epoch": 0.4823982398239824, + "grad_norm": 0.5835367441177368, + "learning_rate": 1.2057205720572059e-05, + "loss": 0.185, + "num_input_tokens_seen": 926080, + "step": 4385 + }, + { + "epoch": 0.4829482948294829, + "grad_norm": 0.4549271762371063, + "learning_rate": 1.2070957095709572e-05, + "loss": 0.2128, + "num_input_tokens_seen": 927104, + "step": 4390 + }, + { + "epoch": 0.4834983498349835, + "grad_norm": 0.5323678255081177, + "learning_rate": 1.2084708470847086e-05, + "loss": 0.2824, + "num_input_tokens_seen": 928128, + "step": 4395 + }, + { + "epoch": 0.48404840484048406, + "grad_norm": 0.45848754048347473, + "learning_rate": 1.2098459845984599e-05, + "loss": 0.2247, + "num_input_tokens_seen": 929184, + "step": 4400 + }, + { + "epoch": 0.4845984598459846, + "grad_norm": 0.911005437374115, + "learning_rate": 1.2112211221122114e-05, + "loss": 0.2301, + "num_input_tokens_seen": 930176, + "step": 4405 + }, + { + "epoch": 0.48514851485148514, + "grad_norm": 0.6045058369636536, + "learning_rate": 1.2125962596259627e-05, + "loss": 0.2763, + "num_input_tokens_seen": 931136, + "step": 4410 + }, + { + "epoch": 0.4856985698569857, + "grad_norm": 0.6130238175392151, + "learning_rate": 1.213971397139714e-05, + "loss": 0.2198, + "num_input_tokens_seen": 932128, + "step": 4415 + }, + { + "epoch": 0.48624862486248627, + "grad_norm": 0.48029136657714844, + "learning_rate": 1.2153465346534654e-05, + "loss": 0.2191, + "num_input_tokens_seen": 933184, + "step": 4420 + }, + { + "epoch": 0.4867986798679868, + "grad_norm": 0.6591762900352478, + "learning_rate": 1.2167216721672167e-05, + "loss": 0.2204, + "num_input_tokens_seen": 934240, + "step": 4425 + }, + { + "epoch": 0.48734873487348734, + "grad_norm": 1.3021275997161865, + "learning_rate": 1.2180968096809682e-05, + "loss": 0.2738, + "num_input_tokens_seen": 935328, + "step": 4430 + }, + { + "epoch": 0.4878987898789879, + "grad_norm": 0.6157840490341187, + "learning_rate": 1.2194719471947196e-05, + "loss": 0.2015, + "num_input_tokens_seen": 936384, + "step": 4435 + }, + { + "epoch": 0.4884488448844885, + "grad_norm": 0.6549841165542603, + "learning_rate": 1.2208470847084709e-05, + "loss": 0.2654, + "num_input_tokens_seen": 937408, + "step": 4440 + }, + { + "epoch": 0.488998899889989, + "grad_norm": 0.6599363088607788, + "learning_rate": 1.2222222222222222e-05, + "loss": 0.244, + "num_input_tokens_seen": 938432, + "step": 4445 + }, + { + "epoch": 0.48954895489548955, + "grad_norm": 0.631687581539154, + "learning_rate": 1.2235973597359738e-05, + "loss": 0.206, + "num_input_tokens_seen": 939520, + "step": 4450 + }, + { + "epoch": 0.4900990099009901, + "grad_norm": 0.31967708468437195, + "learning_rate": 1.2249724972497251e-05, + "loss": 0.1945, + "num_input_tokens_seen": 940544, + "step": 4455 + }, + { + "epoch": 0.49064906490649063, + "grad_norm": 0.5226503610610962, + "learning_rate": 1.2263476347634764e-05, + "loss": 0.1815, + "num_input_tokens_seen": 941504, + "step": 4460 + }, + { + "epoch": 0.4911991199119912, + "grad_norm": 0.7374622821807861, + "learning_rate": 1.2277227722772278e-05, + "loss": 0.211, + "num_input_tokens_seen": 942560, + "step": 4465 + }, + { + "epoch": 0.49174917491749176, + "grad_norm": 0.4857286214828491, + "learning_rate": 1.2290979097909791e-05, + "loss": 0.1854, + "num_input_tokens_seen": 943680, + "step": 4470 + }, + { + "epoch": 0.4922992299229923, + "grad_norm": 0.4291743040084839, + "learning_rate": 1.2304730473047306e-05, + "loss": 0.2478, + "num_input_tokens_seen": 944768, + "step": 4475 + }, + { + "epoch": 0.49284928492849284, + "grad_norm": 0.3667222857475281, + "learning_rate": 1.231848184818482e-05, + "loss": 0.1839, + "num_input_tokens_seen": 945792, + "step": 4480 + }, + { + "epoch": 0.4933993399339934, + "grad_norm": 0.47331106662750244, + "learning_rate": 1.2332233223322333e-05, + "loss": 0.2308, + "num_input_tokens_seen": 946848, + "step": 4485 + }, + { + "epoch": 0.49394939493949397, + "grad_norm": 0.5601352453231812, + "learning_rate": 1.2345984598459846e-05, + "loss": 0.2051, + "num_input_tokens_seen": 947968, + "step": 4490 + }, + { + "epoch": 0.4944994499449945, + "grad_norm": 0.7730914354324341, + "learning_rate": 1.235973597359736e-05, + "loss": 0.2438, + "num_input_tokens_seen": 949056, + "step": 4495 + }, + { + "epoch": 0.49504950495049505, + "grad_norm": 0.5699912309646606, + "learning_rate": 1.2373487348734874e-05, + "loss": 0.2471, + "num_input_tokens_seen": 950144, + "step": 4500 + }, + { + "epoch": 0.4955995599559956, + "grad_norm": 0.5089842081069946, + "learning_rate": 1.2387238723872388e-05, + "loss": 0.2108, + "num_input_tokens_seen": 951200, + "step": 4505 + }, + { + "epoch": 0.4961496149614962, + "grad_norm": 0.33207467198371887, + "learning_rate": 1.2400990099009901e-05, + "loss": 0.185, + "num_input_tokens_seen": 952224, + "step": 4510 + }, + { + "epoch": 0.4966996699669967, + "grad_norm": 0.6775156855583191, + "learning_rate": 1.2414741474147414e-05, + "loss": 0.229, + "num_input_tokens_seen": 953280, + "step": 4515 + }, + { + "epoch": 0.49724972497249725, + "grad_norm": 0.3681921362876892, + "learning_rate": 1.2428492849284928e-05, + "loss": 0.2064, + "num_input_tokens_seen": 954368, + "step": 4520 + }, + { + "epoch": 0.4977997799779978, + "grad_norm": 0.3351973295211792, + "learning_rate": 1.2442244224422443e-05, + "loss": 0.2152, + "num_input_tokens_seen": 955424, + "step": 4525 + }, + { + "epoch": 0.49834983498349833, + "grad_norm": 0.3964555561542511, + "learning_rate": 1.2455995599559956e-05, + "loss": 0.2186, + "num_input_tokens_seen": 956416, + "step": 4530 + }, + { + "epoch": 0.4988998899889989, + "grad_norm": 0.5854042172431946, + "learning_rate": 1.246974697469747e-05, + "loss": 0.2308, + "num_input_tokens_seen": 957472, + "step": 4535 + }, + { + "epoch": 0.49944994499449946, + "grad_norm": 0.6100010871887207, + "learning_rate": 1.2483498349834983e-05, + "loss": 0.2098, + "num_input_tokens_seen": 958528, + "step": 4540 + }, + { + "epoch": 0.5, + "grad_norm": 0.44919613003730774, + "learning_rate": 1.2497249724972498e-05, + "loss": 0.2238, + "num_input_tokens_seen": 959584, + "step": 4545 + }, + { + "epoch": 0.5005500550055005, + "grad_norm": 0.5952291488647461, + "learning_rate": 1.2511001100110011e-05, + "loss": 0.2371, + "num_input_tokens_seen": 960640, + "step": 4550 + }, + { + "epoch": 0.5011001100110011, + "grad_norm": 0.22857581079006195, + "learning_rate": 1.2524752475247525e-05, + "loss": 0.1665, + "num_input_tokens_seen": 961728, + "step": 4555 + }, + { + "epoch": 0.5016501650165016, + "grad_norm": 0.6224180459976196, + "learning_rate": 1.253850385038504e-05, + "loss": 0.2218, + "num_input_tokens_seen": 962784, + "step": 4560 + }, + { + "epoch": 0.5022002200220022, + "grad_norm": 1.5861057043075562, + "learning_rate": 1.2552255225522553e-05, + "loss": 0.2613, + "num_input_tokens_seen": 963808, + "step": 4565 + }, + { + "epoch": 0.5027502750275028, + "grad_norm": 0.3400561213493347, + "learning_rate": 1.2566006600660066e-05, + "loss": 0.2142, + "num_input_tokens_seen": 964928, + "step": 4570 + }, + { + "epoch": 0.5033003300330033, + "grad_norm": 0.3841913342475891, + "learning_rate": 1.257975797579758e-05, + "loss": 0.2729, + "num_input_tokens_seen": 966048, + "step": 4575 + }, + { + "epoch": 0.5038503850385039, + "grad_norm": 0.4819294512271881, + "learning_rate": 1.2593509350935095e-05, + "loss": 0.2015, + "num_input_tokens_seen": 967040, + "step": 4580 + }, + { + "epoch": 0.5044004400440044, + "grad_norm": 0.6295074820518494, + "learning_rate": 1.2607260726072608e-05, + "loss": 0.2061, + "num_input_tokens_seen": 968064, + "step": 4585 + }, + { + "epoch": 0.504950495049505, + "grad_norm": 0.3698115944862366, + "learning_rate": 1.2621012101210123e-05, + "loss": 0.1941, + "num_input_tokens_seen": 969088, + "step": 4590 + }, + { + "epoch": 0.5055005500550055, + "grad_norm": 0.6812148094177246, + "learning_rate": 1.2634763476347637e-05, + "loss": 0.224, + "num_input_tokens_seen": 970112, + "step": 4595 + }, + { + "epoch": 0.506050605060506, + "grad_norm": 0.5991958379745483, + "learning_rate": 1.264851485148515e-05, + "loss": 0.2146, + "num_input_tokens_seen": 971136, + "step": 4600 + }, + { + "epoch": 0.5066006600660066, + "grad_norm": 0.5234134197235107, + "learning_rate": 1.2662266226622663e-05, + "loss": 0.1998, + "num_input_tokens_seen": 972160, + "step": 4605 + }, + { + "epoch": 0.5071507150715071, + "grad_norm": 0.4568168818950653, + "learning_rate": 1.2676017601760177e-05, + "loss": 0.21, + "num_input_tokens_seen": 973184, + "step": 4610 + }, + { + "epoch": 0.5077007700770076, + "grad_norm": 0.9273422956466675, + "learning_rate": 1.268976897689769e-05, + "loss": 0.214, + "num_input_tokens_seen": 974240, + "step": 4615 + }, + { + "epoch": 0.5082508250825083, + "grad_norm": 0.6258812546730042, + "learning_rate": 1.2703520352035203e-05, + "loss": 0.2085, + "num_input_tokens_seen": 975264, + "step": 4620 + }, + { + "epoch": 0.5088008800880088, + "grad_norm": 0.31128954887390137, + "learning_rate": 1.2717271727172717e-05, + "loss": 0.1581, + "num_input_tokens_seen": 976256, + "step": 4625 + }, + { + "epoch": 0.5093509350935094, + "grad_norm": 0.48010867834091187, + "learning_rate": 1.2731023102310233e-05, + "loss": 0.171, + "num_input_tokens_seen": 977248, + "step": 4630 + }, + { + "epoch": 0.5099009900990099, + "grad_norm": 0.6728882789611816, + "learning_rate": 1.2744774477447747e-05, + "loss": 0.2049, + "num_input_tokens_seen": 978336, + "step": 4635 + }, + { + "epoch": 0.5104510451045104, + "grad_norm": 0.5031853914260864, + "learning_rate": 1.275852585258526e-05, + "loss": 0.1842, + "num_input_tokens_seen": 979360, + "step": 4640 + }, + { + "epoch": 0.511001100110011, + "grad_norm": 0.3378356695175171, + "learning_rate": 1.2772277227722773e-05, + "loss": 0.2878, + "num_input_tokens_seen": 980416, + "step": 4645 + }, + { + "epoch": 0.5115511551155115, + "grad_norm": 0.6491838097572327, + "learning_rate": 1.2786028602860287e-05, + "loss": 0.2171, + "num_input_tokens_seen": 981472, + "step": 4650 + }, + { + "epoch": 0.5121012101210121, + "grad_norm": 0.7041122317314148, + "learning_rate": 1.27997799779978e-05, + "loss": 0.219, + "num_input_tokens_seen": 982592, + "step": 4655 + }, + { + "epoch": 0.5126512651265126, + "grad_norm": 0.5799067616462708, + "learning_rate": 1.2813531353135313e-05, + "loss": 0.2219, + "num_input_tokens_seen": 983648, + "step": 4660 + }, + { + "epoch": 0.5132013201320133, + "grad_norm": 0.9785295128822327, + "learning_rate": 1.2827282728272827e-05, + "loss": 0.2326, + "num_input_tokens_seen": 984640, + "step": 4665 + }, + { + "epoch": 0.5137513751375138, + "grad_norm": 0.270001620054245, + "learning_rate": 1.284103410341034e-05, + "loss": 0.1822, + "num_input_tokens_seen": 985728, + "step": 4670 + }, + { + "epoch": 0.5143014301430143, + "grad_norm": 0.5446102619171143, + "learning_rate": 1.2854785478547857e-05, + "loss": 0.2649, + "num_input_tokens_seen": 986816, + "step": 4675 + }, + { + "epoch": 0.5148514851485149, + "grad_norm": 0.40852800011634827, + "learning_rate": 1.286853685368537e-05, + "loss": 0.1755, + "num_input_tokens_seen": 987904, + "step": 4680 + }, + { + "epoch": 0.5154015401540154, + "grad_norm": 0.5742860436439514, + "learning_rate": 1.2882288228822884e-05, + "loss": 0.2204, + "num_input_tokens_seen": 988928, + "step": 4685 + }, + { + "epoch": 0.5159515951595159, + "grad_norm": 0.3753046691417694, + "learning_rate": 1.2896039603960397e-05, + "loss": 0.1823, + "num_input_tokens_seen": 989920, + "step": 4690 + }, + { + "epoch": 0.5165016501650165, + "grad_norm": 0.4472300708293915, + "learning_rate": 1.290979097909791e-05, + "loss": 0.1869, + "num_input_tokens_seen": 990944, + "step": 4695 + }, + { + "epoch": 0.517051705170517, + "grad_norm": 0.7026126980781555, + "learning_rate": 1.2923542354235424e-05, + "loss": 0.2105, + "num_input_tokens_seen": 991936, + "step": 4700 + }, + { + "epoch": 0.5176017601760176, + "grad_norm": 0.5924620032310486, + "learning_rate": 1.2937293729372937e-05, + "loss": 0.2041, + "num_input_tokens_seen": 992992, + "step": 4705 + }, + { + "epoch": 0.5181518151815182, + "grad_norm": 0.5197203755378723, + "learning_rate": 1.295104510451045e-05, + "loss": 0.2218, + "num_input_tokens_seen": 994048, + "step": 4710 + }, + { + "epoch": 0.5187018701870187, + "grad_norm": 0.455945760011673, + "learning_rate": 1.2964796479647964e-05, + "loss": 0.3047, + "num_input_tokens_seen": 995104, + "step": 4715 + }, + { + "epoch": 0.5192519251925193, + "grad_norm": 0.35934174060821533, + "learning_rate": 1.297854785478548e-05, + "loss": 0.2086, + "num_input_tokens_seen": 996192, + "step": 4720 + }, + { + "epoch": 0.5198019801980198, + "grad_norm": 0.5661352276802063, + "learning_rate": 1.2992299229922994e-05, + "loss": 0.2449, + "num_input_tokens_seen": 997216, + "step": 4725 + }, + { + "epoch": 0.5203520352035204, + "grad_norm": 0.792603075504303, + "learning_rate": 1.3006050605060507e-05, + "loss": 0.1869, + "num_input_tokens_seen": 998208, + "step": 4730 + }, + { + "epoch": 0.5209020902090209, + "grad_norm": 0.35587215423583984, + "learning_rate": 1.301980198019802e-05, + "loss": 0.1899, + "num_input_tokens_seen": 999264, + "step": 4735 + }, + { + "epoch": 0.5214521452145214, + "grad_norm": 0.5194666385650635, + "learning_rate": 1.3033553355335534e-05, + "loss": 0.2186, + "num_input_tokens_seen": 1000352, + "step": 4740 + }, + { + "epoch": 0.522002200220022, + "grad_norm": 0.5958698987960815, + "learning_rate": 1.3047304730473047e-05, + "loss": 0.1943, + "num_input_tokens_seen": 1001408, + "step": 4745 + }, + { + "epoch": 0.5225522552255225, + "grad_norm": 0.7078162431716919, + "learning_rate": 1.306105610561056e-05, + "loss": 0.238, + "num_input_tokens_seen": 1002528, + "step": 4750 + }, + { + "epoch": 0.523102310231023, + "grad_norm": 0.8827365040779114, + "learning_rate": 1.3074807480748076e-05, + "loss": 0.2079, + "num_input_tokens_seen": 1003552, + "step": 4755 + }, + { + "epoch": 0.5236523652365237, + "grad_norm": 0.7309926152229309, + "learning_rate": 1.3088558855885589e-05, + "loss": 0.2249, + "num_input_tokens_seen": 1004608, + "step": 4760 + }, + { + "epoch": 0.5242024202420242, + "grad_norm": 1.0050222873687744, + "learning_rate": 1.3102310231023102e-05, + "loss": 0.2556, + "num_input_tokens_seen": 1005664, + "step": 4765 + }, + { + "epoch": 0.5247524752475248, + "grad_norm": 0.9790956974029541, + "learning_rate": 1.3116061606160617e-05, + "loss": 0.2234, + "num_input_tokens_seen": 1006624, + "step": 4770 + }, + { + "epoch": 0.5253025302530253, + "grad_norm": 0.2823637127876282, + "learning_rate": 1.312981298129813e-05, + "loss": 0.2038, + "num_input_tokens_seen": 1007680, + "step": 4775 + }, + { + "epoch": 0.5258525852585259, + "grad_norm": 0.32556870579719543, + "learning_rate": 1.3143564356435644e-05, + "loss": 0.2052, + "num_input_tokens_seen": 1008704, + "step": 4780 + }, + { + "epoch": 0.5264026402640264, + "grad_norm": 1.230736494064331, + "learning_rate": 1.3157315731573159e-05, + "loss": 0.1673, + "num_input_tokens_seen": 1009760, + "step": 4785 + }, + { + "epoch": 0.5269526952695269, + "grad_norm": 0.6789531111717224, + "learning_rate": 1.3171067106710672e-05, + "loss": 0.2099, + "num_input_tokens_seen": 1010816, + "step": 4790 + }, + { + "epoch": 0.5275027502750275, + "grad_norm": 1.0543473958969116, + "learning_rate": 1.3184818481848186e-05, + "loss": 0.2703, + "num_input_tokens_seen": 1011872, + "step": 4795 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.5343778729438782, + "learning_rate": 1.3198569856985699e-05, + "loss": 0.1851, + "num_input_tokens_seen": 1012960, + "step": 4800 + }, + { + "epoch": 0.5286028602860287, + "grad_norm": 0.4426206350326538, + "learning_rate": 1.3212321232123212e-05, + "loss": 0.2234, + "num_input_tokens_seen": 1014016, + "step": 4805 + }, + { + "epoch": 0.5291529152915292, + "grad_norm": 0.40687596797943115, + "learning_rate": 1.3226072607260726e-05, + "loss": 0.177, + "num_input_tokens_seen": 1015040, + "step": 4810 + }, + { + "epoch": 0.5297029702970297, + "grad_norm": 0.2937842905521393, + "learning_rate": 1.3239823982398243e-05, + "loss": 0.193, + "num_input_tokens_seen": 1016064, + "step": 4815 + }, + { + "epoch": 0.5302530253025303, + "grad_norm": 0.41854435205459595, + "learning_rate": 1.3253575357535756e-05, + "loss": 0.1935, + "num_input_tokens_seen": 1017088, + "step": 4820 + }, + { + "epoch": 0.5308030803080308, + "grad_norm": 0.398586243391037, + "learning_rate": 1.326732673267327e-05, + "loss": 0.1904, + "num_input_tokens_seen": 1018176, + "step": 4825 + }, + { + "epoch": 0.5313531353135313, + "grad_norm": 0.5519256591796875, + "learning_rate": 1.3281078107810783e-05, + "loss": 0.1808, + "num_input_tokens_seen": 1019168, + "step": 4830 + }, + { + "epoch": 0.5319031903190319, + "grad_norm": 0.4191964268684387, + "learning_rate": 1.3294829482948296e-05, + "loss": 0.2219, + "num_input_tokens_seen": 1020224, + "step": 4835 + }, + { + "epoch": 0.5324532453245324, + "grad_norm": 0.5971078276634216, + "learning_rate": 1.330858085808581e-05, + "loss": 0.2138, + "num_input_tokens_seen": 1021280, + "step": 4840 + }, + { + "epoch": 0.533003300330033, + "grad_norm": 0.30151697993278503, + "learning_rate": 1.3322332233223323e-05, + "loss": 0.2506, + "num_input_tokens_seen": 1022368, + "step": 4845 + }, + { + "epoch": 0.5335533553355336, + "grad_norm": 1.0696656703948975, + "learning_rate": 1.3336083608360836e-05, + "loss": 0.2782, + "num_input_tokens_seen": 1023424, + "step": 4850 + }, + { + "epoch": 0.5341034103410341, + "grad_norm": 2.211493968963623, + "learning_rate": 1.334983498349835e-05, + "loss": 0.2704, + "num_input_tokens_seen": 1024448, + "step": 4855 + }, + { + "epoch": 0.5346534653465347, + "grad_norm": 0.8298560976982117, + "learning_rate": 1.3363586358635866e-05, + "loss": 0.2344, + "num_input_tokens_seen": 1025504, + "step": 4860 + }, + { + "epoch": 0.5352035203520352, + "grad_norm": 1.0706722736358643, + "learning_rate": 1.337733773377338e-05, + "loss": 0.2394, + "num_input_tokens_seen": 1026560, + "step": 4865 + }, + { + "epoch": 0.5357535753575358, + "grad_norm": 0.5388270020484924, + "learning_rate": 1.3391089108910893e-05, + "loss": 0.2348, + "num_input_tokens_seen": 1027648, + "step": 4870 + }, + { + "epoch": 0.5363036303630363, + "grad_norm": 0.46729621291160583, + "learning_rate": 1.3404840484048406e-05, + "loss": 0.1589, + "num_input_tokens_seen": 1028736, + "step": 4875 + }, + { + "epoch": 0.5368536853685368, + "grad_norm": 0.3954755365848541, + "learning_rate": 1.341859185918592e-05, + "loss": 0.2196, + "num_input_tokens_seen": 1029760, + "step": 4880 + }, + { + "epoch": 0.5374037403740374, + "grad_norm": 0.45274946093559265, + "learning_rate": 1.3432343234323433e-05, + "loss": 0.1765, + "num_input_tokens_seen": 1030848, + "step": 4885 + }, + { + "epoch": 0.5379537953795379, + "grad_norm": 0.33122047781944275, + "learning_rate": 1.3446094609460946e-05, + "loss": 0.1645, + "num_input_tokens_seen": 1032032, + "step": 4890 + }, + { + "epoch": 0.5385038503850385, + "grad_norm": 0.6224885582923889, + "learning_rate": 1.345984598459846e-05, + "loss": 0.1946, + "num_input_tokens_seen": 1033152, + "step": 4895 + }, + { + "epoch": 0.5390539053905391, + "grad_norm": 0.35997211933135986, + "learning_rate": 1.3473597359735973e-05, + "loss": 0.2239, + "num_input_tokens_seen": 1034176, + "step": 4900 + }, + { + "epoch": 0.5396039603960396, + "grad_norm": 0.2454964816570282, + "learning_rate": 1.3487348734873486e-05, + "loss": 0.2418, + "num_input_tokens_seen": 1035232, + "step": 4905 + }, + { + "epoch": 0.5401540154015402, + "grad_norm": 0.49656736850738525, + "learning_rate": 1.3501100110011003e-05, + "loss": 0.2079, + "num_input_tokens_seen": 1036352, + "step": 4910 + }, + { + "epoch": 0.5407040704070407, + "grad_norm": 0.850297749042511, + "learning_rate": 1.3514851485148516e-05, + "loss": 0.1835, + "num_input_tokens_seen": 1037376, + "step": 4915 + }, + { + "epoch": 0.5412541254125413, + "grad_norm": 0.6260995268821716, + "learning_rate": 1.352860286028603e-05, + "loss": 0.1891, + "num_input_tokens_seen": 1038464, + "step": 4920 + }, + { + "epoch": 0.5418041804180418, + "grad_norm": 0.40784573554992676, + "learning_rate": 1.3542354235423543e-05, + "loss": 0.1731, + "num_input_tokens_seen": 1039456, + "step": 4925 + }, + { + "epoch": 0.5423542354235423, + "grad_norm": 0.3648642897605896, + "learning_rate": 1.3556105610561056e-05, + "loss": 0.1759, + "num_input_tokens_seen": 1040544, + "step": 4930 + }, + { + "epoch": 0.5429042904290429, + "grad_norm": 1.5340096950531006, + "learning_rate": 1.356985698569857e-05, + "loss": 0.2171, + "num_input_tokens_seen": 1041600, + "step": 4935 + }, + { + "epoch": 0.5434543454345434, + "grad_norm": 0.4905911982059479, + "learning_rate": 1.3583608360836083e-05, + "loss": 0.1964, + "num_input_tokens_seen": 1042752, + "step": 4940 + }, + { + "epoch": 0.5440044004400441, + "grad_norm": 0.35384413599967957, + "learning_rate": 1.3597359735973596e-05, + "loss": 0.1879, + "num_input_tokens_seen": 1043712, + "step": 4945 + }, + { + "epoch": 0.5445544554455446, + "grad_norm": 0.6128098964691162, + "learning_rate": 1.3611111111111111e-05, + "loss": 0.1637, + "num_input_tokens_seen": 1044768, + "step": 4950 + }, + { + "epoch": 0.5451045104510451, + "grad_norm": 0.5759755969047546, + "learning_rate": 1.3624862486248626e-05, + "loss": 0.207, + "num_input_tokens_seen": 1045824, + "step": 4955 + }, + { + "epoch": 0.5456545654565457, + "grad_norm": 0.45097994804382324, + "learning_rate": 1.363861386138614e-05, + "loss": 0.2562, + "num_input_tokens_seen": 1046880, + "step": 4960 + }, + { + "epoch": 0.5462046204620462, + "grad_norm": 0.4799923300743103, + "learning_rate": 1.3652365236523653e-05, + "loss": 0.2166, + "num_input_tokens_seen": 1047904, + "step": 4965 + }, + { + "epoch": 0.5467546754675467, + "grad_norm": 0.8314235210418701, + "learning_rate": 1.3666116611661167e-05, + "loss": 0.2262, + "num_input_tokens_seen": 1048960, + "step": 4970 + }, + { + "epoch": 0.5473047304730473, + "grad_norm": 1.1322660446166992, + "learning_rate": 1.367986798679868e-05, + "loss": 0.2393, + "num_input_tokens_seen": 1049984, + "step": 4975 + }, + { + "epoch": 0.5478547854785478, + "grad_norm": 0.6306383609771729, + "learning_rate": 1.3693619361936195e-05, + "loss": 0.1883, + "num_input_tokens_seen": 1050976, + "step": 4980 + }, + { + "epoch": 0.5484048404840484, + "grad_norm": 0.4671180844306946, + "learning_rate": 1.3707370737073708e-05, + "loss": 0.1922, + "num_input_tokens_seen": 1052032, + "step": 4985 + }, + { + "epoch": 0.5489548954895489, + "grad_norm": 1.2058299779891968, + "learning_rate": 1.3721122112211222e-05, + "loss": 0.2159, + "num_input_tokens_seen": 1053056, + "step": 4990 + }, + { + "epoch": 0.5495049504950495, + "grad_norm": 0.8623759150505066, + "learning_rate": 1.3734873487348735e-05, + "loss": 0.1769, + "num_input_tokens_seen": 1054048, + "step": 4995 + }, + { + "epoch": 0.5500550055005501, + "grad_norm": 0.6041048765182495, + "learning_rate": 1.3748624862486248e-05, + "loss": 0.2141, + "num_input_tokens_seen": 1055104, + "step": 5000 + }, + { + "epoch": 0.5506050605060506, + "grad_norm": 0.5409774780273438, + "learning_rate": 1.3762376237623763e-05, + "loss": 0.2045, + "num_input_tokens_seen": 1056192, + "step": 5005 + }, + { + "epoch": 0.5511551155115512, + "grad_norm": 1.0184192657470703, + "learning_rate": 1.3776127612761278e-05, + "loss": 0.2115, + "num_input_tokens_seen": 1057312, + "step": 5010 + }, + { + "epoch": 0.5517051705170517, + "grad_norm": 0.32375118136405945, + "learning_rate": 1.3789878987898792e-05, + "loss": 0.157, + "num_input_tokens_seen": 1058368, + "step": 5015 + }, + { + "epoch": 0.5522552255225522, + "grad_norm": 0.5413753390312195, + "learning_rate": 1.3803630363036305e-05, + "loss": 0.2347, + "num_input_tokens_seen": 1059424, + "step": 5020 + }, + { + "epoch": 0.5528052805280528, + "grad_norm": 1.2179194688796997, + "learning_rate": 1.3817381738173818e-05, + "loss": 0.2256, + "num_input_tokens_seen": 1060448, + "step": 5025 + }, + { + "epoch": 0.5533553355335533, + "grad_norm": 0.9536162614822388, + "learning_rate": 1.3831133113311332e-05, + "loss": 0.1938, + "num_input_tokens_seen": 1061472, + "step": 5030 + }, + { + "epoch": 0.5539053905390539, + "grad_norm": 0.45127928256988525, + "learning_rate": 1.3844884488448845e-05, + "loss": 0.242, + "num_input_tokens_seen": 1062432, + "step": 5035 + }, + { + "epoch": 0.5544554455445545, + "grad_norm": 1.2344685792922974, + "learning_rate": 1.3858635863586358e-05, + "loss": 0.2107, + "num_input_tokens_seen": 1063456, + "step": 5040 + }, + { + "epoch": 0.555005500550055, + "grad_norm": 1.2380046844482422, + "learning_rate": 1.3872387238723872e-05, + "loss": 0.2417, + "num_input_tokens_seen": 1064480, + "step": 5045 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.641427218914032, + "learning_rate": 1.3886138613861389e-05, + "loss": 0.2262, + "num_input_tokens_seen": 1065536, + "step": 5050 + }, + { + "epoch": 0.5561056105610561, + "grad_norm": 0.7466795444488525, + "learning_rate": 1.3899889988998902e-05, + "loss": 0.1757, + "num_input_tokens_seen": 1066592, + "step": 5055 + }, + { + "epoch": 0.5566556655665567, + "grad_norm": 0.3821652829647064, + "learning_rate": 1.3913641364136415e-05, + "loss": 0.1798, + "num_input_tokens_seen": 1067616, + "step": 5060 + }, + { + "epoch": 0.5572057205720572, + "grad_norm": 0.873442530632019, + "learning_rate": 1.3927392739273929e-05, + "loss": 0.2298, + "num_input_tokens_seen": 1068672, + "step": 5065 + }, + { + "epoch": 0.5577557755775577, + "grad_norm": 0.4535605013370514, + "learning_rate": 1.3941144114411442e-05, + "loss": 0.3145, + "num_input_tokens_seen": 1069696, + "step": 5070 + }, + { + "epoch": 0.5583058305830583, + "grad_norm": 0.9998116493225098, + "learning_rate": 1.3954895489548955e-05, + "loss": 0.2278, + "num_input_tokens_seen": 1070752, + "step": 5075 + }, + { + "epoch": 0.5588558855885588, + "grad_norm": 0.8059996366500854, + "learning_rate": 1.3968646864686469e-05, + "loss": 0.1968, + "num_input_tokens_seen": 1071840, + "step": 5080 + }, + { + "epoch": 0.5594059405940595, + "grad_norm": 0.46543410420417786, + "learning_rate": 1.3982398239823982e-05, + "loss": 0.1917, + "num_input_tokens_seen": 1072928, + "step": 5085 + }, + { + "epoch": 0.55995599559956, + "grad_norm": 0.3332992494106293, + "learning_rate": 1.3996149614961495e-05, + "loss": 0.1991, + "num_input_tokens_seen": 1073888, + "step": 5090 + }, + { + "epoch": 0.5605060506050605, + "grad_norm": 0.7521728873252869, + "learning_rate": 1.4009900990099012e-05, + "loss": 0.1853, + "num_input_tokens_seen": 1074944, + "step": 5095 + }, + { + "epoch": 0.5610561056105611, + "grad_norm": 0.4927389323711395, + "learning_rate": 1.4023652365236525e-05, + "loss": 0.2275, + "num_input_tokens_seen": 1076096, + "step": 5100 + }, + { + "epoch": 0.5616061606160616, + "grad_norm": 0.7450542449951172, + "learning_rate": 1.4037403740374039e-05, + "loss": 0.1966, + "num_input_tokens_seen": 1077088, + "step": 5105 + }, + { + "epoch": 0.5621562156215621, + "grad_norm": 0.4107699990272522, + "learning_rate": 1.4051155115511552e-05, + "loss": 0.2298, + "num_input_tokens_seen": 1078144, + "step": 5110 + }, + { + "epoch": 0.5627062706270627, + "grad_norm": 0.6977160573005676, + "learning_rate": 1.4064906490649066e-05, + "loss": 0.2317, + "num_input_tokens_seen": 1079200, + "step": 5115 + }, + { + "epoch": 0.5632563256325632, + "grad_norm": 0.5700230002403259, + "learning_rate": 1.4078657865786579e-05, + "loss": 0.2173, + "num_input_tokens_seen": 1080256, + "step": 5120 + }, + { + "epoch": 0.5638063806380638, + "grad_norm": 0.6221499443054199, + "learning_rate": 1.4092409240924092e-05, + "loss": 0.208, + "num_input_tokens_seen": 1081344, + "step": 5125 + }, + { + "epoch": 0.5643564356435643, + "grad_norm": 0.5918216109275818, + "learning_rate": 1.4106160616061606e-05, + "loss": 0.1764, + "num_input_tokens_seen": 1082400, + "step": 5130 + }, + { + "epoch": 0.564906490649065, + "grad_norm": 0.46737611293792725, + "learning_rate": 1.4119911991199119e-05, + "loss": 0.2598, + "num_input_tokens_seen": 1083520, + "step": 5135 + }, + { + "epoch": 0.5654565456545655, + "grad_norm": 0.4260595738887787, + "learning_rate": 1.4133663366336632e-05, + "loss": 0.3027, + "num_input_tokens_seen": 1084576, + "step": 5140 + }, + { + "epoch": 0.566006600660066, + "grad_norm": 0.994447648525238, + "learning_rate": 1.4147414741474149e-05, + "loss": 0.2028, + "num_input_tokens_seen": 1085632, + "step": 5145 + }, + { + "epoch": 0.5665566556655666, + "grad_norm": 0.6605868935585022, + "learning_rate": 1.4161166116611662e-05, + "loss": 0.2465, + "num_input_tokens_seen": 1086656, + "step": 5150 + }, + { + "epoch": 0.5671067106710671, + "grad_norm": 0.6159387826919556, + "learning_rate": 1.4174917491749176e-05, + "loss": 0.1539, + "num_input_tokens_seen": 1087680, + "step": 5155 + }, + { + "epoch": 0.5676567656765676, + "grad_norm": 0.4537174701690674, + "learning_rate": 1.4188668866886689e-05, + "loss": 0.1973, + "num_input_tokens_seen": 1088704, + "step": 5160 + }, + { + "epoch": 0.5682068206820682, + "grad_norm": 1.015993595123291, + "learning_rate": 1.4202420242024202e-05, + "loss": 0.1822, + "num_input_tokens_seen": 1089728, + "step": 5165 + }, + { + "epoch": 0.5687568756875687, + "grad_norm": 0.42490604519844055, + "learning_rate": 1.4216171617161716e-05, + "loss": 0.2519, + "num_input_tokens_seen": 1090816, + "step": 5170 + }, + { + "epoch": 0.5693069306930693, + "grad_norm": 0.5351569652557373, + "learning_rate": 1.422992299229923e-05, + "loss": 0.2005, + "num_input_tokens_seen": 1091872, + "step": 5175 + }, + { + "epoch": 0.5698569856985699, + "grad_norm": 0.7589699625968933, + "learning_rate": 1.4243674367436744e-05, + "loss": 0.2763, + "num_input_tokens_seen": 1092992, + "step": 5180 + }, + { + "epoch": 0.5704070407040704, + "grad_norm": 0.6810176372528076, + "learning_rate": 1.4257425742574257e-05, + "loss": 0.2195, + "num_input_tokens_seen": 1094016, + "step": 5185 + }, + { + "epoch": 0.570957095709571, + "grad_norm": 0.6358166933059692, + "learning_rate": 1.4271177117711773e-05, + "loss": 0.181, + "num_input_tokens_seen": 1095104, + "step": 5190 + }, + { + "epoch": 0.5715071507150715, + "grad_norm": 0.2800721526145935, + "learning_rate": 1.4284928492849286e-05, + "loss": 0.1786, + "num_input_tokens_seen": 1096128, + "step": 5195 + }, + { + "epoch": 0.5720572057205721, + "grad_norm": 0.6115806102752686, + "learning_rate": 1.42986798679868e-05, + "loss": 0.1954, + "num_input_tokens_seen": 1097152, + "step": 5200 + }, + { + "epoch": 0.5726072607260726, + "grad_norm": 0.5078771710395813, + "learning_rate": 1.4312431243124314e-05, + "loss": 0.2197, + "num_input_tokens_seen": 1098176, + "step": 5205 + }, + { + "epoch": 0.5731573157315731, + "grad_norm": 0.783119261264801, + "learning_rate": 1.4326182618261828e-05, + "loss": 0.2128, + "num_input_tokens_seen": 1099168, + "step": 5210 + }, + { + "epoch": 0.5737073707370737, + "grad_norm": 0.45291122794151306, + "learning_rate": 1.4339933993399341e-05, + "loss": 0.2227, + "num_input_tokens_seen": 1100160, + "step": 5215 + }, + { + "epoch": 0.5742574257425742, + "grad_norm": 0.36735281348228455, + "learning_rate": 1.4353685368536854e-05, + "loss": 0.1984, + "num_input_tokens_seen": 1101184, + "step": 5220 + }, + { + "epoch": 0.5748074807480749, + "grad_norm": 0.38474780321121216, + "learning_rate": 1.4367436743674368e-05, + "loss": 0.1651, + "num_input_tokens_seen": 1102272, + "step": 5225 + }, + { + "epoch": 0.5753575357535754, + "grad_norm": 0.7367910146713257, + "learning_rate": 1.4381188118811881e-05, + "loss": 0.197, + "num_input_tokens_seen": 1103328, + "step": 5230 + }, + { + "epoch": 0.5759075907590759, + "grad_norm": 0.6537361741065979, + "learning_rate": 1.4394939493949398e-05, + "loss": 0.1808, + "num_input_tokens_seen": 1104384, + "step": 5235 + }, + { + "epoch": 0.5764576457645765, + "grad_norm": 0.40037330985069275, + "learning_rate": 1.4408690869086911e-05, + "loss": 0.1467, + "num_input_tokens_seen": 1105440, + "step": 5240 + }, + { + "epoch": 0.577007700770077, + "grad_norm": 0.5441921353340149, + "learning_rate": 1.4422442244224424e-05, + "loss": 0.2214, + "num_input_tokens_seen": 1106496, + "step": 5245 + }, + { + "epoch": 0.5775577557755776, + "grad_norm": 0.42916303873062134, + "learning_rate": 1.4436193619361938e-05, + "loss": 0.2024, + "num_input_tokens_seen": 1107552, + "step": 5250 + }, + { + "epoch": 0.5781078107810781, + "grad_norm": 0.8240891695022583, + "learning_rate": 1.4449944994499451e-05, + "loss": 0.185, + "num_input_tokens_seen": 1108640, + "step": 5255 + }, + { + "epoch": 0.5786578657865786, + "grad_norm": 1.4838982820510864, + "learning_rate": 1.4463696369636965e-05, + "loss": 0.2783, + "num_input_tokens_seen": 1109728, + "step": 5260 + }, + { + "epoch": 0.5792079207920792, + "grad_norm": 0.6966081857681274, + "learning_rate": 1.4477447744774478e-05, + "loss": 0.2579, + "num_input_tokens_seen": 1110784, + "step": 5265 + }, + { + "epoch": 0.5797579757975797, + "grad_norm": 0.6697250604629517, + "learning_rate": 1.4491199119911991e-05, + "loss": 0.17, + "num_input_tokens_seen": 1111840, + "step": 5270 + }, + { + "epoch": 0.5803080308030804, + "grad_norm": 0.6033509969711304, + "learning_rate": 1.4504950495049505e-05, + "loss": 0.2579, + "num_input_tokens_seen": 1112832, + "step": 5275 + }, + { + "epoch": 0.5808580858085809, + "grad_norm": 0.46986865997314453, + "learning_rate": 1.4518701870187018e-05, + "loss": 0.1946, + "num_input_tokens_seen": 1113920, + "step": 5280 + }, + { + "epoch": 0.5814081408140814, + "grad_norm": 0.6511896848678589, + "learning_rate": 1.4532453245324535e-05, + "loss": 0.2465, + "num_input_tokens_seen": 1115040, + "step": 5285 + }, + { + "epoch": 0.581958195819582, + "grad_norm": 0.6284393072128296, + "learning_rate": 1.4546204620462048e-05, + "loss": 0.1703, + "num_input_tokens_seen": 1116064, + "step": 5290 + }, + { + "epoch": 0.5825082508250825, + "grad_norm": 0.4336673319339752, + "learning_rate": 1.4559955995599561e-05, + "loss": 0.2361, + "num_input_tokens_seen": 1117120, + "step": 5295 + }, + { + "epoch": 0.583058305830583, + "grad_norm": 0.33698570728302, + "learning_rate": 1.4573707370737075e-05, + "loss": 0.2464, + "num_input_tokens_seen": 1118176, + "step": 5300 + }, + { + "epoch": 0.5836083608360836, + "grad_norm": 0.35824429988861084, + "learning_rate": 1.4587458745874588e-05, + "loss": 0.2265, + "num_input_tokens_seen": 1119232, + "step": 5305 + }, + { + "epoch": 0.5841584158415841, + "grad_norm": 0.7019874453544617, + "learning_rate": 1.4601210121012101e-05, + "loss": 0.1717, + "num_input_tokens_seen": 1120320, + "step": 5310 + }, + { + "epoch": 0.5847084708470847, + "grad_norm": 1.283165693283081, + "learning_rate": 1.4614961496149615e-05, + "loss": 0.2704, + "num_input_tokens_seen": 1121408, + "step": 5315 + }, + { + "epoch": 0.5852585258525853, + "grad_norm": 0.4572273790836334, + "learning_rate": 1.4628712871287128e-05, + "loss": 0.2142, + "num_input_tokens_seen": 1122496, + "step": 5320 + }, + { + "epoch": 0.5858085808580858, + "grad_norm": 0.3884008228778839, + "learning_rate": 1.4642464246424641e-05, + "loss": 0.2365, + "num_input_tokens_seen": 1123456, + "step": 5325 + }, + { + "epoch": 0.5863586358635864, + "grad_norm": 0.5008012056350708, + "learning_rate": 1.4656215621562158e-05, + "loss": 0.1808, + "num_input_tokens_seen": 1124512, + "step": 5330 + }, + { + "epoch": 0.5869086908690869, + "grad_norm": 0.5972585082054138, + "learning_rate": 1.4669966996699672e-05, + "loss": 0.2611, + "num_input_tokens_seen": 1125568, + "step": 5335 + }, + { + "epoch": 0.5874587458745875, + "grad_norm": 0.5476995706558228, + "learning_rate": 1.4683718371837185e-05, + "loss": 0.1813, + "num_input_tokens_seen": 1126624, + "step": 5340 + }, + { + "epoch": 0.588008800880088, + "grad_norm": 0.44038131833076477, + "learning_rate": 1.4697469746974698e-05, + "loss": 0.216, + "num_input_tokens_seen": 1127712, + "step": 5345 + }, + { + "epoch": 0.5885588558855885, + "grad_norm": 0.4205050766468048, + "learning_rate": 1.4711221122112212e-05, + "loss": 0.1769, + "num_input_tokens_seen": 1128736, + "step": 5350 + }, + { + "epoch": 0.5891089108910891, + "grad_norm": 0.9663904309272766, + "learning_rate": 1.4724972497249725e-05, + "loss": 0.1701, + "num_input_tokens_seen": 1129760, + "step": 5355 + }, + { + "epoch": 0.5896589658965896, + "grad_norm": 1.2689768075942993, + "learning_rate": 1.4738723872387238e-05, + "loss": 0.2204, + "num_input_tokens_seen": 1130880, + "step": 5360 + }, + { + "epoch": 0.5902090209020903, + "grad_norm": 0.6925552487373352, + "learning_rate": 1.4752475247524752e-05, + "loss": 0.1809, + "num_input_tokens_seen": 1131904, + "step": 5365 + }, + { + "epoch": 0.5907590759075908, + "grad_norm": 0.30846917629241943, + "learning_rate": 1.4766226622662267e-05, + "loss": 0.1967, + "num_input_tokens_seen": 1132960, + "step": 5370 + }, + { + "epoch": 0.5913091309130913, + "grad_norm": 0.9829226732254028, + "learning_rate": 1.477997799779978e-05, + "loss": 0.1894, + "num_input_tokens_seen": 1133952, + "step": 5375 + }, + { + "epoch": 0.5918591859185919, + "grad_norm": 0.31955191493034363, + "learning_rate": 1.4793729372937295e-05, + "loss": 0.1724, + "num_input_tokens_seen": 1135008, + "step": 5380 + }, + { + "epoch": 0.5924092409240924, + "grad_norm": 0.635518491268158, + "learning_rate": 1.4807480748074808e-05, + "loss": 0.1931, + "num_input_tokens_seen": 1136032, + "step": 5385 + }, + { + "epoch": 0.592959295929593, + "grad_norm": 0.43475642800331116, + "learning_rate": 1.4821232123212322e-05, + "loss": 0.199, + "num_input_tokens_seen": 1136992, + "step": 5390 + }, + { + "epoch": 0.5935093509350935, + "grad_norm": 0.48350948095321655, + "learning_rate": 1.4834983498349835e-05, + "loss": 0.1809, + "num_input_tokens_seen": 1138112, + "step": 5395 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.305255651473999, + "learning_rate": 1.484873487348735e-05, + "loss": 0.1945, + "num_input_tokens_seen": 1139136, + "step": 5400 + }, + { + "epoch": 0.5946094609460946, + "grad_norm": 0.8403945565223694, + "learning_rate": 1.4862486248624863e-05, + "loss": 0.2104, + "num_input_tokens_seen": 1140128, + "step": 5405 + }, + { + "epoch": 0.5951595159515951, + "grad_norm": 0.40080204606056213, + "learning_rate": 1.4876237623762377e-05, + "loss": 0.2005, + "num_input_tokens_seen": 1141216, + "step": 5410 + }, + { + "epoch": 0.5957095709570958, + "grad_norm": 0.5678466558456421, + "learning_rate": 1.488998899889989e-05, + "loss": 0.165, + "num_input_tokens_seen": 1142272, + "step": 5415 + }, + { + "epoch": 0.5962596259625963, + "grad_norm": 1.6811182498931885, + "learning_rate": 1.4903740374037404e-05, + "loss": 0.2954, + "num_input_tokens_seen": 1143328, + "step": 5420 + }, + { + "epoch": 0.5968096809680968, + "grad_norm": 0.3422979712486267, + "learning_rate": 1.4917491749174919e-05, + "loss": 0.1527, + "num_input_tokens_seen": 1144320, + "step": 5425 + }, + { + "epoch": 0.5973597359735974, + "grad_norm": 0.38258057832717896, + "learning_rate": 1.4931243124312434e-05, + "loss": 0.1755, + "num_input_tokens_seen": 1145376, + "step": 5430 + }, + { + "epoch": 0.5979097909790979, + "grad_norm": 0.5328806042671204, + "learning_rate": 1.4944994499449947e-05, + "loss": 0.1564, + "num_input_tokens_seen": 1146400, + "step": 5435 + }, + { + "epoch": 0.5984598459845984, + "grad_norm": 0.7373913526535034, + "learning_rate": 1.495874587458746e-05, + "loss": 0.2303, + "num_input_tokens_seen": 1147392, + "step": 5440 + }, + { + "epoch": 0.599009900990099, + "grad_norm": 0.852566123008728, + "learning_rate": 1.4972497249724974e-05, + "loss": 0.2148, + "num_input_tokens_seen": 1148448, + "step": 5445 + }, + { + "epoch": 0.5995599559955995, + "grad_norm": 0.3950382173061371, + "learning_rate": 1.4986248624862487e-05, + "loss": 0.1765, + "num_input_tokens_seen": 1149504, + "step": 5450 + }, + { + "epoch": 0.6001100110011001, + "grad_norm": 0.30462270975112915, + "learning_rate": 1.5e-05, + "loss": 0.1444, + "num_input_tokens_seen": 1150592, + "step": 5455 + }, + { + "epoch": 0.6006600660066007, + "grad_norm": 0.3914923667907715, + "learning_rate": 1.5013751375137514e-05, + "loss": 0.2077, + "num_input_tokens_seen": 1151648, + "step": 5460 + }, + { + "epoch": 0.6012101210121013, + "grad_norm": 0.6631340384483337, + "learning_rate": 1.5027502750275027e-05, + "loss": 0.2439, + "num_input_tokens_seen": 1152672, + "step": 5465 + }, + { + "epoch": 0.6017601760176018, + "grad_norm": 0.6991565823554993, + "learning_rate": 1.5041254125412544e-05, + "loss": 0.1915, + "num_input_tokens_seen": 1153728, + "step": 5470 + }, + { + "epoch": 0.6023102310231023, + "grad_norm": 0.5223761200904846, + "learning_rate": 1.5055005500550057e-05, + "loss": 0.2191, + "num_input_tokens_seen": 1154752, + "step": 5475 + }, + { + "epoch": 0.6028602860286029, + "grad_norm": 0.6256295442581177, + "learning_rate": 1.506875687568757e-05, + "loss": 0.1883, + "num_input_tokens_seen": 1155840, + "step": 5480 + }, + { + "epoch": 0.6034103410341034, + "grad_norm": 0.692446768283844, + "learning_rate": 1.5082508250825084e-05, + "loss": 0.191, + "num_input_tokens_seen": 1156928, + "step": 5485 + }, + { + "epoch": 0.6039603960396039, + "grad_norm": 0.6427645683288574, + "learning_rate": 1.5096259625962597e-05, + "loss": 0.192, + "num_input_tokens_seen": 1157984, + "step": 5490 + }, + { + "epoch": 0.6045104510451045, + "grad_norm": 0.4308859407901764, + "learning_rate": 1.511001100110011e-05, + "loss": 0.159, + "num_input_tokens_seen": 1159072, + "step": 5495 + }, + { + "epoch": 0.605060506050605, + "grad_norm": 1.1415784358978271, + "learning_rate": 1.5123762376237624e-05, + "loss": 0.2502, + "num_input_tokens_seen": 1160160, + "step": 5500 + }, + { + "epoch": 0.6056105610561056, + "grad_norm": 0.6450678110122681, + "learning_rate": 1.5137513751375137e-05, + "loss": 0.18, + "num_input_tokens_seen": 1161248, + "step": 5505 + }, + { + "epoch": 0.6061606160616062, + "grad_norm": 1.0091534852981567, + "learning_rate": 1.515126512651265e-05, + "loss": 0.216, + "num_input_tokens_seen": 1162272, + "step": 5510 + }, + { + "epoch": 0.6067106710671067, + "grad_norm": 0.4935312867164612, + "learning_rate": 1.5165016501650164e-05, + "loss": 0.2164, + "num_input_tokens_seen": 1163296, + "step": 5515 + }, + { + "epoch": 0.6072607260726073, + "grad_norm": 0.3496781587600708, + "learning_rate": 1.517876787678768e-05, + "loss": 0.1764, + "num_input_tokens_seen": 1164352, + "step": 5520 + }, + { + "epoch": 0.6078107810781078, + "grad_norm": 0.44003158807754517, + "learning_rate": 1.5192519251925194e-05, + "loss": 0.1557, + "num_input_tokens_seen": 1165344, + "step": 5525 + }, + { + "epoch": 0.6083608360836084, + "grad_norm": 0.8711504936218262, + "learning_rate": 1.5206270627062707e-05, + "loss": 0.1782, + "num_input_tokens_seen": 1166432, + "step": 5530 + }, + { + "epoch": 0.6089108910891089, + "grad_norm": 0.7351680397987366, + "learning_rate": 1.522002200220022e-05, + "loss": 0.2593, + "num_input_tokens_seen": 1167456, + "step": 5535 + }, + { + "epoch": 0.6094609460946094, + "grad_norm": 0.3289308547973633, + "learning_rate": 1.5233773377337734e-05, + "loss": 0.188, + "num_input_tokens_seen": 1168512, + "step": 5540 + }, + { + "epoch": 0.61001100110011, + "grad_norm": 0.47759532928466797, + "learning_rate": 1.5247524752475247e-05, + "loss": 0.1786, + "num_input_tokens_seen": 1169600, + "step": 5545 + }, + { + "epoch": 0.6105610561056105, + "grad_norm": 0.39530450105667114, + "learning_rate": 1.526127612761276e-05, + "loss": 0.1993, + "num_input_tokens_seen": 1170624, + "step": 5550 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.5977165102958679, + "learning_rate": 1.5275027502750276e-05, + "loss": 0.2077, + "num_input_tokens_seen": 1171680, + "step": 5555 + }, + { + "epoch": 0.6116611661166117, + "grad_norm": 0.6989103555679321, + "learning_rate": 1.5288778877887787e-05, + "loss": 0.2179, + "num_input_tokens_seen": 1172704, + "step": 5560 + }, + { + "epoch": 0.6122112211221122, + "grad_norm": 0.7928751111030579, + "learning_rate": 1.5302530253025306e-05, + "loss": 0.1761, + "num_input_tokens_seen": 1173760, + "step": 5565 + }, + { + "epoch": 0.6127612761276128, + "grad_norm": 1.4689582586288452, + "learning_rate": 1.5316281628162818e-05, + "loss": 0.2402, + "num_input_tokens_seen": 1174848, + "step": 5570 + }, + { + "epoch": 0.6133113311331133, + "grad_norm": 0.4959963262081146, + "learning_rate": 1.5330033003300333e-05, + "loss": 0.1963, + "num_input_tokens_seen": 1175872, + "step": 5575 + }, + { + "epoch": 0.6138613861386139, + "grad_norm": 0.7856466770172119, + "learning_rate": 1.5343784378437844e-05, + "loss": 0.2026, + "num_input_tokens_seen": 1176864, + "step": 5580 + }, + { + "epoch": 0.6144114411441144, + "grad_norm": 0.6670913100242615, + "learning_rate": 1.535753575357536e-05, + "loss": 0.1472, + "num_input_tokens_seen": 1177984, + "step": 5585 + }, + { + "epoch": 0.6149614961496149, + "grad_norm": 0.40291324257850647, + "learning_rate": 1.537128712871287e-05, + "loss": 0.1337, + "num_input_tokens_seen": 1179072, + "step": 5590 + }, + { + "epoch": 0.6155115511551155, + "grad_norm": 0.9173564314842224, + "learning_rate": 1.5385038503850386e-05, + "loss": 0.2154, + "num_input_tokens_seen": 1180128, + "step": 5595 + }, + { + "epoch": 0.6160616061606161, + "grad_norm": 0.5430124998092651, + "learning_rate": 1.5398789878987898e-05, + "loss": 0.1967, + "num_input_tokens_seen": 1181184, + "step": 5600 + }, + { + "epoch": 0.6166116611661167, + "grad_norm": 0.4538629353046417, + "learning_rate": 1.5412541254125413e-05, + "loss": 0.18, + "num_input_tokens_seen": 1182208, + "step": 5605 + }, + { + "epoch": 0.6171617161716172, + "grad_norm": 1.3192113637924194, + "learning_rate": 1.5426292629262928e-05, + "loss": 0.231, + "num_input_tokens_seen": 1183264, + "step": 5610 + }, + { + "epoch": 0.6177117711771177, + "grad_norm": 0.38945695757865906, + "learning_rate": 1.5440044004400443e-05, + "loss": 0.1684, + "num_input_tokens_seen": 1184256, + "step": 5615 + }, + { + "epoch": 0.6182618261826183, + "grad_norm": 0.3901664614677429, + "learning_rate": 1.5453795379537954e-05, + "loss": 0.2998, + "num_input_tokens_seen": 1185280, + "step": 5620 + }, + { + "epoch": 0.6188118811881188, + "grad_norm": 0.40475544333457947, + "learning_rate": 1.546754675467547e-05, + "loss": 0.164, + "num_input_tokens_seen": 1186368, + "step": 5625 + }, + { + "epoch": 0.6193619361936193, + "grad_norm": 0.4191018044948578, + "learning_rate": 1.548129812981298e-05, + "loss": 0.1761, + "num_input_tokens_seen": 1187424, + "step": 5630 + }, + { + "epoch": 0.6199119911991199, + "grad_norm": 0.42175713181495667, + "learning_rate": 1.5495049504950496e-05, + "loss": 0.1651, + "num_input_tokens_seen": 1188448, + "step": 5635 + }, + { + "epoch": 0.6204620462046204, + "grad_norm": 0.3704771101474762, + "learning_rate": 1.5508800880088008e-05, + "loss": 0.1938, + "num_input_tokens_seen": 1189504, + "step": 5640 + }, + { + "epoch": 0.621012101210121, + "grad_norm": 0.7593764662742615, + "learning_rate": 1.5522552255225523e-05, + "loss": 0.2028, + "num_input_tokens_seen": 1190592, + "step": 5645 + }, + { + "epoch": 0.6215621562156216, + "grad_norm": 0.3815988302230835, + "learning_rate": 1.5536303630363035e-05, + "loss": 0.1461, + "num_input_tokens_seen": 1191648, + "step": 5650 + }, + { + "epoch": 0.6221122112211221, + "grad_norm": 0.5899112820625305, + "learning_rate": 1.555005500550055e-05, + "loss": 0.2232, + "num_input_tokens_seen": 1192704, + "step": 5655 + }, + { + "epoch": 0.6226622662266227, + "grad_norm": 0.7541307210922241, + "learning_rate": 1.5563806380638065e-05, + "loss": 0.1984, + "num_input_tokens_seen": 1193728, + "step": 5660 + }, + { + "epoch": 0.6232123212321232, + "grad_norm": 0.3800145983695984, + "learning_rate": 1.557755775577558e-05, + "loss": 0.1674, + "num_input_tokens_seen": 1194784, + "step": 5665 + }, + { + "epoch": 0.6237623762376238, + "grad_norm": 0.40639859437942505, + "learning_rate": 1.559130913091309e-05, + "loss": 0.2033, + "num_input_tokens_seen": 1195808, + "step": 5670 + }, + { + "epoch": 0.6243124312431243, + "grad_norm": 0.4154425263404846, + "learning_rate": 1.5605060506050606e-05, + "loss": 0.2098, + "num_input_tokens_seen": 1196864, + "step": 5675 + }, + { + "epoch": 0.6248624862486248, + "grad_norm": 0.6357488036155701, + "learning_rate": 1.5618811881188118e-05, + "loss": 0.2854, + "num_input_tokens_seen": 1197952, + "step": 5680 + }, + { + "epoch": 0.6254125412541254, + "grad_norm": 1.2897452116012573, + "learning_rate": 1.5632563256325633e-05, + "loss": 0.2379, + "num_input_tokens_seen": 1198976, + "step": 5685 + }, + { + "epoch": 0.6259625962596259, + "grad_norm": 0.6517961621284485, + "learning_rate": 1.5646314631463148e-05, + "loss": 0.2222, + "num_input_tokens_seen": 1200032, + "step": 5690 + }, + { + "epoch": 0.6265126512651266, + "grad_norm": 0.7531476616859436, + "learning_rate": 1.566006600660066e-05, + "loss": 0.1755, + "num_input_tokens_seen": 1201120, + "step": 5695 + }, + { + "epoch": 0.6270627062706271, + "grad_norm": 0.4467290937900543, + "learning_rate": 1.5673817381738175e-05, + "loss": 0.2469, + "num_input_tokens_seen": 1202144, + "step": 5700 + }, + { + "epoch": 0.6276127612761276, + "grad_norm": 0.7974070310592651, + "learning_rate": 1.568756875687569e-05, + "loss": 0.2067, + "num_input_tokens_seen": 1203200, + "step": 5705 + }, + { + "epoch": 0.6281628162816282, + "grad_norm": 0.4276212155818939, + "learning_rate": 1.57013201320132e-05, + "loss": 0.2148, + "num_input_tokens_seen": 1204320, + "step": 5710 + }, + { + "epoch": 0.6287128712871287, + "grad_norm": 0.6439443230628967, + "learning_rate": 1.5715071507150717e-05, + "loss": 0.2114, + "num_input_tokens_seen": 1205344, + "step": 5715 + }, + { + "epoch": 0.6292629262926293, + "grad_norm": 0.7513719201087952, + "learning_rate": 1.572882288228823e-05, + "loss": 0.1495, + "num_input_tokens_seen": 1206336, + "step": 5720 + }, + { + "epoch": 0.6298129812981298, + "grad_norm": 0.30869176983833313, + "learning_rate": 1.5742574257425743e-05, + "loss": 0.2295, + "num_input_tokens_seen": 1207392, + "step": 5725 + }, + { + "epoch": 0.6303630363036303, + "grad_norm": 0.5151299834251404, + "learning_rate": 1.575632563256326e-05, + "loss": 0.1826, + "num_input_tokens_seen": 1208352, + "step": 5730 + }, + { + "epoch": 0.6309130913091309, + "grad_norm": 0.4333033859729767, + "learning_rate": 1.577007700770077e-05, + "loss": 0.1634, + "num_input_tokens_seen": 1209408, + "step": 5735 + }, + { + "epoch": 0.6314631463146315, + "grad_norm": 1.1543763875961304, + "learning_rate": 1.5783828382838285e-05, + "loss": 0.1999, + "num_input_tokens_seen": 1210432, + "step": 5740 + }, + { + "epoch": 0.6320132013201321, + "grad_norm": 0.502034604549408, + "learning_rate": 1.5797579757975797e-05, + "loss": 0.205, + "num_input_tokens_seen": 1211520, + "step": 5745 + }, + { + "epoch": 0.6325632563256326, + "grad_norm": 0.5231684446334839, + "learning_rate": 1.5811331133113312e-05, + "loss": 0.1613, + "num_input_tokens_seen": 1212608, + "step": 5750 + }, + { + "epoch": 0.6331133113311331, + "grad_norm": 0.7767606973648071, + "learning_rate": 1.5825082508250827e-05, + "loss": 0.2279, + "num_input_tokens_seen": 1213600, + "step": 5755 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 0.6712617874145508, + "learning_rate": 1.5838833883388342e-05, + "loss": 0.1983, + "num_input_tokens_seen": 1214656, + "step": 5760 + }, + { + "epoch": 0.6342134213421342, + "grad_norm": 0.42418500781059265, + "learning_rate": 1.5852585258525853e-05, + "loss": 0.1449, + "num_input_tokens_seen": 1215712, + "step": 5765 + }, + { + "epoch": 0.6347634763476347, + "grad_norm": 0.7821754813194275, + "learning_rate": 1.586633663366337e-05, + "loss": 0.2196, + "num_input_tokens_seen": 1216768, + "step": 5770 + }, + { + "epoch": 0.6353135313531353, + "grad_norm": 0.3535451292991638, + "learning_rate": 1.588008800880088e-05, + "loss": 0.1994, + "num_input_tokens_seen": 1217856, + "step": 5775 + }, + { + "epoch": 0.6358635863586358, + "grad_norm": 0.3621642589569092, + "learning_rate": 1.5893839383938395e-05, + "loss": 0.1602, + "num_input_tokens_seen": 1218880, + "step": 5780 + }, + { + "epoch": 0.6364136413641364, + "grad_norm": 0.5807592868804932, + "learning_rate": 1.5907590759075907e-05, + "loss": 0.1745, + "num_input_tokens_seen": 1219904, + "step": 5785 + }, + { + "epoch": 0.636963696369637, + "grad_norm": 0.9749211668968201, + "learning_rate": 1.5921342134213422e-05, + "loss": 0.2132, + "num_input_tokens_seen": 1220928, + "step": 5790 + }, + { + "epoch": 0.6375137513751375, + "grad_norm": 0.5970913767814636, + "learning_rate": 1.5935093509350934e-05, + "loss": 0.1917, + "num_input_tokens_seen": 1221952, + "step": 5795 + }, + { + "epoch": 0.6380638063806381, + "grad_norm": 0.46656835079193115, + "learning_rate": 1.5948844884488452e-05, + "loss": 0.2264, + "num_input_tokens_seen": 1223008, + "step": 5800 + }, + { + "epoch": 0.6386138613861386, + "grad_norm": 0.5354458689689636, + "learning_rate": 1.5962596259625964e-05, + "loss": 0.2135, + "num_input_tokens_seen": 1224032, + "step": 5805 + }, + { + "epoch": 0.6391639163916392, + "grad_norm": 0.6159695386886597, + "learning_rate": 1.597634763476348e-05, + "loss": 0.2177, + "num_input_tokens_seen": 1225056, + "step": 5810 + }, + { + "epoch": 0.6397139713971397, + "grad_norm": 0.6701444387435913, + "learning_rate": 1.599009900990099e-05, + "loss": 0.1819, + "num_input_tokens_seen": 1226176, + "step": 5815 + }, + { + "epoch": 0.6402640264026402, + "grad_norm": 0.25503334403038025, + "learning_rate": 1.6003850385038505e-05, + "loss": 0.1804, + "num_input_tokens_seen": 1227232, + "step": 5820 + }, + { + "epoch": 0.6408140814081408, + "grad_norm": 0.5719227194786072, + "learning_rate": 1.6017601760176017e-05, + "loss": 0.2758, + "num_input_tokens_seen": 1228256, + "step": 5825 + }, + { + "epoch": 0.6413641364136413, + "grad_norm": 0.283992737531662, + "learning_rate": 1.6031353135313532e-05, + "loss": 0.2222, + "num_input_tokens_seen": 1229344, + "step": 5830 + }, + { + "epoch": 0.641914191419142, + "grad_norm": 0.2826913595199585, + "learning_rate": 1.6045104510451044e-05, + "loss": 0.2243, + "num_input_tokens_seen": 1230432, + "step": 5835 + }, + { + "epoch": 0.6424642464246425, + "grad_norm": 0.40784841775894165, + "learning_rate": 1.605885588558856e-05, + "loss": 0.1869, + "num_input_tokens_seen": 1231488, + "step": 5840 + }, + { + "epoch": 0.643014301430143, + "grad_norm": 0.38614511489868164, + "learning_rate": 1.6072607260726074e-05, + "loss": 0.2112, + "num_input_tokens_seen": 1232512, + "step": 5845 + }, + { + "epoch": 0.6435643564356436, + "grad_norm": 0.9754448533058167, + "learning_rate": 1.608635863586359e-05, + "loss": 0.1976, + "num_input_tokens_seen": 1233632, + "step": 5850 + }, + { + "epoch": 0.6441144114411441, + "grad_norm": 0.2847738564014435, + "learning_rate": 1.61001100110011e-05, + "loss": 0.2247, + "num_input_tokens_seen": 1234624, + "step": 5855 + }, + { + "epoch": 0.6446644664466447, + "grad_norm": 0.7340969443321228, + "learning_rate": 1.6113861386138616e-05, + "loss": 0.1641, + "num_input_tokens_seen": 1235680, + "step": 5860 + }, + { + "epoch": 0.6452145214521452, + "grad_norm": 0.701762318611145, + "learning_rate": 1.6127612761276127e-05, + "loss": 0.2056, + "num_input_tokens_seen": 1236768, + "step": 5865 + }, + { + "epoch": 0.6457645764576457, + "grad_norm": 0.6927366852760315, + "learning_rate": 1.6141364136413642e-05, + "loss": 0.2324, + "num_input_tokens_seen": 1237856, + "step": 5870 + }, + { + "epoch": 0.6463146314631463, + "grad_norm": 0.2846428453922272, + "learning_rate": 1.6155115511551154e-05, + "loss": 0.2108, + "num_input_tokens_seen": 1238880, + "step": 5875 + }, + { + "epoch": 0.6468646864686468, + "grad_norm": 0.5980604887008667, + "learning_rate": 1.616886688668867e-05, + "loss": 0.2148, + "num_input_tokens_seen": 1239968, + "step": 5880 + }, + { + "epoch": 0.6474147414741475, + "grad_norm": 0.29395928978919983, + "learning_rate": 1.6182618261826184e-05, + "loss": 0.167, + "num_input_tokens_seen": 1241024, + "step": 5885 + }, + { + "epoch": 0.647964796479648, + "grad_norm": 0.5106644630432129, + "learning_rate": 1.6196369636963696e-05, + "loss": 0.2188, + "num_input_tokens_seen": 1242048, + "step": 5890 + }, + { + "epoch": 0.6485148514851485, + "grad_norm": 0.957036018371582, + "learning_rate": 1.621012101210121e-05, + "loss": 0.2575, + "num_input_tokens_seen": 1243040, + "step": 5895 + }, + { + "epoch": 0.6490649064906491, + "grad_norm": 0.4854499399662018, + "learning_rate": 1.6223872387238726e-05, + "loss": 0.2248, + "num_input_tokens_seen": 1244064, + "step": 5900 + }, + { + "epoch": 0.6496149614961496, + "grad_norm": 0.45618802309036255, + "learning_rate": 1.6237623762376237e-05, + "loss": 0.2112, + "num_input_tokens_seen": 1245088, + "step": 5905 + }, + { + "epoch": 0.6501650165016502, + "grad_norm": 0.47957536578178406, + "learning_rate": 1.6251375137513752e-05, + "loss": 0.2399, + "num_input_tokens_seen": 1246144, + "step": 5910 + }, + { + "epoch": 0.6507150715071507, + "grad_norm": 0.6135546565055847, + "learning_rate": 1.6265126512651267e-05, + "loss": 0.2382, + "num_input_tokens_seen": 1247232, + "step": 5915 + }, + { + "epoch": 0.6512651265126512, + "grad_norm": 0.8269505500793457, + "learning_rate": 1.627887788778878e-05, + "loss": 0.2436, + "num_input_tokens_seen": 1248288, + "step": 5920 + }, + { + "epoch": 0.6518151815181518, + "grad_norm": 0.39417901635169983, + "learning_rate": 1.6292629262926294e-05, + "loss": 0.218, + "num_input_tokens_seen": 1249376, + "step": 5925 + }, + { + "epoch": 0.6523652365236524, + "grad_norm": 0.54993736743927, + "learning_rate": 1.6306380638063806e-05, + "loss": 0.2116, + "num_input_tokens_seen": 1250432, + "step": 5930 + }, + { + "epoch": 0.652915291529153, + "grad_norm": 0.7996543049812317, + "learning_rate": 1.632013201320132e-05, + "loss": 0.2064, + "num_input_tokens_seen": 1251552, + "step": 5935 + }, + { + "epoch": 0.6534653465346535, + "grad_norm": 0.6243391633033752, + "learning_rate": 1.6333883388338836e-05, + "loss": 0.1891, + "num_input_tokens_seen": 1252608, + "step": 5940 + }, + { + "epoch": 0.654015401540154, + "grad_norm": 0.5315585136413574, + "learning_rate": 1.634763476347635e-05, + "loss": 0.2024, + "num_input_tokens_seen": 1253664, + "step": 5945 + }, + { + "epoch": 0.6545654565456546, + "grad_norm": 0.40344667434692383, + "learning_rate": 1.6361386138613863e-05, + "loss": 0.2541, + "num_input_tokens_seen": 1254720, + "step": 5950 + }, + { + "epoch": 0.6551155115511551, + "grad_norm": 0.7319573760032654, + "learning_rate": 1.6375137513751378e-05, + "loss": 0.1756, + "num_input_tokens_seen": 1255872, + "step": 5955 + }, + { + "epoch": 0.6556655665566556, + "grad_norm": 0.5894258618354797, + "learning_rate": 1.638888888888889e-05, + "loss": 0.163, + "num_input_tokens_seen": 1256928, + "step": 5960 + }, + { + "epoch": 0.6562156215621562, + "grad_norm": 0.4039192795753479, + "learning_rate": 1.6402640264026404e-05, + "loss": 0.1761, + "num_input_tokens_seen": 1257888, + "step": 5965 + }, + { + "epoch": 0.6567656765676567, + "grad_norm": 0.6019225716590881, + "learning_rate": 1.6416391639163916e-05, + "loss": 0.2184, + "num_input_tokens_seen": 1258944, + "step": 5970 + }, + { + "epoch": 0.6573157315731574, + "grad_norm": 0.6164206266403198, + "learning_rate": 1.643014301430143e-05, + "loss": 0.1969, + "num_input_tokens_seen": 1259936, + "step": 5975 + }, + { + "epoch": 0.6578657865786579, + "grad_norm": 0.4389398992061615, + "learning_rate": 1.6443894389438943e-05, + "loss": 0.1987, + "num_input_tokens_seen": 1260992, + "step": 5980 + }, + { + "epoch": 0.6584158415841584, + "grad_norm": 0.64046710729599, + "learning_rate": 1.645764576457646e-05, + "loss": 0.1765, + "num_input_tokens_seen": 1262016, + "step": 5985 + }, + { + "epoch": 0.658965896589659, + "grad_norm": 0.46057119965553284, + "learning_rate": 1.6471397139713973e-05, + "loss": 0.2394, + "num_input_tokens_seen": 1263072, + "step": 5990 + }, + { + "epoch": 0.6595159515951595, + "grad_norm": 0.946067750453949, + "learning_rate": 1.6485148514851488e-05, + "loss": 0.1776, + "num_input_tokens_seen": 1264160, + "step": 5995 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.6197021007537842, + "learning_rate": 1.6498899889989e-05, + "loss": 0.1726, + "num_input_tokens_seen": 1265152, + "step": 6000 + }, + { + "epoch": 0.6606160616061606, + "grad_norm": 0.6193330883979797, + "learning_rate": 1.6512651265126515e-05, + "loss": 0.1892, + "num_input_tokens_seen": 1266208, + "step": 6005 + }, + { + "epoch": 0.6611661166116611, + "grad_norm": 0.9222753643989563, + "learning_rate": 1.6526402640264026e-05, + "loss": 0.2687, + "num_input_tokens_seen": 1267264, + "step": 6010 + }, + { + "epoch": 0.6617161716171617, + "grad_norm": 0.9747940301895142, + "learning_rate": 1.654015401540154e-05, + "loss": 0.1772, + "num_input_tokens_seen": 1268320, + "step": 6015 + }, + { + "epoch": 0.6622662266226622, + "grad_norm": 1.2093687057495117, + "learning_rate": 1.6553905390539053e-05, + "loss": 0.2516, + "num_input_tokens_seen": 1269312, + "step": 6020 + }, + { + "epoch": 0.6628162816281629, + "grad_norm": 0.42787039279937744, + "learning_rate": 1.6567656765676568e-05, + "loss": 0.2295, + "num_input_tokens_seen": 1270400, + "step": 6025 + }, + { + "epoch": 0.6633663366336634, + "grad_norm": 0.49024754762649536, + "learning_rate": 1.658140814081408e-05, + "loss": 0.1775, + "num_input_tokens_seen": 1271488, + "step": 6030 + }, + { + "epoch": 0.6639163916391639, + "grad_norm": 0.7379528284072876, + "learning_rate": 1.6595159515951598e-05, + "loss": 0.1989, + "num_input_tokens_seen": 1272544, + "step": 6035 + }, + { + "epoch": 0.6644664466446645, + "grad_norm": 0.3725186884403229, + "learning_rate": 1.660891089108911e-05, + "loss": 0.191, + "num_input_tokens_seen": 1273568, + "step": 6040 + }, + { + "epoch": 0.665016501650165, + "grad_norm": 0.4269629120826721, + "learning_rate": 1.6622662266226625e-05, + "loss": 0.1867, + "num_input_tokens_seen": 1274656, + "step": 6045 + }, + { + "epoch": 0.6655665566556656, + "grad_norm": 0.4894213378429413, + "learning_rate": 1.6636413641364136e-05, + "loss": 0.187, + "num_input_tokens_seen": 1275712, + "step": 6050 + }, + { + "epoch": 0.6661166116611661, + "grad_norm": 0.4767107367515564, + "learning_rate": 1.665016501650165e-05, + "loss": 0.206, + "num_input_tokens_seen": 1276736, + "step": 6055 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.47223928570747375, + "learning_rate": 1.6663916391639163e-05, + "loss": 0.1397, + "num_input_tokens_seen": 1277760, + "step": 6060 + }, + { + "epoch": 0.6672167216721672, + "grad_norm": 0.7952574491500854, + "learning_rate": 1.6677667766776678e-05, + "loss": 0.2564, + "num_input_tokens_seen": 1278816, + "step": 6065 + }, + { + "epoch": 0.6677667766776678, + "grad_norm": 0.5067786574363708, + "learning_rate": 1.669141914191419e-05, + "loss": 0.1687, + "num_input_tokens_seen": 1279872, + "step": 6070 + }, + { + "epoch": 0.6683168316831684, + "grad_norm": 0.4399937391281128, + "learning_rate": 1.6705170517051705e-05, + "loss": 0.1817, + "num_input_tokens_seen": 1281024, + "step": 6075 + }, + { + "epoch": 0.6688668866886689, + "grad_norm": 0.4084644019603729, + "learning_rate": 1.671892189218922e-05, + "loss": 0.1869, + "num_input_tokens_seen": 1282048, + "step": 6080 + }, + { + "epoch": 0.6694169416941694, + "grad_norm": 0.4209664463996887, + "learning_rate": 1.6732673267326735e-05, + "loss": 0.1697, + "num_input_tokens_seen": 1283104, + "step": 6085 + }, + { + "epoch": 0.66996699669967, + "grad_norm": 0.9729294180870056, + "learning_rate": 1.6746424642464247e-05, + "loss": 0.1658, + "num_input_tokens_seen": 1284160, + "step": 6090 + }, + { + "epoch": 0.6705170517051705, + "grad_norm": 0.6126724481582642, + "learning_rate": 1.676017601760176e-05, + "loss": 0.1879, + "num_input_tokens_seen": 1285152, + "step": 6095 + }, + { + "epoch": 0.671067106710671, + "grad_norm": 0.5865444540977478, + "learning_rate": 1.6773927392739273e-05, + "loss": 0.1845, + "num_input_tokens_seen": 1286240, + "step": 6100 + }, + { + "epoch": 0.6716171617161716, + "grad_norm": 0.4932177662849426, + "learning_rate": 1.678767876787679e-05, + "loss": 0.1845, + "num_input_tokens_seen": 1287328, + "step": 6105 + }, + { + "epoch": 0.6721672167216721, + "grad_norm": 0.4799947440624237, + "learning_rate": 1.6801430143014303e-05, + "loss": 0.1663, + "num_input_tokens_seen": 1288416, + "step": 6110 + }, + { + "epoch": 0.6727172717271728, + "grad_norm": 0.24591322243213654, + "learning_rate": 1.6815181518151815e-05, + "loss": 0.1348, + "num_input_tokens_seen": 1289536, + "step": 6115 + }, + { + "epoch": 0.6732673267326733, + "grad_norm": 0.5371720790863037, + "learning_rate": 1.682893289328933e-05, + "loss": 0.2061, + "num_input_tokens_seen": 1290592, + "step": 6120 + }, + { + "epoch": 0.6738173817381738, + "grad_norm": 0.8492424488067627, + "learning_rate": 1.6842684268426842e-05, + "loss": 0.1968, + "num_input_tokens_seen": 1291616, + "step": 6125 + }, + { + "epoch": 0.6743674367436744, + "grad_norm": 0.3832399249076843, + "learning_rate": 1.6856435643564357e-05, + "loss": 0.1733, + "num_input_tokens_seen": 1292672, + "step": 6130 + }, + { + "epoch": 0.6749174917491749, + "grad_norm": 0.6103402972221375, + "learning_rate": 1.6870187018701872e-05, + "loss": 0.244, + "num_input_tokens_seen": 1293728, + "step": 6135 + }, + { + "epoch": 0.6754675467546755, + "grad_norm": 0.3542461097240448, + "learning_rate": 1.6883938393839387e-05, + "loss": 0.108, + "num_input_tokens_seen": 1294784, + "step": 6140 + }, + { + "epoch": 0.676017601760176, + "grad_norm": 0.44518399238586426, + "learning_rate": 1.68976897689769e-05, + "loss": 0.1538, + "num_input_tokens_seen": 1295872, + "step": 6145 + }, + { + "epoch": 0.6765676567656765, + "grad_norm": 0.29791611433029175, + "learning_rate": 1.6911441144114414e-05, + "loss": 0.1218, + "num_input_tokens_seen": 1296896, + "step": 6150 + }, + { + "epoch": 0.6771177117711771, + "grad_norm": 0.5810091495513916, + "learning_rate": 1.6925192519251925e-05, + "loss": 0.1901, + "num_input_tokens_seen": 1297952, + "step": 6155 + }, + { + "epoch": 0.6776677667766776, + "grad_norm": 0.8545283675193787, + "learning_rate": 1.693894389438944e-05, + "loss": 0.2241, + "num_input_tokens_seen": 1298976, + "step": 6160 + }, + { + "epoch": 0.6782178217821783, + "grad_norm": 0.8776722550392151, + "learning_rate": 1.6952695269526952e-05, + "loss": 0.187, + "num_input_tokens_seen": 1300096, + "step": 6165 + }, + { + "epoch": 0.6787678767876788, + "grad_norm": 0.46241119503974915, + "learning_rate": 1.6966446644664467e-05, + "loss": 0.2302, + "num_input_tokens_seen": 1301152, + "step": 6170 + }, + { + "epoch": 0.6793179317931793, + "grad_norm": 0.453264981508255, + "learning_rate": 1.6980198019801982e-05, + "loss": 0.2025, + "num_input_tokens_seen": 1302176, + "step": 6175 + }, + { + "epoch": 0.6798679867986799, + "grad_norm": 0.425784707069397, + "learning_rate": 1.6993949394939497e-05, + "loss": 0.1512, + "num_input_tokens_seen": 1303232, + "step": 6180 + }, + { + "epoch": 0.6804180418041804, + "grad_norm": 0.7332516312599182, + "learning_rate": 1.700770077007701e-05, + "loss": 0.1704, + "num_input_tokens_seen": 1304320, + "step": 6185 + }, + { + "epoch": 0.680968096809681, + "grad_norm": 1.0724416971206665, + "learning_rate": 1.7021452145214524e-05, + "loss": 0.1779, + "num_input_tokens_seen": 1305312, + "step": 6190 + }, + { + "epoch": 0.6815181518151815, + "grad_norm": 1.1575886011123657, + "learning_rate": 1.7035203520352035e-05, + "loss": 0.2112, + "num_input_tokens_seen": 1306304, + "step": 6195 + }, + { + "epoch": 0.682068206820682, + "grad_norm": 0.7234195470809937, + "learning_rate": 1.704895489548955e-05, + "loss": 0.1721, + "num_input_tokens_seen": 1307328, + "step": 6200 + }, + { + "epoch": 0.6826182618261826, + "grad_norm": 0.8243107795715332, + "learning_rate": 1.7062706270627062e-05, + "loss": 0.2186, + "num_input_tokens_seen": 1308384, + "step": 6205 + }, + { + "epoch": 0.6831683168316832, + "grad_norm": 0.7230414748191833, + "learning_rate": 1.7076457645764577e-05, + "loss": 0.1838, + "num_input_tokens_seen": 1309440, + "step": 6210 + }, + { + "epoch": 0.6837183718371838, + "grad_norm": 0.5196347236633301, + "learning_rate": 1.709020902090209e-05, + "loss": 0.1434, + "num_input_tokens_seen": 1310528, + "step": 6215 + }, + { + "epoch": 0.6842684268426843, + "grad_norm": 0.5403173565864563, + "learning_rate": 1.7103960396039607e-05, + "loss": 0.2299, + "num_input_tokens_seen": 1311584, + "step": 6220 + }, + { + "epoch": 0.6848184818481848, + "grad_norm": 0.6301103234291077, + "learning_rate": 1.711771177117712e-05, + "loss": 0.1855, + "num_input_tokens_seen": 1312640, + "step": 6225 + }, + { + "epoch": 0.6853685368536854, + "grad_norm": 0.8049775958061218, + "learning_rate": 1.7131463146314634e-05, + "loss": 0.1552, + "num_input_tokens_seen": 1313696, + "step": 6230 + }, + { + "epoch": 0.6859185918591859, + "grad_norm": 0.5073230266571045, + "learning_rate": 1.7145214521452146e-05, + "loss": 0.1992, + "num_input_tokens_seen": 1314784, + "step": 6235 + }, + { + "epoch": 0.6864686468646864, + "grad_norm": 0.5088921189308167, + "learning_rate": 1.715896589658966e-05, + "loss": 0.2354, + "num_input_tokens_seen": 1315872, + "step": 6240 + }, + { + "epoch": 0.687018701870187, + "grad_norm": 0.39181578159332275, + "learning_rate": 1.7172717271727172e-05, + "loss": 0.2047, + "num_input_tokens_seen": 1316960, + "step": 6245 + }, + { + "epoch": 0.6875687568756875, + "grad_norm": 0.5805835723876953, + "learning_rate": 1.7186468646864687e-05, + "loss": 0.1645, + "num_input_tokens_seen": 1318016, + "step": 6250 + }, + { + "epoch": 0.6881188118811881, + "grad_norm": 0.7944268584251404, + "learning_rate": 1.72002200220022e-05, + "loss": 0.2362, + "num_input_tokens_seen": 1319072, + "step": 6255 + }, + { + "epoch": 0.6886688668866887, + "grad_norm": 0.5991455912590027, + "learning_rate": 1.7213971397139714e-05, + "loss": 0.1821, + "num_input_tokens_seen": 1320128, + "step": 6260 + }, + { + "epoch": 0.6892189218921893, + "grad_norm": 0.7849922180175781, + "learning_rate": 1.7227722772277226e-05, + "loss": 0.2023, + "num_input_tokens_seen": 1321184, + "step": 6265 + }, + { + "epoch": 0.6897689768976898, + "grad_norm": 0.4007514715194702, + "learning_rate": 1.7241474147414744e-05, + "loss": 0.1799, + "num_input_tokens_seen": 1322240, + "step": 6270 + }, + { + "epoch": 0.6903190319031903, + "grad_norm": 1.0941194295883179, + "learning_rate": 1.7255225522552256e-05, + "loss": 0.1476, + "num_input_tokens_seen": 1323296, + "step": 6275 + }, + { + "epoch": 0.6908690869086909, + "grad_norm": 0.45769938826560974, + "learning_rate": 1.726897689768977e-05, + "loss": 0.1937, + "num_input_tokens_seen": 1324320, + "step": 6280 + }, + { + "epoch": 0.6914191419141914, + "grad_norm": 0.7716036438941956, + "learning_rate": 1.7282728272827282e-05, + "loss": 0.2031, + "num_input_tokens_seen": 1325408, + "step": 6285 + }, + { + "epoch": 0.6919691969196919, + "grad_norm": 0.5625100135803223, + "learning_rate": 1.7296479647964798e-05, + "loss": 0.1691, + "num_input_tokens_seen": 1326496, + "step": 6290 + }, + { + "epoch": 0.6925192519251925, + "grad_norm": 1.5911228656768799, + "learning_rate": 1.731023102310231e-05, + "loss": 0.3523, + "num_input_tokens_seen": 1327584, + "step": 6295 + }, + { + "epoch": 0.693069306930693, + "grad_norm": 0.4713602066040039, + "learning_rate": 1.7323982398239824e-05, + "loss": 0.1683, + "num_input_tokens_seen": 1328640, + "step": 6300 + }, + { + "epoch": 0.6936193619361937, + "grad_norm": 0.6539673209190369, + "learning_rate": 1.733773377337734e-05, + "loss": 0.1608, + "num_input_tokens_seen": 1329728, + "step": 6305 + }, + { + "epoch": 0.6941694169416942, + "grad_norm": 0.6895763874053955, + "learning_rate": 1.735148514851485e-05, + "loss": 0.2572, + "num_input_tokens_seen": 1330816, + "step": 6310 + }, + { + "epoch": 0.6947194719471947, + "grad_norm": 0.9388552308082581, + "learning_rate": 1.7365236523652366e-05, + "loss": 0.1935, + "num_input_tokens_seen": 1331872, + "step": 6315 + }, + { + "epoch": 0.6952695269526953, + "grad_norm": 0.5699750185012817, + "learning_rate": 1.737898789878988e-05, + "loss": 0.1793, + "num_input_tokens_seen": 1332896, + "step": 6320 + }, + { + "epoch": 0.6958195819581958, + "grad_norm": 1.2642695903778076, + "learning_rate": 1.7392739273927393e-05, + "loss": 0.2187, + "num_input_tokens_seen": 1333952, + "step": 6325 + }, + { + "epoch": 0.6963696369636964, + "grad_norm": 0.433734655380249, + "learning_rate": 1.7406490649064908e-05, + "loss": 0.1611, + "num_input_tokens_seen": 1335040, + "step": 6330 + }, + { + "epoch": 0.6969196919691969, + "grad_norm": 0.4488394558429718, + "learning_rate": 1.7420242024202423e-05, + "loss": 0.1242, + "num_input_tokens_seen": 1336096, + "step": 6335 + }, + { + "epoch": 0.6974697469746974, + "grad_norm": 0.4859464168548584, + "learning_rate": 1.7433993399339934e-05, + "loss": 0.1944, + "num_input_tokens_seen": 1337152, + "step": 6340 + }, + { + "epoch": 0.698019801980198, + "grad_norm": 0.9566263556480408, + "learning_rate": 1.744774477447745e-05, + "loss": 0.1615, + "num_input_tokens_seen": 1338272, + "step": 6345 + }, + { + "epoch": 0.6985698569856986, + "grad_norm": 0.4754674434661865, + "learning_rate": 1.746149614961496e-05, + "loss": 0.1678, + "num_input_tokens_seen": 1339296, + "step": 6350 + }, + { + "epoch": 0.6991199119911992, + "grad_norm": 0.4327283203601837, + "learning_rate": 1.7475247524752476e-05, + "loss": 0.1572, + "num_input_tokens_seen": 1340320, + "step": 6355 + }, + { + "epoch": 0.6996699669966997, + "grad_norm": 1.2769664525985718, + "learning_rate": 1.748899889988999e-05, + "loss": 0.203, + "num_input_tokens_seen": 1341408, + "step": 6360 + }, + { + "epoch": 0.7002200220022002, + "grad_norm": 0.6113364100456238, + "learning_rate": 1.7502750275027506e-05, + "loss": 0.1814, + "num_input_tokens_seen": 1342432, + "step": 6365 + }, + { + "epoch": 0.7007700770077008, + "grad_norm": 0.706778883934021, + "learning_rate": 1.7516501650165018e-05, + "loss": 0.2677, + "num_input_tokens_seen": 1343456, + "step": 6370 + }, + { + "epoch": 0.7013201320132013, + "grad_norm": 0.32694923877716064, + "learning_rate": 1.7530253025302533e-05, + "loss": 0.1672, + "num_input_tokens_seen": 1344512, + "step": 6375 + }, + { + "epoch": 0.7018701870187019, + "grad_norm": 0.5618005990982056, + "learning_rate": 1.7544004400440045e-05, + "loss": 0.1529, + "num_input_tokens_seen": 1345504, + "step": 6380 + }, + { + "epoch": 0.7024202420242024, + "grad_norm": 0.9021275639533997, + "learning_rate": 1.755775577557756e-05, + "loss": 0.1816, + "num_input_tokens_seen": 1346560, + "step": 6385 + }, + { + "epoch": 0.7029702970297029, + "grad_norm": 0.6541978120803833, + "learning_rate": 1.757150715071507e-05, + "loss": 0.2441, + "num_input_tokens_seen": 1347648, + "step": 6390 + }, + { + "epoch": 0.7035203520352035, + "grad_norm": 0.34985455870628357, + "learning_rate": 1.7585258525852586e-05, + "loss": 0.1423, + "num_input_tokens_seen": 1348704, + "step": 6395 + }, + { + "epoch": 0.7040704070407041, + "grad_norm": 0.3284077048301697, + "learning_rate": 1.7599009900990098e-05, + "loss": 0.1684, + "num_input_tokens_seen": 1349824, + "step": 6400 + }, + { + "epoch": 0.7046204620462047, + "grad_norm": 0.38676974177360535, + "learning_rate": 1.7612761276127613e-05, + "loss": 0.138, + "num_input_tokens_seen": 1350816, + "step": 6405 + }, + { + "epoch": 0.7051705170517052, + "grad_norm": 0.4335177540779114, + "learning_rate": 1.7626512651265128e-05, + "loss": 0.2768, + "num_input_tokens_seen": 1351840, + "step": 6410 + }, + { + "epoch": 0.7057205720572057, + "grad_norm": 0.6535833477973938, + "learning_rate": 1.7640264026402643e-05, + "loss": 0.1551, + "num_input_tokens_seen": 1352896, + "step": 6415 + }, + { + "epoch": 0.7062706270627063, + "grad_norm": 0.6787734031677246, + "learning_rate": 1.7654015401540155e-05, + "loss": 0.1801, + "num_input_tokens_seen": 1354016, + "step": 6420 + }, + { + "epoch": 0.7068206820682068, + "grad_norm": 0.8367506861686707, + "learning_rate": 1.766776677667767e-05, + "loss": 0.1614, + "num_input_tokens_seen": 1355104, + "step": 6425 + }, + { + "epoch": 0.7073707370737073, + "grad_norm": 0.77280592918396, + "learning_rate": 1.768151815181518e-05, + "loss": 0.1951, + "num_input_tokens_seen": 1356128, + "step": 6430 + }, + { + "epoch": 0.7079207920792079, + "grad_norm": 0.3674517273902893, + "learning_rate": 1.7695269526952696e-05, + "loss": 0.1501, + "num_input_tokens_seen": 1357184, + "step": 6435 + }, + { + "epoch": 0.7084708470847084, + "grad_norm": 0.3546234667301178, + "learning_rate": 1.7709020902090208e-05, + "loss": 0.1305, + "num_input_tokens_seen": 1358208, + "step": 6440 + }, + { + "epoch": 0.7090209020902091, + "grad_norm": 0.6570871472358704, + "learning_rate": 1.7722772277227723e-05, + "loss": 0.2515, + "num_input_tokens_seen": 1359232, + "step": 6445 + }, + { + "epoch": 0.7095709570957096, + "grad_norm": 0.523777961730957, + "learning_rate": 1.7736523652365235e-05, + "loss": 0.1766, + "num_input_tokens_seen": 1360288, + "step": 6450 + }, + { + "epoch": 0.7101210121012101, + "grad_norm": 0.5206857323646545, + "learning_rate": 1.7750275027502753e-05, + "loss": 0.1804, + "num_input_tokens_seen": 1361344, + "step": 6455 + }, + { + "epoch": 0.7106710671067107, + "grad_norm": 0.3291563093662262, + "learning_rate": 1.7764026402640265e-05, + "loss": 0.1666, + "num_input_tokens_seen": 1362432, + "step": 6460 + }, + { + "epoch": 0.7112211221122112, + "grad_norm": 0.666671633720398, + "learning_rate": 1.777777777777778e-05, + "loss": 0.1728, + "num_input_tokens_seen": 1363488, + "step": 6465 + }, + { + "epoch": 0.7117711771177118, + "grad_norm": 0.41943520307540894, + "learning_rate": 1.779152915291529e-05, + "loss": 0.1601, + "num_input_tokens_seen": 1364608, + "step": 6470 + }, + { + "epoch": 0.7123212321232123, + "grad_norm": 0.7767130732536316, + "learning_rate": 1.7805280528052807e-05, + "loss": 0.196, + "num_input_tokens_seen": 1365600, + "step": 6475 + }, + { + "epoch": 0.7128712871287128, + "grad_norm": 0.43857643008232117, + "learning_rate": 1.781903190319032e-05, + "loss": 0.1712, + "num_input_tokens_seen": 1366624, + "step": 6480 + }, + { + "epoch": 0.7134213421342134, + "grad_norm": 0.7644559741020203, + "learning_rate": 1.7832783278327833e-05, + "loss": 0.2421, + "num_input_tokens_seen": 1367616, + "step": 6485 + }, + { + "epoch": 0.713971397139714, + "grad_norm": 0.32686758041381836, + "learning_rate": 1.7846534653465345e-05, + "loss": 0.1633, + "num_input_tokens_seen": 1368736, + "step": 6490 + }, + { + "epoch": 0.7145214521452146, + "grad_norm": 0.36859333515167236, + "learning_rate": 1.786028602860286e-05, + "loss": 0.2204, + "num_input_tokens_seen": 1369792, + "step": 6495 + }, + { + "epoch": 0.7150715071507151, + "grad_norm": 1.350877285003662, + "learning_rate": 1.7874037403740375e-05, + "loss": 0.2101, + "num_input_tokens_seen": 1370880, + "step": 6500 + }, + { + "epoch": 0.7156215621562156, + "grad_norm": 0.5571721196174622, + "learning_rate": 1.788778877887789e-05, + "loss": 0.2164, + "num_input_tokens_seen": 1371968, + "step": 6505 + }, + { + "epoch": 0.7161716171617162, + "grad_norm": 0.4601607620716095, + "learning_rate": 1.7901540154015402e-05, + "loss": 0.2181, + "num_input_tokens_seen": 1373088, + "step": 6510 + }, + { + "epoch": 0.7167216721672167, + "grad_norm": 0.7459254264831543, + "learning_rate": 1.7915291529152917e-05, + "loss": 0.1697, + "num_input_tokens_seen": 1374112, + "step": 6515 + }, + { + "epoch": 0.7172717271727173, + "grad_norm": 1.2582356929779053, + "learning_rate": 1.792904290429043e-05, + "loss": 0.2059, + "num_input_tokens_seen": 1375200, + "step": 6520 + }, + { + "epoch": 0.7178217821782178, + "grad_norm": 0.5275422930717468, + "learning_rate": 1.7942794279427944e-05, + "loss": 0.1715, + "num_input_tokens_seen": 1376224, + "step": 6525 + }, + { + "epoch": 0.7183718371837183, + "grad_norm": 0.562954843044281, + "learning_rate": 1.795654565456546e-05, + "loss": 0.2316, + "num_input_tokens_seen": 1377344, + "step": 6530 + }, + { + "epoch": 0.7189218921892189, + "grad_norm": 0.6069186329841614, + "learning_rate": 1.797029702970297e-05, + "loss": 0.2082, + "num_input_tokens_seen": 1378400, + "step": 6535 + }, + { + "epoch": 0.7194719471947195, + "grad_norm": 1.7345566749572754, + "learning_rate": 1.7984048404840485e-05, + "loss": 0.2458, + "num_input_tokens_seen": 1379520, + "step": 6540 + }, + { + "epoch": 0.7200220022002201, + "grad_norm": 0.3558056950569153, + "learning_rate": 1.7997799779977997e-05, + "loss": 0.202, + "num_input_tokens_seen": 1380544, + "step": 6545 + }, + { + "epoch": 0.7205720572057206, + "grad_norm": 0.6204750537872314, + "learning_rate": 1.8011551155115512e-05, + "loss": 0.178, + "num_input_tokens_seen": 1381600, + "step": 6550 + }, + { + "epoch": 0.7211221122112211, + "grad_norm": 0.6343278288841248, + "learning_rate": 1.8025302530253027e-05, + "loss": 0.1612, + "num_input_tokens_seen": 1382688, + "step": 6555 + }, + { + "epoch": 0.7216721672167217, + "grad_norm": 0.5725173950195312, + "learning_rate": 1.8039053905390542e-05, + "loss": 0.2209, + "num_input_tokens_seen": 1383776, + "step": 6560 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.5462890863418579, + "learning_rate": 1.8052805280528054e-05, + "loss": 0.168, + "num_input_tokens_seen": 1384736, + "step": 6565 + }, + { + "epoch": 0.7227722772277227, + "grad_norm": 0.9752607345581055, + "learning_rate": 1.806655665566557e-05, + "loss": 0.1895, + "num_input_tokens_seen": 1385760, + "step": 6570 + }, + { + "epoch": 0.7233223322332233, + "grad_norm": 1.3875365257263184, + "learning_rate": 1.808030803080308e-05, + "loss": 0.305, + "num_input_tokens_seen": 1386816, + "step": 6575 + }, + { + "epoch": 0.7238723872387238, + "grad_norm": 1.506052851676941, + "learning_rate": 1.8094059405940595e-05, + "loss": 0.1997, + "num_input_tokens_seen": 1387872, + "step": 6580 + }, + { + "epoch": 0.7244224422442245, + "grad_norm": 0.6042221188545227, + "learning_rate": 1.8107810781078107e-05, + "loss": 0.1929, + "num_input_tokens_seen": 1388896, + "step": 6585 + }, + { + "epoch": 0.724972497249725, + "grad_norm": 0.4207039177417755, + "learning_rate": 1.8121562156215622e-05, + "loss": 0.1612, + "num_input_tokens_seen": 1390016, + "step": 6590 + }, + { + "epoch": 0.7255225522552256, + "grad_norm": 0.5583294034004211, + "learning_rate": 1.8135313531353137e-05, + "loss": 0.2277, + "num_input_tokens_seen": 1391072, + "step": 6595 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.5145863890647888, + "learning_rate": 1.8149064906490652e-05, + "loss": 0.2162, + "num_input_tokens_seen": 1392128, + "step": 6600 + }, + { + "epoch": 0.7266226622662266, + "grad_norm": 1.917418122291565, + "learning_rate": 1.8162816281628164e-05, + "loss": 0.3241, + "num_input_tokens_seen": 1393216, + "step": 6605 + }, + { + "epoch": 0.7271727172717272, + "grad_norm": 0.8625128865242004, + "learning_rate": 1.817656765676568e-05, + "loss": 0.2, + "num_input_tokens_seen": 1394304, + "step": 6610 + }, + { + "epoch": 0.7277227722772277, + "grad_norm": 0.4659234583377838, + "learning_rate": 1.819031903190319e-05, + "loss": 0.1418, + "num_input_tokens_seen": 1395392, + "step": 6615 + }, + { + "epoch": 0.7282728272827282, + "grad_norm": 0.6915057897567749, + "learning_rate": 1.8204070407040706e-05, + "loss": 0.14, + "num_input_tokens_seen": 1396480, + "step": 6620 + }, + { + "epoch": 0.7288228822882288, + "grad_norm": 0.5049657821655273, + "learning_rate": 1.8217821782178217e-05, + "loss": 0.1661, + "num_input_tokens_seen": 1397536, + "step": 6625 + }, + { + "epoch": 0.7293729372937293, + "grad_norm": 0.3642219603061676, + "learning_rate": 1.8231573157315732e-05, + "loss": 0.1748, + "num_input_tokens_seen": 1398624, + "step": 6630 + }, + { + "epoch": 0.72992299229923, + "grad_norm": 0.3585623800754547, + "learning_rate": 1.8245324532453244e-05, + "loss": 0.1567, + "num_input_tokens_seen": 1399712, + "step": 6635 + }, + { + "epoch": 0.7304730473047305, + "grad_norm": 0.3706042766571045, + "learning_rate": 1.825907590759076e-05, + "loss": 0.252, + "num_input_tokens_seen": 1400768, + "step": 6640 + }, + { + "epoch": 0.731023102310231, + "grad_norm": 0.6520324945449829, + "learning_rate": 1.8272827282728274e-05, + "loss": 0.1744, + "num_input_tokens_seen": 1401824, + "step": 6645 + }, + { + "epoch": 0.7315731573157316, + "grad_norm": 0.5542408227920532, + "learning_rate": 1.828657865786579e-05, + "loss": 0.1749, + "num_input_tokens_seen": 1402912, + "step": 6650 + }, + { + "epoch": 0.7321232123212321, + "grad_norm": 0.6187161803245544, + "learning_rate": 1.83003300330033e-05, + "loss": 0.171, + "num_input_tokens_seen": 1403904, + "step": 6655 + }, + { + "epoch": 0.7326732673267327, + "grad_norm": 0.38160228729248047, + "learning_rate": 1.8314081408140816e-05, + "loss": 0.1637, + "num_input_tokens_seen": 1404992, + "step": 6660 + }, + { + "epoch": 0.7332233223322332, + "grad_norm": 0.42242738604545593, + "learning_rate": 1.8327832783278328e-05, + "loss": 0.1631, + "num_input_tokens_seen": 1406080, + "step": 6665 + }, + { + "epoch": 0.7337733773377337, + "grad_norm": 0.48295044898986816, + "learning_rate": 1.8341584158415843e-05, + "loss": 0.1751, + "num_input_tokens_seen": 1407136, + "step": 6670 + }, + { + "epoch": 0.7343234323432343, + "grad_norm": 0.5405387282371521, + "learning_rate": 1.8355335533553354e-05, + "loss": 0.128, + "num_input_tokens_seen": 1408192, + "step": 6675 + }, + { + "epoch": 0.7348734873487349, + "grad_norm": 0.6721389889717102, + "learning_rate": 1.836908690869087e-05, + "loss": 0.1719, + "num_input_tokens_seen": 1409184, + "step": 6680 + }, + { + "epoch": 0.7354235423542355, + "grad_norm": 0.5075832009315491, + "learning_rate": 1.838283828382838e-05, + "loss": 0.2397, + "num_input_tokens_seen": 1410240, + "step": 6685 + }, + { + "epoch": 0.735973597359736, + "grad_norm": 1.2460943460464478, + "learning_rate": 1.83965896589659e-05, + "loss": 0.1982, + "num_input_tokens_seen": 1411296, + "step": 6690 + }, + { + "epoch": 0.7365236523652365, + "grad_norm": 0.9700512290000916, + "learning_rate": 1.841034103410341e-05, + "loss": 0.2224, + "num_input_tokens_seen": 1412320, + "step": 6695 + }, + { + "epoch": 0.7370737073707371, + "grad_norm": 0.48601946234703064, + "learning_rate": 1.8424092409240926e-05, + "loss": 0.1438, + "num_input_tokens_seen": 1413376, + "step": 6700 + }, + { + "epoch": 0.7376237623762376, + "grad_norm": 0.5709961652755737, + "learning_rate": 1.8437843784378438e-05, + "loss": 0.1887, + "num_input_tokens_seen": 1414368, + "step": 6705 + }, + { + "epoch": 0.7381738173817382, + "grad_norm": 1.500440001487732, + "learning_rate": 1.8451595159515953e-05, + "loss": 0.1914, + "num_input_tokens_seen": 1415456, + "step": 6710 + }, + { + "epoch": 0.7387238723872387, + "grad_norm": 0.5558356046676636, + "learning_rate": 1.8465346534653464e-05, + "loss": 0.1972, + "num_input_tokens_seen": 1416576, + "step": 6715 + }, + { + "epoch": 0.7392739273927392, + "grad_norm": 0.46146342158317566, + "learning_rate": 1.847909790979098e-05, + "loss": 0.1528, + "num_input_tokens_seen": 1417632, + "step": 6720 + }, + { + "epoch": 0.7398239823982399, + "grad_norm": 0.5193172097206116, + "learning_rate": 1.8492849284928494e-05, + "loss": 0.2081, + "num_input_tokens_seen": 1418592, + "step": 6725 + }, + { + "epoch": 0.7403740374037404, + "grad_norm": 0.4516057074069977, + "learning_rate": 1.8506600660066006e-05, + "loss": 0.2025, + "num_input_tokens_seen": 1419712, + "step": 6730 + }, + { + "epoch": 0.740924092409241, + "grad_norm": 0.7183994054794312, + "learning_rate": 1.852035203520352e-05, + "loss": 0.2201, + "num_input_tokens_seen": 1420832, + "step": 6735 + }, + { + "epoch": 0.7414741474147415, + "grad_norm": 0.6189200282096863, + "learning_rate": 1.8534103410341036e-05, + "loss": 0.1469, + "num_input_tokens_seen": 1421920, + "step": 6740 + }, + { + "epoch": 0.742024202420242, + "grad_norm": 0.3307740390300751, + "learning_rate": 1.8547854785478548e-05, + "loss": 0.1674, + "num_input_tokens_seen": 1423008, + "step": 6745 + }, + { + "epoch": 0.7425742574257426, + "grad_norm": 0.3174445927143097, + "learning_rate": 1.8561606160616063e-05, + "loss": 0.1979, + "num_input_tokens_seen": 1424000, + "step": 6750 + }, + { + "epoch": 0.7431243124312431, + "grad_norm": 0.5564167499542236, + "learning_rate": 1.8575357535753578e-05, + "loss": 0.1227, + "num_input_tokens_seen": 1424992, + "step": 6755 + }, + { + "epoch": 0.7436743674367436, + "grad_norm": 0.6026650071144104, + "learning_rate": 1.858910891089109e-05, + "loss": 0.2074, + "num_input_tokens_seen": 1426016, + "step": 6760 + }, + { + "epoch": 0.7442244224422442, + "grad_norm": 0.45456141233444214, + "learning_rate": 1.8602860286028605e-05, + "loss": 0.1392, + "num_input_tokens_seen": 1427008, + "step": 6765 + }, + { + "epoch": 0.7447744774477447, + "grad_norm": 0.4078741669654846, + "learning_rate": 1.8616611661166116e-05, + "loss": 0.1553, + "num_input_tokens_seen": 1428096, + "step": 6770 + }, + { + "epoch": 0.7453245324532454, + "grad_norm": 0.7428945302963257, + "learning_rate": 1.863036303630363e-05, + "loss": 0.2236, + "num_input_tokens_seen": 1429216, + "step": 6775 + }, + { + "epoch": 0.7458745874587459, + "grad_norm": 1.3815703392028809, + "learning_rate": 1.8644114411441143e-05, + "loss": 0.1755, + "num_input_tokens_seen": 1430272, + "step": 6780 + }, + { + "epoch": 0.7464246424642464, + "grad_norm": 0.7954837679862976, + "learning_rate": 1.865786578657866e-05, + "loss": 0.1944, + "num_input_tokens_seen": 1431296, + "step": 6785 + }, + { + "epoch": 0.746974697469747, + "grad_norm": 0.3801926374435425, + "learning_rate": 1.8671617161716173e-05, + "loss": 0.2115, + "num_input_tokens_seen": 1432384, + "step": 6790 + }, + { + "epoch": 0.7475247524752475, + "grad_norm": 0.3821070194244385, + "learning_rate": 1.8685368536853688e-05, + "loss": 0.1885, + "num_input_tokens_seen": 1433440, + "step": 6795 + }, + { + "epoch": 0.7480748074807481, + "grad_norm": 0.6827332377433777, + "learning_rate": 1.86991199119912e-05, + "loss": 0.1841, + "num_input_tokens_seen": 1434432, + "step": 6800 + }, + { + "epoch": 0.7486248624862486, + "grad_norm": 0.4055909216403961, + "learning_rate": 1.8712871287128715e-05, + "loss": 0.1399, + "num_input_tokens_seen": 1435488, + "step": 6805 + }, + { + "epoch": 0.7491749174917491, + "grad_norm": 0.5762221813201904, + "learning_rate": 1.8726622662266227e-05, + "loss": 0.2109, + "num_input_tokens_seen": 1436512, + "step": 6810 + }, + { + "epoch": 0.7497249724972497, + "grad_norm": 0.7207011580467224, + "learning_rate": 1.874037403740374e-05, + "loss": 0.1586, + "num_input_tokens_seen": 1437504, + "step": 6815 + }, + { + "epoch": 0.7502750275027503, + "grad_norm": 0.596562922000885, + "learning_rate": 1.8754125412541253e-05, + "loss": 0.122, + "num_input_tokens_seen": 1438592, + "step": 6820 + }, + { + "epoch": 0.7508250825082509, + "grad_norm": 0.606745719909668, + "learning_rate": 1.8767876787678768e-05, + "loss": 0.155, + "num_input_tokens_seen": 1439648, + "step": 6825 + }, + { + "epoch": 0.7513751375137514, + "grad_norm": 0.4731099605560303, + "learning_rate": 1.8781628162816283e-05, + "loss": 0.1955, + "num_input_tokens_seen": 1440640, + "step": 6830 + }, + { + "epoch": 0.7519251925192519, + "grad_norm": 0.37689200043678284, + "learning_rate": 1.87953795379538e-05, + "loss": 0.1462, + "num_input_tokens_seen": 1441696, + "step": 6835 + }, + { + "epoch": 0.7524752475247525, + "grad_norm": 1.0005466938018799, + "learning_rate": 1.880913091309131e-05, + "loss": 0.2292, + "num_input_tokens_seen": 1442720, + "step": 6840 + }, + { + "epoch": 0.753025302530253, + "grad_norm": 0.42715218663215637, + "learning_rate": 1.8822882288228825e-05, + "loss": 0.2, + "num_input_tokens_seen": 1443744, + "step": 6845 + }, + { + "epoch": 0.7535753575357536, + "grad_norm": 0.35513901710510254, + "learning_rate": 1.8836633663366337e-05, + "loss": 0.1511, + "num_input_tokens_seen": 1444928, + "step": 6850 + }, + { + "epoch": 0.7541254125412541, + "grad_norm": 0.9680611491203308, + "learning_rate": 1.8850385038503852e-05, + "loss": 0.1977, + "num_input_tokens_seen": 1446048, + "step": 6855 + }, + { + "epoch": 0.7546754675467546, + "grad_norm": 0.7398693561553955, + "learning_rate": 1.8864136413641363e-05, + "loss": 0.1274, + "num_input_tokens_seen": 1447040, + "step": 6860 + }, + { + "epoch": 0.7552255225522553, + "grad_norm": 1.0083763599395752, + "learning_rate": 1.887788778877888e-05, + "loss": 0.2516, + "num_input_tokens_seen": 1448096, + "step": 6865 + }, + { + "epoch": 0.7557755775577558, + "grad_norm": 0.44801878929138184, + "learning_rate": 1.889163916391639e-05, + "loss": 0.2188, + "num_input_tokens_seen": 1449184, + "step": 6870 + }, + { + "epoch": 0.7563256325632564, + "grad_norm": 0.8538862466812134, + "learning_rate": 1.890539053905391e-05, + "loss": 0.1195, + "num_input_tokens_seen": 1450240, + "step": 6875 + }, + { + "epoch": 0.7568756875687569, + "grad_norm": 0.48378780484199524, + "learning_rate": 1.891914191419142e-05, + "loss": 0.159, + "num_input_tokens_seen": 1451296, + "step": 6880 + }, + { + "epoch": 0.7574257425742574, + "grad_norm": 0.45517978072166443, + "learning_rate": 1.8932893289328935e-05, + "loss": 0.2173, + "num_input_tokens_seen": 1452352, + "step": 6885 + }, + { + "epoch": 0.757975797579758, + "grad_norm": 1.1415170431137085, + "learning_rate": 1.8946644664466447e-05, + "loss": 0.1984, + "num_input_tokens_seen": 1453408, + "step": 6890 + }, + { + "epoch": 0.7585258525852585, + "grad_norm": 0.38026249408721924, + "learning_rate": 1.8960396039603962e-05, + "loss": 0.1539, + "num_input_tokens_seen": 1454592, + "step": 6895 + }, + { + "epoch": 0.759075907590759, + "grad_norm": 0.9061934947967529, + "learning_rate": 1.8974147414741474e-05, + "loss": 0.2109, + "num_input_tokens_seen": 1455584, + "step": 6900 + }, + { + "epoch": 0.7596259625962596, + "grad_norm": 0.5468794107437134, + "learning_rate": 1.898789878987899e-05, + "loss": 0.1882, + "num_input_tokens_seen": 1456608, + "step": 6905 + }, + { + "epoch": 0.7601760176017601, + "grad_norm": 0.5466762185096741, + "learning_rate": 1.90016501650165e-05, + "loss": 0.1257, + "num_input_tokens_seen": 1457760, + "step": 6910 + }, + { + "epoch": 0.7607260726072608, + "grad_norm": 0.6937514543533325, + "learning_rate": 1.9015401540154015e-05, + "loss": 0.1835, + "num_input_tokens_seen": 1458752, + "step": 6915 + }, + { + "epoch": 0.7612761276127613, + "grad_norm": 1.189805507659912, + "learning_rate": 1.902915291529153e-05, + "loss": 0.1837, + "num_input_tokens_seen": 1459808, + "step": 6920 + }, + { + "epoch": 0.7618261826182618, + "grad_norm": 0.33157122135162354, + "learning_rate": 1.9042904290429045e-05, + "loss": 0.228, + "num_input_tokens_seen": 1460832, + "step": 6925 + }, + { + "epoch": 0.7623762376237624, + "grad_norm": 1.219226360321045, + "learning_rate": 1.9056655665566557e-05, + "loss": 0.2429, + "num_input_tokens_seen": 1461888, + "step": 6930 + }, + { + "epoch": 0.7629262926292629, + "grad_norm": 0.9226028323173523, + "learning_rate": 1.9070407040704072e-05, + "loss": 0.1314, + "num_input_tokens_seen": 1462912, + "step": 6935 + }, + { + "epoch": 0.7634763476347635, + "grad_norm": 0.32822325825691223, + "learning_rate": 1.9084158415841584e-05, + "loss": 0.1138, + "num_input_tokens_seen": 1463936, + "step": 6940 + }, + { + "epoch": 0.764026402640264, + "grad_norm": 0.3600684702396393, + "learning_rate": 1.90979097909791e-05, + "loss": 0.1679, + "num_input_tokens_seen": 1464992, + "step": 6945 + }, + { + "epoch": 0.7645764576457645, + "grad_norm": 0.8641826510429382, + "learning_rate": 1.9111661166116614e-05, + "loss": 0.1818, + "num_input_tokens_seen": 1466048, + "step": 6950 + }, + { + "epoch": 0.7651265126512651, + "grad_norm": 0.732683539390564, + "learning_rate": 1.9125412541254126e-05, + "loss": 0.2296, + "num_input_tokens_seen": 1467072, + "step": 6955 + }, + { + "epoch": 0.7656765676567657, + "grad_norm": 0.23683887720108032, + "learning_rate": 1.913916391639164e-05, + "loss": 0.1391, + "num_input_tokens_seen": 1468064, + "step": 6960 + }, + { + "epoch": 0.7662266226622663, + "grad_norm": 0.6394700407981873, + "learning_rate": 1.9152915291529152e-05, + "loss": 0.221, + "num_input_tokens_seen": 1469152, + "step": 6965 + }, + { + "epoch": 0.7667766776677668, + "grad_norm": 0.3249928057193756, + "learning_rate": 1.9166666666666667e-05, + "loss": 0.1995, + "num_input_tokens_seen": 1470208, + "step": 6970 + }, + { + "epoch": 0.7673267326732673, + "grad_norm": 0.9135459661483765, + "learning_rate": 1.9180418041804182e-05, + "loss": 0.2002, + "num_input_tokens_seen": 1471296, + "step": 6975 + }, + { + "epoch": 0.7678767876787679, + "grad_norm": 0.6280204057693481, + "learning_rate": 1.9194169416941697e-05, + "loss": 0.1496, + "num_input_tokens_seen": 1472384, + "step": 6980 + }, + { + "epoch": 0.7684268426842684, + "grad_norm": 0.9932824373245239, + "learning_rate": 1.920792079207921e-05, + "loss": 0.1795, + "num_input_tokens_seen": 1473440, + "step": 6985 + }, + { + "epoch": 0.768976897689769, + "grad_norm": 0.7704111337661743, + "learning_rate": 1.9221672167216724e-05, + "loss": 0.1857, + "num_input_tokens_seen": 1474464, + "step": 6990 + }, + { + "epoch": 0.7695269526952695, + "grad_norm": 0.37338727712631226, + "learning_rate": 1.9235423542354236e-05, + "loss": 0.1856, + "num_input_tokens_seen": 1475520, + "step": 6995 + }, + { + "epoch": 0.77007700770077, + "grad_norm": 0.3196263909339905, + "learning_rate": 1.924917491749175e-05, + "loss": 0.1862, + "num_input_tokens_seen": 1476544, + "step": 7000 + }, + { + "epoch": 0.7706270627062707, + "grad_norm": 0.5681230425834656, + "learning_rate": 1.9262926292629262e-05, + "loss": 0.1437, + "num_input_tokens_seen": 1477600, + "step": 7005 + }, + { + "epoch": 0.7711771177117712, + "grad_norm": 1.0499106645584106, + "learning_rate": 1.9276677667766777e-05, + "loss": 0.2175, + "num_input_tokens_seen": 1478624, + "step": 7010 + }, + { + "epoch": 0.7717271727172718, + "grad_norm": 0.6696943044662476, + "learning_rate": 1.929042904290429e-05, + "loss": 0.2498, + "num_input_tokens_seen": 1479712, + "step": 7015 + }, + { + "epoch": 0.7722772277227723, + "grad_norm": 0.5925551056861877, + "learning_rate": 1.9304180418041808e-05, + "loss": 0.2222, + "num_input_tokens_seen": 1480768, + "step": 7020 + }, + { + "epoch": 0.7728272827282728, + "grad_norm": 0.5640553832054138, + "learning_rate": 1.931793179317932e-05, + "loss": 0.1392, + "num_input_tokens_seen": 1481856, + "step": 7025 + }, + { + "epoch": 0.7733773377337734, + "grad_norm": 0.36005088686943054, + "learning_rate": 1.9331683168316834e-05, + "loss": 0.1931, + "num_input_tokens_seen": 1482912, + "step": 7030 + }, + { + "epoch": 0.7739273927392739, + "grad_norm": 0.3241138458251953, + "learning_rate": 1.9345434543454346e-05, + "loss": 0.2108, + "num_input_tokens_seen": 1483936, + "step": 7035 + }, + { + "epoch": 0.7744774477447744, + "grad_norm": 0.5090518593788147, + "learning_rate": 1.935918591859186e-05, + "loss": 0.133, + "num_input_tokens_seen": 1484960, + "step": 7040 + }, + { + "epoch": 0.775027502750275, + "grad_norm": 0.6009769439697266, + "learning_rate": 1.9372937293729373e-05, + "loss": 0.1363, + "num_input_tokens_seen": 1485984, + "step": 7045 + }, + { + "epoch": 0.7755775577557755, + "grad_norm": 0.3015446960926056, + "learning_rate": 1.9386688668866888e-05, + "loss": 0.1675, + "num_input_tokens_seen": 1487040, + "step": 7050 + }, + { + "epoch": 0.7761276127612762, + "grad_norm": 0.5407645106315613, + "learning_rate": 1.94004400440044e-05, + "loss": 0.1405, + "num_input_tokens_seen": 1488096, + "step": 7055 + }, + { + "epoch": 0.7766776677667767, + "grad_norm": 0.5575186610221863, + "learning_rate": 1.9414191419141914e-05, + "loss": 0.1624, + "num_input_tokens_seen": 1489120, + "step": 7060 + }, + { + "epoch": 0.7772277227722773, + "grad_norm": 0.526098370552063, + "learning_rate": 1.942794279427943e-05, + "loss": 0.205, + "num_input_tokens_seen": 1490208, + "step": 7065 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.39987286925315857, + "learning_rate": 1.9441694169416944e-05, + "loss": 0.1933, + "num_input_tokens_seen": 1491232, + "step": 7070 + }, + { + "epoch": 0.7783278327832783, + "grad_norm": 0.808918297290802, + "learning_rate": 1.9455445544554456e-05, + "loss": 0.2592, + "num_input_tokens_seen": 1492320, + "step": 7075 + }, + { + "epoch": 0.7788778877887789, + "grad_norm": 0.3786839246749878, + "learning_rate": 1.946919691969197e-05, + "loss": 0.1233, + "num_input_tokens_seen": 1493344, + "step": 7080 + }, + { + "epoch": 0.7794279427942794, + "grad_norm": 0.6356212496757507, + "learning_rate": 1.9482948294829483e-05, + "loss": 0.1885, + "num_input_tokens_seen": 1494400, + "step": 7085 + }, + { + "epoch": 0.7799779977997799, + "grad_norm": 0.7318539619445801, + "learning_rate": 1.9496699669966998e-05, + "loss": 0.1865, + "num_input_tokens_seen": 1495456, + "step": 7090 + }, + { + "epoch": 0.7805280528052805, + "grad_norm": 0.9097305536270142, + "learning_rate": 1.951045104510451e-05, + "loss": 0.2665, + "num_input_tokens_seen": 1496480, + "step": 7095 + }, + { + "epoch": 0.7810781078107811, + "grad_norm": 0.8866129517555237, + "learning_rate": 1.9524202420242024e-05, + "loss": 0.2585, + "num_input_tokens_seen": 1497504, + "step": 7100 + }, + { + "epoch": 0.7816281628162817, + "grad_norm": 0.5902475118637085, + "learning_rate": 1.9537953795379536e-05, + "loss": 0.1773, + "num_input_tokens_seen": 1498560, + "step": 7105 + }, + { + "epoch": 0.7821782178217822, + "grad_norm": 0.3781207799911499, + "learning_rate": 1.9551705170517055e-05, + "loss": 0.1931, + "num_input_tokens_seen": 1499552, + "step": 7110 + }, + { + "epoch": 0.7827282728272827, + "grad_norm": 0.6802540421485901, + "learning_rate": 1.9565456545654566e-05, + "loss": 0.1463, + "num_input_tokens_seen": 1500576, + "step": 7115 + }, + { + "epoch": 0.7832783278327833, + "grad_norm": 0.808175802230835, + "learning_rate": 1.957920792079208e-05, + "loss": 0.1734, + "num_input_tokens_seen": 1501632, + "step": 7120 + }, + { + "epoch": 0.7838283828382838, + "grad_norm": 0.3855369985103607, + "learning_rate": 1.9592959295929593e-05, + "loss": 0.125, + "num_input_tokens_seen": 1502656, + "step": 7125 + }, + { + "epoch": 0.7843784378437844, + "grad_norm": 0.6870132088661194, + "learning_rate": 1.9606710671067108e-05, + "loss": 0.141, + "num_input_tokens_seen": 1503712, + "step": 7130 + }, + { + "epoch": 0.7849284928492849, + "grad_norm": 0.5779569745063782, + "learning_rate": 1.962046204620462e-05, + "loss": 0.1912, + "num_input_tokens_seen": 1504832, + "step": 7135 + }, + { + "epoch": 0.7854785478547854, + "grad_norm": 0.7743637561798096, + "learning_rate": 1.9634213421342135e-05, + "loss": 0.1448, + "num_input_tokens_seen": 1505856, + "step": 7140 + }, + { + "epoch": 0.786028602860286, + "grad_norm": 0.3751097619533539, + "learning_rate": 1.964796479647965e-05, + "loss": 0.1242, + "num_input_tokens_seen": 1506912, + "step": 7145 + }, + { + "epoch": 0.7865786578657866, + "grad_norm": 0.2875792682170868, + "learning_rate": 1.966171617161716e-05, + "loss": 0.1879, + "num_input_tokens_seen": 1508032, + "step": 7150 + }, + { + "epoch": 0.7871287128712872, + "grad_norm": 0.7209843397140503, + "learning_rate": 1.9675467546754676e-05, + "loss": 0.1533, + "num_input_tokens_seen": 1509056, + "step": 7155 + }, + { + "epoch": 0.7876787678767877, + "grad_norm": 0.8714553117752075, + "learning_rate": 1.968921892189219e-05, + "loss": 0.1459, + "num_input_tokens_seen": 1510112, + "step": 7160 + }, + { + "epoch": 0.7882288228822882, + "grad_norm": 0.47635844349861145, + "learning_rate": 1.9702970297029703e-05, + "loss": 0.1734, + "num_input_tokens_seen": 1511168, + "step": 7165 + }, + { + "epoch": 0.7887788778877888, + "grad_norm": 0.4225088655948639, + "learning_rate": 1.9716721672167218e-05, + "loss": 0.1962, + "num_input_tokens_seen": 1512256, + "step": 7170 + }, + { + "epoch": 0.7893289328932893, + "grad_norm": 0.6259415149688721, + "learning_rate": 1.9730473047304733e-05, + "loss": 0.1992, + "num_input_tokens_seen": 1513344, + "step": 7175 + }, + { + "epoch": 0.7898789878987899, + "grad_norm": 1.1400929689407349, + "learning_rate": 1.9744224422442245e-05, + "loss": 0.2343, + "num_input_tokens_seen": 1514400, + "step": 7180 + }, + { + "epoch": 0.7904290429042904, + "grad_norm": 0.9333766102790833, + "learning_rate": 1.975797579757976e-05, + "loss": 0.2825, + "num_input_tokens_seen": 1515424, + "step": 7185 + }, + { + "epoch": 0.7909790979097909, + "grad_norm": 0.9795905947685242, + "learning_rate": 1.977172717271727e-05, + "loss": 0.1814, + "num_input_tokens_seen": 1516448, + "step": 7190 + }, + { + "epoch": 0.7915291529152916, + "grad_norm": 0.4882076382637024, + "learning_rate": 1.9785478547854787e-05, + "loss": 0.2238, + "num_input_tokens_seen": 1517472, + "step": 7195 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.8273311257362366, + "learning_rate": 1.9799229922992298e-05, + "loss": 0.147, + "num_input_tokens_seen": 1518496, + "step": 7200 + }, + { + "epoch": 0.7926292629262927, + "grad_norm": 1.4818189144134521, + "learning_rate": 1.9812981298129817e-05, + "loss": 0.2012, + "num_input_tokens_seen": 1519520, + "step": 7205 + }, + { + "epoch": 0.7931793179317932, + "grad_norm": 0.6267262697219849, + "learning_rate": 1.982673267326733e-05, + "loss": 0.1916, + "num_input_tokens_seen": 1520608, + "step": 7210 + }, + { + "epoch": 0.7937293729372937, + "grad_norm": 0.5761584043502808, + "learning_rate": 1.9840484048404843e-05, + "loss": 0.1898, + "num_input_tokens_seen": 1521664, + "step": 7215 + }, + { + "epoch": 0.7942794279427943, + "grad_norm": 0.5359700322151184, + "learning_rate": 1.9854235423542355e-05, + "loss": 0.1442, + "num_input_tokens_seen": 1522688, + "step": 7220 + }, + { + "epoch": 0.7948294829482948, + "grad_norm": 0.3385660648345947, + "learning_rate": 1.986798679867987e-05, + "loss": 0.1792, + "num_input_tokens_seen": 1523776, + "step": 7225 + }, + { + "epoch": 0.7953795379537953, + "grad_norm": 0.8057303428649902, + "learning_rate": 1.9881738173817382e-05, + "loss": 0.1811, + "num_input_tokens_seen": 1524864, + "step": 7230 + }, + { + "epoch": 0.7959295929592959, + "grad_norm": 0.5438199043273926, + "learning_rate": 1.9895489548954897e-05, + "loss": 0.2024, + "num_input_tokens_seen": 1525888, + "step": 7235 + }, + { + "epoch": 0.7964796479647965, + "grad_norm": 0.5235419869422913, + "learning_rate": 1.990924092409241e-05, + "loss": 0.1365, + "num_input_tokens_seen": 1526976, + "step": 7240 + }, + { + "epoch": 0.7970297029702971, + "grad_norm": 0.31304702162742615, + "learning_rate": 1.9922992299229923e-05, + "loss": 0.1461, + "num_input_tokens_seen": 1527968, + "step": 7245 + }, + { + "epoch": 0.7975797579757976, + "grad_norm": 0.34412047266960144, + "learning_rate": 1.993674367436744e-05, + "loss": 0.1559, + "num_input_tokens_seen": 1528992, + "step": 7250 + }, + { + "epoch": 0.7981298129812981, + "grad_norm": 0.396169513463974, + "learning_rate": 1.9950495049504954e-05, + "loss": 0.138, + "num_input_tokens_seen": 1530016, + "step": 7255 + }, + { + "epoch": 0.7986798679867987, + "grad_norm": 0.5211819410324097, + "learning_rate": 1.9964246424642465e-05, + "loss": 0.1559, + "num_input_tokens_seen": 1531104, + "step": 7260 + }, + { + "epoch": 0.7992299229922992, + "grad_norm": 0.5496214628219604, + "learning_rate": 1.997799779977998e-05, + "loss": 0.1827, + "num_input_tokens_seen": 1532192, + "step": 7265 + }, + { + "epoch": 0.7997799779977998, + "grad_norm": 0.801777720451355, + "learning_rate": 1.9991749174917492e-05, + "loss": 0.2096, + "num_input_tokens_seen": 1533216, + "step": 7270 + }, + { + "epoch": 0.8003300330033003, + "grad_norm": 0.28689661622047424, + "learning_rate": 2.0005500550055007e-05, + "loss": 0.1143, + "num_input_tokens_seen": 1534240, + "step": 7275 + }, + { + "epoch": 0.8008800880088008, + "grad_norm": 1.7375954389572144, + "learning_rate": 2.001925192519252e-05, + "loss": 0.1749, + "num_input_tokens_seen": 1535296, + "step": 7280 + }, + { + "epoch": 0.8014301430143014, + "grad_norm": 1.6453691720962524, + "learning_rate": 2.0033003300330034e-05, + "loss": 0.1901, + "num_input_tokens_seen": 1536416, + "step": 7285 + }, + { + "epoch": 0.801980198019802, + "grad_norm": 0.6815221905708313, + "learning_rate": 2.0046754675467545e-05, + "loss": 0.2037, + "num_input_tokens_seen": 1537472, + "step": 7290 + }, + { + "epoch": 0.8025302530253026, + "grad_norm": 0.874712347984314, + "learning_rate": 2.006050605060506e-05, + "loss": 0.1503, + "num_input_tokens_seen": 1538464, + "step": 7295 + }, + { + "epoch": 0.8030803080308031, + "grad_norm": 0.5416201949119568, + "learning_rate": 2.0074257425742575e-05, + "loss": 0.1471, + "num_input_tokens_seen": 1539488, + "step": 7300 + }, + { + "epoch": 0.8036303630363036, + "grad_norm": 0.5010395646095276, + "learning_rate": 2.008800880088009e-05, + "loss": 0.149, + "num_input_tokens_seen": 1540576, + "step": 7305 + }, + { + "epoch": 0.8041804180418042, + "grad_norm": 1.070424199104309, + "learning_rate": 2.0101760176017602e-05, + "loss": 0.1716, + "num_input_tokens_seen": 1541632, + "step": 7310 + }, + { + "epoch": 0.8047304730473047, + "grad_norm": 0.8869340419769287, + "learning_rate": 2.0115511551155117e-05, + "loss": 0.1925, + "num_input_tokens_seen": 1542656, + "step": 7315 + }, + { + "epoch": 0.8052805280528053, + "grad_norm": 0.7285905480384827, + "learning_rate": 2.012926292629263e-05, + "loss": 0.2069, + "num_input_tokens_seen": 1543744, + "step": 7320 + }, + { + "epoch": 0.8058305830583058, + "grad_norm": 0.5402310490608215, + "learning_rate": 2.0143014301430144e-05, + "loss": 0.1135, + "num_input_tokens_seen": 1544896, + "step": 7325 + }, + { + "epoch": 0.8063806380638063, + "grad_norm": 0.8562530875205994, + "learning_rate": 2.0156765676567656e-05, + "loss": 0.1567, + "num_input_tokens_seen": 1545920, + "step": 7330 + }, + { + "epoch": 0.806930693069307, + "grad_norm": 0.7949786186218262, + "learning_rate": 2.017051705170517e-05, + "loss": 0.1673, + "num_input_tokens_seen": 1547040, + "step": 7335 + }, + { + "epoch": 0.8074807480748075, + "grad_norm": 1.373894453048706, + "learning_rate": 2.0184268426842686e-05, + "loss": 0.1745, + "num_input_tokens_seen": 1548128, + "step": 7340 + }, + { + "epoch": 0.8080308030803081, + "grad_norm": 1.0323963165283203, + "learning_rate": 2.01980198019802e-05, + "loss": 0.2022, + "num_input_tokens_seen": 1549216, + "step": 7345 + }, + { + "epoch": 0.8085808580858086, + "grad_norm": 0.6105633974075317, + "learning_rate": 2.0211771177117712e-05, + "loss": 0.1586, + "num_input_tokens_seen": 1550240, + "step": 7350 + }, + { + "epoch": 0.8091309130913091, + "grad_norm": 0.6105588674545288, + "learning_rate": 2.0225522552255227e-05, + "loss": 0.184, + "num_input_tokens_seen": 1551264, + "step": 7355 + }, + { + "epoch": 0.8096809680968097, + "grad_norm": 0.7469667196273804, + "learning_rate": 2.023927392739274e-05, + "loss": 0.1682, + "num_input_tokens_seen": 1552288, + "step": 7360 + }, + { + "epoch": 0.8102310231023102, + "grad_norm": 0.49185261130332947, + "learning_rate": 2.0253025302530254e-05, + "loss": 0.2273, + "num_input_tokens_seen": 1553344, + "step": 7365 + }, + { + "epoch": 0.8107810781078107, + "grad_norm": 1.382428526878357, + "learning_rate": 2.026677667766777e-05, + "loss": 0.1351, + "num_input_tokens_seen": 1554432, + "step": 7370 + }, + { + "epoch": 0.8113311331133113, + "grad_norm": 0.40807926654815674, + "learning_rate": 2.028052805280528e-05, + "loss": 0.1702, + "num_input_tokens_seen": 1555456, + "step": 7375 + }, + { + "epoch": 0.8118811881188119, + "grad_norm": 0.5690475702285767, + "learning_rate": 2.0294279427942796e-05, + "loss": 0.2731, + "num_input_tokens_seen": 1556544, + "step": 7380 + }, + { + "epoch": 0.8124312431243125, + "grad_norm": 0.48537588119506836, + "learning_rate": 2.0308030803080307e-05, + "loss": 0.1443, + "num_input_tokens_seen": 1557600, + "step": 7385 + }, + { + "epoch": 0.812981298129813, + "grad_norm": 0.4835354685783386, + "learning_rate": 2.0321782178217822e-05, + "loss": 0.207, + "num_input_tokens_seen": 1558624, + "step": 7390 + }, + { + "epoch": 0.8135313531353136, + "grad_norm": 1.3387978076934814, + "learning_rate": 2.0335533553355338e-05, + "loss": 0.2583, + "num_input_tokens_seen": 1559680, + "step": 7395 + }, + { + "epoch": 0.8140814081408141, + "grad_norm": 0.548322319984436, + "learning_rate": 2.0349284928492853e-05, + "loss": 0.1883, + "num_input_tokens_seen": 1560768, + "step": 7400 + }, + { + "epoch": 0.8146314631463146, + "grad_norm": 0.5908242464065552, + "learning_rate": 2.0363036303630364e-05, + "loss": 0.1129, + "num_input_tokens_seen": 1561824, + "step": 7405 + }, + { + "epoch": 0.8151815181518152, + "grad_norm": 0.7345666885375977, + "learning_rate": 2.037678767876788e-05, + "loss": 0.1688, + "num_input_tokens_seen": 1562880, + "step": 7410 + }, + { + "epoch": 0.8157315731573157, + "grad_norm": 0.8650287985801697, + "learning_rate": 2.039053905390539e-05, + "loss": 0.2646, + "num_input_tokens_seen": 1563936, + "step": 7415 + }, + { + "epoch": 0.8162816281628162, + "grad_norm": 0.5692025423049927, + "learning_rate": 2.0404290429042906e-05, + "loss": 0.131, + "num_input_tokens_seen": 1564960, + "step": 7420 + }, + { + "epoch": 0.8168316831683168, + "grad_norm": 0.6342630386352539, + "learning_rate": 2.0418041804180418e-05, + "loss": 0.2209, + "num_input_tokens_seen": 1565984, + "step": 7425 + }, + { + "epoch": 0.8173817381738174, + "grad_norm": 0.3181948959827423, + "learning_rate": 2.0431793179317933e-05, + "loss": 0.1337, + "num_input_tokens_seen": 1567008, + "step": 7430 + }, + { + "epoch": 0.817931793179318, + "grad_norm": 0.4687976539134979, + "learning_rate": 2.0445544554455444e-05, + "loss": 0.1909, + "num_input_tokens_seen": 1568064, + "step": 7435 + }, + { + "epoch": 0.8184818481848185, + "grad_norm": 0.46075010299682617, + "learning_rate": 2.0459295929592963e-05, + "loss": 0.1756, + "num_input_tokens_seen": 1569184, + "step": 7440 + }, + { + "epoch": 0.819031903190319, + "grad_norm": 0.39294153451919556, + "learning_rate": 2.0473047304730474e-05, + "loss": 0.1721, + "num_input_tokens_seen": 1570336, + "step": 7445 + }, + { + "epoch": 0.8195819581958196, + "grad_norm": 0.713615894317627, + "learning_rate": 2.048679867986799e-05, + "loss": 0.2039, + "num_input_tokens_seen": 1571328, + "step": 7450 + }, + { + "epoch": 0.8201320132013201, + "grad_norm": 0.533332884311676, + "learning_rate": 2.05005500550055e-05, + "loss": 0.1913, + "num_input_tokens_seen": 1572448, + "step": 7455 + }, + { + "epoch": 0.8206820682068207, + "grad_norm": 0.8962217569351196, + "learning_rate": 2.0514301430143016e-05, + "loss": 0.1556, + "num_input_tokens_seen": 1573568, + "step": 7460 + }, + { + "epoch": 0.8212321232123212, + "grad_norm": 0.5504018068313599, + "learning_rate": 2.0528052805280528e-05, + "loss": 0.1512, + "num_input_tokens_seen": 1574592, + "step": 7465 + }, + { + "epoch": 0.8217821782178217, + "grad_norm": 0.2077416181564331, + "learning_rate": 2.0541804180418043e-05, + "loss": 0.1376, + "num_input_tokens_seen": 1575680, + "step": 7470 + }, + { + "epoch": 0.8223322332233224, + "grad_norm": 0.4456455111503601, + "learning_rate": 2.0555555555555555e-05, + "loss": 0.122, + "num_input_tokens_seen": 1576736, + "step": 7475 + }, + { + "epoch": 0.8228822882288229, + "grad_norm": 0.841300368309021, + "learning_rate": 2.056930693069307e-05, + "loss": 0.1956, + "num_input_tokens_seen": 1577760, + "step": 7480 + }, + { + "epoch": 0.8234323432343235, + "grad_norm": 1.0810790061950684, + "learning_rate": 2.0583058305830585e-05, + "loss": 0.1631, + "num_input_tokens_seen": 1578816, + "step": 7485 + }, + { + "epoch": 0.823982398239824, + "grad_norm": 0.4526212215423584, + "learning_rate": 2.05968096809681e-05, + "loss": 0.1791, + "num_input_tokens_seen": 1579904, + "step": 7490 + }, + { + "epoch": 0.8245324532453245, + "grad_norm": 0.5578178763389587, + "learning_rate": 2.061056105610561e-05, + "loss": 0.2069, + "num_input_tokens_seen": 1581024, + "step": 7495 + }, + { + "epoch": 0.8250825082508251, + "grad_norm": 0.33663442730903625, + "learning_rate": 2.0624312431243126e-05, + "loss": 0.1297, + "num_input_tokens_seen": 1582048, + "step": 7500 + }, + { + "epoch": 0.8256325632563256, + "grad_norm": 1.0556122064590454, + "learning_rate": 2.0638063806380638e-05, + "loss": 0.2384, + "num_input_tokens_seen": 1583136, + "step": 7505 + }, + { + "epoch": 0.8261826182618262, + "grad_norm": 0.6488553285598755, + "learning_rate": 2.0651815181518153e-05, + "loss": 0.1583, + "num_input_tokens_seen": 1584096, + "step": 7510 + }, + { + "epoch": 0.8267326732673267, + "grad_norm": 1.2965718507766724, + "learning_rate": 2.0665566556655665e-05, + "loss": 0.1699, + "num_input_tokens_seen": 1585120, + "step": 7515 + }, + { + "epoch": 0.8272827282728272, + "grad_norm": 1.0369453430175781, + "learning_rate": 2.067931793179318e-05, + "loss": 0.1424, + "num_input_tokens_seen": 1586144, + "step": 7520 + }, + { + "epoch": 0.8278327832783279, + "grad_norm": 0.964227557182312, + "learning_rate": 2.069306930693069e-05, + "loss": 0.1907, + "num_input_tokens_seen": 1587136, + "step": 7525 + }, + { + "epoch": 0.8283828382838284, + "grad_norm": 0.3781053423881531, + "learning_rate": 2.0706820682068206e-05, + "loss": 0.175, + "num_input_tokens_seen": 1588192, + "step": 7530 + }, + { + "epoch": 0.828932893289329, + "grad_norm": 0.36020755767822266, + "learning_rate": 2.072057205720572e-05, + "loss": 0.1614, + "num_input_tokens_seen": 1589216, + "step": 7535 + }, + { + "epoch": 0.8294829482948295, + "grad_norm": 0.7117501497268677, + "learning_rate": 2.0734323432343237e-05, + "loss": 0.1616, + "num_input_tokens_seen": 1590304, + "step": 7540 + }, + { + "epoch": 0.83003300330033, + "grad_norm": 0.4261341989040375, + "learning_rate": 2.0748074807480748e-05, + "loss": 0.1279, + "num_input_tokens_seen": 1591456, + "step": 7545 + }, + { + "epoch": 0.8305830583058306, + "grad_norm": 0.7554845809936523, + "learning_rate": 2.0761826182618263e-05, + "loss": 0.1636, + "num_input_tokens_seen": 1592576, + "step": 7550 + }, + { + "epoch": 0.8311331133113311, + "grad_norm": 1.5239973068237305, + "learning_rate": 2.0775577557755775e-05, + "loss": 0.1366, + "num_input_tokens_seen": 1593664, + "step": 7555 + }, + { + "epoch": 0.8316831683168316, + "grad_norm": 0.7057969570159912, + "learning_rate": 2.078932893289329e-05, + "loss": 0.2668, + "num_input_tokens_seen": 1594688, + "step": 7560 + }, + { + "epoch": 0.8322332233223322, + "grad_norm": 0.7099169492721558, + "learning_rate": 2.0803080308030805e-05, + "loss": 0.1989, + "num_input_tokens_seen": 1595712, + "step": 7565 + }, + { + "epoch": 0.8327832783278328, + "grad_norm": 0.42418187856674194, + "learning_rate": 2.0816831683168317e-05, + "loss": 0.0973, + "num_input_tokens_seen": 1596768, + "step": 7570 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.5151785612106323, + "learning_rate": 2.083058305830583e-05, + "loss": 0.163, + "num_input_tokens_seen": 1597856, + "step": 7575 + }, + { + "epoch": 0.8338833883388339, + "grad_norm": 0.5215595364570618, + "learning_rate": 2.0844334433443347e-05, + "loss": 0.1434, + "num_input_tokens_seen": 1598944, + "step": 7580 + }, + { + "epoch": 0.8344334433443344, + "grad_norm": 0.6808140873908997, + "learning_rate": 2.085808580858086e-05, + "loss": 0.1799, + "num_input_tokens_seen": 1599936, + "step": 7585 + }, + { + "epoch": 0.834983498349835, + "grad_norm": 0.5649706125259399, + "learning_rate": 2.0871837183718373e-05, + "loss": 0.1708, + "num_input_tokens_seen": 1600992, + "step": 7590 + }, + { + "epoch": 0.8355335533553355, + "grad_norm": 0.40821629762649536, + "learning_rate": 2.088558855885589e-05, + "loss": 0.1145, + "num_input_tokens_seen": 1602016, + "step": 7595 + }, + { + "epoch": 0.8360836083608361, + "grad_norm": 0.36829128861427307, + "learning_rate": 2.08993399339934e-05, + "loss": 0.1405, + "num_input_tokens_seen": 1603040, + "step": 7600 + }, + { + "epoch": 0.8366336633663366, + "grad_norm": 0.8464269042015076, + "learning_rate": 2.0913091309130915e-05, + "loss": 0.1676, + "num_input_tokens_seen": 1604064, + "step": 7605 + }, + { + "epoch": 0.8371837183718371, + "grad_norm": 1.4221216440200806, + "learning_rate": 2.0926842684268427e-05, + "loss": 0.2007, + "num_input_tokens_seen": 1605120, + "step": 7610 + }, + { + "epoch": 0.8377337733773378, + "grad_norm": 0.5909570455551147, + "learning_rate": 2.0940594059405942e-05, + "loss": 0.1521, + "num_input_tokens_seen": 1606176, + "step": 7615 + }, + { + "epoch": 0.8382838283828383, + "grad_norm": 0.6934711337089539, + "learning_rate": 2.0954345434543454e-05, + "loss": 0.1698, + "num_input_tokens_seen": 1607232, + "step": 7620 + }, + { + "epoch": 0.8388338833883389, + "grad_norm": 0.5687171220779419, + "learning_rate": 2.0968096809680972e-05, + "loss": 0.1868, + "num_input_tokens_seen": 1608256, + "step": 7625 + }, + { + "epoch": 0.8393839383938394, + "grad_norm": 0.8044399619102478, + "learning_rate": 2.0981848184818484e-05, + "loss": 0.139, + "num_input_tokens_seen": 1609312, + "step": 7630 + }, + { + "epoch": 0.8399339933993399, + "grad_norm": 1.1268370151519775, + "learning_rate": 2.0995599559956e-05, + "loss": 0.3035, + "num_input_tokens_seen": 1610400, + "step": 7635 + }, + { + "epoch": 0.8404840484048405, + "grad_norm": 0.43306857347488403, + "learning_rate": 2.100935093509351e-05, + "loss": 0.1543, + "num_input_tokens_seen": 1611520, + "step": 7640 + }, + { + "epoch": 0.841034103410341, + "grad_norm": 0.3652004897594452, + "learning_rate": 2.1023102310231025e-05, + "loss": 0.1199, + "num_input_tokens_seen": 1612576, + "step": 7645 + }, + { + "epoch": 0.8415841584158416, + "grad_norm": 0.4353644847869873, + "learning_rate": 2.1036853685368537e-05, + "loss": 0.2249, + "num_input_tokens_seen": 1613664, + "step": 7650 + }, + { + "epoch": 0.8421342134213421, + "grad_norm": 0.7061324119567871, + "learning_rate": 2.1050605060506052e-05, + "loss": 0.1306, + "num_input_tokens_seen": 1614720, + "step": 7655 + }, + { + "epoch": 0.8426842684268426, + "grad_norm": 0.5593791007995605, + "learning_rate": 2.1064356435643564e-05, + "loss": 0.1483, + "num_input_tokens_seen": 1615840, + "step": 7660 + }, + { + "epoch": 0.8432343234323433, + "grad_norm": 0.7657767534255981, + "learning_rate": 2.107810781078108e-05, + "loss": 0.1215, + "num_input_tokens_seen": 1616960, + "step": 7665 + }, + { + "epoch": 0.8437843784378438, + "grad_norm": 0.25743651390075684, + "learning_rate": 2.109185918591859e-05, + "loss": 0.1797, + "num_input_tokens_seen": 1617984, + "step": 7670 + }, + { + "epoch": 0.8443344334433444, + "grad_norm": 0.7746001482009888, + "learning_rate": 2.110561056105611e-05, + "loss": 0.227, + "num_input_tokens_seen": 1619040, + "step": 7675 + }, + { + "epoch": 0.8448844884488449, + "grad_norm": 0.9198168516159058, + "learning_rate": 2.111936193619362e-05, + "loss": 0.205, + "num_input_tokens_seen": 1620128, + "step": 7680 + }, + { + "epoch": 0.8454345434543454, + "grad_norm": 1.030364751815796, + "learning_rate": 2.1133113311331136e-05, + "loss": 0.1573, + "num_input_tokens_seen": 1621152, + "step": 7685 + }, + { + "epoch": 0.845984598459846, + "grad_norm": 0.6984764337539673, + "learning_rate": 2.1146864686468647e-05, + "loss": 0.1507, + "num_input_tokens_seen": 1622208, + "step": 7690 + }, + { + "epoch": 0.8465346534653465, + "grad_norm": 0.5440589189529419, + "learning_rate": 2.1160616061606162e-05, + "loss": 0.0957, + "num_input_tokens_seen": 1623360, + "step": 7695 + }, + { + "epoch": 0.847084708470847, + "grad_norm": 0.5382826924324036, + "learning_rate": 2.1174367436743674e-05, + "loss": 0.2192, + "num_input_tokens_seen": 1624448, + "step": 7700 + }, + { + "epoch": 0.8476347634763476, + "grad_norm": 1.05409836769104, + "learning_rate": 2.118811881188119e-05, + "loss": 0.1469, + "num_input_tokens_seen": 1625504, + "step": 7705 + }, + { + "epoch": 0.8481848184818482, + "grad_norm": 0.6315287947654724, + "learning_rate": 2.12018701870187e-05, + "loss": 0.1603, + "num_input_tokens_seen": 1626688, + "step": 7710 + }, + { + "epoch": 0.8487348734873488, + "grad_norm": 0.8865576386451721, + "learning_rate": 2.1215621562156216e-05, + "loss": 0.1773, + "num_input_tokens_seen": 1627712, + "step": 7715 + }, + { + "epoch": 0.8492849284928493, + "grad_norm": 0.5607420206069946, + "learning_rate": 2.122937293729373e-05, + "loss": 0.1641, + "num_input_tokens_seen": 1628704, + "step": 7720 + }, + { + "epoch": 0.8498349834983498, + "grad_norm": 1.0049190521240234, + "learning_rate": 2.1243124312431246e-05, + "loss": 0.1752, + "num_input_tokens_seen": 1629824, + "step": 7725 + }, + { + "epoch": 0.8503850385038504, + "grad_norm": 0.9026128053665161, + "learning_rate": 2.1256875687568757e-05, + "loss": 0.1749, + "num_input_tokens_seen": 1630880, + "step": 7730 + }, + { + "epoch": 0.8509350935093509, + "grad_norm": 0.4969799518585205, + "learning_rate": 2.1270627062706272e-05, + "loss": 0.1967, + "num_input_tokens_seen": 1631904, + "step": 7735 + }, + { + "epoch": 0.8514851485148515, + "grad_norm": 0.9583823680877686, + "learning_rate": 2.1284378437843784e-05, + "loss": 0.2153, + "num_input_tokens_seen": 1632992, + "step": 7740 + }, + { + "epoch": 0.852035203520352, + "grad_norm": 0.8811134696006775, + "learning_rate": 2.12981298129813e-05, + "loss": 0.1636, + "num_input_tokens_seen": 1634080, + "step": 7745 + }, + { + "epoch": 0.8525852585258525, + "grad_norm": 2.2257120609283447, + "learning_rate": 2.131188118811881e-05, + "loss": 0.2781, + "num_input_tokens_seen": 1635104, + "step": 7750 + }, + { + "epoch": 0.8531353135313532, + "grad_norm": 0.7358458638191223, + "learning_rate": 2.1325632563256326e-05, + "loss": 0.1371, + "num_input_tokens_seen": 1636096, + "step": 7755 + }, + { + "epoch": 0.8536853685368537, + "grad_norm": 0.1862139105796814, + "learning_rate": 2.133938393839384e-05, + "loss": 0.0917, + "num_input_tokens_seen": 1637056, + "step": 7760 + }, + { + "epoch": 0.8542354235423543, + "grad_norm": 0.40669959783554077, + "learning_rate": 2.1353135313531356e-05, + "loss": 0.1922, + "num_input_tokens_seen": 1638176, + "step": 7765 + }, + { + "epoch": 0.8547854785478548, + "grad_norm": 1.0032354593276978, + "learning_rate": 2.1366886688668868e-05, + "loss": 0.1594, + "num_input_tokens_seen": 1639232, + "step": 7770 + }, + { + "epoch": 0.8553355335533553, + "grad_norm": 0.48631879687309265, + "learning_rate": 2.1380638063806383e-05, + "loss": 0.1881, + "num_input_tokens_seen": 1640320, + "step": 7775 + }, + { + "epoch": 0.8558855885588559, + "grad_norm": 0.5374449491500854, + "learning_rate": 2.1394389438943894e-05, + "loss": 0.1637, + "num_input_tokens_seen": 1641344, + "step": 7780 + }, + { + "epoch": 0.8564356435643564, + "grad_norm": 1.0757150650024414, + "learning_rate": 2.140814081408141e-05, + "loss": 0.1808, + "num_input_tokens_seen": 1642464, + "step": 7785 + }, + { + "epoch": 0.856985698569857, + "grad_norm": 0.7648646235466003, + "learning_rate": 2.1421892189218924e-05, + "loss": 0.2396, + "num_input_tokens_seen": 1643552, + "step": 7790 + }, + { + "epoch": 0.8575357535753575, + "grad_norm": 0.30780455470085144, + "learning_rate": 2.1435643564356436e-05, + "loss": 0.1472, + "num_input_tokens_seen": 1644608, + "step": 7795 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 0.3599533140659332, + "learning_rate": 2.144939493949395e-05, + "loss": 0.2181, + "num_input_tokens_seen": 1645664, + "step": 7800 + }, + { + "epoch": 0.8586358635863587, + "grad_norm": 0.8126536011695862, + "learning_rate": 2.1463146314631463e-05, + "loss": 0.155, + "num_input_tokens_seen": 1646688, + "step": 7805 + }, + { + "epoch": 0.8591859185918592, + "grad_norm": 1.0830340385437012, + "learning_rate": 2.1476897689768978e-05, + "loss": 0.1809, + "num_input_tokens_seen": 1647712, + "step": 7810 + }, + { + "epoch": 0.8597359735973598, + "grad_norm": 0.7228633165359497, + "learning_rate": 2.1490649064906493e-05, + "loss": 0.1691, + "num_input_tokens_seen": 1648704, + "step": 7815 + }, + { + "epoch": 0.8602860286028603, + "grad_norm": 0.3521786630153656, + "learning_rate": 2.1504400440044008e-05, + "loss": 0.1402, + "num_input_tokens_seen": 1649728, + "step": 7820 + }, + { + "epoch": 0.8608360836083608, + "grad_norm": 0.4602038264274597, + "learning_rate": 2.151815181518152e-05, + "loss": 0.1356, + "num_input_tokens_seen": 1650784, + "step": 7825 + }, + { + "epoch": 0.8613861386138614, + "grad_norm": 0.2710581421852112, + "learning_rate": 2.1531903190319035e-05, + "loss": 0.1571, + "num_input_tokens_seen": 1651872, + "step": 7830 + }, + { + "epoch": 0.8619361936193619, + "grad_norm": 0.5260083675384521, + "learning_rate": 2.1545654565456546e-05, + "loss": 0.1506, + "num_input_tokens_seen": 1652896, + "step": 7835 + }, + { + "epoch": 0.8624862486248625, + "grad_norm": 0.9034708142280579, + "learning_rate": 2.155940594059406e-05, + "loss": 0.1626, + "num_input_tokens_seen": 1653952, + "step": 7840 + }, + { + "epoch": 0.863036303630363, + "grad_norm": 1.3674778938293457, + "learning_rate": 2.1573157315731573e-05, + "loss": 0.2294, + "num_input_tokens_seen": 1654976, + "step": 7845 + }, + { + "epoch": 0.8635863586358636, + "grad_norm": 1.1698476076126099, + "learning_rate": 2.1586908690869088e-05, + "loss": 0.1751, + "num_input_tokens_seen": 1656064, + "step": 7850 + }, + { + "epoch": 0.8641364136413642, + "grad_norm": 1.0501532554626465, + "learning_rate": 2.16006600660066e-05, + "loss": 0.1601, + "num_input_tokens_seen": 1657120, + "step": 7855 + }, + { + "epoch": 0.8646864686468647, + "grad_norm": 0.38418668508529663, + "learning_rate": 2.1614411441144118e-05, + "loss": 0.1724, + "num_input_tokens_seen": 1658144, + "step": 7860 + }, + { + "epoch": 0.8652365236523653, + "grad_norm": 0.5312408804893494, + "learning_rate": 2.162816281628163e-05, + "loss": 0.1887, + "num_input_tokens_seen": 1659136, + "step": 7865 + }, + { + "epoch": 0.8657865786578658, + "grad_norm": 0.6450070738792419, + "learning_rate": 2.1641914191419145e-05, + "loss": 0.2408, + "num_input_tokens_seen": 1660192, + "step": 7870 + }, + { + "epoch": 0.8663366336633663, + "grad_norm": 0.4651031494140625, + "learning_rate": 2.1655665566556656e-05, + "loss": 0.1625, + "num_input_tokens_seen": 1661280, + "step": 7875 + }, + { + "epoch": 0.8668866886688669, + "grad_norm": 0.5051153302192688, + "learning_rate": 2.166941694169417e-05, + "loss": 0.1314, + "num_input_tokens_seen": 1662336, + "step": 7880 + }, + { + "epoch": 0.8674367436743674, + "grad_norm": 0.5448316931724548, + "learning_rate": 2.1683168316831683e-05, + "loss": 0.1246, + "num_input_tokens_seen": 1663392, + "step": 7885 + }, + { + "epoch": 0.8679867986798679, + "grad_norm": 0.4788656234741211, + "learning_rate": 2.1696919691969198e-05, + "loss": 0.2847, + "num_input_tokens_seen": 1664416, + "step": 7890 + }, + { + "epoch": 0.8685368536853685, + "grad_norm": 0.38866522908210754, + "learning_rate": 2.171067106710671e-05, + "loss": 0.2766, + "num_input_tokens_seen": 1665504, + "step": 7895 + }, + { + "epoch": 0.8690869086908691, + "grad_norm": 0.7691097855567932, + "learning_rate": 2.1724422442244225e-05, + "loss": 0.1321, + "num_input_tokens_seen": 1666592, + "step": 7900 + }, + { + "epoch": 0.8696369636963697, + "grad_norm": 1.5485566854476929, + "learning_rate": 2.1738173817381736e-05, + "loss": 0.2414, + "num_input_tokens_seen": 1667680, + "step": 7905 + }, + { + "epoch": 0.8701870187018702, + "grad_norm": 0.49369823932647705, + "learning_rate": 2.1751925192519255e-05, + "loss": 0.2429, + "num_input_tokens_seen": 1668736, + "step": 7910 + }, + { + "epoch": 0.8707370737073707, + "grad_norm": 0.6830143928527832, + "learning_rate": 2.1765676567656767e-05, + "loss": 0.1747, + "num_input_tokens_seen": 1669824, + "step": 7915 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 1.8998098373413086, + "learning_rate": 2.177942794279428e-05, + "loss": 0.1283, + "num_input_tokens_seen": 1670880, + "step": 7920 + }, + { + "epoch": 0.8718371837183718, + "grad_norm": 0.41780176758766174, + "learning_rate": 2.1793179317931793e-05, + "loss": 0.1486, + "num_input_tokens_seen": 1671936, + "step": 7925 + }, + { + "epoch": 0.8723872387238724, + "grad_norm": 0.2569856643676758, + "learning_rate": 2.1806930693069308e-05, + "loss": 0.1557, + "num_input_tokens_seen": 1672928, + "step": 7930 + }, + { + "epoch": 0.8729372937293729, + "grad_norm": 1.2110341787338257, + "learning_rate": 2.182068206820682e-05, + "loss": 0.1314, + "num_input_tokens_seen": 1673984, + "step": 7935 + }, + { + "epoch": 0.8734873487348734, + "grad_norm": 0.6469191908836365, + "learning_rate": 2.1834433443344335e-05, + "loss": 0.132, + "num_input_tokens_seen": 1675040, + "step": 7940 + }, + { + "epoch": 0.8740374037403741, + "grad_norm": 1.1740649938583374, + "learning_rate": 2.1848184818481847e-05, + "loss": 0.1904, + "num_input_tokens_seen": 1676096, + "step": 7945 + }, + { + "epoch": 0.8745874587458746, + "grad_norm": 0.49126729369163513, + "learning_rate": 2.186193619361936e-05, + "loss": 0.1386, + "num_input_tokens_seen": 1677184, + "step": 7950 + }, + { + "epoch": 0.8751375137513752, + "grad_norm": 1.523196816444397, + "learning_rate": 2.1875687568756877e-05, + "loss": 0.1777, + "num_input_tokens_seen": 1678304, + "step": 7955 + }, + { + "epoch": 0.8756875687568757, + "grad_norm": 0.893833577632904, + "learning_rate": 2.1889438943894392e-05, + "loss": 0.1213, + "num_input_tokens_seen": 1679360, + "step": 7960 + }, + { + "epoch": 0.8762376237623762, + "grad_norm": 0.4640323519706726, + "learning_rate": 2.1903190319031903e-05, + "loss": 0.1816, + "num_input_tokens_seen": 1680384, + "step": 7965 + }, + { + "epoch": 0.8767876787678768, + "grad_norm": 0.7728114724159241, + "learning_rate": 2.191694169416942e-05, + "loss": 0.1592, + "num_input_tokens_seen": 1681440, + "step": 7970 + }, + { + "epoch": 0.8773377337733773, + "grad_norm": 0.6209378838539124, + "learning_rate": 2.193069306930693e-05, + "loss": 0.1487, + "num_input_tokens_seen": 1682464, + "step": 7975 + }, + { + "epoch": 0.8778877887788779, + "grad_norm": 0.28649768233299255, + "learning_rate": 2.1944444444444445e-05, + "loss": 0.1956, + "num_input_tokens_seen": 1683552, + "step": 7980 + }, + { + "epoch": 0.8784378437843784, + "grad_norm": 1.0788241624832153, + "learning_rate": 2.195819581958196e-05, + "loss": 0.183, + "num_input_tokens_seen": 1684576, + "step": 7985 + }, + { + "epoch": 0.878987898789879, + "grad_norm": 0.11140725016593933, + "learning_rate": 2.1971947194719472e-05, + "loss": 0.0607, + "num_input_tokens_seen": 1685664, + "step": 7990 + }, + { + "epoch": 0.8795379537953796, + "grad_norm": 0.695238471031189, + "learning_rate": 2.1985698569856987e-05, + "loss": 0.139, + "num_input_tokens_seen": 1686720, + "step": 7995 + }, + { + "epoch": 0.8800880088008801, + "grad_norm": 0.8095741271972656, + "learning_rate": 2.1999449944994502e-05, + "loss": 0.2205, + "num_input_tokens_seen": 1687744, + "step": 8000 + }, + { + "epoch": 0.8806380638063807, + "grad_norm": 0.6229837536811829, + "learning_rate": 2.2013201320132017e-05, + "loss": 0.1633, + "num_input_tokens_seen": 1688800, + "step": 8005 + }, + { + "epoch": 0.8811881188118812, + "grad_norm": 0.8102208375930786, + "learning_rate": 2.202695269526953e-05, + "loss": 0.1965, + "num_input_tokens_seen": 1689856, + "step": 8010 + }, + { + "epoch": 0.8817381738173817, + "grad_norm": 0.22478114068508148, + "learning_rate": 2.2040704070407044e-05, + "loss": 0.2205, + "num_input_tokens_seen": 1690912, + "step": 8015 + }, + { + "epoch": 0.8822882288228823, + "grad_norm": 0.3248347043991089, + "learning_rate": 2.2054455445544555e-05, + "loss": 0.1862, + "num_input_tokens_seen": 1691936, + "step": 8020 + }, + { + "epoch": 0.8828382838283828, + "grad_norm": 1.523643970489502, + "learning_rate": 2.206820682068207e-05, + "loss": 0.2822, + "num_input_tokens_seen": 1692960, + "step": 8025 + }, + { + "epoch": 0.8833883388338833, + "grad_norm": 0.4339728355407715, + "learning_rate": 2.2081958195819582e-05, + "loss": 0.1285, + "num_input_tokens_seen": 1694016, + "step": 8030 + }, + { + "epoch": 0.8839383938393839, + "grad_norm": 0.27770867943763733, + "learning_rate": 2.2095709570957097e-05, + "loss": 0.1096, + "num_input_tokens_seen": 1695104, + "step": 8035 + }, + { + "epoch": 0.8844884488448845, + "grad_norm": 0.34798377752304077, + "learning_rate": 2.210946094609461e-05, + "loss": 0.0902, + "num_input_tokens_seen": 1696192, + "step": 8040 + }, + { + "epoch": 0.8850385038503851, + "grad_norm": 0.7594767212867737, + "learning_rate": 2.2123212321232124e-05, + "loss": 0.1815, + "num_input_tokens_seen": 1697280, + "step": 8045 + }, + { + "epoch": 0.8855885588558856, + "grad_norm": 0.6923033595085144, + "learning_rate": 2.213696369636964e-05, + "loss": 0.1593, + "num_input_tokens_seen": 1698240, + "step": 8050 + }, + { + "epoch": 0.8861386138613861, + "grad_norm": 0.5488566160202026, + "learning_rate": 2.2150715071507154e-05, + "loss": 0.1418, + "num_input_tokens_seen": 1699360, + "step": 8055 + }, + { + "epoch": 0.8866886688668867, + "grad_norm": 0.6276061534881592, + "learning_rate": 2.2164466446644666e-05, + "loss": 0.1428, + "num_input_tokens_seen": 1700416, + "step": 8060 + }, + { + "epoch": 0.8872387238723872, + "grad_norm": 0.3230753242969513, + "learning_rate": 2.217821782178218e-05, + "loss": 0.1234, + "num_input_tokens_seen": 1701504, + "step": 8065 + }, + { + "epoch": 0.8877887788778878, + "grad_norm": 0.4081329107284546, + "learning_rate": 2.2191969196919692e-05, + "loss": 0.1199, + "num_input_tokens_seen": 1702560, + "step": 8070 + }, + { + "epoch": 0.8883388338833883, + "grad_norm": 0.44825851917266846, + "learning_rate": 2.2205720572057207e-05, + "loss": 0.1598, + "num_input_tokens_seen": 1703584, + "step": 8075 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.580216109752655, + "learning_rate": 2.221947194719472e-05, + "loss": 0.1581, + "num_input_tokens_seen": 1704576, + "step": 8080 + }, + { + "epoch": 0.8894389438943895, + "grad_norm": 0.4522213935852051, + "learning_rate": 2.2233223322332234e-05, + "loss": 0.2066, + "num_input_tokens_seen": 1705728, + "step": 8085 + }, + { + "epoch": 0.88998899889989, + "grad_norm": 0.4120364785194397, + "learning_rate": 2.2246974697469746e-05, + "loss": 0.1238, + "num_input_tokens_seen": 1706784, + "step": 8090 + }, + { + "epoch": 0.8905390539053906, + "grad_norm": 0.3665301203727722, + "learning_rate": 2.2260726072607264e-05, + "loss": 0.0988, + "num_input_tokens_seen": 1707936, + "step": 8095 + }, + { + "epoch": 0.8910891089108911, + "grad_norm": 0.6927698254585266, + "learning_rate": 2.2274477447744776e-05, + "loss": 0.2002, + "num_input_tokens_seen": 1709056, + "step": 8100 + }, + { + "epoch": 0.8916391639163916, + "grad_norm": 0.8063819408416748, + "learning_rate": 2.228822882288229e-05, + "loss": 0.131, + "num_input_tokens_seen": 1710144, + "step": 8105 + }, + { + "epoch": 0.8921892189218922, + "grad_norm": 0.4227294325828552, + "learning_rate": 2.2301980198019802e-05, + "loss": 0.2188, + "num_input_tokens_seen": 1711232, + "step": 8110 + }, + { + "epoch": 0.8927392739273927, + "grad_norm": 0.10828021168708801, + "learning_rate": 2.2315731573157317e-05, + "loss": 0.1154, + "num_input_tokens_seen": 1712320, + "step": 8115 + }, + { + "epoch": 0.8932893289328933, + "grad_norm": 0.7886452674865723, + "learning_rate": 2.232948294829483e-05, + "loss": 0.2318, + "num_input_tokens_seen": 1713376, + "step": 8120 + }, + { + "epoch": 0.8938393839383938, + "grad_norm": 0.606360137462616, + "learning_rate": 2.2343234323432344e-05, + "loss": 0.1741, + "num_input_tokens_seen": 1714432, + "step": 8125 + }, + { + "epoch": 0.8943894389438944, + "grad_norm": 0.6236941814422607, + "learning_rate": 2.2356985698569856e-05, + "loss": 0.1954, + "num_input_tokens_seen": 1715392, + "step": 8130 + }, + { + "epoch": 0.894939493949395, + "grad_norm": 0.6111241579055786, + "learning_rate": 2.237073707370737e-05, + "loss": 0.1646, + "num_input_tokens_seen": 1716352, + "step": 8135 + }, + { + "epoch": 0.8954895489548955, + "grad_norm": 1.0119892358779907, + "learning_rate": 2.2384488448844886e-05, + "loss": 0.1587, + "num_input_tokens_seen": 1717376, + "step": 8140 + }, + { + "epoch": 0.8960396039603961, + "grad_norm": 0.35106122493743896, + "learning_rate": 2.23982398239824e-05, + "loss": 0.098, + "num_input_tokens_seen": 1718496, + "step": 8145 + }, + { + "epoch": 0.8965896589658966, + "grad_norm": 0.7409602999687195, + "learning_rate": 2.2411991199119913e-05, + "loss": 0.1612, + "num_input_tokens_seen": 1719552, + "step": 8150 + }, + { + "epoch": 0.8971397139713971, + "grad_norm": 0.4570341110229492, + "learning_rate": 2.2425742574257428e-05, + "loss": 0.1375, + "num_input_tokens_seen": 1720640, + "step": 8155 + }, + { + "epoch": 0.8976897689768977, + "grad_norm": 0.5596632361412048, + "learning_rate": 2.243949394939494e-05, + "loss": 0.1069, + "num_input_tokens_seen": 1721728, + "step": 8160 + }, + { + "epoch": 0.8982398239823982, + "grad_norm": 0.37307798862457275, + "learning_rate": 2.2453245324532454e-05, + "loss": 0.1962, + "num_input_tokens_seen": 1722752, + "step": 8165 + }, + { + "epoch": 0.8987898789878987, + "grad_norm": 2.270534038543701, + "learning_rate": 2.2466996699669966e-05, + "loss": 0.223, + "num_input_tokens_seen": 1723872, + "step": 8170 + }, + { + "epoch": 0.8993399339933993, + "grad_norm": 0.3879084289073944, + "learning_rate": 2.248074807480748e-05, + "loss": 0.15, + "num_input_tokens_seen": 1724896, + "step": 8175 + }, + { + "epoch": 0.8998899889988999, + "grad_norm": 0.3409857451915741, + "learning_rate": 2.2494499449944996e-05, + "loss": 0.2364, + "num_input_tokens_seen": 1725984, + "step": 8180 + }, + { + "epoch": 0.9004400440044005, + "grad_norm": 0.6945348978042603, + "learning_rate": 2.2508250825082508e-05, + "loss": 0.1294, + "num_input_tokens_seen": 1727040, + "step": 8185 + }, + { + "epoch": 0.900990099009901, + "grad_norm": 0.46849849820137024, + "learning_rate": 2.2522002200220023e-05, + "loss": 0.1285, + "num_input_tokens_seen": 1728064, + "step": 8190 + }, + { + "epoch": 0.9015401540154016, + "grad_norm": 0.8371366858482361, + "learning_rate": 2.2535753575357538e-05, + "loss": 0.1926, + "num_input_tokens_seen": 1729088, + "step": 8195 + }, + { + "epoch": 0.9020902090209021, + "grad_norm": 0.617095410823822, + "learning_rate": 2.2549504950495053e-05, + "loss": 0.1864, + "num_input_tokens_seen": 1730208, + "step": 8200 + }, + { + "epoch": 0.9026402640264026, + "grad_norm": 0.6542221307754517, + "learning_rate": 2.2563256325632565e-05, + "loss": 0.1633, + "num_input_tokens_seen": 1731200, + "step": 8205 + }, + { + "epoch": 0.9031903190319032, + "grad_norm": 0.6131922006607056, + "learning_rate": 2.257700770077008e-05, + "loss": 0.1923, + "num_input_tokens_seen": 1732224, + "step": 8210 + }, + { + "epoch": 0.9037403740374037, + "grad_norm": 1.4117765426635742, + "learning_rate": 2.259075907590759e-05, + "loss": 0.1535, + "num_input_tokens_seen": 1733280, + "step": 8215 + }, + { + "epoch": 0.9042904290429042, + "grad_norm": 0.48112866282463074, + "learning_rate": 2.2604510451045106e-05, + "loss": 0.1386, + "num_input_tokens_seen": 1734336, + "step": 8220 + }, + { + "epoch": 0.9048404840484049, + "grad_norm": 1.1731230020523071, + "learning_rate": 2.2618261826182618e-05, + "loss": 0.1439, + "num_input_tokens_seen": 1735360, + "step": 8225 + }, + { + "epoch": 0.9053905390539054, + "grad_norm": 1.0625547170639038, + "learning_rate": 2.2632013201320133e-05, + "loss": 0.1476, + "num_input_tokens_seen": 1736448, + "step": 8230 + }, + { + "epoch": 0.905940594059406, + "grad_norm": 0.4534178376197815, + "learning_rate": 2.2645764576457648e-05, + "loss": 0.1363, + "num_input_tokens_seen": 1737536, + "step": 8235 + }, + { + "epoch": 0.9064906490649065, + "grad_norm": 0.6832531690597534, + "learning_rate": 2.2659515951595163e-05, + "loss": 0.1103, + "num_input_tokens_seen": 1738528, + "step": 8240 + }, + { + "epoch": 0.907040704070407, + "grad_norm": 0.9224683046340942, + "learning_rate": 2.2673267326732675e-05, + "loss": 0.191, + "num_input_tokens_seen": 1739584, + "step": 8245 + }, + { + "epoch": 0.9075907590759076, + "grad_norm": 0.8074817657470703, + "learning_rate": 2.268701870187019e-05, + "loss": 0.1412, + "num_input_tokens_seen": 1740608, + "step": 8250 + }, + { + "epoch": 0.9081408140814081, + "grad_norm": 0.18717610836029053, + "learning_rate": 2.27007700770077e-05, + "loss": 0.1994, + "num_input_tokens_seen": 1741664, + "step": 8255 + }, + { + "epoch": 0.9086908690869087, + "grad_norm": 0.4041171371936798, + "learning_rate": 2.2714521452145216e-05, + "loss": 0.0931, + "num_input_tokens_seen": 1742656, + "step": 8260 + }, + { + "epoch": 0.9092409240924092, + "grad_norm": 0.9502046704292297, + "learning_rate": 2.2728272827282728e-05, + "loss": 0.2828, + "num_input_tokens_seen": 1743648, + "step": 8265 + }, + { + "epoch": 0.9097909790979097, + "grad_norm": 0.45591485500335693, + "learning_rate": 2.2742024202420243e-05, + "loss": 0.1657, + "num_input_tokens_seen": 1744704, + "step": 8270 + }, + { + "epoch": 0.9103410341034104, + "grad_norm": 0.25619062781333923, + "learning_rate": 2.2755775577557755e-05, + "loss": 0.1117, + "num_input_tokens_seen": 1745760, + "step": 8275 + }, + { + "epoch": 0.9108910891089109, + "grad_norm": 1.2084217071533203, + "learning_rate": 2.276952695269527e-05, + "loss": 0.1796, + "num_input_tokens_seen": 1746880, + "step": 8280 + }, + { + "epoch": 0.9114411441144115, + "grad_norm": 0.6955836415290833, + "learning_rate": 2.2783278327832785e-05, + "loss": 0.1253, + "num_input_tokens_seen": 1747968, + "step": 8285 + }, + { + "epoch": 0.911991199119912, + "grad_norm": 0.9033387899398804, + "learning_rate": 2.27970297029703e-05, + "loss": 0.1694, + "num_input_tokens_seen": 1748992, + "step": 8290 + }, + { + "epoch": 0.9125412541254125, + "grad_norm": 1.1268633604049683, + "learning_rate": 2.281078107810781e-05, + "loss": 0.1241, + "num_input_tokens_seen": 1750048, + "step": 8295 + }, + { + "epoch": 0.9130913091309131, + "grad_norm": 1.217848777770996, + "learning_rate": 2.2824532453245327e-05, + "loss": 0.2002, + "num_input_tokens_seen": 1751072, + "step": 8300 + }, + { + "epoch": 0.9136413641364136, + "grad_norm": 0.9187437295913696, + "learning_rate": 2.2838283828382838e-05, + "loss": 0.2038, + "num_input_tokens_seen": 1752096, + "step": 8305 + }, + { + "epoch": 0.9141914191419142, + "grad_norm": 0.47805628180503845, + "learning_rate": 2.2852035203520353e-05, + "loss": 0.1867, + "num_input_tokens_seen": 1753152, + "step": 8310 + }, + { + "epoch": 0.9147414741474147, + "grad_norm": 0.42982929944992065, + "learning_rate": 2.2865786578657865e-05, + "loss": 0.1267, + "num_input_tokens_seen": 1754240, + "step": 8315 + }, + { + "epoch": 0.9152915291529153, + "grad_norm": 0.7833993434906006, + "learning_rate": 2.287953795379538e-05, + "loss": 0.1535, + "num_input_tokens_seen": 1755328, + "step": 8320 + }, + { + "epoch": 0.9158415841584159, + "grad_norm": 0.2961665391921997, + "learning_rate": 2.289328932893289e-05, + "loss": 0.1204, + "num_input_tokens_seen": 1756384, + "step": 8325 + }, + { + "epoch": 0.9163916391639164, + "grad_norm": 0.5537283420562744, + "learning_rate": 2.290704070407041e-05, + "loss": 0.1453, + "num_input_tokens_seen": 1757376, + "step": 8330 + }, + { + "epoch": 0.916941694169417, + "grad_norm": 0.37555888295173645, + "learning_rate": 2.2920792079207922e-05, + "loss": 0.1434, + "num_input_tokens_seen": 1758432, + "step": 8335 + }, + { + "epoch": 0.9174917491749175, + "grad_norm": 0.7696210741996765, + "learning_rate": 2.2934543454345437e-05, + "loss": 0.1387, + "num_input_tokens_seen": 1759424, + "step": 8340 + }, + { + "epoch": 0.918041804180418, + "grad_norm": 0.7127953171730042, + "learning_rate": 2.294829482948295e-05, + "loss": 0.1771, + "num_input_tokens_seen": 1760480, + "step": 8345 + }, + { + "epoch": 0.9185918591859186, + "grad_norm": 0.9269065856933594, + "learning_rate": 2.2962046204620464e-05, + "loss": 0.1107, + "num_input_tokens_seen": 1761536, + "step": 8350 + }, + { + "epoch": 0.9191419141914191, + "grad_norm": 0.5588352680206299, + "learning_rate": 2.2975797579757975e-05, + "loss": 0.1482, + "num_input_tokens_seen": 1762624, + "step": 8355 + }, + { + "epoch": 0.9196919691969196, + "grad_norm": 0.9971650242805481, + "learning_rate": 2.298954895489549e-05, + "loss": 0.2177, + "num_input_tokens_seen": 1763648, + "step": 8360 + }, + { + "epoch": 0.9202420242024203, + "grad_norm": 0.28213509917259216, + "learning_rate": 2.3003300330033002e-05, + "loss": 0.1352, + "num_input_tokens_seen": 1764672, + "step": 8365 + }, + { + "epoch": 0.9207920792079208, + "grad_norm": 0.7019941806793213, + "learning_rate": 2.3017051705170517e-05, + "loss": 0.2162, + "num_input_tokens_seen": 1765728, + "step": 8370 + }, + { + "epoch": 0.9213421342134214, + "grad_norm": 0.3166615664958954, + "learning_rate": 2.3030803080308032e-05, + "loss": 0.0892, + "num_input_tokens_seen": 1766784, + "step": 8375 + }, + { + "epoch": 0.9218921892189219, + "grad_norm": 0.9328470230102539, + "learning_rate": 2.3044554455445547e-05, + "loss": 0.1746, + "num_input_tokens_seen": 1767904, + "step": 8380 + }, + { + "epoch": 0.9224422442244224, + "grad_norm": 0.4316571056842804, + "learning_rate": 2.305830583058306e-05, + "loss": 0.196, + "num_input_tokens_seen": 1768928, + "step": 8385 + }, + { + "epoch": 0.922992299229923, + "grad_norm": 0.6485761404037476, + "learning_rate": 2.3072057205720574e-05, + "loss": 0.0958, + "num_input_tokens_seen": 1770016, + "step": 8390 + }, + { + "epoch": 0.9235423542354235, + "grad_norm": 1.2692272663116455, + "learning_rate": 2.308580858085809e-05, + "loss": 0.1065, + "num_input_tokens_seen": 1771040, + "step": 8395 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 1.1475574970245361, + "learning_rate": 2.30995599559956e-05, + "loss": 0.1179, + "num_input_tokens_seen": 1772096, + "step": 8400 + }, + { + "epoch": 0.9246424642464246, + "grad_norm": 0.5107559561729431, + "learning_rate": 2.3113311331133115e-05, + "loss": 0.2196, + "num_input_tokens_seen": 1773184, + "step": 8405 + }, + { + "epoch": 0.9251925192519251, + "grad_norm": 0.3779429793357849, + "learning_rate": 2.3127062706270627e-05, + "loss": 0.1196, + "num_input_tokens_seen": 1774240, + "step": 8410 + }, + { + "epoch": 0.9257425742574258, + "grad_norm": 0.4740526080131531, + "learning_rate": 2.3140814081408142e-05, + "loss": 0.1255, + "num_input_tokens_seen": 1775360, + "step": 8415 + }, + { + "epoch": 0.9262926292629263, + "grad_norm": 0.48229190707206726, + "learning_rate": 2.3154565456545654e-05, + "loss": 0.1272, + "num_input_tokens_seen": 1776480, + "step": 8420 + }, + { + "epoch": 0.9268426842684269, + "grad_norm": 0.49248456954956055, + "learning_rate": 2.3168316831683172e-05, + "loss": 0.1082, + "num_input_tokens_seen": 1777536, + "step": 8425 + }, + { + "epoch": 0.9273927392739274, + "grad_norm": 0.5937752723693848, + "learning_rate": 2.3182068206820684e-05, + "loss": 0.1787, + "num_input_tokens_seen": 1778624, + "step": 8430 + }, + { + "epoch": 0.9279427942794279, + "grad_norm": 0.41693758964538574, + "learning_rate": 2.31958195819582e-05, + "loss": 0.2179, + "num_input_tokens_seen": 1779616, + "step": 8435 + }, + { + "epoch": 0.9284928492849285, + "grad_norm": 1.097233772277832, + "learning_rate": 2.320957095709571e-05, + "loss": 0.1785, + "num_input_tokens_seen": 1780672, + "step": 8440 + }, + { + "epoch": 0.929042904290429, + "grad_norm": 0.44114598631858826, + "learning_rate": 2.3223322332233226e-05, + "loss": 0.1646, + "num_input_tokens_seen": 1781792, + "step": 8445 + }, + { + "epoch": 0.9295929592959296, + "grad_norm": 0.933688223361969, + "learning_rate": 2.3237073707370737e-05, + "loss": 0.1669, + "num_input_tokens_seen": 1782816, + "step": 8450 + }, + { + "epoch": 0.9301430143014301, + "grad_norm": 1.020552158355713, + "learning_rate": 2.3250825082508252e-05, + "loss": 0.2027, + "num_input_tokens_seen": 1783936, + "step": 8455 + }, + { + "epoch": 0.9306930693069307, + "grad_norm": 0.4434794485569, + "learning_rate": 2.3264576457645764e-05, + "loss": 0.2536, + "num_input_tokens_seen": 1784992, + "step": 8460 + }, + { + "epoch": 0.9312431243124313, + "grad_norm": 0.3412996530532837, + "learning_rate": 2.327832783278328e-05, + "loss": 0.1392, + "num_input_tokens_seen": 1786112, + "step": 8465 + }, + { + "epoch": 0.9317931793179318, + "grad_norm": 2.0037782192230225, + "learning_rate": 2.3292079207920794e-05, + "loss": 0.2024, + "num_input_tokens_seen": 1787136, + "step": 8470 + }, + { + "epoch": 0.9323432343234324, + "grad_norm": 0.8467315435409546, + "learning_rate": 2.330583058305831e-05, + "loss": 0.1558, + "num_input_tokens_seen": 1788160, + "step": 8475 + }, + { + "epoch": 0.9328932893289329, + "grad_norm": 0.9216558933258057, + "learning_rate": 2.331958195819582e-05, + "loss": 0.1335, + "num_input_tokens_seen": 1789184, + "step": 8480 + }, + { + "epoch": 0.9334433443344334, + "grad_norm": 0.8018965125083923, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.1684, + "num_input_tokens_seen": 1790176, + "step": 8485 + }, + { + "epoch": 0.933993399339934, + "grad_norm": 0.61113041639328, + "learning_rate": 2.3347084708470847e-05, + "loss": 0.1348, + "num_input_tokens_seen": 1791232, + "step": 8490 + }, + { + "epoch": 0.9345434543454345, + "grad_norm": 0.6555071473121643, + "learning_rate": 2.3360836083608363e-05, + "loss": 0.2593, + "num_input_tokens_seen": 1792224, + "step": 8495 + }, + { + "epoch": 0.935093509350935, + "grad_norm": 0.6883305311203003, + "learning_rate": 2.3374587458745874e-05, + "loss": 0.1383, + "num_input_tokens_seen": 1793248, + "step": 8500 + }, + { + "epoch": 0.9356435643564357, + "grad_norm": 0.6761580109596252, + "learning_rate": 2.338833883388339e-05, + "loss": 0.1257, + "num_input_tokens_seen": 1794336, + "step": 8505 + }, + { + "epoch": 0.9361936193619362, + "grad_norm": 0.5040254592895508, + "learning_rate": 2.34020902090209e-05, + "loss": 0.1125, + "num_input_tokens_seen": 1795424, + "step": 8510 + }, + { + "epoch": 0.9367436743674368, + "grad_norm": 0.7636817097663879, + "learning_rate": 2.341584158415842e-05, + "loss": 0.119, + "num_input_tokens_seen": 1796480, + "step": 8515 + }, + { + "epoch": 0.9372937293729373, + "grad_norm": 0.29057127237319946, + "learning_rate": 2.342959295929593e-05, + "loss": 0.177, + "num_input_tokens_seen": 1797600, + "step": 8520 + }, + { + "epoch": 0.9378437843784379, + "grad_norm": 0.3291165232658386, + "learning_rate": 2.3443344334433446e-05, + "loss": 0.128, + "num_input_tokens_seen": 1798656, + "step": 8525 + }, + { + "epoch": 0.9383938393839384, + "grad_norm": 1.1930230855941772, + "learning_rate": 2.3457095709570958e-05, + "loss": 0.188, + "num_input_tokens_seen": 1799744, + "step": 8530 + }, + { + "epoch": 0.9389438943894389, + "grad_norm": 0.42711517214775085, + "learning_rate": 2.3470847084708473e-05, + "loss": 0.1771, + "num_input_tokens_seen": 1800768, + "step": 8535 + }, + { + "epoch": 0.9394939493949395, + "grad_norm": 0.22071461379528046, + "learning_rate": 2.3484598459845984e-05, + "loss": 0.1428, + "num_input_tokens_seen": 1801792, + "step": 8540 + }, + { + "epoch": 0.94004400440044, + "grad_norm": 0.7873121500015259, + "learning_rate": 2.34983498349835e-05, + "loss": 0.1564, + "num_input_tokens_seen": 1802848, + "step": 8545 + }, + { + "epoch": 0.9405940594059405, + "grad_norm": 1.6913957595825195, + "learning_rate": 2.351210121012101e-05, + "loss": 0.1879, + "num_input_tokens_seen": 1803840, + "step": 8550 + }, + { + "epoch": 0.9411441144114412, + "grad_norm": 1.2222909927368164, + "learning_rate": 2.3525852585258526e-05, + "loss": 0.2032, + "num_input_tokens_seen": 1804928, + "step": 8555 + }, + { + "epoch": 0.9416941694169417, + "grad_norm": 0.26133060455322266, + "learning_rate": 2.3539603960396038e-05, + "loss": 0.1361, + "num_input_tokens_seen": 1805888, + "step": 8560 + }, + { + "epoch": 0.9422442244224423, + "grad_norm": 3.688931703567505, + "learning_rate": 2.3553355335533556e-05, + "loss": 0.2536, + "num_input_tokens_seen": 1807008, + "step": 8565 + }, + { + "epoch": 0.9427942794279428, + "grad_norm": 0.70442134141922, + "learning_rate": 2.3567106710671068e-05, + "loss": 0.0984, + "num_input_tokens_seen": 1808000, + "step": 8570 + }, + { + "epoch": 0.9433443344334433, + "grad_norm": 0.39651191234588623, + "learning_rate": 2.3580858085808583e-05, + "loss": 0.1411, + "num_input_tokens_seen": 1809088, + "step": 8575 + }, + { + "epoch": 0.9438943894389439, + "grad_norm": 0.6065769195556641, + "learning_rate": 2.3594609460946095e-05, + "loss": 0.2231, + "num_input_tokens_seen": 1810208, + "step": 8580 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 1.0015478134155273, + "learning_rate": 2.360836083608361e-05, + "loss": 0.2067, + "num_input_tokens_seen": 1811264, + "step": 8585 + }, + { + "epoch": 0.944994499449945, + "grad_norm": 0.5098174810409546, + "learning_rate": 2.3622112211221125e-05, + "loss": 0.1447, + "num_input_tokens_seen": 1812384, + "step": 8590 + }, + { + "epoch": 0.9455445544554455, + "grad_norm": 1.5524526834487915, + "learning_rate": 2.3635863586358636e-05, + "loss": 0.1791, + "num_input_tokens_seen": 1813472, + "step": 8595 + }, + { + "epoch": 0.9460946094609461, + "grad_norm": 0.2994003891944885, + "learning_rate": 2.364961496149615e-05, + "loss": 0.1014, + "num_input_tokens_seen": 1814592, + "step": 8600 + }, + { + "epoch": 0.9466446644664467, + "grad_norm": 0.7982486486434937, + "learning_rate": 2.3663366336633663e-05, + "loss": 0.2286, + "num_input_tokens_seen": 1815584, + "step": 8605 + }, + { + "epoch": 0.9471947194719472, + "grad_norm": 0.686051070690155, + "learning_rate": 2.3677117711771178e-05, + "loss": 0.1232, + "num_input_tokens_seen": 1816640, + "step": 8610 + }, + { + "epoch": 0.9477447744774478, + "grad_norm": 0.3735858201980591, + "learning_rate": 2.3690869086908693e-05, + "loss": 0.1435, + "num_input_tokens_seen": 1817696, + "step": 8615 + }, + { + "epoch": 0.9482948294829483, + "grad_norm": 0.6856710910797119, + "learning_rate": 2.3704620462046208e-05, + "loss": 0.1615, + "num_input_tokens_seen": 1818848, + "step": 8620 + }, + { + "epoch": 0.9488448844884488, + "grad_norm": 0.48393091559410095, + "learning_rate": 2.371837183718372e-05, + "loss": 0.1345, + "num_input_tokens_seen": 1819936, + "step": 8625 + }, + { + "epoch": 0.9493949394939494, + "grad_norm": 0.5056939721107483, + "learning_rate": 2.3732123212321235e-05, + "loss": 0.1439, + "num_input_tokens_seen": 1821024, + "step": 8630 + }, + { + "epoch": 0.9499449944994499, + "grad_norm": 0.4657742977142334, + "learning_rate": 2.3745874587458746e-05, + "loss": 0.1115, + "num_input_tokens_seen": 1822048, + "step": 8635 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 0.474462628364563, + "learning_rate": 2.375962596259626e-05, + "loss": 0.1842, + "num_input_tokens_seen": 1823168, + "step": 8640 + }, + { + "epoch": 0.9510451045104511, + "grad_norm": 0.7416431307792664, + "learning_rate": 2.3773377337733773e-05, + "loss": 0.1158, + "num_input_tokens_seen": 1824256, + "step": 8645 + }, + { + "epoch": 0.9515951595159516, + "grad_norm": 1.4137849807739258, + "learning_rate": 2.3787128712871288e-05, + "loss": 0.1798, + "num_input_tokens_seen": 1825376, + "step": 8650 + }, + { + "epoch": 0.9521452145214522, + "grad_norm": 0.2591065764427185, + "learning_rate": 2.38008800880088e-05, + "loss": 0.1514, + "num_input_tokens_seen": 1826400, + "step": 8655 + }, + { + "epoch": 0.9526952695269527, + "grad_norm": 0.6343482732772827, + "learning_rate": 2.3814631463146318e-05, + "loss": 0.1923, + "num_input_tokens_seen": 1827520, + "step": 8660 + }, + { + "epoch": 0.9532453245324533, + "grad_norm": 0.5735124349594116, + "learning_rate": 2.382838283828383e-05, + "loss": 0.155, + "num_input_tokens_seen": 1828544, + "step": 8665 + }, + { + "epoch": 0.9537953795379538, + "grad_norm": 0.6014032959938049, + "learning_rate": 2.3842134213421345e-05, + "loss": 0.1654, + "num_input_tokens_seen": 1829568, + "step": 8670 + }, + { + "epoch": 0.9543454345434543, + "grad_norm": 0.6091482043266296, + "learning_rate": 2.3855885588558857e-05, + "loss": 0.1641, + "num_input_tokens_seen": 1830656, + "step": 8675 + }, + { + "epoch": 0.9548954895489549, + "grad_norm": 0.21380217373371124, + "learning_rate": 2.386963696369637e-05, + "loss": 0.1277, + "num_input_tokens_seen": 1831680, + "step": 8680 + }, + { + "epoch": 0.9554455445544554, + "grad_norm": 0.33877092599868774, + "learning_rate": 2.3883388338833883e-05, + "loss": 0.1476, + "num_input_tokens_seen": 1832736, + "step": 8685 + }, + { + "epoch": 0.9559955995599559, + "grad_norm": 0.9343296885490417, + "learning_rate": 2.38971397139714e-05, + "loss": 0.1083, + "num_input_tokens_seen": 1833856, + "step": 8690 + }, + { + "epoch": 0.9565456545654566, + "grad_norm": 0.26509907841682434, + "learning_rate": 2.391089108910891e-05, + "loss": 0.1276, + "num_input_tokens_seen": 1834848, + "step": 8695 + }, + { + "epoch": 0.9570957095709571, + "grad_norm": 0.5523957014083862, + "learning_rate": 2.3924642464246425e-05, + "loss": 0.1852, + "num_input_tokens_seen": 1835936, + "step": 8700 + }, + { + "epoch": 0.9576457645764577, + "grad_norm": 0.2973683774471283, + "learning_rate": 2.393839383938394e-05, + "loss": 0.084, + "num_input_tokens_seen": 1836992, + "step": 8705 + }, + { + "epoch": 0.9581958195819582, + "grad_norm": 0.26643475890159607, + "learning_rate": 2.3952145214521455e-05, + "loss": 0.1194, + "num_input_tokens_seen": 1837984, + "step": 8710 + }, + { + "epoch": 0.9587458745874587, + "grad_norm": 0.8128577470779419, + "learning_rate": 2.3965896589658967e-05, + "loss": 0.1146, + "num_input_tokens_seen": 1839040, + "step": 8715 + }, + { + "epoch": 0.9592959295929593, + "grad_norm": 0.7807128429412842, + "learning_rate": 2.3979647964796482e-05, + "loss": 0.1092, + "num_input_tokens_seen": 1840128, + "step": 8720 + }, + { + "epoch": 0.9598459845984598, + "grad_norm": 0.6172998547554016, + "learning_rate": 2.3993399339933994e-05, + "loss": 0.1236, + "num_input_tokens_seen": 1841152, + "step": 8725 + }, + { + "epoch": 0.9603960396039604, + "grad_norm": 0.5443148612976074, + "learning_rate": 2.400715071507151e-05, + "loss": 0.1863, + "num_input_tokens_seen": 1842272, + "step": 8730 + }, + { + "epoch": 0.9609460946094609, + "grad_norm": 0.4212806820869446, + "learning_rate": 2.402090209020902e-05, + "loss": 0.1904, + "num_input_tokens_seen": 1843296, + "step": 8735 + }, + { + "epoch": 0.9614961496149615, + "grad_norm": 1.447284460067749, + "learning_rate": 2.4034653465346535e-05, + "loss": 0.2329, + "num_input_tokens_seen": 1844352, + "step": 8740 + }, + { + "epoch": 0.9620462046204621, + "grad_norm": 0.4075712263584137, + "learning_rate": 2.4048404840484047e-05, + "loss": 0.0991, + "num_input_tokens_seen": 1845408, + "step": 8745 + }, + { + "epoch": 0.9625962596259626, + "grad_norm": 0.6566622853279114, + "learning_rate": 2.4062156215621565e-05, + "loss": 0.1412, + "num_input_tokens_seen": 1846400, + "step": 8750 + }, + { + "epoch": 0.9631463146314632, + "grad_norm": 0.6195446252822876, + "learning_rate": 2.4075907590759077e-05, + "loss": 0.1464, + "num_input_tokens_seen": 1847456, + "step": 8755 + }, + { + "epoch": 0.9636963696369637, + "grad_norm": 0.5303398370742798, + "learning_rate": 2.4089658965896592e-05, + "loss": 0.094, + "num_input_tokens_seen": 1848416, + "step": 8760 + }, + { + "epoch": 0.9642464246424642, + "grad_norm": 0.5636575818061829, + "learning_rate": 2.4103410341034104e-05, + "loss": 0.2057, + "num_input_tokens_seen": 1849472, + "step": 8765 + }, + { + "epoch": 0.9647964796479648, + "grad_norm": 0.7047317028045654, + "learning_rate": 2.411716171617162e-05, + "loss": 0.1876, + "num_input_tokens_seen": 1850528, + "step": 8770 + }, + { + "epoch": 0.9653465346534653, + "grad_norm": 0.7921712398529053, + "learning_rate": 2.413091309130913e-05, + "loss": 0.2074, + "num_input_tokens_seen": 1851680, + "step": 8775 + }, + { + "epoch": 0.9658965896589659, + "grad_norm": 0.308829665184021, + "learning_rate": 2.4144664466446645e-05, + "loss": 0.1608, + "num_input_tokens_seen": 1852672, + "step": 8780 + }, + { + "epoch": 0.9664466446644664, + "grad_norm": 0.37446027994155884, + "learning_rate": 2.415841584158416e-05, + "loss": 0.1326, + "num_input_tokens_seen": 1853696, + "step": 8785 + }, + { + "epoch": 0.966996699669967, + "grad_norm": 0.5999844670295715, + "learning_rate": 2.4172167216721672e-05, + "loss": 0.1456, + "num_input_tokens_seen": 1854688, + "step": 8790 + }, + { + "epoch": 0.9675467546754676, + "grad_norm": 0.5107947587966919, + "learning_rate": 2.4185918591859187e-05, + "loss": 0.1612, + "num_input_tokens_seen": 1855712, + "step": 8795 + }, + { + "epoch": 0.9680968096809681, + "grad_norm": 0.5276122689247131, + "learning_rate": 2.4199669966996702e-05, + "loss": 0.1775, + "num_input_tokens_seen": 1856832, + "step": 8800 + }, + { + "epoch": 0.9686468646864687, + "grad_norm": 0.5288296937942505, + "learning_rate": 2.4213421342134214e-05, + "loss": 0.1306, + "num_input_tokens_seen": 1857920, + "step": 8805 + }, + { + "epoch": 0.9691969196919692, + "grad_norm": 1.065514087677002, + "learning_rate": 2.422717271727173e-05, + "loss": 0.1737, + "num_input_tokens_seen": 1858944, + "step": 8810 + }, + { + "epoch": 0.9697469746974697, + "grad_norm": 0.53021639585495, + "learning_rate": 2.4240924092409244e-05, + "loss": 0.3482, + "num_input_tokens_seen": 1860000, + "step": 8815 + }, + { + "epoch": 0.9702970297029703, + "grad_norm": 0.9864945411682129, + "learning_rate": 2.4254675467546756e-05, + "loss": 0.1414, + "num_input_tokens_seen": 1861056, + "step": 8820 + }, + { + "epoch": 0.9708470847084708, + "grad_norm": 0.32304683327674866, + "learning_rate": 2.426842684268427e-05, + "loss": 0.1083, + "num_input_tokens_seen": 1862080, + "step": 8825 + }, + { + "epoch": 0.9713971397139713, + "grad_norm": 0.4092005491256714, + "learning_rate": 2.4282178217821782e-05, + "loss": 0.1009, + "num_input_tokens_seen": 1863104, + "step": 8830 + }, + { + "epoch": 0.971947194719472, + "grad_norm": 0.43776658177375793, + "learning_rate": 2.4295929592959297e-05, + "loss": 0.1378, + "num_input_tokens_seen": 1864128, + "step": 8835 + }, + { + "epoch": 0.9724972497249725, + "grad_norm": 0.4822258949279785, + "learning_rate": 2.430968096809681e-05, + "loss": 0.1537, + "num_input_tokens_seen": 1865184, + "step": 8840 + }, + { + "epoch": 0.9730473047304731, + "grad_norm": 0.9296622276306152, + "learning_rate": 2.4323432343234327e-05, + "loss": 0.1528, + "num_input_tokens_seen": 1866240, + "step": 8845 + }, + { + "epoch": 0.9735973597359736, + "grad_norm": 0.47591835260391235, + "learning_rate": 2.433718371837184e-05, + "loss": 0.1423, + "num_input_tokens_seen": 1867360, + "step": 8850 + }, + { + "epoch": 0.9741474147414741, + "grad_norm": 0.5007222890853882, + "learning_rate": 2.4350935093509354e-05, + "loss": 0.1352, + "num_input_tokens_seen": 1868480, + "step": 8855 + }, + { + "epoch": 0.9746974697469747, + "grad_norm": 0.44893333315849304, + "learning_rate": 2.4364686468646866e-05, + "loss": 0.0936, + "num_input_tokens_seen": 1869440, + "step": 8860 + }, + { + "epoch": 0.9752475247524752, + "grad_norm": 0.8137829899787903, + "learning_rate": 2.437843784378438e-05, + "loss": 0.1572, + "num_input_tokens_seen": 1870496, + "step": 8865 + }, + { + "epoch": 0.9757975797579758, + "grad_norm": 0.8400242328643799, + "learning_rate": 2.4392189218921893e-05, + "loss": 0.1523, + "num_input_tokens_seen": 1871616, + "step": 8870 + }, + { + "epoch": 0.9763476347634763, + "grad_norm": 0.4155260920524597, + "learning_rate": 2.4405940594059408e-05, + "loss": 0.1487, + "num_input_tokens_seen": 1872672, + "step": 8875 + }, + { + "epoch": 0.976897689768977, + "grad_norm": 0.9759862422943115, + "learning_rate": 2.441969196919692e-05, + "loss": 0.1023, + "num_input_tokens_seen": 1873760, + "step": 8880 + }, + { + "epoch": 0.9774477447744775, + "grad_norm": 0.9476038217544556, + "learning_rate": 2.4433443344334434e-05, + "loss": 0.1467, + "num_input_tokens_seen": 1874784, + "step": 8885 + }, + { + "epoch": 0.977997799779978, + "grad_norm": 1.0241371393203735, + "learning_rate": 2.444719471947195e-05, + "loss": 0.1531, + "num_input_tokens_seen": 1875808, + "step": 8890 + }, + { + "epoch": 0.9785478547854786, + "grad_norm": 0.5704377889633179, + "learning_rate": 2.4460946094609464e-05, + "loss": 0.2164, + "num_input_tokens_seen": 1876832, + "step": 8895 + }, + { + "epoch": 0.9790979097909791, + "grad_norm": 0.533187747001648, + "learning_rate": 2.4474697469746976e-05, + "loss": 0.1236, + "num_input_tokens_seen": 1877792, + "step": 8900 + }, + { + "epoch": 0.9796479647964796, + "grad_norm": 0.5354129672050476, + "learning_rate": 2.448844884488449e-05, + "loss": 0.1824, + "num_input_tokens_seen": 1878880, + "step": 8905 + }, + { + "epoch": 0.9801980198019802, + "grad_norm": 0.3984960615634918, + "learning_rate": 2.4502200220022003e-05, + "loss": 0.116, + "num_input_tokens_seen": 1879936, + "step": 8910 + }, + { + "epoch": 0.9807480748074807, + "grad_norm": 0.7035013437271118, + "learning_rate": 2.4515951595159518e-05, + "loss": 0.1804, + "num_input_tokens_seen": 1880992, + "step": 8915 + }, + { + "epoch": 0.9812981298129813, + "grad_norm": 1.0270813703536987, + "learning_rate": 2.452970297029703e-05, + "loss": 0.1723, + "num_input_tokens_seen": 1882048, + "step": 8920 + }, + { + "epoch": 0.9818481848184818, + "grad_norm": 0.6999450325965881, + "learning_rate": 2.4543454345434544e-05, + "loss": 0.1472, + "num_input_tokens_seen": 1883104, + "step": 8925 + }, + { + "epoch": 0.9823982398239824, + "grad_norm": 1.0483410358428955, + "learning_rate": 2.4557205720572056e-05, + "loss": 0.143, + "num_input_tokens_seen": 1884224, + "step": 8930 + }, + { + "epoch": 0.982948294829483, + "grad_norm": 0.1658347249031067, + "learning_rate": 2.457095709570957e-05, + "loss": 0.1166, + "num_input_tokens_seen": 1885248, + "step": 8935 + }, + { + "epoch": 0.9834983498349835, + "grad_norm": 0.9092065691947937, + "learning_rate": 2.4584708470847086e-05, + "loss": 0.1548, + "num_input_tokens_seen": 1886304, + "step": 8940 + }, + { + "epoch": 0.9840484048404841, + "grad_norm": 0.33969199657440186, + "learning_rate": 2.45984598459846e-05, + "loss": 0.142, + "num_input_tokens_seen": 1887360, + "step": 8945 + }, + { + "epoch": 0.9845984598459846, + "grad_norm": 0.2873116731643677, + "learning_rate": 2.4612211221122113e-05, + "loss": 0.1659, + "num_input_tokens_seen": 1888416, + "step": 8950 + }, + { + "epoch": 0.9851485148514851, + "grad_norm": 0.6725215911865234, + "learning_rate": 2.4625962596259628e-05, + "loss": 0.164, + "num_input_tokens_seen": 1889472, + "step": 8955 + }, + { + "epoch": 0.9856985698569857, + "grad_norm": 0.4294087886810303, + "learning_rate": 2.463971397139714e-05, + "loss": 0.085, + "num_input_tokens_seen": 1890592, + "step": 8960 + }, + { + "epoch": 0.9862486248624862, + "grad_norm": 0.7152068614959717, + "learning_rate": 2.4653465346534655e-05, + "loss": 0.1501, + "num_input_tokens_seen": 1891648, + "step": 8965 + }, + { + "epoch": 0.9867986798679867, + "grad_norm": 0.9997127652168274, + "learning_rate": 2.4667216721672166e-05, + "loss": 0.2722, + "num_input_tokens_seen": 1892672, + "step": 8970 + }, + { + "epoch": 0.9873487348734874, + "grad_norm": 0.24674075841903687, + "learning_rate": 2.468096809680968e-05, + "loss": 0.2288, + "num_input_tokens_seen": 1893760, + "step": 8975 + }, + { + "epoch": 0.9878987898789879, + "grad_norm": 0.3701174259185791, + "learning_rate": 2.4694719471947196e-05, + "loss": 0.1207, + "num_input_tokens_seen": 1894816, + "step": 8980 + }, + { + "epoch": 0.9884488448844885, + "grad_norm": 0.1656997799873352, + "learning_rate": 2.470847084708471e-05, + "loss": 0.099, + "num_input_tokens_seen": 1895904, + "step": 8985 + }, + { + "epoch": 0.988998899889989, + "grad_norm": 0.4785105586051941, + "learning_rate": 2.4722222222222223e-05, + "loss": 0.1504, + "num_input_tokens_seen": 1896992, + "step": 8990 + }, + { + "epoch": 0.9895489548954896, + "grad_norm": 1.6047697067260742, + "learning_rate": 2.4735973597359738e-05, + "loss": 0.1341, + "num_input_tokens_seen": 1898016, + "step": 8995 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.33192169666290283, + "learning_rate": 2.474972497249725e-05, + "loss": 0.1564, + "num_input_tokens_seen": 1899104, + "step": 9000 + }, + { + "epoch": 0.9906490649064906, + "grad_norm": 0.9256336092948914, + "learning_rate": 2.4763476347634765e-05, + "loss": 0.2735, + "num_input_tokens_seen": 1900128, + "step": 9005 + }, + { + "epoch": 0.9911991199119912, + "grad_norm": 0.527118980884552, + "learning_rate": 2.477722772277228e-05, + "loss": 0.1269, + "num_input_tokens_seen": 1901184, + "step": 9010 + }, + { + "epoch": 0.9917491749174917, + "grad_norm": 1.2708821296691895, + "learning_rate": 2.479097909790979e-05, + "loss": 0.1358, + "num_input_tokens_seen": 1902272, + "step": 9015 + }, + { + "epoch": 0.9922992299229924, + "grad_norm": 1.2616714239120483, + "learning_rate": 2.4804730473047307e-05, + "loss": 0.1744, + "num_input_tokens_seen": 1903264, + "step": 9020 + }, + { + "epoch": 0.9928492849284929, + "grad_norm": 2.704986572265625, + "learning_rate": 2.4818481848184818e-05, + "loss": 0.2435, + "num_input_tokens_seen": 1904288, + "step": 9025 + }, + { + "epoch": 0.9933993399339934, + "grad_norm": 1.0919245481491089, + "learning_rate": 2.4832233223322333e-05, + "loss": 0.1316, + "num_input_tokens_seen": 1905408, + "step": 9030 + }, + { + "epoch": 0.993949394939494, + "grad_norm": 0.7493544816970825, + "learning_rate": 2.4845984598459848e-05, + "loss": 0.1161, + "num_input_tokens_seen": 1906464, + "step": 9035 + }, + { + "epoch": 0.9944994499449945, + "grad_norm": 0.7543742060661316, + "learning_rate": 2.4859735973597363e-05, + "loss": 0.1216, + "num_input_tokens_seen": 1907552, + "step": 9040 + }, + { + "epoch": 0.995049504950495, + "grad_norm": 0.6001253128051758, + "learning_rate": 2.4873487348734875e-05, + "loss": 0.1767, + "num_input_tokens_seen": 1908576, + "step": 9045 + }, + { + "epoch": 0.9955995599559956, + "grad_norm": 0.3197242319583893, + "learning_rate": 2.488723872387239e-05, + "loss": 0.114, + "num_input_tokens_seen": 1909664, + "step": 9050 + }, + { + "epoch": 0.9961496149614961, + "grad_norm": 0.21243731677532196, + "learning_rate": 2.49009900990099e-05, + "loss": 0.1277, + "num_input_tokens_seen": 1910720, + "step": 9055 + }, + { + "epoch": 0.9966996699669967, + "grad_norm": 0.42599576711654663, + "learning_rate": 2.4914741474147417e-05, + "loss": 0.1353, + "num_input_tokens_seen": 1911840, + "step": 9060 + }, + { + "epoch": 0.9972497249724972, + "grad_norm": 0.7365682125091553, + "learning_rate": 2.492849284928493e-05, + "loss": 0.1905, + "num_input_tokens_seen": 1912896, + "step": 9065 + }, + { + "epoch": 0.9977997799779978, + "grad_norm": 0.3827113211154938, + "learning_rate": 2.4942244224422443e-05, + "loss": 0.1231, + "num_input_tokens_seen": 1913952, + "step": 9070 + }, + { + "epoch": 0.9983498349834984, + "grad_norm": 0.6465194225311279, + "learning_rate": 2.4955995599559955e-05, + "loss": 0.202, + "num_input_tokens_seen": 1915008, + "step": 9075 + }, + { + "epoch": 0.9988998899889989, + "grad_norm": 0.5631294250488281, + "learning_rate": 2.4969746974697474e-05, + "loss": 0.1503, + "num_input_tokens_seen": 1916064, + "step": 9080 + }, + { + "epoch": 0.9994499449944995, + "grad_norm": 0.571312665939331, + "learning_rate": 2.4983498349834985e-05, + "loss": 0.1237, + "num_input_tokens_seen": 1917056, + "step": 9085 + }, + { + "epoch": 1.0, + "grad_norm": 0.19271007180213928, + "learning_rate": 2.49972497249725e-05, + "loss": 0.1899, + "num_input_tokens_seen": 1917952, + "step": 9090 + }, + { + "epoch": 1.0, + "eval_loss": 0.1525431126356125, + "eval_runtime": 36.9943, + "eval_samples_per_second": 109.206, + "eval_steps_per_second": 27.301, + "num_input_tokens_seen": 1917952, + "step": 9090 + }, + { + "epoch": 1.0005500550055006, + "grad_norm": 0.9261417984962463, + "learning_rate": 2.501100110011001e-05, + "loss": 0.1441, + "num_input_tokens_seen": 1919008, + "step": 9095 + }, + { + "epoch": 1.001100110011001, + "grad_norm": 0.8489517569541931, + "learning_rate": 2.5024752475247527e-05, + "loss": 0.2141, + "num_input_tokens_seen": 1920032, + "step": 9100 + }, + { + "epoch": 1.0016501650165017, + "grad_norm": 0.5692808032035828, + "learning_rate": 2.5038503850385042e-05, + "loss": 0.1107, + "num_input_tokens_seen": 1921056, + "step": 9105 + }, + { + "epoch": 1.0022002200220022, + "grad_norm": 1.2763087749481201, + "learning_rate": 2.5052255225522554e-05, + "loss": 0.1215, + "num_input_tokens_seen": 1922112, + "step": 9110 + }, + { + "epoch": 1.0027502750275028, + "grad_norm": 0.7960019707679749, + "learning_rate": 2.506600660066007e-05, + "loss": 0.1231, + "num_input_tokens_seen": 1923136, + "step": 9115 + }, + { + "epoch": 1.0033003300330032, + "grad_norm": 0.8640592694282532, + "learning_rate": 2.507975797579758e-05, + "loss": 0.1442, + "num_input_tokens_seen": 1924160, + "step": 9120 + }, + { + "epoch": 1.0038503850385039, + "grad_norm": 0.4787689447402954, + "learning_rate": 2.5093509350935095e-05, + "loss": 0.1169, + "num_input_tokens_seen": 1925248, + "step": 9125 + }, + { + "epoch": 1.0044004400440043, + "grad_norm": 1.1443976163864136, + "learning_rate": 2.5107260726072607e-05, + "loss": 0.1193, + "num_input_tokens_seen": 1926240, + "step": 9130 + }, + { + "epoch": 1.004950495049505, + "grad_norm": 0.35006242990493774, + "learning_rate": 2.5121012101210122e-05, + "loss": 0.1904, + "num_input_tokens_seen": 1927296, + "step": 9135 + }, + { + "epoch": 1.0055005500550056, + "grad_norm": 0.28560903668403625, + "learning_rate": 2.5134763476347634e-05, + "loss": 0.1372, + "num_input_tokens_seen": 1928320, + "step": 9140 + }, + { + "epoch": 1.006050605060506, + "grad_norm": 1.4554386138916016, + "learning_rate": 2.514851485148515e-05, + "loss": 0.1699, + "num_input_tokens_seen": 1929408, + "step": 9145 + }, + { + "epoch": 1.0066006600660067, + "grad_norm": 0.9323750734329224, + "learning_rate": 2.5162266226622667e-05, + "loss": 0.1728, + "num_input_tokens_seen": 1930496, + "step": 9150 + }, + { + "epoch": 1.007150715071507, + "grad_norm": 1.0502411127090454, + "learning_rate": 2.5176017601760175e-05, + "loss": 0.1215, + "num_input_tokens_seen": 1931584, + "step": 9155 + }, + { + "epoch": 1.0077007700770078, + "grad_norm": 0.729180097579956, + "learning_rate": 2.5189768976897694e-05, + "loss": 0.1534, + "num_input_tokens_seen": 1932576, + "step": 9160 + }, + { + "epoch": 1.0082508250825082, + "grad_norm": 0.9782981276512146, + "learning_rate": 2.5203520352035202e-05, + "loss": 0.1473, + "num_input_tokens_seen": 1933600, + "step": 9165 + }, + { + "epoch": 1.0088008800880088, + "grad_norm": 0.4412749707698822, + "learning_rate": 2.521727172717272e-05, + "loss": 0.1146, + "num_input_tokens_seen": 1934592, + "step": 9170 + }, + { + "epoch": 1.0093509350935093, + "grad_norm": 0.4044129550457001, + "learning_rate": 2.5231023102310232e-05, + "loss": 0.1122, + "num_input_tokens_seen": 1935584, + "step": 9175 + }, + { + "epoch": 1.00990099009901, + "grad_norm": 0.8111941814422607, + "learning_rate": 2.5244774477447747e-05, + "loss": 0.142, + "num_input_tokens_seen": 1936640, + "step": 9180 + }, + { + "epoch": 1.0104510451045106, + "grad_norm": 0.6130894422531128, + "learning_rate": 2.525852585258526e-05, + "loss": 0.0961, + "num_input_tokens_seen": 1937664, + "step": 9185 + }, + { + "epoch": 1.011001100110011, + "grad_norm": 2.4453816413879395, + "learning_rate": 2.5272277227722774e-05, + "loss": 0.1724, + "num_input_tokens_seen": 1938720, + "step": 9190 + }, + { + "epoch": 1.0115511551155116, + "grad_norm": 0.4644315838813782, + "learning_rate": 2.528602860286029e-05, + "loss": 0.0956, + "num_input_tokens_seen": 1939744, + "step": 9195 + }, + { + "epoch": 1.012101210121012, + "grad_norm": 0.6218547224998474, + "learning_rate": 2.52997799779978e-05, + "loss": 0.1617, + "num_input_tokens_seen": 1940864, + "step": 9200 + }, + { + "epoch": 1.0126512651265127, + "grad_norm": 0.5348107814788818, + "learning_rate": 2.5313531353135316e-05, + "loss": 0.151, + "num_input_tokens_seen": 1941888, + "step": 9205 + }, + { + "epoch": 1.0132013201320131, + "grad_norm": 0.7412659525871277, + "learning_rate": 2.5327282728272827e-05, + "loss": 0.1217, + "num_input_tokens_seen": 1942912, + "step": 9210 + }, + { + "epoch": 1.0137513751375138, + "grad_norm": 0.599461019039154, + "learning_rate": 2.5341034103410342e-05, + "loss": 0.1718, + "num_input_tokens_seen": 1943968, + "step": 9215 + }, + { + "epoch": 1.0143014301430142, + "grad_norm": 1.0507436990737915, + "learning_rate": 2.5354785478547854e-05, + "loss": 0.1622, + "num_input_tokens_seen": 1944992, + "step": 9220 + }, + { + "epoch": 1.0148514851485149, + "grad_norm": 0.4353038966655731, + "learning_rate": 2.536853685368537e-05, + "loss": 0.0898, + "num_input_tokens_seen": 1946048, + "step": 9225 + }, + { + "epoch": 1.0154015401540153, + "grad_norm": 1.5966063737869263, + "learning_rate": 2.538228822882288e-05, + "loss": 0.1678, + "num_input_tokens_seen": 1947008, + "step": 9230 + }, + { + "epoch": 1.015951595159516, + "grad_norm": 0.7966309785842896, + "learning_rate": 2.53960396039604e-05, + "loss": 0.109, + "num_input_tokens_seen": 1948032, + "step": 9235 + }, + { + "epoch": 1.0165016501650166, + "grad_norm": 0.3263801336288452, + "learning_rate": 2.5409790979097907e-05, + "loss": 0.1356, + "num_input_tokens_seen": 1949056, + "step": 9240 + }, + { + "epoch": 1.017051705170517, + "grad_norm": 0.5857527852058411, + "learning_rate": 2.5423542354235426e-05, + "loss": 0.1901, + "num_input_tokens_seen": 1950112, + "step": 9245 + }, + { + "epoch": 1.0176017601760177, + "grad_norm": 1.2270137071609497, + "learning_rate": 2.543729372937294e-05, + "loss": 0.2654, + "num_input_tokens_seen": 1951200, + "step": 9250 + }, + { + "epoch": 1.018151815181518, + "grad_norm": 0.6062153577804565, + "learning_rate": 2.5451045104510453e-05, + "loss": 0.1453, + "num_input_tokens_seen": 1952224, + "step": 9255 + }, + { + "epoch": 1.0187018701870187, + "grad_norm": 0.5455009341239929, + "learning_rate": 2.5464796479647968e-05, + "loss": 0.1189, + "num_input_tokens_seen": 1953312, + "step": 9260 + }, + { + "epoch": 1.0192519251925192, + "grad_norm": 0.3479532301425934, + "learning_rate": 2.547854785478548e-05, + "loss": 0.1221, + "num_input_tokens_seen": 1954336, + "step": 9265 + }, + { + "epoch": 1.0198019801980198, + "grad_norm": 1.1570942401885986, + "learning_rate": 2.5492299229922994e-05, + "loss": 0.1475, + "num_input_tokens_seen": 1955392, + "step": 9270 + }, + { + "epoch": 1.0203520352035202, + "grad_norm": 0.6146875023841858, + "learning_rate": 2.5506050605060506e-05, + "loss": 0.1648, + "num_input_tokens_seen": 1956416, + "step": 9275 + }, + { + "epoch": 1.020902090209021, + "grad_norm": 0.39962950348854065, + "learning_rate": 2.551980198019802e-05, + "loss": 0.1167, + "num_input_tokens_seen": 1957408, + "step": 9280 + }, + { + "epoch": 1.0214521452145215, + "grad_norm": 1.3805103302001953, + "learning_rate": 2.5533553355335533e-05, + "loss": 0.1434, + "num_input_tokens_seen": 1958464, + "step": 9285 + }, + { + "epoch": 1.022002200220022, + "grad_norm": 0.6757040023803711, + "learning_rate": 2.5547304730473048e-05, + "loss": 0.143, + "num_input_tokens_seen": 1959552, + "step": 9290 + }, + { + "epoch": 1.0225522552255226, + "grad_norm": 0.7387606501579285, + "learning_rate": 2.5561056105610566e-05, + "loss": 0.1212, + "num_input_tokens_seen": 1960544, + "step": 9295 + }, + { + "epoch": 1.023102310231023, + "grad_norm": 1.3662981986999512, + "learning_rate": 2.5574807480748074e-05, + "loss": 0.1466, + "num_input_tokens_seen": 1961600, + "step": 9300 + }, + { + "epoch": 1.0236523652365237, + "grad_norm": 0.8537704348564148, + "learning_rate": 2.5588558855885593e-05, + "loss": 0.1816, + "num_input_tokens_seen": 1962688, + "step": 9305 + }, + { + "epoch": 1.0242024202420241, + "grad_norm": 0.6373304724693298, + "learning_rate": 2.56023102310231e-05, + "loss": 0.18, + "num_input_tokens_seen": 1963744, + "step": 9310 + }, + { + "epoch": 1.0247524752475248, + "grad_norm": 0.5700729489326477, + "learning_rate": 2.561606160616062e-05, + "loss": 0.1223, + "num_input_tokens_seen": 1964832, + "step": 9315 + }, + { + "epoch": 1.0253025302530252, + "grad_norm": 0.7780032753944397, + "learning_rate": 2.5629812981298128e-05, + "loss": 0.1446, + "num_input_tokens_seen": 1965920, + "step": 9320 + }, + { + "epoch": 1.0258525852585259, + "grad_norm": 1.0806196928024292, + "learning_rate": 2.5643564356435646e-05, + "loss": 0.1651, + "num_input_tokens_seen": 1966912, + "step": 9325 + }, + { + "epoch": 1.0264026402640265, + "grad_norm": 0.4705083668231964, + "learning_rate": 2.5657315731573155e-05, + "loss": 0.1039, + "num_input_tokens_seen": 1968096, + "step": 9330 + }, + { + "epoch": 1.026952695269527, + "grad_norm": 0.7240033149719238, + "learning_rate": 2.5671067106710673e-05, + "loss": 0.1932, + "num_input_tokens_seen": 1969216, + "step": 9335 + }, + { + "epoch": 1.0275027502750276, + "grad_norm": 0.5614789724349976, + "learning_rate": 2.5684818481848188e-05, + "loss": 0.102, + "num_input_tokens_seen": 1970304, + "step": 9340 + }, + { + "epoch": 1.028052805280528, + "grad_norm": 0.45818766951560974, + "learning_rate": 2.56985698569857e-05, + "loss": 0.1541, + "num_input_tokens_seen": 1971360, + "step": 9345 + }, + { + "epoch": 1.0286028602860287, + "grad_norm": 0.43587377667427063, + "learning_rate": 2.5712321232123215e-05, + "loss": 0.1555, + "num_input_tokens_seen": 1972416, + "step": 9350 + }, + { + "epoch": 1.029152915291529, + "grad_norm": 0.7762618660926819, + "learning_rate": 2.5726072607260726e-05, + "loss": 0.0957, + "num_input_tokens_seen": 1973440, + "step": 9355 + }, + { + "epoch": 1.0297029702970297, + "grad_norm": 0.6102132797241211, + "learning_rate": 2.573982398239824e-05, + "loss": 0.1881, + "num_input_tokens_seen": 1974432, + "step": 9360 + }, + { + "epoch": 1.0302530253025302, + "grad_norm": 0.7496085166931152, + "learning_rate": 2.5753575357535753e-05, + "loss": 0.2523, + "num_input_tokens_seen": 1975488, + "step": 9365 + }, + { + "epoch": 1.0308030803080308, + "grad_norm": 0.34030139446258545, + "learning_rate": 2.5767326732673268e-05, + "loss": 0.16, + "num_input_tokens_seen": 1976576, + "step": 9370 + }, + { + "epoch": 1.0313531353135315, + "grad_norm": 0.53285813331604, + "learning_rate": 2.578107810781078e-05, + "loss": 0.0999, + "num_input_tokens_seen": 1977632, + "step": 9375 + }, + { + "epoch": 1.0319031903190319, + "grad_norm": 1.4389506578445435, + "learning_rate": 2.5794829482948295e-05, + "loss": 0.1711, + "num_input_tokens_seen": 1978720, + "step": 9380 + }, + { + "epoch": 1.0324532453245325, + "grad_norm": 0.7767769694328308, + "learning_rate": 2.5808580858085813e-05, + "loss": 0.2065, + "num_input_tokens_seen": 1979744, + "step": 9385 + }, + { + "epoch": 1.033003300330033, + "grad_norm": 0.23202522099018097, + "learning_rate": 2.582233223322332e-05, + "loss": 0.173, + "num_input_tokens_seen": 1980832, + "step": 9390 + }, + { + "epoch": 1.0335533553355336, + "grad_norm": 0.4437086880207062, + "learning_rate": 2.583608360836084e-05, + "loss": 0.1873, + "num_input_tokens_seen": 1981888, + "step": 9395 + }, + { + "epoch": 1.034103410341034, + "grad_norm": 0.6857516169548035, + "learning_rate": 2.584983498349835e-05, + "loss": 0.1223, + "num_input_tokens_seen": 1982944, + "step": 9400 + }, + { + "epoch": 1.0346534653465347, + "grad_norm": 1.12942373752594, + "learning_rate": 2.5863586358635867e-05, + "loss": 0.2223, + "num_input_tokens_seen": 1984000, + "step": 9405 + }, + { + "epoch": 1.0352035203520351, + "grad_norm": 0.44396746158599854, + "learning_rate": 2.587733773377338e-05, + "loss": 0.1244, + "num_input_tokens_seen": 1985024, + "step": 9410 + }, + { + "epoch": 1.0357535753575358, + "grad_norm": 1.7371193170547485, + "learning_rate": 2.5891089108910893e-05, + "loss": 0.187, + "num_input_tokens_seen": 1986080, + "step": 9415 + }, + { + "epoch": 1.0363036303630364, + "grad_norm": 0.5584322810173035, + "learning_rate": 2.5904840484048405e-05, + "loss": 0.0862, + "num_input_tokens_seen": 1987104, + "step": 9420 + }, + { + "epoch": 1.0368536853685368, + "grad_norm": 0.9286614060401917, + "learning_rate": 2.591859185918592e-05, + "loss": 0.1237, + "num_input_tokens_seen": 1988192, + "step": 9425 + }, + { + "epoch": 1.0374037403740375, + "grad_norm": 0.16089262068271637, + "learning_rate": 2.5932343234323435e-05, + "loss": 0.0969, + "num_input_tokens_seen": 1989216, + "step": 9430 + }, + { + "epoch": 1.037953795379538, + "grad_norm": 0.7382605075836182, + "learning_rate": 2.5946094609460947e-05, + "loss": 0.1341, + "num_input_tokens_seen": 1990272, + "step": 9435 + }, + { + "epoch": 1.0385038503850386, + "grad_norm": 1.2044520378112793, + "learning_rate": 2.5959845984598462e-05, + "loss": 0.2467, + "num_input_tokens_seen": 1991264, + "step": 9440 + }, + { + "epoch": 1.039053905390539, + "grad_norm": 0.15086670219898224, + "learning_rate": 2.5973597359735973e-05, + "loss": 0.1097, + "num_input_tokens_seen": 1992288, + "step": 9445 + }, + { + "epoch": 1.0396039603960396, + "grad_norm": 0.7648624777793884, + "learning_rate": 2.598734873487349e-05, + "loss": 0.1311, + "num_input_tokens_seen": 1993344, + "step": 9450 + }, + { + "epoch": 1.04015401540154, + "grad_norm": 0.5882207751274109, + "learning_rate": 2.6001100110011e-05, + "loss": 0.1569, + "num_input_tokens_seen": 1994464, + "step": 9455 + }, + { + "epoch": 1.0407040704070407, + "grad_norm": 0.32074031233787537, + "learning_rate": 2.601485148514852e-05, + "loss": 0.1076, + "num_input_tokens_seen": 1995584, + "step": 9460 + }, + { + "epoch": 1.0412541254125411, + "grad_norm": 0.6556397676467896, + "learning_rate": 2.6028602860286027e-05, + "loss": 0.1158, + "num_input_tokens_seen": 1996640, + "step": 9465 + }, + { + "epoch": 1.0418041804180418, + "grad_norm": 0.7177724838256836, + "learning_rate": 2.6042354235423545e-05, + "loss": 0.1151, + "num_input_tokens_seen": 1997696, + "step": 9470 + }, + { + "epoch": 1.0423542354235424, + "grad_norm": 1.2492862939834595, + "learning_rate": 2.6056105610561054e-05, + "loss": 0.1794, + "num_input_tokens_seen": 1998720, + "step": 9475 + }, + { + "epoch": 1.0429042904290429, + "grad_norm": 0.3916110098361969, + "learning_rate": 2.6069856985698572e-05, + "loss": 0.1214, + "num_input_tokens_seen": 1999840, + "step": 9480 + }, + { + "epoch": 1.0434543454345435, + "grad_norm": 0.21417716145515442, + "learning_rate": 2.6083608360836087e-05, + "loss": 0.1654, + "num_input_tokens_seen": 2000832, + "step": 9485 + }, + { + "epoch": 1.044004400440044, + "grad_norm": 0.9039224982261658, + "learning_rate": 2.60973597359736e-05, + "loss": 0.14, + "num_input_tokens_seen": 2001952, + "step": 9490 + }, + { + "epoch": 1.0445544554455446, + "grad_norm": 0.5129244923591614, + "learning_rate": 2.6111111111111114e-05, + "loss": 0.1604, + "num_input_tokens_seen": 2002976, + "step": 9495 + }, + { + "epoch": 1.045104510451045, + "grad_norm": 0.7085320949554443, + "learning_rate": 2.6124862486248625e-05, + "loss": 0.155, + "num_input_tokens_seen": 2004000, + "step": 9500 + }, + { + "epoch": 1.0456545654565457, + "grad_norm": 0.8452202677726746, + "learning_rate": 2.613861386138614e-05, + "loss": 0.1591, + "num_input_tokens_seen": 2005088, + "step": 9505 + }, + { + "epoch": 1.046204620462046, + "grad_norm": 1.0197169780731201, + "learning_rate": 2.6152365236523652e-05, + "loss": 0.1562, + "num_input_tokens_seen": 2006144, + "step": 9510 + }, + { + "epoch": 1.0467546754675467, + "grad_norm": 0.8019288778305054, + "learning_rate": 2.6166116611661167e-05, + "loss": 0.1213, + "num_input_tokens_seen": 2007200, + "step": 9515 + }, + { + "epoch": 1.0473047304730474, + "grad_norm": 0.6361414790153503, + "learning_rate": 2.617986798679868e-05, + "loss": 0.1071, + "num_input_tokens_seen": 2008288, + "step": 9520 + }, + { + "epoch": 1.0478547854785478, + "grad_norm": 0.47884777188301086, + "learning_rate": 2.6193619361936194e-05, + "loss": 0.0919, + "num_input_tokens_seen": 2009344, + "step": 9525 + }, + { + "epoch": 1.0484048404840485, + "grad_norm": 0.3440686762332916, + "learning_rate": 2.6207370737073712e-05, + "loss": 0.2554, + "num_input_tokens_seen": 2010432, + "step": 9530 + }, + { + "epoch": 1.048954895489549, + "grad_norm": 0.5275391340255737, + "learning_rate": 2.622112211221122e-05, + "loss": 0.13, + "num_input_tokens_seen": 2011392, + "step": 9535 + }, + { + "epoch": 1.0495049504950495, + "grad_norm": 0.6411548852920532, + "learning_rate": 2.623487348734874e-05, + "loss": 0.1608, + "num_input_tokens_seen": 2012416, + "step": 9540 + }, + { + "epoch": 1.05005500550055, + "grad_norm": 0.9121536612510681, + "learning_rate": 2.6248624862486247e-05, + "loss": 0.1673, + "num_input_tokens_seen": 2013440, + "step": 9545 + }, + { + "epoch": 1.0506050605060506, + "grad_norm": 0.8033546805381775, + "learning_rate": 2.6262376237623766e-05, + "loss": 0.0955, + "num_input_tokens_seen": 2014464, + "step": 9550 + }, + { + "epoch": 1.051155115511551, + "grad_norm": 0.5939643383026123, + "learning_rate": 2.6276127612761274e-05, + "loss": 0.0642, + "num_input_tokens_seen": 2015584, + "step": 9555 + }, + { + "epoch": 1.0517051705170517, + "grad_norm": 1.5446159839630127, + "learning_rate": 2.6289878987898792e-05, + "loss": 0.1649, + "num_input_tokens_seen": 2016576, + "step": 9560 + }, + { + "epoch": 1.0522552255225524, + "grad_norm": 1.2257355451583862, + "learning_rate": 2.6303630363036304e-05, + "loss": 0.0999, + "num_input_tokens_seen": 2017568, + "step": 9565 + }, + { + "epoch": 1.0528052805280528, + "grad_norm": 0.7985963821411133, + "learning_rate": 2.631738173817382e-05, + "loss": 0.0807, + "num_input_tokens_seen": 2018656, + "step": 9570 + }, + { + "epoch": 1.0533553355335534, + "grad_norm": 0.9017400741577148, + "learning_rate": 2.6331133113311334e-05, + "loss": 0.2081, + "num_input_tokens_seen": 2019680, + "step": 9575 + }, + { + "epoch": 1.0539053905390539, + "grad_norm": 0.6743223667144775, + "learning_rate": 2.6344884488448846e-05, + "loss": 0.115, + "num_input_tokens_seen": 2020736, + "step": 9580 + }, + { + "epoch": 1.0544554455445545, + "grad_norm": 0.6031255125999451, + "learning_rate": 2.635863586358636e-05, + "loss": 0.1648, + "num_input_tokens_seen": 2021792, + "step": 9585 + }, + { + "epoch": 1.055005500550055, + "grad_norm": 0.2875900864601135, + "learning_rate": 2.6372387238723872e-05, + "loss": 0.0961, + "num_input_tokens_seen": 2022880, + "step": 9590 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 2.9943995475769043, + "learning_rate": 2.6386138613861387e-05, + "loss": 0.2458, + "num_input_tokens_seen": 2023904, + "step": 9595 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 1.956761360168457, + "learning_rate": 2.63998899889989e-05, + "loss": 0.2176, + "num_input_tokens_seen": 2024992, + "step": 9600 + }, + { + "epoch": 1.0566556655665567, + "grad_norm": 1.517600178718567, + "learning_rate": 2.6413641364136414e-05, + "loss": 0.2774, + "num_input_tokens_seen": 2026048, + "step": 9605 + }, + { + "epoch": 1.0572057205720573, + "grad_norm": 0.7665375471115112, + "learning_rate": 2.6427392739273926e-05, + "loss": 0.2001, + "num_input_tokens_seen": 2027040, + "step": 9610 + }, + { + "epoch": 1.0577557755775577, + "grad_norm": 0.3939901888370514, + "learning_rate": 2.644114411441144e-05, + "loss": 0.1194, + "num_input_tokens_seen": 2028128, + "step": 9615 + }, + { + "epoch": 1.0583058305830584, + "grad_norm": 0.8132117986679077, + "learning_rate": 2.645489548954896e-05, + "loss": 0.1606, + "num_input_tokens_seen": 2029184, + "step": 9620 + }, + { + "epoch": 1.0588558855885588, + "grad_norm": 0.9248853921890259, + "learning_rate": 2.646864686468647e-05, + "loss": 0.1518, + "num_input_tokens_seen": 2030304, + "step": 9625 + }, + { + "epoch": 1.0594059405940595, + "grad_norm": 1.2313616275787354, + "learning_rate": 2.6482398239823986e-05, + "loss": 0.2569, + "num_input_tokens_seen": 2031424, + "step": 9630 + }, + { + "epoch": 1.0599559955995599, + "grad_norm": 0.7267094254493713, + "learning_rate": 2.6496149614961498e-05, + "loss": 0.0634, + "num_input_tokens_seen": 2032384, + "step": 9635 + }, + { + "epoch": 1.0605060506050605, + "grad_norm": 0.6296340823173523, + "learning_rate": 2.6509900990099013e-05, + "loss": 0.1438, + "num_input_tokens_seen": 2033472, + "step": 9640 + }, + { + "epoch": 1.061056105610561, + "grad_norm": 0.3453880548477173, + "learning_rate": 2.6523652365236524e-05, + "loss": 0.1682, + "num_input_tokens_seen": 2034496, + "step": 9645 + }, + { + "epoch": 1.0616061606160616, + "grad_norm": 1.4589784145355225, + "learning_rate": 2.653740374037404e-05, + "loss": 0.1692, + "num_input_tokens_seen": 2035552, + "step": 9650 + }, + { + "epoch": 1.0621562156215623, + "grad_norm": 0.0979529470205307, + "learning_rate": 2.655115511551155e-05, + "loss": 0.1286, + "num_input_tokens_seen": 2036576, + "step": 9655 + }, + { + "epoch": 1.0627062706270627, + "grad_norm": 1.0741206407546997, + "learning_rate": 2.6564906490649066e-05, + "loss": 0.1918, + "num_input_tokens_seen": 2037664, + "step": 9660 + }, + { + "epoch": 1.0632563256325633, + "grad_norm": 0.4295612871646881, + "learning_rate": 2.657865786578658e-05, + "loss": 0.1683, + "num_input_tokens_seen": 2038656, + "step": 9665 + }, + { + "epoch": 1.0638063806380638, + "grad_norm": 0.29075950384140015, + "learning_rate": 2.6592409240924093e-05, + "loss": 0.0944, + "num_input_tokens_seen": 2039744, + "step": 9670 + }, + { + "epoch": 1.0643564356435644, + "grad_norm": 0.5884512066841125, + "learning_rate": 2.6606160616061608e-05, + "loss": 0.0859, + "num_input_tokens_seen": 2040736, + "step": 9675 + }, + { + "epoch": 1.0649064906490648, + "grad_norm": 0.583324134349823, + "learning_rate": 2.661991199119912e-05, + "loss": 0.1467, + "num_input_tokens_seen": 2041728, + "step": 9680 + }, + { + "epoch": 1.0654565456545655, + "grad_norm": 0.499977707862854, + "learning_rate": 2.6633663366336638e-05, + "loss": 0.179, + "num_input_tokens_seen": 2042720, + "step": 9685 + }, + { + "epoch": 1.066006600660066, + "grad_norm": 0.3530011773109436, + "learning_rate": 2.6647414741474146e-05, + "loss": 0.0855, + "num_input_tokens_seen": 2043808, + "step": 9690 + }, + { + "epoch": 1.0665566556655666, + "grad_norm": 0.8604651093482971, + "learning_rate": 2.6661166116611665e-05, + "loss": 0.1673, + "num_input_tokens_seen": 2044832, + "step": 9695 + }, + { + "epoch": 1.0671067106710672, + "grad_norm": 0.7765317559242249, + "learning_rate": 2.6674917491749173e-05, + "loss": 0.1448, + "num_input_tokens_seen": 2045920, + "step": 9700 + }, + { + "epoch": 1.0676567656765676, + "grad_norm": 0.5228130221366882, + "learning_rate": 2.668866886688669e-05, + "loss": 0.1232, + "num_input_tokens_seen": 2046944, + "step": 9705 + }, + { + "epoch": 1.0682068206820683, + "grad_norm": 0.8513439297676086, + "learning_rate": 2.6702420242024206e-05, + "loss": 0.105, + "num_input_tokens_seen": 2048000, + "step": 9710 + }, + { + "epoch": 1.0687568756875687, + "grad_norm": 0.5753186941146851, + "learning_rate": 2.6716171617161718e-05, + "loss": 0.144, + "num_input_tokens_seen": 2049056, + "step": 9715 + }, + { + "epoch": 1.0693069306930694, + "grad_norm": 0.21336205303668976, + "learning_rate": 2.6729922992299233e-05, + "loss": 0.0957, + "num_input_tokens_seen": 2050176, + "step": 9720 + }, + { + "epoch": 1.0698569856985698, + "grad_norm": 0.4151450991630554, + "learning_rate": 2.6743674367436745e-05, + "loss": 0.0685, + "num_input_tokens_seen": 2051296, + "step": 9725 + }, + { + "epoch": 1.0704070407040704, + "grad_norm": 0.9366836547851562, + "learning_rate": 2.675742574257426e-05, + "loss": 0.105, + "num_input_tokens_seen": 2052352, + "step": 9730 + }, + { + "epoch": 1.0709570957095709, + "grad_norm": 0.3286270201206207, + "learning_rate": 2.677117711771177e-05, + "loss": 0.0587, + "num_input_tokens_seen": 2053408, + "step": 9735 + }, + { + "epoch": 1.0715071507150715, + "grad_norm": 0.8367893695831299, + "learning_rate": 2.6784928492849286e-05, + "loss": 0.3374, + "num_input_tokens_seen": 2054400, + "step": 9740 + }, + { + "epoch": 1.0720572057205722, + "grad_norm": 1.0799272060394287, + "learning_rate": 2.6798679867986798e-05, + "loss": 0.0861, + "num_input_tokens_seen": 2055488, + "step": 9745 + }, + { + "epoch": 1.0726072607260726, + "grad_norm": 1.9805899858474731, + "learning_rate": 2.6812431243124313e-05, + "loss": 0.1447, + "num_input_tokens_seen": 2056544, + "step": 9750 + }, + { + "epoch": 1.0731573157315732, + "grad_norm": 0.17288672924041748, + "learning_rate": 2.6826182618261825e-05, + "loss": 0.1252, + "num_input_tokens_seen": 2057600, + "step": 9755 + }, + { + "epoch": 1.0737073707370737, + "grad_norm": 0.61463463306427, + "learning_rate": 2.683993399339934e-05, + "loss": 0.1383, + "num_input_tokens_seen": 2058656, + "step": 9760 + }, + { + "epoch": 1.0742574257425743, + "grad_norm": 0.7703807950019836, + "learning_rate": 2.685368536853686e-05, + "loss": 0.1372, + "num_input_tokens_seen": 2059680, + "step": 9765 + }, + { + "epoch": 1.0748074807480748, + "grad_norm": 0.3090851306915283, + "learning_rate": 2.6867436743674367e-05, + "loss": 0.1071, + "num_input_tokens_seen": 2060736, + "step": 9770 + }, + { + "epoch": 1.0753575357535754, + "grad_norm": 0.12650763988494873, + "learning_rate": 2.6881188118811885e-05, + "loss": 0.2307, + "num_input_tokens_seen": 2061792, + "step": 9775 + }, + { + "epoch": 1.0759075907590758, + "grad_norm": 1.4642138481140137, + "learning_rate": 2.6894939493949393e-05, + "loss": 0.1645, + "num_input_tokens_seen": 2062784, + "step": 9780 + }, + { + "epoch": 1.0764576457645765, + "grad_norm": 0.7553495764732361, + "learning_rate": 2.6908690869086912e-05, + "loss": 0.1692, + "num_input_tokens_seen": 2063776, + "step": 9785 + }, + { + "epoch": 1.0770077007700771, + "grad_norm": 1.247004508972168, + "learning_rate": 2.6922442244224423e-05, + "loss": 0.085, + "num_input_tokens_seen": 2064864, + "step": 9790 + }, + { + "epoch": 1.0775577557755776, + "grad_norm": 1.8947811126708984, + "learning_rate": 2.693619361936194e-05, + "loss": 0.1948, + "num_input_tokens_seen": 2065920, + "step": 9795 + }, + { + "epoch": 1.0781078107810782, + "grad_norm": 0.40568849444389343, + "learning_rate": 2.694994499449945e-05, + "loss": 0.2375, + "num_input_tokens_seen": 2066976, + "step": 9800 + }, + { + "epoch": 1.0786578657865786, + "grad_norm": 0.5068634748458862, + "learning_rate": 2.6963696369636965e-05, + "loss": 0.1552, + "num_input_tokens_seen": 2068096, + "step": 9805 + }, + { + "epoch": 1.0792079207920793, + "grad_norm": 0.5740848183631897, + "learning_rate": 2.697744774477448e-05, + "loss": 0.0874, + "num_input_tokens_seen": 2069184, + "step": 9810 + }, + { + "epoch": 1.0797579757975797, + "grad_norm": 0.9048424959182739, + "learning_rate": 2.6991199119911992e-05, + "loss": 0.2328, + "num_input_tokens_seen": 2070240, + "step": 9815 + }, + { + "epoch": 1.0803080308030804, + "grad_norm": 1.0770504474639893, + "learning_rate": 2.7004950495049507e-05, + "loss": 0.2146, + "num_input_tokens_seen": 2071328, + "step": 9820 + }, + { + "epoch": 1.0808580858085808, + "grad_norm": 0.9562861323356628, + "learning_rate": 2.701870187018702e-05, + "loss": 0.0781, + "num_input_tokens_seen": 2072352, + "step": 9825 + }, + { + "epoch": 1.0814081408140814, + "grad_norm": 0.8649007081985474, + "learning_rate": 2.7032453245324534e-05, + "loss": 0.154, + "num_input_tokens_seen": 2073376, + "step": 9830 + }, + { + "epoch": 1.0819581958195819, + "grad_norm": 1.4660450220108032, + "learning_rate": 2.7046204620462045e-05, + "loss": 0.2116, + "num_input_tokens_seen": 2074400, + "step": 9835 + }, + { + "epoch": 1.0825082508250825, + "grad_norm": 0.5014069080352783, + "learning_rate": 2.705995599559956e-05, + "loss": 0.1007, + "num_input_tokens_seen": 2075584, + "step": 9840 + }, + { + "epoch": 1.0830583058305832, + "grad_norm": 0.6599369645118713, + "learning_rate": 2.7073707370737072e-05, + "loss": 0.2038, + "num_input_tokens_seen": 2076672, + "step": 9845 + }, + { + "epoch": 1.0836083608360836, + "grad_norm": 1.5094966888427734, + "learning_rate": 2.708745874587459e-05, + "loss": 0.1716, + "num_input_tokens_seen": 2077696, + "step": 9850 + }, + { + "epoch": 1.0841584158415842, + "grad_norm": 0.26468563079833984, + "learning_rate": 2.7101210121012105e-05, + "loss": 0.1416, + "num_input_tokens_seen": 2078784, + "step": 9855 + }, + { + "epoch": 1.0847084708470847, + "grad_norm": 0.9988409280776978, + "learning_rate": 2.7114961496149617e-05, + "loss": 0.1194, + "num_input_tokens_seen": 2079808, + "step": 9860 + }, + { + "epoch": 1.0852585258525853, + "grad_norm": 1.2784254550933838, + "learning_rate": 2.7128712871287132e-05, + "loss": 0.1457, + "num_input_tokens_seen": 2080896, + "step": 9865 + }, + { + "epoch": 1.0858085808580857, + "grad_norm": 0.2290945202112198, + "learning_rate": 2.7142464246424644e-05, + "loss": 0.1338, + "num_input_tokens_seen": 2081952, + "step": 9870 + }, + { + "epoch": 1.0863586358635864, + "grad_norm": 0.3310009837150574, + "learning_rate": 2.715621562156216e-05, + "loss": 0.14, + "num_input_tokens_seen": 2083072, + "step": 9875 + }, + { + "epoch": 1.0869086908690868, + "grad_norm": 0.2784070372581482, + "learning_rate": 2.716996699669967e-05, + "loss": 0.2069, + "num_input_tokens_seen": 2084096, + "step": 9880 + }, + { + "epoch": 1.0874587458745875, + "grad_norm": 2.204082727432251, + "learning_rate": 2.7183718371837185e-05, + "loss": 0.273, + "num_input_tokens_seen": 2085184, + "step": 9885 + }, + { + "epoch": 1.0880088008800881, + "grad_norm": 0.2445363700389862, + "learning_rate": 2.7197469746974697e-05, + "loss": 0.0989, + "num_input_tokens_seen": 2086336, + "step": 9890 + }, + { + "epoch": 1.0885588558855885, + "grad_norm": 1.8305723667144775, + "learning_rate": 2.7211221122112212e-05, + "loss": 0.1105, + "num_input_tokens_seen": 2087360, + "step": 9895 + }, + { + "epoch": 1.0891089108910892, + "grad_norm": 0.5813896656036377, + "learning_rate": 2.7224972497249727e-05, + "loss": 0.0728, + "num_input_tokens_seen": 2088384, + "step": 9900 + }, + { + "epoch": 1.0896589658965896, + "grad_norm": 0.5326317548751831, + "learning_rate": 2.723872387238724e-05, + "loss": 0.1104, + "num_input_tokens_seen": 2089408, + "step": 9905 + }, + { + "epoch": 1.0902090209020903, + "grad_norm": 0.5272458791732788, + "learning_rate": 2.7252475247524757e-05, + "loss": 0.122, + "num_input_tokens_seen": 2090464, + "step": 9910 + }, + { + "epoch": 1.0907590759075907, + "grad_norm": 0.5380730628967285, + "learning_rate": 2.7266226622662266e-05, + "loss": 0.1442, + "num_input_tokens_seen": 2091552, + "step": 9915 + }, + { + "epoch": 1.0913091309130913, + "grad_norm": 0.5918432474136353, + "learning_rate": 2.7279977997799784e-05, + "loss": 0.1907, + "num_input_tokens_seen": 2092576, + "step": 9920 + }, + { + "epoch": 1.0918591859185918, + "grad_norm": 0.7126460671424866, + "learning_rate": 2.7293729372937292e-05, + "loss": 0.1563, + "num_input_tokens_seen": 2093664, + "step": 9925 + }, + { + "epoch": 1.0924092409240924, + "grad_norm": 0.32490694522857666, + "learning_rate": 2.730748074807481e-05, + "loss": 0.0541, + "num_input_tokens_seen": 2094720, + "step": 9930 + }, + { + "epoch": 1.0929592959295928, + "grad_norm": 0.36558443307876587, + "learning_rate": 2.732123212321232e-05, + "loss": 0.1018, + "num_input_tokens_seen": 2095776, + "step": 9935 + }, + { + "epoch": 1.0935093509350935, + "grad_norm": 1.5117782354354858, + "learning_rate": 2.7334983498349837e-05, + "loss": 0.0963, + "num_input_tokens_seen": 2096896, + "step": 9940 + }, + { + "epoch": 1.0940594059405941, + "grad_norm": 0.6616590023040771, + "learning_rate": 2.7348734873487352e-05, + "loss": 0.1598, + "num_input_tokens_seen": 2097920, + "step": 9945 + }, + { + "epoch": 1.0946094609460946, + "grad_norm": 0.2392638474702835, + "learning_rate": 2.7362486248624864e-05, + "loss": 0.1023, + "num_input_tokens_seen": 2098976, + "step": 9950 + }, + { + "epoch": 1.0951595159515952, + "grad_norm": 1.1900817155838013, + "learning_rate": 2.737623762376238e-05, + "loss": 0.1585, + "num_input_tokens_seen": 2100032, + "step": 9955 + }, + { + "epoch": 1.0957095709570956, + "grad_norm": 0.9514257311820984, + "learning_rate": 2.738998899889989e-05, + "loss": 0.1963, + "num_input_tokens_seen": 2101024, + "step": 9960 + }, + { + "epoch": 1.0962596259625963, + "grad_norm": 0.6578905582427979, + "learning_rate": 2.7403740374037406e-05, + "loss": 0.1526, + "num_input_tokens_seen": 2102048, + "step": 9965 + }, + { + "epoch": 1.0968096809680967, + "grad_norm": 2.705458164215088, + "learning_rate": 2.7417491749174917e-05, + "loss": 0.1206, + "num_input_tokens_seen": 2103136, + "step": 9970 + }, + { + "epoch": 1.0973597359735974, + "grad_norm": 0.7111691832542419, + "learning_rate": 2.7431243124312433e-05, + "loss": 0.0929, + "num_input_tokens_seen": 2104192, + "step": 9975 + }, + { + "epoch": 1.0979097909790978, + "grad_norm": 0.7046410441398621, + "learning_rate": 2.7444994499449944e-05, + "loss": 0.1557, + "num_input_tokens_seen": 2105280, + "step": 9980 + }, + { + "epoch": 1.0984598459845984, + "grad_norm": 0.1464482694864273, + "learning_rate": 2.745874587458746e-05, + "loss": 0.107, + "num_input_tokens_seen": 2106368, + "step": 9985 + }, + { + "epoch": 1.099009900990099, + "grad_norm": 0.46442079544067383, + "learning_rate": 2.747249724972497e-05, + "loss": 0.1889, + "num_input_tokens_seen": 2107360, + "step": 9990 + }, + { + "epoch": 1.0995599559955995, + "grad_norm": 0.11563214659690857, + "learning_rate": 2.7486248624862486e-05, + "loss": 0.1003, + "num_input_tokens_seen": 2108480, + "step": 9995 + }, + { + "epoch": 1.1001100110011002, + "grad_norm": 0.7248350977897644, + "learning_rate": 2.7500000000000004e-05, + "loss": 0.2157, + "num_input_tokens_seen": 2109504, + "step": 10000 + }, + { + "epoch": 1.1006600660066006, + "grad_norm": 0.4299868941307068, + "learning_rate": 2.7513751375137513e-05, + "loss": 0.164, + "num_input_tokens_seen": 2110592, + "step": 10005 + }, + { + "epoch": 1.1012101210121013, + "grad_norm": 0.8621258735656738, + "learning_rate": 2.752750275027503e-05, + "loss": 0.0827, + "num_input_tokens_seen": 2111648, + "step": 10010 + }, + { + "epoch": 1.1017601760176017, + "grad_norm": 0.605210542678833, + "learning_rate": 2.7541254125412543e-05, + "loss": 0.0975, + "num_input_tokens_seen": 2112640, + "step": 10015 + }, + { + "epoch": 1.1023102310231023, + "grad_norm": 0.9967685341835022, + "learning_rate": 2.7555005500550058e-05, + "loss": 0.1352, + "num_input_tokens_seen": 2113696, + "step": 10020 + }, + { + "epoch": 1.1028602860286028, + "grad_norm": 0.4963717758655548, + "learning_rate": 2.756875687568757e-05, + "loss": 0.0813, + "num_input_tokens_seen": 2114752, + "step": 10025 + }, + { + "epoch": 1.1034103410341034, + "grad_norm": 0.9580217003822327, + "learning_rate": 2.7582508250825084e-05, + "loss": 0.1609, + "num_input_tokens_seen": 2115808, + "step": 10030 + }, + { + "epoch": 1.103960396039604, + "grad_norm": 0.6186661124229431, + "learning_rate": 2.7596259625962596e-05, + "loss": 0.1171, + "num_input_tokens_seen": 2116864, + "step": 10035 + }, + { + "epoch": 1.1045104510451045, + "grad_norm": 0.3704301118850708, + "learning_rate": 2.761001100110011e-05, + "loss": 0.1837, + "num_input_tokens_seen": 2117920, + "step": 10040 + }, + { + "epoch": 1.1050605060506051, + "grad_norm": 0.10260321199893951, + "learning_rate": 2.7623762376237626e-05, + "loss": 0.0617, + "num_input_tokens_seen": 2118944, + "step": 10045 + }, + { + "epoch": 1.1056105610561056, + "grad_norm": 0.957978367805481, + "learning_rate": 2.7637513751375138e-05, + "loss": 0.1004, + "num_input_tokens_seen": 2119936, + "step": 10050 + }, + { + "epoch": 1.1061606160616062, + "grad_norm": 0.4852833151817322, + "learning_rate": 2.7651265126512653e-05, + "loss": 0.0832, + "num_input_tokens_seen": 2121056, + "step": 10055 + }, + { + "epoch": 1.1067106710671066, + "grad_norm": 0.4753902554512024, + "learning_rate": 2.7665016501650165e-05, + "loss": 0.1527, + "num_input_tokens_seen": 2122144, + "step": 10060 + }, + { + "epoch": 1.1072607260726073, + "grad_norm": 0.1632029414176941, + "learning_rate": 2.767876787678768e-05, + "loss": 0.0896, + "num_input_tokens_seen": 2123200, + "step": 10065 + }, + { + "epoch": 1.1078107810781077, + "grad_norm": 0.26742255687713623, + "learning_rate": 2.769251925192519e-05, + "loss": 0.1465, + "num_input_tokens_seen": 2124256, + "step": 10070 + }, + { + "epoch": 1.1083608360836084, + "grad_norm": 0.4739685356616974, + "learning_rate": 2.770627062706271e-05, + "loss": 0.1076, + "num_input_tokens_seen": 2125376, + "step": 10075 + }, + { + "epoch": 1.108910891089109, + "grad_norm": 0.6719575524330139, + "learning_rate": 2.7720022002200218e-05, + "loss": 0.1215, + "num_input_tokens_seen": 2126464, + "step": 10080 + }, + { + "epoch": 1.1094609460946094, + "grad_norm": 0.2547152042388916, + "learning_rate": 2.7733773377337736e-05, + "loss": 0.0992, + "num_input_tokens_seen": 2127520, + "step": 10085 + }, + { + "epoch": 1.11001100110011, + "grad_norm": 0.5307769775390625, + "learning_rate": 2.774752475247525e-05, + "loss": 0.124, + "num_input_tokens_seen": 2128576, + "step": 10090 + }, + { + "epoch": 1.1105610561056105, + "grad_norm": 0.6930115818977356, + "learning_rate": 2.7761276127612763e-05, + "loss": 0.2153, + "num_input_tokens_seen": 2129696, + "step": 10095 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 1.0686159133911133, + "learning_rate": 2.7775027502750278e-05, + "loss": 0.1205, + "num_input_tokens_seen": 2130752, + "step": 10100 + }, + { + "epoch": 1.1116611661166116, + "grad_norm": 1.218489646911621, + "learning_rate": 2.778877887788779e-05, + "loss": 0.1938, + "num_input_tokens_seen": 2131936, + "step": 10105 + }, + { + "epoch": 1.1122112211221122, + "grad_norm": 0.9126092791557312, + "learning_rate": 2.7802530253025305e-05, + "loss": 0.2512, + "num_input_tokens_seen": 2133024, + "step": 10110 + }, + { + "epoch": 1.1127612761276127, + "grad_norm": 0.4664251506328583, + "learning_rate": 2.7816281628162816e-05, + "loss": 0.0786, + "num_input_tokens_seen": 2134176, + "step": 10115 + }, + { + "epoch": 1.1133113311331133, + "grad_norm": 0.43987998366355896, + "learning_rate": 2.783003300330033e-05, + "loss": 0.0715, + "num_input_tokens_seen": 2135264, + "step": 10120 + }, + { + "epoch": 1.113861386138614, + "grad_norm": 1.6090103387832642, + "learning_rate": 2.7843784378437843e-05, + "loss": 0.1659, + "num_input_tokens_seen": 2136256, + "step": 10125 + }, + { + "epoch": 1.1144114411441144, + "grad_norm": 1.026064395904541, + "learning_rate": 2.7857535753575358e-05, + "loss": 0.1426, + "num_input_tokens_seen": 2137312, + "step": 10130 + }, + { + "epoch": 1.114961496149615, + "grad_norm": 0.3714904189109802, + "learning_rate": 2.7871287128712877e-05, + "loss": 0.2792, + "num_input_tokens_seen": 2138464, + "step": 10135 + }, + { + "epoch": 1.1155115511551155, + "grad_norm": 0.31712064146995544, + "learning_rate": 2.7885038503850385e-05, + "loss": 0.1116, + "num_input_tokens_seen": 2139488, + "step": 10140 + }, + { + "epoch": 1.1160616061606161, + "grad_norm": 0.650695264339447, + "learning_rate": 2.7898789878987903e-05, + "loss": 0.1815, + "num_input_tokens_seen": 2140480, + "step": 10145 + }, + { + "epoch": 1.1166116611661165, + "grad_norm": 0.5959699153900146, + "learning_rate": 2.791254125412541e-05, + "loss": 0.1854, + "num_input_tokens_seen": 2141504, + "step": 10150 + }, + { + "epoch": 1.1171617161716172, + "grad_norm": 1.4169590473175049, + "learning_rate": 2.792629262926293e-05, + "loss": 0.1196, + "num_input_tokens_seen": 2142560, + "step": 10155 + }, + { + "epoch": 1.1177117711771176, + "grad_norm": 0.11543862521648407, + "learning_rate": 2.794004400440044e-05, + "loss": 0.1507, + "num_input_tokens_seen": 2143616, + "step": 10160 + }, + { + "epoch": 1.1182618261826183, + "grad_norm": 1.6664820909500122, + "learning_rate": 2.7953795379537957e-05, + "loss": 0.1305, + "num_input_tokens_seen": 2144640, + "step": 10165 + }, + { + "epoch": 1.118811881188119, + "grad_norm": 0.2962397634983063, + "learning_rate": 2.7967546754675465e-05, + "loss": 0.145, + "num_input_tokens_seen": 2145696, + "step": 10170 + }, + { + "epoch": 1.1193619361936193, + "grad_norm": 1.0105019807815552, + "learning_rate": 2.7981298129812983e-05, + "loss": 0.1472, + "num_input_tokens_seen": 2146784, + "step": 10175 + }, + { + "epoch": 1.11991199119912, + "grad_norm": 1.1619235277175903, + "learning_rate": 2.79950495049505e-05, + "loss": 0.1369, + "num_input_tokens_seen": 2147744, + "step": 10180 + }, + { + "epoch": 1.1204620462046204, + "grad_norm": 0.4178502857685089, + "learning_rate": 2.800880088008801e-05, + "loss": 0.1168, + "num_input_tokens_seen": 2148896, + "step": 10185 + }, + { + "epoch": 1.121012101210121, + "grad_norm": 0.7772708535194397, + "learning_rate": 2.8022552255225525e-05, + "loss": 0.0881, + "num_input_tokens_seen": 2149952, + "step": 10190 + }, + { + "epoch": 1.1215621562156215, + "grad_norm": 0.12806735932826996, + "learning_rate": 2.8036303630363037e-05, + "loss": 0.115, + "num_input_tokens_seen": 2150976, + "step": 10195 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.962883472442627, + "learning_rate": 2.8050055005500552e-05, + "loss": 0.1522, + "num_input_tokens_seen": 2152000, + "step": 10200 + }, + { + "epoch": 1.1226622662266226, + "grad_norm": 0.8321930170059204, + "learning_rate": 2.8063806380638064e-05, + "loss": 0.0775, + "num_input_tokens_seen": 2153056, + "step": 10205 + }, + { + "epoch": 1.1232123212321232, + "grad_norm": 0.6261402368545532, + "learning_rate": 2.807755775577558e-05, + "loss": 0.159, + "num_input_tokens_seen": 2154144, + "step": 10210 + }, + { + "epoch": 1.1237623762376239, + "grad_norm": 0.8282939195632935, + "learning_rate": 2.809130913091309e-05, + "loss": 0.1337, + "num_input_tokens_seen": 2155168, + "step": 10215 + }, + { + "epoch": 1.1243124312431243, + "grad_norm": 0.8962451815605164, + "learning_rate": 2.8105060506050605e-05, + "loss": 0.2224, + "num_input_tokens_seen": 2156160, + "step": 10220 + }, + { + "epoch": 1.124862486248625, + "grad_norm": 1.0327225923538208, + "learning_rate": 2.8118811881188124e-05, + "loss": 0.1302, + "num_input_tokens_seen": 2157152, + "step": 10225 + }, + { + "epoch": 1.1254125412541254, + "grad_norm": 0.6375531554222107, + "learning_rate": 2.8132563256325632e-05, + "loss": 0.1775, + "num_input_tokens_seen": 2158208, + "step": 10230 + }, + { + "epoch": 1.125962596259626, + "grad_norm": 0.338245689868927, + "learning_rate": 2.814631463146315e-05, + "loss": 0.1819, + "num_input_tokens_seen": 2159232, + "step": 10235 + }, + { + "epoch": 1.1265126512651265, + "grad_norm": 0.23718464374542236, + "learning_rate": 2.8160066006600662e-05, + "loss": 0.1105, + "num_input_tokens_seen": 2160320, + "step": 10240 + }, + { + "epoch": 1.127062706270627, + "grad_norm": 1.0915998220443726, + "learning_rate": 2.8173817381738177e-05, + "loss": 0.1824, + "num_input_tokens_seen": 2161472, + "step": 10245 + }, + { + "epoch": 1.1276127612761275, + "grad_norm": 0.664787232875824, + "learning_rate": 2.818756875687569e-05, + "loss": 0.1046, + "num_input_tokens_seen": 2162496, + "step": 10250 + }, + { + "epoch": 1.1281628162816282, + "grad_norm": 0.43936359882354736, + "learning_rate": 2.8201320132013204e-05, + "loss": 0.1589, + "num_input_tokens_seen": 2163488, + "step": 10255 + }, + { + "epoch": 1.1287128712871288, + "grad_norm": 0.9391791224479675, + "learning_rate": 2.8215071507150715e-05, + "loss": 0.1103, + "num_input_tokens_seen": 2164544, + "step": 10260 + }, + { + "epoch": 1.1292629262926293, + "grad_norm": 0.6376591324806213, + "learning_rate": 2.822882288228823e-05, + "loss": 0.134, + "num_input_tokens_seen": 2165600, + "step": 10265 + }, + { + "epoch": 1.12981298129813, + "grad_norm": 1.1705083847045898, + "learning_rate": 2.8242574257425742e-05, + "loss": 0.1712, + "num_input_tokens_seen": 2166624, + "step": 10270 + }, + { + "epoch": 1.1303630363036303, + "grad_norm": 0.17208416759967804, + "learning_rate": 2.8256325632563257e-05, + "loss": 0.0576, + "num_input_tokens_seen": 2167648, + "step": 10275 + }, + { + "epoch": 1.130913091309131, + "grad_norm": 0.7193348407745361, + "learning_rate": 2.8270077007700772e-05, + "loss": 0.2023, + "num_input_tokens_seen": 2168704, + "step": 10280 + }, + { + "epoch": 1.1314631463146314, + "grad_norm": 0.329252690076828, + "learning_rate": 2.8283828382838284e-05, + "loss": 0.0831, + "num_input_tokens_seen": 2169760, + "step": 10285 + }, + { + "epoch": 1.132013201320132, + "grad_norm": 0.6367638111114502, + "learning_rate": 2.82975797579758e-05, + "loss": 0.2061, + "num_input_tokens_seen": 2170816, + "step": 10290 + }, + { + "epoch": 1.1325632563256325, + "grad_norm": 0.9916961789131165, + "learning_rate": 2.831133113311331e-05, + "loss": 0.1227, + "num_input_tokens_seen": 2171872, + "step": 10295 + }, + { + "epoch": 1.1331133113311331, + "grad_norm": 0.7050809860229492, + "learning_rate": 2.832508250825083e-05, + "loss": 0.0933, + "num_input_tokens_seen": 2172896, + "step": 10300 + }, + { + "epoch": 1.1336633663366338, + "grad_norm": 0.8502938151359558, + "learning_rate": 2.8338833883388337e-05, + "loss": 0.1561, + "num_input_tokens_seen": 2173984, + "step": 10305 + }, + { + "epoch": 1.1342134213421342, + "grad_norm": 1.734834909439087, + "learning_rate": 2.8352585258525856e-05, + "loss": 0.3685, + "num_input_tokens_seen": 2175072, + "step": 10310 + }, + { + "epoch": 1.1347634763476349, + "grad_norm": 0.6565236449241638, + "learning_rate": 2.8366336633663364e-05, + "loss": 0.1454, + "num_input_tokens_seen": 2176128, + "step": 10315 + }, + { + "epoch": 1.1353135313531353, + "grad_norm": 0.4679032862186432, + "learning_rate": 2.8380088008800882e-05, + "loss": 0.1603, + "num_input_tokens_seen": 2177184, + "step": 10320 + }, + { + "epoch": 1.135863586358636, + "grad_norm": 0.5716491937637329, + "learning_rate": 2.8393839383938397e-05, + "loss": 0.065, + "num_input_tokens_seen": 2178336, + "step": 10325 + }, + { + "epoch": 1.1364136413641364, + "grad_norm": 0.8510189056396484, + "learning_rate": 2.840759075907591e-05, + "loss": 0.1384, + "num_input_tokens_seen": 2179360, + "step": 10330 + }, + { + "epoch": 1.136963696369637, + "grad_norm": 1.7864453792572021, + "learning_rate": 2.8421342134213424e-05, + "loss": 0.1497, + "num_input_tokens_seen": 2180448, + "step": 10335 + }, + { + "epoch": 1.1375137513751374, + "grad_norm": 0.689195990562439, + "learning_rate": 2.8435093509350936e-05, + "loss": 0.1447, + "num_input_tokens_seen": 2181440, + "step": 10340 + }, + { + "epoch": 1.138063806380638, + "grad_norm": 0.27479982376098633, + "learning_rate": 2.844884488448845e-05, + "loss": 0.1263, + "num_input_tokens_seen": 2182528, + "step": 10345 + }, + { + "epoch": 1.1386138613861387, + "grad_norm": 0.3455021381378174, + "learning_rate": 2.8462596259625963e-05, + "loss": 0.069, + "num_input_tokens_seen": 2183552, + "step": 10350 + }, + { + "epoch": 1.1391639163916392, + "grad_norm": 0.5767011642456055, + "learning_rate": 2.8476347634763478e-05, + "loss": 0.1756, + "num_input_tokens_seen": 2184640, + "step": 10355 + }, + { + "epoch": 1.1397139713971396, + "grad_norm": 0.2028391808271408, + "learning_rate": 2.849009900990099e-05, + "loss": 0.1006, + "num_input_tokens_seen": 2185760, + "step": 10360 + }, + { + "epoch": 1.1402640264026402, + "grad_norm": 0.21201364696025848, + "learning_rate": 2.8503850385038504e-05, + "loss": 0.113, + "num_input_tokens_seen": 2186848, + "step": 10365 + }, + { + "epoch": 1.140814081408141, + "grad_norm": 0.6132184863090515, + "learning_rate": 2.8517601760176023e-05, + "loss": 0.1436, + "num_input_tokens_seen": 2187904, + "step": 10370 + }, + { + "epoch": 1.1413641364136413, + "grad_norm": 0.09043122828006744, + "learning_rate": 2.853135313531353e-05, + "loss": 0.1186, + "num_input_tokens_seen": 2188960, + "step": 10375 + }, + { + "epoch": 1.141914191419142, + "grad_norm": 0.3322926461696625, + "learning_rate": 2.854510451045105e-05, + "loss": 0.1423, + "num_input_tokens_seen": 2190048, + "step": 10380 + }, + { + "epoch": 1.1424642464246424, + "grad_norm": 0.520545244216919, + "learning_rate": 2.8558855885588558e-05, + "loss": 0.0938, + "num_input_tokens_seen": 2191136, + "step": 10385 + }, + { + "epoch": 1.143014301430143, + "grad_norm": 0.4876759946346283, + "learning_rate": 2.8572607260726076e-05, + "loss": 0.0574, + "num_input_tokens_seen": 2192160, + "step": 10390 + }, + { + "epoch": 1.1435643564356435, + "grad_norm": 0.5862481594085693, + "learning_rate": 2.8586358635863584e-05, + "loss": 0.1261, + "num_input_tokens_seen": 2193216, + "step": 10395 + }, + { + "epoch": 1.1441144114411441, + "grad_norm": 0.43448567390441895, + "learning_rate": 2.8600110011001103e-05, + "loss": 0.136, + "num_input_tokens_seen": 2194304, + "step": 10400 + }, + { + "epoch": 1.1446644664466445, + "grad_norm": 0.3898552358150482, + "learning_rate": 2.8613861386138614e-05, + "loss": 0.0694, + "num_input_tokens_seen": 2195392, + "step": 10405 + }, + { + "epoch": 1.1452145214521452, + "grad_norm": 2.5224719047546387, + "learning_rate": 2.862761276127613e-05, + "loss": 0.2092, + "num_input_tokens_seen": 2196512, + "step": 10410 + }, + { + "epoch": 1.1457645764576458, + "grad_norm": 0.8712701797485352, + "learning_rate": 2.8641364136413645e-05, + "loss": 0.1101, + "num_input_tokens_seen": 2197632, + "step": 10415 + }, + { + "epoch": 1.1463146314631463, + "grad_norm": 0.504366934299469, + "learning_rate": 2.8655115511551156e-05, + "loss": 0.1857, + "num_input_tokens_seen": 2198784, + "step": 10420 + }, + { + "epoch": 1.146864686468647, + "grad_norm": 0.8886321187019348, + "learning_rate": 2.866886688668867e-05, + "loss": 0.1029, + "num_input_tokens_seen": 2199744, + "step": 10425 + }, + { + "epoch": 1.1474147414741473, + "grad_norm": 0.618187665939331, + "learning_rate": 2.8682618261826183e-05, + "loss": 0.1661, + "num_input_tokens_seen": 2200736, + "step": 10430 + }, + { + "epoch": 1.147964796479648, + "grad_norm": 0.34426218271255493, + "learning_rate": 2.8696369636963698e-05, + "loss": 0.1774, + "num_input_tokens_seen": 2201728, + "step": 10435 + }, + { + "epoch": 1.1485148514851484, + "grad_norm": 0.45215803384780884, + "learning_rate": 2.871012101210121e-05, + "loss": 0.144, + "num_input_tokens_seen": 2202752, + "step": 10440 + }, + { + "epoch": 1.149064906490649, + "grad_norm": 0.5685248374938965, + "learning_rate": 2.8723872387238725e-05, + "loss": 0.1616, + "num_input_tokens_seen": 2203872, + "step": 10445 + }, + { + "epoch": 1.1496149614961495, + "grad_norm": 0.0989997386932373, + "learning_rate": 2.8737623762376236e-05, + "loss": 0.1244, + "num_input_tokens_seen": 2204896, + "step": 10450 + }, + { + "epoch": 1.1501650165016502, + "grad_norm": 1.724343180656433, + "learning_rate": 2.875137513751375e-05, + "loss": 0.1273, + "num_input_tokens_seen": 2205888, + "step": 10455 + }, + { + "epoch": 1.1507150715071508, + "grad_norm": 0.4877420961856842, + "learning_rate": 2.876512651265127e-05, + "loss": 0.2014, + "num_input_tokens_seen": 2206912, + "step": 10460 + }, + { + "epoch": 1.1512651265126512, + "grad_norm": 1.059232473373413, + "learning_rate": 2.877887788778878e-05, + "loss": 0.0756, + "num_input_tokens_seen": 2207968, + "step": 10465 + }, + { + "epoch": 1.1518151815181519, + "grad_norm": 0.6622796058654785, + "learning_rate": 2.8792629262926296e-05, + "loss": 0.1454, + "num_input_tokens_seen": 2208992, + "step": 10470 + }, + { + "epoch": 1.1523652365236523, + "grad_norm": 1.2268307209014893, + "learning_rate": 2.8806380638063808e-05, + "loss": 0.1299, + "num_input_tokens_seen": 2210048, + "step": 10475 + }, + { + "epoch": 1.152915291529153, + "grad_norm": 0.8337674140930176, + "learning_rate": 2.8820132013201323e-05, + "loss": 0.1294, + "num_input_tokens_seen": 2211136, + "step": 10480 + }, + { + "epoch": 1.1534653465346534, + "grad_norm": 0.4800446033477783, + "learning_rate": 2.8833883388338835e-05, + "loss": 0.1596, + "num_input_tokens_seen": 2212160, + "step": 10485 + }, + { + "epoch": 1.154015401540154, + "grad_norm": 0.5821419954299927, + "learning_rate": 2.884763476347635e-05, + "loss": 0.0493, + "num_input_tokens_seen": 2213184, + "step": 10490 + }, + { + "epoch": 1.1545654565456545, + "grad_norm": 0.47328317165374756, + "learning_rate": 2.886138613861386e-05, + "loss": 0.0731, + "num_input_tokens_seen": 2214208, + "step": 10495 + }, + { + "epoch": 1.155115511551155, + "grad_norm": 0.3008512556552887, + "learning_rate": 2.8875137513751377e-05, + "loss": 0.0666, + "num_input_tokens_seen": 2215200, + "step": 10500 + }, + { + "epoch": 1.1556655665566558, + "grad_norm": 0.25634676218032837, + "learning_rate": 2.8888888888888888e-05, + "loss": 0.1276, + "num_input_tokens_seen": 2216320, + "step": 10505 + }, + { + "epoch": 1.1562156215621562, + "grad_norm": 0.8788845539093018, + "learning_rate": 2.8902640264026403e-05, + "loss": 0.1689, + "num_input_tokens_seen": 2217376, + "step": 10510 + }, + { + "epoch": 1.1567656765676568, + "grad_norm": 1.2580487728118896, + "learning_rate": 2.8916391639163922e-05, + "loss": 0.1356, + "num_input_tokens_seen": 2218400, + "step": 10515 + }, + { + "epoch": 1.1573157315731573, + "grad_norm": 0.5650283098220825, + "learning_rate": 2.893014301430143e-05, + "loss": 0.1384, + "num_input_tokens_seen": 2219488, + "step": 10520 + }, + { + "epoch": 1.157865786578658, + "grad_norm": 0.519961416721344, + "learning_rate": 2.894389438943895e-05, + "loss": 0.1169, + "num_input_tokens_seen": 2220480, + "step": 10525 + }, + { + "epoch": 1.1584158415841583, + "grad_norm": 0.9348523616790771, + "learning_rate": 2.8957645764576457e-05, + "loss": 0.1371, + "num_input_tokens_seen": 2221536, + "step": 10530 + }, + { + "epoch": 1.158965896589659, + "grad_norm": 0.7794874906539917, + "learning_rate": 2.8971397139713975e-05, + "loss": 0.1259, + "num_input_tokens_seen": 2222528, + "step": 10535 + }, + { + "epoch": 1.1595159515951594, + "grad_norm": 0.5218387246131897, + "learning_rate": 2.8985148514851483e-05, + "loss": 0.1039, + "num_input_tokens_seen": 2223648, + "step": 10540 + }, + { + "epoch": 1.16006600660066, + "grad_norm": 0.8302571773529053, + "learning_rate": 2.8998899889989002e-05, + "loss": 0.1078, + "num_input_tokens_seen": 2224672, + "step": 10545 + }, + { + "epoch": 1.1606160616061607, + "grad_norm": 1.0468751192092896, + "learning_rate": 2.901265126512651e-05, + "loss": 0.0817, + "num_input_tokens_seen": 2225728, + "step": 10550 + }, + { + "epoch": 1.1611661166116611, + "grad_norm": 1.4865095615386963, + "learning_rate": 2.902640264026403e-05, + "loss": 0.1766, + "num_input_tokens_seen": 2226784, + "step": 10555 + }, + { + "epoch": 1.1617161716171618, + "grad_norm": 0.3315727412700653, + "learning_rate": 2.9040154015401544e-05, + "loss": 0.0975, + "num_input_tokens_seen": 2227872, + "step": 10560 + }, + { + "epoch": 1.1622662266226622, + "grad_norm": 1.2304610013961792, + "learning_rate": 2.9053905390539055e-05, + "loss": 0.126, + "num_input_tokens_seen": 2228864, + "step": 10565 + }, + { + "epoch": 1.1628162816281629, + "grad_norm": 0.8942092061042786, + "learning_rate": 2.906765676567657e-05, + "loss": 0.1157, + "num_input_tokens_seen": 2229920, + "step": 10570 + }, + { + "epoch": 1.1633663366336633, + "grad_norm": 0.6383025646209717, + "learning_rate": 2.9081408140814082e-05, + "loss": 0.1391, + "num_input_tokens_seen": 2231008, + "step": 10575 + }, + { + "epoch": 1.163916391639164, + "grad_norm": 0.6148030757904053, + "learning_rate": 2.9095159515951597e-05, + "loss": 0.1078, + "num_input_tokens_seen": 2232064, + "step": 10580 + }, + { + "epoch": 1.1644664466446644, + "grad_norm": 0.3203279972076416, + "learning_rate": 2.910891089108911e-05, + "loss": 0.0441, + "num_input_tokens_seen": 2233152, + "step": 10585 + }, + { + "epoch": 1.165016501650165, + "grad_norm": 0.9224383234977722, + "learning_rate": 2.9122662266226624e-05, + "loss": 0.1226, + "num_input_tokens_seen": 2234176, + "step": 10590 + }, + { + "epoch": 1.1655665566556657, + "grad_norm": 0.3816969692707062, + "learning_rate": 2.9136413641364135e-05, + "loss": 0.0684, + "num_input_tokens_seen": 2235264, + "step": 10595 + }, + { + "epoch": 1.166116611661166, + "grad_norm": 0.6739741563796997, + "learning_rate": 2.915016501650165e-05, + "loss": 0.155, + "num_input_tokens_seen": 2236352, + "step": 10600 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.715955376625061, + "learning_rate": 2.916391639163917e-05, + "loss": 0.1045, + "num_input_tokens_seen": 2237472, + "step": 10605 + }, + { + "epoch": 1.1672167216721672, + "grad_norm": 1.504650354385376, + "learning_rate": 2.9177667766776677e-05, + "loss": 0.2078, + "num_input_tokens_seen": 2238528, + "step": 10610 + }, + { + "epoch": 1.1677667766776678, + "grad_norm": 0.36795130372047424, + "learning_rate": 2.9191419141914195e-05, + "loss": 0.0542, + "num_input_tokens_seen": 2239616, + "step": 10615 + }, + { + "epoch": 1.1683168316831682, + "grad_norm": 0.4263624846935272, + "learning_rate": 2.9205170517051704e-05, + "loss": 0.1632, + "num_input_tokens_seen": 2240640, + "step": 10620 + }, + { + "epoch": 1.168866886688669, + "grad_norm": 1.103963017463684, + "learning_rate": 2.9218921892189222e-05, + "loss": 0.1756, + "num_input_tokens_seen": 2241664, + "step": 10625 + }, + { + "epoch": 1.1694169416941693, + "grad_norm": 1.447287678718567, + "learning_rate": 2.9232673267326734e-05, + "loss": 0.2022, + "num_input_tokens_seen": 2242720, + "step": 10630 + }, + { + "epoch": 1.16996699669967, + "grad_norm": 0.7540843486785889, + "learning_rate": 2.924642464246425e-05, + "loss": 0.1507, + "num_input_tokens_seen": 2243808, + "step": 10635 + }, + { + "epoch": 1.1705170517051706, + "grad_norm": 0.6868851184844971, + "learning_rate": 2.926017601760176e-05, + "loss": 0.0995, + "num_input_tokens_seen": 2244864, + "step": 10640 + }, + { + "epoch": 1.171067106710671, + "grad_norm": 1.3687561750411987, + "learning_rate": 2.9273927392739276e-05, + "loss": 0.1707, + "num_input_tokens_seen": 2245920, + "step": 10645 + }, + { + "epoch": 1.1716171617161717, + "grad_norm": 0.6743130087852478, + "learning_rate": 2.928767876787679e-05, + "loss": 0.2044, + "num_input_tokens_seen": 2246976, + "step": 10650 + }, + { + "epoch": 1.1721672167216721, + "grad_norm": 0.09466291218996048, + "learning_rate": 2.9301430143014302e-05, + "loss": 0.0671, + "num_input_tokens_seen": 2248064, + "step": 10655 + }, + { + "epoch": 1.1727172717271728, + "grad_norm": 0.5514299273490906, + "learning_rate": 2.9315181518151817e-05, + "loss": 0.1591, + "num_input_tokens_seen": 2249152, + "step": 10660 + }, + { + "epoch": 1.1732673267326732, + "grad_norm": 0.4747658371925354, + "learning_rate": 2.932893289328933e-05, + "loss": 0.0808, + "num_input_tokens_seen": 2250240, + "step": 10665 + }, + { + "epoch": 1.1738173817381738, + "grad_norm": 0.834757924079895, + "learning_rate": 2.9342684268426844e-05, + "loss": 0.0962, + "num_input_tokens_seen": 2251328, + "step": 10670 + }, + { + "epoch": 1.1743674367436743, + "grad_norm": 1.1436487436294556, + "learning_rate": 2.9356435643564356e-05, + "loss": 0.2093, + "num_input_tokens_seen": 2252352, + "step": 10675 + }, + { + "epoch": 1.174917491749175, + "grad_norm": 0.46672457456588745, + "learning_rate": 2.937018701870187e-05, + "loss": 0.0745, + "num_input_tokens_seen": 2253408, + "step": 10680 + }, + { + "epoch": 1.1754675467546756, + "grad_norm": 0.24802210927009583, + "learning_rate": 2.9383938393839382e-05, + "loss": 0.0806, + "num_input_tokens_seen": 2254464, + "step": 10685 + }, + { + "epoch": 1.176017601760176, + "grad_norm": 0.055221475660800934, + "learning_rate": 2.93976897689769e-05, + "loss": 0.0593, + "num_input_tokens_seen": 2255552, + "step": 10690 + }, + { + "epoch": 1.1765676567656767, + "grad_norm": 0.3423124849796295, + "learning_rate": 2.9411441144114416e-05, + "loss": 0.1073, + "num_input_tokens_seen": 2256544, + "step": 10695 + }, + { + "epoch": 1.177117711771177, + "grad_norm": 0.5393990874290466, + "learning_rate": 2.9425192519251928e-05, + "loss": 0.114, + "num_input_tokens_seen": 2257632, + "step": 10700 + }, + { + "epoch": 1.1776677667766777, + "grad_norm": 0.886768102645874, + "learning_rate": 2.9438943894389443e-05, + "loss": 0.1169, + "num_input_tokens_seen": 2258720, + "step": 10705 + }, + { + "epoch": 1.1782178217821782, + "grad_norm": 1.3369195461273193, + "learning_rate": 2.9452695269526954e-05, + "loss": 0.1805, + "num_input_tokens_seen": 2259776, + "step": 10710 + }, + { + "epoch": 1.1787678767876788, + "grad_norm": 0.909748911857605, + "learning_rate": 2.946644664466447e-05, + "loss": 0.1803, + "num_input_tokens_seen": 2260896, + "step": 10715 + }, + { + "epoch": 1.1793179317931792, + "grad_norm": 0.5978996157646179, + "learning_rate": 2.948019801980198e-05, + "loss": 0.1051, + "num_input_tokens_seen": 2261984, + "step": 10720 + }, + { + "epoch": 1.1798679867986799, + "grad_norm": 0.320467472076416, + "learning_rate": 2.9493949394939496e-05, + "loss": 0.1478, + "num_input_tokens_seen": 2263072, + "step": 10725 + }, + { + "epoch": 1.1804180418041805, + "grad_norm": 0.5841292142868042, + "learning_rate": 2.9507700770077008e-05, + "loss": 0.1503, + "num_input_tokens_seen": 2264096, + "step": 10730 + }, + { + "epoch": 1.180968096809681, + "grad_norm": 0.4231630861759186, + "learning_rate": 2.9521452145214523e-05, + "loss": 0.1559, + "num_input_tokens_seen": 2265152, + "step": 10735 + }, + { + "epoch": 1.1815181518151816, + "grad_norm": 0.34051260352134705, + "learning_rate": 2.9535203520352034e-05, + "loss": 0.1314, + "num_input_tokens_seen": 2266176, + "step": 10740 + }, + { + "epoch": 1.182068206820682, + "grad_norm": 1.2998591661453247, + "learning_rate": 2.954895489548955e-05, + "loss": 0.1835, + "num_input_tokens_seen": 2267200, + "step": 10745 + }, + { + "epoch": 1.1826182618261827, + "grad_norm": 0.29588234424591064, + "learning_rate": 2.9562706270627068e-05, + "loss": 0.1816, + "num_input_tokens_seen": 2268256, + "step": 10750 + }, + { + "epoch": 1.183168316831683, + "grad_norm": 0.792872965335846, + "learning_rate": 2.9576457645764576e-05, + "loss": 0.0884, + "num_input_tokens_seen": 2269312, + "step": 10755 + }, + { + "epoch": 1.1837183718371838, + "grad_norm": 0.5595928430557251, + "learning_rate": 2.9590209020902094e-05, + "loss": 0.1368, + "num_input_tokens_seen": 2270400, + "step": 10760 + }, + { + "epoch": 1.1842684268426842, + "grad_norm": 1.2260222434997559, + "learning_rate": 2.9603960396039603e-05, + "loss": 0.1518, + "num_input_tokens_seen": 2271392, + "step": 10765 + }, + { + "epoch": 1.1848184818481848, + "grad_norm": 0.631142258644104, + "learning_rate": 2.961771177117712e-05, + "loss": 0.1102, + "num_input_tokens_seen": 2272448, + "step": 10770 + }, + { + "epoch": 1.1853685368536855, + "grad_norm": 0.48741114139556885, + "learning_rate": 2.963146314631463e-05, + "loss": 0.1159, + "num_input_tokens_seen": 2273504, + "step": 10775 + }, + { + "epoch": 1.185918591859186, + "grad_norm": 0.746793270111084, + "learning_rate": 2.9645214521452148e-05, + "loss": 0.0706, + "num_input_tokens_seen": 2274592, + "step": 10780 + }, + { + "epoch": 1.1864686468646866, + "grad_norm": 1.0383847951889038, + "learning_rate": 2.9658965896589656e-05, + "loss": 0.1278, + "num_input_tokens_seen": 2275648, + "step": 10785 + }, + { + "epoch": 1.187018701870187, + "grad_norm": 0.46405741572380066, + "learning_rate": 2.9672717271727175e-05, + "loss": 0.0331, + "num_input_tokens_seen": 2276736, + "step": 10790 + }, + { + "epoch": 1.1875687568756876, + "grad_norm": 0.11243343353271484, + "learning_rate": 2.968646864686469e-05, + "loss": 0.1231, + "num_input_tokens_seen": 2277856, + "step": 10795 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.8101747035980225, + "learning_rate": 2.97002200220022e-05, + "loss": 0.1493, + "num_input_tokens_seen": 2278944, + "step": 10800 + }, + { + "epoch": 1.1886688668866887, + "grad_norm": 0.2114105373620987, + "learning_rate": 2.9713971397139716e-05, + "loss": 0.1228, + "num_input_tokens_seen": 2279936, + "step": 10805 + }, + { + "epoch": 1.1892189218921891, + "grad_norm": 0.24895958602428436, + "learning_rate": 2.9727722772277228e-05, + "loss": 0.2039, + "num_input_tokens_seen": 2280960, + "step": 10810 + }, + { + "epoch": 1.1897689768976898, + "grad_norm": 0.3981257975101471, + "learning_rate": 2.9741474147414743e-05, + "loss": 0.1282, + "num_input_tokens_seen": 2282048, + "step": 10815 + }, + { + "epoch": 1.1903190319031904, + "grad_norm": 1.244163990020752, + "learning_rate": 2.9755225522552255e-05, + "loss": 0.1724, + "num_input_tokens_seen": 2283008, + "step": 10820 + }, + { + "epoch": 1.1908690869086909, + "grad_norm": 1.5182052850723267, + "learning_rate": 2.976897689768977e-05, + "loss": 0.1537, + "num_input_tokens_seen": 2284128, + "step": 10825 + }, + { + "epoch": 1.1914191419141915, + "grad_norm": 0.47641196846961975, + "learning_rate": 2.978272827282728e-05, + "loss": 0.1182, + "num_input_tokens_seen": 2285152, + "step": 10830 + }, + { + "epoch": 1.191969196919692, + "grad_norm": 1.5749949216842651, + "learning_rate": 2.9796479647964796e-05, + "loss": 0.1346, + "num_input_tokens_seen": 2286208, + "step": 10835 + }, + { + "epoch": 1.1925192519251926, + "grad_norm": 0.6706904768943787, + "learning_rate": 2.9810231023102315e-05, + "loss": 0.1227, + "num_input_tokens_seen": 2287296, + "step": 10840 + }, + { + "epoch": 1.193069306930693, + "grad_norm": 0.19210799038410187, + "learning_rate": 2.9823982398239823e-05, + "loss": 0.081, + "num_input_tokens_seen": 2288288, + "step": 10845 + }, + { + "epoch": 1.1936193619361937, + "grad_norm": 0.7186866998672485, + "learning_rate": 2.983773377337734e-05, + "loss": 0.1271, + "num_input_tokens_seen": 2289248, + "step": 10850 + }, + { + "epoch": 1.194169416941694, + "grad_norm": 0.49643123149871826, + "learning_rate": 2.9851485148514853e-05, + "loss": 0.1477, + "num_input_tokens_seen": 2290240, + "step": 10855 + }, + { + "epoch": 1.1947194719471947, + "grad_norm": 0.3701644837856293, + "learning_rate": 2.9865236523652368e-05, + "loss": 0.1586, + "num_input_tokens_seen": 2291360, + "step": 10860 + }, + { + "epoch": 1.1952695269526954, + "grad_norm": 1.225958228111267, + "learning_rate": 2.987898789878988e-05, + "loss": 0.1409, + "num_input_tokens_seen": 2292480, + "step": 10865 + }, + { + "epoch": 1.1958195819581958, + "grad_norm": 1.5483888387680054, + "learning_rate": 2.9892739273927395e-05, + "loss": 0.1234, + "num_input_tokens_seen": 2293504, + "step": 10870 + }, + { + "epoch": 1.1963696369636962, + "grad_norm": 0.8685733675956726, + "learning_rate": 2.9906490649064907e-05, + "loss": 0.1146, + "num_input_tokens_seen": 2294528, + "step": 10875 + }, + { + "epoch": 1.196919691969197, + "grad_norm": 0.3359638750553131, + "learning_rate": 2.992024202420242e-05, + "loss": 0.1826, + "num_input_tokens_seen": 2295616, + "step": 10880 + }, + { + "epoch": 1.1974697469746975, + "grad_norm": 0.7423651814460754, + "learning_rate": 2.9933993399339937e-05, + "loss": 0.1326, + "num_input_tokens_seen": 2296736, + "step": 10885 + }, + { + "epoch": 1.198019801980198, + "grad_norm": 1.440860390663147, + "learning_rate": 2.994774477447745e-05, + "loss": 0.1107, + "num_input_tokens_seen": 2297824, + "step": 10890 + }, + { + "epoch": 1.1985698569856986, + "grad_norm": 0.896753191947937, + "learning_rate": 2.9961496149614963e-05, + "loss": 0.1325, + "num_input_tokens_seen": 2298880, + "step": 10895 + }, + { + "epoch": 1.199119911991199, + "grad_norm": 0.8535280823707581, + "learning_rate": 2.9975247524752475e-05, + "loss": 0.097, + "num_input_tokens_seen": 2299968, + "step": 10900 + }, + { + "epoch": 1.1996699669966997, + "grad_norm": 0.8836201429367065, + "learning_rate": 2.9988998899889993e-05, + "loss": 0.1536, + "num_input_tokens_seen": 2301024, + "step": 10905 + }, + { + "epoch": 1.2002200220022001, + "grad_norm": 1.063437819480896, + "learning_rate": 3.0002750275027502e-05, + "loss": 0.1318, + "num_input_tokens_seen": 2302048, + "step": 10910 + }, + { + "epoch": 1.2007700770077008, + "grad_norm": 0.4040836989879608, + "learning_rate": 3.001650165016502e-05, + "loss": 0.0867, + "num_input_tokens_seen": 2303072, + "step": 10915 + }, + { + "epoch": 1.2013201320132012, + "grad_norm": 1.3413995504379272, + "learning_rate": 3.003025302530253e-05, + "loss": 0.0809, + "num_input_tokens_seen": 2304160, + "step": 10920 + }, + { + "epoch": 1.2018701870187019, + "grad_norm": 0.3917124569416046, + "learning_rate": 3.0044004400440047e-05, + "loss": 0.0964, + "num_input_tokens_seen": 2305184, + "step": 10925 + }, + { + "epoch": 1.2024202420242025, + "grad_norm": 0.3123643100261688, + "learning_rate": 3.0057755775577562e-05, + "loss": 0.0736, + "num_input_tokens_seen": 2306176, + "step": 10930 + }, + { + "epoch": 1.202970297029703, + "grad_norm": 0.20161400735378265, + "learning_rate": 3.0071507150715074e-05, + "loss": 0.0725, + "num_input_tokens_seen": 2307232, + "step": 10935 + }, + { + "epoch": 1.2035203520352036, + "grad_norm": 0.1548217386007309, + "learning_rate": 3.008525852585259e-05, + "loss": 0.064, + "num_input_tokens_seen": 2308320, + "step": 10940 + }, + { + "epoch": 1.204070407040704, + "grad_norm": 0.4144607186317444, + "learning_rate": 3.00990099009901e-05, + "loss": 0.0833, + "num_input_tokens_seen": 2309344, + "step": 10945 + }, + { + "epoch": 1.2046204620462047, + "grad_norm": 0.6727710366249084, + "learning_rate": 3.0112761276127615e-05, + "loss": 0.1547, + "num_input_tokens_seen": 2310368, + "step": 10950 + }, + { + "epoch": 1.205170517051705, + "grad_norm": 1.6164000034332275, + "learning_rate": 3.0126512651265127e-05, + "loss": 0.1934, + "num_input_tokens_seen": 2311360, + "step": 10955 + }, + { + "epoch": 1.2057205720572057, + "grad_norm": 0.2144426703453064, + "learning_rate": 3.0140264026402642e-05, + "loss": 0.1172, + "num_input_tokens_seen": 2312416, + "step": 10960 + }, + { + "epoch": 1.2062706270627062, + "grad_norm": 0.6464664936065674, + "learning_rate": 3.0154015401540154e-05, + "loss": 0.1503, + "num_input_tokens_seen": 2313408, + "step": 10965 + }, + { + "epoch": 1.2068206820682068, + "grad_norm": 0.15754584968090057, + "learning_rate": 3.016776677667767e-05, + "loss": 0.1558, + "num_input_tokens_seen": 2314400, + "step": 10970 + }, + { + "epoch": 1.2073707370737075, + "grad_norm": 0.9450722932815552, + "learning_rate": 3.0181518151815187e-05, + "loss": 0.1192, + "num_input_tokens_seen": 2315424, + "step": 10975 + }, + { + "epoch": 1.2079207920792079, + "grad_norm": 1.51470947265625, + "learning_rate": 3.0195269526952695e-05, + "loss": 0.1587, + "num_input_tokens_seen": 2316576, + "step": 10980 + }, + { + "epoch": 1.2084708470847085, + "grad_norm": 0.5233744978904724, + "learning_rate": 3.0209020902090214e-05, + "loss": 0.1606, + "num_input_tokens_seen": 2317664, + "step": 10985 + }, + { + "epoch": 1.209020902090209, + "grad_norm": 0.8217519521713257, + "learning_rate": 3.0222772277227722e-05, + "loss": 0.1246, + "num_input_tokens_seen": 2318688, + "step": 10990 + }, + { + "epoch": 1.2095709570957096, + "grad_norm": 0.2787885069847107, + "learning_rate": 3.023652365236524e-05, + "loss": 0.1006, + "num_input_tokens_seen": 2319776, + "step": 10995 + }, + { + "epoch": 1.21012101210121, + "grad_norm": 0.3593376874923706, + "learning_rate": 3.025027502750275e-05, + "loss": 0.2938, + "num_input_tokens_seen": 2320896, + "step": 11000 + }, + { + "epoch": 1.2106710671067107, + "grad_norm": 1.0639126300811768, + "learning_rate": 3.0264026402640267e-05, + "loss": 0.1637, + "num_input_tokens_seen": 2322016, + "step": 11005 + }, + { + "epoch": 1.2112211221122111, + "grad_norm": 0.7233069539070129, + "learning_rate": 3.0277777777777776e-05, + "loss": 0.1138, + "num_input_tokens_seen": 2323072, + "step": 11010 + }, + { + "epoch": 1.2117711771177118, + "grad_norm": 0.7517113089561462, + "learning_rate": 3.0291529152915294e-05, + "loss": 0.1686, + "num_input_tokens_seen": 2324096, + "step": 11015 + }, + { + "epoch": 1.2123212321232124, + "grad_norm": 0.8493897318840027, + "learning_rate": 3.0305280528052806e-05, + "loss": 0.1371, + "num_input_tokens_seen": 2325120, + "step": 11020 + }, + { + "epoch": 1.2128712871287128, + "grad_norm": 0.5031720995903015, + "learning_rate": 3.031903190319032e-05, + "loss": 0.1086, + "num_input_tokens_seen": 2326144, + "step": 11025 + }, + { + "epoch": 1.2134213421342135, + "grad_norm": 0.5483672618865967, + "learning_rate": 3.0332783278327836e-05, + "loss": 0.1489, + "num_input_tokens_seen": 2327296, + "step": 11030 + }, + { + "epoch": 1.213971397139714, + "grad_norm": 0.4189458191394806, + "learning_rate": 3.0346534653465347e-05, + "loss": 0.1344, + "num_input_tokens_seen": 2328384, + "step": 11035 + }, + { + "epoch": 1.2145214521452146, + "grad_norm": 1.884634256362915, + "learning_rate": 3.0360286028602862e-05, + "loss": 0.2015, + "num_input_tokens_seen": 2329408, + "step": 11040 + }, + { + "epoch": 1.215071507150715, + "grad_norm": 0.6886909604072571, + "learning_rate": 3.0374037403740374e-05, + "loss": 0.1002, + "num_input_tokens_seen": 2330464, + "step": 11045 + }, + { + "epoch": 1.2156215621562156, + "grad_norm": 0.12160451710224152, + "learning_rate": 3.038778877887789e-05, + "loss": 0.1376, + "num_input_tokens_seen": 2331488, + "step": 11050 + }, + { + "epoch": 1.216171617161716, + "grad_norm": 0.47056862711906433, + "learning_rate": 3.04015401540154e-05, + "loss": 0.1531, + "num_input_tokens_seen": 2332608, + "step": 11055 + }, + { + "epoch": 1.2167216721672167, + "grad_norm": 0.4848686158657074, + "learning_rate": 3.0415291529152916e-05, + "loss": 0.1927, + "num_input_tokens_seen": 2333760, + "step": 11060 + }, + { + "epoch": 1.2172717271727174, + "grad_norm": 0.8077079057693481, + "learning_rate": 3.0429042904290427e-05, + "loss": 0.2436, + "num_input_tokens_seen": 2334784, + "step": 11065 + }, + { + "epoch": 1.2178217821782178, + "grad_norm": 0.3898846209049225, + "learning_rate": 3.0442794279427942e-05, + "loss": 0.11, + "num_input_tokens_seen": 2335808, + "step": 11070 + }, + { + "epoch": 1.2183718371837184, + "grad_norm": 0.5157893896102905, + "learning_rate": 3.045654565456546e-05, + "loss": 0.1592, + "num_input_tokens_seen": 2336864, + "step": 11075 + }, + { + "epoch": 1.2189218921892189, + "grad_norm": 0.4371909201145172, + "learning_rate": 3.0470297029702973e-05, + "loss": 0.1563, + "num_input_tokens_seen": 2337856, + "step": 11080 + }, + { + "epoch": 1.2194719471947195, + "grad_norm": 0.3675212860107422, + "learning_rate": 3.0484048404840488e-05, + "loss": 0.1315, + "num_input_tokens_seen": 2338912, + "step": 11085 + }, + { + "epoch": 1.22002200220022, + "grad_norm": 0.7085523009300232, + "learning_rate": 3.0497799779978e-05, + "loss": 0.1696, + "num_input_tokens_seen": 2339968, + "step": 11090 + }, + { + "epoch": 1.2205720572057206, + "grad_norm": 0.33060741424560547, + "learning_rate": 3.0511551155115514e-05, + "loss": 0.2069, + "num_input_tokens_seen": 2341120, + "step": 11095 + }, + { + "epoch": 1.221122112211221, + "grad_norm": 0.5265447497367859, + "learning_rate": 3.052530253025302e-05, + "loss": 0.0666, + "num_input_tokens_seen": 2342144, + "step": 11100 + }, + { + "epoch": 1.2216721672167217, + "grad_norm": 0.33994901180267334, + "learning_rate": 3.053905390539054e-05, + "loss": 0.1248, + "num_input_tokens_seen": 2343168, + "step": 11105 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.42572546005249023, + "learning_rate": 3.055280528052805e-05, + "loss": 0.1288, + "num_input_tokens_seen": 2344256, + "step": 11110 + }, + { + "epoch": 1.2227722772277227, + "grad_norm": 3.567314624786377, + "learning_rate": 3.056655665566557e-05, + "loss": 0.3287, + "num_input_tokens_seen": 2345280, + "step": 11115 + }, + { + "epoch": 1.2233223322332234, + "grad_norm": 1.0585107803344727, + "learning_rate": 3.058030803080308e-05, + "loss": 0.1318, + "num_input_tokens_seen": 2346272, + "step": 11120 + }, + { + "epoch": 1.2238723872387238, + "grad_norm": 0.4544161558151245, + "learning_rate": 3.0594059405940594e-05, + "loss": 0.0764, + "num_input_tokens_seen": 2347296, + "step": 11125 + }, + { + "epoch": 1.2244224422442245, + "grad_norm": 0.27187296748161316, + "learning_rate": 3.060781078107811e-05, + "loss": 0.1047, + "num_input_tokens_seen": 2348352, + "step": 11130 + }, + { + "epoch": 1.224972497249725, + "grad_norm": 1.7237119674682617, + "learning_rate": 3.0621562156215624e-05, + "loss": 0.192, + "num_input_tokens_seen": 2349440, + "step": 11135 + }, + { + "epoch": 1.2255225522552256, + "grad_norm": 0.5063005685806274, + "learning_rate": 3.0635313531353136e-05, + "loss": 0.2605, + "num_input_tokens_seen": 2350560, + "step": 11140 + }, + { + "epoch": 1.226072607260726, + "grad_norm": 3.138986587524414, + "learning_rate": 3.064906490649065e-05, + "loss": 0.237, + "num_input_tokens_seen": 2351616, + "step": 11145 + }, + { + "epoch": 1.2266226622662266, + "grad_norm": 0.5671175718307495, + "learning_rate": 3.0662816281628166e-05, + "loss": 0.1101, + "num_input_tokens_seen": 2352704, + "step": 11150 + }, + { + "epoch": 1.2271727172717273, + "grad_norm": 0.18427348136901855, + "learning_rate": 3.067656765676568e-05, + "loss": 0.1032, + "num_input_tokens_seen": 2353728, + "step": 11155 + }, + { + "epoch": 1.2277227722772277, + "grad_norm": 0.551439106464386, + "learning_rate": 3.069031903190319e-05, + "loss": 0.1316, + "num_input_tokens_seen": 2354784, + "step": 11160 + }, + { + "epoch": 1.2282728272827284, + "grad_norm": 0.19831177592277527, + "learning_rate": 3.070407040704071e-05, + "loss": 0.0913, + "num_input_tokens_seen": 2355776, + "step": 11165 + }, + { + "epoch": 1.2288228822882288, + "grad_norm": 0.4658576250076294, + "learning_rate": 3.071782178217822e-05, + "loss": 0.0858, + "num_input_tokens_seen": 2356768, + "step": 11170 + }, + { + "epoch": 1.2293729372937294, + "grad_norm": 0.5520645380020142, + "learning_rate": 3.073157315731574e-05, + "loss": 0.1105, + "num_input_tokens_seen": 2357824, + "step": 11175 + }, + { + "epoch": 1.2299229922992299, + "grad_norm": 1.420275092124939, + "learning_rate": 3.074532453245324e-05, + "loss": 0.2521, + "num_input_tokens_seen": 2358816, + "step": 11180 + }, + { + "epoch": 1.2304730473047305, + "grad_norm": 0.39778175950050354, + "learning_rate": 3.075907590759076e-05, + "loss": 0.1233, + "num_input_tokens_seen": 2359776, + "step": 11185 + }, + { + "epoch": 1.231023102310231, + "grad_norm": 0.4553810954093933, + "learning_rate": 3.077282728272827e-05, + "loss": 0.1235, + "num_input_tokens_seen": 2360864, + "step": 11190 + }, + { + "epoch": 1.2315731573157316, + "grad_norm": 1.9631881713867188, + "learning_rate": 3.078657865786579e-05, + "loss": 0.1583, + "num_input_tokens_seen": 2361984, + "step": 11195 + }, + { + "epoch": 1.2321232123212322, + "grad_norm": 1.1149675846099854, + "learning_rate": 3.0800330033003296e-05, + "loss": 0.149, + "num_input_tokens_seen": 2363104, + "step": 11200 + }, + { + "epoch": 1.2326732673267327, + "grad_norm": 1.1655442714691162, + "learning_rate": 3.0814081408140815e-05, + "loss": 0.2058, + "num_input_tokens_seen": 2364224, + "step": 11205 + }, + { + "epoch": 1.2332233223322333, + "grad_norm": 0.3527495265007019, + "learning_rate": 3.082783278327833e-05, + "loss": 0.0879, + "num_input_tokens_seen": 2365280, + "step": 11210 + }, + { + "epoch": 1.2337733773377337, + "grad_norm": 1.090828537940979, + "learning_rate": 3.0841584158415845e-05, + "loss": 0.106, + "num_input_tokens_seen": 2366400, + "step": 11215 + }, + { + "epoch": 1.2343234323432344, + "grad_norm": 0.4719640016555786, + "learning_rate": 3.0855335533553357e-05, + "loss": 0.1499, + "num_input_tokens_seen": 2367456, + "step": 11220 + }, + { + "epoch": 1.2348734873487348, + "grad_norm": 0.6206756234169006, + "learning_rate": 3.086908690869087e-05, + "loss": 0.1464, + "num_input_tokens_seen": 2368608, + "step": 11225 + }, + { + "epoch": 1.2354235423542355, + "grad_norm": 0.7470353841781616, + "learning_rate": 3.0882838283828387e-05, + "loss": 0.165, + "num_input_tokens_seen": 2369632, + "step": 11230 + }, + { + "epoch": 1.2359735973597359, + "grad_norm": 0.4485289752483368, + "learning_rate": 3.08965896589659e-05, + "loss": 0.1024, + "num_input_tokens_seen": 2370720, + "step": 11235 + }, + { + "epoch": 1.2365236523652365, + "grad_norm": 0.8550967574119568, + "learning_rate": 3.091034103410341e-05, + "loss": 0.0763, + "num_input_tokens_seen": 2371744, + "step": 11240 + }, + { + "epoch": 1.2370737073707372, + "grad_norm": 1.6188687086105347, + "learning_rate": 3.092409240924092e-05, + "loss": 0.2163, + "num_input_tokens_seen": 2372768, + "step": 11245 + }, + { + "epoch": 1.2376237623762376, + "grad_norm": 1.2215383052825928, + "learning_rate": 3.093784378437844e-05, + "loss": 0.1466, + "num_input_tokens_seen": 2373792, + "step": 11250 + }, + { + "epoch": 1.2381738173817383, + "grad_norm": 0.6778189539909363, + "learning_rate": 3.095159515951595e-05, + "loss": 0.1401, + "num_input_tokens_seen": 2374848, + "step": 11255 + }, + { + "epoch": 1.2387238723872387, + "grad_norm": 0.9825783967971802, + "learning_rate": 3.096534653465346e-05, + "loss": 0.1587, + "num_input_tokens_seen": 2375872, + "step": 11260 + }, + { + "epoch": 1.2392739273927393, + "grad_norm": 1.102650761604309, + "learning_rate": 3.097909790979098e-05, + "loss": 0.118, + "num_input_tokens_seen": 2376960, + "step": 11265 + }, + { + "epoch": 1.2398239823982398, + "grad_norm": 0.4484265148639679, + "learning_rate": 3.0992849284928493e-05, + "loss": 0.0869, + "num_input_tokens_seen": 2378016, + "step": 11270 + }, + { + "epoch": 1.2403740374037404, + "grad_norm": 1.2899274826049805, + "learning_rate": 3.100660066006601e-05, + "loss": 0.1053, + "num_input_tokens_seen": 2379104, + "step": 11275 + }, + { + "epoch": 1.2409240924092408, + "grad_norm": 0.1679367870092392, + "learning_rate": 3.1020352035203523e-05, + "loss": 0.0432, + "num_input_tokens_seen": 2380192, + "step": 11280 + }, + { + "epoch": 1.2414741474147415, + "grad_norm": 1.4093217849731445, + "learning_rate": 3.1034103410341035e-05, + "loss": 0.1017, + "num_input_tokens_seen": 2381216, + "step": 11285 + }, + { + "epoch": 1.2420242024202421, + "grad_norm": 0.29530397057533264, + "learning_rate": 3.104785478547855e-05, + "loss": 0.1382, + "num_input_tokens_seen": 2382304, + "step": 11290 + }, + { + "epoch": 1.2425742574257426, + "grad_norm": 1.3826783895492554, + "learning_rate": 3.1061606160616065e-05, + "loss": 0.1473, + "num_input_tokens_seen": 2383424, + "step": 11295 + }, + { + "epoch": 1.2431243124312432, + "grad_norm": 0.826379120349884, + "learning_rate": 3.107535753575358e-05, + "loss": 0.1049, + "num_input_tokens_seen": 2384480, + "step": 11300 + }, + { + "epoch": 1.2436743674367436, + "grad_norm": 0.7393776774406433, + "learning_rate": 3.108910891089109e-05, + "loss": 0.1443, + "num_input_tokens_seen": 2385472, + "step": 11305 + }, + { + "epoch": 1.2442244224422443, + "grad_norm": 0.8649192452430725, + "learning_rate": 3.110286028602861e-05, + "loss": 0.1382, + "num_input_tokens_seen": 2386528, + "step": 11310 + }, + { + "epoch": 1.2447744774477447, + "grad_norm": 1.404071569442749, + "learning_rate": 3.111661166116612e-05, + "loss": 0.0959, + "num_input_tokens_seen": 2387616, + "step": 11315 + }, + { + "epoch": 1.2453245324532454, + "grad_norm": 0.13211053609848022, + "learning_rate": 3.113036303630363e-05, + "loss": 0.145, + "num_input_tokens_seen": 2388608, + "step": 11320 + }, + { + "epoch": 1.2458745874587458, + "grad_norm": 0.3369462788105011, + "learning_rate": 3.114411441144114e-05, + "loss": 0.0418, + "num_input_tokens_seen": 2389664, + "step": 11325 + }, + { + "epoch": 1.2464246424642464, + "grad_norm": 0.39086318016052246, + "learning_rate": 3.115786578657866e-05, + "loss": 0.0847, + "num_input_tokens_seen": 2390656, + "step": 11330 + }, + { + "epoch": 1.246974697469747, + "grad_norm": 0.5785441398620605, + "learning_rate": 3.117161716171617e-05, + "loss": 0.2043, + "num_input_tokens_seen": 2391744, + "step": 11335 + }, + { + "epoch": 1.2475247524752475, + "grad_norm": 0.3564135730266571, + "learning_rate": 3.118536853685369e-05, + "loss": 0.078, + "num_input_tokens_seen": 2392800, + "step": 11340 + }, + { + "epoch": 1.2480748074807482, + "grad_norm": 0.44140028953552246, + "learning_rate": 3.1199119911991195e-05, + "loss": 0.1546, + "num_input_tokens_seen": 2393856, + "step": 11345 + }, + { + "epoch": 1.2486248624862486, + "grad_norm": 0.348808616399765, + "learning_rate": 3.1212871287128714e-05, + "loss": 0.0736, + "num_input_tokens_seen": 2394944, + "step": 11350 + }, + { + "epoch": 1.2491749174917492, + "grad_norm": 0.6475159525871277, + "learning_rate": 3.122662266226623e-05, + "loss": 0.1536, + "num_input_tokens_seen": 2395968, + "step": 11355 + }, + { + "epoch": 1.2497249724972497, + "grad_norm": 0.25745901465415955, + "learning_rate": 3.1240374037403744e-05, + "loss": 0.0855, + "num_input_tokens_seen": 2396992, + "step": 11360 + }, + { + "epoch": 1.2502750275027503, + "grad_norm": 0.28426840901374817, + "learning_rate": 3.1254125412541256e-05, + "loss": 0.066, + "num_input_tokens_seen": 2398080, + "step": 11365 + }, + { + "epoch": 1.2508250825082508, + "grad_norm": 0.6020145416259766, + "learning_rate": 3.126787678767877e-05, + "loss": 0.1397, + "num_input_tokens_seen": 2399200, + "step": 11370 + }, + { + "epoch": 1.2513751375137514, + "grad_norm": 1.2067800760269165, + "learning_rate": 3.1281628162816286e-05, + "loss": 0.1416, + "num_input_tokens_seen": 2400224, + "step": 11375 + }, + { + "epoch": 1.251925192519252, + "grad_norm": 0.7839394807815552, + "learning_rate": 3.12953795379538e-05, + "loss": 0.0871, + "num_input_tokens_seen": 2401216, + "step": 11380 + }, + { + "epoch": 1.2524752475247525, + "grad_norm": 0.473944753408432, + "learning_rate": 3.130913091309131e-05, + "loss": 0.1569, + "num_input_tokens_seen": 2402240, + "step": 11385 + }, + { + "epoch": 1.253025302530253, + "grad_norm": 0.2275448590517044, + "learning_rate": 3.132288228822882e-05, + "loss": 0.1231, + "num_input_tokens_seen": 2403392, + "step": 11390 + }, + { + "epoch": 1.2535753575357536, + "grad_norm": 0.6507533192634583, + "learning_rate": 3.133663366336634e-05, + "loss": 0.1828, + "num_input_tokens_seen": 2404480, + "step": 11395 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.8631024956703186, + "learning_rate": 3.135038503850386e-05, + "loss": 0.1157, + "num_input_tokens_seen": 2405568, + "step": 11400 + }, + { + "epoch": 1.2546754675467546, + "grad_norm": 0.4128894507884979, + "learning_rate": 3.136413641364136e-05, + "loss": 0.1473, + "num_input_tokens_seen": 2406688, + "step": 11405 + }, + { + "epoch": 1.2552255225522553, + "grad_norm": 0.17755667865276337, + "learning_rate": 3.137788778877888e-05, + "loss": 0.1068, + "num_input_tokens_seen": 2407744, + "step": 11410 + }, + { + "epoch": 1.2557755775577557, + "grad_norm": 1.446066975593567, + "learning_rate": 3.139163916391639e-05, + "loss": 0.1511, + "num_input_tokens_seen": 2408800, + "step": 11415 + }, + { + "epoch": 1.2563256325632564, + "grad_norm": 0.6133572459220886, + "learning_rate": 3.140539053905391e-05, + "loss": 0.1255, + "num_input_tokens_seen": 2409888, + "step": 11420 + }, + { + "epoch": 1.256875687568757, + "grad_norm": 1.5946720838546753, + "learning_rate": 3.1419141914191416e-05, + "loss": 0.0921, + "num_input_tokens_seen": 2410944, + "step": 11425 + }, + { + "epoch": 1.2574257425742574, + "grad_norm": 0.3163400888442993, + "learning_rate": 3.1432893289328934e-05, + "loss": 0.1161, + "num_input_tokens_seen": 2412000, + "step": 11430 + }, + { + "epoch": 1.2579757975797579, + "grad_norm": 1.3950716257095337, + "learning_rate": 3.1446644664466446e-05, + "loss": 0.2658, + "num_input_tokens_seen": 2413024, + "step": 11435 + }, + { + "epoch": 1.2585258525852585, + "grad_norm": 0.5446452498435974, + "learning_rate": 3.1460396039603964e-05, + "loss": 0.1978, + "num_input_tokens_seen": 2414144, + "step": 11440 + }, + { + "epoch": 1.2590759075907592, + "grad_norm": 0.46271270513534546, + "learning_rate": 3.1474147414741476e-05, + "loss": 0.1473, + "num_input_tokens_seen": 2415168, + "step": 11445 + }, + { + "epoch": 1.2596259625962596, + "grad_norm": 0.3797013759613037, + "learning_rate": 3.148789878987899e-05, + "loss": 0.1196, + "num_input_tokens_seen": 2416256, + "step": 11450 + }, + { + "epoch": 1.2601760176017602, + "grad_norm": 1.6309878826141357, + "learning_rate": 3.1501650165016506e-05, + "loss": 0.1354, + "num_input_tokens_seen": 2417280, + "step": 11455 + }, + { + "epoch": 1.2607260726072607, + "grad_norm": 0.760063886642456, + "learning_rate": 3.151540154015402e-05, + "loss": 0.1432, + "num_input_tokens_seen": 2418304, + "step": 11460 + }, + { + "epoch": 1.2612761276127613, + "grad_norm": 0.45694002509117126, + "learning_rate": 3.152915291529153e-05, + "loss": 0.0807, + "num_input_tokens_seen": 2419328, + "step": 11465 + }, + { + "epoch": 1.261826182618262, + "grad_norm": 0.29843881726264954, + "learning_rate": 3.154290429042904e-05, + "loss": 0.0605, + "num_input_tokens_seen": 2420384, + "step": 11470 + }, + { + "epoch": 1.2623762376237624, + "grad_norm": 0.5874069333076477, + "learning_rate": 3.155665566556656e-05, + "loss": 0.0835, + "num_input_tokens_seen": 2421440, + "step": 11475 + }, + { + "epoch": 1.2629262926292628, + "grad_norm": 0.6923736333847046, + "learning_rate": 3.157040704070407e-05, + "loss": 0.1775, + "num_input_tokens_seen": 2422464, + "step": 11480 + }, + { + "epoch": 1.2634763476347635, + "grad_norm": 0.6658527255058289, + "learning_rate": 3.158415841584158e-05, + "loss": 0.0957, + "num_input_tokens_seen": 2423520, + "step": 11485 + }, + { + "epoch": 1.2640264026402641, + "grad_norm": 1.8680907487869263, + "learning_rate": 3.1597909790979094e-05, + "loss": 0.1352, + "num_input_tokens_seen": 2424672, + "step": 11490 + }, + { + "epoch": 1.2645764576457645, + "grad_norm": 0.6656249761581421, + "learning_rate": 3.161166116611661e-05, + "loss": 0.1318, + "num_input_tokens_seen": 2425696, + "step": 11495 + }, + { + "epoch": 1.2651265126512652, + "grad_norm": 0.5501734018325806, + "learning_rate": 3.162541254125413e-05, + "loss": 0.1442, + "num_input_tokens_seen": 2426656, + "step": 11500 + }, + { + "epoch": 1.2656765676567656, + "grad_norm": 0.2774524390697479, + "learning_rate": 3.163916391639164e-05, + "loss": 0.0918, + "num_input_tokens_seen": 2427776, + "step": 11505 + }, + { + "epoch": 1.2662266226622663, + "grad_norm": 0.37342894077301025, + "learning_rate": 3.1652915291529154e-05, + "loss": 0.0398, + "num_input_tokens_seen": 2428832, + "step": 11510 + }, + { + "epoch": 1.266776677667767, + "grad_norm": 0.8919945359230042, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.1654, + "num_input_tokens_seen": 2429888, + "step": 11515 + }, + { + "epoch": 1.2673267326732673, + "grad_norm": 1.9576950073242188, + "learning_rate": 3.1680418041804185e-05, + "loss": 0.243, + "num_input_tokens_seen": 2430976, + "step": 11520 + }, + { + "epoch": 1.2678767876787678, + "grad_norm": 1.4756892919540405, + "learning_rate": 3.1694169416941696e-05, + "loss": 0.19, + "num_input_tokens_seen": 2432032, + "step": 11525 + }, + { + "epoch": 1.2684268426842684, + "grad_norm": 0.2657015025615692, + "learning_rate": 3.170792079207921e-05, + "loss": 0.086, + "num_input_tokens_seen": 2433120, + "step": 11530 + }, + { + "epoch": 1.268976897689769, + "grad_norm": 0.47228559851646423, + "learning_rate": 3.172167216721672e-05, + "loss": 0.0658, + "num_input_tokens_seen": 2434240, + "step": 11535 + }, + { + "epoch": 1.2695269526952695, + "grad_norm": 0.0535954013466835, + "learning_rate": 3.173542354235424e-05, + "loss": 0.0875, + "num_input_tokens_seen": 2435296, + "step": 11540 + }, + { + "epoch": 1.2700770077007701, + "grad_norm": 0.968681812286377, + "learning_rate": 3.174917491749175e-05, + "loss": 0.1147, + "num_input_tokens_seen": 2436384, + "step": 11545 + }, + { + "epoch": 1.2706270627062706, + "grad_norm": 0.9924623966217041, + "learning_rate": 3.176292629262926e-05, + "loss": 0.0591, + "num_input_tokens_seen": 2437408, + "step": 11550 + }, + { + "epoch": 1.2711771177117712, + "grad_norm": 1.0655251741409302, + "learning_rate": 3.177667766776678e-05, + "loss": 0.0593, + "num_input_tokens_seen": 2438464, + "step": 11555 + }, + { + "epoch": 1.2717271727172716, + "grad_norm": 0.9566510319709778, + "learning_rate": 3.179042904290429e-05, + "loss": 0.1646, + "num_input_tokens_seen": 2439488, + "step": 11560 + }, + { + "epoch": 1.2722772277227723, + "grad_norm": 0.5370635986328125, + "learning_rate": 3.180418041804181e-05, + "loss": 0.1821, + "num_input_tokens_seen": 2440544, + "step": 11565 + }, + { + "epoch": 1.2728272827282727, + "grad_norm": 1.1610918045043945, + "learning_rate": 3.1817931793179315e-05, + "loss": 0.1753, + "num_input_tokens_seen": 2441600, + "step": 11570 + }, + { + "epoch": 1.2733773377337734, + "grad_norm": 0.48623159527778625, + "learning_rate": 3.183168316831683e-05, + "loss": 0.1107, + "num_input_tokens_seen": 2442656, + "step": 11575 + }, + { + "epoch": 1.273927392739274, + "grad_norm": 0.6991000175476074, + "learning_rate": 3.1845434543454345e-05, + "loss": 0.1464, + "num_input_tokens_seen": 2443680, + "step": 11580 + }, + { + "epoch": 1.2744774477447744, + "grad_norm": 0.501965343952179, + "learning_rate": 3.185918591859186e-05, + "loss": 0.0851, + "num_input_tokens_seen": 2444736, + "step": 11585 + }, + { + "epoch": 1.275027502750275, + "grad_norm": 0.4980604350566864, + "learning_rate": 3.1872937293729375e-05, + "loss": 0.2091, + "num_input_tokens_seen": 2445824, + "step": 11590 + }, + { + "epoch": 1.2755775577557755, + "grad_norm": 0.6737959384918213, + "learning_rate": 3.1886688668866887e-05, + "loss": 0.0661, + "num_input_tokens_seen": 2446816, + "step": 11595 + }, + { + "epoch": 1.2761276127612762, + "grad_norm": 0.7333210706710815, + "learning_rate": 3.1900440044004405e-05, + "loss": 0.1152, + "num_input_tokens_seen": 2447904, + "step": 11600 + }, + { + "epoch": 1.2766776677667766, + "grad_norm": 0.6960572600364685, + "learning_rate": 3.1914191419141917e-05, + "loss": 0.1476, + "num_input_tokens_seen": 2448960, + "step": 11605 + }, + { + "epoch": 1.2772277227722773, + "grad_norm": 0.9466004371643066, + "learning_rate": 3.192794279427943e-05, + "loss": 0.1942, + "num_input_tokens_seen": 2450016, + "step": 11610 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.3369308114051819, + "learning_rate": 3.194169416941694e-05, + "loss": 0.1212, + "num_input_tokens_seen": 2451072, + "step": 11615 + }, + { + "epoch": 1.2783278327832783, + "grad_norm": 2.24379563331604, + "learning_rate": 3.195544554455446e-05, + "loss": 0.1887, + "num_input_tokens_seen": 2452128, + "step": 11620 + }, + { + "epoch": 1.278877887788779, + "grad_norm": 1.3083982467651367, + "learning_rate": 3.196919691969197e-05, + "loss": 0.1341, + "num_input_tokens_seen": 2453216, + "step": 11625 + }, + { + "epoch": 1.2794279427942794, + "grad_norm": 0.562201738357544, + "learning_rate": 3.198294829482948e-05, + "loss": 0.0838, + "num_input_tokens_seen": 2454304, + "step": 11630 + }, + { + "epoch": 1.27997799779978, + "grad_norm": 0.9387811422348022, + "learning_rate": 3.1996699669967e-05, + "loss": 0.0938, + "num_input_tokens_seen": 2455392, + "step": 11635 + }, + { + "epoch": 1.2805280528052805, + "grad_norm": 1.5632662773132324, + "learning_rate": 3.201045104510451e-05, + "loss": 0.1085, + "num_input_tokens_seen": 2456512, + "step": 11640 + }, + { + "epoch": 1.2810781078107811, + "grad_norm": 0.7213608622550964, + "learning_rate": 3.202420242024203e-05, + "loss": 0.1425, + "num_input_tokens_seen": 2457504, + "step": 11645 + }, + { + "epoch": 1.2816281628162816, + "grad_norm": 0.8651918768882751, + "learning_rate": 3.2037953795379535e-05, + "loss": 0.1287, + "num_input_tokens_seen": 2458592, + "step": 11650 + }, + { + "epoch": 1.2821782178217822, + "grad_norm": 0.6669200658798218, + "learning_rate": 3.2051705170517053e-05, + "loss": 0.0944, + "num_input_tokens_seen": 2459616, + "step": 11655 + }, + { + "epoch": 1.2827282728272826, + "grad_norm": 0.4502369463443756, + "learning_rate": 3.2065456545654565e-05, + "loss": 0.1404, + "num_input_tokens_seen": 2460704, + "step": 11660 + }, + { + "epoch": 1.2832783278327833, + "grad_norm": 0.5317671298980713, + "learning_rate": 3.2079207920792084e-05, + "loss": 0.1363, + "num_input_tokens_seen": 2461760, + "step": 11665 + }, + { + "epoch": 1.283828382838284, + "grad_norm": 0.7345555424690247, + "learning_rate": 3.2092959295929595e-05, + "loss": 0.1605, + "num_input_tokens_seen": 2462816, + "step": 11670 + }, + { + "epoch": 1.2843784378437844, + "grad_norm": 0.6800090074539185, + "learning_rate": 3.210671067106711e-05, + "loss": 0.0754, + "num_input_tokens_seen": 2463936, + "step": 11675 + }, + { + "epoch": 1.284928492849285, + "grad_norm": 0.22642774879932404, + "learning_rate": 3.2120462046204625e-05, + "loss": 0.1045, + "num_input_tokens_seen": 2464928, + "step": 11680 + }, + { + "epoch": 1.2854785478547854, + "grad_norm": 0.1645340621471405, + "learning_rate": 3.213421342134214e-05, + "loss": 0.1292, + "num_input_tokens_seen": 2466016, + "step": 11685 + }, + { + "epoch": 1.286028602860286, + "grad_norm": 0.0512181892991066, + "learning_rate": 3.214796479647965e-05, + "loss": 0.0814, + "num_input_tokens_seen": 2467168, + "step": 11690 + }, + { + "epoch": 1.2865786578657865, + "grad_norm": 0.7365952730178833, + "learning_rate": 3.216171617161716e-05, + "loss": 0.1535, + "num_input_tokens_seen": 2468192, + "step": 11695 + }, + { + "epoch": 1.2871287128712872, + "grad_norm": 0.07719071209430695, + "learning_rate": 3.217546754675468e-05, + "loss": 0.0981, + "num_input_tokens_seen": 2469248, + "step": 11700 + }, + { + "epoch": 1.2876787678767876, + "grad_norm": 1.7515231370925903, + "learning_rate": 3.218921892189219e-05, + "loss": 0.1011, + "num_input_tokens_seen": 2470272, + "step": 11705 + }, + { + "epoch": 1.2882288228822882, + "grad_norm": 0.13819977641105652, + "learning_rate": 3.22029702970297e-05, + "loss": 0.0501, + "num_input_tokens_seen": 2471264, + "step": 11710 + }, + { + "epoch": 1.2887788778877889, + "grad_norm": 0.2754998803138733, + "learning_rate": 3.2216721672167214e-05, + "loss": 0.1245, + "num_input_tokens_seen": 2472384, + "step": 11715 + }, + { + "epoch": 1.2893289328932893, + "grad_norm": 1.0697048902511597, + "learning_rate": 3.223047304730473e-05, + "loss": 0.2227, + "num_input_tokens_seen": 2473472, + "step": 11720 + }, + { + "epoch": 1.2898789878987897, + "grad_norm": 0.13137048482894897, + "learning_rate": 3.224422442244225e-05, + "loss": 0.0872, + "num_input_tokens_seen": 2474560, + "step": 11725 + }, + { + "epoch": 1.2904290429042904, + "grad_norm": 0.21675217151641846, + "learning_rate": 3.225797579757976e-05, + "loss": 0.1204, + "num_input_tokens_seen": 2475616, + "step": 11730 + }, + { + "epoch": 1.290979097909791, + "grad_norm": 0.4184437096118927, + "learning_rate": 3.2271727172717274e-05, + "loss": 0.0564, + "num_input_tokens_seen": 2476608, + "step": 11735 + }, + { + "epoch": 1.2915291529152915, + "grad_norm": 0.9201971888542175, + "learning_rate": 3.2285478547854786e-05, + "loss": 0.1617, + "num_input_tokens_seen": 2477664, + "step": 11740 + }, + { + "epoch": 1.2920792079207921, + "grad_norm": 0.5615583062171936, + "learning_rate": 3.2299229922992304e-05, + "loss": 0.0784, + "num_input_tokens_seen": 2478720, + "step": 11745 + }, + { + "epoch": 1.2926292629262925, + "grad_norm": 0.25244203209877014, + "learning_rate": 3.2312981298129816e-05, + "loss": 0.1089, + "num_input_tokens_seen": 2479776, + "step": 11750 + }, + { + "epoch": 1.2931793179317932, + "grad_norm": 1.1775163412094116, + "learning_rate": 3.232673267326733e-05, + "loss": 0.1937, + "num_input_tokens_seen": 2480800, + "step": 11755 + }, + { + "epoch": 1.2937293729372938, + "grad_norm": 1.8175867795944214, + "learning_rate": 3.234048404840484e-05, + "loss": 0.2332, + "num_input_tokens_seen": 2481824, + "step": 11760 + }, + { + "epoch": 1.2942794279427943, + "grad_norm": 0.48691055178642273, + "learning_rate": 3.235423542354236e-05, + "loss": 0.0538, + "num_input_tokens_seen": 2482912, + "step": 11765 + }, + { + "epoch": 1.2948294829482947, + "grad_norm": 0.6076750159263611, + "learning_rate": 3.236798679867987e-05, + "loss": 0.1358, + "num_input_tokens_seen": 2483904, + "step": 11770 + }, + { + "epoch": 1.2953795379537953, + "grad_norm": 0.49148568511009216, + "learning_rate": 3.238173817381738e-05, + "loss": 0.122, + "num_input_tokens_seen": 2484928, + "step": 11775 + }, + { + "epoch": 1.295929592959296, + "grad_norm": 1.8138755559921265, + "learning_rate": 3.23954895489549e-05, + "loss": 0.1639, + "num_input_tokens_seen": 2486080, + "step": 11780 + }, + { + "epoch": 1.2964796479647964, + "grad_norm": 0.7247316241264343, + "learning_rate": 3.240924092409241e-05, + "loss": 0.1389, + "num_input_tokens_seen": 2487104, + "step": 11785 + }, + { + "epoch": 1.297029702970297, + "grad_norm": 0.5683165192604065, + "learning_rate": 3.242299229922993e-05, + "loss": 0.0944, + "num_input_tokens_seen": 2488128, + "step": 11790 + }, + { + "epoch": 1.2975797579757975, + "grad_norm": 0.4911315143108368, + "learning_rate": 3.2436743674367434e-05, + "loss": 0.1295, + "num_input_tokens_seen": 2489184, + "step": 11795 + }, + { + "epoch": 1.2981298129812981, + "grad_norm": 0.5748597979545593, + "learning_rate": 3.245049504950495e-05, + "loss": 0.1237, + "num_input_tokens_seen": 2490336, + "step": 11800 + }, + { + "epoch": 1.2986798679867988, + "grad_norm": 2.9599597454071045, + "learning_rate": 3.2464246424642464e-05, + "loss": 0.2201, + "num_input_tokens_seen": 2491488, + "step": 11805 + }, + { + "epoch": 1.2992299229922992, + "grad_norm": 0.2737593948841095, + "learning_rate": 3.247799779977998e-05, + "loss": 0.165, + "num_input_tokens_seen": 2492480, + "step": 11810 + }, + { + "epoch": 1.2997799779977997, + "grad_norm": 1.2902487516403198, + "learning_rate": 3.249174917491749e-05, + "loss": 0.1169, + "num_input_tokens_seen": 2493568, + "step": 11815 + }, + { + "epoch": 1.3003300330033003, + "grad_norm": 0.5275384783744812, + "learning_rate": 3.2505500550055006e-05, + "loss": 0.2354, + "num_input_tokens_seen": 2494656, + "step": 11820 + }, + { + "epoch": 1.300880088008801, + "grad_norm": 1.7805094718933105, + "learning_rate": 3.2519251925192524e-05, + "loss": 0.2152, + "num_input_tokens_seen": 2495680, + "step": 11825 + }, + { + "epoch": 1.3014301430143014, + "grad_norm": 1.2704086303710938, + "learning_rate": 3.2533003300330036e-05, + "loss": 0.1176, + "num_input_tokens_seen": 2496704, + "step": 11830 + }, + { + "epoch": 1.301980198019802, + "grad_norm": 1.06804621219635, + "learning_rate": 3.254675467546755e-05, + "loss": 0.1235, + "num_input_tokens_seen": 2497824, + "step": 11835 + }, + { + "epoch": 1.3025302530253025, + "grad_norm": 0.1742139607667923, + "learning_rate": 3.256050605060506e-05, + "loss": 0.0427, + "num_input_tokens_seen": 2498912, + "step": 11840 + }, + { + "epoch": 1.303080308030803, + "grad_norm": 0.4353344738483429, + "learning_rate": 3.257425742574258e-05, + "loss": 0.1156, + "num_input_tokens_seen": 2500032, + "step": 11845 + }, + { + "epoch": 1.3036303630363038, + "grad_norm": 1.3430063724517822, + "learning_rate": 3.258800880088009e-05, + "loss": 0.1877, + "num_input_tokens_seen": 2501088, + "step": 11850 + }, + { + "epoch": 1.3041804180418042, + "grad_norm": 0.03881209343671799, + "learning_rate": 3.26017601760176e-05, + "loss": 0.1436, + "num_input_tokens_seen": 2502144, + "step": 11855 + }, + { + "epoch": 1.3047304730473046, + "grad_norm": 0.3480032980442047, + "learning_rate": 3.261551155115511e-05, + "loss": 0.0741, + "num_input_tokens_seen": 2503168, + "step": 11860 + }, + { + "epoch": 1.3052805280528053, + "grad_norm": 0.41873615980148315, + "learning_rate": 3.262926292629263e-05, + "loss": 0.19, + "num_input_tokens_seen": 2504192, + "step": 11865 + }, + { + "epoch": 1.305830583058306, + "grad_norm": 0.34509187936782837, + "learning_rate": 3.264301430143015e-05, + "loss": 0.0797, + "num_input_tokens_seen": 2505216, + "step": 11870 + }, + { + "epoch": 1.3063806380638063, + "grad_norm": 0.40847674012184143, + "learning_rate": 3.2656765676567654e-05, + "loss": 0.1429, + "num_input_tokens_seen": 2506272, + "step": 11875 + }, + { + "epoch": 1.306930693069307, + "grad_norm": 0.081968754529953, + "learning_rate": 3.267051705170517e-05, + "loss": 0.0559, + "num_input_tokens_seen": 2507360, + "step": 11880 + }, + { + "epoch": 1.3074807480748074, + "grad_norm": 1.1419612169265747, + "learning_rate": 3.2684268426842685e-05, + "loss": 0.1817, + "num_input_tokens_seen": 2508416, + "step": 11885 + }, + { + "epoch": 1.308030803080308, + "grad_norm": 0.6353503465652466, + "learning_rate": 3.26980198019802e-05, + "loss": 0.0615, + "num_input_tokens_seen": 2509440, + "step": 11890 + }, + { + "epoch": 1.3085808580858087, + "grad_norm": 0.5110296607017517, + "learning_rate": 3.2711771177117715e-05, + "loss": 0.1059, + "num_input_tokens_seen": 2510496, + "step": 11895 + }, + { + "epoch": 1.3091309130913091, + "grad_norm": 0.8352024555206299, + "learning_rate": 3.2725522552255226e-05, + "loss": 0.0749, + "num_input_tokens_seen": 2511488, + "step": 11900 + }, + { + "epoch": 1.3096809680968096, + "grad_norm": 0.4276456832885742, + "learning_rate": 3.273927392739274e-05, + "loss": 0.0674, + "num_input_tokens_seen": 2512544, + "step": 11905 + }, + { + "epoch": 1.3102310231023102, + "grad_norm": 0.2196958214044571, + "learning_rate": 3.2753025302530256e-05, + "loss": 0.1395, + "num_input_tokens_seen": 2513600, + "step": 11910 + }, + { + "epoch": 1.3107810781078109, + "grad_norm": 1.6338164806365967, + "learning_rate": 3.276677667766777e-05, + "loss": 0.2117, + "num_input_tokens_seen": 2514624, + "step": 11915 + }, + { + "epoch": 1.3113311331133113, + "grad_norm": 0.3541707694530487, + "learning_rate": 3.278052805280528e-05, + "loss": 0.0757, + "num_input_tokens_seen": 2515712, + "step": 11920 + }, + { + "epoch": 1.311881188118812, + "grad_norm": 0.7444448471069336, + "learning_rate": 3.27942794279428e-05, + "loss": 0.1244, + "num_input_tokens_seen": 2516768, + "step": 11925 + }, + { + "epoch": 1.3124312431243124, + "grad_norm": 0.5540369749069214, + "learning_rate": 3.280803080308031e-05, + "loss": 0.1736, + "num_input_tokens_seen": 2517824, + "step": 11930 + }, + { + "epoch": 1.312981298129813, + "grad_norm": 0.3393772840499878, + "learning_rate": 3.282178217821782e-05, + "loss": 0.0705, + "num_input_tokens_seen": 2518912, + "step": 11935 + }, + { + "epoch": 1.3135313531353137, + "grad_norm": 0.835364818572998, + "learning_rate": 3.283553355335533e-05, + "loss": 0.1326, + "num_input_tokens_seen": 2519968, + "step": 11940 + }, + { + "epoch": 1.314081408140814, + "grad_norm": 0.09872327744960785, + "learning_rate": 3.284928492849285e-05, + "loss": 0.0789, + "num_input_tokens_seen": 2521024, + "step": 11945 + }, + { + "epoch": 1.3146314631463145, + "grad_norm": 0.7024236917495728, + "learning_rate": 3.286303630363036e-05, + "loss": 0.116, + "num_input_tokens_seen": 2522048, + "step": 11950 + }, + { + "epoch": 1.3151815181518152, + "grad_norm": 0.5092417001724243, + "learning_rate": 3.287678767876788e-05, + "loss": 0.1173, + "num_input_tokens_seen": 2523104, + "step": 11955 + }, + { + "epoch": 1.3157315731573158, + "grad_norm": 0.10247951000928879, + "learning_rate": 3.289053905390539e-05, + "loss": 0.1888, + "num_input_tokens_seen": 2524160, + "step": 11960 + }, + { + "epoch": 1.3162816281628162, + "grad_norm": 0.597764790058136, + "learning_rate": 3.2904290429042905e-05, + "loss": 0.2433, + "num_input_tokens_seen": 2525248, + "step": 11965 + }, + { + "epoch": 1.316831683168317, + "grad_norm": 0.2119462639093399, + "learning_rate": 3.291804180418042e-05, + "loss": 0.0737, + "num_input_tokens_seen": 2526304, + "step": 11970 + }, + { + "epoch": 1.3173817381738173, + "grad_norm": 0.42340341210365295, + "learning_rate": 3.2931793179317935e-05, + "loss": 0.1612, + "num_input_tokens_seen": 2527360, + "step": 11975 + }, + { + "epoch": 1.317931793179318, + "grad_norm": 0.6640898585319519, + "learning_rate": 3.294554455445545e-05, + "loss": 0.114, + "num_input_tokens_seen": 2528416, + "step": 11980 + }, + { + "epoch": 1.3184818481848186, + "grad_norm": 0.5516306161880493, + "learning_rate": 3.295929592959296e-05, + "loss": 0.1666, + "num_input_tokens_seen": 2529440, + "step": 11985 + }, + { + "epoch": 1.319031903190319, + "grad_norm": 1.2498000860214233, + "learning_rate": 3.297304730473048e-05, + "loss": 0.198, + "num_input_tokens_seen": 2530432, + "step": 11990 + }, + { + "epoch": 1.3195819581958195, + "grad_norm": 0.5621117353439331, + "learning_rate": 3.298679867986799e-05, + "loss": 0.1719, + "num_input_tokens_seen": 2531424, + "step": 11995 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 0.5439420342445374, + "learning_rate": 3.30005500550055e-05, + "loss": 0.1722, + "num_input_tokens_seen": 2532480, + "step": 12000 + }, + { + "epoch": 1.3206820682068208, + "grad_norm": 0.4147339165210724, + "learning_rate": 3.301430143014301e-05, + "loss": 0.0628, + "num_input_tokens_seen": 2533568, + "step": 12005 + }, + { + "epoch": 1.3212321232123212, + "grad_norm": 0.09587717056274414, + "learning_rate": 3.302805280528053e-05, + "loss": 0.1378, + "num_input_tokens_seen": 2534560, + "step": 12010 + }, + { + "epoch": 1.3217821782178218, + "grad_norm": 0.30952414870262146, + "learning_rate": 3.304180418041805e-05, + "loss": 0.124, + "num_input_tokens_seen": 2535584, + "step": 12015 + }, + { + "epoch": 1.3223322332233223, + "grad_norm": 0.4798837900161743, + "learning_rate": 3.3055555555555553e-05, + "loss": 0.0797, + "num_input_tokens_seen": 2536576, + "step": 12020 + }, + { + "epoch": 1.322882288228823, + "grad_norm": 1.4505568742752075, + "learning_rate": 3.306930693069307e-05, + "loss": 0.1425, + "num_input_tokens_seen": 2537568, + "step": 12025 + }, + { + "epoch": 1.3234323432343233, + "grad_norm": 0.4644469618797302, + "learning_rate": 3.3083058305830583e-05, + "loss": 0.0743, + "num_input_tokens_seen": 2538592, + "step": 12030 + }, + { + "epoch": 1.323982398239824, + "grad_norm": 2.5807337760925293, + "learning_rate": 3.30968096809681e-05, + "loss": 0.1742, + "num_input_tokens_seen": 2539680, + "step": 12035 + }, + { + "epoch": 1.3245324532453244, + "grad_norm": 0.394159734249115, + "learning_rate": 3.311056105610561e-05, + "loss": 0.0733, + "num_input_tokens_seen": 2540736, + "step": 12040 + }, + { + "epoch": 1.325082508250825, + "grad_norm": 0.2876368463039398, + "learning_rate": 3.3124312431243125e-05, + "loss": 0.1177, + "num_input_tokens_seen": 2541824, + "step": 12045 + }, + { + "epoch": 1.3256325632563257, + "grad_norm": 0.4233713448047638, + "learning_rate": 3.313806380638064e-05, + "loss": 0.1418, + "num_input_tokens_seen": 2542912, + "step": 12050 + }, + { + "epoch": 1.3261826182618262, + "grad_norm": 0.3520033061504364, + "learning_rate": 3.3151815181518155e-05, + "loss": 0.1121, + "num_input_tokens_seen": 2544064, + "step": 12055 + }, + { + "epoch": 1.3267326732673268, + "grad_norm": 0.24486419558525085, + "learning_rate": 3.316556655665567e-05, + "loss": 0.0599, + "num_input_tokens_seen": 2545088, + "step": 12060 + }, + { + "epoch": 1.3272827282728272, + "grad_norm": 0.21246758103370667, + "learning_rate": 3.317931793179318e-05, + "loss": 0.1037, + "num_input_tokens_seen": 2546144, + "step": 12065 + }, + { + "epoch": 1.3278327832783279, + "grad_norm": 0.801331102848053, + "learning_rate": 3.31930693069307e-05, + "loss": 0.0337, + "num_input_tokens_seen": 2547136, + "step": 12070 + }, + { + "epoch": 1.3283828382838283, + "grad_norm": 0.2650447189807892, + "learning_rate": 3.320682068206821e-05, + "loss": 0.0681, + "num_input_tokens_seen": 2548160, + "step": 12075 + }, + { + "epoch": 1.328932893289329, + "grad_norm": 1.35024893283844, + "learning_rate": 3.322057205720572e-05, + "loss": 0.0642, + "num_input_tokens_seen": 2549216, + "step": 12080 + }, + { + "epoch": 1.3294829482948294, + "grad_norm": 1.5254490375518799, + "learning_rate": 3.323432343234323e-05, + "loss": 0.0968, + "num_input_tokens_seen": 2550240, + "step": 12085 + }, + { + "epoch": 1.33003300330033, + "grad_norm": 0.7069251537322998, + "learning_rate": 3.324807480748075e-05, + "loss": 0.1177, + "num_input_tokens_seen": 2551296, + "step": 12090 + }, + { + "epoch": 1.3305830583058307, + "grad_norm": 1.205658197402954, + "learning_rate": 3.326182618261826e-05, + "loss": 0.1631, + "num_input_tokens_seen": 2552352, + "step": 12095 + }, + { + "epoch": 1.331133113311331, + "grad_norm": 0.5981531739234924, + "learning_rate": 3.3275577557755774e-05, + "loss": 0.0775, + "num_input_tokens_seen": 2553376, + "step": 12100 + }, + { + "epoch": 1.3316831683168318, + "grad_norm": 0.9343361854553223, + "learning_rate": 3.328932893289329e-05, + "loss": 0.0928, + "num_input_tokens_seen": 2554400, + "step": 12105 + }, + { + "epoch": 1.3322332233223322, + "grad_norm": 0.4543226659297943, + "learning_rate": 3.3303080308030804e-05, + "loss": 0.0857, + "num_input_tokens_seen": 2555456, + "step": 12110 + }, + { + "epoch": 1.3327832783278328, + "grad_norm": 0.33616411685943604, + "learning_rate": 3.331683168316832e-05, + "loss": 0.0621, + "num_input_tokens_seen": 2556608, + "step": 12115 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3570513129234314, + "learning_rate": 3.3330583058305834e-05, + "loss": 0.2577, + "num_input_tokens_seen": 2557696, + "step": 12120 + }, + { + "epoch": 1.333883388338834, + "grad_norm": 0.38253548741340637, + "learning_rate": 3.3344334433443346e-05, + "loss": 0.1216, + "num_input_tokens_seen": 2558816, + "step": 12125 + }, + { + "epoch": 1.3344334433443343, + "grad_norm": 0.9506904482841492, + "learning_rate": 3.335808580858086e-05, + "loss": 0.0755, + "num_input_tokens_seen": 2559808, + "step": 12130 + }, + { + "epoch": 1.334983498349835, + "grad_norm": 0.23127245903015137, + "learning_rate": 3.3371837183718376e-05, + "loss": 0.1391, + "num_input_tokens_seen": 2560864, + "step": 12135 + }, + { + "epoch": 1.3355335533553356, + "grad_norm": 0.6871346831321716, + "learning_rate": 3.338558855885589e-05, + "loss": 0.1217, + "num_input_tokens_seen": 2561952, + "step": 12140 + }, + { + "epoch": 1.336083608360836, + "grad_norm": 0.4636540710926056, + "learning_rate": 3.33993399339934e-05, + "loss": 0.1245, + "num_input_tokens_seen": 2563008, + "step": 12145 + }, + { + "epoch": 1.3366336633663367, + "grad_norm": 0.9023073315620422, + "learning_rate": 3.341309130913092e-05, + "loss": 0.1005, + "num_input_tokens_seen": 2564032, + "step": 12150 + }, + { + "epoch": 1.3371837183718371, + "grad_norm": 0.42135220766067505, + "learning_rate": 3.342684268426843e-05, + "loss": 0.0999, + "num_input_tokens_seen": 2565088, + "step": 12155 + }, + { + "epoch": 1.3377337733773378, + "grad_norm": 0.4202187955379486, + "learning_rate": 3.344059405940595e-05, + "loss": 0.1125, + "num_input_tokens_seen": 2566144, + "step": 12160 + }, + { + "epoch": 1.3382838283828382, + "grad_norm": 0.5459176301956177, + "learning_rate": 3.345434543454345e-05, + "loss": 0.0958, + "num_input_tokens_seen": 2567200, + "step": 12165 + }, + { + "epoch": 1.3388338833883389, + "grad_norm": 0.18629975616931915, + "learning_rate": 3.346809680968097e-05, + "loss": 0.1217, + "num_input_tokens_seen": 2568288, + "step": 12170 + }, + { + "epoch": 1.3393839383938393, + "grad_norm": 0.4162296652793884, + "learning_rate": 3.348184818481848e-05, + "loss": 0.0501, + "num_input_tokens_seen": 2569408, + "step": 12175 + }, + { + "epoch": 1.33993399339934, + "grad_norm": 0.510158360004425, + "learning_rate": 3.3495599559956e-05, + "loss": 0.1368, + "num_input_tokens_seen": 2570496, + "step": 12180 + }, + { + "epoch": 1.3404840484048406, + "grad_norm": 0.5124765038490295, + "learning_rate": 3.3509350935093506e-05, + "loss": 0.0593, + "num_input_tokens_seen": 2571520, + "step": 12185 + }, + { + "epoch": 1.341034103410341, + "grad_norm": 1.0190820693969727, + "learning_rate": 3.3523102310231024e-05, + "loss": 0.1241, + "num_input_tokens_seen": 2572576, + "step": 12190 + }, + { + "epoch": 1.3415841584158417, + "grad_norm": 1.583067774772644, + "learning_rate": 3.353685368536854e-05, + "loss": 0.103, + "num_input_tokens_seen": 2573632, + "step": 12195 + }, + { + "epoch": 1.342134213421342, + "grad_norm": 0.8716180324554443, + "learning_rate": 3.3550605060506054e-05, + "loss": 0.0832, + "num_input_tokens_seen": 2574752, + "step": 12200 + }, + { + "epoch": 1.3426842684268427, + "grad_norm": 0.6379635334014893, + "learning_rate": 3.3564356435643566e-05, + "loss": 0.155, + "num_input_tokens_seen": 2575808, + "step": 12205 + }, + { + "epoch": 1.3432343234323432, + "grad_norm": 0.3724440336227417, + "learning_rate": 3.357810781078108e-05, + "loss": 0.0964, + "num_input_tokens_seen": 2576864, + "step": 12210 + }, + { + "epoch": 1.3437843784378438, + "grad_norm": 0.3982633054256439, + "learning_rate": 3.3591859185918596e-05, + "loss": 0.1151, + "num_input_tokens_seen": 2577952, + "step": 12215 + }, + { + "epoch": 1.3443344334433442, + "grad_norm": 0.37924203276634216, + "learning_rate": 3.360561056105611e-05, + "loss": 0.101, + "num_input_tokens_seen": 2578976, + "step": 12220 + }, + { + "epoch": 1.344884488448845, + "grad_norm": 1.1108380556106567, + "learning_rate": 3.361936193619362e-05, + "loss": 0.1069, + "num_input_tokens_seen": 2580032, + "step": 12225 + }, + { + "epoch": 1.3454345434543455, + "grad_norm": 0.8214551210403442, + "learning_rate": 3.363311331133113e-05, + "loss": 0.1132, + "num_input_tokens_seen": 2581056, + "step": 12230 + }, + { + "epoch": 1.345984598459846, + "grad_norm": 0.36248767375946045, + "learning_rate": 3.364686468646865e-05, + "loss": 0.1132, + "num_input_tokens_seen": 2582112, + "step": 12235 + }, + { + "epoch": 1.3465346534653464, + "grad_norm": 0.9167879223823547, + "learning_rate": 3.366061606160617e-05, + "loss": 0.2294, + "num_input_tokens_seen": 2583168, + "step": 12240 + }, + { + "epoch": 1.347084708470847, + "grad_norm": 1.009018063545227, + "learning_rate": 3.367436743674367e-05, + "loss": 0.1149, + "num_input_tokens_seen": 2584160, + "step": 12245 + }, + { + "epoch": 1.3476347634763477, + "grad_norm": 1.7476271390914917, + "learning_rate": 3.368811881188119e-05, + "loss": 0.1048, + "num_input_tokens_seen": 2585152, + "step": 12250 + }, + { + "epoch": 1.3481848184818481, + "grad_norm": 0.4661087989807129, + "learning_rate": 3.37018701870187e-05, + "loss": 0.0923, + "num_input_tokens_seen": 2586240, + "step": 12255 + }, + { + "epoch": 1.3487348734873488, + "grad_norm": 0.21357527375221252, + "learning_rate": 3.371562156215622e-05, + "loss": 0.3684, + "num_input_tokens_seen": 2587296, + "step": 12260 + }, + { + "epoch": 1.3492849284928492, + "grad_norm": 1.3356828689575195, + "learning_rate": 3.3729372937293726e-05, + "loss": 0.2201, + "num_input_tokens_seen": 2588352, + "step": 12265 + }, + { + "epoch": 1.3498349834983498, + "grad_norm": 0.3698181211948395, + "learning_rate": 3.3743124312431245e-05, + "loss": 0.0963, + "num_input_tokens_seen": 2589408, + "step": 12270 + }, + { + "epoch": 1.3503850385038505, + "grad_norm": 0.08997397124767303, + "learning_rate": 3.3756875687568756e-05, + "loss": 0.1259, + "num_input_tokens_seen": 2590400, + "step": 12275 + }, + { + "epoch": 1.350935093509351, + "grad_norm": 0.2003326117992401, + "learning_rate": 3.3770627062706275e-05, + "loss": 0.1835, + "num_input_tokens_seen": 2591424, + "step": 12280 + }, + { + "epoch": 1.3514851485148514, + "grad_norm": 0.1986812800168991, + "learning_rate": 3.3784378437843786e-05, + "loss": 0.0434, + "num_input_tokens_seen": 2592480, + "step": 12285 + }, + { + "epoch": 1.352035203520352, + "grad_norm": 1.9020440578460693, + "learning_rate": 3.37981298129813e-05, + "loss": 0.1522, + "num_input_tokens_seen": 2593536, + "step": 12290 + }, + { + "epoch": 1.3525852585258527, + "grad_norm": 0.7471144199371338, + "learning_rate": 3.3811881188118816e-05, + "loss": 0.1156, + "num_input_tokens_seen": 2594560, + "step": 12295 + }, + { + "epoch": 1.353135313531353, + "grad_norm": 0.17543064057826996, + "learning_rate": 3.382563256325633e-05, + "loss": 0.1834, + "num_input_tokens_seen": 2595648, + "step": 12300 + }, + { + "epoch": 1.3536853685368537, + "grad_norm": 0.4523971676826477, + "learning_rate": 3.383938393839384e-05, + "loss": 0.1081, + "num_input_tokens_seen": 2596672, + "step": 12305 + }, + { + "epoch": 1.3542354235423542, + "grad_norm": 0.10613629966974258, + "learning_rate": 3.385313531353135e-05, + "loss": 0.1011, + "num_input_tokens_seen": 2597696, + "step": 12310 + }, + { + "epoch": 1.3547854785478548, + "grad_norm": 0.100362129509449, + "learning_rate": 3.386688668866887e-05, + "loss": 0.0432, + "num_input_tokens_seen": 2598688, + "step": 12315 + }, + { + "epoch": 1.3553355335533555, + "grad_norm": 0.79204922914505, + "learning_rate": 3.388063806380638e-05, + "loss": 0.1398, + "num_input_tokens_seen": 2599808, + "step": 12320 + }, + { + "epoch": 1.3558855885588559, + "grad_norm": 0.4649028480052948, + "learning_rate": 3.389438943894389e-05, + "loss": 0.1624, + "num_input_tokens_seen": 2600896, + "step": 12325 + }, + { + "epoch": 1.3564356435643563, + "grad_norm": 0.3252987265586853, + "learning_rate": 3.3908140814081405e-05, + "loss": 0.1375, + "num_input_tokens_seen": 2601920, + "step": 12330 + }, + { + "epoch": 1.356985698569857, + "grad_norm": 0.6830798387527466, + "learning_rate": 3.392189218921892e-05, + "loss": 0.1541, + "num_input_tokens_seen": 2602944, + "step": 12335 + }, + { + "epoch": 1.3575357535753576, + "grad_norm": 0.7389244437217712, + "learning_rate": 3.393564356435644e-05, + "loss": 0.142, + "num_input_tokens_seen": 2604032, + "step": 12340 + }, + { + "epoch": 1.358085808580858, + "grad_norm": 2.2001917362213135, + "learning_rate": 3.394939493949395e-05, + "loss": 0.2076, + "num_input_tokens_seen": 2605120, + "step": 12345 + }, + { + "epoch": 1.3586358635863587, + "grad_norm": 0.8736252784729004, + "learning_rate": 3.3963146314631465e-05, + "loss": 0.1048, + "num_input_tokens_seen": 2606144, + "step": 12350 + }, + { + "epoch": 1.359185918591859, + "grad_norm": 0.30010661482810974, + "learning_rate": 3.397689768976898e-05, + "loss": 0.0657, + "num_input_tokens_seen": 2607168, + "step": 12355 + }, + { + "epoch": 1.3597359735973598, + "grad_norm": 0.9494466185569763, + "learning_rate": 3.3990649064906495e-05, + "loss": 0.1394, + "num_input_tokens_seen": 2608224, + "step": 12360 + }, + { + "epoch": 1.3602860286028604, + "grad_norm": 0.9063318371772766, + "learning_rate": 3.400440044004401e-05, + "loss": 0.1252, + "num_input_tokens_seen": 2609248, + "step": 12365 + }, + { + "epoch": 1.3608360836083608, + "grad_norm": 0.10896951705217361, + "learning_rate": 3.401815181518152e-05, + "loss": 0.1126, + "num_input_tokens_seen": 2610304, + "step": 12370 + }, + { + "epoch": 1.3613861386138613, + "grad_norm": 0.692603588104248, + "learning_rate": 3.403190319031903e-05, + "loss": 0.0865, + "num_input_tokens_seen": 2611296, + "step": 12375 + }, + { + "epoch": 1.361936193619362, + "grad_norm": 0.18307767808437347, + "learning_rate": 3.404565456545655e-05, + "loss": 0.1425, + "num_input_tokens_seen": 2612320, + "step": 12380 + }, + { + "epoch": 1.3624862486248626, + "grad_norm": 0.5181381106376648, + "learning_rate": 3.405940594059407e-05, + "loss": 0.09, + "num_input_tokens_seen": 2613472, + "step": 12385 + }, + { + "epoch": 1.363036303630363, + "grad_norm": 1.116041898727417, + "learning_rate": 3.407315731573157e-05, + "loss": 0.1342, + "num_input_tokens_seen": 2614560, + "step": 12390 + }, + { + "epoch": 1.3635863586358636, + "grad_norm": 0.2987697422504425, + "learning_rate": 3.408690869086909e-05, + "loss": 0.1097, + "num_input_tokens_seen": 2615648, + "step": 12395 + }, + { + "epoch": 1.364136413641364, + "grad_norm": 1.0629256963729858, + "learning_rate": 3.41006600660066e-05, + "loss": 0.1012, + "num_input_tokens_seen": 2616736, + "step": 12400 + }, + { + "epoch": 1.3646864686468647, + "grad_norm": 0.8116479516029358, + "learning_rate": 3.411441144114412e-05, + "loss": 0.2005, + "num_input_tokens_seen": 2617760, + "step": 12405 + }, + { + "epoch": 1.3652365236523654, + "grad_norm": 0.39505720138549805, + "learning_rate": 3.4128162816281625e-05, + "loss": 0.1228, + "num_input_tokens_seen": 2618816, + "step": 12410 + }, + { + "epoch": 1.3657865786578658, + "grad_norm": 0.8934148550033569, + "learning_rate": 3.4141914191419144e-05, + "loss": 0.1686, + "num_input_tokens_seen": 2619840, + "step": 12415 + }, + { + "epoch": 1.3663366336633662, + "grad_norm": 0.34560614824295044, + "learning_rate": 3.4155665566556655e-05, + "loss": 0.0829, + "num_input_tokens_seen": 2620864, + "step": 12420 + }, + { + "epoch": 1.3668866886688669, + "grad_norm": 0.791789174079895, + "learning_rate": 3.4169416941694174e-05, + "loss": 0.0982, + "num_input_tokens_seen": 2621888, + "step": 12425 + }, + { + "epoch": 1.3674367436743675, + "grad_norm": 0.4670161306858063, + "learning_rate": 3.4183168316831685e-05, + "loss": 0.2518, + "num_input_tokens_seen": 2622880, + "step": 12430 + }, + { + "epoch": 1.367986798679868, + "grad_norm": 0.1724851280450821, + "learning_rate": 3.41969196919692e-05, + "loss": 0.0459, + "num_input_tokens_seen": 2623904, + "step": 12435 + }, + { + "epoch": 1.3685368536853686, + "grad_norm": 1.2301820516586304, + "learning_rate": 3.4210671067106715e-05, + "loss": 0.0994, + "num_input_tokens_seen": 2624960, + "step": 12440 + }, + { + "epoch": 1.369086908690869, + "grad_norm": 0.9754864573478699, + "learning_rate": 3.422442244224423e-05, + "loss": 0.1376, + "num_input_tokens_seen": 2626048, + "step": 12445 + }, + { + "epoch": 1.3696369636963697, + "grad_norm": 0.4350806772708893, + "learning_rate": 3.423817381738174e-05, + "loss": 0.1472, + "num_input_tokens_seen": 2627040, + "step": 12450 + }, + { + "epoch": 1.3701870187018703, + "grad_norm": 0.36041417717933655, + "learning_rate": 3.425192519251925e-05, + "loss": 0.049, + "num_input_tokens_seen": 2628096, + "step": 12455 + }, + { + "epoch": 1.3707370737073707, + "grad_norm": 1.0303102731704712, + "learning_rate": 3.426567656765677e-05, + "loss": 0.1906, + "num_input_tokens_seen": 2629088, + "step": 12460 + }, + { + "epoch": 1.3712871287128712, + "grad_norm": 0.6416630148887634, + "learning_rate": 3.427942794279428e-05, + "loss": 0.0863, + "num_input_tokens_seen": 2630240, + "step": 12465 + }, + { + "epoch": 1.3718371837183718, + "grad_norm": 0.4638105630874634, + "learning_rate": 3.429317931793179e-05, + "loss": 0.1153, + "num_input_tokens_seen": 2631360, + "step": 12470 + }, + { + "epoch": 1.3723872387238725, + "grad_norm": 0.5874318480491638, + "learning_rate": 3.430693069306931e-05, + "loss": 0.1746, + "num_input_tokens_seen": 2632416, + "step": 12475 + }, + { + "epoch": 1.372937293729373, + "grad_norm": 0.3452278971672058, + "learning_rate": 3.432068206820682e-05, + "loss": 0.1015, + "num_input_tokens_seen": 2633440, + "step": 12480 + }, + { + "epoch": 1.3734873487348735, + "grad_norm": 0.559192955493927, + "learning_rate": 3.433443344334434e-05, + "loss": 0.0863, + "num_input_tokens_seen": 2634432, + "step": 12485 + }, + { + "epoch": 1.374037403740374, + "grad_norm": 1.1577574014663696, + "learning_rate": 3.4348184818481846e-05, + "loss": 0.1453, + "num_input_tokens_seen": 2635520, + "step": 12490 + }, + { + "epoch": 1.3745874587458746, + "grad_norm": 0.7832301259040833, + "learning_rate": 3.4361936193619364e-05, + "loss": 0.0726, + "num_input_tokens_seen": 2636576, + "step": 12495 + }, + { + "epoch": 1.3751375137513753, + "grad_norm": 0.3845019042491913, + "learning_rate": 3.4375687568756876e-05, + "loss": 0.1544, + "num_input_tokens_seen": 2637568, + "step": 12500 + }, + { + "epoch": 1.3756875687568757, + "grad_norm": 0.8571763038635254, + "learning_rate": 3.4389438943894394e-05, + "loss": 0.1731, + "num_input_tokens_seen": 2638656, + "step": 12505 + }, + { + "epoch": 1.3762376237623761, + "grad_norm": 0.061938606202602386, + "learning_rate": 3.4403190319031906e-05, + "loss": 0.134, + "num_input_tokens_seen": 2639648, + "step": 12510 + }, + { + "epoch": 1.3767876787678768, + "grad_norm": 0.23029227554798126, + "learning_rate": 3.441694169416942e-05, + "loss": 0.069, + "num_input_tokens_seen": 2640672, + "step": 12515 + }, + { + "epoch": 1.3773377337733774, + "grad_norm": 0.8009185791015625, + "learning_rate": 3.443069306930693e-05, + "loss": 0.1117, + "num_input_tokens_seen": 2641696, + "step": 12520 + }, + { + "epoch": 1.3778877887788779, + "grad_norm": 0.3921099603176117, + "learning_rate": 3.444444444444445e-05, + "loss": 0.062, + "num_input_tokens_seen": 2642816, + "step": 12525 + }, + { + "epoch": 1.3784378437843785, + "grad_norm": 0.7481478452682495, + "learning_rate": 3.445819581958196e-05, + "loss": 0.093, + "num_input_tokens_seen": 2643776, + "step": 12530 + }, + { + "epoch": 1.378987898789879, + "grad_norm": 0.4500546455383301, + "learning_rate": 3.447194719471947e-05, + "loss": 0.2125, + "num_input_tokens_seen": 2644800, + "step": 12535 + }, + { + "epoch": 1.3795379537953796, + "grad_norm": 0.2627178132534027, + "learning_rate": 3.448569856985699e-05, + "loss": 0.1189, + "num_input_tokens_seen": 2645856, + "step": 12540 + }, + { + "epoch": 1.38008800880088, + "grad_norm": 0.42391762137413025, + "learning_rate": 3.44994499449945e-05, + "loss": 0.0797, + "num_input_tokens_seen": 2646944, + "step": 12545 + }, + { + "epoch": 1.3806380638063807, + "grad_norm": 1.3066896200180054, + "learning_rate": 3.451320132013202e-05, + "loss": 0.1082, + "num_input_tokens_seen": 2648000, + "step": 12550 + }, + { + "epoch": 1.381188118811881, + "grad_norm": 0.7339982986450195, + "learning_rate": 3.4526952695269524e-05, + "loss": 0.1178, + "num_input_tokens_seen": 2649056, + "step": 12555 + }, + { + "epoch": 1.3817381738173817, + "grad_norm": 0.052541978657245636, + "learning_rate": 3.454070407040704e-05, + "loss": 0.1173, + "num_input_tokens_seen": 2650112, + "step": 12560 + }, + { + "epoch": 1.3822882288228824, + "grad_norm": 0.44921284914016724, + "learning_rate": 3.4554455445544554e-05, + "loss": 0.094, + "num_input_tokens_seen": 2651168, + "step": 12565 + }, + { + "epoch": 1.3828382838283828, + "grad_norm": 0.2987186014652252, + "learning_rate": 3.456820682068207e-05, + "loss": 0.1334, + "num_input_tokens_seen": 2652224, + "step": 12570 + }, + { + "epoch": 1.3833883388338835, + "grad_norm": 1.5964641571044922, + "learning_rate": 3.4581958195819584e-05, + "loss": 0.1813, + "num_input_tokens_seen": 2653248, + "step": 12575 + }, + { + "epoch": 1.3839383938393839, + "grad_norm": 0.2463194578886032, + "learning_rate": 3.4595709570957096e-05, + "loss": 0.1152, + "num_input_tokens_seen": 2654272, + "step": 12580 + }, + { + "epoch": 1.3844884488448845, + "grad_norm": 1.1363675594329834, + "learning_rate": 3.4609460946094614e-05, + "loss": 0.114, + "num_input_tokens_seen": 2655328, + "step": 12585 + }, + { + "epoch": 1.385038503850385, + "grad_norm": 0.5974399447441101, + "learning_rate": 3.4623212321232126e-05, + "loss": 0.0969, + "num_input_tokens_seen": 2656416, + "step": 12590 + }, + { + "epoch": 1.3855885588558856, + "grad_norm": 0.4647061228752136, + "learning_rate": 3.463696369636964e-05, + "loss": 0.0669, + "num_input_tokens_seen": 2657440, + "step": 12595 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.9647894501686096, + "learning_rate": 3.465071507150715e-05, + "loss": 0.2301, + "num_input_tokens_seen": 2658560, + "step": 12600 + }, + { + "epoch": 1.3866886688668867, + "grad_norm": 0.470043420791626, + "learning_rate": 3.466446644664467e-05, + "loss": 0.0777, + "num_input_tokens_seen": 2659552, + "step": 12605 + }, + { + "epoch": 1.3872387238723873, + "grad_norm": 0.08723509311676025, + "learning_rate": 3.467821782178218e-05, + "loss": 0.1061, + "num_input_tokens_seen": 2660672, + "step": 12610 + }, + { + "epoch": 1.3877887788778878, + "grad_norm": 0.4575030207633972, + "learning_rate": 3.469196919691969e-05, + "loss": 0.1286, + "num_input_tokens_seen": 2661664, + "step": 12615 + }, + { + "epoch": 1.3883388338833884, + "grad_norm": 0.28669923543930054, + "learning_rate": 3.470572057205721e-05, + "loss": 0.1352, + "num_input_tokens_seen": 2662624, + "step": 12620 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.7443604469299316, + "learning_rate": 3.471947194719472e-05, + "loss": 0.1576, + "num_input_tokens_seen": 2663680, + "step": 12625 + }, + { + "epoch": 1.3894389438943895, + "grad_norm": 0.3690871596336365, + "learning_rate": 3.473322332233224e-05, + "loss": 0.0931, + "num_input_tokens_seen": 2664704, + "step": 12630 + }, + { + "epoch": 1.38998899889989, + "grad_norm": 0.6025363206863403, + "learning_rate": 3.4746974697469745e-05, + "loss": 0.161, + "num_input_tokens_seen": 2665760, + "step": 12635 + }, + { + "epoch": 1.3905390539053906, + "grad_norm": 0.44167160987854004, + "learning_rate": 3.476072607260726e-05, + "loss": 0.1902, + "num_input_tokens_seen": 2666880, + "step": 12640 + }, + { + "epoch": 1.391089108910891, + "grad_norm": 0.494714617729187, + "learning_rate": 3.4774477447744775e-05, + "loss": 0.0956, + "num_input_tokens_seen": 2667904, + "step": 12645 + }, + { + "epoch": 1.3916391639163916, + "grad_norm": 0.6273417472839355, + "learning_rate": 3.478822882288229e-05, + "loss": 0.1192, + "num_input_tokens_seen": 2669024, + "step": 12650 + }, + { + "epoch": 1.3921892189218923, + "grad_norm": 0.20460152626037598, + "learning_rate": 3.48019801980198e-05, + "loss": 0.0388, + "num_input_tokens_seen": 2670112, + "step": 12655 + }, + { + "epoch": 1.3927392739273927, + "grad_norm": 0.2642812728881836, + "learning_rate": 3.4815731573157316e-05, + "loss": 0.1244, + "num_input_tokens_seen": 2671136, + "step": 12660 + }, + { + "epoch": 1.3932893289328934, + "grad_norm": 0.9426190853118896, + "learning_rate": 3.4829482948294835e-05, + "loss": 0.1816, + "num_input_tokens_seen": 2672128, + "step": 12665 + }, + { + "epoch": 1.3938393839383938, + "grad_norm": 0.4616858661174774, + "learning_rate": 3.4843234323432346e-05, + "loss": 0.1183, + "num_input_tokens_seen": 2673120, + "step": 12670 + }, + { + "epoch": 1.3943894389438944, + "grad_norm": 0.6387504935264587, + "learning_rate": 3.485698569856986e-05, + "loss": 0.065, + "num_input_tokens_seen": 2674208, + "step": 12675 + }, + { + "epoch": 1.3949394939493949, + "grad_norm": 2.227327823638916, + "learning_rate": 3.487073707370737e-05, + "loss": 0.1738, + "num_input_tokens_seen": 2675264, + "step": 12680 + }, + { + "epoch": 1.3954895489548955, + "grad_norm": 0.26637503504753113, + "learning_rate": 3.488448844884489e-05, + "loss": 0.1898, + "num_input_tokens_seen": 2676256, + "step": 12685 + }, + { + "epoch": 1.396039603960396, + "grad_norm": 0.032152000814676285, + "learning_rate": 3.48982398239824e-05, + "loss": 0.1636, + "num_input_tokens_seen": 2677312, + "step": 12690 + }, + { + "epoch": 1.3965896589658966, + "grad_norm": 1.4400475025177002, + "learning_rate": 3.491199119911991e-05, + "loss": 0.182, + "num_input_tokens_seen": 2678368, + "step": 12695 + }, + { + "epoch": 1.3971397139713972, + "grad_norm": 1.5235657691955566, + "learning_rate": 3.492574257425742e-05, + "loss": 0.1003, + "num_input_tokens_seen": 2679456, + "step": 12700 + }, + { + "epoch": 1.3976897689768977, + "grad_norm": 1.122688889503479, + "learning_rate": 3.493949394939494e-05, + "loss": 0.163, + "num_input_tokens_seen": 2680544, + "step": 12705 + }, + { + "epoch": 1.3982398239823983, + "grad_norm": 0.29263681173324585, + "learning_rate": 3.495324532453246e-05, + "loss": 0.2481, + "num_input_tokens_seen": 2681568, + "step": 12710 + }, + { + "epoch": 1.3987898789878987, + "grad_norm": 0.44850441813468933, + "learning_rate": 3.4966996699669965e-05, + "loss": 0.1304, + "num_input_tokens_seen": 2682656, + "step": 12715 + }, + { + "epoch": 1.3993399339933994, + "grad_norm": 0.8933459520339966, + "learning_rate": 3.498074807480748e-05, + "loss": 0.0814, + "num_input_tokens_seen": 2683680, + "step": 12720 + }, + { + "epoch": 1.3998899889988998, + "grad_norm": 0.10077466815710068, + "learning_rate": 3.4994499449944995e-05, + "loss": 0.1664, + "num_input_tokens_seen": 2684704, + "step": 12725 + }, + { + "epoch": 1.4004400440044005, + "grad_norm": 0.3939666748046875, + "learning_rate": 3.5008250825082513e-05, + "loss": 0.1191, + "num_input_tokens_seen": 2685792, + "step": 12730 + }, + { + "epoch": 1.400990099009901, + "grad_norm": 0.21437564492225647, + "learning_rate": 3.5022002200220025e-05, + "loss": 0.1252, + "num_input_tokens_seen": 2686848, + "step": 12735 + }, + { + "epoch": 1.4015401540154016, + "grad_norm": 0.3158610761165619, + "learning_rate": 3.503575357535754e-05, + "loss": 0.1717, + "num_input_tokens_seen": 2687904, + "step": 12740 + }, + { + "epoch": 1.4020902090209022, + "grad_norm": 0.19639262557029724, + "learning_rate": 3.504950495049505e-05, + "loss": 0.1242, + "num_input_tokens_seen": 2688992, + "step": 12745 + }, + { + "epoch": 1.4026402640264026, + "grad_norm": 0.114288829267025, + "learning_rate": 3.506325632563257e-05, + "loss": 0.0544, + "num_input_tokens_seen": 2690080, + "step": 12750 + }, + { + "epoch": 1.403190319031903, + "grad_norm": 0.2602940499782562, + "learning_rate": 3.507700770077008e-05, + "loss": 0.0963, + "num_input_tokens_seen": 2691200, + "step": 12755 + }, + { + "epoch": 1.4037403740374037, + "grad_norm": 0.14969071745872498, + "learning_rate": 3.509075907590759e-05, + "loss": 0.187, + "num_input_tokens_seen": 2692224, + "step": 12760 + }, + { + "epoch": 1.4042904290429044, + "grad_norm": 0.25466594099998474, + "learning_rate": 3.510451045104511e-05, + "loss": 0.0878, + "num_input_tokens_seen": 2693280, + "step": 12765 + }, + { + "epoch": 1.4048404840484048, + "grad_norm": 0.6105249524116516, + "learning_rate": 3.511826182618262e-05, + "loss": 0.1146, + "num_input_tokens_seen": 2694368, + "step": 12770 + }, + { + "epoch": 1.4053905390539054, + "grad_norm": 0.3025067150592804, + "learning_rate": 3.513201320132014e-05, + "loss": 0.0734, + "num_input_tokens_seen": 2695392, + "step": 12775 + }, + { + "epoch": 1.4059405940594059, + "grad_norm": 0.1587013602256775, + "learning_rate": 3.5145764576457644e-05, + "loss": 0.0656, + "num_input_tokens_seen": 2696416, + "step": 12780 + }, + { + "epoch": 1.4064906490649065, + "grad_norm": 0.2734033763408661, + "learning_rate": 3.515951595159516e-05, + "loss": 0.131, + "num_input_tokens_seen": 2697472, + "step": 12785 + }, + { + "epoch": 1.4070407040704072, + "grad_norm": 0.4628617465496063, + "learning_rate": 3.5173267326732674e-05, + "loss": 0.0566, + "num_input_tokens_seen": 2698432, + "step": 12790 + }, + { + "epoch": 1.4075907590759076, + "grad_norm": 0.41068485379219055, + "learning_rate": 3.518701870187019e-05, + "loss": 0.2395, + "num_input_tokens_seen": 2699456, + "step": 12795 + }, + { + "epoch": 1.408140814081408, + "grad_norm": 0.5672887563705444, + "learning_rate": 3.52007700770077e-05, + "loss": 0.1216, + "num_input_tokens_seen": 2700512, + "step": 12800 + }, + { + "epoch": 1.4086908690869087, + "grad_norm": 0.5064691305160522, + "learning_rate": 3.5214521452145215e-05, + "loss": 0.1194, + "num_input_tokens_seen": 2701536, + "step": 12805 + }, + { + "epoch": 1.4092409240924093, + "grad_norm": 0.5044609904289246, + "learning_rate": 3.5228272827282734e-05, + "loss": 0.0677, + "num_input_tokens_seen": 2702624, + "step": 12810 + }, + { + "epoch": 1.4097909790979097, + "grad_norm": 0.22362960875034332, + "learning_rate": 3.5242024202420245e-05, + "loss": 0.0777, + "num_input_tokens_seen": 2703712, + "step": 12815 + }, + { + "epoch": 1.4103410341034104, + "grad_norm": 0.3577941358089447, + "learning_rate": 3.525577557755776e-05, + "loss": 0.1118, + "num_input_tokens_seen": 2704704, + "step": 12820 + }, + { + "epoch": 1.4108910891089108, + "grad_norm": 0.6363549828529358, + "learning_rate": 3.526952695269527e-05, + "loss": 0.1028, + "num_input_tokens_seen": 2705760, + "step": 12825 + }, + { + "epoch": 1.4114411441144115, + "grad_norm": 0.18406032025814056, + "learning_rate": 3.528327832783279e-05, + "loss": 0.073, + "num_input_tokens_seen": 2706752, + "step": 12830 + }, + { + "epoch": 1.411991199119912, + "grad_norm": 0.2788960933685303, + "learning_rate": 3.52970297029703e-05, + "loss": 0.0592, + "num_input_tokens_seen": 2707808, + "step": 12835 + }, + { + "epoch": 1.4125412541254125, + "grad_norm": 0.19194373488426208, + "learning_rate": 3.531078107810781e-05, + "loss": 0.0552, + "num_input_tokens_seen": 2708800, + "step": 12840 + }, + { + "epoch": 1.413091309130913, + "grad_norm": 0.32984116673469543, + "learning_rate": 3.532453245324532e-05, + "loss": 0.0496, + "num_input_tokens_seen": 2709856, + "step": 12845 + }, + { + "epoch": 1.4136413641364136, + "grad_norm": 0.17632915079593658, + "learning_rate": 3.533828382838284e-05, + "loss": 0.064, + "num_input_tokens_seen": 2710912, + "step": 12850 + }, + { + "epoch": 1.4141914191419143, + "grad_norm": 0.9536510109901428, + "learning_rate": 3.535203520352036e-05, + "loss": 0.0622, + "num_input_tokens_seen": 2711936, + "step": 12855 + }, + { + "epoch": 1.4147414741474147, + "grad_norm": 0.8845292925834656, + "learning_rate": 3.5365786578657864e-05, + "loss": 0.1306, + "num_input_tokens_seen": 2713056, + "step": 12860 + }, + { + "epoch": 1.4152915291529153, + "grad_norm": 0.9773988127708435, + "learning_rate": 3.537953795379538e-05, + "loss": 0.131, + "num_input_tokens_seen": 2714208, + "step": 12865 + }, + { + "epoch": 1.4158415841584158, + "grad_norm": 1.2201615571975708, + "learning_rate": 3.5393289328932894e-05, + "loss": 0.1767, + "num_input_tokens_seen": 2715296, + "step": 12870 + }, + { + "epoch": 1.4163916391639164, + "grad_norm": 1.2716270685195923, + "learning_rate": 3.540704070407041e-05, + "loss": 0.0883, + "num_input_tokens_seen": 2716384, + "step": 12875 + }, + { + "epoch": 1.416941694169417, + "grad_norm": 0.45725148916244507, + "learning_rate": 3.542079207920792e-05, + "loss": 0.0713, + "num_input_tokens_seen": 2717408, + "step": 12880 + }, + { + "epoch": 1.4174917491749175, + "grad_norm": 0.5113770365715027, + "learning_rate": 3.5434543454345436e-05, + "loss": 0.0802, + "num_input_tokens_seen": 2718464, + "step": 12885 + }, + { + "epoch": 1.418041804180418, + "grad_norm": 0.4330337941646576, + "learning_rate": 3.544829482948295e-05, + "loss": 0.0576, + "num_input_tokens_seen": 2719552, + "step": 12890 + }, + { + "epoch": 1.4185918591859186, + "grad_norm": 0.49128133058547974, + "learning_rate": 3.5462046204620466e-05, + "loss": 0.0449, + "num_input_tokens_seen": 2720608, + "step": 12895 + }, + { + "epoch": 1.4191419141914192, + "grad_norm": 1.6058751344680786, + "learning_rate": 3.547579757975798e-05, + "loss": 0.1143, + "num_input_tokens_seen": 2721664, + "step": 12900 + }, + { + "epoch": 1.4196919691969196, + "grad_norm": 1.1444306373596191, + "learning_rate": 3.548954895489549e-05, + "loss": 0.0595, + "num_input_tokens_seen": 2722688, + "step": 12905 + }, + { + "epoch": 1.4202420242024203, + "grad_norm": 0.7176637649536133, + "learning_rate": 3.550330033003301e-05, + "loss": 0.1587, + "num_input_tokens_seen": 2723808, + "step": 12910 + }, + { + "epoch": 1.4207920792079207, + "grad_norm": 0.2907896041870117, + "learning_rate": 3.551705170517052e-05, + "loss": 0.0427, + "num_input_tokens_seen": 2724896, + "step": 12915 + }, + { + "epoch": 1.4213421342134214, + "grad_norm": 0.616950511932373, + "learning_rate": 3.553080308030803e-05, + "loss": 0.1105, + "num_input_tokens_seen": 2725920, + "step": 12920 + }, + { + "epoch": 1.421892189218922, + "grad_norm": 0.623439371585846, + "learning_rate": 3.554455445544554e-05, + "loss": 0.0782, + "num_input_tokens_seen": 2726976, + "step": 12925 + }, + { + "epoch": 1.4224422442244224, + "grad_norm": 0.9681440591812134, + "learning_rate": 3.555830583058306e-05, + "loss": 0.1033, + "num_input_tokens_seen": 2728032, + "step": 12930 + }, + { + "epoch": 1.4229922992299229, + "grad_norm": 0.6091700792312622, + "learning_rate": 3.557205720572057e-05, + "loss": 0.1918, + "num_input_tokens_seen": 2729056, + "step": 12935 + }, + { + "epoch": 1.4235423542354235, + "grad_norm": 0.6155377626419067, + "learning_rate": 3.558580858085809e-05, + "loss": 0.1827, + "num_input_tokens_seen": 2730112, + "step": 12940 + }, + { + "epoch": 1.4240924092409242, + "grad_norm": 0.37261494994163513, + "learning_rate": 3.55995599559956e-05, + "loss": 0.0611, + "num_input_tokens_seen": 2731200, + "step": 12945 + }, + { + "epoch": 1.4246424642464246, + "grad_norm": 0.015520459972321987, + "learning_rate": 3.5613311331133114e-05, + "loss": 0.1164, + "num_input_tokens_seen": 2732224, + "step": 12950 + }, + { + "epoch": 1.4251925192519252, + "grad_norm": 1.0243709087371826, + "learning_rate": 3.562706270627063e-05, + "loss": 0.2333, + "num_input_tokens_seen": 2733248, + "step": 12955 + }, + { + "epoch": 1.4257425742574257, + "grad_norm": 0.18021298944950104, + "learning_rate": 3.5640814081408144e-05, + "loss": 0.085, + "num_input_tokens_seen": 2734336, + "step": 12960 + }, + { + "epoch": 1.4262926292629263, + "grad_norm": 0.27344024181365967, + "learning_rate": 3.5654565456545656e-05, + "loss": 0.1161, + "num_input_tokens_seen": 2735424, + "step": 12965 + }, + { + "epoch": 1.426842684268427, + "grad_norm": 0.3225903809070587, + "learning_rate": 3.566831683168317e-05, + "loss": 0.1592, + "num_input_tokens_seen": 2736480, + "step": 12970 + }, + { + "epoch": 1.4273927392739274, + "grad_norm": 0.7782862186431885, + "learning_rate": 3.5682068206820686e-05, + "loss": 0.1057, + "num_input_tokens_seen": 2737472, + "step": 12975 + }, + { + "epoch": 1.4279427942794278, + "grad_norm": 0.19214634597301483, + "learning_rate": 3.56958195819582e-05, + "loss": 0.0959, + "num_input_tokens_seen": 2738592, + "step": 12980 + }, + { + "epoch": 1.4284928492849285, + "grad_norm": 1.0332105159759521, + "learning_rate": 3.570957095709571e-05, + "loss": 0.1354, + "num_input_tokens_seen": 2739616, + "step": 12985 + }, + { + "epoch": 1.4290429042904291, + "grad_norm": 0.1466515213251114, + "learning_rate": 3.572332233223323e-05, + "loss": 0.0273, + "num_input_tokens_seen": 2740608, + "step": 12990 + }, + { + "epoch": 1.4295929592959296, + "grad_norm": 0.31511765718460083, + "learning_rate": 3.573707370737074e-05, + "loss": 0.0605, + "num_input_tokens_seen": 2741728, + "step": 12995 + }, + { + "epoch": 1.4301430143014302, + "grad_norm": 0.4974091053009033, + "learning_rate": 3.575082508250826e-05, + "loss": 0.1312, + "num_input_tokens_seen": 2742752, + "step": 13000 + }, + { + "epoch": 1.4306930693069306, + "grad_norm": 1.3345800638198853, + "learning_rate": 3.576457645764576e-05, + "loss": 0.0864, + "num_input_tokens_seen": 2743808, + "step": 13005 + }, + { + "epoch": 1.4312431243124313, + "grad_norm": 0.3018587529659271, + "learning_rate": 3.577832783278328e-05, + "loss": 0.0481, + "num_input_tokens_seen": 2744864, + "step": 13010 + }, + { + "epoch": 1.431793179317932, + "grad_norm": 0.8293886184692383, + "learning_rate": 3.579207920792079e-05, + "loss": 0.1465, + "num_input_tokens_seen": 2745856, + "step": 13015 + }, + { + "epoch": 1.4323432343234324, + "grad_norm": 0.2714496850967407, + "learning_rate": 3.580583058305831e-05, + "loss": 0.0456, + "num_input_tokens_seen": 2746848, + "step": 13020 + }, + { + "epoch": 1.4328932893289328, + "grad_norm": 0.32363277673721313, + "learning_rate": 3.5819581958195816e-05, + "loss": 0.1513, + "num_input_tokens_seen": 2747904, + "step": 13025 + }, + { + "epoch": 1.4334433443344334, + "grad_norm": 0.24776476621627808, + "learning_rate": 3.5833333333333335e-05, + "loss": 0.0646, + "num_input_tokens_seen": 2748896, + "step": 13030 + }, + { + "epoch": 1.433993399339934, + "grad_norm": 0.05618767812848091, + "learning_rate": 3.5847084708470846e-05, + "loss": 0.0477, + "num_input_tokens_seen": 2749920, + "step": 13035 + }, + { + "epoch": 1.4345434543454345, + "grad_norm": 0.16859370470046997, + "learning_rate": 3.5860836083608365e-05, + "loss": 0.1264, + "num_input_tokens_seen": 2750944, + "step": 13040 + }, + { + "epoch": 1.4350935093509352, + "grad_norm": 0.7519280910491943, + "learning_rate": 3.5874587458745876e-05, + "loss": 0.1808, + "num_input_tokens_seen": 2752000, + "step": 13045 + }, + { + "epoch": 1.4356435643564356, + "grad_norm": 0.6154278516769409, + "learning_rate": 3.588833883388339e-05, + "loss": 0.0638, + "num_input_tokens_seen": 2753088, + "step": 13050 + }, + { + "epoch": 1.4361936193619362, + "grad_norm": 0.6447708010673523, + "learning_rate": 3.5902090209020907e-05, + "loss": 0.0568, + "num_input_tokens_seen": 2754176, + "step": 13055 + }, + { + "epoch": 1.4367436743674367, + "grad_norm": 0.04713538661599159, + "learning_rate": 3.591584158415842e-05, + "loss": 0.1636, + "num_input_tokens_seen": 2755264, + "step": 13060 + }, + { + "epoch": 1.4372937293729373, + "grad_norm": 0.4362725019454956, + "learning_rate": 3.592959295929593e-05, + "loss": 0.1276, + "num_input_tokens_seen": 2756320, + "step": 13065 + }, + { + "epoch": 1.4378437843784377, + "grad_norm": 0.2415550947189331, + "learning_rate": 3.594334433443344e-05, + "loss": 0.1169, + "num_input_tokens_seen": 2757440, + "step": 13070 + }, + { + "epoch": 1.4383938393839384, + "grad_norm": 0.3989028036594391, + "learning_rate": 3.595709570957096e-05, + "loss": 0.0692, + "num_input_tokens_seen": 2758496, + "step": 13075 + }, + { + "epoch": 1.438943894389439, + "grad_norm": 2.1499805450439453, + "learning_rate": 3.597084708470847e-05, + "loss": 0.0885, + "num_input_tokens_seen": 2759552, + "step": 13080 + }, + { + "epoch": 1.4394939493949395, + "grad_norm": 1.5071080923080444, + "learning_rate": 3.598459845984598e-05, + "loss": 0.1323, + "num_input_tokens_seen": 2760672, + "step": 13085 + }, + { + "epoch": 1.4400440044004401, + "grad_norm": 0.8136719465255737, + "learning_rate": 3.59983498349835e-05, + "loss": 0.0932, + "num_input_tokens_seen": 2761664, + "step": 13090 + }, + { + "epoch": 1.4405940594059405, + "grad_norm": 1.3208180665969849, + "learning_rate": 3.601210121012101e-05, + "loss": 0.2093, + "num_input_tokens_seen": 2762688, + "step": 13095 + }, + { + "epoch": 1.4411441144114412, + "grad_norm": 0.7889430522918701, + "learning_rate": 3.602585258525853e-05, + "loss": 0.0781, + "num_input_tokens_seen": 2763680, + "step": 13100 + }, + { + "epoch": 1.4416941694169416, + "grad_norm": 0.4056258499622345, + "learning_rate": 3.603960396039604e-05, + "loss": 0.0602, + "num_input_tokens_seen": 2764672, + "step": 13105 + }, + { + "epoch": 1.4422442244224423, + "grad_norm": 0.18520037829875946, + "learning_rate": 3.6053355335533555e-05, + "loss": 0.0965, + "num_input_tokens_seen": 2765792, + "step": 13110 + }, + { + "epoch": 1.4427942794279427, + "grad_norm": 1.2454235553741455, + "learning_rate": 3.606710671067107e-05, + "loss": 0.2633, + "num_input_tokens_seen": 2766816, + "step": 13115 + }, + { + "epoch": 1.4433443344334433, + "grad_norm": 0.09059985727071762, + "learning_rate": 3.6080858085808585e-05, + "loss": 0.0516, + "num_input_tokens_seen": 2767872, + "step": 13120 + }, + { + "epoch": 1.443894389438944, + "grad_norm": 0.22453314065933228, + "learning_rate": 3.60946094609461e-05, + "loss": 0.1175, + "num_input_tokens_seen": 2768928, + "step": 13125 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4307139217853546, + "learning_rate": 3.610836083608361e-05, + "loss": 0.0634, + "num_input_tokens_seen": 2769952, + "step": 13130 + }, + { + "epoch": 1.444994499449945, + "grad_norm": 1.0919471979141235, + "learning_rate": 3.612211221122113e-05, + "loss": 0.1132, + "num_input_tokens_seen": 2770976, + "step": 13135 + }, + { + "epoch": 1.4455445544554455, + "grad_norm": 1.0962411165237427, + "learning_rate": 3.613586358635864e-05, + "loss": 0.1672, + "num_input_tokens_seen": 2772032, + "step": 13140 + }, + { + "epoch": 1.4460946094609461, + "grad_norm": 0.44425854086875916, + "learning_rate": 3.614961496149615e-05, + "loss": 0.0564, + "num_input_tokens_seen": 2773088, + "step": 13145 + }, + { + "epoch": 1.4466446644664466, + "grad_norm": 0.022037561982870102, + "learning_rate": 3.616336633663366e-05, + "loss": 0.0875, + "num_input_tokens_seen": 2774176, + "step": 13150 + }, + { + "epoch": 1.4471947194719472, + "grad_norm": 0.35572174191474915, + "learning_rate": 3.617711771177118e-05, + "loss": 0.115, + "num_input_tokens_seen": 2775168, + "step": 13155 + }, + { + "epoch": 1.4477447744774476, + "grad_norm": 0.9560561180114746, + "learning_rate": 3.619086908690869e-05, + "loss": 0.0769, + "num_input_tokens_seen": 2776192, + "step": 13160 + }, + { + "epoch": 1.4482948294829483, + "grad_norm": 0.4490640461444855, + "learning_rate": 3.620462046204621e-05, + "loss": 0.1066, + "num_input_tokens_seen": 2777184, + "step": 13165 + }, + { + "epoch": 1.448844884488449, + "grad_norm": 0.18527577817440033, + "learning_rate": 3.6218371837183715e-05, + "loss": 0.1086, + "num_input_tokens_seen": 2778240, + "step": 13170 + }, + { + "epoch": 1.4493949394939494, + "grad_norm": 0.0996105745434761, + "learning_rate": 3.6232123212321234e-05, + "loss": 0.0445, + "num_input_tokens_seen": 2779296, + "step": 13175 + }, + { + "epoch": 1.44994499449945, + "grad_norm": 0.7255309224128723, + "learning_rate": 3.624587458745875e-05, + "loss": 0.0952, + "num_input_tokens_seen": 2780320, + "step": 13180 + }, + { + "epoch": 1.4504950495049505, + "grad_norm": 0.7883357405662537, + "learning_rate": 3.6259625962596264e-05, + "loss": 0.1317, + "num_input_tokens_seen": 2781440, + "step": 13185 + }, + { + "epoch": 1.451045104510451, + "grad_norm": 0.39742299914360046, + "learning_rate": 3.6273377337733775e-05, + "loss": 0.0862, + "num_input_tokens_seen": 2782464, + "step": 13190 + }, + { + "epoch": 1.4515951595159515, + "grad_norm": 0.37839189171791077, + "learning_rate": 3.628712871287129e-05, + "loss": 0.1145, + "num_input_tokens_seen": 2783488, + "step": 13195 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 1.2294433116912842, + "learning_rate": 3.6300880088008806e-05, + "loss": 0.1708, + "num_input_tokens_seen": 2784576, + "step": 13200 + }, + { + "epoch": 1.4526952695269526, + "grad_norm": 0.3425261378288269, + "learning_rate": 3.631463146314632e-05, + "loss": 0.0995, + "num_input_tokens_seen": 2785632, + "step": 13205 + }, + { + "epoch": 1.4532453245324533, + "grad_norm": 0.3206830322742462, + "learning_rate": 3.632838283828383e-05, + "loss": 0.1333, + "num_input_tokens_seen": 2786624, + "step": 13210 + }, + { + "epoch": 1.453795379537954, + "grad_norm": 1.30632483959198, + "learning_rate": 3.634213421342134e-05, + "loss": 0.1182, + "num_input_tokens_seen": 2787712, + "step": 13215 + }, + { + "epoch": 1.4543454345434543, + "grad_norm": 0.5641224384307861, + "learning_rate": 3.635588558855886e-05, + "loss": 0.0669, + "num_input_tokens_seen": 2788768, + "step": 13220 + }, + { + "epoch": 1.4548954895489548, + "grad_norm": 0.056407734751701355, + "learning_rate": 3.636963696369638e-05, + "loss": 0.0832, + "num_input_tokens_seen": 2789792, + "step": 13225 + }, + { + "epoch": 1.4554455445544554, + "grad_norm": 0.1705465465784073, + "learning_rate": 3.638338833883388e-05, + "loss": 0.1374, + "num_input_tokens_seen": 2790880, + "step": 13230 + }, + { + "epoch": 1.455995599559956, + "grad_norm": 1.3561131954193115, + "learning_rate": 3.63971397139714e-05, + "loss": 0.072, + "num_input_tokens_seen": 2791872, + "step": 13235 + }, + { + "epoch": 1.4565456545654565, + "grad_norm": 0.20163494348526, + "learning_rate": 3.641089108910891e-05, + "loss": 0.1677, + "num_input_tokens_seen": 2792864, + "step": 13240 + }, + { + "epoch": 1.4570957095709571, + "grad_norm": 0.5495847463607788, + "learning_rate": 3.642464246424643e-05, + "loss": 0.1619, + "num_input_tokens_seen": 2793920, + "step": 13245 + }, + { + "epoch": 1.4576457645764576, + "grad_norm": 0.714704692363739, + "learning_rate": 3.6438393839383936e-05, + "loss": 0.0929, + "num_input_tokens_seen": 2795008, + "step": 13250 + }, + { + "epoch": 1.4581958195819582, + "grad_norm": 0.39751961827278137, + "learning_rate": 3.6452145214521454e-05, + "loss": 0.1176, + "num_input_tokens_seen": 2796064, + "step": 13255 + }, + { + "epoch": 1.4587458745874589, + "grad_norm": 0.13407064974308014, + "learning_rate": 3.6465896589658966e-05, + "loss": 0.0422, + "num_input_tokens_seen": 2797120, + "step": 13260 + }, + { + "epoch": 1.4592959295929593, + "grad_norm": 0.16798344254493713, + "learning_rate": 3.6479647964796484e-05, + "loss": 0.0738, + "num_input_tokens_seen": 2798176, + "step": 13265 + }, + { + "epoch": 1.4598459845984597, + "grad_norm": 0.08614683151245117, + "learning_rate": 3.649339933993399e-05, + "loss": 0.0939, + "num_input_tokens_seen": 2799168, + "step": 13270 + }, + { + "epoch": 1.4603960396039604, + "grad_norm": 0.5278185606002808, + "learning_rate": 3.650715071507151e-05, + "loss": 0.114, + "num_input_tokens_seen": 2800192, + "step": 13275 + }, + { + "epoch": 1.460946094609461, + "grad_norm": 1.2692089080810547, + "learning_rate": 3.6520902090209026e-05, + "loss": 0.1653, + "num_input_tokens_seen": 2801280, + "step": 13280 + }, + { + "epoch": 1.4614961496149614, + "grad_norm": 0.17631450295448303, + "learning_rate": 3.653465346534654e-05, + "loss": 0.1178, + "num_input_tokens_seen": 2802336, + "step": 13285 + }, + { + "epoch": 1.462046204620462, + "grad_norm": 0.3753258287906647, + "learning_rate": 3.654840484048405e-05, + "loss": 0.1086, + "num_input_tokens_seen": 2803328, + "step": 13290 + }, + { + "epoch": 1.4625962596259625, + "grad_norm": 1.5494616031646729, + "learning_rate": 3.656215621562156e-05, + "loss": 0.1185, + "num_input_tokens_seen": 2804352, + "step": 13295 + }, + { + "epoch": 1.4631463146314632, + "grad_norm": 0.8274705410003662, + "learning_rate": 3.657590759075908e-05, + "loss": 0.1012, + "num_input_tokens_seen": 2805472, + "step": 13300 + }, + { + "epoch": 1.4636963696369638, + "grad_norm": 0.6994725465774536, + "learning_rate": 3.658965896589659e-05, + "loss": 0.1026, + "num_input_tokens_seen": 2806464, + "step": 13305 + }, + { + "epoch": 1.4642464246424642, + "grad_norm": 0.2068370133638382, + "learning_rate": 3.66034103410341e-05, + "loss": 0.1002, + "num_input_tokens_seen": 2807552, + "step": 13310 + }, + { + "epoch": 1.4647964796479647, + "grad_norm": 0.162474125623703, + "learning_rate": 3.6617161716171614e-05, + "loss": 0.1649, + "num_input_tokens_seen": 2808672, + "step": 13315 + }, + { + "epoch": 1.4653465346534653, + "grad_norm": 0.3008946180343628, + "learning_rate": 3.663091309130913e-05, + "loss": 0.0578, + "num_input_tokens_seen": 2809696, + "step": 13320 + }, + { + "epoch": 1.465896589658966, + "grad_norm": 0.6316596865653992, + "learning_rate": 3.664466446644665e-05, + "loss": 0.1217, + "num_input_tokens_seen": 2810720, + "step": 13325 + }, + { + "epoch": 1.4664466446644664, + "grad_norm": 0.07213882356882095, + "learning_rate": 3.665841584158416e-05, + "loss": 0.0732, + "num_input_tokens_seen": 2811840, + "step": 13330 + }, + { + "epoch": 1.466996699669967, + "grad_norm": 0.20902319252490997, + "learning_rate": 3.6672167216721674e-05, + "loss": 0.0797, + "num_input_tokens_seen": 2812864, + "step": 13335 + }, + { + "epoch": 1.4675467546754675, + "grad_norm": 1.0054751634597778, + "learning_rate": 3.6685918591859186e-05, + "loss": 0.1024, + "num_input_tokens_seen": 2813920, + "step": 13340 + }, + { + "epoch": 1.4680968096809681, + "grad_norm": 0.5431863069534302, + "learning_rate": 3.6699669966996705e-05, + "loss": 0.1605, + "num_input_tokens_seen": 2814912, + "step": 13345 + }, + { + "epoch": 1.4686468646864688, + "grad_norm": 1.1761343479156494, + "learning_rate": 3.6713421342134216e-05, + "loss": 0.1275, + "num_input_tokens_seen": 2816064, + "step": 13350 + }, + { + "epoch": 1.4691969196919692, + "grad_norm": 0.2794145941734314, + "learning_rate": 3.672717271727173e-05, + "loss": 0.1228, + "num_input_tokens_seen": 2817184, + "step": 13355 + }, + { + "epoch": 1.4697469746974696, + "grad_norm": 1.1113548278808594, + "learning_rate": 3.674092409240924e-05, + "loss": 0.1862, + "num_input_tokens_seen": 2818240, + "step": 13360 + }, + { + "epoch": 1.4702970297029703, + "grad_norm": 0.49195653200149536, + "learning_rate": 3.675467546754676e-05, + "loss": 0.1066, + "num_input_tokens_seen": 2819264, + "step": 13365 + }, + { + "epoch": 1.470847084708471, + "grad_norm": 0.6407157182693481, + "learning_rate": 3.676842684268427e-05, + "loss": 0.0836, + "num_input_tokens_seen": 2820352, + "step": 13370 + }, + { + "epoch": 1.4713971397139713, + "grad_norm": 0.4906962811946869, + "learning_rate": 3.678217821782178e-05, + "loss": 0.1667, + "num_input_tokens_seen": 2821344, + "step": 13375 + }, + { + "epoch": 1.471947194719472, + "grad_norm": 0.7820373773574829, + "learning_rate": 3.67959295929593e-05, + "loss": 0.1718, + "num_input_tokens_seen": 2822368, + "step": 13380 + }, + { + "epoch": 1.4724972497249724, + "grad_norm": 0.3520985245704651, + "learning_rate": 3.680968096809681e-05, + "loss": 0.1296, + "num_input_tokens_seen": 2823424, + "step": 13385 + }, + { + "epoch": 1.473047304730473, + "grad_norm": 0.3968389928340912, + "learning_rate": 3.682343234323433e-05, + "loss": 0.082, + "num_input_tokens_seen": 2824512, + "step": 13390 + }, + { + "epoch": 1.4735973597359737, + "grad_norm": 0.7085200548171997, + "learning_rate": 3.6837183718371835e-05, + "loss": 0.0996, + "num_input_tokens_seen": 2825568, + "step": 13395 + }, + { + "epoch": 1.4741474147414741, + "grad_norm": 0.4121987223625183, + "learning_rate": 3.685093509350935e-05, + "loss": 0.0477, + "num_input_tokens_seen": 2826528, + "step": 13400 + }, + { + "epoch": 1.4746974697469746, + "grad_norm": 0.1426338404417038, + "learning_rate": 3.6864686468646865e-05, + "loss": 0.0902, + "num_input_tokens_seen": 2827648, + "step": 13405 + }, + { + "epoch": 1.4752475247524752, + "grad_norm": 1.5549620389938354, + "learning_rate": 3.687843784378438e-05, + "loss": 0.1316, + "num_input_tokens_seen": 2828672, + "step": 13410 + }, + { + "epoch": 1.4757975797579759, + "grad_norm": 0.6158241033554077, + "learning_rate": 3.6892189218921895e-05, + "loss": 0.1643, + "num_input_tokens_seen": 2829760, + "step": 13415 + }, + { + "epoch": 1.4763476347634763, + "grad_norm": 0.6253651976585388, + "learning_rate": 3.6905940594059406e-05, + "loss": 0.1419, + "num_input_tokens_seen": 2830784, + "step": 13420 + }, + { + "epoch": 1.476897689768977, + "grad_norm": 1.056343674659729, + "learning_rate": 3.6919691969196925e-05, + "loss": 0.0825, + "num_input_tokens_seen": 2831776, + "step": 13425 + }, + { + "epoch": 1.4774477447744774, + "grad_norm": 0.3915981650352478, + "learning_rate": 3.6933443344334437e-05, + "loss": 0.1164, + "num_input_tokens_seen": 2832800, + "step": 13430 + }, + { + "epoch": 1.477997799779978, + "grad_norm": 0.4380963146686554, + "learning_rate": 3.694719471947195e-05, + "loss": 0.1161, + "num_input_tokens_seen": 2833856, + "step": 13435 + }, + { + "epoch": 1.4785478547854787, + "grad_norm": 0.636225163936615, + "learning_rate": 3.696094609460946e-05, + "loss": 0.1757, + "num_input_tokens_seen": 2834848, + "step": 13440 + }, + { + "epoch": 1.479097909790979, + "grad_norm": 1.3169615268707275, + "learning_rate": 3.697469746974698e-05, + "loss": 0.1797, + "num_input_tokens_seen": 2835968, + "step": 13445 + }, + { + "epoch": 1.4796479647964795, + "grad_norm": 1.5366853475570679, + "learning_rate": 3.698844884488449e-05, + "loss": 0.0903, + "num_input_tokens_seen": 2837024, + "step": 13450 + }, + { + "epoch": 1.4801980198019802, + "grad_norm": 0.22400285303592682, + "learning_rate": 3.7002200220022e-05, + "loss": 0.1097, + "num_input_tokens_seen": 2838144, + "step": 13455 + }, + { + "epoch": 1.4807480748074808, + "grad_norm": 1.5104200839996338, + "learning_rate": 3.701595159515952e-05, + "loss": 0.2211, + "num_input_tokens_seen": 2839200, + "step": 13460 + }, + { + "epoch": 1.4812981298129813, + "grad_norm": 0.9644211530685425, + "learning_rate": 3.702970297029703e-05, + "loss": 0.158, + "num_input_tokens_seen": 2840256, + "step": 13465 + }, + { + "epoch": 1.481848184818482, + "grad_norm": 1.1285244226455688, + "learning_rate": 3.704345434543455e-05, + "loss": 0.1678, + "num_input_tokens_seen": 2841376, + "step": 13470 + }, + { + "epoch": 1.4823982398239823, + "grad_norm": 1.0115526914596558, + "learning_rate": 3.7057205720572055e-05, + "loss": 0.0946, + "num_input_tokens_seen": 2842400, + "step": 13475 + }, + { + "epoch": 1.482948294829483, + "grad_norm": 1.977739930152893, + "learning_rate": 3.7070957095709573e-05, + "loss": 0.1894, + "num_input_tokens_seen": 2843392, + "step": 13480 + }, + { + "epoch": 1.4834983498349836, + "grad_norm": 1.8297345638275146, + "learning_rate": 3.7084708470847085e-05, + "loss": 0.1451, + "num_input_tokens_seen": 2844416, + "step": 13485 + }, + { + "epoch": 1.484048404840484, + "grad_norm": 0.8817604780197144, + "learning_rate": 3.7098459845984604e-05, + "loss": 0.1232, + "num_input_tokens_seen": 2845504, + "step": 13490 + }, + { + "epoch": 1.4845984598459845, + "grad_norm": 0.12674762308597565, + "learning_rate": 3.7112211221122115e-05, + "loss": 0.0658, + "num_input_tokens_seen": 2846560, + "step": 13495 + }, + { + "epoch": 1.4851485148514851, + "grad_norm": 0.36358219385147095, + "learning_rate": 3.712596259625963e-05, + "loss": 0.0726, + "num_input_tokens_seen": 2847648, + "step": 13500 + }, + { + "epoch": 1.4856985698569858, + "grad_norm": 1.601947546005249, + "learning_rate": 3.7139713971397145e-05, + "loss": 0.1296, + "num_input_tokens_seen": 2848768, + "step": 13505 + }, + { + "epoch": 1.4862486248624862, + "grad_norm": 1.1913442611694336, + "learning_rate": 3.715346534653466e-05, + "loss": 0.1224, + "num_input_tokens_seen": 2849792, + "step": 13510 + }, + { + "epoch": 1.4867986798679869, + "grad_norm": 0.5623719692230225, + "learning_rate": 3.716721672167217e-05, + "loss": 0.1015, + "num_input_tokens_seen": 2850880, + "step": 13515 + }, + { + "epoch": 1.4873487348734873, + "grad_norm": 0.3937624394893646, + "learning_rate": 3.718096809680968e-05, + "loss": 0.1308, + "num_input_tokens_seen": 2851936, + "step": 13520 + }, + { + "epoch": 1.487898789878988, + "grad_norm": 0.7631428837776184, + "learning_rate": 3.71947194719472e-05, + "loss": 0.0745, + "num_input_tokens_seen": 2852928, + "step": 13525 + }, + { + "epoch": 1.4884488448844886, + "grad_norm": 0.8821262121200562, + "learning_rate": 3.720847084708471e-05, + "loss": 0.2813, + "num_input_tokens_seen": 2853984, + "step": 13530 + }, + { + "epoch": 1.488998899889989, + "grad_norm": 0.39820465445518494, + "learning_rate": 3.722222222222222e-05, + "loss": 0.0818, + "num_input_tokens_seen": 2855040, + "step": 13535 + }, + { + "epoch": 1.4895489548954894, + "grad_norm": 0.9357584714889526, + "learning_rate": 3.7235973597359734e-05, + "loss": 0.1641, + "num_input_tokens_seen": 2856032, + "step": 13540 + }, + { + "epoch": 1.49009900990099, + "grad_norm": 0.5482360124588013, + "learning_rate": 3.724972497249725e-05, + "loss": 0.1738, + "num_input_tokens_seen": 2857088, + "step": 13545 + }, + { + "epoch": 1.4906490649064907, + "grad_norm": 0.20350101590156555, + "learning_rate": 3.7263476347634764e-05, + "loss": 0.0644, + "num_input_tokens_seen": 2858176, + "step": 13550 + }, + { + "epoch": 1.4911991199119912, + "grad_norm": 0.9105263948440552, + "learning_rate": 3.727722772277228e-05, + "loss": 0.0949, + "num_input_tokens_seen": 2859232, + "step": 13555 + }, + { + "epoch": 1.4917491749174918, + "grad_norm": 0.23929119110107422, + "learning_rate": 3.7290979097909794e-05, + "loss": 0.1078, + "num_input_tokens_seen": 2860288, + "step": 13560 + }, + { + "epoch": 1.4922992299229922, + "grad_norm": 0.6883929371833801, + "learning_rate": 3.7304730473047305e-05, + "loss": 0.0525, + "num_input_tokens_seen": 2861312, + "step": 13565 + }, + { + "epoch": 1.492849284928493, + "grad_norm": 0.36794155836105347, + "learning_rate": 3.7318481848184824e-05, + "loss": 0.078, + "num_input_tokens_seen": 2862368, + "step": 13570 + }, + { + "epoch": 1.4933993399339933, + "grad_norm": 0.48687297105789185, + "learning_rate": 3.7332233223322336e-05, + "loss": 0.1823, + "num_input_tokens_seen": 2863392, + "step": 13575 + }, + { + "epoch": 1.493949394939494, + "grad_norm": 0.5538462400436401, + "learning_rate": 3.734598459845985e-05, + "loss": 0.0892, + "num_input_tokens_seen": 2864480, + "step": 13580 + }, + { + "epoch": 1.4944994499449944, + "grad_norm": 0.5500831007957458, + "learning_rate": 3.735973597359736e-05, + "loss": 0.1038, + "num_input_tokens_seen": 2865504, + "step": 13585 + }, + { + "epoch": 1.495049504950495, + "grad_norm": 0.38399970531463623, + "learning_rate": 3.737348734873488e-05, + "loss": 0.0485, + "num_input_tokens_seen": 2866560, + "step": 13590 + }, + { + "epoch": 1.4955995599559957, + "grad_norm": 2.9656896591186523, + "learning_rate": 3.738723872387239e-05, + "loss": 0.1613, + "num_input_tokens_seen": 2867680, + "step": 13595 + }, + { + "epoch": 1.4961496149614961, + "grad_norm": 0.5330045819282532, + "learning_rate": 3.74009900990099e-05, + "loss": 0.1406, + "num_input_tokens_seen": 2868736, + "step": 13600 + }, + { + "epoch": 1.4966996699669968, + "grad_norm": 0.33370479941368103, + "learning_rate": 3.741474147414742e-05, + "loss": 0.0524, + "num_input_tokens_seen": 2869824, + "step": 13605 + }, + { + "epoch": 1.4972497249724972, + "grad_norm": 0.950812578201294, + "learning_rate": 3.742849284928493e-05, + "loss": 0.1474, + "num_input_tokens_seen": 2870912, + "step": 13610 + }, + { + "epoch": 1.4977997799779978, + "grad_norm": 0.05155425891280174, + "learning_rate": 3.744224422442245e-05, + "loss": 0.0661, + "num_input_tokens_seen": 2871936, + "step": 13615 + }, + { + "epoch": 1.4983498349834983, + "grad_norm": 0.6609978675842285, + "learning_rate": 3.7455995599559954e-05, + "loss": 0.0462, + "num_input_tokens_seen": 2873024, + "step": 13620 + }, + { + "epoch": 1.498899889988999, + "grad_norm": 0.8918898701667786, + "learning_rate": 3.746974697469747e-05, + "loss": 0.152, + "num_input_tokens_seen": 2874048, + "step": 13625 + }, + { + "epoch": 1.4994499449944994, + "grad_norm": 0.834872841835022, + "learning_rate": 3.7483498349834984e-05, + "loss": 0.0987, + "num_input_tokens_seen": 2875136, + "step": 13630 + }, + { + "epoch": 1.5, + "grad_norm": 1.8187826871871948, + "learning_rate": 3.74972497249725e-05, + "loss": 0.2171, + "num_input_tokens_seen": 2876224, + "step": 13635 + }, + { + "epoch": 1.5005500550055006, + "grad_norm": 0.7166464328765869, + "learning_rate": 3.751100110011001e-05, + "loss": 0.0821, + "num_input_tokens_seen": 2877248, + "step": 13640 + }, + { + "epoch": 1.501100110011001, + "grad_norm": 1.2266654968261719, + "learning_rate": 3.7524752475247526e-05, + "loss": 0.1127, + "num_input_tokens_seen": 2878336, + "step": 13645 + }, + { + "epoch": 1.5016501650165015, + "grad_norm": 0.07815966755151749, + "learning_rate": 3.7538503850385044e-05, + "loss": 0.1198, + "num_input_tokens_seen": 2879360, + "step": 13650 + }, + { + "epoch": 1.5022002200220022, + "grad_norm": 0.5476231575012207, + "learning_rate": 3.7552255225522556e-05, + "loss": 0.094, + "num_input_tokens_seen": 2880416, + "step": 13655 + }, + { + "epoch": 1.5027502750275028, + "grad_norm": 0.23064756393432617, + "learning_rate": 3.756600660066007e-05, + "loss": 0.1136, + "num_input_tokens_seen": 2881504, + "step": 13660 + }, + { + "epoch": 1.5033003300330035, + "grad_norm": 0.3329172432422638, + "learning_rate": 3.757975797579758e-05, + "loss": 0.0597, + "num_input_tokens_seen": 2882528, + "step": 13665 + }, + { + "epoch": 1.5038503850385039, + "grad_norm": 0.6792532801628113, + "learning_rate": 3.75935093509351e-05, + "loss": 0.0695, + "num_input_tokens_seen": 2883616, + "step": 13670 + }, + { + "epoch": 1.5044004400440043, + "grad_norm": 0.42269086837768555, + "learning_rate": 3.760726072607261e-05, + "loss": 0.1347, + "num_input_tokens_seen": 2884736, + "step": 13675 + }, + { + "epoch": 1.504950495049505, + "grad_norm": 0.8730998039245605, + "learning_rate": 3.762101210121012e-05, + "loss": 0.0694, + "num_input_tokens_seen": 2885760, + "step": 13680 + }, + { + "epoch": 1.5055005500550056, + "grad_norm": 0.5136335492134094, + "learning_rate": 3.763476347634763e-05, + "loss": 0.1048, + "num_input_tokens_seen": 2886848, + "step": 13685 + }, + { + "epoch": 1.506050605060506, + "grad_norm": 0.4543250799179077, + "learning_rate": 3.764851485148515e-05, + "loss": 0.1222, + "num_input_tokens_seen": 2887904, + "step": 13690 + }, + { + "epoch": 1.5066006600660065, + "grad_norm": 0.8650085926055908, + "learning_rate": 3.766226622662267e-05, + "loss": 0.0989, + "num_input_tokens_seen": 2889024, + "step": 13695 + }, + { + "epoch": 1.507150715071507, + "grad_norm": 0.6431118845939636, + "learning_rate": 3.7676017601760174e-05, + "loss": 0.1493, + "num_input_tokens_seen": 2890080, + "step": 13700 + }, + { + "epoch": 1.5077007700770078, + "grad_norm": 0.3622806668281555, + "learning_rate": 3.768976897689769e-05, + "loss": 0.1124, + "num_input_tokens_seen": 2891104, + "step": 13705 + }, + { + "epoch": 1.5082508250825084, + "grad_norm": 0.23129314184188843, + "learning_rate": 3.7703520352035204e-05, + "loss": 0.0848, + "num_input_tokens_seen": 2892128, + "step": 13710 + }, + { + "epoch": 1.5088008800880088, + "grad_norm": 0.5216957926750183, + "learning_rate": 3.771727172717272e-05, + "loss": 0.0723, + "num_input_tokens_seen": 2893120, + "step": 13715 + }, + { + "epoch": 1.5093509350935093, + "grad_norm": 1.0041905641555786, + "learning_rate": 3.7731023102310235e-05, + "loss": 0.1018, + "num_input_tokens_seen": 2894176, + "step": 13720 + }, + { + "epoch": 1.50990099009901, + "grad_norm": 0.35655832290649414, + "learning_rate": 3.7744774477447746e-05, + "loss": 0.1281, + "num_input_tokens_seen": 2895296, + "step": 13725 + }, + { + "epoch": 1.5104510451045106, + "grad_norm": 0.393033504486084, + "learning_rate": 3.775852585258526e-05, + "loss": 0.0686, + "num_input_tokens_seen": 2896352, + "step": 13730 + }, + { + "epoch": 1.511001100110011, + "grad_norm": 0.35273393988609314, + "learning_rate": 3.7772277227722776e-05, + "loss": 0.1157, + "num_input_tokens_seen": 2897408, + "step": 13735 + }, + { + "epoch": 1.5115511551155114, + "grad_norm": 1.1440860033035278, + "learning_rate": 3.778602860286029e-05, + "loss": 0.1589, + "num_input_tokens_seen": 2898528, + "step": 13740 + }, + { + "epoch": 1.512101210121012, + "grad_norm": 0.9072802066802979, + "learning_rate": 3.77997799779978e-05, + "loss": 0.2401, + "num_input_tokens_seen": 2899616, + "step": 13745 + }, + { + "epoch": 1.5126512651265127, + "grad_norm": 0.7651717066764832, + "learning_rate": 3.781353135313532e-05, + "loss": 0.0881, + "num_input_tokens_seen": 2900608, + "step": 13750 + }, + { + "epoch": 1.5132013201320134, + "grad_norm": 0.5092761516571045, + "learning_rate": 3.782728272827283e-05, + "loss": 0.1028, + "num_input_tokens_seen": 2901600, + "step": 13755 + }, + { + "epoch": 1.5137513751375138, + "grad_norm": 0.6353174448013306, + "learning_rate": 3.784103410341034e-05, + "loss": 0.1439, + "num_input_tokens_seen": 2902592, + "step": 13760 + }, + { + "epoch": 1.5143014301430142, + "grad_norm": 0.05457570031285286, + "learning_rate": 3.785478547854785e-05, + "loss": 0.107, + "num_input_tokens_seen": 2903648, + "step": 13765 + }, + { + "epoch": 1.5148514851485149, + "grad_norm": 0.2451760172843933, + "learning_rate": 3.786853685368537e-05, + "loss": 0.0919, + "num_input_tokens_seen": 2904704, + "step": 13770 + }, + { + "epoch": 1.5154015401540155, + "grad_norm": 0.3370577394962311, + "learning_rate": 3.788228822882288e-05, + "loss": 0.0757, + "num_input_tokens_seen": 2905728, + "step": 13775 + }, + { + "epoch": 1.515951595159516, + "grad_norm": 0.1761585921049118, + "learning_rate": 3.78960396039604e-05, + "loss": 0.032, + "num_input_tokens_seen": 2906784, + "step": 13780 + }, + { + "epoch": 1.5165016501650164, + "grad_norm": 0.5494135618209839, + "learning_rate": 3.7909790979097906e-05, + "loss": 0.0442, + "num_input_tokens_seen": 2907840, + "step": 13785 + }, + { + "epoch": 1.517051705170517, + "grad_norm": 0.2512750029563904, + "learning_rate": 3.7923542354235425e-05, + "loss": 0.0403, + "num_input_tokens_seen": 2908928, + "step": 13790 + }, + { + "epoch": 1.5176017601760177, + "grad_norm": 0.35339975357055664, + "learning_rate": 3.793729372937294e-05, + "loss": 0.1594, + "num_input_tokens_seen": 2909984, + "step": 13795 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.5337433218955994, + "learning_rate": 3.7951045104510455e-05, + "loss": 0.1004, + "num_input_tokens_seen": 2910976, + "step": 13800 + }, + { + "epoch": 1.5187018701870187, + "grad_norm": 1.6295729875564575, + "learning_rate": 3.7964796479647967e-05, + "loss": 0.1813, + "num_input_tokens_seen": 2912032, + "step": 13805 + }, + { + "epoch": 1.5192519251925192, + "grad_norm": 0.8198409676551819, + "learning_rate": 3.797854785478548e-05, + "loss": 0.0734, + "num_input_tokens_seen": 2913088, + "step": 13810 + }, + { + "epoch": 1.5198019801980198, + "grad_norm": 0.36734333634376526, + "learning_rate": 3.7992299229923e-05, + "loss": 0.2254, + "num_input_tokens_seen": 2914144, + "step": 13815 + }, + { + "epoch": 1.5203520352035205, + "grad_norm": 0.20965182781219482, + "learning_rate": 3.800605060506051e-05, + "loss": 0.1232, + "num_input_tokens_seen": 2915232, + "step": 13820 + }, + { + "epoch": 1.520902090209021, + "grad_norm": 0.9494355916976929, + "learning_rate": 3.801980198019802e-05, + "loss": 0.1099, + "num_input_tokens_seen": 2916288, + "step": 13825 + }, + { + "epoch": 1.5214521452145213, + "grad_norm": 0.8203251361846924, + "learning_rate": 3.803355335533553e-05, + "loss": 0.1241, + "num_input_tokens_seen": 2917376, + "step": 13830 + }, + { + "epoch": 1.522002200220022, + "grad_norm": 0.4921155869960785, + "learning_rate": 3.804730473047305e-05, + "loss": 0.1446, + "num_input_tokens_seen": 2918432, + "step": 13835 + }, + { + "epoch": 1.5225522552255226, + "grad_norm": 0.44026416540145874, + "learning_rate": 3.806105610561057e-05, + "loss": 0.1155, + "num_input_tokens_seen": 2919552, + "step": 13840 + }, + { + "epoch": 1.523102310231023, + "grad_norm": 1.0942237377166748, + "learning_rate": 3.807480748074807e-05, + "loss": 0.14, + "num_input_tokens_seen": 2920640, + "step": 13845 + }, + { + "epoch": 1.5236523652365237, + "grad_norm": 0.25917908549308777, + "learning_rate": 3.808855885588559e-05, + "loss": 0.11, + "num_input_tokens_seen": 2921696, + "step": 13850 + }, + { + "epoch": 1.5242024202420241, + "grad_norm": 0.45149993896484375, + "learning_rate": 3.8102310231023103e-05, + "loss": 0.0856, + "num_input_tokens_seen": 2922720, + "step": 13855 + }, + { + "epoch": 1.5247524752475248, + "grad_norm": 0.8615642189979553, + "learning_rate": 3.811606160616062e-05, + "loss": 0.1189, + "num_input_tokens_seen": 2923776, + "step": 13860 + }, + { + "epoch": 1.5253025302530254, + "grad_norm": 0.9655223488807678, + "learning_rate": 3.812981298129813e-05, + "loss": 0.0378, + "num_input_tokens_seen": 2924768, + "step": 13865 + }, + { + "epoch": 1.5258525852585259, + "grad_norm": 0.6354826092720032, + "learning_rate": 3.8143564356435645e-05, + "loss": 0.0688, + "num_input_tokens_seen": 2925856, + "step": 13870 + }, + { + "epoch": 1.5264026402640263, + "grad_norm": 1.5560768842697144, + "learning_rate": 3.815731573157316e-05, + "loss": 0.1236, + "num_input_tokens_seen": 2926976, + "step": 13875 + }, + { + "epoch": 1.526952695269527, + "grad_norm": 0.7485619187355042, + "learning_rate": 3.8171067106710675e-05, + "loss": 0.155, + "num_input_tokens_seen": 2928000, + "step": 13880 + }, + { + "epoch": 1.5275027502750276, + "grad_norm": 0.6365114450454712, + "learning_rate": 3.818481848184819e-05, + "loss": 0.041, + "num_input_tokens_seen": 2929056, + "step": 13885 + }, + { + "epoch": 1.528052805280528, + "grad_norm": 1.1751692295074463, + "learning_rate": 3.81985698569857e-05, + "loss": 0.1448, + "num_input_tokens_seen": 2930208, + "step": 13890 + }, + { + "epoch": 1.5286028602860287, + "grad_norm": 0.8178260922431946, + "learning_rate": 3.821232123212322e-05, + "loss": 0.0381, + "num_input_tokens_seen": 2931200, + "step": 13895 + }, + { + "epoch": 1.529152915291529, + "grad_norm": 0.13277706503868103, + "learning_rate": 3.822607260726073e-05, + "loss": 0.2064, + "num_input_tokens_seen": 2932288, + "step": 13900 + }, + { + "epoch": 1.5297029702970297, + "grad_norm": 0.45118093490600586, + "learning_rate": 3.823982398239824e-05, + "loss": 0.1222, + "num_input_tokens_seen": 2933280, + "step": 13905 + }, + { + "epoch": 1.5302530253025304, + "grad_norm": 0.3953815698623657, + "learning_rate": 3.825357535753575e-05, + "loss": 0.1395, + "num_input_tokens_seen": 2934336, + "step": 13910 + }, + { + "epoch": 1.5308030803080308, + "grad_norm": 0.40087494254112244, + "learning_rate": 3.826732673267327e-05, + "loss": 0.07, + "num_input_tokens_seen": 2935360, + "step": 13915 + }, + { + "epoch": 1.5313531353135312, + "grad_norm": 0.1523161381483078, + "learning_rate": 3.828107810781078e-05, + "loss": 0.1465, + "num_input_tokens_seen": 2936352, + "step": 13920 + }, + { + "epoch": 1.5319031903190319, + "grad_norm": 0.8892845511436462, + "learning_rate": 3.8294829482948294e-05, + "loss": 0.0595, + "num_input_tokens_seen": 2937408, + "step": 13925 + }, + { + "epoch": 1.5324532453245325, + "grad_norm": 0.3625562787055969, + "learning_rate": 3.830858085808581e-05, + "loss": 0.0927, + "num_input_tokens_seen": 2938560, + "step": 13930 + }, + { + "epoch": 1.533003300330033, + "grad_norm": 0.8016629219055176, + "learning_rate": 3.8322332233223324e-05, + "loss": 0.0974, + "num_input_tokens_seen": 2939584, + "step": 13935 + }, + { + "epoch": 1.5335533553355336, + "grad_norm": 0.5089036226272583, + "learning_rate": 3.833608360836084e-05, + "loss": 0.0925, + "num_input_tokens_seen": 2940640, + "step": 13940 + }, + { + "epoch": 1.534103410341034, + "grad_norm": 0.37000882625579834, + "learning_rate": 3.8349834983498354e-05, + "loss": 0.1003, + "num_input_tokens_seen": 2941664, + "step": 13945 + }, + { + "epoch": 1.5346534653465347, + "grad_norm": 0.7434203624725342, + "learning_rate": 3.8363586358635866e-05, + "loss": 0.0819, + "num_input_tokens_seen": 2942720, + "step": 13950 + }, + { + "epoch": 1.5352035203520353, + "grad_norm": 1.3217408657073975, + "learning_rate": 3.837733773377338e-05, + "loss": 0.1675, + "num_input_tokens_seen": 2943744, + "step": 13955 + }, + { + "epoch": 1.5357535753575358, + "grad_norm": 1.086965799331665, + "learning_rate": 3.8391089108910896e-05, + "loss": 0.0868, + "num_input_tokens_seen": 2944704, + "step": 13960 + }, + { + "epoch": 1.5363036303630362, + "grad_norm": 0.38010555505752563, + "learning_rate": 3.840484048404841e-05, + "loss": 0.1131, + "num_input_tokens_seen": 2945760, + "step": 13965 + }, + { + "epoch": 1.5368536853685368, + "grad_norm": 0.6135572791099548, + "learning_rate": 3.841859185918592e-05, + "loss": 0.1625, + "num_input_tokens_seen": 2946720, + "step": 13970 + }, + { + "epoch": 1.5374037403740375, + "grad_norm": 0.25799909234046936, + "learning_rate": 3.843234323432344e-05, + "loss": 0.1357, + "num_input_tokens_seen": 2947776, + "step": 13975 + }, + { + "epoch": 1.537953795379538, + "grad_norm": 0.25645825266838074, + "learning_rate": 3.844609460946095e-05, + "loss": 0.1629, + "num_input_tokens_seen": 2948800, + "step": 13980 + }, + { + "epoch": 1.5385038503850383, + "grad_norm": 0.5225154757499695, + "learning_rate": 3.845984598459846e-05, + "loss": 0.0971, + "num_input_tokens_seen": 2949824, + "step": 13985 + }, + { + "epoch": 1.539053905390539, + "grad_norm": 0.09436585009098053, + "learning_rate": 3.847359735973597e-05, + "loss": 0.0501, + "num_input_tokens_seen": 2950912, + "step": 13990 + }, + { + "epoch": 1.5396039603960396, + "grad_norm": 0.9918358325958252, + "learning_rate": 3.848734873487349e-05, + "loss": 0.1803, + "num_input_tokens_seen": 2951968, + "step": 13995 + }, + { + "epoch": 1.5401540154015403, + "grad_norm": 0.41957318782806396, + "learning_rate": 3.8501100110011e-05, + "loss": 0.1319, + "num_input_tokens_seen": 2953024, + "step": 14000 + }, + { + "epoch": 1.5407040704070407, + "grad_norm": 0.14164096117019653, + "learning_rate": 3.851485148514852e-05, + "loss": 0.1371, + "num_input_tokens_seen": 2954048, + "step": 14005 + }, + { + "epoch": 1.5412541254125411, + "grad_norm": 0.4917909801006317, + "learning_rate": 3.8528602860286026e-05, + "loss": 0.0874, + "num_input_tokens_seen": 2955040, + "step": 14010 + }, + { + "epoch": 1.5418041804180418, + "grad_norm": 0.3935847282409668, + "learning_rate": 3.8542354235423544e-05, + "loss": 0.0554, + "num_input_tokens_seen": 2956064, + "step": 14015 + }, + { + "epoch": 1.5423542354235424, + "grad_norm": 0.8416143655776978, + "learning_rate": 3.8556105610561056e-05, + "loss": 0.0997, + "num_input_tokens_seen": 2957152, + "step": 14020 + }, + { + "epoch": 1.5429042904290429, + "grad_norm": 0.5107012391090393, + "learning_rate": 3.8569856985698574e-05, + "loss": 0.0894, + "num_input_tokens_seen": 2958208, + "step": 14025 + }, + { + "epoch": 1.5434543454345433, + "grad_norm": 1.2831201553344727, + "learning_rate": 3.8583608360836086e-05, + "loss": 0.1177, + "num_input_tokens_seen": 2959200, + "step": 14030 + }, + { + "epoch": 1.544004400440044, + "grad_norm": 0.425799161195755, + "learning_rate": 3.85973597359736e-05, + "loss": 0.2155, + "num_input_tokens_seen": 2960256, + "step": 14035 + }, + { + "epoch": 1.5445544554455446, + "grad_norm": 0.1317177712917328, + "learning_rate": 3.8611111111111116e-05, + "loss": 0.1085, + "num_input_tokens_seen": 2961312, + "step": 14040 + }, + { + "epoch": 1.5451045104510452, + "grad_norm": 0.20171277225017548, + "learning_rate": 3.862486248624863e-05, + "loss": 0.1934, + "num_input_tokens_seen": 2962400, + "step": 14045 + }, + { + "epoch": 1.5456545654565457, + "grad_norm": 0.6216064095497131, + "learning_rate": 3.863861386138614e-05, + "loss": 0.0603, + "num_input_tokens_seen": 2963488, + "step": 14050 + }, + { + "epoch": 1.546204620462046, + "grad_norm": 0.29080793261528015, + "learning_rate": 3.865236523652365e-05, + "loss": 0.1987, + "num_input_tokens_seen": 2964448, + "step": 14055 + }, + { + "epoch": 1.5467546754675467, + "grad_norm": 0.1019199937582016, + "learning_rate": 3.866611661166117e-05, + "loss": 0.0604, + "num_input_tokens_seen": 2965568, + "step": 14060 + }, + { + "epoch": 1.5473047304730474, + "grad_norm": 0.7471179962158203, + "learning_rate": 3.867986798679868e-05, + "loss": 0.1271, + "num_input_tokens_seen": 2966624, + "step": 14065 + }, + { + "epoch": 1.5478547854785478, + "grad_norm": 0.3162144124507904, + "learning_rate": 3.869361936193619e-05, + "loss": 0.1164, + "num_input_tokens_seen": 2967648, + "step": 14070 + }, + { + "epoch": 1.5484048404840483, + "grad_norm": 0.2641426920890808, + "learning_rate": 3.870737073707371e-05, + "loss": 0.0243, + "num_input_tokens_seen": 2968672, + "step": 14075 + }, + { + "epoch": 1.548954895489549, + "grad_norm": 0.10016254335641861, + "learning_rate": 3.872112211221122e-05, + "loss": 0.0485, + "num_input_tokens_seen": 2969696, + "step": 14080 + }, + { + "epoch": 1.5495049504950495, + "grad_norm": 0.17431902885437012, + "learning_rate": 3.873487348734874e-05, + "loss": 0.0624, + "num_input_tokens_seen": 2970720, + "step": 14085 + }, + { + "epoch": 1.5500550055005502, + "grad_norm": 1.0966272354125977, + "learning_rate": 3.8748624862486246e-05, + "loss": 0.1409, + "num_input_tokens_seen": 2971744, + "step": 14090 + }, + { + "epoch": 1.5506050605060506, + "grad_norm": 0.32480916380882263, + "learning_rate": 3.8762376237623765e-05, + "loss": 0.0542, + "num_input_tokens_seen": 2972768, + "step": 14095 + }, + { + "epoch": 1.551155115511551, + "grad_norm": 0.703676164150238, + "learning_rate": 3.8776127612761276e-05, + "loss": 0.1114, + "num_input_tokens_seen": 2973760, + "step": 14100 + }, + { + "epoch": 1.5517051705170517, + "grad_norm": 0.4863467514514923, + "learning_rate": 3.8789878987898795e-05, + "loss": 0.1179, + "num_input_tokens_seen": 2974784, + "step": 14105 + }, + { + "epoch": 1.5522552255225524, + "grad_norm": 1.6492706537246704, + "learning_rate": 3.8803630363036306e-05, + "loss": 0.0798, + "num_input_tokens_seen": 2975840, + "step": 14110 + }, + { + "epoch": 1.5528052805280528, + "grad_norm": 1.3062403202056885, + "learning_rate": 3.881738173817382e-05, + "loss": 0.145, + "num_input_tokens_seen": 2976832, + "step": 14115 + }, + { + "epoch": 1.5533553355335532, + "grad_norm": 0.15997812151908875, + "learning_rate": 3.8831133113311336e-05, + "loss": 0.0852, + "num_input_tokens_seen": 2977792, + "step": 14120 + }, + { + "epoch": 1.5539053905390539, + "grad_norm": 1.2069268226623535, + "learning_rate": 3.884488448844885e-05, + "loss": 0.1056, + "num_input_tokens_seen": 2978784, + "step": 14125 + }, + { + "epoch": 1.5544554455445545, + "grad_norm": 0.4685227870941162, + "learning_rate": 3.885863586358636e-05, + "loss": 0.0335, + "num_input_tokens_seen": 2979840, + "step": 14130 + }, + { + "epoch": 1.5550055005500552, + "grad_norm": 0.4018576145172119, + "learning_rate": 3.887238723872387e-05, + "loss": 0.0765, + "num_input_tokens_seen": 2980928, + "step": 14135 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.7137395739555359, + "learning_rate": 3.888613861386139e-05, + "loss": 0.0847, + "num_input_tokens_seen": 2982016, + "step": 14140 + }, + { + "epoch": 1.556105610561056, + "grad_norm": 0.6667285561561584, + "learning_rate": 3.88998899889989e-05, + "loss": 0.0684, + "num_input_tokens_seen": 2983136, + "step": 14145 + }, + { + "epoch": 1.5566556655665567, + "grad_norm": 0.4448798596858978, + "learning_rate": 3.891364136413641e-05, + "loss": 0.2593, + "num_input_tokens_seen": 2984192, + "step": 14150 + }, + { + "epoch": 1.5572057205720573, + "grad_norm": 0.20255953073501587, + "learning_rate": 3.8927392739273925e-05, + "loss": 0.0653, + "num_input_tokens_seen": 2985280, + "step": 14155 + }, + { + "epoch": 1.5577557755775577, + "grad_norm": 0.668776273727417, + "learning_rate": 3.894114411441144e-05, + "loss": 0.1026, + "num_input_tokens_seen": 2986400, + "step": 14160 + }, + { + "epoch": 1.5583058305830582, + "grad_norm": 0.43669670820236206, + "learning_rate": 3.895489548954896e-05, + "loss": 0.0408, + "num_input_tokens_seen": 2987456, + "step": 14165 + }, + { + "epoch": 1.5588558855885588, + "grad_norm": 0.28863221406936646, + "learning_rate": 3.896864686468647e-05, + "loss": 0.0333, + "num_input_tokens_seen": 2988448, + "step": 14170 + }, + { + "epoch": 1.5594059405940595, + "grad_norm": 1.1481637954711914, + "learning_rate": 3.8982398239823985e-05, + "loss": 0.0753, + "num_input_tokens_seen": 2989504, + "step": 14175 + }, + { + "epoch": 1.55995599559956, + "grad_norm": 0.760010302066803, + "learning_rate": 3.8996149614961497e-05, + "loss": 0.1367, + "num_input_tokens_seen": 2990528, + "step": 14180 + }, + { + "epoch": 1.5605060506050605, + "grad_norm": 0.6392102241516113, + "learning_rate": 3.9009900990099015e-05, + "loss": 0.0398, + "num_input_tokens_seen": 2991552, + "step": 14185 + }, + { + "epoch": 1.561056105610561, + "grad_norm": 0.5336851477622986, + "learning_rate": 3.902365236523653e-05, + "loss": 0.0404, + "num_input_tokens_seen": 2992576, + "step": 14190 + }, + { + "epoch": 1.5616061606160616, + "grad_norm": 0.40718916058540344, + "learning_rate": 3.903740374037404e-05, + "loss": 0.08, + "num_input_tokens_seen": 2993696, + "step": 14195 + }, + { + "epoch": 1.5621562156215623, + "grad_norm": 0.21292513608932495, + "learning_rate": 3.905115511551155e-05, + "loss": 0.1692, + "num_input_tokens_seen": 2994816, + "step": 14200 + }, + { + "epoch": 1.5627062706270627, + "grad_norm": 0.9177663922309875, + "learning_rate": 3.906490649064907e-05, + "loss": 0.2223, + "num_input_tokens_seen": 2995872, + "step": 14205 + }, + { + "epoch": 1.5632563256325631, + "grad_norm": 0.11842896789312363, + "learning_rate": 3.907865786578658e-05, + "loss": 0.1292, + "num_input_tokens_seen": 2996928, + "step": 14210 + }, + { + "epoch": 1.5638063806380638, + "grad_norm": 0.35964006185531616, + "learning_rate": 3.909240924092409e-05, + "loss": 0.052, + "num_input_tokens_seen": 2998016, + "step": 14215 + }, + { + "epoch": 1.5643564356435644, + "grad_norm": 0.1682671159505844, + "learning_rate": 3.910616061606161e-05, + "loss": 0.1073, + "num_input_tokens_seen": 2999104, + "step": 14220 + }, + { + "epoch": 1.564906490649065, + "grad_norm": 0.17979875206947327, + "learning_rate": 3.911991199119912e-05, + "loss": 0.1885, + "num_input_tokens_seen": 3000224, + "step": 14225 + }, + { + "epoch": 1.5654565456545655, + "grad_norm": 0.4122098684310913, + "learning_rate": 3.913366336633664e-05, + "loss": 0.0659, + "num_input_tokens_seen": 3001216, + "step": 14230 + }, + { + "epoch": 1.566006600660066, + "grad_norm": 0.977638840675354, + "learning_rate": 3.9147414741474145e-05, + "loss": 0.1336, + "num_input_tokens_seen": 3002272, + "step": 14235 + }, + { + "epoch": 1.5665566556655666, + "grad_norm": 0.2641672194004059, + "learning_rate": 3.9161166116611664e-05, + "loss": 0.0837, + "num_input_tokens_seen": 3003296, + "step": 14240 + }, + { + "epoch": 1.5671067106710672, + "grad_norm": 0.54715895652771, + "learning_rate": 3.9174917491749175e-05, + "loss": 0.1288, + "num_input_tokens_seen": 3004320, + "step": 14245 + }, + { + "epoch": 1.5676567656765676, + "grad_norm": 0.5693050026893616, + "learning_rate": 3.9188668866886694e-05, + "loss": 0.0764, + "num_input_tokens_seen": 3005440, + "step": 14250 + }, + { + "epoch": 1.568206820682068, + "grad_norm": 1.8450208902359009, + "learning_rate": 3.9202420242024205e-05, + "loss": 0.1543, + "num_input_tokens_seen": 3006496, + "step": 14255 + }, + { + "epoch": 1.5687568756875687, + "grad_norm": 0.239373117685318, + "learning_rate": 3.921617161716172e-05, + "loss": 0.0528, + "num_input_tokens_seen": 3007616, + "step": 14260 + }, + { + "epoch": 1.5693069306930694, + "grad_norm": 0.3674434721469879, + "learning_rate": 3.9229922992299235e-05, + "loss": 0.1428, + "num_input_tokens_seen": 3008640, + "step": 14265 + }, + { + "epoch": 1.56985698569857, + "grad_norm": 0.7926903367042542, + "learning_rate": 3.924367436743675e-05, + "loss": 0.1072, + "num_input_tokens_seen": 3009728, + "step": 14270 + }, + { + "epoch": 1.5704070407040704, + "grad_norm": 0.6256716251373291, + "learning_rate": 3.925742574257426e-05, + "loss": 0.1094, + "num_input_tokens_seen": 3010752, + "step": 14275 + }, + { + "epoch": 1.5709570957095709, + "grad_norm": 0.8281091451644897, + "learning_rate": 3.927117711771177e-05, + "loss": 0.0548, + "num_input_tokens_seen": 3011776, + "step": 14280 + }, + { + "epoch": 1.5715071507150715, + "grad_norm": 1.0270475149154663, + "learning_rate": 3.928492849284929e-05, + "loss": 0.1599, + "num_input_tokens_seen": 3012928, + "step": 14285 + }, + { + "epoch": 1.5720572057205722, + "grad_norm": 1.1951053142547607, + "learning_rate": 3.92986798679868e-05, + "loss": 0.1678, + "num_input_tokens_seen": 3013920, + "step": 14290 + }, + { + "epoch": 1.5726072607260726, + "grad_norm": 1.0753732919692993, + "learning_rate": 3.931243124312431e-05, + "loss": 0.0981, + "num_input_tokens_seen": 3014976, + "step": 14295 + }, + { + "epoch": 1.573157315731573, + "grad_norm": 0.08916877955198288, + "learning_rate": 3.9326182618261824e-05, + "loss": 0.0683, + "num_input_tokens_seen": 3016064, + "step": 14300 + }, + { + "epoch": 1.5737073707370737, + "grad_norm": 0.46497204899787903, + "learning_rate": 3.933993399339934e-05, + "loss": 0.0408, + "num_input_tokens_seen": 3017152, + "step": 14305 + }, + { + "epoch": 1.5742574257425743, + "grad_norm": 0.8268952369689941, + "learning_rate": 3.935368536853686e-05, + "loss": 0.163, + "num_input_tokens_seen": 3018208, + "step": 14310 + }, + { + "epoch": 1.574807480748075, + "grad_norm": 0.06307977437973022, + "learning_rate": 3.9367436743674365e-05, + "loss": 0.0843, + "num_input_tokens_seen": 3019296, + "step": 14315 + }, + { + "epoch": 1.5753575357535754, + "grad_norm": 0.11825919896364212, + "learning_rate": 3.9381188118811884e-05, + "loss": 0.0374, + "num_input_tokens_seen": 3020320, + "step": 14320 + }, + { + "epoch": 1.5759075907590758, + "grad_norm": 0.4404515326023102, + "learning_rate": 3.9394939493949396e-05, + "loss": 0.1553, + "num_input_tokens_seen": 3021344, + "step": 14325 + }, + { + "epoch": 1.5764576457645765, + "grad_norm": 0.569857656955719, + "learning_rate": 3.9408690869086914e-05, + "loss": 0.0615, + "num_input_tokens_seen": 3022368, + "step": 14330 + }, + { + "epoch": 1.5770077007700771, + "grad_norm": 0.7638111710548401, + "learning_rate": 3.9422442244224426e-05, + "loss": 0.034, + "num_input_tokens_seen": 3023456, + "step": 14335 + }, + { + "epoch": 1.5775577557755776, + "grad_norm": 1.239748239517212, + "learning_rate": 3.943619361936194e-05, + "loss": 0.1386, + "num_input_tokens_seen": 3024512, + "step": 14340 + }, + { + "epoch": 1.578107810781078, + "grad_norm": 0.9415028095245361, + "learning_rate": 3.944994499449945e-05, + "loss": 0.1475, + "num_input_tokens_seen": 3025600, + "step": 14345 + }, + { + "epoch": 1.5786578657865786, + "grad_norm": 1.5169148445129395, + "learning_rate": 3.946369636963697e-05, + "loss": 0.1562, + "num_input_tokens_seen": 3026688, + "step": 14350 + }, + { + "epoch": 1.5792079207920793, + "grad_norm": 0.06257209181785583, + "learning_rate": 3.947744774477448e-05, + "loss": 0.0525, + "num_input_tokens_seen": 3027808, + "step": 14355 + }, + { + "epoch": 1.5797579757975797, + "grad_norm": 0.03381974622607231, + "learning_rate": 3.949119911991199e-05, + "loss": 0.048, + "num_input_tokens_seen": 3028832, + "step": 14360 + }, + { + "epoch": 1.5803080308030804, + "grad_norm": 0.22634316980838776, + "learning_rate": 3.950495049504951e-05, + "loss": 0.0498, + "num_input_tokens_seen": 3029856, + "step": 14365 + }, + { + "epoch": 1.5808580858085808, + "grad_norm": 1.8392438888549805, + "learning_rate": 3.951870187018702e-05, + "loss": 0.2444, + "num_input_tokens_seen": 3030912, + "step": 14370 + }, + { + "epoch": 1.5814081408140814, + "grad_norm": 0.5831330418586731, + "learning_rate": 3.953245324532453e-05, + "loss": 0.1168, + "num_input_tokens_seen": 3031968, + "step": 14375 + }, + { + "epoch": 1.581958195819582, + "grad_norm": 0.8145103454589844, + "learning_rate": 3.9546204620462044e-05, + "loss": 0.1072, + "num_input_tokens_seen": 3033056, + "step": 14380 + }, + { + "epoch": 1.5825082508250825, + "grad_norm": 0.06389785557985306, + "learning_rate": 3.955995599559956e-05, + "loss": 0.0815, + "num_input_tokens_seen": 3034080, + "step": 14385 + }, + { + "epoch": 1.583058305830583, + "grad_norm": 0.8895437121391296, + "learning_rate": 3.9573707370737074e-05, + "loss": 0.0734, + "num_input_tokens_seen": 3035168, + "step": 14390 + }, + { + "epoch": 1.5836083608360836, + "grad_norm": 0.7852998375892639, + "learning_rate": 3.958745874587459e-05, + "loss": 0.1022, + "num_input_tokens_seen": 3036256, + "step": 14395 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.47166192531585693, + "learning_rate": 3.9601210121012104e-05, + "loss": 0.1597, + "num_input_tokens_seen": 3037280, + "step": 14400 + }, + { + "epoch": 1.5847084708470847, + "grad_norm": 1.1935248374938965, + "learning_rate": 3.9614961496149616e-05, + "loss": 0.2251, + "num_input_tokens_seen": 3038304, + "step": 14405 + }, + { + "epoch": 1.5852585258525853, + "grad_norm": 0.4417862892150879, + "learning_rate": 3.9628712871287134e-05, + "loss": 0.054, + "num_input_tokens_seen": 3039296, + "step": 14410 + }, + { + "epoch": 1.5858085808580857, + "grad_norm": 0.2718786895275116, + "learning_rate": 3.9642464246424646e-05, + "loss": 0.0799, + "num_input_tokens_seen": 3040352, + "step": 14415 + }, + { + "epoch": 1.5863586358635864, + "grad_norm": 0.6948838829994202, + "learning_rate": 3.965621562156216e-05, + "loss": 0.0885, + "num_input_tokens_seen": 3041408, + "step": 14420 + }, + { + "epoch": 1.586908690869087, + "grad_norm": 0.45340582728385925, + "learning_rate": 3.966996699669967e-05, + "loss": 0.0698, + "num_input_tokens_seen": 3042432, + "step": 14425 + }, + { + "epoch": 1.5874587458745875, + "grad_norm": 0.3424164056777954, + "learning_rate": 3.968371837183719e-05, + "loss": 0.1189, + "num_input_tokens_seen": 3043392, + "step": 14430 + }, + { + "epoch": 1.588008800880088, + "grad_norm": 1.1540344953536987, + "learning_rate": 3.96974697469747e-05, + "loss": 0.1405, + "num_input_tokens_seen": 3044384, + "step": 14435 + }, + { + "epoch": 1.5885588558855885, + "grad_norm": 0.23182132840156555, + "learning_rate": 3.971122112211221e-05, + "loss": 0.141, + "num_input_tokens_seen": 3045376, + "step": 14440 + }, + { + "epoch": 1.5891089108910892, + "grad_norm": 0.8729972839355469, + "learning_rate": 3.972497249724973e-05, + "loss": 0.1807, + "num_input_tokens_seen": 3046400, + "step": 14445 + }, + { + "epoch": 1.5896589658965896, + "grad_norm": 0.41116851568222046, + "learning_rate": 3.973872387238724e-05, + "loss": 0.0921, + "num_input_tokens_seen": 3047488, + "step": 14450 + }, + { + "epoch": 1.5902090209020903, + "grad_norm": 0.4787972569465637, + "learning_rate": 3.975247524752476e-05, + "loss": 0.1063, + "num_input_tokens_seen": 3048512, + "step": 14455 + }, + { + "epoch": 1.5907590759075907, + "grad_norm": 0.4575387239456177, + "learning_rate": 3.9766226622662264e-05, + "loss": 0.0871, + "num_input_tokens_seen": 3049600, + "step": 14460 + }, + { + "epoch": 1.5913091309130913, + "grad_norm": 0.27950039505958557, + "learning_rate": 3.977997799779978e-05, + "loss": 0.044, + "num_input_tokens_seen": 3050656, + "step": 14465 + }, + { + "epoch": 1.591859185918592, + "grad_norm": 0.8853936791419983, + "learning_rate": 3.9793729372937295e-05, + "loss": 0.1428, + "num_input_tokens_seen": 3051744, + "step": 14470 + }, + { + "epoch": 1.5924092409240924, + "grad_norm": 0.015821857377886772, + "learning_rate": 3.980748074807481e-05, + "loss": 0.04, + "num_input_tokens_seen": 3052736, + "step": 14475 + }, + { + "epoch": 1.5929592959295928, + "grad_norm": 0.19282574951648712, + "learning_rate": 3.982123212321232e-05, + "loss": 0.1603, + "num_input_tokens_seen": 3053824, + "step": 14480 + }, + { + "epoch": 1.5935093509350935, + "grad_norm": 0.386949747800827, + "learning_rate": 3.9834983498349836e-05, + "loss": 0.1172, + "num_input_tokens_seen": 3054880, + "step": 14485 + }, + { + "epoch": 1.5940594059405941, + "grad_norm": 1.1211735010147095, + "learning_rate": 3.9848734873487355e-05, + "loss": 0.1219, + "num_input_tokens_seen": 3055936, + "step": 14490 + }, + { + "epoch": 1.5946094609460946, + "grad_norm": 0.6451030969619751, + "learning_rate": 3.9862486248624866e-05, + "loss": 0.0863, + "num_input_tokens_seen": 3056992, + "step": 14495 + }, + { + "epoch": 1.595159515951595, + "grad_norm": 0.33247631788253784, + "learning_rate": 3.987623762376238e-05, + "loss": 0.0715, + "num_input_tokens_seen": 3058048, + "step": 14500 + }, + { + "epoch": 1.5957095709570956, + "grad_norm": 1.3396141529083252, + "learning_rate": 3.988998899889989e-05, + "loss": 0.1661, + "num_input_tokens_seen": 3059072, + "step": 14505 + }, + { + "epoch": 1.5962596259625963, + "grad_norm": 1.4126694202423096, + "learning_rate": 3.990374037403741e-05, + "loss": 0.2095, + "num_input_tokens_seen": 3060224, + "step": 14510 + }, + { + "epoch": 1.596809680968097, + "grad_norm": 1.837876319885254, + "learning_rate": 3.991749174917492e-05, + "loss": 0.1146, + "num_input_tokens_seen": 3061312, + "step": 14515 + }, + { + "epoch": 1.5973597359735974, + "grad_norm": 0.26255977153778076, + "learning_rate": 3.993124312431243e-05, + "loss": 0.0467, + "num_input_tokens_seen": 3062368, + "step": 14520 + }, + { + "epoch": 1.5979097909790978, + "grad_norm": 1.148353099822998, + "learning_rate": 3.994499449944994e-05, + "loss": 0.1413, + "num_input_tokens_seen": 3063392, + "step": 14525 + }, + { + "epoch": 1.5984598459845984, + "grad_norm": 0.37844398617744446, + "learning_rate": 3.995874587458746e-05, + "loss": 0.1035, + "num_input_tokens_seen": 3064384, + "step": 14530 + }, + { + "epoch": 1.599009900990099, + "grad_norm": 0.18801452219486237, + "learning_rate": 3.997249724972497e-05, + "loss": 0.0723, + "num_input_tokens_seen": 3065376, + "step": 14535 + }, + { + "epoch": 1.5995599559955995, + "grad_norm": 0.13341712951660156, + "learning_rate": 3.9986248624862485e-05, + "loss": 0.05, + "num_input_tokens_seen": 3066464, + "step": 14540 + }, + { + "epoch": 1.6001100110011, + "grad_norm": 0.4131052792072296, + "learning_rate": 4e-05, + "loss": 0.0599, + "num_input_tokens_seen": 3067424, + "step": 14545 + }, + { + "epoch": 1.6006600660066006, + "grad_norm": 0.3810952305793762, + "learning_rate": 4.0013751375137515e-05, + "loss": 0.0393, + "num_input_tokens_seen": 3068512, + "step": 14550 + }, + { + "epoch": 1.6012101210121013, + "grad_norm": 0.13168591260910034, + "learning_rate": 4.002750275027503e-05, + "loss": 0.1203, + "num_input_tokens_seen": 3069568, + "step": 14555 + }, + { + "epoch": 1.601760176017602, + "grad_norm": 0.1528344303369522, + "learning_rate": 4.0041254125412545e-05, + "loss": 0.0729, + "num_input_tokens_seen": 3070624, + "step": 14560 + }, + { + "epoch": 1.6023102310231023, + "grad_norm": 0.3612789511680603, + "learning_rate": 4.005500550055006e-05, + "loss": 0.0813, + "num_input_tokens_seen": 3071712, + "step": 14565 + }, + { + "epoch": 1.6028602860286028, + "grad_norm": 0.2461632937192917, + "learning_rate": 4.006875687568757e-05, + "loss": 0.0806, + "num_input_tokens_seen": 3072768, + "step": 14570 + }, + { + "epoch": 1.6034103410341034, + "grad_norm": 0.20565658807754517, + "learning_rate": 4.008250825082509e-05, + "loss": 0.0844, + "num_input_tokens_seen": 3073792, + "step": 14575 + }, + { + "epoch": 1.603960396039604, + "grad_norm": 0.1605272740125656, + "learning_rate": 4.00962596259626e-05, + "loss": 0.0576, + "num_input_tokens_seen": 3074880, + "step": 14580 + }, + { + "epoch": 1.6045104510451045, + "grad_norm": 1.239594578742981, + "learning_rate": 4.011001100110011e-05, + "loss": 0.1481, + "num_input_tokens_seen": 3076000, + "step": 14585 + }, + { + "epoch": 1.605060506050605, + "grad_norm": 0.24391838908195496, + "learning_rate": 4.012376237623763e-05, + "loss": 0.0749, + "num_input_tokens_seen": 3077056, + "step": 14590 + }, + { + "epoch": 1.6056105610561056, + "grad_norm": 0.4005683362483978, + "learning_rate": 4.013751375137514e-05, + "loss": 0.1352, + "num_input_tokens_seen": 3078144, + "step": 14595 + }, + { + "epoch": 1.6061606160616062, + "grad_norm": 1.2288296222686768, + "learning_rate": 4.015126512651265e-05, + "loss": 0.0815, + "num_input_tokens_seen": 3079200, + "step": 14600 + }, + { + "epoch": 1.6067106710671069, + "grad_norm": 0.8339346051216125, + "learning_rate": 4.0165016501650163e-05, + "loss": 0.1311, + "num_input_tokens_seen": 3080256, + "step": 14605 + }, + { + "epoch": 1.6072607260726073, + "grad_norm": 0.6712623834609985, + "learning_rate": 4.017876787678768e-05, + "loss": 0.0938, + "num_input_tokens_seen": 3081376, + "step": 14610 + }, + { + "epoch": 1.6078107810781077, + "grad_norm": 0.14673662185668945, + "learning_rate": 4.0192519251925194e-05, + "loss": 0.0842, + "num_input_tokens_seen": 3082464, + "step": 14615 + }, + { + "epoch": 1.6083608360836084, + "grad_norm": 1.0401320457458496, + "learning_rate": 4.020627062706271e-05, + "loss": 0.2538, + "num_input_tokens_seen": 3083456, + "step": 14620 + }, + { + "epoch": 1.608910891089109, + "grad_norm": 0.5458011031150818, + "learning_rate": 4.022002200220022e-05, + "loss": 0.0847, + "num_input_tokens_seen": 3084512, + "step": 14625 + }, + { + "epoch": 1.6094609460946094, + "grad_norm": 0.5640442371368408, + "learning_rate": 4.0233773377337735e-05, + "loss": 0.0381, + "num_input_tokens_seen": 3085568, + "step": 14630 + }, + { + "epoch": 1.6100110011001099, + "grad_norm": 1.0084110498428345, + "learning_rate": 4.0247524752475254e-05, + "loss": 0.2321, + "num_input_tokens_seen": 3086592, + "step": 14635 + }, + { + "epoch": 1.6105610561056105, + "grad_norm": 1.0047551393508911, + "learning_rate": 4.0261276127612765e-05, + "loss": 0.1473, + "num_input_tokens_seen": 3087680, + "step": 14640 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.2866940498352051, + "learning_rate": 4.027502750275028e-05, + "loss": 0.0553, + "num_input_tokens_seen": 3088736, + "step": 14645 + }, + { + "epoch": 1.6116611661166118, + "grad_norm": 0.8117822408676147, + "learning_rate": 4.028877887788779e-05, + "loss": 0.1092, + "num_input_tokens_seen": 3089792, + "step": 14650 + }, + { + "epoch": 1.6122112211221122, + "grad_norm": 1.1540013551712036, + "learning_rate": 4.030253025302531e-05, + "loss": 0.1263, + "num_input_tokens_seen": 3090816, + "step": 14655 + }, + { + "epoch": 1.6127612761276127, + "grad_norm": 0.08667613565921783, + "learning_rate": 4.031628162816282e-05, + "loss": 0.0389, + "num_input_tokens_seen": 3091872, + "step": 14660 + }, + { + "epoch": 1.6133113311331133, + "grad_norm": 0.3226078748703003, + "learning_rate": 4.033003300330033e-05, + "loss": 0.0845, + "num_input_tokens_seen": 3092960, + "step": 14665 + }, + { + "epoch": 1.613861386138614, + "grad_norm": 0.6777084469795227, + "learning_rate": 4.034378437843784e-05, + "loss": 0.1298, + "num_input_tokens_seen": 3093984, + "step": 14670 + }, + { + "epoch": 1.6144114411441144, + "grad_norm": 0.699558436870575, + "learning_rate": 4.035753575357536e-05, + "loss": 0.0611, + "num_input_tokens_seen": 3095008, + "step": 14675 + }, + { + "epoch": 1.6149614961496148, + "grad_norm": 0.2569146752357483, + "learning_rate": 4.037128712871288e-05, + "loss": 0.0426, + "num_input_tokens_seen": 3096064, + "step": 14680 + }, + { + "epoch": 1.6155115511551155, + "grad_norm": 0.8902429938316345, + "learning_rate": 4.0385038503850384e-05, + "loss": 0.0962, + "num_input_tokens_seen": 3097088, + "step": 14685 + }, + { + "epoch": 1.6160616061606161, + "grad_norm": 0.5938352942466736, + "learning_rate": 4.03987898789879e-05, + "loss": 0.0576, + "num_input_tokens_seen": 3098144, + "step": 14690 + }, + { + "epoch": 1.6166116611661168, + "grad_norm": 1.0058869123458862, + "learning_rate": 4.0412541254125414e-05, + "loss": 0.0936, + "num_input_tokens_seen": 3099168, + "step": 14695 + }, + { + "epoch": 1.6171617161716172, + "grad_norm": 0.7388920187950134, + "learning_rate": 4.042629262926293e-05, + "loss": 0.1171, + "num_input_tokens_seen": 3100288, + "step": 14700 + }, + { + "epoch": 1.6177117711771176, + "grad_norm": 0.5706870555877686, + "learning_rate": 4.044004400440044e-05, + "loss": 0.0656, + "num_input_tokens_seen": 3101344, + "step": 14705 + }, + { + "epoch": 1.6182618261826183, + "grad_norm": 1.6019775867462158, + "learning_rate": 4.0453795379537956e-05, + "loss": 0.1076, + "num_input_tokens_seen": 3102400, + "step": 14710 + }, + { + "epoch": 1.618811881188119, + "grad_norm": 0.8536680340766907, + "learning_rate": 4.046754675467547e-05, + "loss": 0.1105, + "num_input_tokens_seen": 3103488, + "step": 14715 + }, + { + "epoch": 1.6193619361936193, + "grad_norm": 1.3624101877212524, + "learning_rate": 4.0481298129812986e-05, + "loss": 0.0664, + "num_input_tokens_seen": 3104512, + "step": 14720 + }, + { + "epoch": 1.6199119911991198, + "grad_norm": 0.948969304561615, + "learning_rate": 4.04950495049505e-05, + "loss": 0.1631, + "num_input_tokens_seen": 3105536, + "step": 14725 + }, + { + "epoch": 1.6204620462046204, + "grad_norm": 0.3346261978149414, + "learning_rate": 4.050880088008801e-05, + "loss": 0.1873, + "num_input_tokens_seen": 3106528, + "step": 14730 + }, + { + "epoch": 1.621012101210121, + "grad_norm": 0.2799863815307617, + "learning_rate": 4.052255225522553e-05, + "loss": 0.1741, + "num_input_tokens_seen": 3107584, + "step": 14735 + }, + { + "epoch": 1.6215621562156217, + "grad_norm": 1.4079840183258057, + "learning_rate": 4.053630363036304e-05, + "loss": 0.1394, + "num_input_tokens_seen": 3108704, + "step": 14740 + }, + { + "epoch": 1.6221122112211221, + "grad_norm": 0.2336292713880539, + "learning_rate": 4.055005500550055e-05, + "loss": 0.0377, + "num_input_tokens_seen": 3109760, + "step": 14745 + }, + { + "epoch": 1.6226622662266226, + "grad_norm": 0.05268154293298721, + "learning_rate": 4.056380638063806e-05, + "loss": 0.0499, + "num_input_tokens_seen": 3110816, + "step": 14750 + }, + { + "epoch": 1.6232123212321232, + "grad_norm": 0.45258352160453796, + "learning_rate": 4.057755775577558e-05, + "loss": 0.0868, + "num_input_tokens_seen": 3111808, + "step": 14755 + }, + { + "epoch": 1.6237623762376239, + "grad_norm": 3.159850597381592, + "learning_rate": 4.059130913091309e-05, + "loss": 0.1673, + "num_input_tokens_seen": 3112928, + "step": 14760 + }, + { + "epoch": 1.6243124312431243, + "grad_norm": 0.1483909636735916, + "learning_rate": 4.0605060506050604e-05, + "loss": 0.0789, + "num_input_tokens_seen": 3114048, + "step": 14765 + }, + { + "epoch": 1.6248624862486247, + "grad_norm": 0.7043692469596863, + "learning_rate": 4.061881188118812e-05, + "loss": 0.0789, + "num_input_tokens_seen": 3115136, + "step": 14770 + }, + { + "epoch": 1.6254125412541254, + "grad_norm": 0.10420393943786621, + "learning_rate": 4.0632563256325634e-05, + "loss": 0.1661, + "num_input_tokens_seen": 3116192, + "step": 14775 + }, + { + "epoch": 1.625962596259626, + "grad_norm": 1.862324833869934, + "learning_rate": 4.064631463146315e-05, + "loss": 0.1776, + "num_input_tokens_seen": 3117248, + "step": 14780 + }, + { + "epoch": 1.6265126512651267, + "grad_norm": 0.0520731620490551, + "learning_rate": 4.0660066006600664e-05, + "loss": 0.0809, + "num_input_tokens_seen": 3118304, + "step": 14785 + }, + { + "epoch": 1.627062706270627, + "grad_norm": 0.7076858878135681, + "learning_rate": 4.0673817381738176e-05, + "loss": 0.1556, + "num_input_tokens_seen": 3119392, + "step": 14790 + }, + { + "epoch": 1.6276127612761275, + "grad_norm": 0.30703461170196533, + "learning_rate": 4.068756875687569e-05, + "loss": 0.0971, + "num_input_tokens_seen": 3120544, + "step": 14795 + }, + { + "epoch": 1.6281628162816282, + "grad_norm": 0.6423525214195251, + "learning_rate": 4.0701320132013206e-05, + "loss": 0.0986, + "num_input_tokens_seen": 3121600, + "step": 14800 + }, + { + "epoch": 1.6287128712871288, + "grad_norm": 0.8395255208015442, + "learning_rate": 4.071507150715072e-05, + "loss": 0.1072, + "num_input_tokens_seen": 3122656, + "step": 14805 + }, + { + "epoch": 1.6292629262926293, + "grad_norm": 1.4831196069717407, + "learning_rate": 4.072882288228823e-05, + "loss": 0.0566, + "num_input_tokens_seen": 3123712, + "step": 14810 + }, + { + "epoch": 1.6298129812981297, + "grad_norm": 0.2272280603647232, + "learning_rate": 4.074257425742574e-05, + "loss": 0.042, + "num_input_tokens_seen": 3124736, + "step": 14815 + }, + { + "epoch": 1.6303630363036303, + "grad_norm": 0.06223186105489731, + "learning_rate": 4.075632563256326e-05, + "loss": 0.2045, + "num_input_tokens_seen": 3125728, + "step": 14820 + }, + { + "epoch": 1.630913091309131, + "grad_norm": 0.20209439098834991, + "learning_rate": 4.077007700770077e-05, + "loss": 0.1253, + "num_input_tokens_seen": 3126816, + "step": 14825 + }, + { + "epoch": 1.6314631463146316, + "grad_norm": 0.3089151084423065, + "learning_rate": 4.078382838283828e-05, + "loss": 0.1046, + "num_input_tokens_seen": 3127904, + "step": 14830 + }, + { + "epoch": 1.632013201320132, + "grad_norm": 0.408451646566391, + "learning_rate": 4.07975797579758e-05, + "loss": 0.0831, + "num_input_tokens_seen": 3128960, + "step": 14835 + }, + { + "epoch": 1.6325632563256325, + "grad_norm": 0.6807791590690613, + "learning_rate": 4.081133113311331e-05, + "loss": 0.1523, + "num_input_tokens_seen": 3129984, + "step": 14840 + }, + { + "epoch": 1.6331133113311331, + "grad_norm": 0.18959277868270874, + "learning_rate": 4.082508250825083e-05, + "loss": 0.1265, + "num_input_tokens_seen": 3131040, + "step": 14845 + }, + { + "epoch": 1.6336633663366338, + "grad_norm": 1.1669104099273682, + "learning_rate": 4.0838833883388336e-05, + "loss": 0.1154, + "num_input_tokens_seen": 3132096, + "step": 14850 + }, + { + "epoch": 1.6342134213421342, + "grad_norm": 0.2869950532913208, + "learning_rate": 4.0852585258525855e-05, + "loss": 0.0574, + "num_input_tokens_seen": 3133152, + "step": 14855 + }, + { + "epoch": 1.6347634763476346, + "grad_norm": 0.3192732036113739, + "learning_rate": 4.0866336633663366e-05, + "loss": 0.0502, + "num_input_tokens_seen": 3134272, + "step": 14860 + }, + { + "epoch": 1.6353135313531353, + "grad_norm": 1.6155925989151, + "learning_rate": 4.0880088008800885e-05, + "loss": 0.1256, + "num_input_tokens_seen": 3135360, + "step": 14865 + }, + { + "epoch": 1.635863586358636, + "grad_norm": 0.4387495517730713, + "learning_rate": 4.0893839383938396e-05, + "loss": 0.1097, + "num_input_tokens_seen": 3136480, + "step": 14870 + }, + { + "epoch": 1.6364136413641364, + "grad_norm": 0.5820419192314148, + "learning_rate": 4.090759075907591e-05, + "loss": 0.0715, + "num_input_tokens_seen": 3137504, + "step": 14875 + }, + { + "epoch": 1.636963696369637, + "grad_norm": 0.21422311663627625, + "learning_rate": 4.0921342134213426e-05, + "loss": 0.1592, + "num_input_tokens_seen": 3138560, + "step": 14880 + }, + { + "epoch": 1.6375137513751374, + "grad_norm": 0.3806493580341339, + "learning_rate": 4.093509350935094e-05, + "loss": 0.0425, + "num_input_tokens_seen": 3139648, + "step": 14885 + }, + { + "epoch": 1.638063806380638, + "grad_norm": 0.07084851711988449, + "learning_rate": 4.094884488448845e-05, + "loss": 0.0749, + "num_input_tokens_seen": 3140704, + "step": 14890 + }, + { + "epoch": 1.6386138613861387, + "grad_norm": 0.2028263956308365, + "learning_rate": 4.096259625962596e-05, + "loss": 0.1128, + "num_input_tokens_seen": 3141760, + "step": 14895 + }, + { + "epoch": 1.6391639163916392, + "grad_norm": 0.3887360990047455, + "learning_rate": 4.097634763476348e-05, + "loss": 0.103, + "num_input_tokens_seen": 3142752, + "step": 14900 + }, + { + "epoch": 1.6397139713971396, + "grad_norm": 0.7899550795555115, + "learning_rate": 4.099009900990099e-05, + "loss": 0.1108, + "num_input_tokens_seen": 3143808, + "step": 14905 + }, + { + "epoch": 1.6402640264026402, + "grad_norm": 0.7364814877510071, + "learning_rate": 4.10038503850385e-05, + "loss": 0.0832, + "num_input_tokens_seen": 3144864, + "step": 14910 + }, + { + "epoch": 1.640814081408141, + "grad_norm": 1.112565517425537, + "learning_rate": 4.101760176017602e-05, + "loss": 0.1163, + "num_input_tokens_seen": 3145888, + "step": 14915 + }, + { + "epoch": 1.6413641364136413, + "grad_norm": 0.5616143345832825, + "learning_rate": 4.103135313531353e-05, + "loss": 0.1447, + "num_input_tokens_seen": 3146880, + "step": 14920 + }, + { + "epoch": 1.641914191419142, + "grad_norm": 0.014004690572619438, + "learning_rate": 4.104510451045105e-05, + "loss": 0.0814, + "num_input_tokens_seen": 3147936, + "step": 14925 + }, + { + "epoch": 1.6424642464246424, + "grad_norm": 1.2101542949676514, + "learning_rate": 4.1058855885588557e-05, + "loss": 0.1023, + "num_input_tokens_seen": 3149088, + "step": 14930 + }, + { + "epoch": 1.643014301430143, + "grad_norm": 0.2569274306297302, + "learning_rate": 4.1072607260726075e-05, + "loss": 0.0584, + "num_input_tokens_seen": 3150176, + "step": 14935 + }, + { + "epoch": 1.6435643564356437, + "grad_norm": 1.0217996835708618, + "learning_rate": 4.108635863586359e-05, + "loss": 0.1354, + "num_input_tokens_seen": 3151328, + "step": 14940 + }, + { + "epoch": 1.6441144114411441, + "grad_norm": 0.4726658761501312, + "learning_rate": 4.1100110011001105e-05, + "loss": 0.0752, + "num_input_tokens_seen": 3152384, + "step": 14945 + }, + { + "epoch": 1.6446644664466445, + "grad_norm": 1.2642512321472168, + "learning_rate": 4.111386138613862e-05, + "loss": 0.1351, + "num_input_tokens_seen": 3153504, + "step": 14950 + }, + { + "epoch": 1.6452145214521452, + "grad_norm": 0.8221815824508667, + "learning_rate": 4.112761276127613e-05, + "loss": 0.1925, + "num_input_tokens_seen": 3154560, + "step": 14955 + }, + { + "epoch": 1.6457645764576458, + "grad_norm": 0.43185946345329285, + "learning_rate": 4.114136413641365e-05, + "loss": 0.0948, + "num_input_tokens_seen": 3155616, + "step": 14960 + }, + { + "epoch": 1.6463146314631463, + "grad_norm": 1.255321979522705, + "learning_rate": 4.115511551155116e-05, + "loss": 0.1161, + "num_input_tokens_seen": 3156704, + "step": 14965 + }, + { + "epoch": 1.6468646864686467, + "grad_norm": 0.4198469817638397, + "learning_rate": 4.116886688668867e-05, + "loss": 0.032, + "num_input_tokens_seen": 3157728, + "step": 14970 + }, + { + "epoch": 1.6474147414741473, + "grad_norm": 0.09489233791828156, + "learning_rate": 4.118261826182618e-05, + "loss": 0.1039, + "num_input_tokens_seen": 3158752, + "step": 14975 + }, + { + "epoch": 1.647964796479648, + "grad_norm": 0.2828688621520996, + "learning_rate": 4.11963696369637e-05, + "loss": 0.0883, + "num_input_tokens_seen": 3159776, + "step": 14980 + }, + { + "epoch": 1.6485148514851486, + "grad_norm": 0.22205905616283417, + "learning_rate": 4.121012101210121e-05, + "loss": 0.1227, + "num_input_tokens_seen": 3160832, + "step": 14985 + }, + { + "epoch": 1.649064906490649, + "grad_norm": 0.6461731195449829, + "learning_rate": 4.1223872387238724e-05, + "loss": 0.1196, + "num_input_tokens_seen": 3161920, + "step": 14990 + }, + { + "epoch": 1.6496149614961495, + "grad_norm": 1.5776258707046509, + "learning_rate": 4.1237623762376235e-05, + "loss": 0.1516, + "num_input_tokens_seen": 3163072, + "step": 14995 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.05407121032476425, + "learning_rate": 4.1251375137513754e-05, + "loss": 0.0488, + "num_input_tokens_seen": 3164160, + "step": 15000 + }, + { + "epoch": 1.6507150715071508, + "grad_norm": 1.6538090705871582, + "learning_rate": 4.126512651265127e-05, + "loss": 0.1238, + "num_input_tokens_seen": 3165216, + "step": 15005 + }, + { + "epoch": 1.6512651265126512, + "grad_norm": 0.5985383987426758, + "learning_rate": 4.1278877887788784e-05, + "loss": 0.0862, + "num_input_tokens_seen": 3166304, + "step": 15010 + }, + { + "epoch": 1.6518151815181517, + "grad_norm": 0.3162398040294647, + "learning_rate": 4.1292629262926295e-05, + "loss": 0.0874, + "num_input_tokens_seen": 3167360, + "step": 15015 + }, + { + "epoch": 1.6523652365236523, + "grad_norm": 0.669813871383667, + "learning_rate": 4.130638063806381e-05, + "loss": 0.0781, + "num_input_tokens_seen": 3168480, + "step": 15020 + }, + { + "epoch": 1.652915291529153, + "grad_norm": 1.438652515411377, + "learning_rate": 4.1320132013201325e-05, + "loss": 0.1461, + "num_input_tokens_seen": 3169568, + "step": 15025 + }, + { + "epoch": 1.6534653465346536, + "grad_norm": 1.5093554258346558, + "learning_rate": 4.133388338833884e-05, + "loss": 0.175, + "num_input_tokens_seen": 3170592, + "step": 15030 + }, + { + "epoch": 1.654015401540154, + "grad_norm": 0.6472808718681335, + "learning_rate": 4.134763476347635e-05, + "loss": 0.1653, + "num_input_tokens_seen": 3171648, + "step": 15035 + }, + { + "epoch": 1.6545654565456545, + "grad_norm": 0.3065539300441742, + "learning_rate": 4.136138613861386e-05, + "loss": 0.0767, + "num_input_tokens_seen": 3172736, + "step": 15040 + }, + { + "epoch": 1.655115511551155, + "grad_norm": 0.4285143315792084, + "learning_rate": 4.137513751375138e-05, + "loss": 0.0897, + "num_input_tokens_seen": 3173792, + "step": 15045 + }, + { + "epoch": 1.6556655665566558, + "grad_norm": 0.5077517628669739, + "learning_rate": 4.138888888888889e-05, + "loss": 0.1199, + "num_input_tokens_seen": 3174880, + "step": 15050 + }, + { + "epoch": 1.6562156215621562, + "grad_norm": 1.0934946537017822, + "learning_rate": 4.14026402640264e-05, + "loss": 0.1396, + "num_input_tokens_seen": 3175968, + "step": 15055 + }, + { + "epoch": 1.6567656765676566, + "grad_norm": 0.3483816683292389, + "learning_rate": 4.141639163916392e-05, + "loss": 0.0655, + "num_input_tokens_seen": 3177056, + "step": 15060 + }, + { + "epoch": 1.6573157315731573, + "grad_norm": 0.26372230052948, + "learning_rate": 4.143014301430143e-05, + "loss": 0.1099, + "num_input_tokens_seen": 3178144, + "step": 15065 + }, + { + "epoch": 1.657865786578658, + "grad_norm": 0.658473014831543, + "learning_rate": 4.144389438943895e-05, + "loss": 0.0689, + "num_input_tokens_seen": 3179168, + "step": 15070 + }, + { + "epoch": 1.6584158415841586, + "grad_norm": 3.0189120769500732, + "learning_rate": 4.1457645764576456e-05, + "loss": 0.129, + "num_input_tokens_seen": 3180224, + "step": 15075 + }, + { + "epoch": 1.658965896589659, + "grad_norm": 1.8456051349639893, + "learning_rate": 4.1471397139713974e-05, + "loss": 0.1117, + "num_input_tokens_seen": 3181248, + "step": 15080 + }, + { + "epoch": 1.6595159515951594, + "grad_norm": 0.9663506150245667, + "learning_rate": 4.1485148514851486e-05, + "loss": 0.1607, + "num_input_tokens_seen": 3182336, + "step": 15085 + }, + { + "epoch": 1.66006600660066, + "grad_norm": 1.5620126724243164, + "learning_rate": 4.1498899889989004e-05, + "loss": 0.1467, + "num_input_tokens_seen": 3183392, + "step": 15090 + }, + { + "epoch": 1.6606160616061607, + "grad_norm": 0.583629310131073, + "learning_rate": 4.151265126512651e-05, + "loss": 0.1342, + "num_input_tokens_seen": 3184480, + "step": 15095 + }, + { + "epoch": 1.6611661166116611, + "grad_norm": 0.5818629860877991, + "learning_rate": 4.152640264026403e-05, + "loss": 0.1086, + "num_input_tokens_seen": 3185472, + "step": 15100 + }, + { + "epoch": 1.6617161716171616, + "grad_norm": 0.3876453936100006, + "learning_rate": 4.1540154015401546e-05, + "loss": 0.1007, + "num_input_tokens_seen": 3186528, + "step": 15105 + }, + { + "epoch": 1.6622662266226622, + "grad_norm": 0.8106303811073303, + "learning_rate": 4.155390539053906e-05, + "loss": 0.1539, + "num_input_tokens_seen": 3187520, + "step": 15110 + }, + { + "epoch": 1.6628162816281629, + "grad_norm": 0.7556512951850891, + "learning_rate": 4.156765676567657e-05, + "loss": 0.1052, + "num_input_tokens_seen": 3188544, + "step": 15115 + }, + { + "epoch": 1.6633663366336635, + "grad_norm": 0.5404903292655945, + "learning_rate": 4.158140814081408e-05, + "loss": 0.0939, + "num_input_tokens_seen": 3189632, + "step": 15120 + }, + { + "epoch": 1.663916391639164, + "grad_norm": 0.08360553532838821, + "learning_rate": 4.15951595159516e-05, + "loss": 0.0966, + "num_input_tokens_seen": 3190688, + "step": 15125 + }, + { + "epoch": 1.6644664466446644, + "grad_norm": 1.7752355337142944, + "learning_rate": 4.160891089108911e-05, + "loss": 0.1782, + "num_input_tokens_seen": 3191712, + "step": 15130 + }, + { + "epoch": 1.665016501650165, + "grad_norm": 0.4100207984447479, + "learning_rate": 4.162266226622662e-05, + "loss": 0.0757, + "num_input_tokens_seen": 3192736, + "step": 15135 + }, + { + "epoch": 1.6655665566556657, + "grad_norm": 0.4239916503429413, + "learning_rate": 4.1636413641364134e-05, + "loss": 0.0851, + "num_input_tokens_seen": 3193728, + "step": 15140 + }, + { + "epoch": 1.666116611661166, + "grad_norm": 1.8156495094299316, + "learning_rate": 4.165016501650165e-05, + "loss": 0.1163, + "num_input_tokens_seen": 3194784, + "step": 15145 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.101002536714077, + "learning_rate": 4.166391639163917e-05, + "loss": 0.1196, + "num_input_tokens_seen": 3195840, + "step": 15150 + }, + { + "epoch": 1.6672167216721672, + "grad_norm": 0.5493484139442444, + "learning_rate": 4.1677667766776676e-05, + "loss": 0.1893, + "num_input_tokens_seen": 3196928, + "step": 15155 + }, + { + "epoch": 1.6677667766776678, + "grad_norm": 1.850878357887268, + "learning_rate": 4.1691419141914194e-05, + "loss": 0.1531, + "num_input_tokens_seen": 3198016, + "step": 15160 + }, + { + "epoch": 1.6683168316831685, + "grad_norm": 0.43208548426628113, + "learning_rate": 4.1705170517051706e-05, + "loss": 0.0576, + "num_input_tokens_seen": 3199072, + "step": 15165 + }, + { + "epoch": 1.668866886688669, + "grad_norm": 1.371974229812622, + "learning_rate": 4.1718921892189224e-05, + "loss": 0.1697, + "num_input_tokens_seen": 3200128, + "step": 15170 + }, + { + "epoch": 1.6694169416941693, + "grad_norm": 1.2204720973968506, + "learning_rate": 4.1732673267326736e-05, + "loss": 0.079, + "num_input_tokens_seen": 3201120, + "step": 15175 + }, + { + "epoch": 1.66996699669967, + "grad_norm": 0.09175815433263779, + "learning_rate": 4.174642464246425e-05, + "loss": 0.0825, + "num_input_tokens_seen": 3202112, + "step": 15180 + }, + { + "epoch": 1.6705170517051706, + "grad_norm": 0.22959323227405548, + "learning_rate": 4.176017601760176e-05, + "loss": 0.0978, + "num_input_tokens_seen": 3203136, + "step": 15185 + }, + { + "epoch": 1.671067106710671, + "grad_norm": 0.2680257558822632, + "learning_rate": 4.177392739273928e-05, + "loss": 0.0208, + "num_input_tokens_seen": 3204224, + "step": 15190 + }, + { + "epoch": 1.6716171617161715, + "grad_norm": 0.5740973949432373, + "learning_rate": 4.178767876787679e-05, + "loss": 0.1143, + "num_input_tokens_seen": 3205344, + "step": 15195 + }, + { + "epoch": 1.6721672167216721, + "grad_norm": 0.4773351848125458, + "learning_rate": 4.18014301430143e-05, + "loss": 0.0981, + "num_input_tokens_seen": 3206336, + "step": 15200 + }, + { + "epoch": 1.6727172717271728, + "grad_norm": 0.16363665461540222, + "learning_rate": 4.181518151815182e-05, + "loss": 0.0809, + "num_input_tokens_seen": 3207456, + "step": 15205 + }, + { + "epoch": 1.6732673267326734, + "grad_norm": 0.98543781042099, + "learning_rate": 4.182893289328933e-05, + "loss": 0.0666, + "num_input_tokens_seen": 3208480, + "step": 15210 + }, + { + "epoch": 1.6738173817381738, + "grad_norm": 0.1695813089609146, + "learning_rate": 4.184268426842684e-05, + "loss": 0.1273, + "num_input_tokens_seen": 3209536, + "step": 15215 + }, + { + "epoch": 1.6743674367436743, + "grad_norm": 0.026274237781763077, + "learning_rate": 4.1856435643564355e-05, + "loss": 0.0737, + "num_input_tokens_seen": 3210592, + "step": 15220 + }, + { + "epoch": 1.674917491749175, + "grad_norm": 0.8020855188369751, + "learning_rate": 4.187018701870187e-05, + "loss": 0.0967, + "num_input_tokens_seen": 3211584, + "step": 15225 + }, + { + "epoch": 1.6754675467546756, + "grad_norm": 0.5579555630683899, + "learning_rate": 4.1883938393839385e-05, + "loss": 0.1702, + "num_input_tokens_seen": 3212640, + "step": 15230 + }, + { + "epoch": 1.676017601760176, + "grad_norm": 0.6832210421562195, + "learning_rate": 4.18976897689769e-05, + "loss": 0.0709, + "num_input_tokens_seen": 3213664, + "step": 15235 + }, + { + "epoch": 1.6765676567656764, + "grad_norm": 0.1584293097257614, + "learning_rate": 4.1911441144114415e-05, + "loss": 0.0658, + "num_input_tokens_seen": 3214784, + "step": 15240 + }, + { + "epoch": 1.677117711771177, + "grad_norm": 0.6653137803077698, + "learning_rate": 4.1925192519251926e-05, + "loss": 0.0448, + "num_input_tokens_seen": 3215872, + "step": 15245 + }, + { + "epoch": 1.6776677667766777, + "grad_norm": 0.08614132553339005, + "learning_rate": 4.1938943894389445e-05, + "loss": 0.037, + "num_input_tokens_seen": 3216896, + "step": 15250 + }, + { + "epoch": 1.6782178217821784, + "grad_norm": 0.9299843907356262, + "learning_rate": 4.1952695269526956e-05, + "loss": 0.0811, + "num_input_tokens_seen": 3217920, + "step": 15255 + }, + { + "epoch": 1.6787678767876788, + "grad_norm": 0.5119621753692627, + "learning_rate": 4.196644664466447e-05, + "loss": 0.0625, + "num_input_tokens_seen": 3218944, + "step": 15260 + }, + { + "epoch": 1.6793179317931792, + "grad_norm": 0.23178733885288239, + "learning_rate": 4.198019801980198e-05, + "loss": 0.1784, + "num_input_tokens_seen": 3219904, + "step": 15265 + }, + { + "epoch": 1.6798679867986799, + "grad_norm": 0.3928689658641815, + "learning_rate": 4.19939493949395e-05, + "loss": 0.1247, + "num_input_tokens_seen": 3221056, + "step": 15270 + }, + { + "epoch": 1.6804180418041805, + "grad_norm": 0.3759753704071045, + "learning_rate": 4.200770077007701e-05, + "loss": 0.1465, + "num_input_tokens_seen": 3222080, + "step": 15275 + }, + { + "epoch": 1.680968096809681, + "grad_norm": 1.4606338739395142, + "learning_rate": 4.202145214521452e-05, + "loss": 0.1757, + "num_input_tokens_seen": 3223232, + "step": 15280 + }, + { + "epoch": 1.6815181518151814, + "grad_norm": 0.1803155094385147, + "learning_rate": 4.203520352035203e-05, + "loss": 0.0758, + "num_input_tokens_seen": 3224256, + "step": 15285 + }, + { + "epoch": 1.682068206820682, + "grad_norm": 0.19740675389766693, + "learning_rate": 4.204895489548955e-05, + "loss": 0.0778, + "num_input_tokens_seen": 3225312, + "step": 15290 + }, + { + "epoch": 1.6826182618261827, + "grad_norm": 0.34185054898262024, + "learning_rate": 4.206270627062707e-05, + "loss": 0.0783, + "num_input_tokens_seen": 3226368, + "step": 15295 + }, + { + "epoch": 1.6831683168316833, + "grad_norm": 0.49198290705680847, + "learning_rate": 4.2076457645764575e-05, + "loss": 0.0702, + "num_input_tokens_seen": 3227424, + "step": 15300 + }, + { + "epoch": 1.6837183718371838, + "grad_norm": 0.32357057929039, + "learning_rate": 4.209020902090209e-05, + "loss": 0.0808, + "num_input_tokens_seen": 3228512, + "step": 15305 + }, + { + "epoch": 1.6842684268426842, + "grad_norm": 0.4735960066318512, + "learning_rate": 4.2103960396039605e-05, + "loss": 0.096, + "num_input_tokens_seen": 3229504, + "step": 15310 + }, + { + "epoch": 1.6848184818481848, + "grad_norm": 0.35125651955604553, + "learning_rate": 4.2117711771177123e-05, + "loss": 0.1444, + "num_input_tokens_seen": 3230560, + "step": 15315 + }, + { + "epoch": 1.6853685368536855, + "grad_norm": 0.20301440358161926, + "learning_rate": 4.213146314631463e-05, + "loss": 0.0891, + "num_input_tokens_seen": 3231648, + "step": 15320 + }, + { + "epoch": 1.685918591859186, + "grad_norm": 0.44776198267936707, + "learning_rate": 4.214521452145215e-05, + "loss": 0.255, + "num_input_tokens_seen": 3232704, + "step": 15325 + }, + { + "epoch": 1.6864686468646863, + "grad_norm": 0.13561686873435974, + "learning_rate": 4.215896589658966e-05, + "loss": 0.125, + "num_input_tokens_seen": 3233824, + "step": 15330 + }, + { + "epoch": 1.687018701870187, + "grad_norm": 0.7291394472122192, + "learning_rate": 4.217271727172718e-05, + "loss": 0.0861, + "num_input_tokens_seen": 3234848, + "step": 15335 + }, + { + "epoch": 1.6875687568756876, + "grad_norm": 0.4341597557067871, + "learning_rate": 4.218646864686469e-05, + "loss": 0.0509, + "num_input_tokens_seen": 3235968, + "step": 15340 + }, + { + "epoch": 1.688118811881188, + "grad_norm": 1.2024083137512207, + "learning_rate": 4.22002200220022e-05, + "loss": 0.0621, + "num_input_tokens_seen": 3237024, + "step": 15345 + }, + { + "epoch": 1.6886688668866887, + "grad_norm": 0.03303994983434677, + "learning_rate": 4.221397139713972e-05, + "loss": 0.1092, + "num_input_tokens_seen": 3238112, + "step": 15350 + }, + { + "epoch": 1.6892189218921891, + "grad_norm": 1.200775146484375, + "learning_rate": 4.222772277227723e-05, + "loss": 0.1167, + "num_input_tokens_seen": 3239136, + "step": 15355 + }, + { + "epoch": 1.6897689768976898, + "grad_norm": 1.5715299844741821, + "learning_rate": 4.224147414741474e-05, + "loss": 0.1094, + "num_input_tokens_seen": 3240160, + "step": 15360 + }, + { + "epoch": 1.6903190319031904, + "grad_norm": 0.4675613343715668, + "learning_rate": 4.2255225522552254e-05, + "loss": 0.0845, + "num_input_tokens_seen": 3241280, + "step": 15365 + }, + { + "epoch": 1.6908690869086909, + "grad_norm": 0.7514757513999939, + "learning_rate": 4.226897689768977e-05, + "loss": 0.1051, + "num_input_tokens_seen": 3242304, + "step": 15370 + }, + { + "epoch": 1.6914191419141913, + "grad_norm": 0.46364474296569824, + "learning_rate": 4.2282728272827284e-05, + "loss": 0.0448, + "num_input_tokens_seen": 3243328, + "step": 15375 + }, + { + "epoch": 1.691969196919692, + "grad_norm": 0.156573086977005, + "learning_rate": 4.2296479647964795e-05, + "loss": 0.16, + "num_input_tokens_seen": 3244384, + "step": 15380 + }, + { + "epoch": 1.6925192519251926, + "grad_norm": 0.2268248051404953, + "learning_rate": 4.2310231023102314e-05, + "loss": 0.0777, + "num_input_tokens_seen": 3245472, + "step": 15385 + }, + { + "epoch": 1.693069306930693, + "grad_norm": 1.7875877618789673, + "learning_rate": 4.2323982398239825e-05, + "loss": 0.1364, + "num_input_tokens_seen": 3246528, + "step": 15390 + }, + { + "epoch": 1.6936193619361937, + "grad_norm": 0.44744566082954407, + "learning_rate": 4.2337733773377344e-05, + "loss": 0.1249, + "num_input_tokens_seen": 3247616, + "step": 15395 + }, + { + "epoch": 1.694169416941694, + "grad_norm": 0.4257092773914337, + "learning_rate": 4.2351485148514855e-05, + "loss": 0.097, + "num_input_tokens_seen": 3248736, + "step": 15400 + }, + { + "epoch": 1.6947194719471947, + "grad_norm": 0.3535780608654022, + "learning_rate": 4.236523652365237e-05, + "loss": 0.0702, + "num_input_tokens_seen": 3249824, + "step": 15405 + }, + { + "epoch": 1.6952695269526954, + "grad_norm": 0.7636797428131104, + "learning_rate": 4.237898789878988e-05, + "loss": 0.0641, + "num_input_tokens_seen": 3250880, + "step": 15410 + }, + { + "epoch": 1.6958195819581958, + "grad_norm": 0.3346405625343323, + "learning_rate": 4.23927392739274e-05, + "loss": 0.1009, + "num_input_tokens_seen": 3251872, + "step": 15415 + }, + { + "epoch": 1.6963696369636962, + "grad_norm": 0.23092211782932281, + "learning_rate": 4.240649064906491e-05, + "loss": 0.0189, + "num_input_tokens_seen": 3252928, + "step": 15420 + }, + { + "epoch": 1.696919691969197, + "grad_norm": 0.47295117378234863, + "learning_rate": 4.242024202420242e-05, + "loss": 0.0311, + "num_input_tokens_seen": 3253984, + "step": 15425 + }, + { + "epoch": 1.6974697469746975, + "grad_norm": 0.0767972469329834, + "learning_rate": 4.243399339933994e-05, + "loss": 0.0716, + "num_input_tokens_seen": 3255008, + "step": 15430 + }, + { + "epoch": 1.698019801980198, + "grad_norm": 2.0791170597076416, + "learning_rate": 4.244774477447745e-05, + "loss": 0.1489, + "num_input_tokens_seen": 3256096, + "step": 15435 + }, + { + "epoch": 1.6985698569856986, + "grad_norm": 0.12472908198833466, + "learning_rate": 4.246149614961496e-05, + "loss": 0.0899, + "num_input_tokens_seen": 3257184, + "step": 15440 + }, + { + "epoch": 1.699119911991199, + "grad_norm": 0.8568087816238403, + "learning_rate": 4.2475247524752474e-05, + "loss": 0.1473, + "num_input_tokens_seen": 3258208, + "step": 15445 + }, + { + "epoch": 1.6996699669966997, + "grad_norm": 1.0596871376037598, + "learning_rate": 4.248899889988999e-05, + "loss": 0.164, + "num_input_tokens_seen": 3259328, + "step": 15450 + }, + { + "epoch": 1.7002200220022003, + "grad_norm": 0.4454914927482605, + "learning_rate": 4.2502750275027504e-05, + "loss": 0.0794, + "num_input_tokens_seen": 3260416, + "step": 15455 + }, + { + "epoch": 1.7007700770077008, + "grad_norm": 0.28830307722091675, + "learning_rate": 4.251650165016502e-05, + "loss": 0.1657, + "num_input_tokens_seen": 3261536, + "step": 15460 + }, + { + "epoch": 1.7013201320132012, + "grad_norm": 0.1915234476327896, + "learning_rate": 4.253025302530253e-05, + "loss": 0.0496, + "num_input_tokens_seen": 3262720, + "step": 15465 + }, + { + "epoch": 1.7018701870187019, + "grad_norm": 0.44653722643852234, + "learning_rate": 4.2544004400440046e-05, + "loss": 0.0474, + "num_input_tokens_seen": 3263808, + "step": 15470 + }, + { + "epoch": 1.7024202420242025, + "grad_norm": 1.4098337888717651, + "learning_rate": 4.2557755775577564e-05, + "loss": 0.122, + "num_input_tokens_seen": 3264800, + "step": 15475 + }, + { + "epoch": 1.702970297029703, + "grad_norm": 0.04477832093834877, + "learning_rate": 4.2571507150715076e-05, + "loss": 0.0354, + "num_input_tokens_seen": 3265952, + "step": 15480 + }, + { + "epoch": 1.7035203520352034, + "grad_norm": 0.8902339339256287, + "learning_rate": 4.258525852585259e-05, + "loss": 0.1348, + "num_input_tokens_seen": 3266976, + "step": 15485 + }, + { + "epoch": 1.704070407040704, + "grad_norm": 1.722641110420227, + "learning_rate": 4.25990099009901e-05, + "loss": 0.1244, + "num_input_tokens_seen": 3267968, + "step": 15490 + }, + { + "epoch": 1.7046204620462047, + "grad_norm": 0.183444544672966, + "learning_rate": 4.261276127612762e-05, + "loss": 0.0716, + "num_input_tokens_seen": 3269056, + "step": 15495 + }, + { + "epoch": 1.7051705170517053, + "grad_norm": 1.4706462621688843, + "learning_rate": 4.262651265126513e-05, + "loss": 0.0842, + "num_input_tokens_seen": 3270080, + "step": 15500 + }, + { + "epoch": 1.7057205720572057, + "grad_norm": 2.310751438140869, + "learning_rate": 4.264026402640264e-05, + "loss": 0.1819, + "num_input_tokens_seen": 3271104, + "step": 15505 + }, + { + "epoch": 1.7062706270627062, + "grad_norm": 0.029385093599557877, + "learning_rate": 4.265401540154015e-05, + "loss": 0.0481, + "num_input_tokens_seen": 3272128, + "step": 15510 + }, + { + "epoch": 1.7068206820682068, + "grad_norm": 0.36663228273391724, + "learning_rate": 4.266776677667767e-05, + "loss": 0.1731, + "num_input_tokens_seen": 3273248, + "step": 15515 + }, + { + "epoch": 1.7073707370737075, + "grad_norm": 1.7960718870162964, + "learning_rate": 4.268151815181519e-05, + "loss": 0.186, + "num_input_tokens_seen": 3274272, + "step": 15520 + }, + { + "epoch": 1.7079207920792079, + "grad_norm": 0.2865848243236542, + "learning_rate": 4.2695269526952694e-05, + "loss": 0.078, + "num_input_tokens_seen": 3275232, + "step": 15525 + }, + { + "epoch": 1.7084708470847083, + "grad_norm": 0.8713964819908142, + "learning_rate": 4.270902090209021e-05, + "loss": 0.0898, + "num_input_tokens_seen": 3276256, + "step": 15530 + }, + { + "epoch": 1.709020902090209, + "grad_norm": 1.2911831140518188, + "learning_rate": 4.2722772277227724e-05, + "loss": 0.2946, + "num_input_tokens_seen": 3277280, + "step": 15535 + }, + { + "epoch": 1.7095709570957096, + "grad_norm": 0.057930391281843185, + "learning_rate": 4.273652365236524e-05, + "loss": 0.108, + "num_input_tokens_seen": 3278272, + "step": 15540 + }, + { + "epoch": 1.7101210121012103, + "grad_norm": 0.3662884533405304, + "learning_rate": 4.275027502750275e-05, + "loss": 0.0837, + "num_input_tokens_seen": 3279360, + "step": 15545 + }, + { + "epoch": 1.7106710671067107, + "grad_norm": 0.7143902778625488, + "learning_rate": 4.2764026402640266e-05, + "loss": 0.0643, + "num_input_tokens_seen": 3280448, + "step": 15550 + }, + { + "epoch": 1.7112211221122111, + "grad_norm": 0.11803371459245682, + "learning_rate": 4.277777777777778e-05, + "loss": 0.0944, + "num_input_tokens_seen": 3281472, + "step": 15555 + }, + { + "epoch": 1.7117711771177118, + "grad_norm": 0.369458943605423, + "learning_rate": 4.2791529152915296e-05, + "loss": 0.072, + "num_input_tokens_seen": 3282528, + "step": 15560 + }, + { + "epoch": 1.7123212321232124, + "grad_norm": 0.4262913763523102, + "learning_rate": 4.280528052805281e-05, + "loss": 0.1411, + "num_input_tokens_seen": 3283584, + "step": 15565 + }, + { + "epoch": 1.7128712871287128, + "grad_norm": 0.4488135874271393, + "learning_rate": 4.281903190319032e-05, + "loss": 0.1073, + "num_input_tokens_seen": 3284608, + "step": 15570 + }, + { + "epoch": 1.7134213421342133, + "grad_norm": 0.36252379417419434, + "learning_rate": 4.283278327832784e-05, + "loss": 0.0272, + "num_input_tokens_seen": 3285600, + "step": 15575 + }, + { + "epoch": 1.713971397139714, + "grad_norm": 0.2646045684814453, + "learning_rate": 4.284653465346535e-05, + "loss": 0.0671, + "num_input_tokens_seen": 3286688, + "step": 15580 + }, + { + "epoch": 1.7145214521452146, + "grad_norm": 0.2897568941116333, + "learning_rate": 4.286028602860286e-05, + "loss": 0.0995, + "num_input_tokens_seen": 3287808, + "step": 15585 + }, + { + "epoch": 1.7150715071507152, + "grad_norm": 0.3123484253883362, + "learning_rate": 4.287403740374037e-05, + "loss": 0.0878, + "num_input_tokens_seen": 3288864, + "step": 15590 + }, + { + "epoch": 1.7156215621562156, + "grad_norm": 1.0103871822357178, + "learning_rate": 4.288778877887789e-05, + "loss": 0.1534, + "num_input_tokens_seen": 3289920, + "step": 15595 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.1493414044380188, + "learning_rate": 4.29015401540154e-05, + "loss": 0.0287, + "num_input_tokens_seen": 3290976, + "step": 15600 + }, + { + "epoch": 1.7167216721672167, + "grad_norm": 0.9152912497520447, + "learning_rate": 4.2915291529152915e-05, + "loss": 0.1828, + "num_input_tokens_seen": 3292000, + "step": 15605 + }, + { + "epoch": 1.7172717271727174, + "grad_norm": 0.6446062922477722, + "learning_rate": 4.2929042904290426e-05, + "loss": 0.1251, + "num_input_tokens_seen": 3293056, + "step": 15610 + }, + { + "epoch": 1.7178217821782178, + "grad_norm": 1.512433648109436, + "learning_rate": 4.2942794279427945e-05, + "loss": 0.1131, + "num_input_tokens_seen": 3294080, + "step": 15615 + }, + { + "epoch": 1.7183718371837182, + "grad_norm": 0.844835638999939, + "learning_rate": 4.295654565456546e-05, + "loss": 0.1737, + "num_input_tokens_seen": 3295136, + "step": 15620 + }, + { + "epoch": 1.7189218921892189, + "grad_norm": 1.1350959539413452, + "learning_rate": 4.2970297029702975e-05, + "loss": 0.1018, + "num_input_tokens_seen": 3296288, + "step": 15625 + }, + { + "epoch": 1.7194719471947195, + "grad_norm": 0.2119835913181305, + "learning_rate": 4.2984048404840487e-05, + "loss": 0.038, + "num_input_tokens_seen": 3297344, + "step": 15630 + }, + { + "epoch": 1.7200220022002202, + "grad_norm": 0.6303231716156006, + "learning_rate": 4.2997799779978e-05, + "loss": 0.0741, + "num_input_tokens_seen": 3298432, + "step": 15635 + }, + { + "epoch": 1.7205720572057206, + "grad_norm": 0.31483885645866394, + "learning_rate": 4.3011551155115517e-05, + "loss": 0.0639, + "num_input_tokens_seen": 3299424, + "step": 15640 + }, + { + "epoch": 1.721122112211221, + "grad_norm": 0.2223777323961258, + "learning_rate": 4.302530253025303e-05, + "loss": 0.0746, + "num_input_tokens_seen": 3300480, + "step": 15645 + }, + { + "epoch": 1.7216721672167217, + "grad_norm": 0.5530178546905518, + "learning_rate": 4.303905390539054e-05, + "loss": 0.1, + "num_input_tokens_seen": 3301504, + "step": 15650 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 1.3308100700378418, + "learning_rate": 4.305280528052805e-05, + "loss": 0.0596, + "num_input_tokens_seen": 3302656, + "step": 15655 + }, + { + "epoch": 1.7227722772277227, + "grad_norm": 0.15529319643974304, + "learning_rate": 4.306655665566557e-05, + "loss": 0.0758, + "num_input_tokens_seen": 3303712, + "step": 15660 + }, + { + "epoch": 1.7233223322332232, + "grad_norm": 0.2008226215839386, + "learning_rate": 4.308030803080308e-05, + "loss": 0.0311, + "num_input_tokens_seen": 3304800, + "step": 15665 + }, + { + "epoch": 1.7238723872387238, + "grad_norm": 0.4038276970386505, + "learning_rate": 4.309405940594059e-05, + "loss": 0.0817, + "num_input_tokens_seen": 3305792, + "step": 15670 + }, + { + "epoch": 1.7244224422442245, + "grad_norm": 0.2526167631149292, + "learning_rate": 4.310781078107811e-05, + "loss": 0.105, + "num_input_tokens_seen": 3306848, + "step": 15675 + }, + { + "epoch": 1.7249724972497251, + "grad_norm": 0.25772905349731445, + "learning_rate": 4.3121562156215623e-05, + "loss": 0.0245, + "num_input_tokens_seen": 3307872, + "step": 15680 + }, + { + "epoch": 1.7255225522552256, + "grad_norm": 1.0551375150680542, + "learning_rate": 4.313531353135314e-05, + "loss": 0.2154, + "num_input_tokens_seen": 3308928, + "step": 15685 + }, + { + "epoch": 1.726072607260726, + "grad_norm": 0.5301448106765747, + "learning_rate": 4.314906490649065e-05, + "loss": 0.1612, + "num_input_tokens_seen": 3309984, + "step": 15690 + }, + { + "epoch": 1.7266226622662266, + "grad_norm": 0.30923113226890564, + "learning_rate": 4.3162816281628165e-05, + "loss": 0.1603, + "num_input_tokens_seen": 3311040, + "step": 15695 + }, + { + "epoch": 1.7271727172717273, + "grad_norm": 0.4895942807197571, + "learning_rate": 4.317656765676568e-05, + "loss": 0.0787, + "num_input_tokens_seen": 3312192, + "step": 15700 + }, + { + "epoch": 1.7277227722772277, + "grad_norm": 0.06393909454345703, + "learning_rate": 4.3190319031903195e-05, + "loss": 0.0351, + "num_input_tokens_seen": 3313216, + "step": 15705 + }, + { + "epoch": 1.7282728272827281, + "grad_norm": 0.8423793315887451, + "learning_rate": 4.320407040704071e-05, + "loss": 0.1661, + "num_input_tokens_seen": 3314240, + "step": 15710 + }, + { + "epoch": 1.7288228822882288, + "grad_norm": 0.1641324758529663, + "learning_rate": 4.321782178217822e-05, + "loss": 0.0399, + "num_input_tokens_seen": 3315296, + "step": 15715 + }, + { + "epoch": 1.7293729372937294, + "grad_norm": 0.9419856667518616, + "learning_rate": 4.323157315731574e-05, + "loss": 0.1276, + "num_input_tokens_seen": 3316288, + "step": 15720 + }, + { + "epoch": 1.72992299229923, + "grad_norm": 0.7294703125953674, + "learning_rate": 4.324532453245325e-05, + "loss": 0.1156, + "num_input_tokens_seen": 3317376, + "step": 15725 + }, + { + "epoch": 1.7304730473047305, + "grad_norm": 0.20797251164913177, + "learning_rate": 4.325907590759076e-05, + "loss": 0.0942, + "num_input_tokens_seen": 3318464, + "step": 15730 + }, + { + "epoch": 1.731023102310231, + "grad_norm": 0.024229399859905243, + "learning_rate": 4.327282728272827e-05, + "loss": 0.0444, + "num_input_tokens_seen": 3319520, + "step": 15735 + }, + { + "epoch": 1.7315731573157316, + "grad_norm": 0.7402864694595337, + "learning_rate": 4.328657865786579e-05, + "loss": 0.0969, + "num_input_tokens_seen": 3320576, + "step": 15740 + }, + { + "epoch": 1.7321232123212322, + "grad_norm": 0.48836395144462585, + "learning_rate": 4.33003300330033e-05, + "loss": 0.0783, + "num_input_tokens_seen": 3321536, + "step": 15745 + }, + { + "epoch": 1.7326732673267327, + "grad_norm": 1.0160654783248901, + "learning_rate": 4.3314081408140814e-05, + "loss": 0.1781, + "num_input_tokens_seen": 3322592, + "step": 15750 + }, + { + "epoch": 1.733223322332233, + "grad_norm": 0.6587823629379272, + "learning_rate": 4.332783278327833e-05, + "loss": 0.1997, + "num_input_tokens_seen": 3323712, + "step": 15755 + }, + { + "epoch": 1.7337733773377337, + "grad_norm": 0.33723318576812744, + "learning_rate": 4.3341584158415844e-05, + "loss": 0.0422, + "num_input_tokens_seen": 3324768, + "step": 15760 + }, + { + "epoch": 1.7343234323432344, + "grad_norm": 0.43444356322288513, + "learning_rate": 4.335533553355336e-05, + "loss": 0.0793, + "num_input_tokens_seen": 3325824, + "step": 15765 + }, + { + "epoch": 1.734873487348735, + "grad_norm": 0.08084696531295776, + "learning_rate": 4.336908690869087e-05, + "loss": 0.0469, + "num_input_tokens_seen": 3326880, + "step": 15770 + }, + { + "epoch": 1.7354235423542355, + "grad_norm": 0.09072709083557129, + "learning_rate": 4.3382838283828386e-05, + "loss": 0.0799, + "num_input_tokens_seen": 3327904, + "step": 15775 + }, + { + "epoch": 1.7359735973597359, + "grad_norm": 1.412994146347046, + "learning_rate": 4.33965896589659e-05, + "loss": 0.1298, + "num_input_tokens_seen": 3328960, + "step": 15780 + }, + { + "epoch": 1.7365236523652365, + "grad_norm": 0.18652990460395813, + "learning_rate": 4.3410341034103416e-05, + "loss": 0.0966, + "num_input_tokens_seen": 3330016, + "step": 15785 + }, + { + "epoch": 1.7370737073707372, + "grad_norm": 0.5735145807266235, + "learning_rate": 4.342409240924093e-05, + "loss": 0.0648, + "num_input_tokens_seen": 3331040, + "step": 15790 + }, + { + "epoch": 1.7376237623762376, + "grad_norm": 0.2783939838409424, + "learning_rate": 4.343784378437844e-05, + "loss": 0.0946, + "num_input_tokens_seen": 3332128, + "step": 15795 + }, + { + "epoch": 1.738173817381738, + "grad_norm": 0.6088883280754089, + "learning_rate": 4.345159515951595e-05, + "loss": 0.0508, + "num_input_tokens_seen": 3333216, + "step": 15800 + }, + { + "epoch": 1.7387238723872387, + "grad_norm": 0.2598150968551636, + "learning_rate": 4.346534653465347e-05, + "loss": 0.0781, + "num_input_tokens_seen": 3334240, + "step": 15805 + }, + { + "epoch": 1.7392739273927393, + "grad_norm": 0.06277503818273544, + "learning_rate": 4.347909790979098e-05, + "loss": 0.0844, + "num_input_tokens_seen": 3335328, + "step": 15810 + }, + { + "epoch": 1.73982398239824, + "grad_norm": 1.6163300275802612, + "learning_rate": 4.349284928492849e-05, + "loss": 0.1554, + "num_input_tokens_seen": 3336288, + "step": 15815 + }, + { + "epoch": 1.7403740374037404, + "grad_norm": 0.0756157711148262, + "learning_rate": 4.350660066006601e-05, + "loss": 0.0687, + "num_input_tokens_seen": 3337344, + "step": 15820 + }, + { + "epoch": 1.7409240924092408, + "grad_norm": 0.33479228615760803, + "learning_rate": 4.352035203520352e-05, + "loss": 0.0484, + "num_input_tokens_seen": 3338368, + "step": 15825 + }, + { + "epoch": 1.7414741474147415, + "grad_norm": 0.5278699398040771, + "learning_rate": 4.3534103410341034e-05, + "loss": 0.0466, + "num_input_tokens_seen": 3339424, + "step": 15830 + }, + { + "epoch": 1.7420242024202421, + "grad_norm": 0.04041624814271927, + "learning_rate": 4.3547854785478546e-05, + "loss": 0.0742, + "num_input_tokens_seen": 3340512, + "step": 15835 + }, + { + "epoch": 1.7425742574257426, + "grad_norm": 0.2424328625202179, + "learning_rate": 4.3561606160616064e-05, + "loss": 0.0491, + "num_input_tokens_seen": 3341568, + "step": 15840 + }, + { + "epoch": 1.743124312431243, + "grad_norm": 0.31806111335754395, + "learning_rate": 4.3575357535753576e-05, + "loss": 0.1068, + "num_input_tokens_seen": 3342688, + "step": 15845 + }, + { + "epoch": 1.7436743674367436, + "grad_norm": 0.4716373383998871, + "learning_rate": 4.3589108910891094e-05, + "loss": 0.0781, + "num_input_tokens_seen": 3343680, + "step": 15850 + }, + { + "epoch": 1.7442244224422443, + "grad_norm": 0.9895553588867188, + "learning_rate": 4.3602860286028606e-05, + "loss": 0.1217, + "num_input_tokens_seen": 3344736, + "step": 15855 + }, + { + "epoch": 1.7447744774477447, + "grad_norm": 2.2260334491729736, + "learning_rate": 4.361661166116612e-05, + "loss": 0.2101, + "num_input_tokens_seen": 3345760, + "step": 15860 + }, + { + "epoch": 1.7453245324532454, + "grad_norm": 0.21715910732746124, + "learning_rate": 4.3630363036303636e-05, + "loss": 0.0675, + "num_input_tokens_seen": 3346816, + "step": 15865 + }, + { + "epoch": 1.7458745874587458, + "grad_norm": 1.172498106956482, + "learning_rate": 4.364411441144115e-05, + "loss": 0.152, + "num_input_tokens_seen": 3347872, + "step": 15870 + }, + { + "epoch": 1.7464246424642464, + "grad_norm": 0.0599919818341732, + "learning_rate": 4.365786578657866e-05, + "loss": 0.1121, + "num_input_tokens_seen": 3348928, + "step": 15875 + }, + { + "epoch": 1.746974697469747, + "grad_norm": 0.03647712990641594, + "learning_rate": 4.367161716171617e-05, + "loss": 0.0526, + "num_input_tokens_seen": 3350048, + "step": 15880 + }, + { + "epoch": 1.7475247524752475, + "grad_norm": 0.7260103821754456, + "learning_rate": 4.368536853685369e-05, + "loss": 0.1025, + "num_input_tokens_seen": 3351040, + "step": 15885 + }, + { + "epoch": 1.748074807480748, + "grad_norm": 1.068367600440979, + "learning_rate": 4.36991199119912e-05, + "loss": 0.0412, + "num_input_tokens_seen": 3352096, + "step": 15890 + }, + { + "epoch": 1.7486248624862486, + "grad_norm": 0.14987316727638245, + "learning_rate": 4.371287128712871e-05, + "loss": 0.0802, + "num_input_tokens_seen": 3353184, + "step": 15895 + }, + { + "epoch": 1.7491749174917492, + "grad_norm": 0.26616665720939636, + "learning_rate": 4.372662266226623e-05, + "loss": 0.0389, + "num_input_tokens_seen": 3354272, + "step": 15900 + }, + { + "epoch": 1.7497249724972497, + "grad_norm": 0.2751552164554596, + "learning_rate": 4.374037403740374e-05, + "loss": 0.1421, + "num_input_tokens_seen": 3355328, + "step": 15905 + }, + { + "epoch": 1.7502750275027503, + "grad_norm": 0.42162656784057617, + "learning_rate": 4.375412541254126e-05, + "loss": 0.0354, + "num_input_tokens_seen": 3356416, + "step": 15910 + }, + { + "epoch": 1.7508250825082508, + "grad_norm": 0.5298181176185608, + "learning_rate": 4.3767876787678766e-05, + "loss": 0.1136, + "num_input_tokens_seen": 3357440, + "step": 15915 + }, + { + "epoch": 1.7513751375137514, + "grad_norm": 0.7169519066810608, + "learning_rate": 4.3781628162816284e-05, + "loss": 0.1453, + "num_input_tokens_seen": 3358560, + "step": 15920 + }, + { + "epoch": 1.751925192519252, + "grad_norm": 0.14587652683258057, + "learning_rate": 4.3795379537953796e-05, + "loss": 0.0781, + "num_input_tokens_seen": 3359648, + "step": 15925 + }, + { + "epoch": 1.7524752475247525, + "grad_norm": 0.3518576920032501, + "learning_rate": 4.3809130913091315e-05, + "loss": 0.1173, + "num_input_tokens_seen": 3360672, + "step": 15930 + }, + { + "epoch": 1.753025302530253, + "grad_norm": 1.3980557918548584, + "learning_rate": 4.382288228822882e-05, + "loss": 0.1526, + "num_input_tokens_seen": 3361696, + "step": 15935 + }, + { + "epoch": 1.7535753575357536, + "grad_norm": 1.5341086387634277, + "learning_rate": 4.383663366336634e-05, + "loss": 0.1671, + "num_input_tokens_seen": 3362720, + "step": 15940 + }, + { + "epoch": 1.7541254125412542, + "grad_norm": 0.6836516857147217, + "learning_rate": 4.3850385038503856e-05, + "loss": 0.0469, + "num_input_tokens_seen": 3363776, + "step": 15945 + }, + { + "epoch": 1.7546754675467546, + "grad_norm": 1.3134472370147705, + "learning_rate": 4.386413641364137e-05, + "loss": 0.103, + "num_input_tokens_seen": 3364832, + "step": 15950 + }, + { + "epoch": 1.7552255225522553, + "grad_norm": 3.1366779804229736, + "learning_rate": 4.387788778877888e-05, + "loss": 0.1147, + "num_input_tokens_seen": 3365856, + "step": 15955 + }, + { + "epoch": 1.7557755775577557, + "grad_norm": 0.23616738617420197, + "learning_rate": 4.389163916391639e-05, + "loss": 0.0925, + "num_input_tokens_seen": 3366976, + "step": 15960 + }, + { + "epoch": 1.7563256325632564, + "grad_norm": 0.22447995841503143, + "learning_rate": 4.390539053905391e-05, + "loss": 0.0673, + "num_input_tokens_seen": 3368032, + "step": 15965 + }, + { + "epoch": 1.756875687568757, + "grad_norm": 0.424257755279541, + "learning_rate": 4.391914191419142e-05, + "loss": 0.0467, + "num_input_tokens_seen": 3369088, + "step": 15970 + }, + { + "epoch": 1.7574257425742574, + "grad_norm": 0.4305118918418884, + "learning_rate": 4.393289328932893e-05, + "loss": 0.0313, + "num_input_tokens_seen": 3370144, + "step": 15975 + }, + { + "epoch": 1.7579757975797579, + "grad_norm": 0.28201037645339966, + "learning_rate": 4.3946644664466445e-05, + "loss": 0.1423, + "num_input_tokens_seen": 3371200, + "step": 15980 + }, + { + "epoch": 1.7585258525852585, + "grad_norm": 0.1323656588792801, + "learning_rate": 4.396039603960396e-05, + "loss": 0.1139, + "num_input_tokens_seen": 3372256, + "step": 15985 + }, + { + "epoch": 1.7590759075907592, + "grad_norm": 0.14522281289100647, + "learning_rate": 4.397414741474148e-05, + "loss": 0.125, + "num_input_tokens_seen": 3373280, + "step": 15990 + }, + { + "epoch": 1.7596259625962596, + "grad_norm": 0.5993466973304749, + "learning_rate": 4.3987898789878986e-05, + "loss": 0.0931, + "num_input_tokens_seen": 3374336, + "step": 15995 + }, + { + "epoch": 1.76017601760176, + "grad_norm": 1.335776448249817, + "learning_rate": 4.4001650165016505e-05, + "loss": 0.188, + "num_input_tokens_seen": 3375360, + "step": 16000 + }, + { + "epoch": 1.7607260726072607, + "grad_norm": 0.33885571360588074, + "learning_rate": 4.4015401540154017e-05, + "loss": 0.0691, + "num_input_tokens_seen": 3376416, + "step": 16005 + }, + { + "epoch": 1.7612761276127613, + "grad_norm": 0.057964783161878586, + "learning_rate": 4.4029152915291535e-05, + "loss": 0.0674, + "num_input_tokens_seen": 3377472, + "step": 16010 + }, + { + "epoch": 1.761826182618262, + "grad_norm": 1.1330845355987549, + "learning_rate": 4.4042904290429047e-05, + "loss": 0.1047, + "num_input_tokens_seen": 3378528, + "step": 16015 + }, + { + "epoch": 1.7623762376237624, + "grad_norm": 0.48567649722099304, + "learning_rate": 4.405665566556656e-05, + "loss": 0.0659, + "num_input_tokens_seen": 3379648, + "step": 16020 + }, + { + "epoch": 1.7629262926292628, + "grad_norm": 0.12823835015296936, + "learning_rate": 4.407040704070407e-05, + "loss": 0.0176, + "num_input_tokens_seen": 3380768, + "step": 16025 + }, + { + "epoch": 1.7634763476347635, + "grad_norm": 0.18711401522159576, + "learning_rate": 4.408415841584159e-05, + "loss": 0.1086, + "num_input_tokens_seen": 3381920, + "step": 16030 + }, + { + "epoch": 1.7640264026402641, + "grad_norm": 0.3657953143119812, + "learning_rate": 4.40979097909791e-05, + "loss": 0.2656, + "num_input_tokens_seen": 3382976, + "step": 16035 + }, + { + "epoch": 1.7645764576457645, + "grad_norm": 0.2126644253730774, + "learning_rate": 4.411166116611661e-05, + "loss": 0.0538, + "num_input_tokens_seen": 3384096, + "step": 16040 + }, + { + "epoch": 1.765126512651265, + "grad_norm": 0.43505004048347473, + "learning_rate": 4.412541254125413e-05, + "loss": 0.0868, + "num_input_tokens_seen": 3385120, + "step": 16045 + }, + { + "epoch": 1.7656765676567656, + "grad_norm": 0.5311605334281921, + "learning_rate": 4.413916391639164e-05, + "loss": 0.0888, + "num_input_tokens_seen": 3386208, + "step": 16050 + }, + { + "epoch": 1.7662266226622663, + "grad_norm": 0.6635057330131531, + "learning_rate": 4.4152915291529153e-05, + "loss": 0.0617, + "num_input_tokens_seen": 3387232, + "step": 16055 + }, + { + "epoch": 1.766776677667767, + "grad_norm": 0.7172558903694153, + "learning_rate": 4.4166666666666665e-05, + "loss": 0.044, + "num_input_tokens_seen": 3388256, + "step": 16060 + }, + { + "epoch": 1.7673267326732673, + "grad_norm": 0.36596938967704773, + "learning_rate": 4.4180418041804183e-05, + "loss": 0.091, + "num_input_tokens_seen": 3389344, + "step": 16065 + }, + { + "epoch": 1.7678767876787678, + "grad_norm": 0.32058507204055786, + "learning_rate": 4.4194169416941695e-05, + "loss": 0.1157, + "num_input_tokens_seen": 3390400, + "step": 16070 + }, + { + "epoch": 1.7684268426842684, + "grad_norm": 0.7047101855278015, + "learning_rate": 4.4207920792079214e-05, + "loss": 0.0993, + "num_input_tokens_seen": 3391456, + "step": 16075 + }, + { + "epoch": 1.768976897689769, + "grad_norm": 0.09523092955350876, + "learning_rate": 4.422167216721672e-05, + "loss": 0.0653, + "num_input_tokens_seen": 3392576, + "step": 16080 + }, + { + "epoch": 1.7695269526952695, + "grad_norm": 0.5234764218330383, + "learning_rate": 4.423542354235424e-05, + "loss": 0.0404, + "num_input_tokens_seen": 3393696, + "step": 16085 + }, + { + "epoch": 1.77007700770077, + "grad_norm": 0.759030818939209, + "learning_rate": 4.4249174917491755e-05, + "loss": 0.149, + "num_input_tokens_seen": 3394688, + "step": 16090 + }, + { + "epoch": 1.7706270627062706, + "grad_norm": 0.4733004570007324, + "learning_rate": 4.426292629262927e-05, + "loss": 0.0969, + "num_input_tokens_seen": 3395712, + "step": 16095 + }, + { + "epoch": 1.7711771177117712, + "grad_norm": 1.8025351762771606, + "learning_rate": 4.427667766776678e-05, + "loss": 0.3359, + "num_input_tokens_seen": 3396768, + "step": 16100 + }, + { + "epoch": 1.7717271727172719, + "grad_norm": 1.3709163665771484, + "learning_rate": 4.429042904290429e-05, + "loss": 0.1173, + "num_input_tokens_seen": 3397792, + "step": 16105 + }, + { + "epoch": 1.7722772277227723, + "grad_norm": 0.2023913711309433, + "learning_rate": 4.430418041804181e-05, + "loss": 0.0602, + "num_input_tokens_seen": 3398848, + "step": 16110 + }, + { + "epoch": 1.7728272827282727, + "grad_norm": 1.9205501079559326, + "learning_rate": 4.431793179317932e-05, + "loss": 0.096, + "num_input_tokens_seen": 3399968, + "step": 16115 + }, + { + "epoch": 1.7733773377337734, + "grad_norm": 0.12574470043182373, + "learning_rate": 4.433168316831683e-05, + "loss": 0.0383, + "num_input_tokens_seen": 3401024, + "step": 16120 + }, + { + "epoch": 1.773927392739274, + "grad_norm": 0.7498872876167297, + "learning_rate": 4.4345434543454344e-05, + "loss": 0.1227, + "num_input_tokens_seen": 3402048, + "step": 16125 + }, + { + "epoch": 1.7744774477447744, + "grad_norm": 0.2251761257648468, + "learning_rate": 4.435918591859186e-05, + "loss": 0.1327, + "num_input_tokens_seen": 3403136, + "step": 16130 + }, + { + "epoch": 1.7750275027502749, + "grad_norm": 0.6672793030738831, + "learning_rate": 4.437293729372938e-05, + "loss": 0.0713, + "num_input_tokens_seen": 3404192, + "step": 16135 + }, + { + "epoch": 1.7755775577557755, + "grad_norm": 2.08801007270813, + "learning_rate": 4.4386688668866885e-05, + "loss": 0.1193, + "num_input_tokens_seen": 3405248, + "step": 16140 + }, + { + "epoch": 1.7761276127612762, + "grad_norm": 0.24011504650115967, + "learning_rate": 4.4400440044004404e-05, + "loss": 0.1009, + "num_input_tokens_seen": 3406304, + "step": 16145 + }, + { + "epoch": 1.7766776677667768, + "grad_norm": 0.36839568614959717, + "learning_rate": 4.4414191419141916e-05, + "loss": 0.1137, + "num_input_tokens_seen": 3407360, + "step": 16150 + }, + { + "epoch": 1.7772277227722773, + "grad_norm": 0.32695141434669495, + "learning_rate": 4.4427942794279434e-05, + "loss": 0.0789, + "num_input_tokens_seen": 3408480, + "step": 16155 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.22764550149440765, + "learning_rate": 4.444169416941694e-05, + "loss": 0.1304, + "num_input_tokens_seen": 3409504, + "step": 16160 + }, + { + "epoch": 1.7783278327832783, + "grad_norm": 0.1632443368434906, + "learning_rate": 4.445544554455446e-05, + "loss": 0.1279, + "num_input_tokens_seen": 3410560, + "step": 16165 + }, + { + "epoch": 1.778877887788779, + "grad_norm": 0.6937828063964844, + "learning_rate": 4.446919691969197e-05, + "loss": 0.1667, + "num_input_tokens_seen": 3411616, + "step": 16170 + }, + { + "epoch": 1.7794279427942794, + "grad_norm": 0.5860657691955566, + "learning_rate": 4.448294829482949e-05, + "loss": 0.0736, + "num_input_tokens_seen": 3412608, + "step": 16175 + }, + { + "epoch": 1.7799779977997798, + "grad_norm": 1.3069536685943604, + "learning_rate": 4.4496699669967e-05, + "loss": 0.1259, + "num_input_tokens_seen": 3413664, + "step": 16180 + }, + { + "epoch": 1.7805280528052805, + "grad_norm": 0.48397547006607056, + "learning_rate": 4.451045104510451e-05, + "loss": 0.0759, + "num_input_tokens_seen": 3414688, + "step": 16185 + }, + { + "epoch": 1.7810781078107811, + "grad_norm": 0.29950428009033203, + "learning_rate": 4.452420242024203e-05, + "loss": 0.0448, + "num_input_tokens_seen": 3415744, + "step": 16190 + }, + { + "epoch": 1.7816281628162818, + "grad_norm": 0.07312231510877609, + "learning_rate": 4.453795379537954e-05, + "loss": 0.0294, + "num_input_tokens_seen": 3416800, + "step": 16195 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 0.7949062585830688, + "learning_rate": 4.455170517051705e-05, + "loss": 0.0495, + "num_input_tokens_seen": 3417920, + "step": 16200 + }, + { + "epoch": 1.7827282728272826, + "grad_norm": 1.258863925933838, + "learning_rate": 4.4565456545654564e-05, + "loss": 0.162, + "num_input_tokens_seen": 3419008, + "step": 16205 + }, + { + "epoch": 1.7832783278327833, + "grad_norm": 0.5699275732040405, + "learning_rate": 4.457920792079208e-05, + "loss": 0.0847, + "num_input_tokens_seen": 3420064, + "step": 16210 + }, + { + "epoch": 1.783828382838284, + "grad_norm": 0.2505934536457062, + "learning_rate": 4.4592959295929594e-05, + "loss": 0.0916, + "num_input_tokens_seen": 3421056, + "step": 16215 + }, + { + "epoch": 1.7843784378437844, + "grad_norm": 0.2918126881122589, + "learning_rate": 4.4606710671067106e-05, + "loss": 0.173, + "num_input_tokens_seen": 3422080, + "step": 16220 + }, + { + "epoch": 1.7849284928492848, + "grad_norm": 0.24101100862026215, + "learning_rate": 4.4620462046204624e-05, + "loss": 0.2266, + "num_input_tokens_seen": 3423232, + "step": 16225 + }, + { + "epoch": 1.7854785478547854, + "grad_norm": 1.8168046474456787, + "learning_rate": 4.4634213421342136e-05, + "loss": 0.0771, + "num_input_tokens_seen": 3424320, + "step": 16230 + }, + { + "epoch": 1.786028602860286, + "grad_norm": 0.3977178931236267, + "learning_rate": 4.4647964796479654e-05, + "loss": 0.1442, + "num_input_tokens_seen": 3425344, + "step": 16235 + }, + { + "epoch": 1.7865786578657867, + "grad_norm": 1.3458155393600464, + "learning_rate": 4.4661716171617166e-05, + "loss": 0.1999, + "num_input_tokens_seen": 3426400, + "step": 16240 + }, + { + "epoch": 1.7871287128712872, + "grad_norm": 0.09889614582061768, + "learning_rate": 4.467546754675468e-05, + "loss": 0.0966, + "num_input_tokens_seen": 3427488, + "step": 16245 + }, + { + "epoch": 1.7876787678767876, + "grad_norm": 0.44010382890701294, + "learning_rate": 4.468921892189219e-05, + "loss": 0.1639, + "num_input_tokens_seen": 3428544, + "step": 16250 + }, + { + "epoch": 1.7882288228822882, + "grad_norm": 0.15189120173454285, + "learning_rate": 4.470297029702971e-05, + "loss": 0.1261, + "num_input_tokens_seen": 3429600, + "step": 16255 + }, + { + "epoch": 1.7887788778877889, + "grad_norm": 0.2523922622203827, + "learning_rate": 4.471672167216722e-05, + "loss": 0.039, + "num_input_tokens_seen": 3430656, + "step": 16260 + }, + { + "epoch": 1.7893289328932893, + "grad_norm": 1.702744722366333, + "learning_rate": 4.473047304730473e-05, + "loss": 0.1622, + "num_input_tokens_seen": 3431680, + "step": 16265 + }, + { + "epoch": 1.7898789878987897, + "grad_norm": 0.14777404069900513, + "learning_rate": 4.474422442244225e-05, + "loss": 0.076, + "num_input_tokens_seen": 3432736, + "step": 16270 + }, + { + "epoch": 1.7904290429042904, + "grad_norm": 1.331701636314392, + "learning_rate": 4.475797579757976e-05, + "loss": 0.0787, + "num_input_tokens_seen": 3433824, + "step": 16275 + }, + { + "epoch": 1.790979097909791, + "grad_norm": 0.41204699873924255, + "learning_rate": 4.477172717271727e-05, + "loss": 0.0468, + "num_input_tokens_seen": 3434848, + "step": 16280 + }, + { + "epoch": 1.7915291529152917, + "grad_norm": 0.8503662347793579, + "learning_rate": 4.4785478547854784e-05, + "loss": 0.077, + "num_input_tokens_seen": 3435904, + "step": 16285 + }, + { + "epoch": 1.7920792079207921, + "grad_norm": 0.6471123695373535, + "learning_rate": 4.47992299229923e-05, + "loss": 0.1279, + "num_input_tokens_seen": 3437056, + "step": 16290 + }, + { + "epoch": 1.7926292629262925, + "grad_norm": 0.0953652411699295, + "learning_rate": 4.4812981298129815e-05, + "loss": 0.0537, + "num_input_tokens_seen": 3438144, + "step": 16295 + }, + { + "epoch": 1.7931793179317932, + "grad_norm": 0.08235855400562286, + "learning_rate": 4.482673267326733e-05, + "loss": 0.0193, + "num_input_tokens_seen": 3439168, + "step": 16300 + }, + { + "epoch": 1.7937293729372938, + "grad_norm": 0.08847884833812714, + "learning_rate": 4.484048404840484e-05, + "loss": 0.042, + "num_input_tokens_seen": 3440224, + "step": 16305 + }, + { + "epoch": 1.7942794279427943, + "grad_norm": 0.2543696165084839, + "learning_rate": 4.4854235423542356e-05, + "loss": 0.1724, + "num_input_tokens_seen": 3441312, + "step": 16310 + }, + { + "epoch": 1.7948294829482947, + "grad_norm": 0.303745836019516, + "learning_rate": 4.486798679867987e-05, + "loss": 0.0721, + "num_input_tokens_seen": 3442368, + "step": 16315 + }, + { + "epoch": 1.7953795379537953, + "grad_norm": 0.43115583062171936, + "learning_rate": 4.4881738173817386e-05, + "loss": 0.0489, + "num_input_tokens_seen": 3443456, + "step": 16320 + }, + { + "epoch": 1.795929592959296, + "grad_norm": 0.4407898187637329, + "learning_rate": 4.48954895489549e-05, + "loss": 0.1056, + "num_input_tokens_seen": 3444480, + "step": 16325 + }, + { + "epoch": 1.7964796479647966, + "grad_norm": 0.6379287242889404, + "learning_rate": 4.490924092409241e-05, + "loss": 0.0657, + "num_input_tokens_seen": 3445536, + "step": 16330 + }, + { + "epoch": 1.797029702970297, + "grad_norm": 0.18934166431427002, + "learning_rate": 4.492299229922993e-05, + "loss": 0.0574, + "num_input_tokens_seen": 3446592, + "step": 16335 + }, + { + "epoch": 1.7975797579757975, + "grad_norm": 0.8346429467201233, + "learning_rate": 4.493674367436744e-05, + "loss": 0.1406, + "num_input_tokens_seen": 3447680, + "step": 16340 + }, + { + "epoch": 1.7981298129812981, + "grad_norm": 0.6549205183982849, + "learning_rate": 4.495049504950495e-05, + "loss": 0.1042, + "num_input_tokens_seen": 3448704, + "step": 16345 + }, + { + "epoch": 1.7986798679867988, + "grad_norm": 1.4357531070709229, + "learning_rate": 4.496424642464246e-05, + "loss": 0.1885, + "num_input_tokens_seen": 3449760, + "step": 16350 + }, + { + "epoch": 1.7992299229922992, + "grad_norm": 0.6038101315498352, + "learning_rate": 4.497799779977998e-05, + "loss": 0.1382, + "num_input_tokens_seen": 3450816, + "step": 16355 + }, + { + "epoch": 1.7997799779977997, + "grad_norm": 0.4453345537185669, + "learning_rate": 4.499174917491749e-05, + "loss": 0.059, + "num_input_tokens_seen": 3451936, + "step": 16360 + }, + { + "epoch": 1.8003300330033003, + "grad_norm": 0.36702051758766174, + "learning_rate": 4.5005500550055005e-05, + "loss": 0.0508, + "num_input_tokens_seen": 3452960, + "step": 16365 + }, + { + "epoch": 1.800880088008801, + "grad_norm": 0.15661102533340454, + "learning_rate": 4.501925192519252e-05, + "loss": 0.1061, + "num_input_tokens_seen": 3453952, + "step": 16370 + }, + { + "epoch": 1.8014301430143014, + "grad_norm": 0.17583073675632477, + "learning_rate": 4.5033003300330035e-05, + "loss": 0.0514, + "num_input_tokens_seen": 3455008, + "step": 16375 + }, + { + "epoch": 1.801980198019802, + "grad_norm": 0.3727463185787201, + "learning_rate": 4.504675467546755e-05, + "loss": 0.1028, + "num_input_tokens_seen": 3456064, + "step": 16380 + }, + { + "epoch": 1.8025302530253025, + "grad_norm": 0.5848924517631531, + "learning_rate": 4.506050605060506e-05, + "loss": 0.07, + "num_input_tokens_seen": 3457120, + "step": 16385 + }, + { + "epoch": 1.803080308030803, + "grad_norm": 0.1703139692544937, + "learning_rate": 4.507425742574258e-05, + "loss": 0.1525, + "num_input_tokens_seen": 3458176, + "step": 16390 + }, + { + "epoch": 1.8036303630363038, + "grad_norm": 0.2340395152568817, + "learning_rate": 4.508800880088009e-05, + "loss": 0.0422, + "num_input_tokens_seen": 3459232, + "step": 16395 + }, + { + "epoch": 1.8041804180418042, + "grad_norm": 0.1798979640007019, + "learning_rate": 4.510176017601761e-05, + "loss": 0.0798, + "num_input_tokens_seen": 3460256, + "step": 16400 + }, + { + "epoch": 1.8047304730473046, + "grad_norm": 0.17273037135601044, + "learning_rate": 4.511551155115512e-05, + "loss": 0.0819, + "num_input_tokens_seen": 3461280, + "step": 16405 + }, + { + "epoch": 1.8052805280528053, + "grad_norm": 0.36698874831199646, + "learning_rate": 4.512926292629263e-05, + "loss": 0.0803, + "num_input_tokens_seen": 3462368, + "step": 16410 + }, + { + "epoch": 1.805830583058306, + "grad_norm": 0.22671784460544586, + "learning_rate": 4.514301430143015e-05, + "loss": 0.152, + "num_input_tokens_seen": 3463392, + "step": 16415 + }, + { + "epoch": 1.8063806380638063, + "grad_norm": 2.070012092590332, + "learning_rate": 4.515676567656766e-05, + "loss": 0.1999, + "num_input_tokens_seen": 3464448, + "step": 16420 + }, + { + "epoch": 1.806930693069307, + "grad_norm": 1.1894667148590088, + "learning_rate": 4.517051705170517e-05, + "loss": 0.1557, + "num_input_tokens_seen": 3465536, + "step": 16425 + }, + { + "epoch": 1.8074807480748074, + "grad_norm": 0.3309531509876251, + "learning_rate": 4.5184268426842683e-05, + "loss": 0.091, + "num_input_tokens_seen": 3466560, + "step": 16430 + }, + { + "epoch": 1.808030803080308, + "grad_norm": 0.1330578327178955, + "learning_rate": 4.51980198019802e-05, + "loss": 0.1766, + "num_input_tokens_seen": 3467584, + "step": 16435 + }, + { + "epoch": 1.8085808580858087, + "grad_norm": 1.2780808210372925, + "learning_rate": 4.5211771177117713e-05, + "loss": 0.098, + "num_input_tokens_seen": 3468640, + "step": 16440 + }, + { + "epoch": 1.8091309130913091, + "grad_norm": 1.580976128578186, + "learning_rate": 4.5225522552255225e-05, + "loss": 0.0567, + "num_input_tokens_seen": 3469728, + "step": 16445 + }, + { + "epoch": 1.8096809680968096, + "grad_norm": 0.3746146261692047, + "learning_rate": 4.523927392739274e-05, + "loss": 0.0286, + "num_input_tokens_seen": 3470784, + "step": 16450 + }, + { + "epoch": 1.8102310231023102, + "grad_norm": 1.0711805820465088, + "learning_rate": 4.5253025302530255e-05, + "loss": 0.1027, + "num_input_tokens_seen": 3471744, + "step": 16455 + }, + { + "epoch": 1.8107810781078109, + "grad_norm": 0.4155312180519104, + "learning_rate": 4.5266776677667774e-05, + "loss": 0.0449, + "num_input_tokens_seen": 3472800, + "step": 16460 + }, + { + "epoch": 1.8113311331133113, + "grad_norm": 0.6188485026359558, + "learning_rate": 4.5280528052805285e-05, + "loss": 0.1005, + "num_input_tokens_seen": 3473824, + "step": 16465 + }, + { + "epoch": 1.811881188118812, + "grad_norm": 0.30739569664001465, + "learning_rate": 4.52942794279428e-05, + "loss": 0.0688, + "num_input_tokens_seen": 3474944, + "step": 16470 + }, + { + "epoch": 1.8124312431243124, + "grad_norm": 1.3715087175369263, + "learning_rate": 4.530803080308031e-05, + "loss": 0.1261, + "num_input_tokens_seen": 3476000, + "step": 16475 + }, + { + "epoch": 1.812981298129813, + "grad_norm": 0.3955473303794861, + "learning_rate": 4.532178217821783e-05, + "loss": 0.0425, + "num_input_tokens_seen": 3477088, + "step": 16480 + }, + { + "epoch": 1.8135313531353137, + "grad_norm": 0.5350502133369446, + "learning_rate": 4.533553355335534e-05, + "loss": 0.1014, + "num_input_tokens_seen": 3478144, + "step": 16485 + }, + { + "epoch": 1.814081408140814, + "grad_norm": 0.32141897082328796, + "learning_rate": 4.534928492849285e-05, + "loss": 0.0365, + "num_input_tokens_seen": 3479200, + "step": 16490 + }, + { + "epoch": 1.8146314631463145, + "grad_norm": 1.2519384622573853, + "learning_rate": 4.536303630363036e-05, + "loss": 0.1021, + "num_input_tokens_seen": 3480224, + "step": 16495 + }, + { + "epoch": 1.8151815181518152, + "grad_norm": 0.2944902777671814, + "learning_rate": 4.537678767876788e-05, + "loss": 0.0775, + "num_input_tokens_seen": 3481216, + "step": 16500 + }, + { + "epoch": 1.8157315731573158, + "grad_norm": 0.17079904675483704, + "learning_rate": 4.539053905390539e-05, + "loss": 0.0578, + "num_input_tokens_seen": 3482304, + "step": 16505 + }, + { + "epoch": 1.8162816281628162, + "grad_norm": 0.27368929982185364, + "learning_rate": 4.5404290429042904e-05, + "loss": 0.1302, + "num_input_tokens_seen": 3483328, + "step": 16510 + }, + { + "epoch": 1.8168316831683167, + "grad_norm": 0.14544299244880676, + "learning_rate": 4.541804180418042e-05, + "loss": 0.0254, + "num_input_tokens_seen": 3484352, + "step": 16515 + }, + { + "epoch": 1.8173817381738173, + "grad_norm": 0.3361780643463135, + "learning_rate": 4.5431793179317934e-05, + "loss": 0.0348, + "num_input_tokens_seen": 3485504, + "step": 16520 + }, + { + "epoch": 1.817931793179318, + "grad_norm": 0.7095352411270142, + "learning_rate": 4.544554455445545e-05, + "loss": 0.0792, + "num_input_tokens_seen": 3486528, + "step": 16525 + }, + { + "epoch": 1.8184818481848186, + "grad_norm": 0.4367629289627075, + "learning_rate": 4.545929592959296e-05, + "loss": 0.0669, + "num_input_tokens_seen": 3487552, + "step": 16530 + }, + { + "epoch": 1.819031903190319, + "grad_norm": 1.425900936126709, + "learning_rate": 4.5473047304730476e-05, + "loss": 0.1934, + "num_input_tokens_seen": 3488672, + "step": 16535 + }, + { + "epoch": 1.8195819581958195, + "grad_norm": 0.29838085174560547, + "learning_rate": 4.548679867986799e-05, + "loss": 0.0937, + "num_input_tokens_seen": 3489728, + "step": 16540 + }, + { + "epoch": 1.8201320132013201, + "grad_norm": 0.14071448147296906, + "learning_rate": 4.5500550055005506e-05, + "loss": 0.0448, + "num_input_tokens_seen": 3490720, + "step": 16545 + }, + { + "epoch": 1.8206820682068208, + "grad_norm": 0.36263507604599, + "learning_rate": 4.551430143014301e-05, + "loss": 0.0607, + "num_input_tokens_seen": 3491744, + "step": 16550 + }, + { + "epoch": 1.8212321232123212, + "grad_norm": 0.06833754479885101, + "learning_rate": 4.552805280528053e-05, + "loss": 0.0948, + "num_input_tokens_seen": 3492832, + "step": 16555 + }, + { + "epoch": 1.8217821782178216, + "grad_norm": 0.83236163854599, + "learning_rate": 4.554180418041805e-05, + "loss": 0.138, + "num_input_tokens_seen": 3493920, + "step": 16560 + }, + { + "epoch": 1.8223322332233223, + "grad_norm": 0.7465716004371643, + "learning_rate": 4.555555555555556e-05, + "loss": 0.1292, + "num_input_tokens_seen": 3494944, + "step": 16565 + }, + { + "epoch": 1.822882288228823, + "grad_norm": 0.5651321411132812, + "learning_rate": 4.556930693069307e-05, + "loss": 0.1289, + "num_input_tokens_seen": 3495968, + "step": 16570 + }, + { + "epoch": 1.8234323432343236, + "grad_norm": 0.2552390992641449, + "learning_rate": 4.558305830583058e-05, + "loss": 0.14, + "num_input_tokens_seen": 3497024, + "step": 16575 + }, + { + "epoch": 1.823982398239824, + "grad_norm": 0.1661858707666397, + "learning_rate": 4.55968096809681e-05, + "loss": 0.0555, + "num_input_tokens_seen": 3498048, + "step": 16580 + }, + { + "epoch": 1.8245324532453244, + "grad_norm": 0.2055174708366394, + "learning_rate": 4.561056105610561e-05, + "loss": 0.1084, + "num_input_tokens_seen": 3499040, + "step": 16585 + }, + { + "epoch": 1.825082508250825, + "grad_norm": 0.5261797904968262, + "learning_rate": 4.5624312431243124e-05, + "loss": 0.0394, + "num_input_tokens_seen": 3500128, + "step": 16590 + }, + { + "epoch": 1.8256325632563257, + "grad_norm": 0.4851122498512268, + "learning_rate": 4.5638063806380636e-05, + "loss": 0.1399, + "num_input_tokens_seen": 3501152, + "step": 16595 + }, + { + "epoch": 1.8261826182618262, + "grad_norm": 0.8062751889228821, + "learning_rate": 4.5651815181518154e-05, + "loss": 0.0786, + "num_input_tokens_seen": 3502176, + "step": 16600 + }, + { + "epoch": 1.8267326732673266, + "grad_norm": 1.1153497695922852, + "learning_rate": 4.566556655665567e-05, + "loss": 0.1009, + "num_input_tokens_seen": 3503232, + "step": 16605 + }, + { + "epoch": 1.8272827282728272, + "grad_norm": 0.8614901900291443, + "learning_rate": 4.567931793179318e-05, + "loss": 0.0889, + "num_input_tokens_seen": 3504320, + "step": 16610 + }, + { + "epoch": 1.8278327832783279, + "grad_norm": 0.590184211730957, + "learning_rate": 4.5693069306930696e-05, + "loss": 0.0889, + "num_input_tokens_seen": 3505376, + "step": 16615 + }, + { + "epoch": 1.8283828382838285, + "grad_norm": 0.5304601788520813, + "learning_rate": 4.570682068206821e-05, + "loss": 0.0564, + "num_input_tokens_seen": 3506464, + "step": 16620 + }, + { + "epoch": 1.828932893289329, + "grad_norm": 0.23403026163578033, + "learning_rate": 4.5720572057205726e-05, + "loss": 0.0478, + "num_input_tokens_seen": 3507520, + "step": 16625 + }, + { + "epoch": 1.8294829482948294, + "grad_norm": 0.14686265587806702, + "learning_rate": 4.573432343234324e-05, + "loss": 0.0997, + "num_input_tokens_seen": 3508576, + "step": 16630 + }, + { + "epoch": 1.83003300330033, + "grad_norm": 1.3739064931869507, + "learning_rate": 4.574807480748075e-05, + "loss": 0.1426, + "num_input_tokens_seen": 3509600, + "step": 16635 + }, + { + "epoch": 1.8305830583058307, + "grad_norm": 0.2864699363708496, + "learning_rate": 4.576182618261826e-05, + "loss": 0.0775, + "num_input_tokens_seen": 3510592, + "step": 16640 + }, + { + "epoch": 1.831133113311331, + "grad_norm": 0.5018652677536011, + "learning_rate": 4.577557755775578e-05, + "loss": 0.1279, + "num_input_tokens_seen": 3511648, + "step": 16645 + }, + { + "epoch": 1.8316831683168315, + "grad_norm": 1.1564775705337524, + "learning_rate": 4.578932893289329e-05, + "loss": 0.0788, + "num_input_tokens_seen": 3512704, + "step": 16650 + }, + { + "epoch": 1.8322332233223322, + "grad_norm": 0.59308260679245, + "learning_rate": 4.58030803080308e-05, + "loss": 0.0585, + "num_input_tokens_seen": 3513792, + "step": 16655 + }, + { + "epoch": 1.8327832783278328, + "grad_norm": 0.5531859397888184, + "learning_rate": 4.581683168316832e-05, + "loss": 0.1416, + "num_input_tokens_seen": 3514816, + "step": 16660 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.28503257036209106, + "learning_rate": 4.583058305830583e-05, + "loss": 0.0531, + "num_input_tokens_seen": 3515840, + "step": 16665 + }, + { + "epoch": 1.833883388338834, + "grad_norm": 0.31277844309806824, + "learning_rate": 4.5844334433443345e-05, + "loss": 0.2752, + "num_input_tokens_seen": 3516832, + "step": 16670 + }, + { + "epoch": 1.8344334433443343, + "grad_norm": 1.216021180152893, + "learning_rate": 4.5858085808580856e-05, + "loss": 0.1556, + "num_input_tokens_seen": 3517856, + "step": 16675 + }, + { + "epoch": 1.834983498349835, + "grad_norm": 0.40330663323402405, + "learning_rate": 4.5871837183718375e-05, + "loss": 0.168, + "num_input_tokens_seen": 3518944, + "step": 16680 + }, + { + "epoch": 1.8355335533553356, + "grad_norm": 1.2416313886642456, + "learning_rate": 4.5885588558855886e-05, + "loss": 0.1359, + "num_input_tokens_seen": 3520064, + "step": 16685 + }, + { + "epoch": 1.836083608360836, + "grad_norm": 0.7538466453552246, + "learning_rate": 4.5899339933993405e-05, + "loss": 0.0646, + "num_input_tokens_seen": 3521184, + "step": 16690 + }, + { + "epoch": 1.8366336633663365, + "grad_norm": 0.7600897550582886, + "learning_rate": 4.5913091309130916e-05, + "loss": 0.0473, + "num_input_tokens_seen": 3522304, + "step": 16695 + }, + { + "epoch": 1.8371837183718371, + "grad_norm": 1.6868754625320435, + "learning_rate": 4.592684268426843e-05, + "loss": 0.0616, + "num_input_tokens_seen": 3523360, + "step": 16700 + }, + { + "epoch": 1.8377337733773378, + "grad_norm": 0.2745113968849182, + "learning_rate": 4.5940594059405946e-05, + "loss": 0.0585, + "num_input_tokens_seen": 3524416, + "step": 16705 + }, + { + "epoch": 1.8382838283828384, + "grad_norm": 0.2763141095638275, + "learning_rate": 4.595434543454346e-05, + "loss": 0.0293, + "num_input_tokens_seen": 3525472, + "step": 16710 + }, + { + "epoch": 1.8388338833883389, + "grad_norm": 0.06629329919815063, + "learning_rate": 4.596809680968097e-05, + "loss": 0.1046, + "num_input_tokens_seen": 3526528, + "step": 16715 + }, + { + "epoch": 1.8393839383938393, + "grad_norm": 0.8099681735038757, + "learning_rate": 4.598184818481848e-05, + "loss": 0.0968, + "num_input_tokens_seen": 3527584, + "step": 16720 + }, + { + "epoch": 1.83993399339934, + "grad_norm": 0.04706259444355965, + "learning_rate": 4.5995599559956e-05, + "loss": 0.0818, + "num_input_tokens_seen": 3528672, + "step": 16725 + }, + { + "epoch": 1.8404840484048406, + "grad_norm": 0.1768088936805725, + "learning_rate": 4.600935093509351e-05, + "loss": 0.0775, + "num_input_tokens_seen": 3529760, + "step": 16730 + }, + { + "epoch": 1.841034103410341, + "grad_norm": 0.2822157144546509, + "learning_rate": 4.602310231023102e-05, + "loss": 0.041, + "num_input_tokens_seen": 3530848, + "step": 16735 + }, + { + "epoch": 1.8415841584158414, + "grad_norm": 0.3265603184700012, + "learning_rate": 4.603685368536854e-05, + "loss": 0.0612, + "num_input_tokens_seen": 3531872, + "step": 16740 + }, + { + "epoch": 1.842134213421342, + "grad_norm": 0.9379614591598511, + "learning_rate": 4.605060506050605e-05, + "loss": 0.1, + "num_input_tokens_seen": 3532864, + "step": 16745 + }, + { + "epoch": 1.8426842684268427, + "grad_norm": 0.3272651433944702, + "learning_rate": 4.606435643564357e-05, + "loss": 0.0841, + "num_input_tokens_seen": 3533856, + "step": 16750 + }, + { + "epoch": 1.8432343234323434, + "grad_norm": 0.41559121012687683, + "learning_rate": 4.6078107810781077e-05, + "loss": 0.0753, + "num_input_tokens_seen": 3534944, + "step": 16755 + }, + { + "epoch": 1.8437843784378438, + "grad_norm": 0.20985351502895355, + "learning_rate": 4.6091859185918595e-05, + "loss": 0.0618, + "num_input_tokens_seen": 3536000, + "step": 16760 + }, + { + "epoch": 1.8443344334433442, + "grad_norm": 0.16929227113723755, + "learning_rate": 4.610561056105611e-05, + "loss": 0.0599, + "num_input_tokens_seen": 3537088, + "step": 16765 + }, + { + "epoch": 1.844884488448845, + "grad_norm": 1.1653488874435425, + "learning_rate": 4.6119361936193625e-05, + "loss": 0.109, + "num_input_tokens_seen": 3538144, + "step": 16770 + }, + { + "epoch": 1.8454345434543455, + "grad_norm": 0.11592928320169449, + "learning_rate": 4.613311331133113e-05, + "loss": 0.157, + "num_input_tokens_seen": 3539232, + "step": 16775 + }, + { + "epoch": 1.845984598459846, + "grad_norm": 1.9786242246627808, + "learning_rate": 4.614686468646865e-05, + "loss": 0.1457, + "num_input_tokens_seen": 3540384, + "step": 16780 + }, + { + "epoch": 1.8465346534653464, + "grad_norm": 0.06053705886006355, + "learning_rate": 4.616061606160617e-05, + "loss": 0.1929, + "num_input_tokens_seen": 3541472, + "step": 16785 + }, + { + "epoch": 1.847084708470847, + "grad_norm": 0.821667492389679, + "learning_rate": 4.617436743674368e-05, + "loss": 0.1067, + "num_input_tokens_seen": 3542528, + "step": 16790 + }, + { + "epoch": 1.8476347634763477, + "grad_norm": 0.06660270690917969, + "learning_rate": 4.618811881188119e-05, + "loss": 0.0871, + "num_input_tokens_seen": 3543552, + "step": 16795 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.19730190932750702, + "learning_rate": 4.62018701870187e-05, + "loss": 0.1924, + "num_input_tokens_seen": 3544608, + "step": 16800 + }, + { + "epoch": 1.8487348734873488, + "grad_norm": 0.38283100724220276, + "learning_rate": 4.621562156215622e-05, + "loss": 0.0772, + "num_input_tokens_seen": 3545632, + "step": 16805 + }, + { + "epoch": 1.8492849284928492, + "grad_norm": 0.46122467517852783, + "learning_rate": 4.622937293729373e-05, + "loss": 0.0855, + "num_input_tokens_seen": 3546720, + "step": 16810 + }, + { + "epoch": 1.8498349834983498, + "grad_norm": 0.291898250579834, + "learning_rate": 4.6243124312431244e-05, + "loss": 0.0289, + "num_input_tokens_seen": 3547808, + "step": 16815 + }, + { + "epoch": 1.8503850385038505, + "grad_norm": 0.7357883453369141, + "learning_rate": 4.6256875687568755e-05, + "loss": 0.0716, + "num_input_tokens_seen": 3548832, + "step": 16820 + }, + { + "epoch": 1.850935093509351, + "grad_norm": 0.856930673122406, + "learning_rate": 4.6270627062706274e-05, + "loss": 0.0836, + "num_input_tokens_seen": 3549888, + "step": 16825 + }, + { + "epoch": 1.8514851485148514, + "grad_norm": 0.9982084631919861, + "learning_rate": 4.6284378437843785e-05, + "loss": 0.1574, + "num_input_tokens_seen": 3550944, + "step": 16830 + }, + { + "epoch": 1.852035203520352, + "grad_norm": 0.9096155762672424, + "learning_rate": 4.62981298129813e-05, + "loss": 0.1218, + "num_input_tokens_seen": 3551968, + "step": 16835 + }, + { + "epoch": 1.8525852585258527, + "grad_norm": 0.34212300181388855, + "learning_rate": 4.6311881188118815e-05, + "loss": 0.1272, + "num_input_tokens_seen": 3552960, + "step": 16840 + }, + { + "epoch": 1.8531353135313533, + "grad_norm": 1.0076608657836914, + "learning_rate": 4.632563256325633e-05, + "loss": 0.0692, + "num_input_tokens_seen": 3554016, + "step": 16845 + }, + { + "epoch": 1.8536853685368537, + "grad_norm": 0.9006211161613464, + "learning_rate": 4.6339383938393845e-05, + "loss": 0.1358, + "num_input_tokens_seen": 3555072, + "step": 16850 + }, + { + "epoch": 1.8542354235423542, + "grad_norm": 0.33907386660575867, + "learning_rate": 4.635313531353136e-05, + "loss": 0.1095, + "num_input_tokens_seen": 3556160, + "step": 16855 + }, + { + "epoch": 1.8547854785478548, + "grad_norm": 0.448793888092041, + "learning_rate": 4.636688668866887e-05, + "loss": 0.1386, + "num_input_tokens_seen": 3557152, + "step": 16860 + }, + { + "epoch": 1.8553355335533555, + "grad_norm": 0.07740559428930283, + "learning_rate": 4.638063806380638e-05, + "loss": 0.1139, + "num_input_tokens_seen": 3558208, + "step": 16865 + }, + { + "epoch": 1.8558855885588559, + "grad_norm": 0.055487196892499924, + "learning_rate": 4.63943894389439e-05, + "loss": 0.1469, + "num_input_tokens_seen": 3559232, + "step": 16870 + }, + { + "epoch": 1.8564356435643563, + "grad_norm": 0.07138486206531525, + "learning_rate": 4.640814081408141e-05, + "loss": 0.0869, + "num_input_tokens_seen": 3560320, + "step": 16875 + }, + { + "epoch": 1.856985698569857, + "grad_norm": 0.1886049509048462, + "learning_rate": 4.642189218921892e-05, + "loss": 0.0666, + "num_input_tokens_seen": 3561440, + "step": 16880 + }, + { + "epoch": 1.8575357535753576, + "grad_norm": 1.6912750005722046, + "learning_rate": 4.643564356435644e-05, + "loss": 0.1696, + "num_input_tokens_seen": 3562496, + "step": 16885 + }, + { + "epoch": 1.858085808580858, + "grad_norm": 1.2504651546478271, + "learning_rate": 4.644939493949395e-05, + "loss": 0.1386, + "num_input_tokens_seen": 3563648, + "step": 16890 + }, + { + "epoch": 1.8586358635863587, + "grad_norm": 0.10874874889850616, + "learning_rate": 4.6463146314631464e-05, + "loss": 0.0749, + "num_input_tokens_seen": 3564640, + "step": 16895 + }, + { + "epoch": 1.859185918591859, + "grad_norm": 0.209328293800354, + "learning_rate": 4.6476897689768976e-05, + "loss": 0.0371, + "num_input_tokens_seen": 3565696, + "step": 16900 + }, + { + "epoch": 1.8597359735973598, + "grad_norm": 1.284653663635254, + "learning_rate": 4.6490649064906494e-05, + "loss": 0.07, + "num_input_tokens_seen": 3566720, + "step": 16905 + }, + { + "epoch": 1.8602860286028604, + "grad_norm": 0.5431052446365356, + "learning_rate": 4.6504400440044006e-05, + "loss": 0.1506, + "num_input_tokens_seen": 3567776, + "step": 16910 + }, + { + "epoch": 1.8608360836083608, + "grad_norm": 0.17094603180885315, + "learning_rate": 4.6518151815181524e-05, + "loss": 0.0943, + "num_input_tokens_seen": 3568864, + "step": 16915 + }, + { + "epoch": 1.8613861386138613, + "grad_norm": 0.9566942453384399, + "learning_rate": 4.653190319031903e-05, + "loss": 0.2302, + "num_input_tokens_seen": 3569952, + "step": 16920 + }, + { + "epoch": 1.861936193619362, + "grad_norm": 0.5103092789649963, + "learning_rate": 4.654565456545655e-05, + "loss": 0.073, + "num_input_tokens_seen": 3571072, + "step": 16925 + }, + { + "epoch": 1.8624862486248626, + "grad_norm": 0.47209295630455017, + "learning_rate": 4.6559405940594066e-05, + "loss": 0.0551, + "num_input_tokens_seen": 3572224, + "step": 16930 + }, + { + "epoch": 1.863036303630363, + "grad_norm": 0.6851508617401123, + "learning_rate": 4.657315731573158e-05, + "loss": 0.0938, + "num_input_tokens_seen": 3573248, + "step": 16935 + }, + { + "epoch": 1.8635863586358636, + "grad_norm": 0.8673045635223389, + "learning_rate": 4.658690869086909e-05, + "loss": 0.0862, + "num_input_tokens_seen": 3574272, + "step": 16940 + }, + { + "epoch": 1.864136413641364, + "grad_norm": 1.172633171081543, + "learning_rate": 4.66006600660066e-05, + "loss": 0.1688, + "num_input_tokens_seen": 3575296, + "step": 16945 + }, + { + "epoch": 1.8646864686468647, + "grad_norm": 0.2862122654914856, + "learning_rate": 4.661441144114412e-05, + "loss": 0.0806, + "num_input_tokens_seen": 3576384, + "step": 16950 + }, + { + "epoch": 1.8652365236523654, + "grad_norm": 0.43755123019218445, + "learning_rate": 4.662816281628163e-05, + "loss": 0.101, + "num_input_tokens_seen": 3577408, + "step": 16955 + }, + { + "epoch": 1.8657865786578658, + "grad_norm": 1.056722640991211, + "learning_rate": 4.664191419141914e-05, + "loss": 0.0766, + "num_input_tokens_seen": 3578400, + "step": 16960 + }, + { + "epoch": 1.8663366336633662, + "grad_norm": 0.15931057929992676, + "learning_rate": 4.6655665566556654e-05, + "loss": 0.0878, + "num_input_tokens_seen": 3579424, + "step": 16965 + }, + { + "epoch": 1.8668866886688669, + "grad_norm": 0.6005264520645142, + "learning_rate": 4.666941694169417e-05, + "loss": 0.1034, + "num_input_tokens_seen": 3580480, + "step": 16970 + }, + { + "epoch": 1.8674367436743675, + "grad_norm": 0.3894113302230835, + "learning_rate": 4.668316831683169e-05, + "loss": 0.1171, + "num_input_tokens_seen": 3581536, + "step": 16975 + }, + { + "epoch": 1.867986798679868, + "grad_norm": 0.42939063906669617, + "learning_rate": 4.6696919691969196e-05, + "loss": 0.1127, + "num_input_tokens_seen": 3582656, + "step": 16980 + }, + { + "epoch": 1.8685368536853684, + "grad_norm": 0.15417449176311493, + "learning_rate": 4.6710671067106714e-05, + "loss": 0.0812, + "num_input_tokens_seen": 3583712, + "step": 16985 + }, + { + "epoch": 1.869086908690869, + "grad_norm": 0.22385463118553162, + "learning_rate": 4.6724422442244226e-05, + "loss": 0.0456, + "num_input_tokens_seen": 3584800, + "step": 16990 + }, + { + "epoch": 1.8696369636963697, + "grad_norm": 0.29770606756210327, + "learning_rate": 4.6738173817381744e-05, + "loss": 0.0887, + "num_input_tokens_seen": 3585856, + "step": 16995 + }, + { + "epoch": 1.8701870187018703, + "grad_norm": 0.26125267148017883, + "learning_rate": 4.675192519251925e-05, + "loss": 0.0327, + "num_input_tokens_seen": 3586880, + "step": 17000 + }, + { + "epoch": 1.8707370737073707, + "grad_norm": 1.7381306886672974, + "learning_rate": 4.676567656765677e-05, + "loss": 0.0862, + "num_input_tokens_seen": 3587936, + "step": 17005 + }, + { + "epoch": 1.8712871287128712, + "grad_norm": 0.14800751209259033, + "learning_rate": 4.677942794279428e-05, + "loss": 0.1017, + "num_input_tokens_seen": 3588928, + "step": 17010 + }, + { + "epoch": 1.8718371837183718, + "grad_norm": 0.31784263253211975, + "learning_rate": 4.67931793179318e-05, + "loss": 0.0548, + "num_input_tokens_seen": 3589984, + "step": 17015 + }, + { + "epoch": 1.8723872387238725, + "grad_norm": 0.03580499812960625, + "learning_rate": 4.680693069306931e-05, + "loss": 0.06, + "num_input_tokens_seen": 3591040, + "step": 17020 + }, + { + "epoch": 1.872937293729373, + "grad_norm": 1.349998950958252, + "learning_rate": 4.682068206820682e-05, + "loss": 0.1385, + "num_input_tokens_seen": 3592032, + "step": 17025 + }, + { + "epoch": 1.8734873487348733, + "grad_norm": 0.7828575372695923, + "learning_rate": 4.683443344334434e-05, + "loss": 0.0356, + "num_input_tokens_seen": 3593024, + "step": 17030 + }, + { + "epoch": 1.874037403740374, + "grad_norm": 0.3936466574668884, + "learning_rate": 4.684818481848185e-05, + "loss": 0.0582, + "num_input_tokens_seen": 3594016, + "step": 17035 + }, + { + "epoch": 1.8745874587458746, + "grad_norm": 0.5693790912628174, + "learning_rate": 4.686193619361936e-05, + "loss": 0.0624, + "num_input_tokens_seen": 3595104, + "step": 17040 + }, + { + "epoch": 1.8751375137513753, + "grad_norm": 0.5791136622428894, + "learning_rate": 4.6875687568756875e-05, + "loss": 0.1257, + "num_input_tokens_seen": 3596224, + "step": 17045 + }, + { + "epoch": 1.8756875687568757, + "grad_norm": 0.13217973709106445, + "learning_rate": 4.688943894389439e-05, + "loss": 0.0596, + "num_input_tokens_seen": 3597248, + "step": 17050 + }, + { + "epoch": 1.8762376237623761, + "grad_norm": 0.19262506067752838, + "learning_rate": 4.6903190319031905e-05, + "loss": 0.0415, + "num_input_tokens_seen": 3598208, + "step": 17055 + }, + { + "epoch": 1.8767876787678768, + "grad_norm": 0.13882213830947876, + "learning_rate": 4.6916941694169416e-05, + "loss": 0.0541, + "num_input_tokens_seen": 3599232, + "step": 17060 + }, + { + "epoch": 1.8773377337733774, + "grad_norm": 1.3966736793518066, + "learning_rate": 4.693069306930693e-05, + "loss": 0.2048, + "num_input_tokens_seen": 3600320, + "step": 17065 + }, + { + "epoch": 1.8778877887788779, + "grad_norm": 0.848358154296875, + "learning_rate": 4.6944444444444446e-05, + "loss": 0.1467, + "num_input_tokens_seen": 3601408, + "step": 17070 + }, + { + "epoch": 1.8784378437843783, + "grad_norm": 0.07419785112142563, + "learning_rate": 4.6958195819581965e-05, + "loss": 0.054, + "num_input_tokens_seen": 3602464, + "step": 17075 + }, + { + "epoch": 1.878987898789879, + "grad_norm": 0.22220775485038757, + "learning_rate": 4.6971947194719476e-05, + "loss": 0.0868, + "num_input_tokens_seen": 3603520, + "step": 17080 + }, + { + "epoch": 1.8795379537953796, + "grad_norm": 0.13281811773777008, + "learning_rate": 4.698569856985699e-05, + "loss": 0.0367, + "num_input_tokens_seen": 3604608, + "step": 17085 + }, + { + "epoch": 1.8800880088008802, + "grad_norm": 0.22594058513641357, + "learning_rate": 4.69994499449945e-05, + "loss": 0.0951, + "num_input_tokens_seen": 3605664, + "step": 17090 + }, + { + "epoch": 1.8806380638063807, + "grad_norm": 0.8667323589324951, + "learning_rate": 4.701320132013202e-05, + "loss": 0.1219, + "num_input_tokens_seen": 3606688, + "step": 17095 + }, + { + "epoch": 1.881188118811881, + "grad_norm": 0.09721270948648453, + "learning_rate": 4.702695269526953e-05, + "loss": 0.0727, + "num_input_tokens_seen": 3607744, + "step": 17100 + }, + { + "epoch": 1.8817381738173817, + "grad_norm": 0.5712056756019592, + "learning_rate": 4.704070407040704e-05, + "loss": 0.0723, + "num_input_tokens_seen": 3608832, + "step": 17105 + }, + { + "epoch": 1.8822882288228824, + "grad_norm": 0.13360227644443512, + "learning_rate": 4.705445544554455e-05, + "loss": 0.0359, + "num_input_tokens_seen": 3609824, + "step": 17110 + }, + { + "epoch": 1.8828382838283828, + "grad_norm": 0.6432952880859375, + "learning_rate": 4.706820682068207e-05, + "loss": 0.0789, + "num_input_tokens_seen": 3610880, + "step": 17115 + }, + { + "epoch": 1.8833883388338832, + "grad_norm": 0.011204712092876434, + "learning_rate": 4.708195819581958e-05, + "loss": 0.1334, + "num_input_tokens_seen": 3611904, + "step": 17120 + }, + { + "epoch": 1.8839383938393839, + "grad_norm": 0.509645402431488, + "learning_rate": 4.7095709570957095e-05, + "loss": 0.0741, + "num_input_tokens_seen": 3612896, + "step": 17125 + }, + { + "epoch": 1.8844884488448845, + "grad_norm": 0.10417225956916809, + "learning_rate": 4.710946094609461e-05, + "loss": 0.0501, + "num_input_tokens_seen": 3613856, + "step": 17130 + }, + { + "epoch": 1.8850385038503852, + "grad_norm": 0.1333257257938385, + "learning_rate": 4.7123212321232125e-05, + "loss": 0.0548, + "num_input_tokens_seen": 3614944, + "step": 17135 + }, + { + "epoch": 1.8855885588558856, + "grad_norm": 2.0475103855133057, + "learning_rate": 4.7136963696369643e-05, + "loss": 0.0483, + "num_input_tokens_seen": 3616032, + "step": 17140 + }, + { + "epoch": 1.886138613861386, + "grad_norm": 0.2557998597621918, + "learning_rate": 4.715071507150715e-05, + "loss": 0.0593, + "num_input_tokens_seen": 3617056, + "step": 17145 + }, + { + "epoch": 1.8866886688668867, + "grad_norm": 0.03145528957247734, + "learning_rate": 4.716446644664467e-05, + "loss": 0.1037, + "num_input_tokens_seen": 3618176, + "step": 17150 + }, + { + "epoch": 1.8872387238723873, + "grad_norm": 0.448354572057724, + "learning_rate": 4.717821782178218e-05, + "loss": 0.1836, + "num_input_tokens_seen": 3619232, + "step": 17155 + }, + { + "epoch": 1.8877887788778878, + "grad_norm": 0.5629302859306335, + "learning_rate": 4.71919691969197e-05, + "loss": 0.0979, + "num_input_tokens_seen": 3620320, + "step": 17160 + }, + { + "epoch": 1.8883388338833882, + "grad_norm": 0.6601421236991882, + "learning_rate": 4.720572057205721e-05, + "loss": 0.1946, + "num_input_tokens_seen": 3621376, + "step": 17165 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.42871469259262085, + "learning_rate": 4.721947194719472e-05, + "loss": 0.0571, + "num_input_tokens_seen": 3622368, + "step": 17170 + }, + { + "epoch": 1.8894389438943895, + "grad_norm": 0.22277456521987915, + "learning_rate": 4.723322332233224e-05, + "loss": 0.1728, + "num_input_tokens_seen": 3623392, + "step": 17175 + }, + { + "epoch": 1.8899889988998901, + "grad_norm": 0.09708639979362488, + "learning_rate": 4.724697469746975e-05, + "loss": 0.0292, + "num_input_tokens_seen": 3624416, + "step": 17180 + }, + { + "epoch": 1.8905390539053906, + "grad_norm": 0.17113740742206573, + "learning_rate": 4.726072607260726e-05, + "loss": 0.101, + "num_input_tokens_seen": 3625440, + "step": 17185 + }, + { + "epoch": 1.891089108910891, + "grad_norm": 0.36442145705223083, + "learning_rate": 4.7274477447744774e-05, + "loss": 0.1163, + "num_input_tokens_seen": 3626496, + "step": 17190 + }, + { + "epoch": 1.8916391639163916, + "grad_norm": 0.9289804697036743, + "learning_rate": 4.728822882288229e-05, + "loss": 0.2327, + "num_input_tokens_seen": 3627456, + "step": 17195 + }, + { + "epoch": 1.8921892189218923, + "grad_norm": 0.12288135290145874, + "learning_rate": 4.7301980198019804e-05, + "loss": 0.088, + "num_input_tokens_seen": 3628480, + "step": 17200 + }, + { + "epoch": 1.8927392739273927, + "grad_norm": 0.9256926774978638, + "learning_rate": 4.7315731573157315e-05, + "loss": 0.1211, + "num_input_tokens_seen": 3629504, + "step": 17205 + }, + { + "epoch": 1.8932893289328931, + "grad_norm": 0.7910517454147339, + "learning_rate": 4.7329482948294834e-05, + "loss": 0.0697, + "num_input_tokens_seen": 3630592, + "step": 17210 + }, + { + "epoch": 1.8938393839383938, + "grad_norm": 0.8891146779060364, + "learning_rate": 4.7343234323432345e-05, + "loss": 0.0688, + "num_input_tokens_seen": 3631712, + "step": 17215 + }, + { + "epoch": 1.8943894389438944, + "grad_norm": 0.6199049353599548, + "learning_rate": 4.7356985698569864e-05, + "loss": 0.1153, + "num_input_tokens_seen": 3632768, + "step": 17220 + }, + { + "epoch": 1.894939493949395, + "grad_norm": 0.633186399936676, + "learning_rate": 4.737073707370737e-05, + "loss": 0.1135, + "num_input_tokens_seen": 3633856, + "step": 17225 + }, + { + "epoch": 1.8954895489548955, + "grad_norm": 0.1042611226439476, + "learning_rate": 4.738448844884489e-05, + "loss": 0.0581, + "num_input_tokens_seen": 3634944, + "step": 17230 + }, + { + "epoch": 1.896039603960396, + "grad_norm": 0.43741971254348755, + "learning_rate": 4.73982398239824e-05, + "loss": 0.0371, + "num_input_tokens_seen": 3635968, + "step": 17235 + }, + { + "epoch": 1.8965896589658966, + "grad_norm": 0.46627429127693176, + "learning_rate": 4.741199119911992e-05, + "loss": 0.0792, + "num_input_tokens_seen": 3637056, + "step": 17240 + }, + { + "epoch": 1.8971397139713972, + "grad_norm": 0.03366702049970627, + "learning_rate": 4.742574257425743e-05, + "loss": 0.0602, + "num_input_tokens_seen": 3638080, + "step": 17245 + }, + { + "epoch": 1.8976897689768977, + "grad_norm": 0.20403838157653809, + "learning_rate": 4.743949394939494e-05, + "loss": 0.0776, + "num_input_tokens_seen": 3639104, + "step": 17250 + }, + { + "epoch": 1.898239823982398, + "grad_norm": 0.11732736229896545, + "learning_rate": 4.745324532453246e-05, + "loss": 0.0903, + "num_input_tokens_seen": 3640256, + "step": 17255 + }, + { + "epoch": 1.8987898789878987, + "grad_norm": 0.5436583757400513, + "learning_rate": 4.746699669966997e-05, + "loss": 0.0645, + "num_input_tokens_seen": 3641280, + "step": 17260 + }, + { + "epoch": 1.8993399339933994, + "grad_norm": 0.170260950922966, + "learning_rate": 4.748074807480748e-05, + "loss": 0.0342, + "num_input_tokens_seen": 3642336, + "step": 17265 + }, + { + "epoch": 1.8998899889989, + "grad_norm": 0.13316543400287628, + "learning_rate": 4.7494499449944994e-05, + "loss": 0.1466, + "num_input_tokens_seen": 3643392, + "step": 17270 + }, + { + "epoch": 1.9004400440044005, + "grad_norm": 1.5641123056411743, + "learning_rate": 4.750825082508251e-05, + "loss": 0.185, + "num_input_tokens_seen": 3644480, + "step": 17275 + }, + { + "epoch": 1.900990099009901, + "grad_norm": 0.19077374041080475, + "learning_rate": 4.7522002200220024e-05, + "loss": 0.1149, + "num_input_tokens_seen": 3645568, + "step": 17280 + }, + { + "epoch": 1.9015401540154016, + "grad_norm": 0.1485022008419037, + "learning_rate": 4.7535753575357536e-05, + "loss": 0.0728, + "num_input_tokens_seen": 3646592, + "step": 17285 + }, + { + "epoch": 1.9020902090209022, + "grad_norm": 1.2859290838241577, + "learning_rate": 4.754950495049505e-05, + "loss": 0.0885, + "num_input_tokens_seen": 3647616, + "step": 17290 + }, + { + "epoch": 1.9026402640264026, + "grad_norm": 0.38247352838516235, + "learning_rate": 4.7563256325632566e-05, + "loss": 0.0483, + "num_input_tokens_seen": 3648704, + "step": 17295 + }, + { + "epoch": 1.903190319031903, + "grad_norm": 2.1812124252319336, + "learning_rate": 4.757700770077008e-05, + "loss": 0.1263, + "num_input_tokens_seen": 3649792, + "step": 17300 + }, + { + "epoch": 1.9037403740374037, + "grad_norm": 1.6218100786209106, + "learning_rate": 4.7590759075907596e-05, + "loss": 0.1289, + "num_input_tokens_seen": 3650880, + "step": 17305 + }, + { + "epoch": 1.9042904290429044, + "grad_norm": 0.38366273045539856, + "learning_rate": 4.760451045104511e-05, + "loss": 0.0786, + "num_input_tokens_seen": 3652000, + "step": 17310 + }, + { + "epoch": 1.904840484048405, + "grad_norm": 0.08928404748439789, + "learning_rate": 4.761826182618262e-05, + "loss": 0.0298, + "num_input_tokens_seen": 3653024, + "step": 17315 + }, + { + "epoch": 1.9053905390539054, + "grad_norm": 0.050970762968063354, + "learning_rate": 4.763201320132014e-05, + "loss": 0.0533, + "num_input_tokens_seen": 3654144, + "step": 17320 + }, + { + "epoch": 1.9059405940594059, + "grad_norm": 0.15902993083000183, + "learning_rate": 4.764576457645765e-05, + "loss": 0.0402, + "num_input_tokens_seen": 3655168, + "step": 17325 + }, + { + "epoch": 1.9064906490649065, + "grad_norm": 1.0631752014160156, + "learning_rate": 4.765951595159516e-05, + "loss": 0.1809, + "num_input_tokens_seen": 3656160, + "step": 17330 + }, + { + "epoch": 1.9070407040704072, + "grad_norm": 0.06889283657073975, + "learning_rate": 4.767326732673267e-05, + "loss": 0.0435, + "num_input_tokens_seen": 3657312, + "step": 17335 + }, + { + "epoch": 1.9075907590759076, + "grad_norm": 0.2994094491004944, + "learning_rate": 4.768701870187019e-05, + "loss": 0.0507, + "num_input_tokens_seen": 3658400, + "step": 17340 + }, + { + "epoch": 1.908140814081408, + "grad_norm": 0.09029698371887207, + "learning_rate": 4.77007700770077e-05, + "loss": 0.044, + "num_input_tokens_seen": 3659424, + "step": 17345 + }, + { + "epoch": 1.9086908690869087, + "grad_norm": 0.11123943328857422, + "learning_rate": 4.7714521452145214e-05, + "loss": 0.1308, + "num_input_tokens_seen": 3660544, + "step": 17350 + }, + { + "epoch": 1.9092409240924093, + "grad_norm": 0.11969320476055145, + "learning_rate": 4.772827282728273e-05, + "loss": 0.0812, + "num_input_tokens_seen": 3661600, + "step": 17355 + }, + { + "epoch": 1.9097909790979097, + "grad_norm": 1.6778771877288818, + "learning_rate": 4.7742024202420244e-05, + "loss": 0.2425, + "num_input_tokens_seen": 3662656, + "step": 17360 + }, + { + "epoch": 1.9103410341034104, + "grad_norm": 0.058931633830070496, + "learning_rate": 4.775577557755776e-05, + "loss": 0.016, + "num_input_tokens_seen": 3663712, + "step": 17365 + }, + { + "epoch": 1.9108910891089108, + "grad_norm": 0.3883708715438843, + "learning_rate": 4.776952695269527e-05, + "loss": 0.1259, + "num_input_tokens_seen": 3664736, + "step": 17370 + }, + { + "epoch": 1.9114411441144115, + "grad_norm": 0.9670299291610718, + "learning_rate": 4.7783278327832786e-05, + "loss": 0.1592, + "num_input_tokens_seen": 3665888, + "step": 17375 + }, + { + "epoch": 1.911991199119912, + "grad_norm": 1.4506014585494995, + "learning_rate": 4.77970297029703e-05, + "loss": 0.1296, + "num_input_tokens_seen": 3666912, + "step": 17380 + }, + { + "epoch": 1.9125412541254125, + "grad_norm": 0.9291673898696899, + "learning_rate": 4.7810781078107816e-05, + "loss": 0.1453, + "num_input_tokens_seen": 3667936, + "step": 17385 + }, + { + "epoch": 1.913091309130913, + "grad_norm": 0.10546759516000748, + "learning_rate": 4.782453245324532e-05, + "loss": 0.1544, + "num_input_tokens_seen": 3669056, + "step": 17390 + }, + { + "epoch": 1.9136413641364136, + "grad_norm": 1.3890610933303833, + "learning_rate": 4.783828382838284e-05, + "loss": 0.065, + "num_input_tokens_seen": 3670112, + "step": 17395 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.18734312057495117, + "learning_rate": 4.785203520352036e-05, + "loss": 0.1262, + "num_input_tokens_seen": 3671104, + "step": 17400 + }, + { + "epoch": 1.9147414741474147, + "grad_norm": 1.581282377243042, + "learning_rate": 4.786578657865787e-05, + "loss": 0.0887, + "num_input_tokens_seen": 3672128, + "step": 17405 + }, + { + "epoch": 1.9152915291529153, + "grad_norm": 0.927034318447113, + "learning_rate": 4.787953795379538e-05, + "loss": 0.0737, + "num_input_tokens_seen": 3673248, + "step": 17410 + }, + { + "epoch": 1.9158415841584158, + "grad_norm": 1.4238756895065308, + "learning_rate": 4.789328932893289e-05, + "loss": 0.1805, + "num_input_tokens_seen": 3674240, + "step": 17415 + }, + { + "epoch": 1.9163916391639164, + "grad_norm": 1.4572259187698364, + "learning_rate": 4.790704070407041e-05, + "loss": 0.1965, + "num_input_tokens_seen": 3675392, + "step": 17420 + }, + { + "epoch": 1.916941694169417, + "grad_norm": 1.0301783084869385, + "learning_rate": 4.792079207920792e-05, + "loss": 0.1204, + "num_input_tokens_seen": 3676544, + "step": 17425 + }, + { + "epoch": 1.9174917491749175, + "grad_norm": 0.23910367488861084, + "learning_rate": 4.7934543454345435e-05, + "loss": 0.0447, + "num_input_tokens_seen": 3677568, + "step": 17430 + }, + { + "epoch": 1.918041804180418, + "grad_norm": 1.225404977798462, + "learning_rate": 4.7948294829482946e-05, + "loss": 0.1089, + "num_input_tokens_seen": 3678592, + "step": 17435 + }, + { + "epoch": 1.9185918591859186, + "grad_norm": 0.10460031032562256, + "learning_rate": 4.7962046204620465e-05, + "loss": 0.0662, + "num_input_tokens_seen": 3679616, + "step": 17440 + }, + { + "epoch": 1.9191419141914192, + "grad_norm": 0.4198507070541382, + "learning_rate": 4.797579757975798e-05, + "loss": 0.1697, + "num_input_tokens_seen": 3680576, + "step": 17445 + }, + { + "epoch": 1.9196919691969196, + "grad_norm": 0.04164275527000427, + "learning_rate": 4.798954895489549e-05, + "loss": 0.1097, + "num_input_tokens_seen": 3681632, + "step": 17450 + }, + { + "epoch": 1.9202420242024203, + "grad_norm": 0.44500118494033813, + "learning_rate": 4.8003300330033006e-05, + "loss": 0.0402, + "num_input_tokens_seen": 3682656, + "step": 17455 + }, + { + "epoch": 1.9207920792079207, + "grad_norm": 0.8607485890388489, + "learning_rate": 4.801705170517052e-05, + "loss": 0.1321, + "num_input_tokens_seen": 3683680, + "step": 17460 + }, + { + "epoch": 1.9213421342134214, + "grad_norm": 0.15364418923854828, + "learning_rate": 4.8030803080308037e-05, + "loss": 0.0641, + "num_input_tokens_seen": 3684704, + "step": 17465 + }, + { + "epoch": 1.921892189218922, + "grad_norm": 1.5727465152740479, + "learning_rate": 4.804455445544555e-05, + "loss": 0.1078, + "num_input_tokens_seen": 3685728, + "step": 17470 + }, + { + "epoch": 1.9224422442244224, + "grad_norm": 0.07653044164180756, + "learning_rate": 4.805830583058306e-05, + "loss": 0.0705, + "num_input_tokens_seen": 3686784, + "step": 17475 + }, + { + "epoch": 1.9229922992299229, + "grad_norm": 0.19839109480381012, + "learning_rate": 4.807205720572057e-05, + "loss": 0.0761, + "num_input_tokens_seen": 3687808, + "step": 17480 + }, + { + "epoch": 1.9235423542354235, + "grad_norm": 1.1732439994812012, + "learning_rate": 4.808580858085809e-05, + "loss": 0.088, + "num_input_tokens_seen": 3688832, + "step": 17485 + }, + { + "epoch": 1.9240924092409242, + "grad_norm": 0.9607388377189636, + "learning_rate": 4.80995599559956e-05, + "loss": 0.119, + "num_input_tokens_seen": 3689920, + "step": 17490 + }, + { + "epoch": 1.9246424642464246, + "grad_norm": 2.4696481227874756, + "learning_rate": 4.811331133113311e-05, + "loss": 0.1074, + "num_input_tokens_seen": 3691008, + "step": 17495 + }, + { + "epoch": 1.925192519251925, + "grad_norm": 0.25884512066841125, + "learning_rate": 4.812706270627063e-05, + "loss": 0.1044, + "num_input_tokens_seen": 3692096, + "step": 17500 + }, + { + "epoch": 1.9257425742574257, + "grad_norm": 0.5908961296081543, + "learning_rate": 4.814081408140814e-05, + "loss": 0.0598, + "num_input_tokens_seen": 3693184, + "step": 17505 + }, + { + "epoch": 1.9262926292629263, + "grad_norm": 0.6696294546127319, + "learning_rate": 4.8154565456545655e-05, + "loss": 0.1525, + "num_input_tokens_seen": 3694240, + "step": 17510 + }, + { + "epoch": 1.926842684268427, + "grad_norm": 0.15069323778152466, + "learning_rate": 4.816831683168317e-05, + "loss": 0.1191, + "num_input_tokens_seen": 3695328, + "step": 17515 + }, + { + "epoch": 1.9273927392739274, + "grad_norm": 0.17046238481998444, + "learning_rate": 4.8182068206820685e-05, + "loss": 0.0399, + "num_input_tokens_seen": 3696448, + "step": 17520 + }, + { + "epoch": 1.9279427942794278, + "grad_norm": 0.2264419049024582, + "learning_rate": 4.81958195819582e-05, + "loss": 0.0732, + "num_input_tokens_seen": 3697536, + "step": 17525 + }, + { + "epoch": 1.9284928492849285, + "grad_norm": 0.17750504612922668, + "learning_rate": 4.8209570957095715e-05, + "loss": 0.0293, + "num_input_tokens_seen": 3698624, + "step": 17530 + }, + { + "epoch": 1.9290429042904291, + "grad_norm": 1.7745254039764404, + "learning_rate": 4.822332233223323e-05, + "loss": 0.1319, + "num_input_tokens_seen": 3699648, + "step": 17535 + }, + { + "epoch": 1.9295929592959296, + "grad_norm": 0.8522242307662964, + "learning_rate": 4.823707370737074e-05, + "loss": 0.0651, + "num_input_tokens_seen": 3700768, + "step": 17540 + }, + { + "epoch": 1.93014301430143, + "grad_norm": 0.7970889806747437, + "learning_rate": 4.825082508250826e-05, + "loss": 0.1382, + "num_input_tokens_seen": 3701824, + "step": 17545 + }, + { + "epoch": 1.9306930693069306, + "grad_norm": 0.08153097331523895, + "learning_rate": 4.826457645764577e-05, + "loss": 0.0431, + "num_input_tokens_seen": 3702848, + "step": 17550 + }, + { + "epoch": 1.9312431243124313, + "grad_norm": 0.08369014412164688, + "learning_rate": 4.827832783278328e-05, + "loss": 0.0243, + "num_input_tokens_seen": 3703968, + "step": 17555 + }, + { + "epoch": 1.931793179317932, + "grad_norm": 0.1921272575855255, + "learning_rate": 4.829207920792079e-05, + "loss": 0.0789, + "num_input_tokens_seen": 3704992, + "step": 17560 + }, + { + "epoch": 1.9323432343234324, + "grad_norm": 0.28724774718284607, + "learning_rate": 4.830583058305831e-05, + "loss": 0.0784, + "num_input_tokens_seen": 3706080, + "step": 17565 + }, + { + "epoch": 1.9328932893289328, + "grad_norm": 0.26378458738327026, + "learning_rate": 4.831958195819582e-05, + "loss": 0.0944, + "num_input_tokens_seen": 3707168, + "step": 17570 + }, + { + "epoch": 1.9334433443344334, + "grad_norm": 1.303293228149414, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.0991, + "num_input_tokens_seen": 3708224, + "step": 17575 + }, + { + "epoch": 1.933993399339934, + "grad_norm": 0.48946166038513184, + "learning_rate": 4.8347084708470845e-05, + "loss": 0.0855, + "num_input_tokens_seen": 3709312, + "step": 17580 + }, + { + "epoch": 1.9345434543454345, + "grad_norm": 0.08224671334028244, + "learning_rate": 4.8360836083608364e-05, + "loss": 0.095, + "num_input_tokens_seen": 3710336, + "step": 17585 + }, + { + "epoch": 1.935093509350935, + "grad_norm": 0.21117304265499115, + "learning_rate": 4.837458745874588e-05, + "loss": 0.0445, + "num_input_tokens_seen": 3711392, + "step": 17590 + }, + { + "epoch": 1.9356435643564356, + "grad_norm": 0.3402740955352783, + "learning_rate": 4.838833883388339e-05, + "loss": 0.1182, + "num_input_tokens_seen": 3712448, + "step": 17595 + }, + { + "epoch": 1.9361936193619362, + "grad_norm": 0.3789803087711334, + "learning_rate": 4.8402090209020905e-05, + "loss": 0.0817, + "num_input_tokens_seen": 3713504, + "step": 17600 + }, + { + "epoch": 1.9367436743674369, + "grad_norm": 0.47148650884628296, + "learning_rate": 4.841584158415842e-05, + "loss": 0.1057, + "num_input_tokens_seen": 3714496, + "step": 17605 + }, + { + "epoch": 1.9372937293729373, + "grad_norm": 0.6551855802536011, + "learning_rate": 4.8429592959295936e-05, + "loss": 0.0639, + "num_input_tokens_seen": 3715520, + "step": 17610 + }, + { + "epoch": 1.9378437843784377, + "grad_norm": 0.9348134398460388, + "learning_rate": 4.844334433443344e-05, + "loss": 0.0517, + "num_input_tokens_seen": 3716544, + "step": 17615 + }, + { + "epoch": 1.9383938393839384, + "grad_norm": 0.11823836714029312, + "learning_rate": 4.845709570957096e-05, + "loss": 0.0855, + "num_input_tokens_seen": 3717568, + "step": 17620 + }, + { + "epoch": 1.938943894389439, + "grad_norm": 0.5255739688873291, + "learning_rate": 4.847084708470847e-05, + "loss": 0.0678, + "num_input_tokens_seen": 3718528, + "step": 17625 + }, + { + "epoch": 1.9394939493949395, + "grad_norm": 0.11971700936555862, + "learning_rate": 4.848459845984599e-05, + "loss": 0.1321, + "num_input_tokens_seen": 3719552, + "step": 17630 + }, + { + "epoch": 1.94004400440044, + "grad_norm": 1.0162495374679565, + "learning_rate": 4.84983498349835e-05, + "loss": 0.2151, + "num_input_tokens_seen": 3720640, + "step": 17635 + }, + { + "epoch": 1.9405940594059405, + "grad_norm": 0.5895476937294006, + "learning_rate": 4.851210121012101e-05, + "loss": 0.069, + "num_input_tokens_seen": 3721760, + "step": 17640 + }, + { + "epoch": 1.9411441144114412, + "grad_norm": 0.42229020595550537, + "learning_rate": 4.852585258525853e-05, + "loss": 0.0468, + "num_input_tokens_seen": 3722784, + "step": 17645 + }, + { + "epoch": 1.9416941694169418, + "grad_norm": 0.11477363109588623, + "learning_rate": 4.853960396039604e-05, + "loss": 0.0337, + "num_input_tokens_seen": 3723840, + "step": 17650 + }, + { + "epoch": 1.9422442244224423, + "grad_norm": 0.2758713960647583, + "learning_rate": 4.8553355335533554e-05, + "loss": 0.0825, + "num_input_tokens_seen": 3724896, + "step": 17655 + }, + { + "epoch": 1.9427942794279427, + "grad_norm": 0.4801672399044037, + "learning_rate": 4.8567106710671066e-05, + "loss": 0.1311, + "num_input_tokens_seen": 3726048, + "step": 17660 + }, + { + "epoch": 1.9433443344334433, + "grad_norm": 0.17833755910396576, + "learning_rate": 4.8580858085808584e-05, + "loss": 0.2119, + "num_input_tokens_seen": 3727104, + "step": 17665 + }, + { + "epoch": 1.943894389438944, + "grad_norm": 0.3242426812648773, + "learning_rate": 4.8594609460946096e-05, + "loss": 0.059, + "num_input_tokens_seen": 3728128, + "step": 17670 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 1.3078879117965698, + "learning_rate": 4.860836083608361e-05, + "loss": 0.2099, + "num_input_tokens_seen": 3729184, + "step": 17675 + }, + { + "epoch": 1.9449944994499448, + "grad_norm": 0.5721650719642639, + "learning_rate": 4.8622112211221126e-05, + "loss": 0.08, + "num_input_tokens_seen": 3730208, + "step": 17680 + }, + { + "epoch": 1.9455445544554455, + "grad_norm": 0.48449239134788513, + "learning_rate": 4.863586358635864e-05, + "loss": 0.0797, + "num_input_tokens_seen": 3731296, + "step": 17685 + }, + { + "epoch": 1.9460946094609461, + "grad_norm": 0.9595338106155396, + "learning_rate": 4.8649614961496156e-05, + "loss": 0.1519, + "num_input_tokens_seen": 3732416, + "step": 17690 + }, + { + "epoch": 1.9466446644664468, + "grad_norm": 0.09729936718940735, + "learning_rate": 4.866336633663367e-05, + "loss": 0.0315, + "num_input_tokens_seen": 3733472, + "step": 17695 + }, + { + "epoch": 1.9471947194719472, + "grad_norm": 0.19694235920906067, + "learning_rate": 4.867711771177118e-05, + "loss": 0.0988, + "num_input_tokens_seen": 3734560, + "step": 17700 + }, + { + "epoch": 1.9477447744774476, + "grad_norm": 0.21312382817268372, + "learning_rate": 4.869086908690869e-05, + "loss": 0.0586, + "num_input_tokens_seen": 3735616, + "step": 17705 + }, + { + "epoch": 1.9482948294829483, + "grad_norm": 1.1964730024337769, + "learning_rate": 4.870462046204621e-05, + "loss": 0.0915, + "num_input_tokens_seen": 3736672, + "step": 17710 + }, + { + "epoch": 1.948844884488449, + "grad_norm": 0.341438353061676, + "learning_rate": 4.871837183718372e-05, + "loss": 0.0955, + "num_input_tokens_seen": 3737696, + "step": 17715 + }, + { + "epoch": 1.9493949394939494, + "grad_norm": 0.7755430340766907, + "learning_rate": 4.873212321232123e-05, + "loss": 0.0839, + "num_input_tokens_seen": 3738688, + "step": 17720 + }, + { + "epoch": 1.9499449944994498, + "grad_norm": 0.0955599993467331, + "learning_rate": 4.874587458745875e-05, + "loss": 0.0521, + "num_input_tokens_seen": 3739744, + "step": 17725 + }, + { + "epoch": 1.9504950495049505, + "grad_norm": 0.6373463273048401, + "learning_rate": 4.875962596259626e-05, + "loss": 0.0901, + "num_input_tokens_seen": 3740768, + "step": 17730 + }, + { + "epoch": 1.951045104510451, + "grad_norm": 0.06823354214429855, + "learning_rate": 4.8773377337733774e-05, + "loss": 0.0264, + "num_input_tokens_seen": 3741824, + "step": 17735 + }, + { + "epoch": 1.9515951595159517, + "grad_norm": 0.7531183958053589, + "learning_rate": 4.8787128712871286e-05, + "loss": 0.1112, + "num_input_tokens_seen": 3742880, + "step": 17740 + }, + { + "epoch": 1.9521452145214522, + "grad_norm": 0.09750822186470032, + "learning_rate": 4.8800880088008804e-05, + "loss": 0.0362, + "num_input_tokens_seen": 3744032, + "step": 17745 + }, + { + "epoch": 1.9526952695269526, + "grad_norm": 0.23551678657531738, + "learning_rate": 4.8814631463146316e-05, + "loss": 0.1021, + "num_input_tokens_seen": 3745120, + "step": 17750 + }, + { + "epoch": 1.9532453245324533, + "grad_norm": 0.3191271722316742, + "learning_rate": 4.8828382838283835e-05, + "loss": 0.0682, + "num_input_tokens_seen": 3746144, + "step": 17755 + }, + { + "epoch": 1.953795379537954, + "grad_norm": 0.044004395604133606, + "learning_rate": 4.884213421342134e-05, + "loss": 0.0575, + "num_input_tokens_seen": 3747168, + "step": 17760 + }, + { + "epoch": 1.9543454345434543, + "grad_norm": 0.6264644861221313, + "learning_rate": 4.885588558855886e-05, + "loss": 0.0575, + "num_input_tokens_seen": 3748192, + "step": 17765 + }, + { + "epoch": 1.9548954895489548, + "grad_norm": 0.3675900399684906, + "learning_rate": 4.8869636963696376e-05, + "loss": 0.1063, + "num_input_tokens_seen": 3749312, + "step": 17770 + }, + { + "epoch": 1.9554455445544554, + "grad_norm": 0.9855292439460754, + "learning_rate": 4.888338833883389e-05, + "loss": 0.094, + "num_input_tokens_seen": 3750336, + "step": 17775 + }, + { + "epoch": 1.955995599559956, + "grad_norm": 0.15905886888504028, + "learning_rate": 4.88971397139714e-05, + "loss": 0.1145, + "num_input_tokens_seen": 3751360, + "step": 17780 + }, + { + "epoch": 1.9565456545654567, + "grad_norm": 0.181302011013031, + "learning_rate": 4.891089108910891e-05, + "loss": 0.0943, + "num_input_tokens_seen": 3752448, + "step": 17785 + }, + { + "epoch": 1.9570957095709571, + "grad_norm": 0.35482755303382874, + "learning_rate": 4.892464246424643e-05, + "loss": 0.0838, + "num_input_tokens_seen": 3753504, + "step": 17790 + }, + { + "epoch": 1.9576457645764576, + "grad_norm": 0.3454083502292633, + "learning_rate": 4.893839383938394e-05, + "loss": 0.0898, + "num_input_tokens_seen": 3754592, + "step": 17795 + }, + { + "epoch": 1.9581958195819582, + "grad_norm": 0.5445109605789185, + "learning_rate": 4.895214521452145e-05, + "loss": 0.1046, + "num_input_tokens_seen": 3755680, + "step": 17800 + }, + { + "epoch": 1.9587458745874589, + "grad_norm": 0.26405876874923706, + "learning_rate": 4.8965896589658965e-05, + "loss": 0.1057, + "num_input_tokens_seen": 3756672, + "step": 17805 + }, + { + "epoch": 1.9592959295929593, + "grad_norm": 1.2194164991378784, + "learning_rate": 4.897964796479648e-05, + "loss": 0.1527, + "num_input_tokens_seen": 3757696, + "step": 17810 + }, + { + "epoch": 1.9598459845984597, + "grad_norm": 0.6803944706916809, + "learning_rate": 4.8993399339933995e-05, + "loss": 0.074, + "num_input_tokens_seen": 3758720, + "step": 17815 + }, + { + "epoch": 1.9603960396039604, + "grad_norm": 0.9943951964378357, + "learning_rate": 4.9007150715071506e-05, + "loss": 0.0496, + "num_input_tokens_seen": 3759808, + "step": 17820 + }, + { + "epoch": 1.960946094609461, + "grad_norm": 1.1679892539978027, + "learning_rate": 4.9020902090209025e-05, + "loss": 0.1746, + "num_input_tokens_seen": 3760864, + "step": 17825 + }, + { + "epoch": 1.9614961496149617, + "grad_norm": 0.15947210788726807, + "learning_rate": 4.9034653465346536e-05, + "loss": 0.0585, + "num_input_tokens_seen": 3761856, + "step": 17830 + }, + { + "epoch": 1.962046204620462, + "grad_norm": 1.624751091003418, + "learning_rate": 4.9048404840484055e-05, + "loss": 0.1473, + "num_input_tokens_seen": 3762912, + "step": 17835 + }, + { + "epoch": 1.9625962596259625, + "grad_norm": 2.064112663269043, + "learning_rate": 4.906215621562156e-05, + "loss": 0.1265, + "num_input_tokens_seen": 3764000, + "step": 17840 + }, + { + "epoch": 1.9631463146314632, + "grad_norm": 1.52320396900177, + "learning_rate": 4.907590759075908e-05, + "loss": 0.2273, + "num_input_tokens_seen": 3765024, + "step": 17845 + }, + { + "epoch": 1.9636963696369638, + "grad_norm": 0.7020909190177917, + "learning_rate": 4.908965896589659e-05, + "loss": 0.0794, + "num_input_tokens_seen": 3766048, + "step": 17850 + }, + { + "epoch": 1.9642464246424642, + "grad_norm": 0.624345600605011, + "learning_rate": 4.910341034103411e-05, + "loss": 0.0986, + "num_input_tokens_seen": 3767072, + "step": 17855 + }, + { + "epoch": 1.9647964796479647, + "grad_norm": 0.5706025958061218, + "learning_rate": 4.911716171617162e-05, + "loss": 0.1494, + "num_input_tokens_seen": 3768160, + "step": 17860 + }, + { + "epoch": 1.9653465346534653, + "grad_norm": 1.4838758707046509, + "learning_rate": 4.913091309130913e-05, + "loss": 0.1123, + "num_input_tokens_seen": 3769216, + "step": 17865 + }, + { + "epoch": 1.965896589658966, + "grad_norm": 1.569298267364502, + "learning_rate": 4.914466446644665e-05, + "loss": 0.1239, + "num_input_tokens_seen": 3770304, + "step": 17870 + }, + { + "epoch": 1.9664466446644664, + "grad_norm": 1.634426236152649, + "learning_rate": 4.915841584158416e-05, + "loss": 0.1947, + "num_input_tokens_seen": 3771392, + "step": 17875 + }, + { + "epoch": 1.966996699669967, + "grad_norm": 1.9120997190475464, + "learning_rate": 4.917216721672167e-05, + "loss": 0.0784, + "num_input_tokens_seen": 3772384, + "step": 17880 + }, + { + "epoch": 1.9675467546754675, + "grad_norm": 0.018184006214141846, + "learning_rate": 4.9185918591859185e-05, + "loss": 0.06, + "num_input_tokens_seen": 3773472, + "step": 17885 + }, + { + "epoch": 1.9680968096809681, + "grad_norm": 0.4019433856010437, + "learning_rate": 4.9199669966996703e-05, + "loss": 0.0918, + "num_input_tokens_seen": 3774528, + "step": 17890 + }, + { + "epoch": 1.9686468646864688, + "grad_norm": 0.33268702030181885, + "learning_rate": 4.9213421342134215e-05, + "loss": 0.0524, + "num_input_tokens_seen": 3775584, + "step": 17895 + }, + { + "epoch": 1.9691969196919692, + "grad_norm": 0.36010706424713135, + "learning_rate": 4.922717271727173e-05, + "loss": 0.1063, + "num_input_tokens_seen": 3776672, + "step": 17900 + }, + { + "epoch": 1.9697469746974696, + "grad_norm": 0.3628518581390381, + "learning_rate": 4.924092409240924e-05, + "loss": 0.0689, + "num_input_tokens_seen": 3777696, + "step": 17905 + }, + { + "epoch": 1.9702970297029703, + "grad_norm": 0.13352766633033752, + "learning_rate": 4.925467546754676e-05, + "loss": 0.0709, + "num_input_tokens_seen": 3778752, + "step": 17910 + }, + { + "epoch": 1.970847084708471, + "grad_norm": 0.609768807888031, + "learning_rate": 4.9268426842684275e-05, + "loss": 0.0681, + "num_input_tokens_seen": 3779808, + "step": 17915 + }, + { + "epoch": 1.9713971397139713, + "grad_norm": 0.23082499206066132, + "learning_rate": 4.928217821782179e-05, + "loss": 0.0457, + "num_input_tokens_seen": 3780832, + "step": 17920 + }, + { + "epoch": 1.971947194719472, + "grad_norm": 0.9489403963088989, + "learning_rate": 4.92959295929593e-05, + "loss": 0.0888, + "num_input_tokens_seen": 3781856, + "step": 17925 + }, + { + "epoch": 1.9724972497249724, + "grad_norm": 2.32858943939209, + "learning_rate": 4.930968096809681e-05, + "loss": 0.1828, + "num_input_tokens_seen": 3782976, + "step": 17930 + }, + { + "epoch": 1.973047304730473, + "grad_norm": 0.3201535940170288, + "learning_rate": 4.932343234323433e-05, + "loss": 0.1982, + "num_input_tokens_seen": 3784000, + "step": 17935 + }, + { + "epoch": 1.9735973597359737, + "grad_norm": 1.0599374771118164, + "learning_rate": 4.933718371837184e-05, + "loss": 0.0656, + "num_input_tokens_seen": 3785056, + "step": 17940 + }, + { + "epoch": 1.9741474147414741, + "grad_norm": 0.5032879710197449, + "learning_rate": 4.935093509350935e-05, + "loss": 0.0822, + "num_input_tokens_seen": 3786080, + "step": 17945 + }, + { + "epoch": 1.9746974697469746, + "grad_norm": 0.05115422233939171, + "learning_rate": 4.9364686468646864e-05, + "loss": 0.1856, + "num_input_tokens_seen": 3787136, + "step": 17950 + }, + { + "epoch": 1.9752475247524752, + "grad_norm": 0.11148287355899811, + "learning_rate": 4.937843784378438e-05, + "loss": 0.0259, + "num_input_tokens_seen": 3788224, + "step": 17955 + }, + { + "epoch": 1.9757975797579759, + "grad_norm": 0.3856600522994995, + "learning_rate": 4.9392189218921894e-05, + "loss": 0.0562, + "num_input_tokens_seen": 3789376, + "step": 17960 + }, + { + "epoch": 1.9763476347634763, + "grad_norm": 0.08948864787817001, + "learning_rate": 4.9405940594059405e-05, + "loss": 0.1526, + "num_input_tokens_seen": 3790432, + "step": 17965 + }, + { + "epoch": 1.976897689768977, + "grad_norm": 0.1377209573984146, + "learning_rate": 4.9419691969196924e-05, + "loss": 0.0501, + "num_input_tokens_seen": 3791520, + "step": 17970 + }, + { + "epoch": 1.9774477447744774, + "grad_norm": 0.514190673828125, + "learning_rate": 4.9433443344334435e-05, + "loss": 0.0279, + "num_input_tokens_seen": 3792608, + "step": 17975 + }, + { + "epoch": 1.977997799779978, + "grad_norm": 0.04099754989147186, + "learning_rate": 4.9447194719471954e-05, + "loss": 0.1686, + "num_input_tokens_seen": 3793600, + "step": 17980 + }, + { + "epoch": 1.9785478547854787, + "grad_norm": 1.7330788373947144, + "learning_rate": 4.946094609460946e-05, + "loss": 0.1036, + "num_input_tokens_seen": 3794592, + "step": 17985 + }, + { + "epoch": 1.979097909790979, + "grad_norm": 1.1211724281311035, + "learning_rate": 4.947469746974698e-05, + "loss": 0.1287, + "num_input_tokens_seen": 3795584, + "step": 17990 + }, + { + "epoch": 1.9796479647964795, + "grad_norm": 0.23173996806144714, + "learning_rate": 4.948844884488449e-05, + "loss": 0.0835, + "num_input_tokens_seen": 3796640, + "step": 17995 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.6735087633132935, + "learning_rate": 4.950220022002201e-05, + "loss": 0.1158, + "num_input_tokens_seen": 3797728, + "step": 18000 + }, + { + "epoch": 1.9807480748074808, + "grad_norm": 0.5281583666801453, + "learning_rate": 4.951595159515952e-05, + "loss": 0.0624, + "num_input_tokens_seen": 3798784, + "step": 18005 + }, + { + "epoch": 1.9812981298129813, + "grad_norm": 0.4262010157108307, + "learning_rate": 4.952970297029703e-05, + "loss": 0.0603, + "num_input_tokens_seen": 3799808, + "step": 18010 + }, + { + "epoch": 1.9818481848184817, + "grad_norm": 0.8156878352165222, + "learning_rate": 4.954345434543455e-05, + "loss": 0.0522, + "num_input_tokens_seen": 3800928, + "step": 18015 + }, + { + "epoch": 1.9823982398239823, + "grad_norm": 0.3908501863479614, + "learning_rate": 4.955720572057206e-05, + "loss": 0.0355, + "num_input_tokens_seen": 3802016, + "step": 18020 + }, + { + "epoch": 1.982948294829483, + "grad_norm": 0.4158523380756378, + "learning_rate": 4.957095709570957e-05, + "loss": 0.015, + "num_input_tokens_seen": 3803008, + "step": 18025 + }, + { + "epoch": 1.9834983498349836, + "grad_norm": 1.2787238359451294, + "learning_rate": 4.9584708470847084e-05, + "loss": 0.0917, + "num_input_tokens_seen": 3804096, + "step": 18030 + }, + { + "epoch": 1.984048404840484, + "grad_norm": 1.0397597551345825, + "learning_rate": 4.95984598459846e-05, + "loss": 0.1326, + "num_input_tokens_seen": 3805120, + "step": 18035 + }, + { + "epoch": 1.9845984598459845, + "grad_norm": 0.1847805678844452, + "learning_rate": 4.9612211221122114e-05, + "loss": 0.1255, + "num_input_tokens_seen": 3806176, + "step": 18040 + }, + { + "epoch": 1.9851485148514851, + "grad_norm": 0.6546884775161743, + "learning_rate": 4.9625962596259626e-05, + "loss": 0.0984, + "num_input_tokens_seen": 3807264, + "step": 18045 + }, + { + "epoch": 1.9856985698569858, + "grad_norm": 0.6040946841239929, + "learning_rate": 4.9639713971397144e-05, + "loss": 0.0992, + "num_input_tokens_seen": 3808320, + "step": 18050 + }, + { + "epoch": 1.9862486248624862, + "grad_norm": 0.4632338583469391, + "learning_rate": 4.9653465346534656e-05, + "loss": 0.07, + "num_input_tokens_seen": 3809408, + "step": 18055 + }, + { + "epoch": 1.9867986798679866, + "grad_norm": 0.19841426610946655, + "learning_rate": 4.9667216721672174e-05, + "loss": 0.1033, + "num_input_tokens_seen": 3810528, + "step": 18060 + }, + { + "epoch": 1.9873487348734873, + "grad_norm": 0.7997258901596069, + "learning_rate": 4.968096809680968e-05, + "loss": 0.1465, + "num_input_tokens_seen": 3811584, + "step": 18065 + }, + { + "epoch": 1.987898789878988, + "grad_norm": 0.7345101833343506, + "learning_rate": 4.96947194719472e-05, + "loss": 0.0334, + "num_input_tokens_seen": 3812704, + "step": 18070 + }, + { + "epoch": 1.9884488448844886, + "grad_norm": 0.3689400553703308, + "learning_rate": 4.970847084708471e-05, + "loss": 0.1226, + "num_input_tokens_seen": 3813760, + "step": 18075 + }, + { + "epoch": 1.988998899889989, + "grad_norm": 0.10480378568172455, + "learning_rate": 4.972222222222223e-05, + "loss": 0.0446, + "num_input_tokens_seen": 3814784, + "step": 18080 + }, + { + "epoch": 1.9895489548954894, + "grad_norm": 0.49263232946395874, + "learning_rate": 4.973597359735974e-05, + "loss": 0.1042, + "num_input_tokens_seen": 3815872, + "step": 18085 + }, + { + "epoch": 1.99009900990099, + "grad_norm": 1.1165801286697388, + "learning_rate": 4.974972497249725e-05, + "loss": 0.1725, + "num_input_tokens_seen": 3816992, + "step": 18090 + }, + { + "epoch": 1.9906490649064907, + "grad_norm": 0.3851116895675659, + "learning_rate": 4.976347634763476e-05, + "loss": 0.056, + "num_input_tokens_seen": 3818080, + "step": 18095 + }, + { + "epoch": 1.9911991199119912, + "grad_norm": 1.0648751258850098, + "learning_rate": 4.977722772277228e-05, + "loss": 0.1797, + "num_input_tokens_seen": 3819136, + "step": 18100 + }, + { + "epoch": 1.9917491749174916, + "grad_norm": 1.9671701192855835, + "learning_rate": 4.979097909790979e-05, + "loss": 0.0951, + "num_input_tokens_seen": 3820128, + "step": 18105 + }, + { + "epoch": 1.9922992299229922, + "grad_norm": 0.47584331035614014, + "learning_rate": 4.9804730473047304e-05, + "loss": 0.0437, + "num_input_tokens_seen": 3821152, + "step": 18110 + }, + { + "epoch": 1.992849284928493, + "grad_norm": 0.33125796914100647, + "learning_rate": 4.981848184818482e-05, + "loss": 0.1119, + "num_input_tokens_seen": 3822176, + "step": 18115 + }, + { + "epoch": 1.9933993399339935, + "grad_norm": 0.32750406861305237, + "learning_rate": 4.9832233223322334e-05, + "loss": 0.0682, + "num_input_tokens_seen": 3823232, + "step": 18120 + }, + { + "epoch": 1.993949394939494, + "grad_norm": 0.25567302107810974, + "learning_rate": 4.9845984598459846e-05, + "loss": 0.0335, + "num_input_tokens_seen": 3824288, + "step": 18125 + }, + { + "epoch": 1.9944994499449944, + "grad_norm": 0.08695580065250397, + "learning_rate": 4.985973597359736e-05, + "loss": 0.0458, + "num_input_tokens_seen": 3825280, + "step": 18130 + }, + { + "epoch": 1.995049504950495, + "grad_norm": 1.49562406539917, + "learning_rate": 4.9873487348734876e-05, + "loss": 0.1288, + "num_input_tokens_seen": 3826336, + "step": 18135 + }, + { + "epoch": 1.9955995599559957, + "grad_norm": 0.5661713480949402, + "learning_rate": 4.988723872387239e-05, + "loss": 0.057, + "num_input_tokens_seen": 3827360, + "step": 18140 + }, + { + "epoch": 1.9961496149614961, + "grad_norm": 1.5042314529418945, + "learning_rate": 4.9900990099009906e-05, + "loss": 0.1402, + "num_input_tokens_seen": 3828448, + "step": 18145 + }, + { + "epoch": 1.9966996699669965, + "grad_norm": 1.7912181615829468, + "learning_rate": 4.991474147414742e-05, + "loss": 0.1363, + "num_input_tokens_seen": 3829536, + "step": 18150 + }, + { + "epoch": 1.9972497249724972, + "grad_norm": 0.06226017698645592, + "learning_rate": 4.992849284928493e-05, + "loss": 0.0527, + "num_input_tokens_seen": 3830560, + "step": 18155 + }, + { + "epoch": 1.9977997799779978, + "grad_norm": 0.19414234161376953, + "learning_rate": 4.994224422442245e-05, + "loss": 0.0365, + "num_input_tokens_seen": 3831616, + "step": 18160 + }, + { + "epoch": 1.9983498349834985, + "grad_norm": 0.06592351198196411, + "learning_rate": 4.995599559955996e-05, + "loss": 0.1002, + "num_input_tokens_seen": 3832672, + "step": 18165 + }, + { + "epoch": 1.998899889988999, + "grad_norm": 0.1813223659992218, + "learning_rate": 4.996974697469747e-05, + "loss": 0.1342, + "num_input_tokens_seen": 3833792, + "step": 18170 + }, + { + "epoch": 1.9994499449944994, + "grad_norm": 0.23710490763187408, + "learning_rate": 4.998349834983498e-05, + "loss": 0.1057, + "num_input_tokens_seen": 3834912, + "step": 18175 + }, + { + "epoch": 2.0, + "grad_norm": 0.08999241143465042, + "learning_rate": 4.99972497249725e-05, + "loss": 0.0784, + "num_input_tokens_seen": 3835840, + "step": 18180 + }, + { + "epoch": 2.0, + "eval_loss": 0.09340229630470276, + "eval_runtime": 36.9827, + "eval_samples_per_second": 109.24, + "eval_steps_per_second": 27.31, + "num_input_tokens_seen": 3835840, + "step": 18180 + }, + { + "epoch": 2.0005500550055006, + "grad_norm": 0.7263160943984985, + "learning_rate": 4.999999992626784e-05, + "loss": 0.0653, + "num_input_tokens_seen": 3836896, + "step": 18185 + }, + { + "epoch": 2.0011001100110013, + "grad_norm": 0.039708636701107025, + "learning_rate": 4.999999962673094e-05, + "loss": 0.0641, + "num_input_tokens_seen": 3837984, + "step": 18190 + }, + { + "epoch": 2.0016501650165015, + "grad_norm": 0.2201630175113678, + "learning_rate": 4.9999999096781035e-05, + "loss": 0.0502, + "num_input_tokens_seen": 3839072, + "step": 18195 + }, + { + "epoch": 2.002200220022002, + "grad_norm": 0.06188293546438217, + "learning_rate": 4.999999833641813e-05, + "loss": 0.0714, + "num_input_tokens_seen": 3840096, + "step": 18200 + }, + { + "epoch": 2.002750275027503, + "grad_norm": 1.1785234212875366, + "learning_rate": 4.9999997345642244e-05, + "loss": 0.1867, + "num_input_tokens_seen": 3841056, + "step": 18205 + }, + { + "epoch": 2.0033003300330035, + "grad_norm": 1.291121482849121, + "learning_rate": 4.999999612445338e-05, + "loss": 0.0723, + "num_input_tokens_seen": 3842144, + "step": 18210 + }, + { + "epoch": 2.0038503850385037, + "grad_norm": 0.5521655082702637, + "learning_rate": 4.999999467285154e-05, + "loss": 0.1006, + "num_input_tokens_seen": 3843200, + "step": 18215 + }, + { + "epoch": 2.0044004400440043, + "grad_norm": 0.35488802194595337, + "learning_rate": 4.999999299083675e-05, + "loss": 0.0916, + "num_input_tokens_seen": 3844256, + "step": 18220 + }, + { + "epoch": 2.004950495049505, + "grad_norm": 0.37767165899276733, + "learning_rate": 4.999999107840901e-05, + "loss": 0.1277, + "num_input_tokens_seen": 3845344, + "step": 18225 + }, + { + "epoch": 2.0055005500550056, + "grad_norm": 0.17200249433517456, + "learning_rate": 4.9999988935568364e-05, + "loss": 0.1434, + "num_input_tokens_seen": 3846336, + "step": 18230 + }, + { + "epoch": 2.0060506050605063, + "grad_norm": 0.45493170619010925, + "learning_rate": 4.999998656231481e-05, + "loss": 0.1306, + "num_input_tokens_seen": 3847392, + "step": 18235 + }, + { + "epoch": 2.0066006600660065, + "grad_norm": 1.9012967348098755, + "learning_rate": 4.999998395864838e-05, + "loss": 0.0953, + "num_input_tokens_seen": 3848480, + "step": 18240 + }, + { + "epoch": 2.007150715071507, + "grad_norm": 0.21595624089241028, + "learning_rate": 4.999998112456909e-05, + "loss": 0.0398, + "num_input_tokens_seen": 3849536, + "step": 18245 + }, + { + "epoch": 2.0077007700770078, + "grad_norm": 0.2929876446723938, + "learning_rate": 4.9999978060076966e-05, + "loss": 0.1151, + "num_input_tokens_seen": 3850560, + "step": 18250 + }, + { + "epoch": 2.0082508250825084, + "grad_norm": 0.2114570587873459, + "learning_rate": 4.999997476517205e-05, + "loss": 0.022, + "num_input_tokens_seen": 3851680, + "step": 18255 + }, + { + "epoch": 2.0088008800880086, + "grad_norm": 0.033755432814359665, + "learning_rate": 4.999997123985436e-05, + "loss": 0.0507, + "num_input_tokens_seen": 3852672, + "step": 18260 + }, + { + "epoch": 2.0093509350935093, + "grad_norm": 1.7123748064041138, + "learning_rate": 4.999996748412392e-05, + "loss": 0.0665, + "num_input_tokens_seen": 3853824, + "step": 18265 + }, + { + "epoch": 2.00990099009901, + "grad_norm": 1.4192657470703125, + "learning_rate": 4.999996349798079e-05, + "loss": 0.1209, + "num_input_tokens_seen": 3854848, + "step": 18270 + }, + { + "epoch": 2.0104510451045106, + "grad_norm": 0.06359855085611343, + "learning_rate": 4.999995928142499e-05, + "loss": 0.0533, + "num_input_tokens_seen": 3855904, + "step": 18275 + }, + { + "epoch": 2.011001100110011, + "grad_norm": 0.20812629163265228, + "learning_rate": 4.999995483445655e-05, + "loss": 0.0955, + "num_input_tokens_seen": 3856928, + "step": 18280 + }, + { + "epoch": 2.0115511551155114, + "grad_norm": 0.5496929883956909, + "learning_rate": 4.999995015707554e-05, + "loss": 0.0647, + "num_input_tokens_seen": 3857952, + "step": 18285 + }, + { + "epoch": 2.012101210121012, + "grad_norm": 0.07379333674907684, + "learning_rate": 4.999994524928198e-05, + "loss": 0.0404, + "num_input_tokens_seen": 3859072, + "step": 18290 + }, + { + "epoch": 2.0126512651265127, + "grad_norm": 0.16827444732189178, + "learning_rate": 4.999994011107591e-05, + "loss": 0.0852, + "num_input_tokens_seen": 3860160, + "step": 18295 + }, + { + "epoch": 2.0132013201320134, + "grad_norm": 0.1510893851518631, + "learning_rate": 4.99999347424574e-05, + "loss": 0.0854, + "num_input_tokens_seen": 3861216, + "step": 18300 + }, + { + "epoch": 2.0137513751375136, + "grad_norm": 0.11637188494205475, + "learning_rate": 4.9999929143426484e-05, + "loss": 0.0576, + "num_input_tokens_seen": 3862272, + "step": 18305 + }, + { + "epoch": 2.014301430143014, + "grad_norm": 0.24620600044727325, + "learning_rate": 4.999992331398321e-05, + "loss": 0.0867, + "num_input_tokens_seen": 3863360, + "step": 18310 + }, + { + "epoch": 2.014851485148515, + "grad_norm": 0.8065462112426758, + "learning_rate": 4.999991725412765e-05, + "loss": 0.0762, + "num_input_tokens_seen": 3864416, + "step": 18315 + }, + { + "epoch": 2.0154015401540155, + "grad_norm": 0.9465771317481995, + "learning_rate": 4.999991096385984e-05, + "loss": 0.1531, + "num_input_tokens_seen": 3865440, + "step": 18320 + }, + { + "epoch": 2.015951595159516, + "grad_norm": 0.17094829678535461, + "learning_rate": 4.9999904443179855e-05, + "loss": 0.1021, + "num_input_tokens_seen": 3866496, + "step": 18325 + }, + { + "epoch": 2.0165016501650164, + "grad_norm": 0.10052970796823502, + "learning_rate": 4.999989769208774e-05, + "loss": 0.0643, + "num_input_tokens_seen": 3867520, + "step": 18330 + }, + { + "epoch": 2.017051705170517, + "grad_norm": 0.15033330023288727, + "learning_rate": 4.999989071058357e-05, + "loss": 0.0579, + "num_input_tokens_seen": 3868576, + "step": 18335 + }, + { + "epoch": 2.0176017601760177, + "grad_norm": 0.6169372797012329, + "learning_rate": 4.99998834986674e-05, + "loss": 0.095, + "num_input_tokens_seen": 3869632, + "step": 18340 + }, + { + "epoch": 2.0181518151815183, + "grad_norm": 0.4214542508125305, + "learning_rate": 4.99998760563393e-05, + "loss": 0.1284, + "num_input_tokens_seen": 3870592, + "step": 18345 + }, + { + "epoch": 2.0187018701870185, + "grad_norm": 0.22503036260604858, + "learning_rate": 4.9999868383599336e-05, + "loss": 0.0602, + "num_input_tokens_seen": 3871680, + "step": 18350 + }, + { + "epoch": 2.019251925192519, + "grad_norm": 0.030698101967573166, + "learning_rate": 4.9999860480447583e-05, + "loss": 0.0326, + "num_input_tokens_seen": 3872704, + "step": 18355 + }, + { + "epoch": 2.01980198019802, + "grad_norm": 0.6156591773033142, + "learning_rate": 4.999985234688411e-05, + "loss": 0.1313, + "num_input_tokens_seen": 3873728, + "step": 18360 + }, + { + "epoch": 2.0203520352035205, + "grad_norm": 0.5511041879653931, + "learning_rate": 4.9999843982909e-05, + "loss": 0.0945, + "num_input_tokens_seen": 3874752, + "step": 18365 + }, + { + "epoch": 2.020902090209021, + "grad_norm": 0.36622974276542664, + "learning_rate": 4.999983538852232e-05, + "loss": 0.0448, + "num_input_tokens_seen": 3875808, + "step": 18370 + }, + { + "epoch": 2.0214521452145213, + "grad_norm": 0.7567864656448364, + "learning_rate": 4.999982656372416e-05, + "loss": 0.0413, + "num_input_tokens_seen": 3876864, + "step": 18375 + }, + { + "epoch": 2.022002200220022, + "grad_norm": 0.8944949507713318, + "learning_rate": 4.999981750851459e-05, + "loss": 0.048, + "num_input_tokens_seen": 3877920, + "step": 18380 + }, + { + "epoch": 2.0225522552255226, + "grad_norm": 0.328755646944046, + "learning_rate": 4.9999808222893695e-05, + "loss": 0.0331, + "num_input_tokens_seen": 3878912, + "step": 18385 + }, + { + "epoch": 2.0231023102310233, + "grad_norm": 1.5602648258209229, + "learning_rate": 4.999979870686157e-05, + "loss": 0.0559, + "num_input_tokens_seen": 3879968, + "step": 18390 + }, + { + "epoch": 2.0236523652365235, + "grad_norm": 0.051188260316848755, + "learning_rate": 4.9999788960418284e-05, + "loss": 0.0365, + "num_input_tokens_seen": 3881024, + "step": 18395 + }, + { + "epoch": 2.024202420242024, + "grad_norm": 0.6934897899627686, + "learning_rate": 4.999977898356395e-05, + "loss": 0.1271, + "num_input_tokens_seen": 3882144, + "step": 18400 + }, + { + "epoch": 2.0247524752475248, + "grad_norm": 0.3034997880458832, + "learning_rate": 4.999976877629865e-05, + "loss": 0.0931, + "num_input_tokens_seen": 3883232, + "step": 18405 + }, + { + "epoch": 2.0253025302530254, + "grad_norm": 0.7031285166740417, + "learning_rate": 4.999975833862247e-05, + "loss": 0.1083, + "num_input_tokens_seen": 3884256, + "step": 18410 + }, + { + "epoch": 2.0258525852585256, + "grad_norm": 1.0393720865249634, + "learning_rate": 4.999974767053552e-05, + "loss": 0.1175, + "num_input_tokens_seen": 3885312, + "step": 18415 + }, + { + "epoch": 2.0264026402640263, + "grad_norm": 0.03742745891213417, + "learning_rate": 4.9999736772037884e-05, + "loss": 0.0245, + "num_input_tokens_seen": 3886368, + "step": 18420 + }, + { + "epoch": 2.026952695269527, + "grad_norm": 1.3186230659484863, + "learning_rate": 4.999972564312968e-05, + "loss": 0.1017, + "num_input_tokens_seen": 3887488, + "step": 18425 + }, + { + "epoch": 2.0275027502750276, + "grad_norm": 1.3951586484909058, + "learning_rate": 4.999971428381099e-05, + "loss": 0.0914, + "num_input_tokens_seen": 3888544, + "step": 18430 + }, + { + "epoch": 2.0280528052805282, + "grad_norm": 0.1618073582649231, + "learning_rate": 4.999970269408194e-05, + "loss": 0.0163, + "num_input_tokens_seen": 3889632, + "step": 18435 + }, + { + "epoch": 2.0286028602860284, + "grad_norm": 0.7443603873252869, + "learning_rate": 4.999969087394262e-05, + "loss": 0.1672, + "num_input_tokens_seen": 3890624, + "step": 18440 + }, + { + "epoch": 2.029152915291529, + "grad_norm": 0.312214195728302, + "learning_rate": 4.999967882339315e-05, + "loss": 0.0571, + "num_input_tokens_seen": 3891744, + "step": 18445 + }, + { + "epoch": 2.0297029702970297, + "grad_norm": 1.4320095777511597, + "learning_rate": 4.9999666542433633e-05, + "loss": 0.0411, + "num_input_tokens_seen": 3892864, + "step": 18450 + }, + { + "epoch": 2.0302530253025304, + "grad_norm": 2.3991215229034424, + "learning_rate": 4.9999654031064184e-05, + "loss": 0.0731, + "num_input_tokens_seen": 3893920, + "step": 18455 + }, + { + "epoch": 2.0308030803080306, + "grad_norm": 0.11447683721780777, + "learning_rate": 4.999964128928493e-05, + "loss": 0.0834, + "num_input_tokens_seen": 3895008, + "step": 18460 + }, + { + "epoch": 2.0313531353135312, + "grad_norm": 0.7108030319213867, + "learning_rate": 4.9999628317095964e-05, + "loss": 0.0933, + "num_input_tokens_seen": 3896064, + "step": 18465 + }, + { + "epoch": 2.031903190319032, + "grad_norm": 0.32401520013809204, + "learning_rate": 4.9999615114497424e-05, + "loss": 0.0282, + "num_input_tokens_seen": 3897120, + "step": 18470 + }, + { + "epoch": 2.0324532453245325, + "grad_norm": 0.01963309571146965, + "learning_rate": 4.9999601681489435e-05, + "loss": 0.0817, + "num_input_tokens_seen": 3898112, + "step": 18475 + }, + { + "epoch": 2.033003300330033, + "grad_norm": 1.274498462677002, + "learning_rate": 4.999958801807211e-05, + "loss": 0.1729, + "num_input_tokens_seen": 3899136, + "step": 18480 + }, + { + "epoch": 2.0335533553355334, + "grad_norm": 0.22849662601947784, + "learning_rate": 4.9999574124245575e-05, + "loss": 0.0844, + "num_input_tokens_seen": 3900128, + "step": 18485 + }, + { + "epoch": 2.034103410341034, + "grad_norm": 0.1420845240354538, + "learning_rate": 4.999956000000996e-05, + "loss": 0.0604, + "num_input_tokens_seen": 3901152, + "step": 18490 + }, + { + "epoch": 2.0346534653465347, + "grad_norm": 1.6883337497711182, + "learning_rate": 4.99995456453654e-05, + "loss": 0.138, + "num_input_tokens_seen": 3902240, + "step": 18495 + }, + { + "epoch": 2.0352035203520353, + "grad_norm": 0.10760863870382309, + "learning_rate": 4.999953106031202e-05, + "loss": 0.1289, + "num_input_tokens_seen": 3903328, + "step": 18500 + }, + { + "epoch": 2.0357535753575355, + "grad_norm": 0.251088410615921, + "learning_rate": 4.9999516244849956e-05, + "loss": 0.0809, + "num_input_tokens_seen": 3904352, + "step": 18505 + }, + { + "epoch": 2.036303630363036, + "grad_norm": 0.5314651131629944, + "learning_rate": 4.999950119897935e-05, + "loss": 0.0658, + "num_input_tokens_seen": 3905408, + "step": 18510 + }, + { + "epoch": 2.036853685368537, + "grad_norm": 2.387695550918579, + "learning_rate": 4.999948592270034e-05, + "loss": 0.1679, + "num_input_tokens_seen": 3906464, + "step": 18515 + }, + { + "epoch": 2.0374037403740375, + "grad_norm": 0.6246073246002197, + "learning_rate": 4.999947041601306e-05, + "loss": 0.0485, + "num_input_tokens_seen": 3907584, + "step": 18520 + }, + { + "epoch": 2.037953795379538, + "grad_norm": 0.20261839032173157, + "learning_rate": 4.9999454678917665e-05, + "loss": 0.0758, + "num_input_tokens_seen": 3908672, + "step": 18525 + }, + { + "epoch": 2.0385038503850383, + "grad_norm": 0.7579432725906372, + "learning_rate": 4.9999438711414284e-05, + "loss": 0.0342, + "num_input_tokens_seen": 3909760, + "step": 18530 + }, + { + "epoch": 2.039053905390539, + "grad_norm": 0.011861979030072689, + "learning_rate": 4.999942251350307e-05, + "loss": 0.0853, + "num_input_tokens_seen": 3910816, + "step": 18535 + }, + { + "epoch": 2.0396039603960396, + "grad_norm": 2.7241430282592773, + "learning_rate": 4.999940608518418e-05, + "loss": 0.3095, + "num_input_tokens_seen": 3911840, + "step": 18540 + }, + { + "epoch": 2.0401540154015403, + "grad_norm": 0.5020557641983032, + "learning_rate": 4.999938942645776e-05, + "loss": 0.1597, + "num_input_tokens_seen": 3912864, + "step": 18545 + }, + { + "epoch": 2.0407040704070405, + "grad_norm": 2.499570369720459, + "learning_rate": 4.999937253732397e-05, + "loss": 0.058, + "num_input_tokens_seen": 3913920, + "step": 18550 + }, + { + "epoch": 2.041254125412541, + "grad_norm": 2.746185064315796, + "learning_rate": 4.999935541778295e-05, + "loss": 0.1793, + "num_input_tokens_seen": 3915008, + "step": 18555 + }, + { + "epoch": 2.041804180418042, + "grad_norm": 0.3066761791706085, + "learning_rate": 4.999933806783487e-05, + "loss": 0.0696, + "num_input_tokens_seen": 3916096, + "step": 18560 + }, + { + "epoch": 2.0423542354235424, + "grad_norm": 1.2413322925567627, + "learning_rate": 4.9999320487479884e-05, + "loss": 0.09, + "num_input_tokens_seen": 3917056, + "step": 18565 + }, + { + "epoch": 2.042904290429043, + "grad_norm": 0.07330399006605148, + "learning_rate": 4.999930267671816e-05, + "loss": 0.1294, + "num_input_tokens_seen": 3918144, + "step": 18570 + }, + { + "epoch": 2.0434543454345433, + "grad_norm": 0.07430948317050934, + "learning_rate": 4.999928463554986e-05, + "loss": 0.1667, + "num_input_tokens_seen": 3919232, + "step": 18575 + }, + { + "epoch": 2.044004400440044, + "grad_norm": 0.9778586626052856, + "learning_rate": 4.999926636397515e-05, + "loss": 0.1573, + "num_input_tokens_seen": 3920256, + "step": 18580 + }, + { + "epoch": 2.0445544554455446, + "grad_norm": 0.25474274158477783, + "learning_rate": 4.9999247861994194e-05, + "loss": 0.0406, + "num_input_tokens_seen": 3921376, + "step": 18585 + }, + { + "epoch": 2.0451045104510452, + "grad_norm": 0.43848299980163574, + "learning_rate": 4.999922912960717e-05, + "loss": 0.0892, + "num_input_tokens_seen": 3922432, + "step": 18590 + }, + { + "epoch": 2.0456545654565454, + "grad_norm": 0.9217526912689209, + "learning_rate": 4.9999210166814236e-05, + "loss": 0.063, + "num_input_tokens_seen": 3923488, + "step": 18595 + }, + { + "epoch": 2.046204620462046, + "grad_norm": 0.06638381630182266, + "learning_rate": 4.999919097361559e-05, + "loss": 0.0674, + "num_input_tokens_seen": 3924512, + "step": 18600 + }, + { + "epoch": 2.0467546754675467, + "grad_norm": 0.255413681268692, + "learning_rate": 4.999917155001139e-05, + "loss": 0.0307, + "num_input_tokens_seen": 3925600, + "step": 18605 + }, + { + "epoch": 2.0473047304730474, + "grad_norm": 0.36795181035995483, + "learning_rate": 4.9999151896001826e-05, + "loss": 0.144, + "num_input_tokens_seen": 3926656, + "step": 18610 + }, + { + "epoch": 2.047854785478548, + "grad_norm": 0.4710030257701874, + "learning_rate": 4.999913201158707e-05, + "loss": 0.099, + "num_input_tokens_seen": 3927744, + "step": 18615 + }, + { + "epoch": 2.0484048404840483, + "grad_norm": 0.08901151269674301, + "learning_rate": 4.999911189676732e-05, + "loss": 0.0487, + "num_input_tokens_seen": 3928736, + "step": 18620 + }, + { + "epoch": 2.048954895489549, + "grad_norm": 0.5232357382774353, + "learning_rate": 4.999909155154274e-05, + "loss": 0.0677, + "num_input_tokens_seen": 3929760, + "step": 18625 + }, + { + "epoch": 2.0495049504950495, + "grad_norm": 0.17936767637729645, + "learning_rate": 4.999907097591353e-05, + "loss": 0.0524, + "num_input_tokens_seen": 3930816, + "step": 18630 + }, + { + "epoch": 2.05005500550055, + "grad_norm": 0.8941658735275269, + "learning_rate": 4.999905016987989e-05, + "loss": 0.1376, + "num_input_tokens_seen": 3931904, + "step": 18635 + }, + { + "epoch": 2.0506050605060504, + "grad_norm": 0.6758994460105896, + "learning_rate": 4.999902913344199e-05, + "loss": 0.1098, + "num_input_tokens_seen": 3932896, + "step": 18640 + }, + { + "epoch": 2.051155115511551, + "grad_norm": 0.4150141179561615, + "learning_rate": 4.999900786660003e-05, + "loss": 0.0655, + "num_input_tokens_seen": 3933984, + "step": 18645 + }, + { + "epoch": 2.0517051705170517, + "grad_norm": 0.07564493268728256, + "learning_rate": 4.9998986369354214e-05, + "loss": 0.0535, + "num_input_tokens_seen": 3935072, + "step": 18650 + }, + { + "epoch": 2.0522552255225524, + "grad_norm": 2.0880167484283447, + "learning_rate": 4.999896464170475e-05, + "loss": 0.21, + "num_input_tokens_seen": 3936160, + "step": 18655 + }, + { + "epoch": 2.052805280528053, + "grad_norm": 0.05607307329773903, + "learning_rate": 4.9998942683651806e-05, + "loss": 0.037, + "num_input_tokens_seen": 3937216, + "step": 18660 + }, + { + "epoch": 2.053355335533553, + "grad_norm": 0.4461285471916199, + "learning_rate": 4.999892049519561e-05, + "loss": 0.1132, + "num_input_tokens_seen": 3938208, + "step": 18665 + }, + { + "epoch": 2.053905390539054, + "grad_norm": 0.1366727501153946, + "learning_rate": 4.9998898076336365e-05, + "loss": 0.0206, + "num_input_tokens_seen": 3939232, + "step": 18670 + }, + { + "epoch": 2.0544554455445545, + "grad_norm": 0.4791722595691681, + "learning_rate": 4.999887542707427e-05, + "loss": 0.1663, + "num_input_tokens_seen": 3940288, + "step": 18675 + }, + { + "epoch": 2.055005500550055, + "grad_norm": 0.5687710642814636, + "learning_rate": 4.9998852547409525e-05, + "loss": 0.0424, + "num_input_tokens_seen": 3941376, + "step": 18680 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.01947443000972271, + "learning_rate": 4.999882943734237e-05, + "loss": 0.0309, + "num_input_tokens_seen": 3942432, + "step": 18685 + }, + { + "epoch": 2.056105610561056, + "grad_norm": 0.6415368318557739, + "learning_rate": 4.999880609687298e-05, + "loss": 0.0985, + "num_input_tokens_seen": 3943424, + "step": 18690 + }, + { + "epoch": 2.0566556655665567, + "grad_norm": 0.0399094820022583, + "learning_rate": 4.9998782526001596e-05, + "loss": 0.0331, + "num_input_tokens_seen": 3944448, + "step": 18695 + }, + { + "epoch": 2.0572057205720573, + "grad_norm": 0.3204480707645416, + "learning_rate": 4.9998758724728435e-05, + "loss": 0.1107, + "num_input_tokens_seen": 3945504, + "step": 18700 + }, + { + "epoch": 2.057755775577558, + "grad_norm": 0.4787912368774414, + "learning_rate": 4.999873469305371e-05, + "loss": 0.1523, + "num_input_tokens_seen": 3946656, + "step": 18705 + }, + { + "epoch": 2.058305830583058, + "grad_norm": 0.6904472708702087, + "learning_rate": 4.9998710430977635e-05, + "loss": 0.1746, + "num_input_tokens_seen": 3947616, + "step": 18710 + }, + { + "epoch": 2.058855885588559, + "grad_norm": 0.9446296095848083, + "learning_rate": 4.999868593850045e-05, + "loss": 0.1102, + "num_input_tokens_seen": 3948672, + "step": 18715 + }, + { + "epoch": 2.0594059405940595, + "grad_norm": 0.08309479057788849, + "learning_rate": 4.999866121562237e-05, + "loss": 0.0468, + "num_input_tokens_seen": 3949760, + "step": 18720 + }, + { + "epoch": 2.05995599559956, + "grad_norm": 0.26830795407295227, + "learning_rate": 4.9998636262343625e-05, + "loss": 0.0351, + "num_input_tokens_seen": 3950752, + "step": 18725 + }, + { + "epoch": 2.0605060506050603, + "grad_norm": 0.14757084846496582, + "learning_rate": 4.999861107866444e-05, + "loss": 0.0765, + "num_input_tokens_seen": 3951776, + "step": 18730 + }, + { + "epoch": 2.061056105610561, + "grad_norm": 0.18759524822235107, + "learning_rate": 4.999858566458506e-05, + "loss": 0.0802, + "num_input_tokens_seen": 3952832, + "step": 18735 + }, + { + "epoch": 2.0616061606160616, + "grad_norm": 0.3776915371417999, + "learning_rate": 4.9998560020105706e-05, + "loss": 0.0265, + "num_input_tokens_seen": 3953920, + "step": 18740 + }, + { + "epoch": 2.0621562156215623, + "grad_norm": 2.179623603820801, + "learning_rate": 4.999853414522662e-05, + "loss": 0.1585, + "num_input_tokens_seen": 3955008, + "step": 18745 + }, + { + "epoch": 2.062706270627063, + "grad_norm": 0.5601312518119812, + "learning_rate": 4.999850803994805e-05, + "loss": 0.1019, + "num_input_tokens_seen": 3956096, + "step": 18750 + }, + { + "epoch": 2.063256325632563, + "grad_norm": 0.9138340950012207, + "learning_rate": 4.999848170427021e-05, + "loss": 0.0989, + "num_input_tokens_seen": 3957152, + "step": 18755 + }, + { + "epoch": 2.0638063806380638, + "grad_norm": 0.0226264800876379, + "learning_rate": 4.999845513819337e-05, + "loss": 0.0331, + "num_input_tokens_seen": 3958208, + "step": 18760 + }, + { + "epoch": 2.0643564356435644, + "grad_norm": 0.35135215520858765, + "learning_rate": 4.9998428341717765e-05, + "loss": 0.0442, + "num_input_tokens_seen": 3959264, + "step": 18765 + }, + { + "epoch": 2.064906490649065, + "grad_norm": 0.18771660327911377, + "learning_rate": 4.999840131484363e-05, + "loss": 0.1188, + "num_input_tokens_seen": 3960352, + "step": 18770 + }, + { + "epoch": 2.0654565456545653, + "grad_norm": 0.2608099579811096, + "learning_rate": 4.999837405757124e-05, + "loss": 0.0447, + "num_input_tokens_seen": 3961440, + "step": 18775 + }, + { + "epoch": 2.066006600660066, + "grad_norm": 0.08856550604104996, + "learning_rate": 4.999834656990082e-05, + "loss": 0.0578, + "num_input_tokens_seen": 3962464, + "step": 18780 + }, + { + "epoch": 2.0665566556655666, + "grad_norm": 0.606529712677002, + "learning_rate": 4.999831885183265e-05, + "loss": 0.0308, + "num_input_tokens_seen": 3963552, + "step": 18785 + }, + { + "epoch": 2.067106710671067, + "grad_norm": 0.11866212636232376, + "learning_rate": 4.9998290903366965e-05, + "loss": 0.0744, + "num_input_tokens_seen": 3964704, + "step": 18790 + }, + { + "epoch": 2.067656765676568, + "grad_norm": 0.492336630821228, + "learning_rate": 4.999826272450402e-05, + "loss": 0.1464, + "num_input_tokens_seen": 3965696, + "step": 18795 + }, + { + "epoch": 2.068206820682068, + "grad_norm": 0.1232079565525055, + "learning_rate": 4.9998234315244085e-05, + "loss": 0.0449, + "num_input_tokens_seen": 3966688, + "step": 18800 + }, + { + "epoch": 2.0687568756875687, + "grad_norm": 0.9964995980262756, + "learning_rate": 4.999820567558742e-05, + "loss": 0.1071, + "num_input_tokens_seen": 3967712, + "step": 18805 + }, + { + "epoch": 2.0693069306930694, + "grad_norm": 0.9119172692298889, + "learning_rate": 4.999817680553429e-05, + "loss": 0.0663, + "num_input_tokens_seen": 3968704, + "step": 18810 + }, + { + "epoch": 2.06985698569857, + "grad_norm": 0.8857730031013489, + "learning_rate": 4.9998147705084964e-05, + "loss": 0.0365, + "num_input_tokens_seen": 3969760, + "step": 18815 + }, + { + "epoch": 2.0704070407040702, + "grad_norm": 0.1260545402765274, + "learning_rate": 4.99981183742397e-05, + "loss": 0.0683, + "num_input_tokens_seen": 3970816, + "step": 18820 + }, + { + "epoch": 2.070957095709571, + "grad_norm": 0.12656527757644653, + "learning_rate": 4.9998088812998776e-05, + "loss": 0.0941, + "num_input_tokens_seen": 3971872, + "step": 18825 + }, + { + "epoch": 2.0715071507150715, + "grad_norm": 1.0716837644577026, + "learning_rate": 4.999805902136246e-05, + "loss": 0.1065, + "num_input_tokens_seen": 3972864, + "step": 18830 + }, + { + "epoch": 2.072057205720572, + "grad_norm": 0.5566033720970154, + "learning_rate": 4.999802899933104e-05, + "loss": 0.059, + "num_input_tokens_seen": 3973888, + "step": 18835 + }, + { + "epoch": 2.072607260726073, + "grad_norm": 0.19371239840984344, + "learning_rate": 4.9997998746904775e-05, + "loss": 0.0483, + "num_input_tokens_seen": 3974880, + "step": 18840 + }, + { + "epoch": 2.073157315731573, + "grad_norm": 1.0617784261703491, + "learning_rate": 4.999796826408395e-05, + "loss": 0.2231, + "num_input_tokens_seen": 3975904, + "step": 18845 + }, + { + "epoch": 2.0737073707370737, + "grad_norm": 0.0456346832215786, + "learning_rate": 4.9997937550868845e-05, + "loss": 0.1012, + "num_input_tokens_seen": 3976896, + "step": 18850 + }, + { + "epoch": 2.0742574257425743, + "grad_norm": 0.9437252283096313, + "learning_rate": 4.999790660725975e-05, + "loss": 0.0949, + "num_input_tokens_seen": 3977888, + "step": 18855 + }, + { + "epoch": 2.074807480748075, + "grad_norm": 0.3451574444770813, + "learning_rate": 4.9997875433256935e-05, + "loss": 0.082, + "num_input_tokens_seen": 3978944, + "step": 18860 + }, + { + "epoch": 2.075357535753575, + "grad_norm": 0.4101276397705078, + "learning_rate": 4.999784402886071e-05, + "loss": 0.1554, + "num_input_tokens_seen": 3980064, + "step": 18865 + }, + { + "epoch": 2.075907590759076, + "grad_norm": 0.27609702944755554, + "learning_rate": 4.999781239407134e-05, + "loss": 0.1084, + "num_input_tokens_seen": 3981056, + "step": 18870 + }, + { + "epoch": 2.0764576457645765, + "grad_norm": 0.357574999332428, + "learning_rate": 4.999778052888914e-05, + "loss": 0.1242, + "num_input_tokens_seen": 3982080, + "step": 18875 + }, + { + "epoch": 2.077007700770077, + "grad_norm": 0.2713108956813812, + "learning_rate": 4.9997748433314384e-05, + "loss": 0.0211, + "num_input_tokens_seen": 3983168, + "step": 18880 + }, + { + "epoch": 2.0775577557755778, + "grad_norm": 0.400402307510376, + "learning_rate": 4.999771610734737e-05, + "loss": 0.1208, + "num_input_tokens_seen": 3984160, + "step": 18885 + }, + { + "epoch": 2.078107810781078, + "grad_norm": 0.06782571226358414, + "learning_rate": 4.999768355098842e-05, + "loss": 0.0294, + "num_input_tokens_seen": 3985248, + "step": 18890 + }, + { + "epoch": 2.0786578657865786, + "grad_norm": 1.813965082168579, + "learning_rate": 4.99976507642378e-05, + "loss": 0.086, + "num_input_tokens_seen": 3986304, + "step": 18895 + }, + { + "epoch": 2.0792079207920793, + "grad_norm": 0.2572343647480011, + "learning_rate": 4.999761774709583e-05, + "loss": 0.0429, + "num_input_tokens_seen": 3987360, + "step": 18900 + }, + { + "epoch": 2.07975797579758, + "grad_norm": 1.1294032335281372, + "learning_rate": 4.9997584499562814e-05, + "loss": 0.067, + "num_input_tokens_seen": 3988352, + "step": 18905 + }, + { + "epoch": 2.08030803080308, + "grad_norm": 0.3623747229576111, + "learning_rate": 4.999755102163906e-05, + "loss": 0.0981, + "num_input_tokens_seen": 3989408, + "step": 18910 + }, + { + "epoch": 2.080858085808581, + "grad_norm": 0.3045366704463959, + "learning_rate": 4.999751731332487e-05, + "loss": 0.0525, + "num_input_tokens_seen": 3990464, + "step": 18915 + }, + { + "epoch": 2.0814081408140814, + "grad_norm": 0.2498622089624405, + "learning_rate": 4.9997483374620565e-05, + "loss": 0.0415, + "num_input_tokens_seen": 3991520, + "step": 18920 + }, + { + "epoch": 2.081958195819582, + "grad_norm": 0.0723116472363472, + "learning_rate": 4.9997449205526443e-05, + "loss": 0.0656, + "num_input_tokens_seen": 3992576, + "step": 18925 + }, + { + "epoch": 2.0825082508250823, + "grad_norm": 0.298582524061203, + "learning_rate": 4.999741480604283e-05, + "loss": 0.0378, + "num_input_tokens_seen": 3993600, + "step": 18930 + }, + { + "epoch": 2.083058305830583, + "grad_norm": 0.38444289565086365, + "learning_rate": 4.9997380176170034e-05, + "loss": 0.1268, + "num_input_tokens_seen": 3994592, + "step": 18935 + }, + { + "epoch": 2.0836083608360836, + "grad_norm": 0.014415789395570755, + "learning_rate": 4.9997345315908384e-05, + "loss": 0.1917, + "num_input_tokens_seen": 3995648, + "step": 18940 + }, + { + "epoch": 2.0841584158415842, + "grad_norm": 1.8438695669174194, + "learning_rate": 4.99973102252582e-05, + "loss": 0.1973, + "num_input_tokens_seen": 3996672, + "step": 18945 + }, + { + "epoch": 2.084708470847085, + "grad_norm": 0.08406209945678711, + "learning_rate": 4.999727490421979e-05, + "loss": 0.0939, + "num_input_tokens_seen": 3997696, + "step": 18950 + }, + { + "epoch": 2.085258525852585, + "grad_norm": 0.2740252614021301, + "learning_rate": 4.9997239352793515e-05, + "loss": 0.079, + "num_input_tokens_seen": 3998784, + "step": 18955 + }, + { + "epoch": 2.0858085808580857, + "grad_norm": 0.9743459224700928, + "learning_rate": 4.9997203570979656e-05, + "loss": 0.0668, + "num_input_tokens_seen": 3999840, + "step": 18960 + }, + { + "epoch": 2.0863586358635864, + "grad_norm": 0.34945565462112427, + "learning_rate": 4.999716755877858e-05, + "loss": 0.0429, + "num_input_tokens_seen": 4000864, + "step": 18965 + }, + { + "epoch": 2.086908690869087, + "grad_norm": 0.41532015800476074, + "learning_rate": 4.9997131316190604e-05, + "loss": 0.0896, + "num_input_tokens_seen": 4001952, + "step": 18970 + }, + { + "epoch": 2.0874587458745877, + "grad_norm": 0.03221442177891731, + "learning_rate": 4.999709484321606e-05, + "loss": 0.1351, + "num_input_tokens_seen": 4003040, + "step": 18975 + }, + { + "epoch": 2.088008800880088, + "grad_norm": 3.4191224575042725, + "learning_rate": 4.999705813985529e-05, + "loss": 0.1343, + "num_input_tokens_seen": 4004160, + "step": 18980 + }, + { + "epoch": 2.0885588558855885, + "grad_norm": 0.1564362645149231, + "learning_rate": 4.999702120610863e-05, + "loss": 0.1485, + "num_input_tokens_seen": 4005248, + "step": 18985 + }, + { + "epoch": 2.089108910891089, + "grad_norm": 1.3746325969696045, + "learning_rate": 4.999698404197642e-05, + "loss": 0.0935, + "num_input_tokens_seen": 4006272, + "step": 18990 + }, + { + "epoch": 2.08965896589659, + "grad_norm": 0.1968020796775818, + "learning_rate": 4.9996946647458996e-05, + "loss": 0.0557, + "num_input_tokens_seen": 4007296, + "step": 18995 + }, + { + "epoch": 2.09020902090209, + "grad_norm": 0.8905907273292542, + "learning_rate": 4.999690902255672e-05, + "loss": 0.0912, + "num_input_tokens_seen": 4008320, + "step": 19000 + }, + { + "epoch": 2.0907590759075907, + "grad_norm": 0.9126741290092468, + "learning_rate": 4.999687116726992e-05, + "loss": 0.0693, + "num_input_tokens_seen": 4009376, + "step": 19005 + }, + { + "epoch": 2.0913091309130913, + "grad_norm": 0.6282766461372375, + "learning_rate": 4.999683308159896e-05, + "loss": 0.0543, + "num_input_tokens_seen": 4010496, + "step": 19010 + }, + { + "epoch": 2.091859185918592, + "grad_norm": 0.30891624093055725, + "learning_rate": 4.999679476554418e-05, + "loss": 0.0687, + "num_input_tokens_seen": 4011584, + "step": 19015 + }, + { + "epoch": 2.092409240924092, + "grad_norm": 1.3324719667434692, + "learning_rate": 4.999675621910594e-05, + "loss": 0.0884, + "num_input_tokens_seen": 4012672, + "step": 19020 + }, + { + "epoch": 2.092959295929593, + "grad_norm": 0.4929545521736145, + "learning_rate": 4.999671744228459e-05, + "loss": 0.0342, + "num_input_tokens_seen": 4013696, + "step": 19025 + }, + { + "epoch": 2.0935093509350935, + "grad_norm": 1.4855461120605469, + "learning_rate": 4.999667843508049e-05, + "loss": 0.1097, + "num_input_tokens_seen": 4014752, + "step": 19030 + }, + { + "epoch": 2.094059405940594, + "grad_norm": 0.9333707094192505, + "learning_rate": 4.9996639197494e-05, + "loss": 0.1103, + "num_input_tokens_seen": 4015872, + "step": 19035 + }, + { + "epoch": 2.094609460946095, + "grad_norm": 0.15533305704593658, + "learning_rate": 4.999659972952548e-05, + "loss": 0.0307, + "num_input_tokens_seen": 4016928, + "step": 19040 + }, + { + "epoch": 2.095159515951595, + "grad_norm": 1.2966781854629517, + "learning_rate": 4.99965600311753e-05, + "loss": 0.0852, + "num_input_tokens_seen": 4017984, + "step": 19045 + }, + { + "epoch": 2.0957095709570956, + "grad_norm": 0.4575272500514984, + "learning_rate": 4.999652010244382e-05, + "loss": 0.157, + "num_input_tokens_seen": 4019040, + "step": 19050 + }, + { + "epoch": 2.0962596259625963, + "grad_norm": 1.0662670135498047, + "learning_rate": 4.9996479943331395e-05, + "loss": 0.0585, + "num_input_tokens_seen": 4020064, + "step": 19055 + }, + { + "epoch": 2.096809680968097, + "grad_norm": 0.370108038187027, + "learning_rate": 4.999643955383842e-05, + "loss": 0.0462, + "num_input_tokens_seen": 4021088, + "step": 19060 + }, + { + "epoch": 2.097359735973597, + "grad_norm": 0.21170523762702942, + "learning_rate": 4.999639893396526e-05, + "loss": 0.076, + "num_input_tokens_seen": 4022144, + "step": 19065 + }, + { + "epoch": 2.097909790979098, + "grad_norm": 0.24724991619586945, + "learning_rate": 4.999635808371228e-05, + "loss": 0.0619, + "num_input_tokens_seen": 4023232, + "step": 19070 + }, + { + "epoch": 2.0984598459845984, + "grad_norm": 0.2726917564868927, + "learning_rate": 4.999631700307986e-05, + "loss": 0.1622, + "num_input_tokens_seen": 4024320, + "step": 19075 + }, + { + "epoch": 2.099009900990099, + "grad_norm": 0.026074400171637535, + "learning_rate": 4.999627569206839e-05, + "loss": 0.1085, + "num_input_tokens_seen": 4025344, + "step": 19080 + }, + { + "epoch": 2.0995599559955997, + "grad_norm": 0.15434464812278748, + "learning_rate": 4.999623415067823e-05, + "loss": 0.1236, + "num_input_tokens_seen": 4026336, + "step": 19085 + }, + { + "epoch": 2.1001100110011, + "grad_norm": 0.39140594005584717, + "learning_rate": 4.9996192378909786e-05, + "loss": 0.1494, + "num_input_tokens_seen": 4027424, + "step": 19090 + }, + { + "epoch": 2.1006600660066006, + "grad_norm": 0.06730911880731583, + "learning_rate": 4.999615037676342e-05, + "loss": 0.1167, + "num_input_tokens_seen": 4028480, + "step": 19095 + }, + { + "epoch": 2.1012101210121013, + "grad_norm": 1.286352515220642, + "learning_rate": 4.999610814423954e-05, + "loss": 0.1488, + "num_input_tokens_seen": 4029568, + "step": 19100 + }, + { + "epoch": 2.101760176017602, + "grad_norm": 1.0119049549102783, + "learning_rate": 4.9996065681338515e-05, + "loss": 0.0303, + "num_input_tokens_seen": 4030624, + "step": 19105 + }, + { + "epoch": 2.102310231023102, + "grad_norm": 0.3914945125579834, + "learning_rate": 4.9996022988060755e-05, + "loss": 0.0942, + "num_input_tokens_seen": 4031648, + "step": 19110 + }, + { + "epoch": 2.1028602860286028, + "grad_norm": 0.1528337448835373, + "learning_rate": 4.999598006440664e-05, + "loss": 0.0218, + "num_input_tokens_seen": 4032640, + "step": 19115 + }, + { + "epoch": 2.1034103410341034, + "grad_norm": 0.6366077065467834, + "learning_rate": 4.999593691037657e-05, + "loss": 0.1, + "num_input_tokens_seen": 4033728, + "step": 19120 + }, + { + "epoch": 2.103960396039604, + "grad_norm": 0.07702034711837769, + "learning_rate": 4.999589352597095e-05, + "loss": 0.0923, + "num_input_tokens_seen": 4034752, + "step": 19125 + }, + { + "epoch": 2.1045104510451047, + "grad_norm": 0.07902459800243378, + "learning_rate": 4.999584991119017e-05, + "loss": 0.1144, + "num_input_tokens_seen": 4035776, + "step": 19130 + }, + { + "epoch": 2.105060506050605, + "grad_norm": 1.2212316989898682, + "learning_rate": 4.999580606603463e-05, + "loss": 0.1195, + "num_input_tokens_seen": 4036832, + "step": 19135 + }, + { + "epoch": 2.1056105610561056, + "grad_norm": 0.5513807535171509, + "learning_rate": 4.999576199050475e-05, + "loss": 0.1595, + "num_input_tokens_seen": 4037824, + "step": 19140 + }, + { + "epoch": 2.106160616061606, + "grad_norm": 0.10746482759714127, + "learning_rate": 4.999571768460093e-05, + "loss": 0.0761, + "num_input_tokens_seen": 4038848, + "step": 19145 + }, + { + "epoch": 2.106710671067107, + "grad_norm": 0.03864101320505142, + "learning_rate": 4.9995673148323566e-05, + "loss": 0.0363, + "num_input_tokens_seen": 4039872, + "step": 19150 + }, + { + "epoch": 2.107260726072607, + "grad_norm": 0.0412336029112339, + "learning_rate": 4.999562838167307e-05, + "loss": 0.0481, + "num_input_tokens_seen": 4040928, + "step": 19155 + }, + { + "epoch": 2.1078107810781077, + "grad_norm": 0.46643561124801636, + "learning_rate": 4.999558338464987e-05, + "loss": 0.0551, + "num_input_tokens_seen": 4041984, + "step": 19160 + }, + { + "epoch": 2.1083608360836084, + "grad_norm": 0.5339401364326477, + "learning_rate": 4.999553815725437e-05, + "loss": 0.0709, + "num_input_tokens_seen": 4043104, + "step": 19165 + }, + { + "epoch": 2.108910891089109, + "grad_norm": 0.356754332780838, + "learning_rate": 4.999549269948699e-05, + "loss": 0.0669, + "num_input_tokens_seen": 4044224, + "step": 19170 + }, + { + "epoch": 2.1094609460946097, + "grad_norm": 1.155169129371643, + "learning_rate": 4.999544701134815e-05, + "loss": 0.1023, + "num_input_tokens_seen": 4045312, + "step": 19175 + }, + { + "epoch": 2.11001100110011, + "grad_norm": 0.5063312649726868, + "learning_rate": 4.9995401092838255e-05, + "loss": 0.1232, + "num_input_tokens_seen": 4046304, + "step": 19180 + }, + { + "epoch": 2.1105610561056105, + "grad_norm": 0.8924984931945801, + "learning_rate": 4.9995354943957755e-05, + "loss": 0.0691, + "num_input_tokens_seen": 4047360, + "step": 19185 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.8260666728019714, + "learning_rate": 4.999530856470706e-05, + "loss": 0.1534, + "num_input_tokens_seen": 4048416, + "step": 19190 + }, + { + "epoch": 2.111661166116612, + "grad_norm": 1.1477965116500854, + "learning_rate": 4.99952619550866e-05, + "loss": 0.13, + "num_input_tokens_seen": 4049440, + "step": 19195 + }, + { + "epoch": 2.112211221122112, + "grad_norm": 0.20926468074321747, + "learning_rate": 4.999521511509681e-05, + "loss": 0.0466, + "num_input_tokens_seen": 4050464, + "step": 19200 + }, + { + "epoch": 2.1127612761276127, + "grad_norm": 0.7297373414039612, + "learning_rate": 4.99951680447381e-05, + "loss": 0.1253, + "num_input_tokens_seen": 4051424, + "step": 19205 + }, + { + "epoch": 2.1133113311331133, + "grad_norm": 0.20046833157539368, + "learning_rate": 4.999512074401094e-05, + "loss": 0.0401, + "num_input_tokens_seen": 4052480, + "step": 19210 + }, + { + "epoch": 2.113861386138614, + "grad_norm": 0.0822799950838089, + "learning_rate": 4.9995073212915736e-05, + "loss": 0.0847, + "num_input_tokens_seen": 4053504, + "step": 19215 + }, + { + "epoch": 2.1144114411441146, + "grad_norm": 0.45285433530807495, + "learning_rate": 4.9995025451452935e-05, + "loss": 0.0603, + "num_input_tokens_seen": 4054496, + "step": 19220 + }, + { + "epoch": 2.114961496149615, + "grad_norm": 0.28722867369651794, + "learning_rate": 4.9994977459622976e-05, + "loss": 0.1222, + "num_input_tokens_seen": 4055520, + "step": 19225 + }, + { + "epoch": 2.1155115511551155, + "grad_norm": 0.4049420952796936, + "learning_rate": 4.999492923742631e-05, + "loss": 0.0622, + "num_input_tokens_seen": 4056544, + "step": 19230 + }, + { + "epoch": 2.116061606160616, + "grad_norm": 0.1276259422302246, + "learning_rate": 4.9994880784863375e-05, + "loss": 0.2685, + "num_input_tokens_seen": 4057600, + "step": 19235 + }, + { + "epoch": 2.1166116611661168, + "grad_norm": 0.09926128387451172, + "learning_rate": 4.9994832101934607e-05, + "loss": 0.1908, + "num_input_tokens_seen": 4058656, + "step": 19240 + }, + { + "epoch": 2.117161716171617, + "grad_norm": 0.16107426583766937, + "learning_rate": 4.9994783188640476e-05, + "loss": 0.0704, + "num_input_tokens_seen": 4059712, + "step": 19245 + }, + { + "epoch": 2.1177117711771176, + "grad_norm": 1.4559792280197144, + "learning_rate": 4.999473404498142e-05, + "loss": 0.0546, + "num_input_tokens_seen": 4060704, + "step": 19250 + }, + { + "epoch": 2.1182618261826183, + "grad_norm": 0.5918624997138977, + "learning_rate": 4.999468467095789e-05, + "loss": 0.0623, + "num_input_tokens_seen": 4061792, + "step": 19255 + }, + { + "epoch": 2.118811881188119, + "grad_norm": 0.32545533776283264, + "learning_rate": 4.999463506657035e-05, + "loss": 0.0611, + "num_input_tokens_seen": 4062944, + "step": 19260 + }, + { + "epoch": 2.1193619361936196, + "grad_norm": 0.48998838663101196, + "learning_rate": 4.9994585231819246e-05, + "loss": 0.0441, + "num_input_tokens_seen": 4063968, + "step": 19265 + }, + { + "epoch": 2.1199119911991198, + "grad_norm": 0.1591438204050064, + "learning_rate": 4.999453516670505e-05, + "loss": 0.0376, + "num_input_tokens_seen": 4065056, + "step": 19270 + }, + { + "epoch": 2.1204620462046204, + "grad_norm": 0.5180050730705261, + "learning_rate": 4.9994484871228206e-05, + "loss": 0.0285, + "num_input_tokens_seen": 4066144, + "step": 19275 + }, + { + "epoch": 2.121012101210121, + "grad_norm": 0.40513432025909424, + "learning_rate": 4.999443434538921e-05, + "loss": 0.0361, + "num_input_tokens_seen": 4067264, + "step": 19280 + }, + { + "epoch": 2.1215621562156217, + "grad_norm": 1.340785026550293, + "learning_rate": 4.9994383589188485e-05, + "loss": 0.0715, + "num_input_tokens_seen": 4068320, + "step": 19285 + }, + { + "epoch": 2.122112211221122, + "grad_norm": 0.2305288165807724, + "learning_rate": 4.999433260262653e-05, + "loss": 0.029, + "num_input_tokens_seen": 4069408, + "step": 19290 + }, + { + "epoch": 2.1226622662266226, + "grad_norm": 0.6073452234268188, + "learning_rate": 4.99942813857038e-05, + "loss": 0.0923, + "num_input_tokens_seen": 4070464, + "step": 19295 + }, + { + "epoch": 2.1232123212321232, + "grad_norm": 0.05758344754576683, + "learning_rate": 4.999422993842078e-05, + "loss": 0.0347, + "num_input_tokens_seen": 4071488, + "step": 19300 + }, + { + "epoch": 2.123762376237624, + "grad_norm": 1.1985026597976685, + "learning_rate": 4.999417826077793e-05, + "loss": 0.0541, + "num_input_tokens_seen": 4072576, + "step": 19305 + }, + { + "epoch": 2.1243124312431245, + "grad_norm": 0.2534109055995941, + "learning_rate": 4.9994126352775725e-05, + "loss": 0.0917, + "num_input_tokens_seen": 4073632, + "step": 19310 + }, + { + "epoch": 2.1248624862486247, + "grad_norm": 0.9306215643882751, + "learning_rate": 4.9994074214414666e-05, + "loss": 0.12, + "num_input_tokens_seen": 4074720, + "step": 19315 + }, + { + "epoch": 2.1254125412541254, + "grad_norm": 0.7530447244644165, + "learning_rate": 4.9994021845695205e-05, + "loss": 0.0923, + "num_input_tokens_seen": 4075776, + "step": 19320 + }, + { + "epoch": 2.125962596259626, + "grad_norm": 0.6598802208900452, + "learning_rate": 4.999396924661784e-05, + "loss": 0.0692, + "num_input_tokens_seen": 4076864, + "step": 19325 + }, + { + "epoch": 2.1265126512651267, + "grad_norm": 0.43446746468544006, + "learning_rate": 4.999391641718306e-05, + "loss": 0.0565, + "num_input_tokens_seen": 4077952, + "step": 19330 + }, + { + "epoch": 2.127062706270627, + "grad_norm": 0.18144899606704712, + "learning_rate": 4.9993863357391344e-05, + "loss": 0.1594, + "num_input_tokens_seen": 4079008, + "step": 19335 + }, + { + "epoch": 2.1276127612761275, + "grad_norm": 1.0183550119400024, + "learning_rate": 4.9993810067243184e-05, + "loss": 0.1634, + "num_input_tokens_seen": 4080096, + "step": 19340 + }, + { + "epoch": 2.128162816281628, + "grad_norm": 0.145905002951622, + "learning_rate": 4.999375654673907e-05, + "loss": 0.0221, + "num_input_tokens_seen": 4081184, + "step": 19345 + }, + { + "epoch": 2.128712871287129, + "grad_norm": 0.21444223821163177, + "learning_rate": 4.999370279587949e-05, + "loss": 0.0879, + "num_input_tokens_seen": 4082240, + "step": 19350 + }, + { + "epoch": 2.129262926292629, + "grad_norm": 0.03926301747560501, + "learning_rate": 4.999364881466495e-05, + "loss": 0.0239, + "num_input_tokens_seen": 4083328, + "step": 19355 + }, + { + "epoch": 2.1298129812981297, + "grad_norm": 0.371985524892807, + "learning_rate": 4.999359460309594e-05, + "loss": 0.1226, + "num_input_tokens_seen": 4084320, + "step": 19360 + }, + { + "epoch": 2.1303630363036303, + "grad_norm": 0.70418781042099, + "learning_rate": 4.999354016117296e-05, + "loss": 0.1353, + "num_input_tokens_seen": 4085312, + "step": 19365 + }, + { + "epoch": 2.130913091309131, + "grad_norm": 0.7393544316291809, + "learning_rate": 4.9993485488896515e-05, + "loss": 0.0273, + "num_input_tokens_seen": 4086464, + "step": 19370 + }, + { + "epoch": 2.1314631463146316, + "grad_norm": 0.7621216177940369, + "learning_rate": 4.99934305862671e-05, + "loss": 0.1085, + "num_input_tokens_seen": 4087584, + "step": 19375 + }, + { + "epoch": 2.132013201320132, + "grad_norm": 1.4069806337356567, + "learning_rate": 4.999337545328524e-05, + "loss": 0.0638, + "num_input_tokens_seen": 4088576, + "step": 19380 + }, + { + "epoch": 2.1325632563256325, + "grad_norm": 1.1091933250427246, + "learning_rate": 4.9993320089951425e-05, + "loss": 0.1746, + "num_input_tokens_seen": 4089632, + "step": 19385 + }, + { + "epoch": 2.133113311331133, + "grad_norm": 1.363852858543396, + "learning_rate": 4.999326449626617e-05, + "loss": 0.0959, + "num_input_tokens_seen": 4090688, + "step": 19390 + }, + { + "epoch": 2.133663366336634, + "grad_norm": 0.030590802431106567, + "learning_rate": 4.9993208672229995e-05, + "loss": 0.0471, + "num_input_tokens_seen": 4091808, + "step": 19395 + }, + { + "epoch": 2.1342134213421344, + "grad_norm": 1.1758383512496948, + "learning_rate": 4.9993152617843406e-05, + "loss": 0.0683, + "num_input_tokens_seen": 4092832, + "step": 19400 + }, + { + "epoch": 2.1347634763476346, + "grad_norm": 0.1852046400308609, + "learning_rate": 4.999309633310692e-05, + "loss": 0.0402, + "num_input_tokens_seen": 4093920, + "step": 19405 + }, + { + "epoch": 2.1353135313531353, + "grad_norm": 0.1066025123000145, + "learning_rate": 4.999303981802106e-05, + "loss": 0.035, + "num_input_tokens_seen": 4094976, + "step": 19410 + }, + { + "epoch": 2.135863586358636, + "grad_norm": 0.16659389436244965, + "learning_rate": 4.999298307258635e-05, + "loss": 0.078, + "num_input_tokens_seen": 4096064, + "step": 19415 + }, + { + "epoch": 2.1364136413641366, + "grad_norm": 0.6271481513977051, + "learning_rate": 4.9992926096803294e-05, + "loss": 0.0136, + "num_input_tokens_seen": 4097056, + "step": 19420 + }, + { + "epoch": 2.136963696369637, + "grad_norm": 0.2673248052597046, + "learning_rate": 4.9992868890672437e-05, + "loss": 0.0632, + "num_input_tokens_seen": 4098144, + "step": 19425 + }, + { + "epoch": 2.1375137513751374, + "grad_norm": 0.49212077260017395, + "learning_rate": 4.999281145419431e-05, + "loss": 0.0898, + "num_input_tokens_seen": 4099200, + "step": 19430 + }, + { + "epoch": 2.138063806380638, + "grad_norm": 7.777881622314453, + "learning_rate": 4.999275378736942e-05, + "loss": 0.1143, + "num_input_tokens_seen": 4100288, + "step": 19435 + }, + { + "epoch": 2.1386138613861387, + "grad_norm": 1.6455756425857544, + "learning_rate": 4.999269589019832e-05, + "loss": 0.1199, + "num_input_tokens_seen": 4101376, + "step": 19440 + }, + { + "epoch": 2.139163916391639, + "grad_norm": 0.23326259851455688, + "learning_rate": 4.9992637762681525e-05, + "loss": 0.0127, + "num_input_tokens_seen": 4102432, + "step": 19445 + }, + { + "epoch": 2.1397139713971396, + "grad_norm": 0.24678678810596466, + "learning_rate": 4.999257940481959e-05, + "loss": 0.1006, + "num_input_tokens_seen": 4103488, + "step": 19450 + }, + { + "epoch": 2.1402640264026402, + "grad_norm": 0.05899013578891754, + "learning_rate": 4.999252081661304e-05, + "loss": 0.0164, + "num_input_tokens_seen": 4104544, + "step": 19455 + }, + { + "epoch": 2.140814081408141, + "grad_norm": 0.17248718440532684, + "learning_rate": 4.999246199806242e-05, + "loss": 0.1318, + "num_input_tokens_seen": 4105600, + "step": 19460 + }, + { + "epoch": 2.1413641364136415, + "grad_norm": 0.04818751662969589, + "learning_rate": 4.9992402949168266e-05, + "loss": 0.0556, + "num_input_tokens_seen": 4106688, + "step": 19465 + }, + { + "epoch": 2.1419141914191417, + "grad_norm": 0.4668673872947693, + "learning_rate": 4.999234366993113e-05, + "loss": 0.0372, + "num_input_tokens_seen": 4107776, + "step": 19470 + }, + { + "epoch": 2.1424642464246424, + "grad_norm": 0.29146233201026917, + "learning_rate": 4.999228416035155e-05, + "loss": 0.0575, + "num_input_tokens_seen": 4108896, + "step": 19475 + }, + { + "epoch": 2.143014301430143, + "grad_norm": 0.037673164159059525, + "learning_rate": 4.999222442043009e-05, + "loss": 0.0306, + "num_input_tokens_seen": 4109952, + "step": 19480 + }, + { + "epoch": 2.1435643564356437, + "grad_norm": 0.06599839776754379, + "learning_rate": 4.999216445016728e-05, + "loss": 0.1508, + "num_input_tokens_seen": 4110976, + "step": 19485 + }, + { + "epoch": 2.1441144114411443, + "grad_norm": 0.0490182600915432, + "learning_rate": 4.999210424956368e-05, + "loss": 0.0375, + "num_input_tokens_seen": 4112000, + "step": 19490 + }, + { + "epoch": 2.1446644664466445, + "grad_norm": 0.9317384362220764, + "learning_rate": 4.999204381861985e-05, + "loss": 0.0481, + "num_input_tokens_seen": 4113024, + "step": 19495 + }, + { + "epoch": 2.145214521452145, + "grad_norm": 0.39430108666419983, + "learning_rate": 4.9991983157336355e-05, + "loss": 0.0431, + "num_input_tokens_seen": 4114016, + "step": 19500 + }, + { + "epoch": 2.145764576457646, + "grad_norm": 0.5978780388832092, + "learning_rate": 4.999192226571373e-05, + "loss": 0.0773, + "num_input_tokens_seen": 4115040, + "step": 19505 + }, + { + "epoch": 2.1463146314631465, + "grad_norm": 0.7727700471878052, + "learning_rate": 4.999186114375256e-05, + "loss": 0.0913, + "num_input_tokens_seen": 4116096, + "step": 19510 + }, + { + "epoch": 2.1468646864686467, + "grad_norm": 0.3316843807697296, + "learning_rate": 4.999179979145339e-05, + "loss": 0.0982, + "num_input_tokens_seen": 4117152, + "step": 19515 + }, + { + "epoch": 2.1474147414741473, + "grad_norm": 0.1548011600971222, + "learning_rate": 4.99917382088168e-05, + "loss": 0.0593, + "num_input_tokens_seen": 4118208, + "step": 19520 + }, + { + "epoch": 2.147964796479648, + "grad_norm": 0.01378495804965496, + "learning_rate": 4.9991676395843344e-05, + "loss": 0.0308, + "num_input_tokens_seen": 4119232, + "step": 19525 + }, + { + "epoch": 2.1485148514851486, + "grad_norm": 0.02963482402265072, + "learning_rate": 4.999161435253361e-05, + "loss": 0.0736, + "num_input_tokens_seen": 4120288, + "step": 19530 + }, + { + "epoch": 2.149064906490649, + "grad_norm": 0.2952885031700134, + "learning_rate": 4.999155207888815e-05, + "loss": 0.0383, + "num_input_tokens_seen": 4121248, + "step": 19535 + }, + { + "epoch": 2.1496149614961495, + "grad_norm": 0.5126684904098511, + "learning_rate": 4.999148957490755e-05, + "loss": 0.0767, + "num_input_tokens_seen": 4122240, + "step": 19540 + }, + { + "epoch": 2.15016501650165, + "grad_norm": 0.5751444101333618, + "learning_rate": 4.999142684059238e-05, + "loss": 0.1043, + "num_input_tokens_seen": 4123264, + "step": 19545 + }, + { + "epoch": 2.150715071507151, + "grad_norm": 0.4678466320037842, + "learning_rate": 4.999136387594322e-05, + "loss": 0.0938, + "num_input_tokens_seen": 4124320, + "step": 19550 + }, + { + "epoch": 2.1512651265126514, + "grad_norm": 1.6684699058532715, + "learning_rate": 4.999130068096065e-05, + "loss": 0.2002, + "num_input_tokens_seen": 4125344, + "step": 19555 + }, + { + "epoch": 2.1518151815181517, + "grad_norm": 0.7368537187576294, + "learning_rate": 4.999123725564526e-05, + "loss": 0.1017, + "num_input_tokens_seen": 4126432, + "step": 19560 + }, + { + "epoch": 2.1523652365236523, + "grad_norm": 0.1045013815164566, + "learning_rate": 4.999117359999763e-05, + "loss": 0.1346, + "num_input_tokens_seen": 4127488, + "step": 19565 + }, + { + "epoch": 2.152915291529153, + "grad_norm": 0.2572714388370514, + "learning_rate": 4.999110971401834e-05, + "loss": 0.1501, + "num_input_tokens_seen": 4128576, + "step": 19570 + }, + { + "epoch": 2.1534653465346536, + "grad_norm": 0.3402513563632965, + "learning_rate": 4.999104559770797e-05, + "loss": 0.1225, + "num_input_tokens_seen": 4129664, + "step": 19575 + }, + { + "epoch": 2.1540154015401543, + "grad_norm": 0.1308605968952179, + "learning_rate": 4.999098125106715e-05, + "loss": 0.0464, + "num_input_tokens_seen": 4130688, + "step": 19580 + }, + { + "epoch": 2.1545654565456545, + "grad_norm": 1.278124451637268, + "learning_rate": 4.999091667409643e-05, + "loss": 0.1451, + "num_input_tokens_seen": 4131680, + "step": 19585 + }, + { + "epoch": 2.155115511551155, + "grad_norm": 0.9944697022438049, + "learning_rate": 4.999085186679643e-05, + "loss": 0.0835, + "num_input_tokens_seen": 4132736, + "step": 19590 + }, + { + "epoch": 2.1556655665566558, + "grad_norm": 0.4769853949546814, + "learning_rate": 4.999078682916774e-05, + "loss": 0.213, + "num_input_tokens_seen": 4133824, + "step": 19595 + }, + { + "epoch": 2.1562156215621564, + "grad_norm": 1.5960725545883179, + "learning_rate": 4.9990721561210956e-05, + "loss": 0.0601, + "num_input_tokens_seen": 4134880, + "step": 19600 + }, + { + "epoch": 2.1567656765676566, + "grad_norm": 0.18523992598056793, + "learning_rate": 4.999065606292669e-05, + "loss": 0.1424, + "num_input_tokens_seen": 4135904, + "step": 19605 + }, + { + "epoch": 2.1573157315731573, + "grad_norm": 0.43155041337013245, + "learning_rate": 4.999059033431554e-05, + "loss": 0.0542, + "num_input_tokens_seen": 4136960, + "step": 19610 + }, + { + "epoch": 2.157865786578658, + "grad_norm": 2.3053476810455322, + "learning_rate": 4.9990524375378095e-05, + "loss": 0.1746, + "num_input_tokens_seen": 4137952, + "step": 19615 + }, + { + "epoch": 2.1584158415841586, + "grad_norm": 0.4446592926979065, + "learning_rate": 4.9990458186114985e-05, + "loss": 0.0304, + "num_input_tokens_seen": 4139040, + "step": 19620 + }, + { + "epoch": 2.1589658965896588, + "grad_norm": 0.2511165738105774, + "learning_rate": 4.999039176652682e-05, + "loss": 0.0923, + "num_input_tokens_seen": 4140064, + "step": 19625 + }, + { + "epoch": 2.1595159515951594, + "grad_norm": 0.8790311217308044, + "learning_rate": 4.99903251166142e-05, + "loss": 0.1281, + "num_input_tokens_seen": 4141120, + "step": 19630 + }, + { + "epoch": 2.16006600660066, + "grad_norm": 0.2710375189781189, + "learning_rate": 4.999025823637775e-05, + "loss": 0.1189, + "num_input_tokens_seen": 4142176, + "step": 19635 + }, + { + "epoch": 2.1606160616061607, + "grad_norm": 0.4892630875110626, + "learning_rate": 4.9990191125818074e-05, + "loss": 0.0989, + "num_input_tokens_seen": 4143200, + "step": 19640 + }, + { + "epoch": 2.1611661166116614, + "grad_norm": 0.16644728183746338, + "learning_rate": 4.999012378493581e-05, + "loss": 0.0252, + "num_input_tokens_seen": 4144224, + "step": 19645 + }, + { + "epoch": 2.1617161716171616, + "grad_norm": 1.63515043258667, + "learning_rate": 4.999005621373155e-05, + "loss": 0.0579, + "num_input_tokens_seen": 4145216, + "step": 19650 + }, + { + "epoch": 2.162266226622662, + "grad_norm": 0.20662420988082886, + "learning_rate": 4.998998841220595e-05, + "loss": 0.0166, + "num_input_tokens_seen": 4146304, + "step": 19655 + }, + { + "epoch": 2.162816281628163, + "grad_norm": 2.1190428733825684, + "learning_rate": 4.9989920380359605e-05, + "loss": 0.097, + "num_input_tokens_seen": 4147392, + "step": 19660 + }, + { + "epoch": 2.1633663366336635, + "grad_norm": 0.35784250497817993, + "learning_rate": 4.998985211819316e-05, + "loss": 0.0733, + "num_input_tokens_seen": 4148480, + "step": 19665 + }, + { + "epoch": 2.1639163916391637, + "grad_norm": 0.8275038599967957, + "learning_rate": 4.9989783625707243e-05, + "loss": 0.0164, + "num_input_tokens_seen": 4149472, + "step": 19670 + }, + { + "epoch": 2.1644664466446644, + "grad_norm": 0.6755926012992859, + "learning_rate": 4.998971490290247e-05, + "loss": 0.0652, + "num_input_tokens_seen": 4150464, + "step": 19675 + }, + { + "epoch": 2.165016501650165, + "grad_norm": 0.37938806414604187, + "learning_rate": 4.9989645949779504e-05, + "loss": 0.0854, + "num_input_tokens_seen": 4151552, + "step": 19680 + }, + { + "epoch": 2.1655665566556657, + "grad_norm": 0.2943653464317322, + "learning_rate": 4.998957676633895e-05, + "loss": 0.0257, + "num_input_tokens_seen": 4152576, + "step": 19685 + }, + { + "epoch": 2.1661166116611663, + "grad_norm": 0.43024328351020813, + "learning_rate": 4.998950735258146e-05, + "loss": 0.1007, + "num_input_tokens_seen": 4153664, + "step": 19690 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.4559623599052429, + "learning_rate": 4.9989437708507675e-05, + "loss": 0.0429, + "num_input_tokens_seen": 4154688, + "step": 19695 + }, + { + "epoch": 2.167216721672167, + "grad_norm": 0.03780746832489967, + "learning_rate": 4.9989367834118225e-05, + "loss": 0.0925, + "num_input_tokens_seen": 4155712, + "step": 19700 + }, + { + "epoch": 2.167766776677668, + "grad_norm": 0.9404434561729431, + "learning_rate": 4.9989297729413774e-05, + "loss": 0.1361, + "num_input_tokens_seen": 4156832, + "step": 19705 + }, + { + "epoch": 2.1683168316831685, + "grad_norm": 0.16893164813518524, + "learning_rate": 4.998922739439495e-05, + "loss": 0.1149, + "num_input_tokens_seen": 4157824, + "step": 19710 + }, + { + "epoch": 2.1688668866886687, + "grad_norm": 0.7680317163467407, + "learning_rate": 4.998915682906241e-05, + "loss": 0.0425, + "num_input_tokens_seen": 4158944, + "step": 19715 + }, + { + "epoch": 2.1694169416941693, + "grad_norm": 0.23442000150680542, + "learning_rate": 4.9989086033416806e-05, + "loss": 0.0263, + "num_input_tokens_seen": 4160032, + "step": 19720 + }, + { + "epoch": 2.16996699669967, + "grad_norm": 0.2661811113357544, + "learning_rate": 4.998901500745878e-05, + "loss": 0.0534, + "num_input_tokens_seen": 4161088, + "step": 19725 + }, + { + "epoch": 2.1705170517051706, + "grad_norm": 0.07709628343582153, + "learning_rate": 4.9988943751189e-05, + "loss": 0.1439, + "num_input_tokens_seen": 4162080, + "step": 19730 + }, + { + "epoch": 2.1710671067106713, + "grad_norm": 1.2975434064865112, + "learning_rate": 4.9988872264608114e-05, + "loss": 0.082, + "num_input_tokens_seen": 4163168, + "step": 19735 + }, + { + "epoch": 2.1716171617161715, + "grad_norm": 0.3715416193008423, + "learning_rate": 4.9988800547716784e-05, + "loss": 0.068, + "num_input_tokens_seen": 4164224, + "step": 19740 + }, + { + "epoch": 2.172167216721672, + "grad_norm": 0.20032627880573273, + "learning_rate": 4.9988728600515664e-05, + "loss": 0.0187, + "num_input_tokens_seen": 4165312, + "step": 19745 + }, + { + "epoch": 2.1727172717271728, + "grad_norm": 1.347179651260376, + "learning_rate": 4.998865642300543e-05, + "loss": 0.059, + "num_input_tokens_seen": 4166304, + "step": 19750 + }, + { + "epoch": 2.1732673267326734, + "grad_norm": 0.0652572512626648, + "learning_rate": 4.998858401518674e-05, + "loss": 0.1303, + "num_input_tokens_seen": 4167296, + "step": 19755 + }, + { + "epoch": 2.1738173817381736, + "grad_norm": 0.5650090575218201, + "learning_rate": 4.998851137706026e-05, + "loss": 0.1291, + "num_input_tokens_seen": 4168288, + "step": 19760 + }, + { + "epoch": 2.1743674367436743, + "grad_norm": 0.044137269258499146, + "learning_rate": 4.9988438508626664e-05, + "loss": 0.0374, + "num_input_tokens_seen": 4169312, + "step": 19765 + }, + { + "epoch": 2.174917491749175, + "grad_norm": 0.807894766330719, + "learning_rate": 4.998836540988662e-05, + "loss": 0.125, + "num_input_tokens_seen": 4170400, + "step": 19770 + }, + { + "epoch": 2.1754675467546756, + "grad_norm": 0.5339779257774353, + "learning_rate": 4.998829208084079e-05, + "loss": 0.1088, + "num_input_tokens_seen": 4171488, + "step": 19775 + }, + { + "epoch": 2.1760176017601762, + "grad_norm": 0.1961001604795456, + "learning_rate": 4.998821852148988e-05, + "loss": 0.0722, + "num_input_tokens_seen": 4172608, + "step": 19780 + }, + { + "epoch": 2.1765676567656764, + "grad_norm": 0.22199739515781403, + "learning_rate": 4.998814473183454e-05, + "loss": 0.0337, + "num_input_tokens_seen": 4173664, + "step": 19785 + }, + { + "epoch": 2.177117711771177, + "grad_norm": 0.8816038370132446, + "learning_rate": 4.9988070711875465e-05, + "loss": 0.118, + "num_input_tokens_seen": 4174720, + "step": 19790 + }, + { + "epoch": 2.1776677667766777, + "grad_norm": 0.11976588517427444, + "learning_rate": 4.998799646161334e-05, + "loss": 0.041, + "num_input_tokens_seen": 4175712, + "step": 19795 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.39334598183631897, + "learning_rate": 4.998792198104883e-05, + "loss": 0.0828, + "num_input_tokens_seen": 4176832, + "step": 19800 + }, + { + "epoch": 2.1787678767876786, + "grad_norm": 0.04971352219581604, + "learning_rate": 4.998784727018264e-05, + "loss": 0.0346, + "num_input_tokens_seen": 4177952, + "step": 19805 + }, + { + "epoch": 2.1793179317931792, + "grad_norm": 1.1015535593032837, + "learning_rate": 4.9987772329015447e-05, + "loss": 0.1562, + "num_input_tokens_seen": 4179008, + "step": 19810 + }, + { + "epoch": 2.17986798679868, + "grad_norm": 1.5909065008163452, + "learning_rate": 4.998769715754795e-05, + "loss": 0.1394, + "num_input_tokens_seen": 4180096, + "step": 19815 + }, + { + "epoch": 2.1804180418041805, + "grad_norm": 0.15532425045967102, + "learning_rate": 4.9987621755780834e-05, + "loss": 0.2106, + "num_input_tokens_seen": 4181120, + "step": 19820 + }, + { + "epoch": 2.180968096809681, + "grad_norm": 0.3562787175178528, + "learning_rate": 4.998754612371481e-05, + "loss": 0.0973, + "num_input_tokens_seen": 4182176, + "step": 19825 + }, + { + "epoch": 2.1815181518151814, + "grad_norm": 0.17276141047477722, + "learning_rate": 4.998747026135055e-05, + "loss": 0.1207, + "num_input_tokens_seen": 4183296, + "step": 19830 + }, + { + "epoch": 2.182068206820682, + "grad_norm": 0.13383744657039642, + "learning_rate": 4.998739416868877e-05, + "loss": 0.1354, + "num_input_tokens_seen": 4184320, + "step": 19835 + }, + { + "epoch": 2.1826182618261827, + "grad_norm": 0.9596092104911804, + "learning_rate": 4.998731784573018e-05, + "loss": 0.1074, + "num_input_tokens_seen": 4185280, + "step": 19840 + }, + { + "epoch": 2.1831683168316833, + "grad_norm": 0.3223148286342621, + "learning_rate": 4.9987241292475456e-05, + "loss": 0.1604, + "num_input_tokens_seen": 4186368, + "step": 19845 + }, + { + "epoch": 2.1837183718371835, + "grad_norm": 1.0011357069015503, + "learning_rate": 4.998716450892533e-05, + "loss": 0.0632, + "num_input_tokens_seen": 4187424, + "step": 19850 + }, + { + "epoch": 2.184268426842684, + "grad_norm": 0.09611650556325912, + "learning_rate": 4.9987087495080484e-05, + "loss": 0.0251, + "num_input_tokens_seen": 4188480, + "step": 19855 + }, + { + "epoch": 2.184818481848185, + "grad_norm": 0.6614958643913269, + "learning_rate": 4.998701025094166e-05, + "loss": 0.0594, + "num_input_tokens_seen": 4189536, + "step": 19860 + }, + { + "epoch": 2.1853685368536855, + "grad_norm": 0.21319575607776642, + "learning_rate": 4.998693277650954e-05, + "loss": 0.1316, + "num_input_tokens_seen": 4190592, + "step": 19865 + }, + { + "epoch": 2.1859185918591857, + "grad_norm": 0.28853729367256165, + "learning_rate": 4.998685507178486e-05, + "loss": 0.066, + "num_input_tokens_seen": 4191648, + "step": 19870 + }, + { + "epoch": 2.1864686468646863, + "grad_norm": 0.4116728901863098, + "learning_rate": 4.998677713676833e-05, + "loss": 0.0473, + "num_input_tokens_seen": 4192672, + "step": 19875 + }, + { + "epoch": 2.187018701870187, + "grad_norm": 0.44766125082969666, + "learning_rate": 4.998669897146065e-05, + "loss": 0.0547, + "num_input_tokens_seen": 4193664, + "step": 19880 + }, + { + "epoch": 2.1875687568756876, + "grad_norm": 0.062206581234931946, + "learning_rate": 4.998662057586256e-05, + "loss": 0.0774, + "num_input_tokens_seen": 4194688, + "step": 19885 + }, + { + "epoch": 2.1881188118811883, + "grad_norm": 0.8962936401367188, + "learning_rate": 4.9986541949974784e-05, + "loss": 0.1189, + "num_input_tokens_seen": 4195776, + "step": 19890 + }, + { + "epoch": 2.1886688668866885, + "grad_norm": 1.3338268995285034, + "learning_rate": 4.9986463093798035e-05, + "loss": 0.1007, + "num_input_tokens_seen": 4196832, + "step": 19895 + }, + { + "epoch": 2.189218921892189, + "grad_norm": 0.9439351558685303, + "learning_rate": 4.9986384007333045e-05, + "loss": 0.1023, + "num_input_tokens_seen": 4197984, + "step": 19900 + }, + { + "epoch": 2.18976897689769, + "grad_norm": 0.058400049805641174, + "learning_rate": 4.998630469058055e-05, + "loss": 0.0511, + "num_input_tokens_seen": 4199040, + "step": 19905 + }, + { + "epoch": 2.1903190319031904, + "grad_norm": 1.30254328250885, + "learning_rate": 4.998622514354127e-05, + "loss": 0.1976, + "num_input_tokens_seen": 4200128, + "step": 19910 + }, + { + "epoch": 2.190869086908691, + "grad_norm": 0.03476513922214508, + "learning_rate": 4.9986145366215944e-05, + "loss": 0.0342, + "num_input_tokens_seen": 4201184, + "step": 19915 + }, + { + "epoch": 2.1914191419141913, + "grad_norm": 0.06647955626249313, + "learning_rate": 4.99860653586053e-05, + "loss": 0.0875, + "num_input_tokens_seen": 4202240, + "step": 19920 + }, + { + "epoch": 2.191969196919692, + "grad_norm": 1.683491587638855, + "learning_rate": 4.998598512071009e-05, + "loss": 0.0662, + "num_input_tokens_seen": 4203296, + "step": 19925 + }, + { + "epoch": 2.1925192519251926, + "grad_norm": 0.6533707976341248, + "learning_rate": 4.998590465253104e-05, + "loss": 0.1133, + "num_input_tokens_seen": 4204352, + "step": 19930 + }, + { + "epoch": 2.1930693069306932, + "grad_norm": 0.5741201639175415, + "learning_rate": 4.998582395406889e-05, + "loss": 0.1105, + "num_input_tokens_seen": 4205472, + "step": 19935 + }, + { + "epoch": 2.1936193619361934, + "grad_norm": 0.19520071148872375, + "learning_rate": 4.99857430253244e-05, + "loss": 0.0419, + "num_input_tokens_seen": 4206528, + "step": 19940 + }, + { + "epoch": 2.194169416941694, + "grad_norm": 0.21790854632854462, + "learning_rate": 4.9985661866298296e-05, + "loss": 0.1226, + "num_input_tokens_seen": 4207584, + "step": 19945 + }, + { + "epoch": 2.1947194719471947, + "grad_norm": 0.19802290201187134, + "learning_rate": 4.998558047699135e-05, + "loss": 0.0224, + "num_input_tokens_seen": 4208640, + "step": 19950 + }, + { + "epoch": 2.1952695269526954, + "grad_norm": 0.19506323337554932, + "learning_rate": 4.998549885740428e-05, + "loss": 0.0316, + "num_input_tokens_seen": 4209632, + "step": 19955 + }, + { + "epoch": 2.1958195819581956, + "grad_norm": 0.7366469502449036, + "learning_rate": 4.998541700753787e-05, + "loss": 0.0527, + "num_input_tokens_seen": 4210624, + "step": 19960 + }, + { + "epoch": 2.1963696369636962, + "grad_norm": 0.18442226946353912, + "learning_rate": 4.998533492739286e-05, + "loss": 0.0458, + "num_input_tokens_seen": 4211680, + "step": 19965 + }, + { + "epoch": 2.196919691969197, + "grad_norm": 1.6943013668060303, + "learning_rate": 4.998525261697e-05, + "loss": 0.0675, + "num_input_tokens_seen": 4212800, + "step": 19970 + }, + { + "epoch": 2.1974697469746975, + "grad_norm": 0.8539045453071594, + "learning_rate": 4.998517007627006e-05, + "loss": 0.1697, + "num_input_tokens_seen": 4213824, + "step": 19975 + }, + { + "epoch": 2.198019801980198, + "grad_norm": 1.3528308868408203, + "learning_rate": 4.9985087305293796e-05, + "loss": 0.0926, + "num_input_tokens_seen": 4214912, + "step": 19980 + }, + { + "epoch": 2.1985698569856984, + "grad_norm": 0.10721912980079651, + "learning_rate": 4.998500430404197e-05, + "loss": 0.1352, + "num_input_tokens_seen": 4216032, + "step": 19985 + }, + { + "epoch": 2.199119911991199, + "grad_norm": 0.04800816997885704, + "learning_rate": 4.9984921072515345e-05, + "loss": 0.1154, + "num_input_tokens_seen": 4217056, + "step": 19990 + }, + { + "epoch": 2.1996699669966997, + "grad_norm": 0.21514490246772766, + "learning_rate": 4.9984837610714696e-05, + "loss": 0.0939, + "num_input_tokens_seen": 4218080, + "step": 19995 + }, + { + "epoch": 2.2002200220022003, + "grad_norm": 0.08941710740327835, + "learning_rate": 4.9984753918640795e-05, + "loss": 0.0687, + "num_input_tokens_seen": 4219104, + "step": 20000 + }, + { + "epoch": 2.200770077007701, + "grad_norm": 0.03594394400715828, + "learning_rate": 4.9984669996294394e-05, + "loss": 0.0675, + "num_input_tokens_seen": 4220224, + "step": 20005 + }, + { + "epoch": 2.201320132013201, + "grad_norm": 0.7450004816055298, + "learning_rate": 4.998458584367628e-05, + "loss": 0.0797, + "num_input_tokens_seen": 4221312, + "step": 20010 + }, + { + "epoch": 2.201870187018702, + "grad_norm": 0.16793447732925415, + "learning_rate": 4.998450146078723e-05, + "loss": 0.053, + "num_input_tokens_seen": 4222368, + "step": 20015 + }, + { + "epoch": 2.2024202420242025, + "grad_norm": 0.6749218702316284, + "learning_rate": 4.9984416847628024e-05, + "loss": 0.1045, + "num_input_tokens_seen": 4223392, + "step": 20020 + }, + { + "epoch": 2.202970297029703, + "grad_norm": 0.37907692790031433, + "learning_rate": 4.998433200419943e-05, + "loss": 0.0524, + "num_input_tokens_seen": 4224416, + "step": 20025 + }, + { + "epoch": 2.2035203520352034, + "grad_norm": 0.017723562195897102, + "learning_rate": 4.998424693050223e-05, + "loss": 0.0303, + "num_input_tokens_seen": 4225472, + "step": 20030 + }, + { + "epoch": 2.204070407040704, + "grad_norm": 0.37244564294815063, + "learning_rate": 4.998416162653723e-05, + "loss": 0.0397, + "num_input_tokens_seen": 4226560, + "step": 20035 + }, + { + "epoch": 2.2046204620462047, + "grad_norm": 0.6280855536460876, + "learning_rate": 4.998407609230519e-05, + "loss": 0.084, + "num_input_tokens_seen": 4227616, + "step": 20040 + }, + { + "epoch": 2.2051705170517053, + "grad_norm": 0.42503032088279724, + "learning_rate": 4.9983990327806915e-05, + "loss": 0.0877, + "num_input_tokens_seen": 4228640, + "step": 20045 + }, + { + "epoch": 2.2057205720572055, + "grad_norm": 0.19965997338294983, + "learning_rate": 4.998390433304318e-05, + "loss": 0.0749, + "num_input_tokens_seen": 4229696, + "step": 20050 + }, + { + "epoch": 2.206270627062706, + "grad_norm": 0.5760402083396912, + "learning_rate": 4.9983818108014804e-05, + "loss": 0.1016, + "num_input_tokens_seen": 4230720, + "step": 20055 + }, + { + "epoch": 2.206820682068207, + "grad_norm": 1.124372124671936, + "learning_rate": 4.998373165272255e-05, + "loss": 0.0877, + "num_input_tokens_seen": 4231744, + "step": 20060 + }, + { + "epoch": 2.2073707370737075, + "grad_norm": 1.0691237449645996, + "learning_rate": 4.998364496716724e-05, + "loss": 0.0797, + "num_input_tokens_seen": 4232832, + "step": 20065 + }, + { + "epoch": 2.207920792079208, + "grad_norm": 1.2888582944869995, + "learning_rate": 4.998355805134966e-05, + "loss": 0.0635, + "num_input_tokens_seen": 4233856, + "step": 20070 + }, + { + "epoch": 2.2084708470847083, + "grad_norm": 0.37354326248168945, + "learning_rate": 4.99834709052706e-05, + "loss": 0.2158, + "num_input_tokens_seen": 4234880, + "step": 20075 + }, + { + "epoch": 2.209020902090209, + "grad_norm": 0.5547553896903992, + "learning_rate": 4.9983383528930896e-05, + "loss": 0.0911, + "num_input_tokens_seen": 4235968, + "step": 20080 + }, + { + "epoch": 2.2095709570957096, + "grad_norm": 0.5666561722755432, + "learning_rate": 4.9983295922331316e-05, + "loss": 0.0434, + "num_input_tokens_seen": 4237024, + "step": 20085 + }, + { + "epoch": 2.2101210121012103, + "grad_norm": 0.024404767900705338, + "learning_rate": 4.99832080854727e-05, + "loss": 0.1377, + "num_input_tokens_seen": 4238048, + "step": 20090 + }, + { + "epoch": 2.2106710671067105, + "grad_norm": 0.049807798117399216, + "learning_rate": 4.998312001835584e-05, + "loss": 0.0082, + "num_input_tokens_seen": 4239136, + "step": 20095 + }, + { + "epoch": 2.211221122112211, + "grad_norm": 0.4780164957046509, + "learning_rate": 4.998303172098155e-05, + "loss": 0.0644, + "num_input_tokens_seen": 4240128, + "step": 20100 + }, + { + "epoch": 2.2117711771177118, + "grad_norm": 0.09799355268478394, + "learning_rate": 4.998294319335065e-05, + "loss": 0.1195, + "num_input_tokens_seen": 4241184, + "step": 20105 + }, + { + "epoch": 2.2123212321232124, + "grad_norm": 0.3190375566482544, + "learning_rate": 4.998285443546394e-05, + "loss": 0.0363, + "num_input_tokens_seen": 4242272, + "step": 20110 + }, + { + "epoch": 2.212871287128713, + "grad_norm": 0.12719818949699402, + "learning_rate": 4.9982765447322256e-05, + "loss": 0.0683, + "num_input_tokens_seen": 4243328, + "step": 20115 + }, + { + "epoch": 2.2134213421342133, + "grad_norm": 0.6121553182601929, + "learning_rate": 4.998267622892641e-05, + "loss": 0.0444, + "num_input_tokens_seen": 4244384, + "step": 20120 + }, + { + "epoch": 2.213971397139714, + "grad_norm": 1.021943211555481, + "learning_rate": 4.998258678027722e-05, + "loss": 0.112, + "num_input_tokens_seen": 4245440, + "step": 20125 + }, + { + "epoch": 2.2145214521452146, + "grad_norm": 0.06557299941778183, + "learning_rate": 4.998249710137552e-05, + "loss": 0.0269, + "num_input_tokens_seen": 4246464, + "step": 20130 + }, + { + "epoch": 2.215071507150715, + "grad_norm": 0.22845464944839478, + "learning_rate": 4.998240719222214e-05, + "loss": 0.0769, + "num_input_tokens_seen": 4247520, + "step": 20135 + }, + { + "epoch": 2.2156215621562154, + "grad_norm": 0.7528194785118103, + "learning_rate": 4.998231705281788e-05, + "loss": 0.176, + "num_input_tokens_seen": 4248576, + "step": 20140 + }, + { + "epoch": 2.216171617161716, + "grad_norm": 0.29797956347465515, + "learning_rate": 4.998222668316361e-05, + "loss": 0.0394, + "num_input_tokens_seen": 4249632, + "step": 20145 + }, + { + "epoch": 2.2167216721672167, + "grad_norm": 0.06336630135774612, + "learning_rate": 4.9982136083260136e-05, + "loss": 0.064, + "num_input_tokens_seen": 4250752, + "step": 20150 + }, + { + "epoch": 2.2172717271727174, + "grad_norm": 0.748859703540802, + "learning_rate": 4.99820452531083e-05, + "loss": 0.1191, + "num_input_tokens_seen": 4251776, + "step": 20155 + }, + { + "epoch": 2.217821782178218, + "grad_norm": 0.16492363810539246, + "learning_rate": 4.9981954192708945e-05, + "loss": 0.0506, + "num_input_tokens_seen": 4252832, + "step": 20160 + }, + { + "epoch": 2.218371837183718, + "grad_norm": 0.2890995740890503, + "learning_rate": 4.99818629020629e-05, + "loss": 0.0576, + "num_input_tokens_seen": 4253824, + "step": 20165 + }, + { + "epoch": 2.218921892189219, + "grad_norm": 1.2049752473831177, + "learning_rate": 4.998177138117102e-05, + "loss": 0.0506, + "num_input_tokens_seen": 4254848, + "step": 20170 + }, + { + "epoch": 2.2194719471947195, + "grad_norm": 1.1081349849700928, + "learning_rate": 4.998167963003414e-05, + "loss": 0.1389, + "num_input_tokens_seen": 4255904, + "step": 20175 + }, + { + "epoch": 2.22002200220022, + "grad_norm": 1.7215852737426758, + "learning_rate": 4.99815876486531e-05, + "loss": 0.0924, + "num_input_tokens_seen": 4256960, + "step": 20180 + }, + { + "epoch": 2.2205720572057204, + "grad_norm": 0.2424813210964203, + "learning_rate": 4.998149543702876e-05, + "loss": 0.158, + "num_input_tokens_seen": 4258112, + "step": 20185 + }, + { + "epoch": 2.221122112211221, + "grad_norm": 1.483593225479126, + "learning_rate": 4.9981402995161955e-05, + "loss": 0.055, + "num_input_tokens_seen": 4259200, + "step": 20190 + }, + { + "epoch": 2.2216721672167217, + "grad_norm": 0.024851791560649872, + "learning_rate": 4.998131032305355e-05, + "loss": 0.1137, + "num_input_tokens_seen": 4260320, + "step": 20195 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.5255231261253357, + "learning_rate": 4.9981217420704394e-05, + "loss": 0.0503, + "num_input_tokens_seen": 4261344, + "step": 20200 + }, + { + "epoch": 2.222772277227723, + "grad_norm": 0.9893512725830078, + "learning_rate": 4.998112428811536e-05, + "loss": 0.1625, + "num_input_tokens_seen": 4262400, + "step": 20205 + }, + { + "epoch": 2.223322332233223, + "grad_norm": 0.12845157086849213, + "learning_rate": 4.998103092528727e-05, + "loss": 0.0683, + "num_input_tokens_seen": 4263424, + "step": 20210 + }, + { + "epoch": 2.223872387238724, + "grad_norm": 0.4810144007205963, + "learning_rate": 4.998093733222101e-05, + "loss": 0.0828, + "num_input_tokens_seen": 4264480, + "step": 20215 + }, + { + "epoch": 2.2244224422442245, + "grad_norm": 0.07647977024316788, + "learning_rate": 4.9980843508917444e-05, + "loss": 0.0735, + "num_input_tokens_seen": 4265504, + "step": 20220 + }, + { + "epoch": 2.224972497249725, + "grad_norm": 0.022185102105140686, + "learning_rate": 4.998074945537742e-05, + "loss": 0.0328, + "num_input_tokens_seen": 4266528, + "step": 20225 + }, + { + "epoch": 2.2255225522552253, + "grad_norm": 0.5795392394065857, + "learning_rate": 4.9980655171601825e-05, + "loss": 0.1171, + "num_input_tokens_seen": 4267680, + "step": 20230 + }, + { + "epoch": 2.226072607260726, + "grad_norm": 0.3778894543647766, + "learning_rate": 4.998056065759151e-05, + "loss": 0.0487, + "num_input_tokens_seen": 4268736, + "step": 20235 + }, + { + "epoch": 2.2266226622662266, + "grad_norm": 0.8642698526382446, + "learning_rate": 4.9980465913347365e-05, + "loss": 0.1374, + "num_input_tokens_seen": 4269760, + "step": 20240 + }, + { + "epoch": 2.2271727172717273, + "grad_norm": 0.2957649528980255, + "learning_rate": 4.998037093887025e-05, + "loss": 0.1061, + "num_input_tokens_seen": 4270912, + "step": 20245 + }, + { + "epoch": 2.227722772277228, + "grad_norm": 0.032399531453847885, + "learning_rate": 4.998027573416103e-05, + "loss": 0.0937, + "num_input_tokens_seen": 4272096, + "step": 20250 + }, + { + "epoch": 2.228272827282728, + "grad_norm": 0.06837313622236252, + "learning_rate": 4.998018029922061e-05, + "loss": 0.0702, + "num_input_tokens_seen": 4273152, + "step": 20255 + }, + { + "epoch": 2.228822882288229, + "grad_norm": 0.1873634308576584, + "learning_rate": 4.998008463404985e-05, + "loss": 0.0672, + "num_input_tokens_seen": 4274176, + "step": 20260 + }, + { + "epoch": 2.2293729372937294, + "grad_norm": 0.07321600615978241, + "learning_rate": 4.997998873864963e-05, + "loss": 0.0872, + "num_input_tokens_seen": 4275264, + "step": 20265 + }, + { + "epoch": 2.22992299229923, + "grad_norm": 0.23390769958496094, + "learning_rate": 4.997989261302085e-05, + "loss": 0.1471, + "num_input_tokens_seen": 4276352, + "step": 20270 + }, + { + "epoch": 2.2304730473047303, + "grad_norm": 1.084803819656372, + "learning_rate": 4.997979625716438e-05, + "loss": 0.1787, + "num_input_tokens_seen": 4277408, + "step": 20275 + }, + { + "epoch": 2.231023102310231, + "grad_norm": 0.14278161525726318, + "learning_rate": 4.997969967108111e-05, + "loss": 0.1104, + "num_input_tokens_seen": 4278496, + "step": 20280 + }, + { + "epoch": 2.2315731573157316, + "grad_norm": 0.05538660287857056, + "learning_rate": 4.997960285477195e-05, + "loss": 0.0165, + "num_input_tokens_seen": 4279520, + "step": 20285 + }, + { + "epoch": 2.2321232123212322, + "grad_norm": 0.96668940782547, + "learning_rate": 4.997950580823777e-05, + "loss": 0.0477, + "num_input_tokens_seen": 4280576, + "step": 20290 + }, + { + "epoch": 2.232673267326733, + "grad_norm": 0.2759058177471161, + "learning_rate": 4.997940853147947e-05, + "loss": 0.136, + "num_input_tokens_seen": 4281664, + "step": 20295 + }, + { + "epoch": 2.233223322332233, + "grad_norm": 0.3629472553730011, + "learning_rate": 4.997931102449794e-05, + "loss": 0.0606, + "num_input_tokens_seen": 4282688, + "step": 20300 + }, + { + "epoch": 2.2337733773377337, + "grad_norm": 0.09124866873025894, + "learning_rate": 4.99792132872941e-05, + "loss": 0.0487, + "num_input_tokens_seen": 4283712, + "step": 20305 + }, + { + "epoch": 2.2343234323432344, + "grad_norm": 0.18082720041275024, + "learning_rate": 4.997911531986883e-05, + "loss": 0.0564, + "num_input_tokens_seen": 4284736, + "step": 20310 + }, + { + "epoch": 2.234873487348735, + "grad_norm": 0.47920045256614685, + "learning_rate": 4.9979017122223036e-05, + "loss": 0.0292, + "num_input_tokens_seen": 4285792, + "step": 20315 + }, + { + "epoch": 2.2354235423542352, + "grad_norm": 0.061085090041160583, + "learning_rate": 4.9978918694357636e-05, + "loss": 0.0504, + "num_input_tokens_seen": 4286848, + "step": 20320 + }, + { + "epoch": 2.235973597359736, + "grad_norm": 0.04519641026854515, + "learning_rate": 4.997882003627353e-05, + "loss": 0.0894, + "num_input_tokens_seen": 4287840, + "step": 20325 + }, + { + "epoch": 2.2365236523652365, + "grad_norm": 0.06554999947547913, + "learning_rate": 4.997872114797162e-05, + "loss": 0.0355, + "num_input_tokens_seen": 4288800, + "step": 20330 + }, + { + "epoch": 2.237073707370737, + "grad_norm": 0.5889159440994263, + "learning_rate": 4.997862202945282e-05, + "loss": 0.0655, + "num_input_tokens_seen": 4289888, + "step": 20335 + }, + { + "epoch": 2.237623762376238, + "grad_norm": 0.15097220242023468, + "learning_rate": 4.997852268071805e-05, + "loss": 0.0934, + "num_input_tokens_seen": 4290976, + "step": 20340 + }, + { + "epoch": 2.238173817381738, + "grad_norm": 0.059242554008960724, + "learning_rate": 4.997842310176822e-05, + "loss": 0.0444, + "num_input_tokens_seen": 4292032, + "step": 20345 + }, + { + "epoch": 2.2387238723872387, + "grad_norm": 0.32164227962493896, + "learning_rate": 4.997832329260426e-05, + "loss": 0.0663, + "num_input_tokens_seen": 4293088, + "step": 20350 + }, + { + "epoch": 2.2392739273927393, + "grad_norm": 1.024703025817871, + "learning_rate": 4.997822325322708e-05, + "loss": 0.0547, + "num_input_tokens_seen": 4294112, + "step": 20355 + }, + { + "epoch": 2.23982398239824, + "grad_norm": 0.510552704334259, + "learning_rate": 4.9978122983637585e-05, + "loss": 0.0517, + "num_input_tokens_seen": 4295200, + "step": 20360 + }, + { + "epoch": 2.24037403740374, + "grad_norm": 0.17540885508060455, + "learning_rate": 4.997802248383673e-05, + "loss": 0.0462, + "num_input_tokens_seen": 4296224, + "step": 20365 + }, + { + "epoch": 2.240924092409241, + "grad_norm": 1.2800297737121582, + "learning_rate": 4.997792175382542e-05, + "loss": 0.1005, + "num_input_tokens_seen": 4297280, + "step": 20370 + }, + { + "epoch": 2.2414741474147415, + "grad_norm": 0.05331045016646385, + "learning_rate": 4.99778207936046e-05, + "loss": 0.0534, + "num_input_tokens_seen": 4298336, + "step": 20375 + }, + { + "epoch": 2.242024202420242, + "grad_norm": 0.031156526878476143, + "learning_rate": 4.9977719603175184e-05, + "loss": 0.1389, + "num_input_tokens_seen": 4299456, + "step": 20380 + }, + { + "epoch": 2.2425742574257423, + "grad_norm": 1.4829070568084717, + "learning_rate": 4.997761818253811e-05, + "loss": 0.1131, + "num_input_tokens_seen": 4300480, + "step": 20385 + }, + { + "epoch": 2.243124312431243, + "grad_norm": 1.3083467483520508, + "learning_rate": 4.9977516531694315e-05, + "loss": 0.083, + "num_input_tokens_seen": 4301472, + "step": 20390 + }, + { + "epoch": 2.2436743674367436, + "grad_norm": 0.3829095959663391, + "learning_rate": 4.9977414650644745e-05, + "loss": 0.0876, + "num_input_tokens_seen": 4302496, + "step": 20395 + }, + { + "epoch": 2.2442244224422443, + "grad_norm": 0.3022838532924652, + "learning_rate": 4.997731253939032e-05, + "loss": 0.0667, + "num_input_tokens_seen": 4303584, + "step": 20400 + }, + { + "epoch": 2.244774477447745, + "grad_norm": 0.7048702239990234, + "learning_rate": 4.9977210197931997e-05, + "loss": 0.0391, + "num_input_tokens_seen": 4304704, + "step": 20405 + }, + { + "epoch": 2.245324532453245, + "grad_norm": 1.3722972869873047, + "learning_rate": 4.997710762627071e-05, + "loss": 0.1096, + "num_input_tokens_seen": 4305728, + "step": 20410 + }, + { + "epoch": 2.245874587458746, + "grad_norm": 0.19306063652038574, + "learning_rate": 4.99770048244074e-05, + "loss": 0.0267, + "num_input_tokens_seen": 4306752, + "step": 20415 + }, + { + "epoch": 2.2464246424642464, + "grad_norm": 1.7933837175369263, + "learning_rate": 4.9976901792343034e-05, + "loss": 0.1567, + "num_input_tokens_seen": 4307776, + "step": 20420 + }, + { + "epoch": 2.246974697469747, + "grad_norm": 0.04122290387749672, + "learning_rate": 4.997679853007854e-05, + "loss": 0.0592, + "num_input_tokens_seen": 4308832, + "step": 20425 + }, + { + "epoch": 2.2475247524752477, + "grad_norm": 0.07836956530809402, + "learning_rate": 4.997669503761488e-05, + "loss": 0.1172, + "num_input_tokens_seen": 4309888, + "step": 20430 + }, + { + "epoch": 2.248074807480748, + "grad_norm": 0.9463549256324768, + "learning_rate": 4.9976591314953014e-05, + "loss": 0.0653, + "num_input_tokens_seen": 4310944, + "step": 20435 + }, + { + "epoch": 2.2486248624862486, + "grad_norm": 1.1625231504440308, + "learning_rate": 4.997648736209389e-05, + "loss": 0.0737, + "num_input_tokens_seen": 4312032, + "step": 20440 + }, + { + "epoch": 2.2491749174917492, + "grad_norm": 0.18934909999370575, + "learning_rate": 4.9976383179038464e-05, + "loss": 0.1709, + "num_input_tokens_seen": 4313088, + "step": 20445 + }, + { + "epoch": 2.24972497249725, + "grad_norm": 0.511694610118866, + "learning_rate": 4.997627876578769e-05, + "loss": 0.0504, + "num_input_tokens_seen": 4314112, + "step": 20450 + }, + { + "epoch": 2.25027502750275, + "grad_norm": 0.08484941720962524, + "learning_rate": 4.997617412234255e-05, + "loss": 0.0772, + "num_input_tokens_seen": 4315136, + "step": 20455 + }, + { + "epoch": 2.2508250825082508, + "grad_norm": 0.2786666452884674, + "learning_rate": 4.9976069248704e-05, + "loss": 0.0746, + "num_input_tokens_seen": 4316224, + "step": 20460 + }, + { + "epoch": 2.2513751375137514, + "grad_norm": 0.3376425802707672, + "learning_rate": 4.9975964144873e-05, + "loss": 0.0311, + "num_input_tokens_seen": 4317248, + "step": 20465 + }, + { + "epoch": 2.251925192519252, + "grad_norm": 0.15672369301319122, + "learning_rate": 4.997585881085052e-05, + "loss": 0.0815, + "num_input_tokens_seen": 4318304, + "step": 20470 + }, + { + "epoch": 2.2524752475247523, + "grad_norm": 1.4141839742660522, + "learning_rate": 4.997575324663754e-05, + "loss": 0.1036, + "num_input_tokens_seen": 4319392, + "step": 20475 + }, + { + "epoch": 2.253025302530253, + "grad_norm": 0.5021510720252991, + "learning_rate": 4.997564745223502e-05, + "loss": 0.1084, + "num_input_tokens_seen": 4320384, + "step": 20480 + }, + { + "epoch": 2.2535753575357536, + "grad_norm": 0.36125633120536804, + "learning_rate": 4.9975541427643945e-05, + "loss": 0.0705, + "num_input_tokens_seen": 4321472, + "step": 20485 + }, + { + "epoch": 2.254125412541254, + "grad_norm": 3.187748908996582, + "learning_rate": 4.9975435172865294e-05, + "loss": 0.0612, + "num_input_tokens_seen": 4322592, + "step": 20490 + }, + { + "epoch": 2.254675467546755, + "grad_norm": 0.19128073751926422, + "learning_rate": 4.997532868790003e-05, + "loss": 0.0567, + "num_input_tokens_seen": 4323680, + "step": 20495 + }, + { + "epoch": 2.255225522552255, + "grad_norm": 0.49090608954429626, + "learning_rate": 4.997522197274915e-05, + "loss": 0.0714, + "num_input_tokens_seen": 4324672, + "step": 20500 + }, + { + "epoch": 2.2557755775577557, + "grad_norm": 1.6463546752929688, + "learning_rate": 4.997511502741364e-05, + "loss": 0.1012, + "num_input_tokens_seen": 4325664, + "step": 20505 + }, + { + "epoch": 2.2563256325632564, + "grad_norm": 0.42003923654556274, + "learning_rate": 4.997500785189448e-05, + "loss": 0.0802, + "num_input_tokens_seen": 4326752, + "step": 20510 + }, + { + "epoch": 2.256875687568757, + "grad_norm": 0.07443301379680634, + "learning_rate": 4.997490044619265e-05, + "loss": 0.1359, + "num_input_tokens_seen": 4327712, + "step": 20515 + }, + { + "epoch": 2.2574257425742577, + "grad_norm": 0.3119058310985565, + "learning_rate": 4.997479281030915e-05, + "loss": 0.0872, + "num_input_tokens_seen": 4328704, + "step": 20520 + }, + { + "epoch": 2.257975797579758, + "grad_norm": 0.6849321722984314, + "learning_rate": 4.9974684944244964e-05, + "loss": 0.0984, + "num_input_tokens_seen": 4329728, + "step": 20525 + }, + { + "epoch": 2.2585258525852585, + "grad_norm": 2.23785138130188, + "learning_rate": 4.99745768480011e-05, + "loss": 0.1377, + "num_input_tokens_seen": 4330784, + "step": 20530 + }, + { + "epoch": 2.259075907590759, + "grad_norm": 0.3635987937450409, + "learning_rate": 4.9974468521578535e-05, + "loss": 0.1259, + "num_input_tokens_seen": 4331872, + "step": 20535 + }, + { + "epoch": 2.25962596259626, + "grad_norm": 0.13382722437381744, + "learning_rate": 4.997435996497828e-05, + "loss": 0.0381, + "num_input_tokens_seen": 4332960, + "step": 20540 + }, + { + "epoch": 2.26017601760176, + "grad_norm": 0.5433214902877808, + "learning_rate": 4.997425117820134e-05, + "loss": 0.107, + "num_input_tokens_seen": 4333952, + "step": 20545 + }, + { + "epoch": 2.2607260726072607, + "grad_norm": 0.20316432416439056, + "learning_rate": 4.997414216124871e-05, + "loss": 0.078, + "num_input_tokens_seen": 4334944, + "step": 20550 + }, + { + "epoch": 2.2612761276127613, + "grad_norm": 0.022287871688604355, + "learning_rate": 4.9974032914121386e-05, + "loss": 0.0292, + "num_input_tokens_seen": 4336032, + "step": 20555 + }, + { + "epoch": 2.261826182618262, + "grad_norm": 0.07599625736474991, + "learning_rate": 4.997392343682039e-05, + "loss": 0.0237, + "num_input_tokens_seen": 4337056, + "step": 20560 + }, + { + "epoch": 2.262376237623762, + "grad_norm": 0.4371315538883209, + "learning_rate": 4.997381372934673e-05, + "loss": 0.1142, + "num_input_tokens_seen": 4338112, + "step": 20565 + }, + { + "epoch": 2.262926292629263, + "grad_norm": 0.22240863740444183, + "learning_rate": 4.99737037917014e-05, + "loss": 0.0848, + "num_input_tokens_seen": 4339136, + "step": 20570 + }, + { + "epoch": 2.2634763476347635, + "grad_norm": 0.23979780077934265, + "learning_rate": 4.997359362388544e-05, + "loss": 0.0882, + "num_input_tokens_seen": 4340192, + "step": 20575 + }, + { + "epoch": 2.264026402640264, + "grad_norm": 1.3939614295959473, + "learning_rate": 4.9973483225899844e-05, + "loss": 0.0738, + "num_input_tokens_seen": 4341248, + "step": 20580 + }, + { + "epoch": 2.2645764576457648, + "grad_norm": 1.0650508403778076, + "learning_rate": 4.997337259774564e-05, + "loss": 0.1401, + "num_input_tokens_seen": 4342272, + "step": 20585 + }, + { + "epoch": 2.265126512651265, + "grad_norm": 0.2517302334308624, + "learning_rate": 4.997326173942384e-05, + "loss": 0.1371, + "num_input_tokens_seen": 4343328, + "step": 20590 + }, + { + "epoch": 2.2656765676567656, + "grad_norm": 0.27894288301467896, + "learning_rate": 4.997315065093546e-05, + "loss": 0.0707, + "num_input_tokens_seen": 4344352, + "step": 20595 + }, + { + "epoch": 2.2662266226622663, + "grad_norm": 0.14214318990707397, + "learning_rate": 4.997303933228155e-05, + "loss": 0.018, + "num_input_tokens_seen": 4345440, + "step": 20600 + }, + { + "epoch": 2.266776677667767, + "grad_norm": 1.0667115449905396, + "learning_rate": 4.997292778346312e-05, + "loss": 0.0904, + "num_input_tokens_seen": 4346560, + "step": 20605 + }, + { + "epoch": 2.2673267326732676, + "grad_norm": 1.02669095993042, + "learning_rate": 4.9972816004481185e-05, + "loss": 0.0816, + "num_input_tokens_seen": 4347616, + "step": 20610 + }, + { + "epoch": 2.2678767876787678, + "grad_norm": 0.3186512589454651, + "learning_rate": 4.99727039953368e-05, + "loss": 0.122, + "num_input_tokens_seen": 4348672, + "step": 20615 + }, + { + "epoch": 2.2684268426842684, + "grad_norm": 1.24171781539917, + "learning_rate": 4.997259175603098e-05, + "loss": 0.1476, + "num_input_tokens_seen": 4349696, + "step": 20620 + }, + { + "epoch": 2.268976897689769, + "grad_norm": 0.7274154424667358, + "learning_rate": 4.9972479286564764e-05, + "loss": 0.037, + "num_input_tokens_seen": 4350688, + "step": 20625 + }, + { + "epoch": 2.2695269526952697, + "grad_norm": 0.4736901521682739, + "learning_rate": 4.997236658693919e-05, + "loss": 0.0259, + "num_input_tokens_seen": 4351744, + "step": 20630 + }, + { + "epoch": 2.27007700770077, + "grad_norm": 0.14948685467243195, + "learning_rate": 4.99722536571553e-05, + "loss": 0.0498, + "num_input_tokens_seen": 4352800, + "step": 20635 + }, + { + "epoch": 2.2706270627062706, + "grad_norm": 0.2835986018180847, + "learning_rate": 4.9972140497214125e-05, + "loss": 0.1356, + "num_input_tokens_seen": 4353888, + "step": 20640 + }, + { + "epoch": 2.271177117711771, + "grad_norm": 0.14497540891170502, + "learning_rate": 4.997202710711672e-05, + "loss": 0.1122, + "num_input_tokens_seen": 4354944, + "step": 20645 + }, + { + "epoch": 2.271727172717272, + "grad_norm": 1.7838633060455322, + "learning_rate": 4.997191348686412e-05, + "loss": 0.0812, + "num_input_tokens_seen": 4355968, + "step": 20650 + }, + { + "epoch": 2.272277227722772, + "grad_norm": 0.47632670402526855, + "learning_rate": 4.997179963645737e-05, + "loss": 0.0474, + "num_input_tokens_seen": 4356960, + "step": 20655 + }, + { + "epoch": 2.2728272827282727, + "grad_norm": 0.4024766981601715, + "learning_rate": 4.997168555589753e-05, + "loss": 0.052, + "num_input_tokens_seen": 4357984, + "step": 20660 + }, + { + "epoch": 2.2733773377337734, + "grad_norm": 0.20273078978061676, + "learning_rate": 4.9971571245185655e-05, + "loss": 0.1029, + "num_input_tokens_seen": 4358976, + "step": 20665 + }, + { + "epoch": 2.273927392739274, + "grad_norm": 1.9921948909759521, + "learning_rate": 4.9971456704322784e-05, + "loss": 0.0791, + "num_input_tokens_seen": 4360000, + "step": 20670 + }, + { + "epoch": 2.2744774477447747, + "grad_norm": 0.18170055747032166, + "learning_rate": 4.997134193330998e-05, + "loss": 0.0551, + "num_input_tokens_seen": 4361056, + "step": 20675 + }, + { + "epoch": 2.275027502750275, + "grad_norm": 0.6547240614891052, + "learning_rate": 4.99712269321483e-05, + "loss": 0.0342, + "num_input_tokens_seen": 4362144, + "step": 20680 + }, + { + "epoch": 2.2755775577557755, + "grad_norm": 0.7908267974853516, + "learning_rate": 4.9971111700838805e-05, + "loss": 0.0192, + "num_input_tokens_seen": 4363264, + "step": 20685 + }, + { + "epoch": 2.276127612761276, + "grad_norm": 1.0296437740325928, + "learning_rate": 4.997099623938255e-05, + "loss": 0.201, + "num_input_tokens_seen": 4364352, + "step": 20690 + }, + { + "epoch": 2.276677667766777, + "grad_norm": 0.12462630867958069, + "learning_rate": 4.997088054778061e-05, + "loss": 0.1172, + "num_input_tokens_seen": 4365472, + "step": 20695 + }, + { + "epoch": 2.2772277227722775, + "grad_norm": 0.5228334069252014, + "learning_rate": 4.9970764626034036e-05, + "loss": 0.0327, + "num_input_tokens_seen": 4366464, + "step": 20700 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.049474261701107025, + "learning_rate": 4.997064847414392e-05, + "loss": 0.028, + "num_input_tokens_seen": 4367520, + "step": 20705 + }, + { + "epoch": 2.2783278327832783, + "grad_norm": 0.8089311718940735, + "learning_rate": 4.997053209211131e-05, + "loss": 0.0463, + "num_input_tokens_seen": 4368544, + "step": 20710 + }, + { + "epoch": 2.278877887788779, + "grad_norm": 0.03086145594716072, + "learning_rate": 4.997041547993729e-05, + "loss": 0.0972, + "num_input_tokens_seen": 4369600, + "step": 20715 + }, + { + "epoch": 2.279427942794279, + "grad_norm": 0.3166489899158478, + "learning_rate": 4.9970298637622936e-05, + "loss": 0.0512, + "num_input_tokens_seen": 4370656, + "step": 20720 + }, + { + "epoch": 2.27997799779978, + "grad_norm": 0.4816226363182068, + "learning_rate": 4.997018156516933e-05, + "loss": 0.2009, + "num_input_tokens_seen": 4371744, + "step": 20725 + }, + { + "epoch": 2.2805280528052805, + "grad_norm": 0.11396168172359467, + "learning_rate": 4.997006426257752e-05, + "loss": 0.0819, + "num_input_tokens_seen": 4372800, + "step": 20730 + }, + { + "epoch": 2.281078107810781, + "grad_norm": 0.11531083285808563, + "learning_rate": 4.9969946729848626e-05, + "loss": 0.0788, + "num_input_tokens_seen": 4373792, + "step": 20735 + }, + { + "epoch": 2.281628162816282, + "grad_norm": 0.028564894571900368, + "learning_rate": 4.996982896698371e-05, + "loss": 0.0796, + "num_input_tokens_seen": 4374816, + "step": 20740 + }, + { + "epoch": 2.282178217821782, + "grad_norm": 0.27671492099761963, + "learning_rate": 4.9969710973983865e-05, + "loss": 0.074, + "num_input_tokens_seen": 4375872, + "step": 20745 + }, + { + "epoch": 2.2827282728272826, + "grad_norm": 0.2931664288043976, + "learning_rate": 4.996959275085017e-05, + "loss": 0.1029, + "num_input_tokens_seen": 4376928, + "step": 20750 + }, + { + "epoch": 2.2832783278327833, + "grad_norm": 1.74879789352417, + "learning_rate": 4.996947429758373e-05, + "loss": 0.0432, + "num_input_tokens_seen": 4377920, + "step": 20755 + }, + { + "epoch": 2.283828382838284, + "grad_norm": 0.04689076170325279, + "learning_rate": 4.9969355614185616e-05, + "loss": 0.035, + "num_input_tokens_seen": 4378976, + "step": 20760 + }, + { + "epoch": 2.2843784378437846, + "grad_norm": 0.08038274198770523, + "learning_rate": 4.9969236700656934e-05, + "loss": 0.0699, + "num_input_tokens_seen": 4380128, + "step": 20765 + }, + { + "epoch": 2.284928492849285, + "grad_norm": 0.15309908986091614, + "learning_rate": 4.9969117556998785e-05, + "loss": 0.1382, + "num_input_tokens_seen": 4381184, + "step": 20770 + }, + { + "epoch": 2.2854785478547854, + "grad_norm": 0.6462634801864624, + "learning_rate": 4.9968998183212254e-05, + "loss": 0.0441, + "num_input_tokens_seen": 4382272, + "step": 20775 + }, + { + "epoch": 2.286028602860286, + "grad_norm": 1.1787209510803223, + "learning_rate": 4.996887857929845e-05, + "loss": 0.1171, + "num_input_tokens_seen": 4383328, + "step": 20780 + }, + { + "epoch": 2.2865786578657867, + "grad_norm": 1.473591923713684, + "learning_rate": 4.996875874525848e-05, + "loss": 0.0836, + "num_input_tokens_seen": 4384384, + "step": 20785 + }, + { + "epoch": 2.287128712871287, + "grad_norm": 0.7410531640052795, + "learning_rate": 4.996863868109344e-05, + "loss": 0.1838, + "num_input_tokens_seen": 4385472, + "step": 20790 + }, + { + "epoch": 2.2876787678767876, + "grad_norm": 0.3511829078197479, + "learning_rate": 4.996851838680443e-05, + "loss": 0.0376, + "num_input_tokens_seen": 4386560, + "step": 20795 + }, + { + "epoch": 2.2882288228822882, + "grad_norm": 0.028600607067346573, + "learning_rate": 4.996839786239257e-05, + "loss": 0.0694, + "num_input_tokens_seen": 4387648, + "step": 20800 + }, + { + "epoch": 2.288778877887789, + "grad_norm": 0.08263701945543289, + "learning_rate": 4.996827710785897e-05, + "loss": 0.086, + "num_input_tokens_seen": 4388736, + "step": 20805 + }, + { + "epoch": 2.289328932893289, + "grad_norm": 1.1486207246780396, + "learning_rate": 4.9968156123204744e-05, + "loss": 0.1892, + "num_input_tokens_seen": 4389856, + "step": 20810 + }, + { + "epoch": 2.2898789878987897, + "grad_norm": 0.04032258689403534, + "learning_rate": 4.9968034908431e-05, + "loss": 0.0499, + "num_input_tokens_seen": 4390944, + "step": 20815 + }, + { + "epoch": 2.2904290429042904, + "grad_norm": 0.22573024034500122, + "learning_rate": 4.9967913463538864e-05, + "loss": 0.0658, + "num_input_tokens_seen": 4392000, + "step": 20820 + }, + { + "epoch": 2.290979097909791, + "grad_norm": 0.30531880259513855, + "learning_rate": 4.996779178852944e-05, + "loss": 0.0899, + "num_input_tokens_seen": 4392960, + "step": 20825 + }, + { + "epoch": 2.2915291529152917, + "grad_norm": 2.4645543098449707, + "learning_rate": 4.9967669883403866e-05, + "loss": 0.0732, + "num_input_tokens_seen": 4393952, + "step": 20830 + }, + { + "epoch": 2.292079207920792, + "grad_norm": 0.46423113346099854, + "learning_rate": 4.9967547748163264e-05, + "loss": 0.1262, + "num_input_tokens_seen": 4395040, + "step": 20835 + }, + { + "epoch": 2.2926292629262925, + "grad_norm": 1.3101050853729248, + "learning_rate": 4.996742538280875e-05, + "loss": 0.1187, + "num_input_tokens_seen": 4396064, + "step": 20840 + }, + { + "epoch": 2.293179317931793, + "grad_norm": 0.9266735911369324, + "learning_rate": 4.9967302787341455e-05, + "loss": 0.0968, + "num_input_tokens_seen": 4397184, + "step": 20845 + }, + { + "epoch": 2.293729372937294, + "grad_norm": 0.21670274436473846, + "learning_rate": 4.996717996176251e-05, + "loss": 0.0707, + "num_input_tokens_seen": 4398208, + "step": 20850 + }, + { + "epoch": 2.2942794279427945, + "grad_norm": 0.27350547909736633, + "learning_rate": 4.996705690607305e-05, + "loss": 0.0528, + "num_input_tokens_seen": 4399264, + "step": 20855 + }, + { + "epoch": 2.2948294829482947, + "grad_norm": 0.4877180755138397, + "learning_rate": 4.996693362027421e-05, + "loss": 0.0701, + "num_input_tokens_seen": 4400320, + "step": 20860 + }, + { + "epoch": 2.2953795379537953, + "grad_norm": 0.4987805485725403, + "learning_rate": 4.996681010436712e-05, + "loss": 0.1012, + "num_input_tokens_seen": 4401344, + "step": 20865 + }, + { + "epoch": 2.295929592959296, + "grad_norm": 0.18604037165641785, + "learning_rate": 4.996668635835292e-05, + "loss": 0.0425, + "num_input_tokens_seen": 4402368, + "step": 20870 + }, + { + "epoch": 2.2964796479647966, + "grad_norm": 5.079363822937012, + "learning_rate": 4.996656238223276e-05, + "loss": 0.0726, + "num_input_tokens_seen": 4403424, + "step": 20875 + }, + { + "epoch": 2.297029702970297, + "grad_norm": 0.024321047589182854, + "learning_rate": 4.996643817600777e-05, + "loss": 0.013, + "num_input_tokens_seen": 4404480, + "step": 20880 + }, + { + "epoch": 2.2975797579757975, + "grad_norm": 1.8219753503799438, + "learning_rate": 4.9966313739679096e-05, + "loss": 0.2096, + "num_input_tokens_seen": 4405536, + "step": 20885 + }, + { + "epoch": 2.298129812981298, + "grad_norm": 0.26857924461364746, + "learning_rate": 4.9966189073247885e-05, + "loss": 0.0405, + "num_input_tokens_seen": 4406592, + "step": 20890 + }, + { + "epoch": 2.298679867986799, + "grad_norm": 0.21050389111042023, + "learning_rate": 4.99660641767153e-05, + "loss": 0.0281, + "num_input_tokens_seen": 4407680, + "step": 20895 + }, + { + "epoch": 2.299229922992299, + "grad_norm": 0.32218411564826965, + "learning_rate": 4.996593905008248e-05, + "loss": 0.092, + "num_input_tokens_seen": 4408736, + "step": 20900 + }, + { + "epoch": 2.2997799779977997, + "grad_norm": 0.05478043109178543, + "learning_rate": 4.996581369335057e-05, + "loss": 0.0108, + "num_input_tokens_seen": 4409760, + "step": 20905 + }, + { + "epoch": 2.3003300330033003, + "grad_norm": 0.8785390257835388, + "learning_rate": 4.9965688106520745e-05, + "loss": 0.0935, + "num_input_tokens_seen": 4410784, + "step": 20910 + }, + { + "epoch": 2.300880088008801, + "grad_norm": 0.08626949787139893, + "learning_rate": 4.9965562289594145e-05, + "loss": 0.0919, + "num_input_tokens_seen": 4411872, + "step": 20915 + }, + { + "epoch": 2.3014301430143016, + "grad_norm": 0.6745759844779968, + "learning_rate": 4.996543624257194e-05, + "loss": 0.0862, + "num_input_tokens_seen": 4412864, + "step": 20920 + }, + { + "epoch": 2.301980198019802, + "grad_norm": 0.1858421415090561, + "learning_rate": 4.996530996545529e-05, + "loss": 0.1253, + "num_input_tokens_seen": 4413856, + "step": 20925 + }, + { + "epoch": 2.3025302530253025, + "grad_norm": 0.7594950199127197, + "learning_rate": 4.9965183458245354e-05, + "loss": 0.111, + "num_input_tokens_seen": 4414880, + "step": 20930 + }, + { + "epoch": 2.303080308030803, + "grad_norm": 0.19825109839439392, + "learning_rate": 4.9965056720943304e-05, + "loss": 0.1087, + "num_input_tokens_seen": 4415904, + "step": 20935 + }, + { + "epoch": 2.3036303630363038, + "grad_norm": 1.3306456804275513, + "learning_rate": 4.9964929753550304e-05, + "loss": 0.1444, + "num_input_tokens_seen": 4416960, + "step": 20940 + }, + { + "epoch": 2.3041804180418044, + "grad_norm": 0.026156647130846977, + "learning_rate": 4.9964802556067526e-05, + "loss": 0.1305, + "num_input_tokens_seen": 4417952, + "step": 20945 + }, + { + "epoch": 2.3047304730473046, + "grad_norm": 0.05409236252307892, + "learning_rate": 4.996467512849614e-05, + "loss": 0.0546, + "num_input_tokens_seen": 4419104, + "step": 20950 + }, + { + "epoch": 2.3052805280528053, + "grad_norm": 0.11456301808357239, + "learning_rate": 4.996454747083733e-05, + "loss": 0.0401, + "num_input_tokens_seen": 4420160, + "step": 20955 + }, + { + "epoch": 2.305830583058306, + "grad_norm": 0.3086201250553131, + "learning_rate": 4.996441958309226e-05, + "loss": 0.1505, + "num_input_tokens_seen": 4421184, + "step": 20960 + }, + { + "epoch": 2.3063806380638066, + "grad_norm": 0.5118808746337891, + "learning_rate": 4.9964291465262114e-05, + "loss": 0.1324, + "num_input_tokens_seen": 4422304, + "step": 20965 + }, + { + "epoch": 2.3069306930693068, + "grad_norm": 0.20938590168952942, + "learning_rate": 4.996416311734807e-05, + "loss": 0.0617, + "num_input_tokens_seen": 4423392, + "step": 20970 + }, + { + "epoch": 2.3074807480748074, + "grad_norm": 1.2497854232788086, + "learning_rate": 4.9964034539351324e-05, + "loss": 0.1334, + "num_input_tokens_seen": 4424448, + "step": 20975 + }, + { + "epoch": 2.308030803080308, + "grad_norm": 0.19697709381580353, + "learning_rate": 4.996390573127304e-05, + "loss": 0.0559, + "num_input_tokens_seen": 4425504, + "step": 20980 + }, + { + "epoch": 2.3085808580858087, + "grad_norm": 1.1359734535217285, + "learning_rate": 4.996377669311442e-05, + "loss": 0.0602, + "num_input_tokens_seen": 4426528, + "step": 20985 + }, + { + "epoch": 2.309130913091309, + "grad_norm": 0.0123254070058465, + "learning_rate": 4.9963647424876645e-05, + "loss": 0.0557, + "num_input_tokens_seen": 4427584, + "step": 20990 + }, + { + "epoch": 2.3096809680968096, + "grad_norm": 0.531610906124115, + "learning_rate": 4.996351792656092e-05, + "loss": 0.0383, + "num_input_tokens_seen": 4428608, + "step": 20995 + }, + { + "epoch": 2.31023102310231, + "grad_norm": 1.2299206256866455, + "learning_rate": 4.996338819816842e-05, + "loss": 0.1274, + "num_input_tokens_seen": 4429664, + "step": 21000 + }, + { + "epoch": 2.310781078107811, + "grad_norm": 1.3345017433166504, + "learning_rate": 4.996325823970035e-05, + "loss": 0.1183, + "num_input_tokens_seen": 4430752, + "step": 21005 + }, + { + "epoch": 2.3113311331133115, + "grad_norm": 0.2510409355163574, + "learning_rate": 4.9963128051157916e-05, + "loss": 0.0519, + "num_input_tokens_seen": 4431808, + "step": 21010 + }, + { + "epoch": 2.3118811881188117, + "grad_norm": 0.4217455983161926, + "learning_rate": 4.99629976325423e-05, + "loss": 0.0379, + "num_input_tokens_seen": 4432800, + "step": 21015 + }, + { + "epoch": 2.3124312431243124, + "grad_norm": 0.06213074177503586, + "learning_rate": 4.996286698385472e-05, + "loss": 0.0705, + "num_input_tokens_seen": 4433856, + "step": 21020 + }, + { + "epoch": 2.312981298129813, + "grad_norm": 0.13252107799053192, + "learning_rate": 4.9962736105096365e-05, + "loss": 0.0524, + "num_input_tokens_seen": 4434944, + "step": 21025 + }, + { + "epoch": 2.3135313531353137, + "grad_norm": 0.7899248600006104, + "learning_rate": 4.9962604996268456e-05, + "loss": 0.1505, + "num_input_tokens_seen": 4436000, + "step": 21030 + }, + { + "epoch": 2.3140814081408143, + "grad_norm": 0.47280430793762207, + "learning_rate": 4.996247365737219e-05, + "loss": 0.0403, + "num_input_tokens_seen": 4437056, + "step": 21035 + }, + { + "epoch": 2.3146314631463145, + "grad_norm": 1.9034583568572998, + "learning_rate": 4.996234208840879e-05, + "loss": 0.0826, + "num_input_tokens_seen": 4438176, + "step": 21040 + }, + { + "epoch": 2.315181518151815, + "grad_norm": 1.172154188156128, + "learning_rate": 4.996221028937945e-05, + "loss": 0.1677, + "num_input_tokens_seen": 4439264, + "step": 21045 + }, + { + "epoch": 2.315731573157316, + "grad_norm": 0.10125990211963654, + "learning_rate": 4.9962078260285405e-05, + "loss": 0.0944, + "num_input_tokens_seen": 4440288, + "step": 21050 + }, + { + "epoch": 2.3162816281628165, + "grad_norm": 0.4424126148223877, + "learning_rate": 4.9961946001127854e-05, + "loss": 0.1412, + "num_input_tokens_seen": 4441376, + "step": 21055 + }, + { + "epoch": 2.3168316831683167, + "grad_norm": 0.13257627189159393, + "learning_rate": 4.996181351190804e-05, + "loss": 0.0543, + "num_input_tokens_seen": 4442496, + "step": 21060 + }, + { + "epoch": 2.3173817381738173, + "grad_norm": 0.0158691443502903, + "learning_rate": 4.996168079262715e-05, + "loss": 0.0162, + "num_input_tokens_seen": 4443616, + "step": 21065 + }, + { + "epoch": 2.317931793179318, + "grad_norm": 0.7119290828704834, + "learning_rate": 4.996154784328644e-05, + "loss": 0.089, + "num_input_tokens_seen": 4444608, + "step": 21070 + }, + { + "epoch": 2.3184818481848186, + "grad_norm": 1.3716975450515747, + "learning_rate": 4.996141466388712e-05, + "loss": 0.1311, + "num_input_tokens_seen": 4445664, + "step": 21075 + }, + { + "epoch": 2.319031903190319, + "grad_norm": 0.16015493869781494, + "learning_rate": 4.9961281254430415e-05, + "loss": 0.0262, + "num_input_tokens_seen": 4446656, + "step": 21080 + }, + { + "epoch": 2.3195819581958195, + "grad_norm": 0.3707481324672699, + "learning_rate": 4.9961147614917555e-05, + "loss": 0.0874, + "num_input_tokens_seen": 4447776, + "step": 21085 + }, + { + "epoch": 2.32013201320132, + "grad_norm": 0.16095362603664398, + "learning_rate": 4.996101374534978e-05, + "loss": 0.0843, + "num_input_tokens_seen": 4448864, + "step": 21090 + }, + { + "epoch": 2.3206820682068208, + "grad_norm": 0.09166833013296127, + "learning_rate": 4.996087964572832e-05, + "loss": 0.1055, + "num_input_tokens_seen": 4449920, + "step": 21095 + }, + { + "epoch": 2.3212321232123214, + "grad_norm": 0.7784858345985413, + "learning_rate": 4.99607453160544e-05, + "loss": 0.1192, + "num_input_tokens_seen": 4451008, + "step": 21100 + }, + { + "epoch": 2.3217821782178216, + "grad_norm": 0.032737456262111664, + "learning_rate": 4.9960610756329276e-05, + "loss": 0.0335, + "num_input_tokens_seen": 4452000, + "step": 21105 + }, + { + "epoch": 2.3223322332233223, + "grad_norm": 0.04897290840744972, + "learning_rate": 4.996047596655418e-05, + "loss": 0.1826, + "num_input_tokens_seen": 4453056, + "step": 21110 + }, + { + "epoch": 2.322882288228823, + "grad_norm": 2.4635190963745117, + "learning_rate": 4.996034094673035e-05, + "loss": 0.0727, + "num_input_tokens_seen": 4454112, + "step": 21115 + }, + { + "epoch": 2.3234323432343236, + "grad_norm": 0.17029380798339844, + "learning_rate": 4.9960205696859044e-05, + "loss": 0.0559, + "num_input_tokens_seen": 4455168, + "step": 21120 + }, + { + "epoch": 2.323982398239824, + "grad_norm": 0.48340481519699097, + "learning_rate": 4.996007021694148e-05, + "loss": 0.0645, + "num_input_tokens_seen": 4456256, + "step": 21125 + }, + { + "epoch": 2.3245324532453244, + "grad_norm": 1.217771053314209, + "learning_rate": 4.995993450697894e-05, + "loss": 0.2083, + "num_input_tokens_seen": 4457312, + "step": 21130 + }, + { + "epoch": 2.325082508250825, + "grad_norm": 2.234393835067749, + "learning_rate": 4.995979856697266e-05, + "loss": 0.268, + "num_input_tokens_seen": 4458368, + "step": 21135 + }, + { + "epoch": 2.3256325632563257, + "grad_norm": 0.33275994658470154, + "learning_rate": 4.995966239692389e-05, + "loss": 0.0856, + "num_input_tokens_seen": 4459392, + "step": 21140 + }, + { + "epoch": 2.3261826182618264, + "grad_norm": 0.15075229108333588, + "learning_rate": 4.995952599683389e-05, + "loss": 0.1384, + "num_input_tokens_seen": 4460448, + "step": 21145 + }, + { + "epoch": 2.3267326732673266, + "grad_norm": 1.1876426935195923, + "learning_rate": 4.9959389366703925e-05, + "loss": 0.0326, + "num_input_tokens_seen": 4461504, + "step": 21150 + }, + { + "epoch": 2.3272827282728272, + "grad_norm": 0.39156249165534973, + "learning_rate": 4.995925250653524e-05, + "loss": 0.1397, + "num_input_tokens_seen": 4462528, + "step": 21155 + }, + { + "epoch": 2.327832783278328, + "grad_norm": 0.7182268500328064, + "learning_rate": 4.9959115416329096e-05, + "loss": 0.0972, + "num_input_tokens_seen": 4463616, + "step": 21160 + }, + { + "epoch": 2.3283828382838285, + "grad_norm": 0.44274887442588806, + "learning_rate": 4.995897809608676e-05, + "loss": 0.0603, + "num_input_tokens_seen": 4464640, + "step": 21165 + }, + { + "epoch": 2.3289328932893287, + "grad_norm": 0.20740626752376556, + "learning_rate": 4.99588405458095e-05, + "loss": 0.0448, + "num_input_tokens_seen": 4465696, + "step": 21170 + }, + { + "epoch": 2.3294829482948294, + "grad_norm": 0.10062994807958603, + "learning_rate": 4.995870276549859e-05, + "loss": 0.0799, + "num_input_tokens_seen": 4466656, + "step": 21175 + }, + { + "epoch": 2.33003300330033, + "grad_norm": 0.07469336688518524, + "learning_rate": 4.9958564755155285e-05, + "loss": 0.0975, + "num_input_tokens_seen": 4467648, + "step": 21180 + }, + { + "epoch": 2.3305830583058307, + "grad_norm": 0.060845233500003815, + "learning_rate": 4.995842651478086e-05, + "loss": 0.0686, + "num_input_tokens_seen": 4468640, + "step": 21185 + }, + { + "epoch": 2.3311331133113313, + "grad_norm": 0.6469042301177979, + "learning_rate": 4.995828804437661e-05, + "loss": 0.0399, + "num_input_tokens_seen": 4469664, + "step": 21190 + }, + { + "epoch": 2.3316831683168315, + "grad_norm": 0.16722393035888672, + "learning_rate": 4.9958149343943786e-05, + "loss": 0.064, + "num_input_tokens_seen": 4470784, + "step": 21195 + }, + { + "epoch": 2.332233223322332, + "grad_norm": 0.572198212146759, + "learning_rate": 4.995801041348368e-05, + "loss": 0.0311, + "num_input_tokens_seen": 4471936, + "step": 21200 + }, + { + "epoch": 2.332783278327833, + "grad_norm": 0.6782909631729126, + "learning_rate": 4.9957871252997566e-05, + "loss": 0.0592, + "num_input_tokens_seen": 4472928, + "step": 21205 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.2516361176967621, + "learning_rate": 4.995773186248673e-05, + "loss": 0.0522, + "num_input_tokens_seen": 4473984, + "step": 21210 + }, + { + "epoch": 2.333883388338834, + "grad_norm": 0.34715381264686584, + "learning_rate": 4.9957592241952457e-05, + "loss": 0.0675, + "num_input_tokens_seen": 4474976, + "step": 21215 + }, + { + "epoch": 2.3344334433443343, + "grad_norm": 0.12062199413776398, + "learning_rate": 4.995745239139602e-05, + "loss": 0.0843, + "num_input_tokens_seen": 4476000, + "step": 21220 + }, + { + "epoch": 2.334983498349835, + "grad_norm": 0.03310755267739296, + "learning_rate": 4.995731231081873e-05, + "loss": 0.1511, + "num_input_tokens_seen": 4477056, + "step": 21225 + }, + { + "epoch": 2.3355335533553356, + "grad_norm": 0.587490439414978, + "learning_rate": 4.995717200022187e-05, + "loss": 0.1059, + "num_input_tokens_seen": 4478112, + "step": 21230 + }, + { + "epoch": 2.336083608360836, + "grad_norm": 0.46591535210609436, + "learning_rate": 4.9957031459606726e-05, + "loss": 0.0776, + "num_input_tokens_seen": 4479200, + "step": 21235 + }, + { + "epoch": 2.3366336633663365, + "grad_norm": 1.2780743837356567, + "learning_rate": 4.99568906889746e-05, + "loss": 0.0856, + "num_input_tokens_seen": 4480160, + "step": 21240 + }, + { + "epoch": 2.337183718371837, + "grad_norm": 1.133102297782898, + "learning_rate": 4.9956749688326786e-05, + "loss": 0.0839, + "num_input_tokens_seen": 4481152, + "step": 21245 + }, + { + "epoch": 2.337733773377338, + "grad_norm": 0.09355498105287552, + "learning_rate": 4.99566084576646e-05, + "loss": 0.0759, + "num_input_tokens_seen": 4482208, + "step": 21250 + }, + { + "epoch": 2.3382838283828384, + "grad_norm": 0.7346833348274231, + "learning_rate": 4.995646699698931e-05, + "loss": 0.115, + "num_input_tokens_seen": 4483232, + "step": 21255 + }, + { + "epoch": 2.3388338833883386, + "grad_norm": 0.4411206543445587, + "learning_rate": 4.995632530630224e-05, + "loss": 0.1318, + "num_input_tokens_seen": 4484288, + "step": 21260 + }, + { + "epoch": 2.3393839383938393, + "grad_norm": 0.6884499788284302, + "learning_rate": 4.99561833856047e-05, + "loss": 0.0853, + "num_input_tokens_seen": 4485376, + "step": 21265 + }, + { + "epoch": 2.33993399339934, + "grad_norm": 0.7753521800041199, + "learning_rate": 4.9956041234898e-05, + "loss": 0.0909, + "num_input_tokens_seen": 4486528, + "step": 21270 + }, + { + "epoch": 2.3404840484048406, + "grad_norm": 0.18236131966114044, + "learning_rate": 4.995589885418344e-05, + "loss": 0.1158, + "num_input_tokens_seen": 4487616, + "step": 21275 + }, + { + "epoch": 2.3410341034103412, + "grad_norm": 0.31008321046829224, + "learning_rate": 4.9955756243462334e-05, + "loss": 0.09, + "num_input_tokens_seen": 4488704, + "step": 21280 + }, + { + "epoch": 2.3415841584158414, + "grad_norm": 0.061964839696884155, + "learning_rate": 4.995561340273599e-05, + "loss": 0.0151, + "num_input_tokens_seen": 4489824, + "step": 21285 + }, + { + "epoch": 2.342134213421342, + "grad_norm": 0.3343715965747833, + "learning_rate": 4.9955470332005735e-05, + "loss": 0.0983, + "num_input_tokens_seen": 4490848, + "step": 21290 + }, + { + "epoch": 2.3426842684268427, + "grad_norm": 1.0519410371780396, + "learning_rate": 4.995532703127289e-05, + "loss": 0.1629, + "num_input_tokens_seen": 4491840, + "step": 21295 + }, + { + "epoch": 2.3432343234323434, + "grad_norm": 0.5205367803573608, + "learning_rate": 4.995518350053877e-05, + "loss": 0.1486, + "num_input_tokens_seen": 4492960, + "step": 21300 + }, + { + "epoch": 2.3437843784378436, + "grad_norm": 0.03189389780163765, + "learning_rate": 4.99550397398047e-05, + "loss": 0.0219, + "num_input_tokens_seen": 4494048, + "step": 21305 + }, + { + "epoch": 2.3443344334433442, + "grad_norm": 0.09829182922840118, + "learning_rate": 4.9954895749072e-05, + "loss": 0.0467, + "num_input_tokens_seen": 4495168, + "step": 21310 + }, + { + "epoch": 2.344884488448845, + "grad_norm": 0.9097792506217957, + "learning_rate": 4.9954751528342e-05, + "loss": 0.1058, + "num_input_tokens_seen": 4496192, + "step": 21315 + }, + { + "epoch": 2.3454345434543455, + "grad_norm": 0.37340229749679565, + "learning_rate": 4.995460707761603e-05, + "loss": 0.09, + "num_input_tokens_seen": 4497248, + "step": 21320 + }, + { + "epoch": 2.3459845984598457, + "grad_norm": 0.058006417006254196, + "learning_rate": 4.995446239689542e-05, + "loss": 0.0264, + "num_input_tokens_seen": 4498304, + "step": 21325 + }, + { + "epoch": 2.3465346534653464, + "grad_norm": 0.20921510457992554, + "learning_rate": 4.9954317486181506e-05, + "loss": 0.0815, + "num_input_tokens_seen": 4499328, + "step": 21330 + }, + { + "epoch": 2.347084708470847, + "grad_norm": 0.24403415620326996, + "learning_rate": 4.995417234547562e-05, + "loss": 0.2027, + "num_input_tokens_seen": 4500384, + "step": 21335 + }, + { + "epoch": 2.3476347634763477, + "grad_norm": 0.5695972442626953, + "learning_rate": 4.99540269747791e-05, + "loss": 0.0373, + "num_input_tokens_seen": 4501536, + "step": 21340 + }, + { + "epoch": 2.3481848184818483, + "grad_norm": 1.2401366233825684, + "learning_rate": 4.995388137409329e-05, + "loss": 0.1932, + "num_input_tokens_seen": 4502624, + "step": 21345 + }, + { + "epoch": 2.3487348734873486, + "grad_norm": 0.04852134734392166, + "learning_rate": 4.9953735543419535e-05, + "loss": 0.0508, + "num_input_tokens_seen": 4503648, + "step": 21350 + }, + { + "epoch": 2.349284928492849, + "grad_norm": 0.11431703716516495, + "learning_rate": 4.995358948275917e-05, + "loss": 0.0417, + "num_input_tokens_seen": 4504672, + "step": 21355 + }, + { + "epoch": 2.34983498349835, + "grad_norm": 0.1546504944562912, + "learning_rate": 4.995344319211354e-05, + "loss": 0.0153, + "num_input_tokens_seen": 4505792, + "step": 21360 + }, + { + "epoch": 2.3503850385038505, + "grad_norm": 0.13626185059547424, + "learning_rate": 4.9953296671484004e-05, + "loss": 0.0858, + "num_input_tokens_seen": 4506880, + "step": 21365 + }, + { + "epoch": 2.350935093509351, + "grad_norm": 0.14949218928813934, + "learning_rate": 4.99531499208719e-05, + "loss": 0.0353, + "num_input_tokens_seen": 4507904, + "step": 21370 + }, + { + "epoch": 2.3514851485148514, + "grad_norm": 0.42180582880973816, + "learning_rate": 4.995300294027859e-05, + "loss": 0.0351, + "num_input_tokens_seen": 4508896, + "step": 21375 + }, + { + "epoch": 2.352035203520352, + "grad_norm": 1.1502774953842163, + "learning_rate": 4.995285572970543e-05, + "loss": 0.063, + "num_input_tokens_seen": 4509920, + "step": 21380 + }, + { + "epoch": 2.3525852585258527, + "grad_norm": 0.2911722660064697, + "learning_rate": 4.995270828915377e-05, + "loss": 0.1526, + "num_input_tokens_seen": 4510944, + "step": 21385 + }, + { + "epoch": 2.3531353135313533, + "grad_norm": 1.0854095220565796, + "learning_rate": 4.995256061862497e-05, + "loss": 0.0583, + "num_input_tokens_seen": 4512032, + "step": 21390 + }, + { + "epoch": 2.3536853685368535, + "grad_norm": 0.49602875113487244, + "learning_rate": 4.9952412718120387e-05, + "loss": 0.0285, + "num_input_tokens_seen": 4513088, + "step": 21395 + }, + { + "epoch": 2.354235423542354, + "grad_norm": 1.5363237857818604, + "learning_rate": 4.9952264587641394e-05, + "loss": 0.0943, + "num_input_tokens_seen": 4514176, + "step": 21400 + }, + { + "epoch": 2.354785478547855, + "grad_norm": 0.9641522765159607, + "learning_rate": 4.995211622718935e-05, + "loss": 0.1125, + "num_input_tokens_seen": 4515264, + "step": 21405 + }, + { + "epoch": 2.3553355335533555, + "grad_norm": 2.132579803466797, + "learning_rate": 4.9951967636765626e-05, + "loss": 0.1201, + "num_input_tokens_seen": 4516288, + "step": 21410 + }, + { + "epoch": 2.3558855885588557, + "grad_norm": 0.31833967566490173, + "learning_rate": 4.995181881637159e-05, + "loss": 0.0694, + "num_input_tokens_seen": 4517376, + "step": 21415 + }, + { + "epoch": 2.3564356435643563, + "grad_norm": 0.2564380168914795, + "learning_rate": 4.9951669766008615e-05, + "loss": 0.098, + "num_input_tokens_seen": 4518400, + "step": 21420 + }, + { + "epoch": 2.356985698569857, + "grad_norm": 0.38784971833229065, + "learning_rate": 4.995152048567806e-05, + "loss": 0.0439, + "num_input_tokens_seen": 4519424, + "step": 21425 + }, + { + "epoch": 2.3575357535753576, + "grad_norm": 0.25424277782440186, + "learning_rate": 4.995137097538133e-05, + "loss": 0.0735, + "num_input_tokens_seen": 4520512, + "step": 21430 + }, + { + "epoch": 2.3580858085808583, + "grad_norm": 0.8772047758102417, + "learning_rate": 4.995122123511977e-05, + "loss": 0.0783, + "num_input_tokens_seen": 4521568, + "step": 21435 + }, + { + "epoch": 2.3586358635863585, + "grad_norm": 1.1894347667694092, + "learning_rate": 4.9951071264894786e-05, + "loss": 0.1321, + "num_input_tokens_seen": 4522592, + "step": 21440 + }, + { + "epoch": 2.359185918591859, + "grad_norm": 0.6291783452033997, + "learning_rate": 4.9950921064707754e-05, + "loss": 0.0451, + "num_input_tokens_seen": 4523712, + "step": 21445 + }, + { + "epoch": 2.3597359735973598, + "grad_norm": 0.5805990099906921, + "learning_rate": 4.995077063456005e-05, + "loss": 0.0737, + "num_input_tokens_seen": 4524832, + "step": 21450 + }, + { + "epoch": 2.3602860286028604, + "grad_norm": 0.582643985748291, + "learning_rate": 4.995061997445307e-05, + "loss": 0.0645, + "num_input_tokens_seen": 4525856, + "step": 21455 + }, + { + "epoch": 2.360836083608361, + "grad_norm": 0.045319780707359314, + "learning_rate": 4.9950469084388196e-05, + "loss": 0.0632, + "num_input_tokens_seen": 4526912, + "step": 21460 + }, + { + "epoch": 2.3613861386138613, + "grad_norm": 0.8955605030059814, + "learning_rate": 4.995031796436682e-05, + "loss": 0.0926, + "num_input_tokens_seen": 4527968, + "step": 21465 + }, + { + "epoch": 2.361936193619362, + "grad_norm": 0.2931514382362366, + "learning_rate": 4.9950166614390336e-05, + "loss": 0.068, + "num_input_tokens_seen": 4529024, + "step": 21470 + }, + { + "epoch": 2.3624862486248626, + "grad_norm": 0.12964682281017303, + "learning_rate": 4.995001503446014e-05, + "loss": 0.0285, + "num_input_tokens_seen": 4530048, + "step": 21475 + }, + { + "epoch": 2.363036303630363, + "grad_norm": 0.501232922077179, + "learning_rate": 4.994986322457763e-05, + "loss": 0.0729, + "num_input_tokens_seen": 4531136, + "step": 21480 + }, + { + "epoch": 2.3635863586358634, + "grad_norm": 0.03139057010412216, + "learning_rate": 4.99497111847442e-05, + "loss": 0.0443, + "num_input_tokens_seen": 4532160, + "step": 21485 + }, + { + "epoch": 2.364136413641364, + "grad_norm": 0.5243894457817078, + "learning_rate": 4.994955891496125e-05, + "loss": 0.1806, + "num_input_tokens_seen": 4533216, + "step": 21490 + }, + { + "epoch": 2.3646864686468647, + "grad_norm": 0.016235915943980217, + "learning_rate": 4.994940641523019e-05, + "loss": 0.0913, + "num_input_tokens_seen": 4534304, + "step": 21495 + }, + { + "epoch": 2.3652365236523654, + "grad_norm": 1.1410112380981445, + "learning_rate": 4.994925368555242e-05, + "loss": 0.1008, + "num_input_tokens_seen": 4535328, + "step": 21500 + }, + { + "epoch": 2.3657865786578656, + "grad_norm": 1.2857046127319336, + "learning_rate": 4.994910072592936e-05, + "loss": 0.0485, + "num_input_tokens_seen": 4536448, + "step": 21505 + }, + { + "epoch": 2.366336633663366, + "grad_norm": 0.04684077948331833, + "learning_rate": 4.99489475363624e-05, + "loss": 0.0801, + "num_input_tokens_seen": 4537568, + "step": 21510 + }, + { + "epoch": 2.366886688668867, + "grad_norm": 0.10424000769853592, + "learning_rate": 4.994879411685297e-05, + "loss": 0.0649, + "num_input_tokens_seen": 4538624, + "step": 21515 + }, + { + "epoch": 2.3674367436743675, + "grad_norm": 0.5114049911499023, + "learning_rate": 4.994864046740247e-05, + "loss": 0.0247, + "num_input_tokens_seen": 4539744, + "step": 21520 + }, + { + "epoch": 2.367986798679868, + "grad_norm": 0.2403021901845932, + "learning_rate": 4.994848658801233e-05, + "loss": 0.023, + "num_input_tokens_seen": 4540864, + "step": 21525 + }, + { + "epoch": 2.3685368536853684, + "grad_norm": 0.30733489990234375, + "learning_rate": 4.994833247868396e-05, + "loss": 0.0496, + "num_input_tokens_seen": 4541888, + "step": 21530 + }, + { + "epoch": 2.369086908690869, + "grad_norm": 0.6509347558021545, + "learning_rate": 4.9948178139418774e-05, + "loss": 0.1312, + "num_input_tokens_seen": 4542944, + "step": 21535 + }, + { + "epoch": 2.3696369636963697, + "grad_norm": 0.04916759580373764, + "learning_rate": 4.99480235702182e-05, + "loss": 0.0274, + "num_input_tokens_seen": 4543968, + "step": 21540 + }, + { + "epoch": 2.3701870187018703, + "grad_norm": 0.7106114625930786, + "learning_rate": 4.994786877108367e-05, + "loss": 0.0775, + "num_input_tokens_seen": 4544992, + "step": 21545 + }, + { + "epoch": 2.370737073707371, + "grad_norm": 0.5634022951126099, + "learning_rate": 4.994771374201661e-05, + "loss": 0.0675, + "num_input_tokens_seen": 4546016, + "step": 21550 + }, + { + "epoch": 2.371287128712871, + "grad_norm": 0.3105184733867645, + "learning_rate": 4.994755848301843e-05, + "loss": 0.0616, + "num_input_tokens_seen": 4547104, + "step": 21555 + }, + { + "epoch": 2.371837183718372, + "grad_norm": 0.8548704981803894, + "learning_rate": 4.994740299409058e-05, + "loss": 0.0618, + "num_input_tokens_seen": 4548160, + "step": 21560 + }, + { + "epoch": 2.3723872387238725, + "grad_norm": 0.1932857483625412, + "learning_rate": 4.994724727523449e-05, + "loss": 0.0853, + "num_input_tokens_seen": 4549184, + "step": 21565 + }, + { + "epoch": 2.372937293729373, + "grad_norm": 0.6090471744537354, + "learning_rate": 4.994709132645159e-05, + "loss": 0.0971, + "num_input_tokens_seen": 4550208, + "step": 21570 + }, + { + "epoch": 2.3734873487348733, + "grad_norm": 0.022277820855379105, + "learning_rate": 4.9946935147743326e-05, + "loss": 0.0544, + "num_input_tokens_seen": 4551296, + "step": 21575 + }, + { + "epoch": 2.374037403740374, + "grad_norm": 1.3700683116912842, + "learning_rate": 4.9946778739111125e-05, + "loss": 0.0426, + "num_input_tokens_seen": 4552320, + "step": 21580 + }, + { + "epoch": 2.3745874587458746, + "grad_norm": 0.9346144795417786, + "learning_rate": 4.994662210055643e-05, + "loss": 0.0976, + "num_input_tokens_seen": 4553312, + "step": 21585 + }, + { + "epoch": 2.3751375137513753, + "grad_norm": 0.6234951019287109, + "learning_rate": 4.9946465232080696e-05, + "loss": 0.0479, + "num_input_tokens_seen": 4554432, + "step": 21590 + }, + { + "epoch": 2.3756875687568755, + "grad_norm": 0.03300696238875389, + "learning_rate": 4.994630813368537e-05, + "loss": 0.0132, + "num_input_tokens_seen": 4555488, + "step": 21595 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.26463067531585693, + "learning_rate": 4.9946150805371885e-05, + "loss": 0.0508, + "num_input_tokens_seen": 4556544, + "step": 21600 + }, + { + "epoch": 2.3767876787678768, + "grad_norm": 0.27263644337654114, + "learning_rate": 4.9945993247141697e-05, + "loss": 0.0246, + "num_input_tokens_seen": 4557568, + "step": 21605 + }, + { + "epoch": 2.3773377337733774, + "grad_norm": 1.074112892150879, + "learning_rate": 4.994583545899626e-05, + "loss": 0.3121, + "num_input_tokens_seen": 4558656, + "step": 21610 + }, + { + "epoch": 2.377887788778878, + "grad_norm": 0.8565426468849182, + "learning_rate": 4.994567744093703e-05, + "loss": 0.1117, + "num_input_tokens_seen": 4559744, + "step": 21615 + }, + { + "epoch": 2.3784378437843783, + "grad_norm": 0.03272659704089165, + "learning_rate": 4.994551919296546e-05, + "loss": 0.1598, + "num_input_tokens_seen": 4560800, + "step": 21620 + }, + { + "epoch": 2.378987898789879, + "grad_norm": 1.172216534614563, + "learning_rate": 4.994536071508301e-05, + "loss": 0.1433, + "num_input_tokens_seen": 4561824, + "step": 21625 + }, + { + "epoch": 2.3795379537953796, + "grad_norm": 0.07655107975006104, + "learning_rate": 4.9945202007291134e-05, + "loss": 0.0424, + "num_input_tokens_seen": 4562848, + "step": 21630 + }, + { + "epoch": 2.3800880088008802, + "grad_norm": 0.16425953805446625, + "learning_rate": 4.9945043069591314e-05, + "loss": 0.0674, + "num_input_tokens_seen": 4563904, + "step": 21635 + }, + { + "epoch": 2.380638063806381, + "grad_norm": 1.7512531280517578, + "learning_rate": 4.994488390198499e-05, + "loss": 0.1001, + "num_input_tokens_seen": 4564928, + "step": 21640 + }, + { + "epoch": 2.381188118811881, + "grad_norm": 0.14163389801979065, + "learning_rate": 4.994472450447365e-05, + "loss": 0.0876, + "num_input_tokens_seen": 4566016, + "step": 21645 + }, + { + "epoch": 2.3817381738173817, + "grad_norm": 0.653612494468689, + "learning_rate": 4.994456487705874e-05, + "loss": 0.0669, + "num_input_tokens_seen": 4567104, + "step": 21650 + }, + { + "epoch": 2.3822882288228824, + "grad_norm": 1.0787895917892456, + "learning_rate": 4.994440501974176e-05, + "loss": 0.0532, + "num_input_tokens_seen": 4568160, + "step": 21655 + }, + { + "epoch": 2.382838283828383, + "grad_norm": 0.11286811530590057, + "learning_rate": 4.994424493252417e-05, + "loss": 0.0306, + "num_input_tokens_seen": 4569184, + "step": 21660 + }, + { + "epoch": 2.3833883388338832, + "grad_norm": 1.3821580410003662, + "learning_rate": 4.994408461540743e-05, + "loss": 0.087, + "num_input_tokens_seen": 4570272, + "step": 21665 + }, + { + "epoch": 2.383938393839384, + "grad_norm": 0.8890976309776306, + "learning_rate": 4.994392406839303e-05, + "loss": 0.1254, + "num_input_tokens_seen": 4571296, + "step": 21670 + }, + { + "epoch": 2.3844884488448845, + "grad_norm": 0.35111355781555176, + "learning_rate": 4.994376329148246e-05, + "loss": 0.0402, + "num_input_tokens_seen": 4572288, + "step": 21675 + }, + { + "epoch": 2.385038503850385, + "grad_norm": 0.22445741295814514, + "learning_rate": 4.99436022846772e-05, + "loss": 0.0164, + "num_input_tokens_seen": 4573312, + "step": 21680 + }, + { + "epoch": 2.3855885588558854, + "grad_norm": 1.2853657007217407, + "learning_rate": 4.994344104797872e-05, + "loss": 0.1228, + "num_input_tokens_seen": 4574368, + "step": 21685 + }, + { + "epoch": 2.386138613861386, + "grad_norm": 0.24116083979606628, + "learning_rate": 4.994327958138851e-05, + "loss": 0.0593, + "num_input_tokens_seen": 4575424, + "step": 21690 + }, + { + "epoch": 2.3866886688668867, + "grad_norm": 1.6155033111572266, + "learning_rate": 4.9943117884908066e-05, + "loss": 0.0873, + "num_input_tokens_seen": 4576448, + "step": 21695 + }, + { + "epoch": 2.3872387238723873, + "grad_norm": 0.34590572118759155, + "learning_rate": 4.994295595853886e-05, + "loss": 0.0864, + "num_input_tokens_seen": 4577504, + "step": 21700 + }, + { + "epoch": 2.387788778877888, + "grad_norm": 0.10210857540369034, + "learning_rate": 4.9942793802282414e-05, + "loss": 0.1453, + "num_input_tokens_seen": 4578528, + "step": 21705 + }, + { + "epoch": 2.388338833883388, + "grad_norm": 0.12580958008766174, + "learning_rate": 4.9942631416140196e-05, + "loss": 0.0825, + "num_input_tokens_seen": 4579552, + "step": 21710 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.11412737518548965, + "learning_rate": 4.9942468800113716e-05, + "loss": 0.0934, + "num_input_tokens_seen": 4580576, + "step": 21715 + }, + { + "epoch": 2.3894389438943895, + "grad_norm": 0.14485500752925873, + "learning_rate": 4.9942305954204474e-05, + "loss": 0.0396, + "num_input_tokens_seen": 4581664, + "step": 21720 + }, + { + "epoch": 2.38998899889989, + "grad_norm": 0.7464631199836731, + "learning_rate": 4.9942142878413965e-05, + "loss": 0.1552, + "num_input_tokens_seen": 4582752, + "step": 21725 + }, + { + "epoch": 2.390539053905391, + "grad_norm": 0.9952377080917358, + "learning_rate": 4.994197957274369e-05, + "loss": 0.129, + "num_input_tokens_seen": 4583808, + "step": 21730 + }, + { + "epoch": 2.391089108910891, + "grad_norm": 2.9972991943359375, + "learning_rate": 4.994181603719516e-05, + "loss": 0.2175, + "num_input_tokens_seen": 4584864, + "step": 21735 + }, + { + "epoch": 2.3916391639163916, + "grad_norm": 0.03671395033597946, + "learning_rate": 4.9941652271769876e-05, + "loss": 0.0486, + "num_input_tokens_seen": 4585920, + "step": 21740 + }, + { + "epoch": 2.3921892189218923, + "grad_norm": 0.1455765664577484, + "learning_rate": 4.9941488276469355e-05, + "loss": 0.0686, + "num_input_tokens_seen": 4586944, + "step": 21745 + }, + { + "epoch": 2.3927392739273925, + "grad_norm": 0.1642928570508957, + "learning_rate": 4.9941324051295104e-05, + "loss": 0.0882, + "num_input_tokens_seen": 4588000, + "step": 21750 + }, + { + "epoch": 2.393289328932893, + "grad_norm": 1.1465154886245728, + "learning_rate": 4.994115959624864e-05, + "loss": 0.1222, + "num_input_tokens_seen": 4589024, + "step": 21755 + }, + { + "epoch": 2.393839383938394, + "grad_norm": 0.07973437756299973, + "learning_rate": 4.9940994911331474e-05, + "loss": 0.0507, + "num_input_tokens_seen": 4590048, + "step": 21760 + }, + { + "epoch": 2.3943894389438944, + "grad_norm": 0.11086813360452652, + "learning_rate": 4.9940829996545125e-05, + "loss": 0.0917, + "num_input_tokens_seen": 4591104, + "step": 21765 + }, + { + "epoch": 2.394939493949395, + "grad_norm": 0.17620442807674408, + "learning_rate": 4.994066485189111e-05, + "loss": 0.084, + "num_input_tokens_seen": 4592128, + "step": 21770 + }, + { + "epoch": 2.3954895489548953, + "grad_norm": 0.06640459597110748, + "learning_rate": 4.994049947737096e-05, + "loss": 0.2184, + "num_input_tokens_seen": 4593120, + "step": 21775 + }, + { + "epoch": 2.396039603960396, + "grad_norm": 0.9980067014694214, + "learning_rate": 4.994033387298619e-05, + "loss": 0.0951, + "num_input_tokens_seen": 4594144, + "step": 21780 + }, + { + "epoch": 2.3965896589658966, + "grad_norm": 0.9087804555892944, + "learning_rate": 4.994016803873833e-05, + "loss": 0.1187, + "num_input_tokens_seen": 4595264, + "step": 21785 + }, + { + "epoch": 2.3971397139713972, + "grad_norm": 0.7958692312240601, + "learning_rate": 4.9940001974628915e-05, + "loss": 0.0942, + "num_input_tokens_seen": 4596352, + "step": 21790 + }, + { + "epoch": 2.397689768976898, + "grad_norm": 0.04603494331240654, + "learning_rate": 4.993983568065946e-05, + "loss": 0.0804, + "num_input_tokens_seen": 4597408, + "step": 21795 + }, + { + "epoch": 2.398239823982398, + "grad_norm": 0.09307676553726196, + "learning_rate": 4.993966915683151e-05, + "loss": 0.0356, + "num_input_tokens_seen": 4598464, + "step": 21800 + }, + { + "epoch": 2.3987898789878987, + "grad_norm": 0.15482191741466522, + "learning_rate": 4.99395024031466e-05, + "loss": 0.0265, + "num_input_tokens_seen": 4599488, + "step": 21805 + }, + { + "epoch": 2.3993399339933994, + "grad_norm": 0.05451207980513573, + "learning_rate": 4.993933541960627e-05, + "loss": 0.0383, + "num_input_tokens_seen": 4600544, + "step": 21810 + }, + { + "epoch": 2.3998899889989, + "grad_norm": 0.9241988658905029, + "learning_rate": 4.993916820621204e-05, + "loss": 0.1155, + "num_input_tokens_seen": 4601568, + "step": 21815 + }, + { + "epoch": 2.4004400440044003, + "grad_norm": 0.6005451679229736, + "learning_rate": 4.9939000762965474e-05, + "loss": 0.0783, + "num_input_tokens_seen": 4602624, + "step": 21820 + }, + { + "epoch": 2.400990099009901, + "grad_norm": 0.20312854647636414, + "learning_rate": 4.9938833089868096e-05, + "loss": 0.0697, + "num_input_tokens_seen": 4603680, + "step": 21825 + }, + { + "epoch": 2.4015401540154016, + "grad_norm": 0.07658672332763672, + "learning_rate": 4.993866518692146e-05, + "loss": 0.0758, + "num_input_tokens_seen": 4604800, + "step": 21830 + }, + { + "epoch": 2.402090209020902, + "grad_norm": 0.1750611960887909, + "learning_rate": 4.993849705412712e-05, + "loss": 0.0616, + "num_input_tokens_seen": 4605920, + "step": 21835 + }, + { + "epoch": 2.4026402640264024, + "grad_norm": 0.44081827998161316, + "learning_rate": 4.9938328691486615e-05, + "loss": 0.0917, + "num_input_tokens_seen": 4607008, + "step": 21840 + }, + { + "epoch": 2.403190319031903, + "grad_norm": 0.0871579498052597, + "learning_rate": 4.993816009900151e-05, + "loss": 0.0961, + "num_input_tokens_seen": 4608032, + "step": 21845 + }, + { + "epoch": 2.4037403740374037, + "grad_norm": 3.322894334793091, + "learning_rate": 4.9937991276673335e-05, + "loss": 0.1407, + "num_input_tokens_seen": 4609024, + "step": 21850 + }, + { + "epoch": 2.4042904290429044, + "grad_norm": 0.06087971106171608, + "learning_rate": 4.993782222450367e-05, + "loss": 0.078, + "num_input_tokens_seen": 4610112, + "step": 21855 + }, + { + "epoch": 2.404840484048405, + "grad_norm": 0.15698844194412231, + "learning_rate": 4.9937652942494065e-05, + "loss": 0.1208, + "num_input_tokens_seen": 4611232, + "step": 21860 + }, + { + "epoch": 2.405390539053905, + "grad_norm": 0.07133550196886063, + "learning_rate": 4.993748343064607e-05, + "loss": 0.0848, + "num_input_tokens_seen": 4612256, + "step": 21865 + }, + { + "epoch": 2.405940594059406, + "grad_norm": 0.4217362105846405, + "learning_rate": 4.993731368896126e-05, + "loss": 0.1309, + "num_input_tokens_seen": 4613312, + "step": 21870 + }, + { + "epoch": 2.4064906490649065, + "grad_norm": 0.19251108169555664, + "learning_rate": 4.993714371744121e-05, + "loss": 0.0573, + "num_input_tokens_seen": 4614400, + "step": 21875 + }, + { + "epoch": 2.407040704070407, + "grad_norm": 0.5167365074157715, + "learning_rate": 4.993697351608746e-05, + "loss": 0.0829, + "num_input_tokens_seen": 4615424, + "step": 21880 + }, + { + "epoch": 2.407590759075908, + "grad_norm": 0.4469388723373413, + "learning_rate": 4.993680308490158e-05, + "loss": 0.1512, + "num_input_tokens_seen": 4616416, + "step": 21885 + }, + { + "epoch": 2.408140814081408, + "grad_norm": 0.6892776489257812, + "learning_rate": 4.993663242388516e-05, + "loss": 0.1515, + "num_input_tokens_seen": 4617504, + "step": 21890 + }, + { + "epoch": 2.4086908690869087, + "grad_norm": 0.13015976548194885, + "learning_rate": 4.993646153303978e-05, + "loss": 0.0663, + "num_input_tokens_seen": 4618528, + "step": 21895 + }, + { + "epoch": 2.4092409240924093, + "grad_norm": 1.9493144750595093, + "learning_rate": 4.993629041236698e-05, + "loss": 0.1054, + "num_input_tokens_seen": 4619552, + "step": 21900 + }, + { + "epoch": 2.40979097909791, + "grad_norm": 0.09150688350200653, + "learning_rate": 4.993611906186837e-05, + "loss": 0.0612, + "num_input_tokens_seen": 4620640, + "step": 21905 + }, + { + "epoch": 2.41034103410341, + "grad_norm": 0.5335373878479004, + "learning_rate": 4.993594748154551e-05, + "loss": 0.0499, + "num_input_tokens_seen": 4621760, + "step": 21910 + }, + { + "epoch": 2.410891089108911, + "grad_norm": 0.18525269627571106, + "learning_rate": 4.993577567139999e-05, + "loss": 0.0758, + "num_input_tokens_seen": 4622784, + "step": 21915 + }, + { + "epoch": 2.4114411441144115, + "grad_norm": 0.548625111579895, + "learning_rate": 4.993560363143339e-05, + "loss": 0.0596, + "num_input_tokens_seen": 4623808, + "step": 21920 + }, + { + "epoch": 2.411991199119912, + "grad_norm": 0.5249521732330322, + "learning_rate": 4.993543136164729e-05, + "loss": 0.0602, + "num_input_tokens_seen": 4624800, + "step": 21925 + }, + { + "epoch": 2.4125412541254123, + "grad_norm": 0.12136624753475189, + "learning_rate": 4.9935258862043294e-05, + "loss": 0.0836, + "num_input_tokens_seen": 4625856, + "step": 21930 + }, + { + "epoch": 2.413091309130913, + "grad_norm": 0.30984407663345337, + "learning_rate": 4.9935086132622975e-05, + "loss": 0.1279, + "num_input_tokens_seen": 4626944, + "step": 21935 + }, + { + "epoch": 2.4136413641364136, + "grad_norm": 0.1266581267118454, + "learning_rate": 4.993491317338794e-05, + "loss": 0.1224, + "num_input_tokens_seen": 4627968, + "step": 21940 + }, + { + "epoch": 2.4141914191419143, + "grad_norm": 0.31168726086616516, + "learning_rate": 4.993473998433977e-05, + "loss": 0.0442, + "num_input_tokens_seen": 4629088, + "step": 21945 + }, + { + "epoch": 2.414741474147415, + "grad_norm": 0.577819287776947, + "learning_rate": 4.9934566565480065e-05, + "loss": 0.0963, + "num_input_tokens_seen": 4630208, + "step": 21950 + }, + { + "epoch": 2.415291529152915, + "grad_norm": 0.11999724060297012, + "learning_rate": 4.993439291681042e-05, + "loss": 0.0606, + "num_input_tokens_seen": 4631200, + "step": 21955 + }, + { + "epoch": 2.4158415841584158, + "grad_norm": 0.14398661255836487, + "learning_rate": 4.9934219038332455e-05, + "loss": 0.1099, + "num_input_tokens_seen": 4632256, + "step": 21960 + }, + { + "epoch": 2.4163916391639164, + "grad_norm": 1.046848177909851, + "learning_rate": 4.993404493004775e-05, + "loss": 0.105, + "num_input_tokens_seen": 4633376, + "step": 21965 + }, + { + "epoch": 2.416941694169417, + "grad_norm": 0.26196354627609253, + "learning_rate": 4.993387059195792e-05, + "loss": 0.0641, + "num_input_tokens_seen": 4634432, + "step": 21970 + }, + { + "epoch": 2.4174917491749177, + "grad_norm": 0.36413177847862244, + "learning_rate": 4.993369602406456e-05, + "loss": 0.0474, + "num_input_tokens_seen": 4635456, + "step": 21975 + }, + { + "epoch": 2.418041804180418, + "grad_norm": 0.2990369498729706, + "learning_rate": 4.9933521226369294e-05, + "loss": 0.0624, + "num_input_tokens_seen": 4636544, + "step": 21980 + }, + { + "epoch": 2.4185918591859186, + "grad_norm": 0.1580539494752884, + "learning_rate": 4.993334619887373e-05, + "loss": 0.0646, + "num_input_tokens_seen": 4637664, + "step": 21985 + }, + { + "epoch": 2.419141914191419, + "grad_norm": 0.0682155191898346, + "learning_rate": 4.993317094157948e-05, + "loss": 0.084, + "num_input_tokens_seen": 4638656, + "step": 21990 + }, + { + "epoch": 2.41969196919692, + "grad_norm": 0.4532139003276825, + "learning_rate": 4.993299545448815e-05, + "loss": 0.0674, + "num_input_tokens_seen": 4639712, + "step": 21995 + }, + { + "epoch": 2.42024202420242, + "grad_norm": 1.116811752319336, + "learning_rate": 4.993281973760137e-05, + "loss": 0.0986, + "num_input_tokens_seen": 4640704, + "step": 22000 + }, + { + "epoch": 2.4207920792079207, + "grad_norm": 0.8317341804504395, + "learning_rate": 4.993264379092075e-05, + "loss": 0.046, + "num_input_tokens_seen": 4641760, + "step": 22005 + }, + { + "epoch": 2.4213421342134214, + "grad_norm": 0.022498706355690956, + "learning_rate": 4.993246761444792e-05, + "loss": 0.0289, + "num_input_tokens_seen": 4642752, + "step": 22010 + }, + { + "epoch": 2.421892189218922, + "grad_norm": 0.5833500027656555, + "learning_rate": 4.99322912081845e-05, + "loss": 0.127, + "num_input_tokens_seen": 4643808, + "step": 22015 + }, + { + "epoch": 2.4224422442244222, + "grad_norm": 0.06234028562903404, + "learning_rate": 4.993211457213212e-05, + "loss": 0.0548, + "num_input_tokens_seen": 4644896, + "step": 22020 + }, + { + "epoch": 2.422992299229923, + "grad_norm": 0.8851922750473022, + "learning_rate": 4.993193770629239e-05, + "loss": 0.0634, + "num_input_tokens_seen": 4645952, + "step": 22025 + }, + { + "epoch": 2.4235423542354235, + "grad_norm": 0.16025219857692719, + "learning_rate": 4.993176061066696e-05, + "loss": 0.0828, + "num_input_tokens_seen": 4647072, + "step": 22030 + }, + { + "epoch": 2.424092409240924, + "grad_norm": 0.5249984860420227, + "learning_rate": 4.9931583285257456e-05, + "loss": 0.1612, + "num_input_tokens_seen": 4648064, + "step": 22035 + }, + { + "epoch": 2.424642464246425, + "grad_norm": 0.19512371718883514, + "learning_rate": 4.993140573006552e-05, + "loss": 0.1469, + "num_input_tokens_seen": 4649056, + "step": 22040 + }, + { + "epoch": 2.425192519251925, + "grad_norm": 0.6850154399871826, + "learning_rate": 4.9931227945092774e-05, + "loss": 0.0904, + "num_input_tokens_seen": 4650144, + "step": 22045 + }, + { + "epoch": 2.4257425742574257, + "grad_norm": 0.08871447294950485, + "learning_rate": 4.9931049930340854e-05, + "loss": 0.1227, + "num_input_tokens_seen": 4651136, + "step": 22050 + }, + { + "epoch": 2.4262926292629263, + "grad_norm": 0.24983090162277222, + "learning_rate": 4.993087168581143e-05, + "loss": 0.0225, + "num_input_tokens_seen": 4652160, + "step": 22055 + }, + { + "epoch": 2.426842684268427, + "grad_norm": 0.5300039649009705, + "learning_rate": 4.9930693211506106e-05, + "loss": 0.0531, + "num_input_tokens_seen": 4653312, + "step": 22060 + }, + { + "epoch": 2.4273927392739276, + "grad_norm": 0.662268340587616, + "learning_rate": 4.993051450742656e-05, + "loss": 0.0635, + "num_input_tokens_seen": 4654368, + "step": 22065 + }, + { + "epoch": 2.427942794279428, + "grad_norm": 0.08546911925077438, + "learning_rate": 4.993033557357442e-05, + "loss": 0.11, + "num_input_tokens_seen": 4655424, + "step": 22070 + }, + { + "epoch": 2.4284928492849285, + "grad_norm": 0.03275226801633835, + "learning_rate": 4.9930156409951334e-05, + "loss": 0.0592, + "num_input_tokens_seen": 4656512, + "step": 22075 + }, + { + "epoch": 2.429042904290429, + "grad_norm": 0.6925309300422668, + "learning_rate": 4.992997701655896e-05, + "loss": 0.0987, + "num_input_tokens_seen": 4657632, + "step": 22080 + }, + { + "epoch": 2.4295929592959298, + "grad_norm": 0.58986896276474, + "learning_rate": 4.992979739339895e-05, + "loss": 0.0817, + "num_input_tokens_seen": 4658656, + "step": 22085 + }, + { + "epoch": 2.43014301430143, + "grad_norm": 0.18674615025520325, + "learning_rate": 4.9929617540472964e-05, + "loss": 0.0965, + "num_input_tokens_seen": 4659648, + "step": 22090 + }, + { + "epoch": 2.4306930693069306, + "grad_norm": 0.0917525440454483, + "learning_rate": 4.9929437457782654e-05, + "loss": 0.0472, + "num_input_tokens_seen": 4660736, + "step": 22095 + }, + { + "epoch": 2.4312431243124313, + "grad_norm": 0.3487709164619446, + "learning_rate": 4.992925714532969e-05, + "loss": 0.0795, + "num_input_tokens_seen": 4661824, + "step": 22100 + }, + { + "epoch": 2.431793179317932, + "grad_norm": 0.20717066526412964, + "learning_rate": 4.992907660311571e-05, + "loss": 0.0391, + "num_input_tokens_seen": 4662912, + "step": 22105 + }, + { + "epoch": 2.432343234323432, + "grad_norm": 0.04422006383538246, + "learning_rate": 4.99288958311424e-05, + "loss": 0.0552, + "num_input_tokens_seen": 4663968, + "step": 22110 + }, + { + "epoch": 2.432893289328933, + "grad_norm": 0.6115626692771912, + "learning_rate": 4.992871482941142e-05, + "loss": 0.0412, + "num_input_tokens_seen": 4665024, + "step": 22115 + }, + { + "epoch": 2.4334433443344334, + "grad_norm": 0.059261564165353775, + "learning_rate": 4.992853359792444e-05, + "loss": 0.0879, + "num_input_tokens_seen": 4666176, + "step": 22120 + }, + { + "epoch": 2.433993399339934, + "grad_norm": 0.08543983101844788, + "learning_rate": 4.992835213668312e-05, + "loss": 0.0371, + "num_input_tokens_seen": 4667232, + "step": 22125 + }, + { + "epoch": 2.4345434543454347, + "grad_norm": 0.7653880715370178, + "learning_rate": 4.9928170445689145e-05, + "loss": 0.0427, + "num_input_tokens_seen": 4668288, + "step": 22130 + }, + { + "epoch": 2.435093509350935, + "grad_norm": 0.14119884371757507, + "learning_rate": 4.9927988524944184e-05, + "loss": 0.1042, + "num_input_tokens_seen": 4669376, + "step": 22135 + }, + { + "epoch": 2.4356435643564356, + "grad_norm": 0.5405197143554688, + "learning_rate": 4.992780637444992e-05, + "loss": 0.0666, + "num_input_tokens_seen": 4670400, + "step": 22140 + }, + { + "epoch": 2.4361936193619362, + "grad_norm": 1.3301794528961182, + "learning_rate": 4.9927623994208014e-05, + "loss": 0.0759, + "num_input_tokens_seen": 4671456, + "step": 22145 + }, + { + "epoch": 2.436743674367437, + "grad_norm": 0.0848141685128212, + "learning_rate": 4.992744138422017e-05, + "loss": 0.0457, + "num_input_tokens_seen": 4672512, + "step": 22150 + }, + { + "epoch": 2.4372937293729375, + "grad_norm": 0.6726931929588318, + "learning_rate": 4.9927258544488056e-05, + "loss": 0.035, + "num_input_tokens_seen": 4673600, + "step": 22155 + }, + { + "epoch": 2.4378437843784377, + "grad_norm": 0.7418582439422607, + "learning_rate": 4.992707547501336e-05, + "loss": 0.0723, + "num_input_tokens_seen": 4674624, + "step": 22160 + }, + { + "epoch": 2.4383938393839384, + "grad_norm": 0.3378831148147583, + "learning_rate": 4.992689217579777e-05, + "loss": 0.0862, + "num_input_tokens_seen": 4675712, + "step": 22165 + }, + { + "epoch": 2.438943894389439, + "grad_norm": 0.3002000153064728, + "learning_rate": 4.992670864684298e-05, + "loss": 0.0708, + "num_input_tokens_seen": 4676800, + "step": 22170 + }, + { + "epoch": 2.4394939493949397, + "grad_norm": 0.31699326634407043, + "learning_rate": 4.9926524888150674e-05, + "loss": 0.0517, + "num_input_tokens_seen": 4677792, + "step": 22175 + }, + { + "epoch": 2.44004400440044, + "grad_norm": 0.6222780346870422, + "learning_rate": 4.992634089972255e-05, + "loss": 0.077, + "num_input_tokens_seen": 4678816, + "step": 22180 + }, + { + "epoch": 2.4405940594059405, + "grad_norm": 0.2626270055770874, + "learning_rate": 4.99261566815603e-05, + "loss": 0.0418, + "num_input_tokens_seen": 4679872, + "step": 22185 + }, + { + "epoch": 2.441144114411441, + "grad_norm": 1.0634371042251587, + "learning_rate": 4.992597223366563e-05, + "loss": 0.1502, + "num_input_tokens_seen": 4680896, + "step": 22190 + }, + { + "epoch": 2.441694169416942, + "grad_norm": 0.48630490899086, + "learning_rate": 4.992578755604023e-05, + "loss": 0.0974, + "num_input_tokens_seen": 4681952, + "step": 22195 + }, + { + "epoch": 2.442244224422442, + "grad_norm": 1.1064460277557373, + "learning_rate": 4.992560264868581e-05, + "loss": 0.1053, + "num_input_tokens_seen": 4683008, + "step": 22200 + }, + { + "epoch": 2.4427942794279427, + "grad_norm": 0.10584540665149689, + "learning_rate": 4.9925417511604066e-05, + "loss": 0.0485, + "num_input_tokens_seen": 4684128, + "step": 22205 + }, + { + "epoch": 2.4433443344334433, + "grad_norm": 0.092058464884758, + "learning_rate": 4.992523214479672e-05, + "loss": 0.1033, + "num_input_tokens_seen": 4685184, + "step": 22210 + }, + { + "epoch": 2.443894389438944, + "grad_norm": 0.368985652923584, + "learning_rate": 4.992504654826546e-05, + "loss": 0.1306, + "num_input_tokens_seen": 4686240, + "step": 22215 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.04060712456703186, + "learning_rate": 4.992486072201201e-05, + "loss": 0.0391, + "num_input_tokens_seen": 4687232, + "step": 22220 + }, + { + "epoch": 2.444994499449945, + "grad_norm": 1.0361019372940063, + "learning_rate": 4.9924674666038075e-05, + "loss": 0.0488, + "num_input_tokens_seen": 4688288, + "step": 22225 + }, + { + "epoch": 2.4455445544554455, + "grad_norm": 0.10699877142906189, + "learning_rate": 4.992448838034537e-05, + "loss": 0.0489, + "num_input_tokens_seen": 4689280, + "step": 22230 + }, + { + "epoch": 2.446094609460946, + "grad_norm": 0.3028705418109894, + "learning_rate": 4.992430186493563e-05, + "loss": 0.0599, + "num_input_tokens_seen": 4690336, + "step": 22235 + }, + { + "epoch": 2.446644664466447, + "grad_norm": 0.169427752494812, + "learning_rate": 4.992411511981055e-05, + "loss": 0.03, + "num_input_tokens_seen": 4691360, + "step": 22240 + }, + { + "epoch": 2.4471947194719474, + "grad_norm": 0.4018000066280365, + "learning_rate": 4.992392814497186e-05, + "loss": 0.0456, + "num_input_tokens_seen": 4692448, + "step": 22245 + }, + { + "epoch": 2.4477447744774476, + "grad_norm": 0.08351198583841324, + "learning_rate": 4.9923740940421285e-05, + "loss": 0.0767, + "num_input_tokens_seen": 4693472, + "step": 22250 + }, + { + "epoch": 2.4482948294829483, + "grad_norm": 0.03077606111764908, + "learning_rate": 4.9923553506160546e-05, + "loss": 0.0704, + "num_input_tokens_seen": 4694592, + "step": 22255 + }, + { + "epoch": 2.448844884488449, + "grad_norm": 0.1556003987789154, + "learning_rate": 4.992336584219138e-05, + "loss": 0.0348, + "num_input_tokens_seen": 4695648, + "step": 22260 + }, + { + "epoch": 2.449394939493949, + "grad_norm": 0.05425841361284256, + "learning_rate": 4.992317794851551e-05, + "loss": 0.0481, + "num_input_tokens_seen": 4696736, + "step": 22265 + }, + { + "epoch": 2.44994499449945, + "grad_norm": 0.18708212673664093, + "learning_rate": 4.9922989825134665e-05, + "loss": 0.1014, + "num_input_tokens_seen": 4697792, + "step": 22270 + }, + { + "epoch": 2.4504950495049505, + "grad_norm": 1.292216420173645, + "learning_rate": 4.9922801472050586e-05, + "loss": 0.1721, + "num_input_tokens_seen": 4698944, + "step": 22275 + }, + { + "epoch": 2.451045104510451, + "grad_norm": 2.530019760131836, + "learning_rate": 4.9922612889264995e-05, + "loss": 0.0704, + "num_input_tokens_seen": 4700032, + "step": 22280 + }, + { + "epoch": 2.4515951595159517, + "grad_norm": 0.26069456338882446, + "learning_rate": 4.992242407677965e-05, + "loss": 0.0664, + "num_input_tokens_seen": 4701088, + "step": 22285 + }, + { + "epoch": 2.452145214521452, + "grad_norm": 2.9812796115875244, + "learning_rate": 4.992223503459628e-05, + "loss": 0.1706, + "num_input_tokens_seen": 4702144, + "step": 22290 + }, + { + "epoch": 2.4526952695269526, + "grad_norm": 0.11097712814807892, + "learning_rate": 4.992204576271663e-05, + "loss": 0.1134, + "num_input_tokens_seen": 4703168, + "step": 22295 + }, + { + "epoch": 2.4532453245324533, + "grad_norm": 0.02257322333753109, + "learning_rate": 4.992185626114244e-05, + "loss": 0.1741, + "num_input_tokens_seen": 4704256, + "step": 22300 + }, + { + "epoch": 2.453795379537954, + "grad_norm": 0.19803453981876373, + "learning_rate": 4.9921666529875454e-05, + "loss": 0.0405, + "num_input_tokens_seen": 4705344, + "step": 22305 + }, + { + "epoch": 2.4543454345434546, + "grad_norm": 0.6012899875640869, + "learning_rate": 4.992147656891744e-05, + "loss": 0.0664, + "num_input_tokens_seen": 4706432, + "step": 22310 + }, + { + "epoch": 2.4548954895489548, + "grad_norm": 0.48761528730392456, + "learning_rate": 4.992128637827012e-05, + "loss": 0.1289, + "num_input_tokens_seen": 4707552, + "step": 22315 + }, + { + "epoch": 2.4554455445544554, + "grad_norm": 0.3737304210662842, + "learning_rate": 4.9921095957935274e-05, + "loss": 0.1413, + "num_input_tokens_seen": 4708608, + "step": 22320 + }, + { + "epoch": 2.455995599559956, + "grad_norm": 0.04520200192928314, + "learning_rate": 4.9920905307914635e-05, + "loss": 0.069, + "num_input_tokens_seen": 4709632, + "step": 22325 + }, + { + "epoch": 2.4565456545654567, + "grad_norm": 0.08219217509031296, + "learning_rate": 4.9920714428209974e-05, + "loss": 0.0216, + "num_input_tokens_seen": 4710624, + "step": 22330 + }, + { + "epoch": 2.457095709570957, + "grad_norm": 0.04453374817967415, + "learning_rate": 4.9920523318823046e-05, + "loss": 0.0515, + "num_input_tokens_seen": 4711712, + "step": 22335 + }, + { + "epoch": 2.4576457645764576, + "grad_norm": 0.040144480764865875, + "learning_rate": 4.992033197975561e-05, + "loss": 0.0669, + "num_input_tokens_seen": 4712832, + "step": 22340 + }, + { + "epoch": 2.458195819581958, + "grad_norm": 0.5267515778541565, + "learning_rate": 4.992014041100944e-05, + "loss": 0.0111, + "num_input_tokens_seen": 4713952, + "step": 22345 + }, + { + "epoch": 2.458745874587459, + "grad_norm": 0.06590408086776733, + "learning_rate": 4.991994861258629e-05, + "loss": 0.0462, + "num_input_tokens_seen": 4715008, + "step": 22350 + }, + { + "epoch": 2.459295929592959, + "grad_norm": 0.018465643748641014, + "learning_rate": 4.991975658448793e-05, + "loss": 0.1438, + "num_input_tokens_seen": 4716096, + "step": 22355 + }, + { + "epoch": 2.4598459845984597, + "grad_norm": 0.15391771495342255, + "learning_rate": 4.991956432671613e-05, + "loss": 0.0256, + "num_input_tokens_seen": 4717152, + "step": 22360 + }, + { + "epoch": 2.4603960396039604, + "grad_norm": 0.04265418276190758, + "learning_rate": 4.9919371839272664e-05, + "loss": 0.1019, + "num_input_tokens_seen": 4718208, + "step": 22365 + }, + { + "epoch": 2.460946094609461, + "grad_norm": 0.18064425885677338, + "learning_rate": 4.991917912215931e-05, + "loss": 0.0129, + "num_input_tokens_seen": 4719232, + "step": 22370 + }, + { + "epoch": 2.4614961496149617, + "grad_norm": 0.12940555810928345, + "learning_rate": 4.9918986175377836e-05, + "loss": 0.0623, + "num_input_tokens_seen": 4720320, + "step": 22375 + }, + { + "epoch": 2.462046204620462, + "grad_norm": 1.8774604797363281, + "learning_rate": 4.991879299893002e-05, + "loss": 0.0737, + "num_input_tokens_seen": 4721408, + "step": 22380 + }, + { + "epoch": 2.4625962596259625, + "grad_norm": 0.12740159034729004, + "learning_rate": 4.991859959281765e-05, + "loss": 0.033, + "num_input_tokens_seen": 4722432, + "step": 22385 + }, + { + "epoch": 2.463146314631463, + "grad_norm": 0.31384074687957764, + "learning_rate": 4.99184059570425e-05, + "loss": 0.1719, + "num_input_tokens_seen": 4723488, + "step": 22390 + }, + { + "epoch": 2.463696369636964, + "grad_norm": 0.13224919140338898, + "learning_rate": 4.991821209160636e-05, + "loss": 0.1021, + "num_input_tokens_seen": 4724512, + "step": 22395 + }, + { + "epoch": 2.4642464246424645, + "grad_norm": 0.0572042316198349, + "learning_rate": 4.9918017996511025e-05, + "loss": 0.0372, + "num_input_tokens_seen": 4725632, + "step": 22400 + }, + { + "epoch": 2.4647964796479647, + "grad_norm": 0.4075290560722351, + "learning_rate": 4.991782367175827e-05, + "loss": 0.0391, + "num_input_tokens_seen": 4726720, + "step": 22405 + }, + { + "epoch": 2.4653465346534653, + "grad_norm": 1.3053141832351685, + "learning_rate": 4.991762911734989e-05, + "loss": 0.0763, + "num_input_tokens_seen": 4727808, + "step": 22410 + }, + { + "epoch": 2.465896589658966, + "grad_norm": 1.0377981662750244, + "learning_rate": 4.9917434333287675e-05, + "loss": 0.1242, + "num_input_tokens_seen": 4728832, + "step": 22415 + }, + { + "epoch": 2.4664466446644666, + "grad_norm": 0.3619261384010315, + "learning_rate": 4.9917239319573426e-05, + "loss": 0.046, + "num_input_tokens_seen": 4729856, + "step": 22420 + }, + { + "epoch": 2.466996699669967, + "grad_norm": 0.1804722398519516, + "learning_rate": 4.991704407620895e-05, + "loss": 0.0287, + "num_input_tokens_seen": 4730912, + "step": 22425 + }, + { + "epoch": 2.4675467546754675, + "grad_norm": 0.03322294354438782, + "learning_rate": 4.991684860319602e-05, + "loss": 0.1381, + "num_input_tokens_seen": 4731904, + "step": 22430 + }, + { + "epoch": 2.468096809680968, + "grad_norm": 0.07437612861394882, + "learning_rate": 4.9916652900536466e-05, + "loss": 0.0522, + "num_input_tokens_seen": 4732928, + "step": 22435 + }, + { + "epoch": 2.4686468646864688, + "grad_norm": 0.38118863105773926, + "learning_rate": 4.991645696823207e-05, + "loss": 0.0692, + "num_input_tokens_seen": 4733984, + "step": 22440 + }, + { + "epoch": 2.469196919691969, + "grad_norm": 0.1355208456516266, + "learning_rate": 4.9916260806284644e-05, + "loss": 0.0555, + "num_input_tokens_seen": 4734976, + "step": 22445 + }, + { + "epoch": 2.4697469746974696, + "grad_norm": 0.14851921796798706, + "learning_rate": 4.9916064414696004e-05, + "loss": 0.0329, + "num_input_tokens_seen": 4735968, + "step": 22450 + }, + { + "epoch": 2.4702970297029703, + "grad_norm": 1.3464456796646118, + "learning_rate": 4.991586779346795e-05, + "loss": 0.0735, + "num_input_tokens_seen": 4737056, + "step": 22455 + }, + { + "epoch": 2.470847084708471, + "grad_norm": 1.3517897129058838, + "learning_rate": 4.99156709426023e-05, + "loss": 0.1281, + "num_input_tokens_seen": 4738144, + "step": 22460 + }, + { + "epoch": 2.4713971397139716, + "grad_norm": 0.19627422094345093, + "learning_rate": 4.991547386210087e-05, + "loss": 0.0263, + "num_input_tokens_seen": 4739232, + "step": 22465 + }, + { + "epoch": 2.4719471947194718, + "grad_norm": 0.45559898018836975, + "learning_rate": 4.991527655196547e-05, + "loss": 0.1346, + "num_input_tokens_seen": 4740352, + "step": 22470 + }, + { + "epoch": 2.4724972497249724, + "grad_norm": 0.8313711285591125, + "learning_rate": 4.991507901219792e-05, + "loss": 0.0901, + "num_input_tokens_seen": 4741408, + "step": 22475 + }, + { + "epoch": 2.473047304730473, + "grad_norm": 0.32875627279281616, + "learning_rate": 4.991488124280004e-05, + "loss": 0.0468, + "num_input_tokens_seen": 4742464, + "step": 22480 + }, + { + "epoch": 2.4735973597359737, + "grad_norm": 0.6110744476318359, + "learning_rate": 4.991468324377366e-05, + "loss": 0.0462, + "num_input_tokens_seen": 4743584, + "step": 22485 + }, + { + "epoch": 2.4741474147414744, + "grad_norm": 0.08189133554697037, + "learning_rate": 4.99144850151206e-05, + "loss": 0.0246, + "num_input_tokens_seen": 4744704, + "step": 22490 + }, + { + "epoch": 2.4746974697469746, + "grad_norm": 0.3619355261325836, + "learning_rate": 4.9914286556842684e-05, + "loss": 0.0686, + "num_input_tokens_seen": 4745760, + "step": 22495 + }, + { + "epoch": 2.4752475247524752, + "grad_norm": 0.3341006934642792, + "learning_rate": 4.991408786894175e-05, + "loss": 0.0936, + "num_input_tokens_seen": 4746816, + "step": 22500 + }, + { + "epoch": 2.475797579757976, + "grad_norm": 0.1889004111289978, + "learning_rate": 4.991388895141961e-05, + "loss": 0.106, + "num_input_tokens_seen": 4747968, + "step": 22505 + }, + { + "epoch": 2.4763476347634765, + "grad_norm": 0.03994401916861534, + "learning_rate": 4.991368980427812e-05, + "loss": 0.1367, + "num_input_tokens_seen": 4749056, + "step": 22510 + }, + { + "epoch": 2.4768976897689767, + "grad_norm": 1.1399829387664795, + "learning_rate": 4.9913490427519106e-05, + "loss": 0.1473, + "num_input_tokens_seen": 4750176, + "step": 22515 + }, + { + "epoch": 2.4774477447744774, + "grad_norm": 0.12280116230249405, + "learning_rate": 4.9913290821144396e-05, + "loss": 0.0117, + "num_input_tokens_seen": 4751200, + "step": 22520 + }, + { + "epoch": 2.477997799779978, + "grad_norm": 1.2490782737731934, + "learning_rate": 4.991309098515585e-05, + "loss": 0.1151, + "num_input_tokens_seen": 4752352, + "step": 22525 + }, + { + "epoch": 2.4785478547854787, + "grad_norm": 0.9053449630737305, + "learning_rate": 4.99128909195553e-05, + "loss": 0.0674, + "num_input_tokens_seen": 4753408, + "step": 22530 + }, + { + "epoch": 2.479097909790979, + "grad_norm": 0.4945276081562042, + "learning_rate": 4.9912690624344584e-05, + "loss": 0.1041, + "num_input_tokens_seen": 4754432, + "step": 22535 + }, + { + "epoch": 2.4796479647964795, + "grad_norm": 0.6264147758483887, + "learning_rate": 4.991249009952556e-05, + "loss": 0.0578, + "num_input_tokens_seen": 4755456, + "step": 22540 + }, + { + "epoch": 2.48019801980198, + "grad_norm": 0.12345870584249496, + "learning_rate": 4.991228934510005e-05, + "loss": 0.0541, + "num_input_tokens_seen": 4756512, + "step": 22545 + }, + { + "epoch": 2.480748074807481, + "grad_norm": 0.35846471786499023, + "learning_rate": 4.9912088361069945e-05, + "loss": 0.0379, + "num_input_tokens_seen": 4757600, + "step": 22550 + }, + { + "epoch": 2.4812981298129815, + "grad_norm": 0.2357397973537445, + "learning_rate": 4.991188714743706e-05, + "loss": 0.0363, + "num_input_tokens_seen": 4758720, + "step": 22555 + }, + { + "epoch": 2.4818481848184817, + "grad_norm": 0.0437147356569767, + "learning_rate": 4.991168570420327e-05, + "loss": 0.0444, + "num_input_tokens_seen": 4759776, + "step": 22560 + }, + { + "epoch": 2.4823982398239823, + "grad_norm": 0.4417200982570648, + "learning_rate": 4.991148403137043e-05, + "loss": 0.083, + "num_input_tokens_seen": 4760832, + "step": 22565 + }, + { + "epoch": 2.482948294829483, + "grad_norm": 0.20677846670150757, + "learning_rate": 4.991128212894039e-05, + "loss": 0.0868, + "num_input_tokens_seen": 4761856, + "step": 22570 + }, + { + "epoch": 2.4834983498349836, + "grad_norm": 0.1782635748386383, + "learning_rate": 4.9911079996915014e-05, + "loss": 0.1104, + "num_input_tokens_seen": 4762848, + "step": 22575 + }, + { + "epoch": 2.4840484048404843, + "grad_norm": 1.382908582687378, + "learning_rate": 4.991087763529618e-05, + "loss": 0.1115, + "num_input_tokens_seen": 4763904, + "step": 22580 + }, + { + "epoch": 2.4845984598459845, + "grad_norm": 0.22828614711761475, + "learning_rate": 4.991067504408573e-05, + "loss": 0.0756, + "num_input_tokens_seen": 4764928, + "step": 22585 + }, + { + "epoch": 2.485148514851485, + "grad_norm": 0.12866516411304474, + "learning_rate": 4.991047222328554e-05, + "loss": 0.0196, + "num_input_tokens_seen": 4765984, + "step": 22590 + }, + { + "epoch": 2.485698569856986, + "grad_norm": 0.5203793048858643, + "learning_rate": 4.991026917289748e-05, + "loss": 0.0184, + "num_input_tokens_seen": 4767040, + "step": 22595 + }, + { + "epoch": 2.4862486248624864, + "grad_norm": 0.16108569502830505, + "learning_rate": 4.991006589292343e-05, + "loss": 0.0539, + "num_input_tokens_seen": 4768128, + "step": 22600 + }, + { + "epoch": 2.4867986798679866, + "grad_norm": 0.142313614487648, + "learning_rate": 4.990986238336526e-05, + "loss": 0.1007, + "num_input_tokens_seen": 4769152, + "step": 22605 + }, + { + "epoch": 2.4873487348734873, + "grad_norm": 0.2556547224521637, + "learning_rate": 4.990965864422483e-05, + "loss": 0.0262, + "num_input_tokens_seen": 4770240, + "step": 22610 + }, + { + "epoch": 2.487898789878988, + "grad_norm": 0.06787620484828949, + "learning_rate": 4.990945467550403e-05, + "loss": 0.0808, + "num_input_tokens_seen": 4771296, + "step": 22615 + }, + { + "epoch": 2.4884488448844886, + "grad_norm": 0.03355530649423599, + "learning_rate": 4.9909250477204746e-05, + "loss": 0.0469, + "num_input_tokens_seen": 4772288, + "step": 22620 + }, + { + "epoch": 2.488998899889989, + "grad_norm": 0.769108772277832, + "learning_rate": 4.9909046049328846e-05, + "loss": 0.0647, + "num_input_tokens_seen": 4773312, + "step": 22625 + }, + { + "epoch": 2.4895489548954894, + "grad_norm": 0.4551587998867035, + "learning_rate": 4.9908841391878225e-05, + "loss": 0.0795, + "num_input_tokens_seen": 4774432, + "step": 22630 + }, + { + "epoch": 2.49009900990099, + "grad_norm": 1.8678573369979858, + "learning_rate": 4.990863650485476e-05, + "loss": 0.1193, + "num_input_tokens_seen": 4775488, + "step": 22635 + }, + { + "epoch": 2.4906490649064907, + "grad_norm": 0.6005626916885376, + "learning_rate": 4.990843138826035e-05, + "loss": 0.1857, + "num_input_tokens_seen": 4776480, + "step": 22640 + }, + { + "epoch": 2.4911991199119914, + "grad_norm": 1.4718657732009888, + "learning_rate": 4.990822604209688e-05, + "loss": 0.1205, + "num_input_tokens_seen": 4777536, + "step": 22645 + }, + { + "epoch": 2.4917491749174916, + "grad_norm": 0.04639190435409546, + "learning_rate": 4.990802046636624e-05, + "loss": 0.091, + "num_input_tokens_seen": 4778592, + "step": 22650 + }, + { + "epoch": 2.4922992299229922, + "grad_norm": 0.49861249327659607, + "learning_rate": 4.990781466107033e-05, + "loss": 0.0632, + "num_input_tokens_seen": 4779680, + "step": 22655 + }, + { + "epoch": 2.492849284928493, + "grad_norm": 0.12826135754585266, + "learning_rate": 4.990760862621104e-05, + "loss": 0.0908, + "num_input_tokens_seen": 4780864, + "step": 22660 + }, + { + "epoch": 2.4933993399339935, + "grad_norm": 0.13332638144493103, + "learning_rate": 4.990740236179028e-05, + "loss": 0.0322, + "num_input_tokens_seen": 4781952, + "step": 22665 + }, + { + "epoch": 2.493949394939494, + "grad_norm": 0.39318424463272095, + "learning_rate": 4.990719586780994e-05, + "loss": 0.1321, + "num_input_tokens_seen": 4783008, + "step": 22670 + }, + { + "epoch": 2.4944994499449944, + "grad_norm": 0.7917028665542603, + "learning_rate": 4.990698914427193e-05, + "loss": 0.0924, + "num_input_tokens_seen": 4784064, + "step": 22675 + }, + { + "epoch": 2.495049504950495, + "grad_norm": 0.3437504768371582, + "learning_rate": 4.990678219117815e-05, + "loss": 0.0854, + "num_input_tokens_seen": 4785120, + "step": 22680 + }, + { + "epoch": 2.4955995599559957, + "grad_norm": 0.8698968887329102, + "learning_rate": 4.990657500853051e-05, + "loss": 0.0775, + "num_input_tokens_seen": 4786208, + "step": 22685 + }, + { + "epoch": 2.4961496149614963, + "grad_norm": 0.44055166840553284, + "learning_rate": 4.990636759633092e-05, + "loss": 0.1333, + "num_input_tokens_seen": 4787200, + "step": 22690 + }, + { + "epoch": 2.4966996699669965, + "grad_norm": 0.3574020564556122, + "learning_rate": 4.990615995458129e-05, + "loss": 0.0464, + "num_input_tokens_seen": 4788192, + "step": 22695 + }, + { + "epoch": 2.497249724972497, + "grad_norm": 0.5044066905975342, + "learning_rate": 4.9905952083283534e-05, + "loss": 0.1386, + "num_input_tokens_seen": 4789280, + "step": 22700 + }, + { + "epoch": 2.497799779977998, + "grad_norm": 0.5486252903938293, + "learning_rate": 4.990574398243957e-05, + "loss": 0.046, + "num_input_tokens_seen": 4790400, + "step": 22705 + }, + { + "epoch": 2.4983498349834985, + "grad_norm": 0.9999597668647766, + "learning_rate": 4.990553565205132e-05, + "loss": 0.091, + "num_input_tokens_seen": 4791424, + "step": 22710 + }, + { + "epoch": 2.4988998899889987, + "grad_norm": 0.3370204269886017, + "learning_rate": 4.990532709212069e-05, + "loss": 0.1121, + "num_input_tokens_seen": 4792544, + "step": 22715 + }, + { + "epoch": 2.4994499449944994, + "grad_norm": 1.257464051246643, + "learning_rate": 4.990511830264961e-05, + "loss": 0.2129, + "num_input_tokens_seen": 4793600, + "step": 22720 + }, + { + "epoch": 2.5, + "grad_norm": 0.22716271877288818, + "learning_rate": 4.990490928364001e-05, + "loss": 0.0488, + "num_input_tokens_seen": 4794624, + "step": 22725 + }, + { + "epoch": 2.5005500550055006, + "grad_norm": 1.0442161560058594, + "learning_rate": 4.990470003509382e-05, + "loss": 0.1183, + "num_input_tokens_seen": 4795648, + "step": 22730 + }, + { + "epoch": 2.5011001100110013, + "grad_norm": 0.2963760197162628, + "learning_rate": 4.990449055701295e-05, + "loss": 0.065, + "num_input_tokens_seen": 4796768, + "step": 22735 + }, + { + "epoch": 2.5016501650165015, + "grad_norm": 0.1406857967376709, + "learning_rate": 4.990428084939934e-05, + "loss": 0.0536, + "num_input_tokens_seen": 4797824, + "step": 22740 + }, + { + "epoch": 2.502200220022002, + "grad_norm": 0.43398723006248474, + "learning_rate": 4.990407091225493e-05, + "loss": 0.0676, + "num_input_tokens_seen": 4798912, + "step": 22745 + }, + { + "epoch": 2.502750275027503, + "grad_norm": 0.27125704288482666, + "learning_rate": 4.990386074558164e-05, + "loss": 0.0666, + "num_input_tokens_seen": 4799936, + "step": 22750 + }, + { + "epoch": 2.5033003300330035, + "grad_norm": 0.42088639736175537, + "learning_rate": 4.9903650349381424e-05, + "loss": 0.0391, + "num_input_tokens_seen": 4800992, + "step": 22755 + }, + { + "epoch": 2.503850385038504, + "grad_norm": 0.21687279641628265, + "learning_rate": 4.990343972365621e-05, + "loss": 0.0818, + "num_input_tokens_seen": 4802112, + "step": 22760 + }, + { + "epoch": 2.5044004400440043, + "grad_norm": 0.6538292169570923, + "learning_rate": 4.9903228868407936e-05, + "loss": 0.1029, + "num_input_tokens_seen": 4803136, + "step": 22765 + }, + { + "epoch": 2.504950495049505, + "grad_norm": 0.8170910477638245, + "learning_rate": 4.990301778363856e-05, + "loss": 0.0323, + "num_input_tokens_seen": 4804192, + "step": 22770 + }, + { + "epoch": 2.5055005500550056, + "grad_norm": 0.03249908238649368, + "learning_rate": 4.990280646935002e-05, + "loss": 0.0534, + "num_input_tokens_seen": 4805216, + "step": 22775 + }, + { + "epoch": 2.506050605060506, + "grad_norm": 0.7278532385826111, + "learning_rate": 4.990259492554425e-05, + "loss": 0.0965, + "num_input_tokens_seen": 4806304, + "step": 22780 + }, + { + "epoch": 2.5066006600660065, + "grad_norm": 0.29675111174583435, + "learning_rate": 4.990238315222322e-05, + "loss": 0.0643, + "num_input_tokens_seen": 4807424, + "step": 22785 + }, + { + "epoch": 2.507150715071507, + "grad_norm": 0.48855629563331604, + "learning_rate": 4.9902171149388875e-05, + "loss": 0.1519, + "num_input_tokens_seen": 4808480, + "step": 22790 + }, + { + "epoch": 2.5077007700770078, + "grad_norm": 0.20006844401359558, + "learning_rate": 4.9901958917043164e-05, + "loss": 0.0575, + "num_input_tokens_seen": 4809504, + "step": 22795 + }, + { + "epoch": 2.5082508250825084, + "grad_norm": 0.02435227483510971, + "learning_rate": 4.990174645518805e-05, + "loss": 0.0974, + "num_input_tokens_seen": 4810592, + "step": 22800 + }, + { + "epoch": 2.5088008800880086, + "grad_norm": 1.4102904796600342, + "learning_rate": 4.990153376382548e-05, + "loss": 0.1053, + "num_input_tokens_seen": 4811648, + "step": 22805 + }, + { + "epoch": 2.5093509350935093, + "grad_norm": 0.08538779616355896, + "learning_rate": 4.9901320842957436e-05, + "loss": 0.0625, + "num_input_tokens_seen": 4812736, + "step": 22810 + }, + { + "epoch": 2.50990099009901, + "grad_norm": 0.02375250868499279, + "learning_rate": 4.9901107692585856e-05, + "loss": 0.0877, + "num_input_tokens_seen": 4813728, + "step": 22815 + }, + { + "epoch": 2.5104510451045106, + "grad_norm": 0.5606352686882019, + "learning_rate": 4.990089431271272e-05, + "loss": 0.0496, + "num_input_tokens_seen": 4814816, + "step": 22820 + }, + { + "epoch": 2.511001100110011, + "grad_norm": 0.10255087912082672, + "learning_rate": 4.990068070333999e-05, + "loss": 0.0682, + "num_input_tokens_seen": 4815808, + "step": 22825 + }, + { + "epoch": 2.5115511551155114, + "grad_norm": 0.40700289607048035, + "learning_rate": 4.990046686446963e-05, + "loss": 0.0776, + "num_input_tokens_seen": 4816864, + "step": 22830 + }, + { + "epoch": 2.512101210121012, + "grad_norm": 0.613353967666626, + "learning_rate": 4.990025279610362e-05, + "loss": 0.0739, + "num_input_tokens_seen": 4817888, + "step": 22835 + }, + { + "epoch": 2.5126512651265127, + "grad_norm": 0.1732892245054245, + "learning_rate": 4.9900038498243926e-05, + "loss": 0.061, + "num_input_tokens_seen": 4818976, + "step": 22840 + }, + { + "epoch": 2.5132013201320134, + "grad_norm": 1.2811263799667358, + "learning_rate": 4.9899823970892524e-05, + "loss": 0.1126, + "num_input_tokens_seen": 4820032, + "step": 22845 + }, + { + "epoch": 2.513751375137514, + "grad_norm": 0.4271836578845978, + "learning_rate": 4.989960921405139e-05, + "loss": 0.062, + "num_input_tokens_seen": 4821056, + "step": 22850 + }, + { + "epoch": 2.514301430143014, + "grad_norm": 0.35902392864227295, + "learning_rate": 4.989939422772252e-05, + "loss": 0.1136, + "num_input_tokens_seen": 4822176, + "step": 22855 + }, + { + "epoch": 2.514851485148515, + "grad_norm": 0.5863659977912903, + "learning_rate": 4.989917901190787e-05, + "loss": 0.0778, + "num_input_tokens_seen": 4823232, + "step": 22860 + }, + { + "epoch": 2.5154015401540155, + "grad_norm": 0.963024914264679, + "learning_rate": 4.989896356660944e-05, + "loss": 0.1064, + "num_input_tokens_seen": 4824288, + "step": 22865 + }, + { + "epoch": 2.5159515951595157, + "grad_norm": 0.30490046739578247, + "learning_rate": 4.98987478918292e-05, + "loss": 0.0517, + "num_input_tokens_seen": 4825312, + "step": 22870 + }, + { + "epoch": 2.5165016501650164, + "grad_norm": 0.06876200437545776, + "learning_rate": 4.989853198756915e-05, + "loss": 0.0113, + "num_input_tokens_seen": 4826304, + "step": 22875 + }, + { + "epoch": 2.517051705170517, + "grad_norm": 0.5655995011329651, + "learning_rate": 4.989831585383129e-05, + "loss": 0.0481, + "num_input_tokens_seen": 4827360, + "step": 22880 + }, + { + "epoch": 2.5176017601760177, + "grad_norm": 0.23083914816379547, + "learning_rate": 4.989809949061759e-05, + "loss": 0.0845, + "num_input_tokens_seen": 4828448, + "step": 22885 + }, + { + "epoch": 2.5181518151815183, + "grad_norm": 1.0255206823349, + "learning_rate": 4.989788289793006e-05, + "loss": 0.1107, + "num_input_tokens_seen": 4829504, + "step": 22890 + }, + { + "epoch": 2.5187018701870185, + "grad_norm": 0.5257784128189087, + "learning_rate": 4.989766607577069e-05, + "loss": 0.0764, + "num_input_tokens_seen": 4830560, + "step": 22895 + }, + { + "epoch": 2.519251925192519, + "grad_norm": 0.4407321512699127, + "learning_rate": 4.989744902414147e-05, + "loss": 0.0481, + "num_input_tokens_seen": 4831616, + "step": 22900 + }, + { + "epoch": 2.51980198019802, + "grad_norm": 1.1160252094268799, + "learning_rate": 4.989723174304441e-05, + "loss": 0.1037, + "num_input_tokens_seen": 4832672, + "step": 22905 + }, + { + "epoch": 2.5203520352035205, + "grad_norm": 1.3439277410507202, + "learning_rate": 4.9897014232481526e-05, + "loss": 0.0551, + "num_input_tokens_seen": 4833728, + "step": 22910 + }, + { + "epoch": 2.520902090209021, + "grad_norm": 0.978549599647522, + "learning_rate": 4.98967964924548e-05, + "loss": 0.0945, + "num_input_tokens_seen": 4834816, + "step": 22915 + }, + { + "epoch": 2.5214521452145213, + "grad_norm": 0.17018291354179382, + "learning_rate": 4.9896578522966245e-05, + "loss": 0.0307, + "num_input_tokens_seen": 4835808, + "step": 22920 + }, + { + "epoch": 2.522002200220022, + "grad_norm": 0.0837782695889473, + "learning_rate": 4.9896360324017874e-05, + "loss": 0.035, + "num_input_tokens_seen": 4836896, + "step": 22925 + }, + { + "epoch": 2.5225522552255226, + "grad_norm": 1.607459306716919, + "learning_rate": 4.98961418956117e-05, + "loss": 0.1062, + "num_input_tokens_seen": 4837984, + "step": 22930 + }, + { + "epoch": 2.523102310231023, + "grad_norm": 0.2237253338098526, + "learning_rate": 4.989592323774973e-05, + "loss": 0.0805, + "num_input_tokens_seen": 4839008, + "step": 22935 + }, + { + "epoch": 2.523652365236524, + "grad_norm": 0.15371344983577728, + "learning_rate": 4.989570435043398e-05, + "loss": 0.0512, + "num_input_tokens_seen": 4840064, + "step": 22940 + }, + { + "epoch": 2.524202420242024, + "grad_norm": 1.0189945697784424, + "learning_rate": 4.9895485233666476e-05, + "loss": 0.1638, + "num_input_tokens_seen": 4841184, + "step": 22945 + }, + { + "epoch": 2.5247524752475248, + "grad_norm": 0.2847106456756592, + "learning_rate": 4.989526588744923e-05, + "loss": 0.0407, + "num_input_tokens_seen": 4842208, + "step": 22950 + }, + { + "epoch": 2.5253025302530254, + "grad_norm": 0.6372635960578918, + "learning_rate": 4.989504631178426e-05, + "loss": 0.0677, + "num_input_tokens_seen": 4843264, + "step": 22955 + }, + { + "epoch": 2.5258525852585256, + "grad_norm": 0.2624012231826782, + "learning_rate": 4.989482650667359e-05, + "loss": 0.0325, + "num_input_tokens_seen": 4844288, + "step": 22960 + }, + { + "epoch": 2.5264026402640263, + "grad_norm": 0.23549149930477142, + "learning_rate": 4.9894606472119256e-05, + "loss": 0.0497, + "num_input_tokens_seen": 4845344, + "step": 22965 + }, + { + "epoch": 2.526952695269527, + "grad_norm": 0.8011471033096313, + "learning_rate": 4.9894386208123276e-05, + "loss": 0.0645, + "num_input_tokens_seen": 4846336, + "step": 22970 + }, + { + "epoch": 2.5275027502750276, + "grad_norm": 0.04427105188369751, + "learning_rate": 4.989416571468768e-05, + "loss": 0.102, + "num_input_tokens_seen": 4847424, + "step": 22975 + }, + { + "epoch": 2.5280528052805282, + "grad_norm": 0.8627780675888062, + "learning_rate": 4.9893944991814514e-05, + "loss": 0.0538, + "num_input_tokens_seen": 4848544, + "step": 22980 + }, + { + "epoch": 2.5286028602860284, + "grad_norm": 0.2648259103298187, + "learning_rate": 4.98937240395058e-05, + "loss": 0.0855, + "num_input_tokens_seen": 4849600, + "step": 22985 + }, + { + "epoch": 2.529152915291529, + "grad_norm": 0.15340253710746765, + "learning_rate": 4.989350285776357e-05, + "loss": 0.1154, + "num_input_tokens_seen": 4850592, + "step": 22990 + }, + { + "epoch": 2.5297029702970297, + "grad_norm": 0.03781551867723465, + "learning_rate": 4.989328144658987e-05, + "loss": 0.1164, + "num_input_tokens_seen": 4851680, + "step": 22995 + }, + { + "epoch": 2.5302530253025304, + "grad_norm": 0.12528745830059052, + "learning_rate": 4.989305980598674e-05, + "loss": 0.0236, + "num_input_tokens_seen": 4852704, + "step": 23000 + }, + { + "epoch": 2.530803080308031, + "grad_norm": 0.4464355707168579, + "learning_rate": 4.989283793595623e-05, + "loss": 0.0384, + "num_input_tokens_seen": 4853792, + "step": 23005 + }, + { + "epoch": 2.5313531353135312, + "grad_norm": 0.5298397541046143, + "learning_rate": 4.9892615836500375e-05, + "loss": 0.0543, + "num_input_tokens_seen": 4854848, + "step": 23010 + }, + { + "epoch": 2.531903190319032, + "grad_norm": 0.816251277923584, + "learning_rate": 4.989239350762122e-05, + "loss": 0.1162, + "num_input_tokens_seen": 4855904, + "step": 23015 + }, + { + "epoch": 2.5324532453245325, + "grad_norm": 0.1362001746892929, + "learning_rate": 4.989217094932083e-05, + "loss": 0.0283, + "num_input_tokens_seen": 4856992, + "step": 23020 + }, + { + "epoch": 2.5330033003300327, + "grad_norm": 0.29178205132484436, + "learning_rate": 4.989194816160123e-05, + "loss": 0.049, + "num_input_tokens_seen": 4858080, + "step": 23025 + }, + { + "epoch": 2.533553355335534, + "grad_norm": 2.5743937492370605, + "learning_rate": 4.9891725144464495e-05, + "loss": 0.076, + "num_input_tokens_seen": 4859040, + "step": 23030 + }, + { + "epoch": 2.534103410341034, + "grad_norm": 0.21947094798088074, + "learning_rate": 4.9891501897912676e-05, + "loss": 0.0698, + "num_input_tokens_seen": 4860096, + "step": 23035 + }, + { + "epoch": 2.5346534653465347, + "grad_norm": 0.6389302015304565, + "learning_rate": 4.989127842194782e-05, + "loss": 0.039, + "num_input_tokens_seen": 4861152, + "step": 23040 + }, + { + "epoch": 2.5352035203520353, + "grad_norm": 0.02102438546717167, + "learning_rate": 4.9891054716572e-05, + "loss": 0.0447, + "num_input_tokens_seen": 4862272, + "step": 23045 + }, + { + "epoch": 2.5357535753575355, + "grad_norm": 0.07862008363008499, + "learning_rate": 4.989083078178727e-05, + "loss": 0.1096, + "num_input_tokens_seen": 4863360, + "step": 23050 + }, + { + "epoch": 2.536303630363036, + "grad_norm": 0.055328577756881714, + "learning_rate": 4.9890606617595706e-05, + "loss": 0.1089, + "num_input_tokens_seen": 4864384, + "step": 23055 + }, + { + "epoch": 2.536853685368537, + "grad_norm": 0.7115529775619507, + "learning_rate": 4.989038222399936e-05, + "loss": 0.0717, + "num_input_tokens_seen": 4865440, + "step": 23060 + }, + { + "epoch": 2.5374037403740375, + "grad_norm": 0.0852745920419693, + "learning_rate": 4.989015760100029e-05, + "loss": 0.028, + "num_input_tokens_seen": 4866496, + "step": 23065 + }, + { + "epoch": 2.537953795379538, + "grad_norm": 0.40215516090393066, + "learning_rate": 4.9889932748600596e-05, + "loss": 0.0574, + "num_input_tokens_seen": 4867616, + "step": 23070 + }, + { + "epoch": 2.5385038503850383, + "grad_norm": 0.5521998405456543, + "learning_rate": 4.988970766680233e-05, + "loss": 0.1168, + "num_input_tokens_seen": 4868704, + "step": 23075 + }, + { + "epoch": 2.539053905390539, + "grad_norm": 1.0095301866531372, + "learning_rate": 4.9889482355607574e-05, + "loss": 0.0902, + "num_input_tokens_seen": 4869856, + "step": 23080 + }, + { + "epoch": 2.5396039603960396, + "grad_norm": 0.19404761493206024, + "learning_rate": 4.9889256815018405e-05, + "loss": 0.0663, + "num_input_tokens_seen": 4870976, + "step": 23085 + }, + { + "epoch": 2.5401540154015403, + "grad_norm": 0.7730674743652344, + "learning_rate": 4.988903104503689e-05, + "loss": 0.0826, + "num_input_tokens_seen": 4872064, + "step": 23090 + }, + { + "epoch": 2.540704070407041, + "grad_norm": 0.6933779120445251, + "learning_rate": 4.988880504566513e-05, + "loss": 0.0833, + "num_input_tokens_seen": 4873120, + "step": 23095 + }, + { + "epoch": 2.541254125412541, + "grad_norm": 0.10347260534763336, + "learning_rate": 4.9888578816905187e-05, + "loss": 0.1886, + "num_input_tokens_seen": 4874176, + "step": 23100 + }, + { + "epoch": 2.541804180418042, + "grad_norm": 0.2398589700460434, + "learning_rate": 4.988835235875916e-05, + "loss": 0.0224, + "num_input_tokens_seen": 4875232, + "step": 23105 + }, + { + "epoch": 2.5423542354235424, + "grad_norm": 0.10122549533843994, + "learning_rate": 4.9888125671229134e-05, + "loss": 0.0605, + "num_input_tokens_seen": 4876224, + "step": 23110 + }, + { + "epoch": 2.5429042904290426, + "grad_norm": 0.4670291841030121, + "learning_rate": 4.98878987543172e-05, + "loss": 0.074, + "num_input_tokens_seen": 4877280, + "step": 23115 + }, + { + "epoch": 2.5434543454345433, + "grad_norm": 0.48705920577049255, + "learning_rate": 4.988767160802543e-05, + "loss": 0.1004, + "num_input_tokens_seen": 4878272, + "step": 23120 + }, + { + "epoch": 2.544004400440044, + "grad_norm": 0.4331834316253662, + "learning_rate": 4.988744423235594e-05, + "loss": 0.0339, + "num_input_tokens_seen": 4879360, + "step": 23125 + }, + { + "epoch": 2.5445544554455446, + "grad_norm": 0.6221439838409424, + "learning_rate": 4.988721662731083e-05, + "loss": 0.0306, + "num_input_tokens_seen": 4880384, + "step": 23130 + }, + { + "epoch": 2.5451045104510452, + "grad_norm": 0.38437941670417786, + "learning_rate": 4.988698879289217e-05, + "loss": 0.032, + "num_input_tokens_seen": 4881472, + "step": 23135 + }, + { + "epoch": 2.5456545654565454, + "grad_norm": 0.9847904443740845, + "learning_rate": 4.988676072910209e-05, + "loss": 0.1044, + "num_input_tokens_seen": 4882528, + "step": 23140 + }, + { + "epoch": 2.546204620462046, + "grad_norm": 0.09095021337270737, + "learning_rate": 4.988653243594267e-05, + "loss": 0.0622, + "num_input_tokens_seen": 4883488, + "step": 23145 + }, + { + "epoch": 2.5467546754675467, + "grad_norm": 0.14174728095531464, + "learning_rate": 4.988630391341602e-05, + "loss": 0.0464, + "num_input_tokens_seen": 4884512, + "step": 23150 + }, + { + "epoch": 2.5473047304730474, + "grad_norm": 0.3150326609611511, + "learning_rate": 4.988607516152426e-05, + "loss": 0.1096, + "num_input_tokens_seen": 4885600, + "step": 23155 + }, + { + "epoch": 2.547854785478548, + "grad_norm": 1.1835025548934937, + "learning_rate": 4.988584618026948e-05, + "loss": 0.1401, + "num_input_tokens_seen": 4886720, + "step": 23160 + }, + { + "epoch": 2.5484048404840483, + "grad_norm": 1.099954605102539, + "learning_rate": 4.988561696965379e-05, + "loss": 0.1238, + "num_input_tokens_seen": 4887808, + "step": 23165 + }, + { + "epoch": 2.548954895489549, + "grad_norm": 0.7568948864936829, + "learning_rate": 4.988538752967932e-05, + "loss": 0.0677, + "num_input_tokens_seen": 4888864, + "step": 23170 + }, + { + "epoch": 2.5495049504950495, + "grad_norm": 0.3232114613056183, + "learning_rate": 4.988515786034817e-05, + "loss": 0.0745, + "num_input_tokens_seen": 4889984, + "step": 23175 + }, + { + "epoch": 2.55005500550055, + "grad_norm": 0.07622619718313217, + "learning_rate": 4.9884927961662466e-05, + "loss": 0.0249, + "num_input_tokens_seen": 4891008, + "step": 23180 + }, + { + "epoch": 2.550605060506051, + "grad_norm": 0.7488269209861755, + "learning_rate": 4.988469783362432e-05, + "loss": 0.0934, + "num_input_tokens_seen": 4892032, + "step": 23185 + }, + { + "epoch": 2.551155115511551, + "grad_norm": 0.08835670351982117, + "learning_rate": 4.988446747623585e-05, + "loss": 0.0274, + "num_input_tokens_seen": 4893152, + "step": 23190 + }, + { + "epoch": 2.5517051705170517, + "grad_norm": 0.08885656297206879, + "learning_rate": 4.988423688949918e-05, + "loss": 0.0661, + "num_input_tokens_seen": 4894304, + "step": 23195 + }, + { + "epoch": 2.5522552255225524, + "grad_norm": 0.2814825773239136, + "learning_rate": 4.988400607341645e-05, + "loss": 0.0168, + "num_input_tokens_seen": 4895328, + "step": 23200 + }, + { + "epoch": 2.5528052805280526, + "grad_norm": 0.49488574266433716, + "learning_rate": 4.9883775027989773e-05, + "loss": 0.0202, + "num_input_tokens_seen": 4896384, + "step": 23205 + }, + { + "epoch": 2.553355335533553, + "grad_norm": 0.2939589023590088, + "learning_rate": 4.988354375322128e-05, + "loss": 0.0288, + "num_input_tokens_seen": 4897408, + "step": 23210 + }, + { + "epoch": 2.553905390539054, + "grad_norm": 0.034429747611284256, + "learning_rate": 4.9883312249113105e-05, + "loss": 0.1035, + "num_input_tokens_seen": 4898400, + "step": 23215 + }, + { + "epoch": 2.5544554455445545, + "grad_norm": 0.8703551888465881, + "learning_rate": 4.988308051566738e-05, + "loss": 0.0546, + "num_input_tokens_seen": 4899424, + "step": 23220 + }, + { + "epoch": 2.555005500550055, + "grad_norm": 0.056167274713516235, + "learning_rate": 4.988284855288625e-05, + "loss": 0.1005, + "num_input_tokens_seen": 4900512, + "step": 23225 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.04054336994886398, + "learning_rate": 4.988261636077184e-05, + "loss": 0.0405, + "num_input_tokens_seen": 4901632, + "step": 23230 + }, + { + "epoch": 2.556105610561056, + "grad_norm": 0.12475841492414474, + "learning_rate": 4.988238393932629e-05, + "loss": 0.04, + "num_input_tokens_seen": 4902624, + "step": 23235 + }, + { + "epoch": 2.5566556655665567, + "grad_norm": 0.020917478948831558, + "learning_rate": 4.988215128855175e-05, + "loss": 0.0538, + "num_input_tokens_seen": 4903648, + "step": 23240 + }, + { + "epoch": 2.5572057205720573, + "grad_norm": 0.23465539515018463, + "learning_rate": 4.9881918408450356e-05, + "loss": 0.097, + "num_input_tokens_seen": 4904768, + "step": 23245 + }, + { + "epoch": 2.557755775577558, + "grad_norm": 0.21834050118923187, + "learning_rate": 4.988168529902427e-05, + "loss": 0.0643, + "num_input_tokens_seen": 4905856, + "step": 23250 + }, + { + "epoch": 2.558305830583058, + "grad_norm": 0.2872485816478729, + "learning_rate": 4.988145196027563e-05, + "loss": 0.0828, + "num_input_tokens_seen": 4906944, + "step": 23255 + }, + { + "epoch": 2.558855885588559, + "grad_norm": 0.19935289025306702, + "learning_rate": 4.9881218392206574e-05, + "loss": 0.0558, + "num_input_tokens_seen": 4908064, + "step": 23260 + }, + { + "epoch": 2.5594059405940595, + "grad_norm": 0.08769793808460236, + "learning_rate": 4.988098459481928e-05, + "loss": 0.0336, + "num_input_tokens_seen": 4909120, + "step": 23265 + }, + { + "epoch": 2.55995599559956, + "grad_norm": 0.032935623079538345, + "learning_rate": 4.988075056811589e-05, + "loss": 0.0607, + "num_input_tokens_seen": 4910112, + "step": 23270 + }, + { + "epoch": 2.5605060506050608, + "grad_norm": 0.16117410361766815, + "learning_rate": 4.988051631209855e-05, + "loss": 0.0628, + "num_input_tokens_seen": 4911168, + "step": 23275 + }, + { + "epoch": 2.561056105610561, + "grad_norm": 3.6410953998565674, + "learning_rate": 4.988028182676944e-05, + "loss": 0.0765, + "num_input_tokens_seen": 4912320, + "step": 23280 + }, + { + "epoch": 2.5616061606160616, + "grad_norm": 0.20381926000118256, + "learning_rate": 4.98800471121307e-05, + "loss": 0.1276, + "num_input_tokens_seen": 4913344, + "step": 23285 + }, + { + "epoch": 2.5621562156215623, + "grad_norm": 0.14068837463855743, + "learning_rate": 4.987981216818451e-05, + "loss": 0.0221, + "num_input_tokens_seen": 4914432, + "step": 23290 + }, + { + "epoch": 2.5627062706270625, + "grad_norm": 0.2845796048641205, + "learning_rate": 4.9879576994933035e-05, + "loss": 0.0558, + "num_input_tokens_seen": 4915488, + "step": 23295 + }, + { + "epoch": 2.563256325632563, + "grad_norm": 2.310079336166382, + "learning_rate": 4.9879341592378435e-05, + "loss": 0.1168, + "num_input_tokens_seen": 4916512, + "step": 23300 + }, + { + "epoch": 2.5638063806380638, + "grad_norm": 0.21191023290157318, + "learning_rate": 4.9879105960522875e-05, + "loss": 0.0503, + "num_input_tokens_seen": 4917568, + "step": 23305 + }, + { + "epoch": 2.5643564356435644, + "grad_norm": 0.26752981543540955, + "learning_rate": 4.9878870099368544e-05, + "loss": 0.0782, + "num_input_tokens_seen": 4918624, + "step": 23310 + }, + { + "epoch": 2.564906490649065, + "grad_norm": 0.5602541565895081, + "learning_rate": 4.9878634008917604e-05, + "loss": 0.1089, + "num_input_tokens_seen": 4919648, + "step": 23315 + }, + { + "epoch": 2.5654565456545653, + "grad_norm": 0.4942129850387573, + "learning_rate": 4.987839768917222e-05, + "loss": 0.1314, + "num_input_tokens_seen": 4920672, + "step": 23320 + }, + { + "epoch": 2.566006600660066, + "grad_norm": 0.19220088422298431, + "learning_rate": 4.9878161140134594e-05, + "loss": 0.0989, + "num_input_tokens_seen": 4921760, + "step": 23325 + }, + { + "epoch": 2.5665566556655666, + "grad_norm": 1.4244016408920288, + "learning_rate": 4.9877924361806896e-05, + "loss": 0.0946, + "num_input_tokens_seen": 4922848, + "step": 23330 + }, + { + "epoch": 2.567106710671067, + "grad_norm": 0.0691041350364685, + "learning_rate": 4.9877687354191295e-05, + "loss": 0.1639, + "num_input_tokens_seen": 4923904, + "step": 23335 + }, + { + "epoch": 2.567656765676568, + "grad_norm": 0.12158837914466858, + "learning_rate": 4.9877450117289995e-05, + "loss": 0.0399, + "num_input_tokens_seen": 4924960, + "step": 23340 + }, + { + "epoch": 2.568206820682068, + "grad_norm": 1.2831915616989136, + "learning_rate": 4.987721265110518e-05, + "loss": 0.0726, + "num_input_tokens_seen": 4926016, + "step": 23345 + }, + { + "epoch": 2.5687568756875687, + "grad_norm": 0.26628366112709045, + "learning_rate": 4.987697495563902e-05, + "loss": 0.0222, + "num_input_tokens_seen": 4927040, + "step": 23350 + }, + { + "epoch": 2.5693069306930694, + "grad_norm": 0.18590644001960754, + "learning_rate": 4.987673703089373e-05, + "loss": 0.1484, + "num_input_tokens_seen": 4928128, + "step": 23355 + }, + { + "epoch": 2.56985698569857, + "grad_norm": 0.31492429971694946, + "learning_rate": 4.987649887687149e-05, + "loss": 0.0674, + "num_input_tokens_seen": 4929184, + "step": 23360 + }, + { + "epoch": 2.5704070407040707, + "grad_norm": 0.15818171203136444, + "learning_rate": 4.987626049357449e-05, + "loss": 0.0176, + "num_input_tokens_seen": 4930144, + "step": 23365 + }, + { + "epoch": 2.570957095709571, + "grad_norm": 0.2144562304019928, + "learning_rate": 4.9876021881004944e-05, + "loss": 0.0199, + "num_input_tokens_seen": 4931104, + "step": 23370 + }, + { + "epoch": 2.5715071507150715, + "grad_norm": 0.027436435222625732, + "learning_rate": 4.987578303916503e-05, + "loss": 0.1786, + "num_input_tokens_seen": 4932160, + "step": 23375 + }, + { + "epoch": 2.572057205720572, + "grad_norm": 0.3381181061267853, + "learning_rate": 4.9875543968056965e-05, + "loss": 0.0455, + "num_input_tokens_seen": 4933184, + "step": 23380 + }, + { + "epoch": 2.5726072607260724, + "grad_norm": 0.992914617061615, + "learning_rate": 4.987530466768295e-05, + "loss": 0.0543, + "num_input_tokens_seen": 4934240, + "step": 23385 + }, + { + "epoch": 2.573157315731573, + "grad_norm": 0.575912356376648, + "learning_rate": 4.9875065138045185e-05, + "loss": 0.0797, + "num_input_tokens_seen": 4935296, + "step": 23390 + }, + { + "epoch": 2.5737073707370737, + "grad_norm": 0.11761137843132019, + "learning_rate": 4.987482537914589e-05, + "loss": 0.0528, + "num_input_tokens_seen": 4936320, + "step": 23395 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 1.2403161525726318, + "learning_rate": 4.987458539098725e-05, + "loss": 0.1739, + "num_input_tokens_seen": 4937312, + "step": 23400 + }, + { + "epoch": 2.574807480748075, + "grad_norm": 0.29857149720191956, + "learning_rate": 4.98743451735715e-05, + "loss": 0.0815, + "num_input_tokens_seen": 4938368, + "step": 23405 + }, + { + "epoch": 2.575357535753575, + "grad_norm": 0.06659875810146332, + "learning_rate": 4.987410472690085e-05, + "loss": 0.0885, + "num_input_tokens_seen": 4939424, + "step": 23410 + }, + { + "epoch": 2.575907590759076, + "grad_norm": 0.2502187490463257, + "learning_rate": 4.9873864050977504e-05, + "loss": 0.0633, + "num_input_tokens_seen": 4940480, + "step": 23415 + }, + { + "epoch": 2.5764576457645765, + "grad_norm": 0.13393718004226685, + "learning_rate": 4.98736231458037e-05, + "loss": 0.1843, + "num_input_tokens_seen": 4941504, + "step": 23420 + }, + { + "epoch": 2.577007700770077, + "grad_norm": 1.3379870653152466, + "learning_rate": 4.987338201138164e-05, + "loss": 0.1024, + "num_input_tokens_seen": 4942592, + "step": 23425 + }, + { + "epoch": 2.5775577557755778, + "grad_norm": 0.48119673132896423, + "learning_rate": 4.987314064771356e-05, + "loss": 0.0644, + "num_input_tokens_seen": 4943680, + "step": 23430 + }, + { + "epoch": 2.578107810781078, + "grad_norm": 0.936862587928772, + "learning_rate": 4.987289905480166e-05, + "loss": 0.0708, + "num_input_tokens_seen": 4944768, + "step": 23435 + }, + { + "epoch": 2.5786578657865786, + "grad_norm": 0.16759251058101654, + "learning_rate": 4.98726572326482e-05, + "loss": 0.0967, + "num_input_tokens_seen": 4945888, + "step": 23440 + }, + { + "epoch": 2.5792079207920793, + "grad_norm": 1.0928194522857666, + "learning_rate": 4.9872415181255396e-05, + "loss": 0.0451, + "num_input_tokens_seen": 4946912, + "step": 23445 + }, + { + "epoch": 2.5797579757975795, + "grad_norm": 0.5621097683906555, + "learning_rate": 4.987217290062547e-05, + "loss": 0.0657, + "num_input_tokens_seen": 4948032, + "step": 23450 + }, + { + "epoch": 2.5803080308030806, + "grad_norm": 0.35444971919059753, + "learning_rate": 4.9871930390760654e-05, + "loss": 0.0469, + "num_input_tokens_seen": 4949056, + "step": 23455 + }, + { + "epoch": 2.580858085808581, + "grad_norm": 0.010412176139652729, + "learning_rate": 4.98716876516632e-05, + "loss": 0.0481, + "num_input_tokens_seen": 4950080, + "step": 23460 + }, + { + "epoch": 2.5814081408140814, + "grad_norm": 0.1634100079536438, + "learning_rate": 4.987144468333533e-05, + "loss": 0.0341, + "num_input_tokens_seen": 4951136, + "step": 23465 + }, + { + "epoch": 2.581958195819582, + "grad_norm": 1.208444356918335, + "learning_rate": 4.98712014857793e-05, + "loss": 0.1, + "num_input_tokens_seen": 4952160, + "step": 23470 + }, + { + "epoch": 2.5825082508250823, + "grad_norm": 0.23541013896465302, + "learning_rate": 4.987095805899733e-05, + "loss": 0.1332, + "num_input_tokens_seen": 4953216, + "step": 23475 + }, + { + "epoch": 2.583058305830583, + "grad_norm": 1.0655730962753296, + "learning_rate": 4.987071440299168e-05, + "loss": 0.1157, + "num_input_tokens_seen": 4954240, + "step": 23480 + }, + { + "epoch": 2.5836083608360836, + "grad_norm": 0.11691100895404816, + "learning_rate": 4.9870470517764584e-05, + "loss": 0.0355, + "num_input_tokens_seen": 4955328, + "step": 23485 + }, + { + "epoch": 2.5841584158415842, + "grad_norm": 0.5080633163452148, + "learning_rate": 4.9870226403318296e-05, + "loss": 0.0936, + "num_input_tokens_seen": 4956384, + "step": 23490 + }, + { + "epoch": 2.584708470847085, + "grad_norm": 0.4386175572872162, + "learning_rate": 4.986998205965506e-05, + "loss": 0.1657, + "num_input_tokens_seen": 4957440, + "step": 23495 + }, + { + "epoch": 2.585258525852585, + "grad_norm": 0.9518478512763977, + "learning_rate": 4.9869737486777145e-05, + "loss": 0.1295, + "num_input_tokens_seen": 4958496, + "step": 23500 + }, + { + "epoch": 2.5858085808580857, + "grad_norm": 0.621599555015564, + "learning_rate": 4.986949268468679e-05, + "loss": 0.1515, + "num_input_tokens_seen": 4959552, + "step": 23505 + }, + { + "epoch": 2.5863586358635864, + "grad_norm": 1.3484992980957031, + "learning_rate": 4.9869247653386256e-05, + "loss": 0.0534, + "num_input_tokens_seen": 4960640, + "step": 23510 + }, + { + "epoch": 2.586908690869087, + "grad_norm": 0.40814533829689026, + "learning_rate": 4.9869002392877786e-05, + "loss": 0.0413, + "num_input_tokens_seen": 4961792, + "step": 23515 + }, + { + "epoch": 2.5874587458745877, + "grad_norm": 0.7030791640281677, + "learning_rate": 4.986875690316367e-05, + "loss": 0.0587, + "num_input_tokens_seen": 4962816, + "step": 23520 + }, + { + "epoch": 2.588008800880088, + "grad_norm": 0.03152547776699066, + "learning_rate": 4.986851118424615e-05, + "loss": 0.0312, + "num_input_tokens_seen": 4963872, + "step": 23525 + }, + { + "epoch": 2.5885588558855885, + "grad_norm": 0.7715482115745544, + "learning_rate": 4.986826523612749e-05, + "loss": 0.0633, + "num_input_tokens_seen": 4964928, + "step": 23530 + }, + { + "epoch": 2.589108910891089, + "grad_norm": 0.9972863793373108, + "learning_rate": 4.986801905880997e-05, + "loss": 0.1502, + "num_input_tokens_seen": 4965952, + "step": 23535 + }, + { + "epoch": 2.5896589658965894, + "grad_norm": 0.22065626084804535, + "learning_rate": 4.9867772652295845e-05, + "loss": 0.0236, + "num_input_tokens_seen": 4967008, + "step": 23540 + }, + { + "epoch": 2.5902090209020905, + "grad_norm": 1.0537135601043701, + "learning_rate": 4.986752601658739e-05, + "loss": 0.0439, + "num_input_tokens_seen": 4968128, + "step": 23545 + }, + { + "epoch": 2.5907590759075907, + "grad_norm": 0.8950979113578796, + "learning_rate": 4.986727915168689e-05, + "loss": 0.0639, + "num_input_tokens_seen": 4969152, + "step": 23550 + }, + { + "epoch": 2.5913091309130913, + "grad_norm": 1.6651785373687744, + "learning_rate": 4.986703205759661e-05, + "loss": 0.1161, + "num_input_tokens_seen": 4970208, + "step": 23555 + }, + { + "epoch": 2.591859185918592, + "grad_norm": 0.38171806931495667, + "learning_rate": 4.986678473431882e-05, + "loss": 0.0209, + "num_input_tokens_seen": 4971232, + "step": 23560 + }, + { + "epoch": 2.592409240924092, + "grad_norm": 0.2480623573064804, + "learning_rate": 4.986653718185581e-05, + "loss": 0.0447, + "num_input_tokens_seen": 4972256, + "step": 23565 + }, + { + "epoch": 2.592959295929593, + "grad_norm": 0.20395079255104065, + "learning_rate": 4.9866289400209865e-05, + "loss": 0.0207, + "num_input_tokens_seen": 4973312, + "step": 23570 + }, + { + "epoch": 2.5935093509350935, + "grad_norm": 0.047721292823553085, + "learning_rate": 4.9866041389383254e-05, + "loss": 0.0483, + "num_input_tokens_seen": 4974304, + "step": 23575 + }, + { + "epoch": 2.594059405940594, + "grad_norm": 0.13700439035892487, + "learning_rate": 4.986579314937828e-05, + "loss": 0.0073, + "num_input_tokens_seen": 4975360, + "step": 23580 + }, + { + "epoch": 2.594609460946095, + "grad_norm": 0.41786715388298035, + "learning_rate": 4.986554468019722e-05, + "loss": 0.137, + "num_input_tokens_seen": 4976416, + "step": 23585 + }, + { + "epoch": 2.595159515951595, + "grad_norm": 0.5856755375862122, + "learning_rate": 4.986529598184236e-05, + "loss": 0.0515, + "num_input_tokens_seen": 4977408, + "step": 23590 + }, + { + "epoch": 2.5957095709570956, + "grad_norm": 0.10544732213020325, + "learning_rate": 4.9865047054316e-05, + "loss": 0.0313, + "num_input_tokens_seen": 4978528, + "step": 23595 + }, + { + "epoch": 2.5962596259625963, + "grad_norm": 0.1553480327129364, + "learning_rate": 4.9864797897620434e-05, + "loss": 0.0639, + "num_input_tokens_seen": 4979584, + "step": 23600 + }, + { + "epoch": 2.596809680968097, + "grad_norm": 0.17355996370315552, + "learning_rate": 4.9864548511757955e-05, + "loss": 0.0191, + "num_input_tokens_seen": 4980672, + "step": 23605 + }, + { + "epoch": 2.5973597359735976, + "grad_norm": 1.2957217693328857, + "learning_rate": 4.986429889673087e-05, + "loss": 0.0769, + "num_input_tokens_seen": 4981696, + "step": 23610 + }, + { + "epoch": 2.597909790979098, + "grad_norm": 0.3579069972038269, + "learning_rate": 4.986404905254147e-05, + "loss": 0.0803, + "num_input_tokens_seen": 4982720, + "step": 23615 + }, + { + "epoch": 2.5984598459845984, + "grad_norm": 0.10152982920408249, + "learning_rate": 4.986379897919206e-05, + "loss": 0.0292, + "num_input_tokens_seen": 4983744, + "step": 23620 + }, + { + "epoch": 2.599009900990099, + "grad_norm": 0.430396169424057, + "learning_rate": 4.9863548676684944e-05, + "loss": 0.0953, + "num_input_tokens_seen": 4984864, + "step": 23625 + }, + { + "epoch": 2.5995599559955993, + "grad_norm": 0.04125220328569412, + "learning_rate": 4.986329814502244e-05, + "loss": 0.0284, + "num_input_tokens_seen": 4985984, + "step": 23630 + }, + { + "epoch": 2.6001100110011, + "grad_norm": 0.15198461711406708, + "learning_rate": 4.9863047384206835e-05, + "loss": 0.0392, + "num_input_tokens_seen": 4987104, + "step": 23635 + }, + { + "epoch": 2.6006600660066006, + "grad_norm": 0.035666681826114655, + "learning_rate": 4.9862796394240456e-05, + "loss": 0.0259, + "num_input_tokens_seen": 4988160, + "step": 23640 + }, + { + "epoch": 2.6012101210121013, + "grad_norm": 1.0542181730270386, + "learning_rate": 4.9862545175125616e-05, + "loss": 0.0858, + "num_input_tokens_seen": 4989280, + "step": 23645 + }, + { + "epoch": 2.601760176017602, + "grad_norm": 0.07434539496898651, + "learning_rate": 4.9862293726864626e-05, + "loss": 0.0285, + "num_input_tokens_seen": 4990368, + "step": 23650 + }, + { + "epoch": 2.602310231023102, + "grad_norm": 0.03224101662635803, + "learning_rate": 4.9862042049459805e-05, + "loss": 0.0158, + "num_input_tokens_seen": 4991456, + "step": 23655 + }, + { + "epoch": 2.6028602860286028, + "grad_norm": 0.2931489944458008, + "learning_rate": 4.9861790142913476e-05, + "loss": 0.1158, + "num_input_tokens_seen": 4992544, + "step": 23660 + }, + { + "epoch": 2.6034103410341034, + "grad_norm": 0.5799564123153687, + "learning_rate": 4.986153800722795e-05, + "loss": 0.0524, + "num_input_tokens_seen": 4993600, + "step": 23665 + }, + { + "epoch": 2.603960396039604, + "grad_norm": 0.266825407743454, + "learning_rate": 4.9861285642405564e-05, + "loss": 0.1283, + "num_input_tokens_seen": 4994656, + "step": 23670 + }, + { + "epoch": 2.6045104510451047, + "grad_norm": 0.14049425721168518, + "learning_rate": 4.986103304844864e-05, + "loss": 0.1033, + "num_input_tokens_seen": 4995776, + "step": 23675 + }, + { + "epoch": 2.605060506050605, + "grad_norm": 0.7709000706672668, + "learning_rate": 4.986078022535949e-05, + "loss": 0.0666, + "num_input_tokens_seen": 4996800, + "step": 23680 + }, + { + "epoch": 2.6056105610561056, + "grad_norm": 0.15101094543933868, + "learning_rate": 4.986052717314047e-05, + "loss": 0.0695, + "num_input_tokens_seen": 4997824, + "step": 23685 + }, + { + "epoch": 2.606160616061606, + "grad_norm": 0.03548382595181465, + "learning_rate": 4.9860273891793896e-05, + "loss": 0.1188, + "num_input_tokens_seen": 4998880, + "step": 23690 + }, + { + "epoch": 2.606710671067107, + "grad_norm": 0.7534456849098206, + "learning_rate": 4.986002038132211e-05, + "loss": 0.036, + "num_input_tokens_seen": 4999904, + "step": 23695 + }, + { + "epoch": 2.6072607260726075, + "grad_norm": 0.6713078618049622, + "learning_rate": 4.985976664172745e-05, + "loss": 0.116, + "num_input_tokens_seen": 5000928, + "step": 23700 + }, + { + "epoch": 2.6078107810781077, + "grad_norm": 0.018131423741579056, + "learning_rate": 4.9859512673012236e-05, + "loss": 0.1468, + "num_input_tokens_seen": 5001952, + "step": 23705 + }, + { + "epoch": 2.6083608360836084, + "grad_norm": 1.3741768598556519, + "learning_rate": 4.9859258475178826e-05, + "loss": 0.0963, + "num_input_tokens_seen": 5003008, + "step": 23710 + }, + { + "epoch": 2.608910891089109, + "grad_norm": 0.02175314724445343, + "learning_rate": 4.9859004048229564e-05, + "loss": 0.0578, + "num_input_tokens_seen": 5004064, + "step": 23715 + }, + { + "epoch": 2.609460946094609, + "grad_norm": 0.02790878340601921, + "learning_rate": 4.985874939216679e-05, + "loss": 0.0659, + "num_input_tokens_seen": 5005088, + "step": 23720 + }, + { + "epoch": 2.61001100110011, + "grad_norm": 0.2721906304359436, + "learning_rate": 4.9858494506992845e-05, + "loss": 0.0342, + "num_input_tokens_seen": 5006144, + "step": 23725 + }, + { + "epoch": 2.6105610561056105, + "grad_norm": 0.6252448558807373, + "learning_rate": 4.985823939271009e-05, + "loss": 0.0504, + "num_input_tokens_seen": 5007200, + "step": 23730 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 1.326634168624878, + "learning_rate": 4.985798404932088e-05, + "loss": 0.3178, + "num_input_tokens_seen": 5008288, + "step": 23735 + }, + { + "epoch": 2.611661166116612, + "grad_norm": 0.8515278100967407, + "learning_rate": 4.985772847682755e-05, + "loss": 0.1682, + "num_input_tokens_seen": 5009344, + "step": 23740 + }, + { + "epoch": 2.612211221122112, + "grad_norm": 0.19316919147968292, + "learning_rate": 4.985747267523246e-05, + "loss": 0.0881, + "num_input_tokens_seen": 5010400, + "step": 23745 + }, + { + "epoch": 2.6127612761276127, + "grad_norm": 0.011115416884422302, + "learning_rate": 4.9857216644537976e-05, + "loss": 0.028, + "num_input_tokens_seen": 5011520, + "step": 23750 + }, + { + "epoch": 2.6133113311331133, + "grad_norm": 0.5943436622619629, + "learning_rate": 4.985696038474645e-05, + "loss": 0.1044, + "num_input_tokens_seen": 5012576, + "step": 23755 + }, + { + "epoch": 2.613861386138614, + "grad_norm": 0.26922935247421265, + "learning_rate": 4.985670389586026e-05, + "loss": 0.0419, + "num_input_tokens_seen": 5013664, + "step": 23760 + }, + { + "epoch": 2.6144114411441146, + "grad_norm": 0.9079360961914062, + "learning_rate": 4.9856447177881745e-05, + "loss": 0.0504, + "num_input_tokens_seen": 5014688, + "step": 23765 + }, + { + "epoch": 2.614961496149615, + "grad_norm": 0.16602344810962677, + "learning_rate": 4.9856190230813285e-05, + "loss": 0.1593, + "num_input_tokens_seen": 5015776, + "step": 23770 + }, + { + "epoch": 2.6155115511551155, + "grad_norm": 0.35515502095222473, + "learning_rate": 4.985593305465725e-05, + "loss": 0.0439, + "num_input_tokens_seen": 5016864, + "step": 23775 + }, + { + "epoch": 2.616061606160616, + "grad_norm": 0.33098486065864563, + "learning_rate": 4.9855675649416e-05, + "loss": 0.0256, + "num_input_tokens_seen": 5017888, + "step": 23780 + }, + { + "epoch": 2.6166116611661168, + "grad_norm": 0.1304221749305725, + "learning_rate": 4.9855418015091924e-05, + "loss": 0.1392, + "num_input_tokens_seen": 5018944, + "step": 23785 + }, + { + "epoch": 2.6171617161716174, + "grad_norm": 0.19359783828258514, + "learning_rate": 4.9855160151687386e-05, + "loss": 0.1175, + "num_input_tokens_seen": 5020000, + "step": 23790 + }, + { + "epoch": 2.6177117711771176, + "grad_norm": 0.2591223120689392, + "learning_rate": 4.9854902059204765e-05, + "loss": 0.0895, + "num_input_tokens_seen": 5021088, + "step": 23795 + }, + { + "epoch": 2.6182618261826183, + "grad_norm": 0.06935928761959076, + "learning_rate": 4.9854643737646434e-05, + "loss": 0.0123, + "num_input_tokens_seen": 5022112, + "step": 23800 + }, + { + "epoch": 2.618811881188119, + "grad_norm": 0.08055760711431503, + "learning_rate": 4.985438518701477e-05, + "loss": 0.1009, + "num_input_tokens_seen": 5023168, + "step": 23805 + }, + { + "epoch": 2.619361936193619, + "grad_norm": 1.1554195880889893, + "learning_rate": 4.9854126407312176e-05, + "loss": 0.1354, + "num_input_tokens_seen": 5024224, + "step": 23810 + }, + { + "epoch": 2.6199119911991198, + "grad_norm": 0.305435448884964, + "learning_rate": 4.9853867398541026e-05, + "loss": 0.045, + "num_input_tokens_seen": 5025312, + "step": 23815 + }, + { + "epoch": 2.6204620462046204, + "grad_norm": 0.11641760170459747, + "learning_rate": 4.98536081607037e-05, + "loss": 0.0283, + "num_input_tokens_seen": 5026336, + "step": 23820 + }, + { + "epoch": 2.621012101210121, + "grad_norm": 0.8037834167480469, + "learning_rate": 4.98533486938026e-05, + "loss": 0.0492, + "num_input_tokens_seen": 5027424, + "step": 23825 + }, + { + "epoch": 2.6215621562156217, + "grad_norm": 0.14374575018882751, + "learning_rate": 4.98530889978401e-05, + "loss": 0.0234, + "num_input_tokens_seen": 5028448, + "step": 23830 + }, + { + "epoch": 2.622112211221122, + "grad_norm": 0.15689238905906677, + "learning_rate": 4.985282907281862e-05, + "loss": 0.0665, + "num_input_tokens_seen": 5029504, + "step": 23835 + }, + { + "epoch": 2.6226622662266226, + "grad_norm": 0.34116607904434204, + "learning_rate": 4.985256891874053e-05, + "loss": 0.0636, + "num_input_tokens_seen": 5030528, + "step": 23840 + }, + { + "epoch": 2.6232123212321232, + "grad_norm": 0.6485015749931335, + "learning_rate": 4.985230853560824e-05, + "loss": 0.1352, + "num_input_tokens_seen": 5031584, + "step": 23845 + }, + { + "epoch": 2.623762376237624, + "grad_norm": 0.0901193767786026, + "learning_rate": 4.9852047923424137e-05, + "loss": 0.0505, + "num_input_tokens_seen": 5032608, + "step": 23850 + }, + { + "epoch": 2.6243124312431245, + "grad_norm": 0.0946902483701706, + "learning_rate": 4.985178708219065e-05, + "loss": 0.0352, + "num_input_tokens_seen": 5033792, + "step": 23855 + }, + { + "epoch": 2.6248624862486247, + "grad_norm": 1.8329206705093384, + "learning_rate": 4.985152601191016e-05, + "loss": 0.0563, + "num_input_tokens_seen": 5034816, + "step": 23860 + }, + { + "epoch": 2.6254125412541254, + "grad_norm": 0.8631324172019958, + "learning_rate": 4.985126471258508e-05, + "loss": 0.0572, + "num_input_tokens_seen": 5035872, + "step": 23865 + }, + { + "epoch": 2.625962596259626, + "grad_norm": 1.789557695388794, + "learning_rate": 4.985100318421781e-05, + "loss": 0.1338, + "num_input_tokens_seen": 5036960, + "step": 23870 + }, + { + "epoch": 2.6265126512651267, + "grad_norm": 1.2913358211517334, + "learning_rate": 4.9850741426810774e-05, + "loss": 0.1519, + "num_input_tokens_seen": 5037952, + "step": 23875 + }, + { + "epoch": 2.6270627062706273, + "grad_norm": 0.14695297181606293, + "learning_rate": 4.985047944036638e-05, + "loss": 0.0353, + "num_input_tokens_seen": 5039040, + "step": 23880 + }, + { + "epoch": 2.6276127612761275, + "grad_norm": 0.31705421209335327, + "learning_rate": 4.985021722488704e-05, + "loss": 0.0359, + "num_input_tokens_seen": 5040032, + "step": 23885 + }, + { + "epoch": 2.628162816281628, + "grad_norm": 0.9481049180030823, + "learning_rate": 4.9849954780375174e-05, + "loss": 0.0569, + "num_input_tokens_seen": 5041056, + "step": 23890 + }, + { + "epoch": 2.628712871287129, + "grad_norm": 0.4729142487049103, + "learning_rate": 4.98496921068332e-05, + "loss": 0.0982, + "num_input_tokens_seen": 5042112, + "step": 23895 + }, + { + "epoch": 2.629262926292629, + "grad_norm": 0.8715692162513733, + "learning_rate": 4.984942920426354e-05, + "loss": 0.0787, + "num_input_tokens_seen": 5043136, + "step": 23900 + }, + { + "epoch": 2.6298129812981297, + "grad_norm": 0.12183018773794174, + "learning_rate": 4.984916607266861e-05, + "loss": 0.0881, + "num_input_tokens_seen": 5044224, + "step": 23905 + }, + { + "epoch": 2.6303630363036303, + "grad_norm": 0.21551892161369324, + "learning_rate": 4.9848902712050836e-05, + "loss": 0.0307, + "num_input_tokens_seen": 5045344, + "step": 23910 + }, + { + "epoch": 2.630913091309131, + "grad_norm": 0.9212315678596497, + "learning_rate": 4.9848639122412655e-05, + "loss": 0.0719, + "num_input_tokens_seen": 5046368, + "step": 23915 + }, + { + "epoch": 2.6314631463146316, + "grad_norm": 2.206963539123535, + "learning_rate": 4.984837530375649e-05, + "loss": 0.0983, + "num_input_tokens_seen": 5047488, + "step": 23920 + }, + { + "epoch": 2.632013201320132, + "grad_norm": 0.396982878446579, + "learning_rate": 4.984811125608477e-05, + "loss": 0.0938, + "num_input_tokens_seen": 5048512, + "step": 23925 + }, + { + "epoch": 2.6325632563256325, + "grad_norm": 0.15077820420265198, + "learning_rate": 4.9847846979399945e-05, + "loss": 0.0554, + "num_input_tokens_seen": 5049600, + "step": 23930 + }, + { + "epoch": 2.633113311331133, + "grad_norm": 0.3512171804904938, + "learning_rate": 4.984758247370443e-05, + "loss": 0.2616, + "num_input_tokens_seen": 5050624, + "step": 23935 + }, + { + "epoch": 2.633663366336634, + "grad_norm": 0.8399854898452759, + "learning_rate": 4.9847317739000664e-05, + "loss": 0.0612, + "num_input_tokens_seen": 5051776, + "step": 23940 + }, + { + "epoch": 2.6342134213421344, + "grad_norm": 0.5038120150566101, + "learning_rate": 4.98470527752911e-05, + "loss": 0.1112, + "num_input_tokens_seen": 5052800, + "step": 23945 + }, + { + "epoch": 2.6347634763476346, + "grad_norm": 0.5833993554115295, + "learning_rate": 4.984678758257817e-05, + "loss": 0.0269, + "num_input_tokens_seen": 5053856, + "step": 23950 + }, + { + "epoch": 2.6353135313531353, + "grad_norm": 0.03616360202431679, + "learning_rate": 4.9846522160864314e-05, + "loss": 0.0363, + "num_input_tokens_seen": 5054944, + "step": 23955 + }, + { + "epoch": 2.635863586358636, + "grad_norm": 1.6412702798843384, + "learning_rate": 4.9846256510152003e-05, + "loss": 0.1599, + "num_input_tokens_seen": 5056000, + "step": 23960 + }, + { + "epoch": 2.636413641364136, + "grad_norm": 0.23048634827136993, + "learning_rate": 4.984599063044366e-05, + "loss": 0.0314, + "num_input_tokens_seen": 5057088, + "step": 23965 + }, + { + "epoch": 2.6369636963696372, + "grad_norm": 0.05259552597999573, + "learning_rate": 4.984572452174174e-05, + "loss": 0.059, + "num_input_tokens_seen": 5058144, + "step": 23970 + }, + { + "epoch": 2.6375137513751374, + "grad_norm": 0.8566240072250366, + "learning_rate": 4.98454581840487e-05, + "loss": 0.0658, + "num_input_tokens_seen": 5059168, + "step": 23975 + }, + { + "epoch": 2.638063806380638, + "grad_norm": 1.3969719409942627, + "learning_rate": 4.9845191617367e-05, + "loss": 0.0983, + "num_input_tokens_seen": 5060160, + "step": 23980 + }, + { + "epoch": 2.6386138613861387, + "grad_norm": 0.7543598413467407, + "learning_rate": 4.9844924821699086e-05, + "loss": 0.0992, + "num_input_tokens_seen": 5061184, + "step": 23985 + }, + { + "epoch": 2.639163916391639, + "grad_norm": 0.019603488966822624, + "learning_rate": 4.984465779704742e-05, + "loss": 0.0228, + "num_input_tokens_seen": 5062272, + "step": 23990 + }, + { + "epoch": 2.6397139713971396, + "grad_norm": 0.06785782426595688, + "learning_rate": 4.984439054341447e-05, + "loss": 0.0628, + "num_input_tokens_seen": 5063424, + "step": 23995 + }, + { + "epoch": 2.6402640264026402, + "grad_norm": 0.22443798184394836, + "learning_rate": 4.98441230608027e-05, + "loss": 0.0504, + "num_input_tokens_seen": 5064480, + "step": 24000 + }, + { + "epoch": 2.640814081408141, + "grad_norm": 1.1912314891815186, + "learning_rate": 4.984385534921455e-05, + "loss": 0.0915, + "num_input_tokens_seen": 5065536, + "step": 24005 + }, + { + "epoch": 2.6413641364136415, + "grad_norm": 0.3638335168361664, + "learning_rate": 4.984358740865251e-05, + "loss": 0.0486, + "num_input_tokens_seen": 5066592, + "step": 24010 + }, + { + "epoch": 2.6419141914191417, + "grad_norm": 1.2882940769195557, + "learning_rate": 4.984331923911906e-05, + "loss": 0.0593, + "num_input_tokens_seen": 5067584, + "step": 24015 + }, + { + "epoch": 2.6424642464246424, + "grad_norm": 1.0464961528778076, + "learning_rate": 4.984305084061665e-05, + "loss": 0.1363, + "num_input_tokens_seen": 5068608, + "step": 24020 + }, + { + "epoch": 2.643014301430143, + "grad_norm": 0.8143844604492188, + "learning_rate": 4.9842782213147765e-05, + "loss": 0.166, + "num_input_tokens_seen": 5069632, + "step": 24025 + }, + { + "epoch": 2.6435643564356437, + "grad_norm": 0.06467599421739578, + "learning_rate": 4.984251335671487e-05, + "loss": 0.0531, + "num_input_tokens_seen": 5070720, + "step": 24030 + }, + { + "epoch": 2.6441144114411443, + "grad_norm": 0.15869832038879395, + "learning_rate": 4.984224427132045e-05, + "loss": 0.0223, + "num_input_tokens_seen": 5071776, + "step": 24035 + }, + { + "epoch": 2.6446644664466445, + "grad_norm": 0.8230555653572083, + "learning_rate": 4.9841974956967e-05, + "loss": 0.0772, + "num_input_tokens_seen": 5072832, + "step": 24040 + }, + { + "epoch": 2.645214521452145, + "grad_norm": 0.20661212503910065, + "learning_rate": 4.984170541365697e-05, + "loss": 0.0388, + "num_input_tokens_seen": 5073952, + "step": 24045 + }, + { + "epoch": 2.645764576457646, + "grad_norm": 0.10427214950323105, + "learning_rate": 4.9841435641392866e-05, + "loss": 0.0563, + "num_input_tokens_seen": 5075104, + "step": 24050 + }, + { + "epoch": 2.646314631463146, + "grad_norm": 0.4013477563858032, + "learning_rate": 4.9841165640177165e-05, + "loss": 0.0697, + "num_input_tokens_seen": 5076224, + "step": 24055 + }, + { + "epoch": 2.6468646864686467, + "grad_norm": 0.9280372858047485, + "learning_rate": 4.9840895410012375e-05, + "loss": 0.1125, + "num_input_tokens_seen": 5077280, + "step": 24060 + }, + { + "epoch": 2.6474147414741473, + "grad_norm": 0.09305144846439362, + "learning_rate": 4.984062495090096e-05, + "loss": 0.0337, + "num_input_tokens_seen": 5078336, + "step": 24065 + }, + { + "epoch": 2.647964796479648, + "grad_norm": 0.29000145196914673, + "learning_rate": 4.984035426284543e-05, + "loss": 0.0834, + "num_input_tokens_seen": 5079392, + "step": 24070 + }, + { + "epoch": 2.6485148514851486, + "grad_norm": 0.7915500998497009, + "learning_rate": 4.984008334584827e-05, + "loss": 0.1349, + "num_input_tokens_seen": 5080480, + "step": 24075 + }, + { + "epoch": 2.649064906490649, + "grad_norm": 0.04480546712875366, + "learning_rate": 4.983981219991198e-05, + "loss": 0.2407, + "num_input_tokens_seen": 5081472, + "step": 24080 + }, + { + "epoch": 2.6496149614961495, + "grad_norm": 0.09016527980566025, + "learning_rate": 4.9839540825039065e-05, + "loss": 0.0693, + "num_input_tokens_seen": 5082528, + "step": 24085 + }, + { + "epoch": 2.65016501650165, + "grad_norm": 0.06619328260421753, + "learning_rate": 4.983926922123202e-05, + "loss": 0.059, + "num_input_tokens_seen": 5083584, + "step": 24090 + }, + { + "epoch": 2.650715071507151, + "grad_norm": 0.5446596741676331, + "learning_rate": 4.983899738849335e-05, + "loss": 0.1272, + "num_input_tokens_seen": 5084608, + "step": 24095 + }, + { + "epoch": 2.6512651265126514, + "grad_norm": 1.392760992050171, + "learning_rate": 4.983872532682556e-05, + "loss": 0.0816, + "num_input_tokens_seen": 5085664, + "step": 24100 + }, + { + "epoch": 2.6518151815181517, + "grad_norm": 0.14232735335826874, + "learning_rate": 4.9838453036231155e-05, + "loss": 0.0701, + "num_input_tokens_seen": 5086720, + "step": 24105 + }, + { + "epoch": 2.6523652365236523, + "grad_norm": 0.4907209277153015, + "learning_rate": 4.9838180516712655e-05, + "loss": 0.0384, + "num_input_tokens_seen": 5087744, + "step": 24110 + }, + { + "epoch": 2.652915291529153, + "grad_norm": 0.187005415558815, + "learning_rate": 4.983790776827255e-05, + "loss": 0.0736, + "num_input_tokens_seen": 5088768, + "step": 24115 + }, + { + "epoch": 2.6534653465346536, + "grad_norm": 0.6441152095794678, + "learning_rate": 4.983763479091338e-05, + "loss": 0.1238, + "num_input_tokens_seen": 5089824, + "step": 24120 + }, + { + "epoch": 2.6540154015401543, + "grad_norm": 0.7471609711647034, + "learning_rate": 4.983736158463764e-05, + "loss": 0.0595, + "num_input_tokens_seen": 5090880, + "step": 24125 + }, + { + "epoch": 2.6545654565456545, + "grad_norm": 1.2571018934249878, + "learning_rate": 4.983708814944786e-05, + "loss": 0.1341, + "num_input_tokens_seen": 5091904, + "step": 24130 + }, + { + "epoch": 2.655115511551155, + "grad_norm": 0.03235140070319176, + "learning_rate": 4.983681448534656e-05, + "loss": 0.1058, + "num_input_tokens_seen": 5092928, + "step": 24135 + }, + { + "epoch": 2.6556655665566558, + "grad_norm": 0.1869356334209442, + "learning_rate": 4.983654059233626e-05, + "loss": 0.0472, + "num_input_tokens_seen": 5093952, + "step": 24140 + }, + { + "epoch": 2.656215621562156, + "grad_norm": 0.19541220366954803, + "learning_rate": 4.983626647041947e-05, + "loss": 0.0506, + "num_input_tokens_seen": 5094912, + "step": 24145 + }, + { + "epoch": 2.6567656765676566, + "grad_norm": 0.20339690148830414, + "learning_rate": 4.983599211959874e-05, + "loss": 0.0497, + "num_input_tokens_seen": 5095904, + "step": 24150 + }, + { + "epoch": 2.6573157315731573, + "grad_norm": 0.7289158701896667, + "learning_rate": 4.983571753987658e-05, + "loss": 0.0537, + "num_input_tokens_seen": 5096928, + "step": 24155 + }, + { + "epoch": 2.657865786578658, + "grad_norm": 0.10530450940132141, + "learning_rate": 4.983544273125554e-05, + "loss": 0.074, + "num_input_tokens_seen": 5098048, + "step": 24160 + }, + { + "epoch": 2.6584158415841586, + "grad_norm": 0.40667200088500977, + "learning_rate": 4.983516769373814e-05, + "loss": 0.0642, + "num_input_tokens_seen": 5099136, + "step": 24165 + }, + { + "epoch": 2.6589658965896588, + "grad_norm": 0.1864243894815445, + "learning_rate": 4.9834892427326907e-05, + "loss": 0.0266, + "num_input_tokens_seen": 5100192, + "step": 24170 + }, + { + "epoch": 2.6595159515951594, + "grad_norm": 0.6957809329032898, + "learning_rate": 4.983461693202439e-05, + "loss": 0.0415, + "num_input_tokens_seen": 5101312, + "step": 24175 + }, + { + "epoch": 2.66006600660066, + "grad_norm": 0.22284682095050812, + "learning_rate": 4.983434120783314e-05, + "loss": 0.2315, + "num_input_tokens_seen": 5102336, + "step": 24180 + }, + { + "epoch": 2.6606160616061607, + "grad_norm": 0.7238979339599609, + "learning_rate": 4.9834065254755664e-05, + "loss": 0.0601, + "num_input_tokens_seen": 5103424, + "step": 24185 + }, + { + "epoch": 2.6611661166116614, + "grad_norm": 1.1344666481018066, + "learning_rate": 4.9833789072794534e-05, + "loss": 0.0974, + "num_input_tokens_seen": 5104512, + "step": 24190 + }, + { + "epoch": 2.6617161716171616, + "grad_norm": 0.8363102674484253, + "learning_rate": 4.983351266195229e-05, + "loss": 0.0739, + "num_input_tokens_seen": 5105536, + "step": 24195 + }, + { + "epoch": 2.662266226622662, + "grad_norm": 0.1069042757153511, + "learning_rate": 4.9833236022231465e-05, + "loss": 0.0205, + "num_input_tokens_seen": 5106560, + "step": 24200 + }, + { + "epoch": 2.662816281628163, + "grad_norm": 1.3370201587677002, + "learning_rate": 4.983295915363463e-05, + "loss": 0.1303, + "num_input_tokens_seen": 5107616, + "step": 24205 + }, + { + "epoch": 2.6633663366336635, + "grad_norm": 0.020266272127628326, + "learning_rate": 4.983268205616433e-05, + "loss": 0.0485, + "num_input_tokens_seen": 5108672, + "step": 24210 + }, + { + "epoch": 2.663916391639164, + "grad_norm": 0.34538063406944275, + "learning_rate": 4.98324047298231e-05, + "loss": 0.0501, + "num_input_tokens_seen": 5109760, + "step": 24215 + }, + { + "epoch": 2.6644664466446644, + "grad_norm": 0.058934543281793594, + "learning_rate": 4.983212717461352e-05, + "loss": 0.0932, + "num_input_tokens_seen": 5110880, + "step": 24220 + }, + { + "epoch": 2.665016501650165, + "grad_norm": 0.06061768904328346, + "learning_rate": 4.983184939053814e-05, + "loss": 0.1024, + "num_input_tokens_seen": 5111904, + "step": 24225 + }, + { + "epoch": 2.6655665566556657, + "grad_norm": 0.055821020156145096, + "learning_rate": 4.9831571377599515e-05, + "loss": 0.1826, + "num_input_tokens_seen": 5112960, + "step": 24230 + }, + { + "epoch": 2.666116611661166, + "grad_norm": 0.04482755810022354, + "learning_rate": 4.983129313580022e-05, + "loss": 0.0582, + "num_input_tokens_seen": 5114112, + "step": 24235 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.21463221311569214, + "learning_rate": 4.98310146651428e-05, + "loss": 0.0763, + "num_input_tokens_seen": 5115168, + "step": 24240 + }, + { + "epoch": 2.667216721672167, + "grad_norm": 0.09073543548583984, + "learning_rate": 4.983073596562985e-05, + "loss": 0.0569, + "num_input_tokens_seen": 5116288, + "step": 24245 + }, + { + "epoch": 2.667766776677668, + "grad_norm": 0.40711885690689087, + "learning_rate": 4.983045703726391e-05, + "loss": 0.0558, + "num_input_tokens_seen": 5117312, + "step": 24250 + }, + { + "epoch": 2.6683168316831685, + "grad_norm": 0.23078864812850952, + "learning_rate": 4.983017788004756e-05, + "loss": 0.065, + "num_input_tokens_seen": 5118368, + "step": 24255 + }, + { + "epoch": 2.6688668866886687, + "grad_norm": 0.20415310561656952, + "learning_rate": 4.9829898493983384e-05, + "loss": 0.0205, + "num_input_tokens_seen": 5119488, + "step": 24260 + }, + { + "epoch": 2.6694169416941693, + "grad_norm": 0.6247220635414124, + "learning_rate": 4.9829618879073944e-05, + "loss": 0.1313, + "num_input_tokens_seen": 5120608, + "step": 24265 + }, + { + "epoch": 2.66996699669967, + "grad_norm": 0.18387748301029205, + "learning_rate": 4.9829339035321824e-05, + "loss": 0.054, + "num_input_tokens_seen": 5121664, + "step": 24270 + }, + { + "epoch": 2.6705170517051706, + "grad_norm": 1.480771780014038, + "learning_rate": 4.982905896272959e-05, + "loss": 0.1779, + "num_input_tokens_seen": 5122656, + "step": 24275 + }, + { + "epoch": 2.6710671067106713, + "grad_norm": 0.0799575075507164, + "learning_rate": 4.9828778661299845e-05, + "loss": 0.0303, + "num_input_tokens_seen": 5123712, + "step": 24280 + }, + { + "epoch": 2.6716171617161715, + "grad_norm": 0.365916907787323, + "learning_rate": 4.982849813103515e-05, + "loss": 0.0699, + "num_input_tokens_seen": 5124768, + "step": 24285 + }, + { + "epoch": 2.672167216721672, + "grad_norm": 2.2576446533203125, + "learning_rate": 4.9828217371938115e-05, + "loss": 0.0797, + "num_input_tokens_seen": 5125856, + "step": 24290 + }, + { + "epoch": 2.6727172717271728, + "grad_norm": 0.37485045194625854, + "learning_rate": 4.982793638401131e-05, + "loss": 0.0367, + "num_input_tokens_seen": 5126912, + "step": 24295 + }, + { + "epoch": 2.6732673267326734, + "grad_norm": 0.4903901517391205, + "learning_rate": 4.982765516725732e-05, + "loss": 0.0408, + "num_input_tokens_seen": 5127936, + "step": 24300 + }, + { + "epoch": 2.673817381738174, + "grad_norm": 0.08633644878864288, + "learning_rate": 4.9827373721678755e-05, + "loss": 0.018, + "num_input_tokens_seen": 5128992, + "step": 24305 + }, + { + "epoch": 2.6743674367436743, + "grad_norm": 1.1694337129592896, + "learning_rate": 4.9827092047278195e-05, + "loss": 0.0566, + "num_input_tokens_seen": 5130048, + "step": 24310 + }, + { + "epoch": 2.674917491749175, + "grad_norm": 0.10762513428926468, + "learning_rate": 4.9826810144058244e-05, + "loss": 0.0634, + "num_input_tokens_seen": 5131104, + "step": 24315 + }, + { + "epoch": 2.6754675467546756, + "grad_norm": 0.23186220228672028, + "learning_rate": 4.9826528012021496e-05, + "loss": 0.0539, + "num_input_tokens_seen": 5132224, + "step": 24320 + }, + { + "epoch": 2.676017601760176, + "grad_norm": 0.04901546984910965, + "learning_rate": 4.9826245651170555e-05, + "loss": 0.02, + "num_input_tokens_seen": 5133280, + "step": 24325 + }, + { + "epoch": 2.6765676567656764, + "grad_norm": 0.2087460309267044, + "learning_rate": 4.982596306150802e-05, + "loss": 0.1023, + "num_input_tokens_seen": 5134368, + "step": 24330 + }, + { + "epoch": 2.677117711771177, + "grad_norm": 0.531662106513977, + "learning_rate": 4.9825680243036485e-05, + "loss": 0.0418, + "num_input_tokens_seen": 5135360, + "step": 24335 + }, + { + "epoch": 2.6776677667766777, + "grad_norm": 0.28232455253601074, + "learning_rate": 4.982539719575858e-05, + "loss": 0.0293, + "num_input_tokens_seen": 5136480, + "step": 24340 + }, + { + "epoch": 2.6782178217821784, + "grad_norm": 0.32591456174850464, + "learning_rate": 4.98251139196769e-05, + "loss": 0.0661, + "num_input_tokens_seen": 5137472, + "step": 24345 + }, + { + "epoch": 2.6787678767876786, + "grad_norm": 0.12566624581813812, + "learning_rate": 4.9824830414794053e-05, + "loss": 0.0565, + "num_input_tokens_seen": 5138464, + "step": 24350 + }, + { + "epoch": 2.6793179317931792, + "grad_norm": 0.04360309988260269, + "learning_rate": 4.982454668111266e-05, + "loss": 0.1425, + "num_input_tokens_seen": 5139488, + "step": 24355 + }, + { + "epoch": 2.67986798679868, + "grad_norm": 0.6012257933616638, + "learning_rate": 4.982426271863533e-05, + "loss": 0.0324, + "num_input_tokens_seen": 5140512, + "step": 24360 + }, + { + "epoch": 2.6804180418041805, + "grad_norm": 0.15789049863815308, + "learning_rate": 4.982397852736468e-05, + "loss": 0.08, + "num_input_tokens_seen": 5141568, + "step": 24365 + }, + { + "epoch": 2.680968096809681, + "grad_norm": 1.2191416025161743, + "learning_rate": 4.982369410730334e-05, + "loss": 0.1632, + "num_input_tokens_seen": 5142624, + "step": 24370 + }, + { + "epoch": 2.6815181518151814, + "grad_norm": 0.904144823551178, + "learning_rate": 4.982340945845392e-05, + "loss": 0.0531, + "num_input_tokens_seen": 5143680, + "step": 24375 + }, + { + "epoch": 2.682068206820682, + "grad_norm": 0.9782782793045044, + "learning_rate": 4.9823124580819037e-05, + "loss": 0.0651, + "num_input_tokens_seen": 5144736, + "step": 24380 + }, + { + "epoch": 2.6826182618261827, + "grad_norm": 0.06830976903438568, + "learning_rate": 4.9822839474401334e-05, + "loss": 0.0459, + "num_input_tokens_seen": 5145792, + "step": 24385 + }, + { + "epoch": 2.6831683168316833, + "grad_norm": 0.045688942074775696, + "learning_rate": 4.982255413920343e-05, + "loss": 0.0314, + "num_input_tokens_seen": 5146816, + "step": 24390 + }, + { + "epoch": 2.683718371837184, + "grad_norm": 0.22908373177051544, + "learning_rate": 4.982226857522796e-05, + "loss": 0.1421, + "num_input_tokens_seen": 5147840, + "step": 24395 + }, + { + "epoch": 2.684268426842684, + "grad_norm": 0.15412423014640808, + "learning_rate": 4.982198278247754e-05, + "loss": 0.1289, + "num_input_tokens_seen": 5148896, + "step": 24400 + }, + { + "epoch": 2.684818481848185, + "grad_norm": 0.8785402774810791, + "learning_rate": 4.982169676095483e-05, + "loss": 0.2025, + "num_input_tokens_seen": 5149984, + "step": 24405 + }, + { + "epoch": 2.6853685368536855, + "grad_norm": 1.9918442964553833, + "learning_rate": 4.982141051066243e-05, + "loss": 0.0769, + "num_input_tokens_seen": 5150976, + "step": 24410 + }, + { + "epoch": 2.6859185918591857, + "grad_norm": 0.1779658943414688, + "learning_rate": 4.9821124031603025e-05, + "loss": 0.0346, + "num_input_tokens_seen": 5152032, + "step": 24415 + }, + { + "epoch": 2.6864686468646863, + "grad_norm": 0.07563143968582153, + "learning_rate": 4.9820837323779215e-05, + "loss": 0.0136, + "num_input_tokens_seen": 5153088, + "step": 24420 + }, + { + "epoch": 2.687018701870187, + "grad_norm": 1.6759098768234253, + "learning_rate": 4.9820550387193667e-05, + "loss": 0.198, + "num_input_tokens_seen": 5154144, + "step": 24425 + }, + { + "epoch": 2.6875687568756876, + "grad_norm": 0.08424997329711914, + "learning_rate": 4.982026322184901e-05, + "loss": 0.037, + "num_input_tokens_seen": 5155200, + "step": 24430 + }, + { + "epoch": 2.6881188118811883, + "grad_norm": 0.09157705307006836, + "learning_rate": 4.981997582774789e-05, + "loss": 0.0504, + "num_input_tokens_seen": 5156256, + "step": 24435 + }, + { + "epoch": 2.6886688668866885, + "grad_norm": 0.2134012132883072, + "learning_rate": 4.9819688204892974e-05, + "loss": 0.0899, + "num_input_tokens_seen": 5157312, + "step": 24440 + }, + { + "epoch": 2.689218921892189, + "grad_norm": 0.18467654287815094, + "learning_rate": 4.981940035328689e-05, + "loss": 0.058, + "num_input_tokens_seen": 5158368, + "step": 24445 + }, + { + "epoch": 2.68976897689769, + "grad_norm": 0.399819940328598, + "learning_rate": 4.981911227293231e-05, + "loss": 0.0754, + "num_input_tokens_seen": 5159392, + "step": 24450 + }, + { + "epoch": 2.6903190319031904, + "grad_norm": 1.1266433000564575, + "learning_rate": 4.981882396383189e-05, + "loss": 0.0928, + "num_input_tokens_seen": 5160448, + "step": 24455 + }, + { + "epoch": 2.690869086908691, + "grad_norm": 0.8247209787368774, + "learning_rate": 4.9818535425988265e-05, + "loss": 0.0892, + "num_input_tokens_seen": 5161600, + "step": 24460 + }, + { + "epoch": 2.6914191419141913, + "grad_norm": 0.6272345781326294, + "learning_rate": 4.9818246659404114e-05, + "loss": 0.1562, + "num_input_tokens_seen": 5162656, + "step": 24465 + }, + { + "epoch": 2.691969196919692, + "grad_norm": 3.2503292560577393, + "learning_rate": 4.9817957664082096e-05, + "loss": 0.175, + "num_input_tokens_seen": 5163648, + "step": 24470 + }, + { + "epoch": 2.6925192519251926, + "grad_norm": 0.13627485930919647, + "learning_rate": 4.981766844002486e-05, + "loss": 0.0418, + "num_input_tokens_seen": 5164704, + "step": 24475 + }, + { + "epoch": 2.693069306930693, + "grad_norm": 0.2724021375179291, + "learning_rate": 4.981737898723509e-05, + "loss": 0.1169, + "num_input_tokens_seen": 5165728, + "step": 24480 + }, + { + "epoch": 2.693619361936194, + "grad_norm": 0.19049878418445587, + "learning_rate": 4.9817089305715445e-05, + "loss": 0.0689, + "num_input_tokens_seen": 5166752, + "step": 24485 + }, + { + "epoch": 2.694169416941694, + "grad_norm": 1.6509730815887451, + "learning_rate": 4.98167993954686e-05, + "loss": 0.16, + "num_input_tokens_seen": 5167776, + "step": 24490 + }, + { + "epoch": 2.6947194719471947, + "grad_norm": 0.15077555179595947, + "learning_rate": 4.981650925649722e-05, + "loss": 0.0569, + "num_input_tokens_seen": 5168832, + "step": 24495 + }, + { + "epoch": 2.6952695269526954, + "grad_norm": 1.2909069061279297, + "learning_rate": 4.981621888880398e-05, + "loss": 0.0621, + "num_input_tokens_seen": 5169824, + "step": 24500 + }, + { + "epoch": 2.6958195819581956, + "grad_norm": 0.8078539371490479, + "learning_rate": 4.981592829239156e-05, + "loss": 0.1766, + "num_input_tokens_seen": 5170912, + "step": 24505 + }, + { + "epoch": 2.6963696369636962, + "grad_norm": 0.2049034684896469, + "learning_rate": 4.981563746726264e-05, + "loss": 0.1126, + "num_input_tokens_seen": 5171936, + "step": 24510 + }, + { + "epoch": 2.696919691969197, + "grad_norm": 1.1763004064559937, + "learning_rate": 4.981534641341989e-05, + "loss": 0.1323, + "num_input_tokens_seen": 5173024, + "step": 24515 + }, + { + "epoch": 2.6974697469746975, + "grad_norm": 0.21206820011138916, + "learning_rate": 4.981505513086601e-05, + "loss": 0.1109, + "num_input_tokens_seen": 5174080, + "step": 24520 + }, + { + "epoch": 2.698019801980198, + "grad_norm": 0.967677116394043, + "learning_rate": 4.981476361960367e-05, + "loss": 0.1228, + "num_input_tokens_seen": 5175072, + "step": 24525 + }, + { + "epoch": 2.6985698569856984, + "grad_norm": 0.19918134808540344, + "learning_rate": 4.981447187963556e-05, + "loss": 0.0801, + "num_input_tokens_seen": 5176128, + "step": 24530 + }, + { + "epoch": 2.699119911991199, + "grad_norm": 0.5479410290718079, + "learning_rate": 4.981417991096436e-05, + "loss": 0.0497, + "num_input_tokens_seen": 5177184, + "step": 24535 + }, + { + "epoch": 2.6996699669966997, + "grad_norm": 0.15858595073223114, + "learning_rate": 4.981388771359279e-05, + "loss": 0.0244, + "num_input_tokens_seen": 5178240, + "step": 24540 + }, + { + "epoch": 2.7002200220022003, + "grad_norm": 0.25907278060913086, + "learning_rate": 4.981359528752351e-05, + "loss": 0.034, + "num_input_tokens_seen": 5179296, + "step": 24545 + }, + { + "epoch": 2.700770077007701, + "grad_norm": 0.4183288812637329, + "learning_rate": 4.981330263275924e-05, + "loss": 0.0543, + "num_input_tokens_seen": 5180352, + "step": 24550 + }, + { + "epoch": 2.701320132013201, + "grad_norm": 1.094624400138855, + "learning_rate": 4.9813009749302655e-05, + "loss": 0.1908, + "num_input_tokens_seen": 5181344, + "step": 24555 + }, + { + "epoch": 2.701870187018702, + "grad_norm": 0.6998999714851379, + "learning_rate": 4.981271663715647e-05, + "loss": 0.0931, + "num_input_tokens_seen": 5182432, + "step": 24560 + }, + { + "epoch": 2.7024202420242025, + "grad_norm": 0.8076958656311035, + "learning_rate": 4.9812423296323384e-05, + "loss": 0.0843, + "num_input_tokens_seen": 5183456, + "step": 24565 + }, + { + "epoch": 2.7029702970297027, + "grad_norm": 0.24178096652030945, + "learning_rate": 4.98121297268061e-05, + "loss": 0.0984, + "num_input_tokens_seen": 5184512, + "step": 24570 + }, + { + "epoch": 2.7035203520352034, + "grad_norm": 0.11375348269939423, + "learning_rate": 4.981183592860731e-05, + "loss": 0.0325, + "num_input_tokens_seen": 5185664, + "step": 24575 + }, + { + "epoch": 2.704070407040704, + "grad_norm": 0.3881506621837616, + "learning_rate": 4.981154190172975e-05, + "loss": 0.0685, + "num_input_tokens_seen": 5186752, + "step": 24580 + }, + { + "epoch": 2.7046204620462047, + "grad_norm": 0.3710153102874756, + "learning_rate": 4.981124764617611e-05, + "loss": 0.067, + "num_input_tokens_seen": 5187776, + "step": 24585 + }, + { + "epoch": 2.7051705170517053, + "grad_norm": 0.14584480226039886, + "learning_rate": 4.9810953161949094e-05, + "loss": 0.0293, + "num_input_tokens_seen": 5188864, + "step": 24590 + }, + { + "epoch": 2.7057205720572055, + "grad_norm": 0.33617204427719116, + "learning_rate": 4.981065844905144e-05, + "loss": 0.0472, + "num_input_tokens_seen": 5189952, + "step": 24595 + }, + { + "epoch": 2.706270627062706, + "grad_norm": 0.8482621312141418, + "learning_rate": 4.9810363507485844e-05, + "loss": 0.0869, + "num_input_tokens_seen": 5191008, + "step": 24600 + }, + { + "epoch": 2.706820682068207, + "grad_norm": 0.6473875641822815, + "learning_rate": 4.981006833725504e-05, + "loss": 0.1339, + "num_input_tokens_seen": 5192064, + "step": 24605 + }, + { + "epoch": 2.7073707370737075, + "grad_norm": 0.13409464061260223, + "learning_rate": 4.980977293836174e-05, + "loss": 0.0552, + "num_input_tokens_seen": 5193152, + "step": 24610 + }, + { + "epoch": 2.707920792079208, + "grad_norm": 0.1238333135843277, + "learning_rate": 4.980947731080867e-05, + "loss": 0.0443, + "num_input_tokens_seen": 5194272, + "step": 24615 + }, + { + "epoch": 2.7084708470847083, + "grad_norm": 0.25204023718833923, + "learning_rate": 4.980918145459855e-05, + "loss": 0.1136, + "num_input_tokens_seen": 5195328, + "step": 24620 + }, + { + "epoch": 2.709020902090209, + "grad_norm": 0.08194296807050705, + "learning_rate": 4.980888536973411e-05, + "loss": 0.0372, + "num_input_tokens_seen": 5196320, + "step": 24625 + }, + { + "epoch": 2.7095709570957096, + "grad_norm": 0.04067271947860718, + "learning_rate": 4.980858905621808e-05, + "loss": 0.1191, + "num_input_tokens_seen": 5197344, + "step": 24630 + }, + { + "epoch": 2.7101210121012103, + "grad_norm": 0.30675262212753296, + "learning_rate": 4.9808292514053176e-05, + "loss": 0.0643, + "num_input_tokens_seen": 5198336, + "step": 24635 + }, + { + "epoch": 2.710671067106711, + "grad_norm": 0.363606721162796, + "learning_rate": 4.980799574324216e-05, + "loss": 0.1334, + "num_input_tokens_seen": 5199424, + "step": 24640 + }, + { + "epoch": 2.711221122112211, + "grad_norm": 0.1549677848815918, + "learning_rate": 4.9807698743787744e-05, + "loss": 0.0592, + "num_input_tokens_seen": 5200512, + "step": 24645 + }, + { + "epoch": 2.7117711771177118, + "grad_norm": 0.05645529553294182, + "learning_rate": 4.9807401515692675e-05, + "loss": 0.0476, + "num_input_tokens_seen": 5201536, + "step": 24650 + }, + { + "epoch": 2.7123212321232124, + "grad_norm": 0.4834161400794983, + "learning_rate": 4.9807104058959684e-05, + "loss": 0.1005, + "num_input_tokens_seen": 5202560, + "step": 24655 + }, + { + "epoch": 2.7128712871287126, + "grad_norm": 1.0562516450881958, + "learning_rate": 4.980680637359153e-05, + "loss": 0.0805, + "num_input_tokens_seen": 5203552, + "step": 24660 + }, + { + "epoch": 2.7134213421342133, + "grad_norm": 0.0842585638165474, + "learning_rate": 4.9806508459590936e-05, + "loss": 0.0446, + "num_input_tokens_seen": 5204608, + "step": 24665 + }, + { + "epoch": 2.713971397139714, + "grad_norm": 0.10611092299222946, + "learning_rate": 4.9806210316960655e-05, + "loss": 0.0979, + "num_input_tokens_seen": 5205632, + "step": 24670 + }, + { + "epoch": 2.7145214521452146, + "grad_norm": 0.5326011180877686, + "learning_rate": 4.980591194570344e-05, + "loss": 0.059, + "num_input_tokens_seen": 5206688, + "step": 24675 + }, + { + "epoch": 2.715071507150715, + "grad_norm": 0.5342242121696472, + "learning_rate": 4.980561334582204e-05, + "loss": 0.0614, + "num_input_tokens_seen": 5207680, + "step": 24680 + }, + { + "epoch": 2.7156215621562154, + "grad_norm": 0.5254374742507935, + "learning_rate": 4.980531451731921e-05, + "loss": 0.0628, + "num_input_tokens_seen": 5208640, + "step": 24685 + }, + { + "epoch": 2.716171617161716, + "grad_norm": 1.6594346761703491, + "learning_rate": 4.980501546019769e-05, + "loss": 0.1471, + "num_input_tokens_seen": 5209696, + "step": 24690 + }, + { + "epoch": 2.7167216721672167, + "grad_norm": 0.4105280637741089, + "learning_rate": 4.9804716174460244e-05, + "loss": 0.0389, + "num_input_tokens_seen": 5210816, + "step": 24695 + }, + { + "epoch": 2.7172717271727174, + "grad_norm": 1.2861359119415283, + "learning_rate": 4.980441666010963e-05, + "loss": 0.1545, + "num_input_tokens_seen": 5211904, + "step": 24700 + }, + { + "epoch": 2.717821782178218, + "grad_norm": 0.08901777118444443, + "learning_rate": 4.980411691714862e-05, + "loss": 0.0557, + "num_input_tokens_seen": 5212960, + "step": 24705 + }, + { + "epoch": 2.718371837183718, + "grad_norm": 0.5613222122192383, + "learning_rate": 4.980381694557996e-05, + "loss": 0.0455, + "num_input_tokens_seen": 5214112, + "step": 24710 + }, + { + "epoch": 2.718921892189219, + "grad_norm": 0.21876084804534912, + "learning_rate": 4.980351674540642e-05, + "loss": 0.1287, + "num_input_tokens_seen": 5215168, + "step": 24715 + }, + { + "epoch": 2.7194719471947195, + "grad_norm": 1.0430850982666016, + "learning_rate": 4.980321631663078e-05, + "loss": 0.1067, + "num_input_tokens_seen": 5216288, + "step": 24720 + }, + { + "epoch": 2.72002200220022, + "grad_norm": 0.33848029375076294, + "learning_rate": 4.980291565925578e-05, + "loss": 0.0592, + "num_input_tokens_seen": 5217344, + "step": 24725 + }, + { + "epoch": 2.720572057205721, + "grad_norm": 0.7237740755081177, + "learning_rate": 4.9802614773284216e-05, + "loss": 0.0426, + "num_input_tokens_seen": 5218368, + "step": 24730 + }, + { + "epoch": 2.721122112211221, + "grad_norm": 0.7136104106903076, + "learning_rate": 4.980231365871885e-05, + "loss": 0.1126, + "num_input_tokens_seen": 5219520, + "step": 24735 + }, + { + "epoch": 2.7216721672167217, + "grad_norm": 0.1904960572719574, + "learning_rate": 4.9802012315562465e-05, + "loss": 0.1145, + "num_input_tokens_seen": 5220544, + "step": 24740 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.11743324995040894, + "learning_rate": 4.980171074381783e-05, + "loss": 0.0437, + "num_input_tokens_seen": 5221632, + "step": 24745 + }, + { + "epoch": 2.7227722772277225, + "grad_norm": 1.5177267789840698, + "learning_rate": 4.980140894348773e-05, + "loss": 0.1667, + "num_input_tokens_seen": 5222688, + "step": 24750 + }, + { + "epoch": 2.723322332233223, + "grad_norm": 0.5372921228408813, + "learning_rate": 4.9801106914574944e-05, + "loss": 0.0886, + "num_input_tokens_seen": 5223680, + "step": 24755 + }, + { + "epoch": 2.723872387238724, + "grad_norm": 0.09568269550800323, + "learning_rate": 4.9800804657082255e-05, + "loss": 0.1185, + "num_input_tokens_seen": 5224736, + "step": 24760 + }, + { + "epoch": 2.7244224422442245, + "grad_norm": 1.6032224893569946, + "learning_rate": 4.980050217101245e-05, + "loss": 0.0784, + "num_input_tokens_seen": 5225792, + "step": 24765 + }, + { + "epoch": 2.724972497249725, + "grad_norm": 0.046639639884233475, + "learning_rate": 4.980019945636832e-05, + "loss": 0.1088, + "num_input_tokens_seen": 5226816, + "step": 24770 + }, + { + "epoch": 2.7255225522552253, + "grad_norm": 0.0722045972943306, + "learning_rate": 4.979989651315266e-05, + "loss": 0.0367, + "num_input_tokens_seen": 5227872, + "step": 24775 + }, + { + "epoch": 2.726072607260726, + "grad_norm": 0.07097156345844269, + "learning_rate": 4.9799593341368234e-05, + "loss": 0.0423, + "num_input_tokens_seen": 5228928, + "step": 24780 + }, + { + "epoch": 2.7266226622662266, + "grad_norm": 0.40942254662513733, + "learning_rate": 4.9799289941017866e-05, + "loss": 0.0177, + "num_input_tokens_seen": 5229984, + "step": 24785 + }, + { + "epoch": 2.7271727172717273, + "grad_norm": 0.22300906479358673, + "learning_rate": 4.979898631210434e-05, + "loss": 0.0864, + "num_input_tokens_seen": 5231040, + "step": 24790 + }, + { + "epoch": 2.727722772277228, + "grad_norm": 0.2144106924533844, + "learning_rate": 4.9798682454630466e-05, + "loss": 0.1182, + "num_input_tokens_seen": 5232096, + "step": 24795 + }, + { + "epoch": 2.728272827282728, + "grad_norm": 0.283846914768219, + "learning_rate": 4.979837836859903e-05, + "loss": 0.0518, + "num_input_tokens_seen": 5233152, + "step": 24800 + }, + { + "epoch": 2.728822882288229, + "grad_norm": 0.08058483898639679, + "learning_rate": 4.979807405401284e-05, + "loss": 0.0538, + "num_input_tokens_seen": 5234176, + "step": 24805 + }, + { + "epoch": 2.7293729372937294, + "grad_norm": 0.046692635864019394, + "learning_rate": 4.9797769510874696e-05, + "loss": 0.0365, + "num_input_tokens_seen": 5235232, + "step": 24810 + }, + { + "epoch": 2.72992299229923, + "grad_norm": 0.0475580170750618, + "learning_rate": 4.979746473918742e-05, + "loss": 0.0566, + "num_input_tokens_seen": 5236256, + "step": 24815 + }, + { + "epoch": 2.7304730473047307, + "grad_norm": 1.4419227838516235, + "learning_rate": 4.9797159738953806e-05, + "loss": 0.0902, + "num_input_tokens_seen": 5237248, + "step": 24820 + }, + { + "epoch": 2.731023102310231, + "grad_norm": 0.07127173244953156, + "learning_rate": 4.9796854510176664e-05, + "loss": 0.0611, + "num_input_tokens_seen": 5238336, + "step": 24825 + }, + { + "epoch": 2.7315731573157316, + "grad_norm": 0.1528705358505249, + "learning_rate": 4.979654905285881e-05, + "loss": 0.0548, + "num_input_tokens_seen": 5239328, + "step": 24830 + }, + { + "epoch": 2.7321232123212322, + "grad_norm": 0.3491426408290863, + "learning_rate": 4.979624336700307e-05, + "loss": 0.0437, + "num_input_tokens_seen": 5240384, + "step": 24835 + }, + { + "epoch": 2.7326732673267324, + "grad_norm": 0.07930594682693481, + "learning_rate": 4.979593745261225e-05, + "loss": 0.1548, + "num_input_tokens_seen": 5241440, + "step": 24840 + }, + { + "epoch": 2.733223322332233, + "grad_norm": 0.701728343963623, + "learning_rate": 4.979563130968917e-05, + "loss": 0.0297, + "num_input_tokens_seen": 5242496, + "step": 24845 + }, + { + "epoch": 2.7337733773377337, + "grad_norm": 0.9438893795013428, + "learning_rate": 4.9795324938236664e-05, + "loss": 0.1302, + "num_input_tokens_seen": 5243616, + "step": 24850 + }, + { + "epoch": 2.7343234323432344, + "grad_norm": 0.31877022981643677, + "learning_rate": 4.9795018338257535e-05, + "loss": 0.0773, + "num_input_tokens_seen": 5244704, + "step": 24855 + }, + { + "epoch": 2.734873487348735, + "grad_norm": 0.02557189017534256, + "learning_rate": 4.9794711509754636e-05, + "loss": 0.0136, + "num_input_tokens_seen": 5245760, + "step": 24860 + }, + { + "epoch": 2.7354235423542352, + "grad_norm": 1.2980568408966064, + "learning_rate": 4.979440445273076e-05, + "loss": 0.178, + "num_input_tokens_seen": 5246816, + "step": 24865 + }, + { + "epoch": 2.735973597359736, + "grad_norm": 0.051353663206100464, + "learning_rate": 4.979409716718877e-05, + "loss": 0.1387, + "num_input_tokens_seen": 5247840, + "step": 24870 + }, + { + "epoch": 2.7365236523652365, + "grad_norm": 0.09805327653884888, + "learning_rate": 4.9793789653131474e-05, + "loss": 0.0632, + "num_input_tokens_seen": 5248928, + "step": 24875 + }, + { + "epoch": 2.737073707370737, + "grad_norm": 0.48628389835357666, + "learning_rate": 4.979348191056172e-05, + "loss": 0.0461, + "num_input_tokens_seen": 5249984, + "step": 24880 + }, + { + "epoch": 2.737623762376238, + "grad_norm": 0.08096179366111755, + "learning_rate": 4.979317393948234e-05, + "loss": 0.0748, + "num_input_tokens_seen": 5251072, + "step": 24885 + }, + { + "epoch": 2.738173817381738, + "grad_norm": 0.03188437968492508, + "learning_rate": 4.979286573989617e-05, + "loss": 0.0328, + "num_input_tokens_seen": 5252096, + "step": 24890 + }, + { + "epoch": 2.7387238723872387, + "grad_norm": 0.028364239260554314, + "learning_rate": 4.979255731180605e-05, + "loss": 0.0413, + "num_input_tokens_seen": 5253120, + "step": 24895 + }, + { + "epoch": 2.7392739273927393, + "grad_norm": 0.7979767322540283, + "learning_rate": 4.9792248655214834e-05, + "loss": 0.1175, + "num_input_tokens_seen": 5254144, + "step": 24900 + }, + { + "epoch": 2.73982398239824, + "grad_norm": 0.4661584496498108, + "learning_rate": 4.9791939770125364e-05, + "loss": 0.2012, + "num_input_tokens_seen": 5255200, + "step": 24905 + }, + { + "epoch": 2.7403740374037406, + "grad_norm": 0.7977349758148193, + "learning_rate": 4.9791630656540463e-05, + "loss": 0.1404, + "num_input_tokens_seen": 5256192, + "step": 24910 + }, + { + "epoch": 2.740924092409241, + "grad_norm": 0.08801046013832092, + "learning_rate": 4.979132131446301e-05, + "loss": 0.0627, + "num_input_tokens_seen": 5257280, + "step": 24915 + }, + { + "epoch": 2.7414741474147415, + "grad_norm": 0.08286456763744354, + "learning_rate": 4.9791011743895846e-05, + "loss": 0.0723, + "num_input_tokens_seen": 5258272, + "step": 24920 + }, + { + "epoch": 2.742024202420242, + "grad_norm": 0.18374986946582794, + "learning_rate": 4.979070194484182e-05, + "loss": 0.0921, + "num_input_tokens_seen": 5259328, + "step": 24925 + }, + { + "epoch": 2.7425742574257423, + "grad_norm": 0.07943210005760193, + "learning_rate": 4.979039191730379e-05, + "loss": 0.0812, + "num_input_tokens_seen": 5260352, + "step": 24930 + }, + { + "epoch": 2.743124312431243, + "grad_norm": 0.07689773291349411, + "learning_rate": 4.9790081661284616e-05, + "loss": 0.1035, + "num_input_tokens_seen": 5261408, + "step": 24935 + }, + { + "epoch": 2.7436743674367436, + "grad_norm": 0.9813063740730286, + "learning_rate": 4.978977117678715e-05, + "loss": 0.1587, + "num_input_tokens_seen": 5262400, + "step": 24940 + }, + { + "epoch": 2.7442244224422443, + "grad_norm": 0.8546206951141357, + "learning_rate": 4.978946046381426e-05, + "loss": 0.0676, + "num_input_tokens_seen": 5263488, + "step": 24945 + }, + { + "epoch": 2.744774477447745, + "grad_norm": 0.5230100750923157, + "learning_rate": 4.9789149522368805e-05, + "loss": 0.1241, + "num_input_tokens_seen": 5264512, + "step": 24950 + }, + { + "epoch": 2.745324532453245, + "grad_norm": 1.4353123903274536, + "learning_rate": 4.978883835245366e-05, + "loss": 0.0492, + "num_input_tokens_seen": 5265536, + "step": 24955 + }, + { + "epoch": 2.745874587458746, + "grad_norm": 0.32730042934417725, + "learning_rate": 4.9788526954071685e-05, + "loss": 0.0346, + "num_input_tokens_seen": 5266624, + "step": 24960 + }, + { + "epoch": 2.7464246424642464, + "grad_norm": 0.3172112703323364, + "learning_rate": 4.9788215327225754e-05, + "loss": 0.0281, + "num_input_tokens_seen": 5267680, + "step": 24965 + }, + { + "epoch": 2.746974697469747, + "grad_norm": 0.6459304690361023, + "learning_rate": 4.978790347191873e-05, + "loss": 0.0754, + "num_input_tokens_seen": 5268704, + "step": 24970 + }, + { + "epoch": 2.7475247524752477, + "grad_norm": 0.615204393863678, + "learning_rate": 4.97875913881535e-05, + "loss": 0.11, + "num_input_tokens_seen": 5269728, + "step": 24975 + }, + { + "epoch": 2.748074807480748, + "grad_norm": 0.10751448571681976, + "learning_rate": 4.978727907593293e-05, + "loss": 0.0746, + "num_input_tokens_seen": 5270784, + "step": 24980 + }, + { + "epoch": 2.7486248624862486, + "grad_norm": 0.1533443033695221, + "learning_rate": 4.97869665352599e-05, + "loss": 0.0516, + "num_input_tokens_seen": 5271840, + "step": 24985 + }, + { + "epoch": 2.7491749174917492, + "grad_norm": 0.3149898648262024, + "learning_rate": 4.97866537661373e-05, + "loss": 0.0439, + "num_input_tokens_seen": 5272928, + "step": 24990 + }, + { + "epoch": 2.7497249724972495, + "grad_norm": 0.35687094926834106, + "learning_rate": 4.978634076856801e-05, + "loss": 0.1633, + "num_input_tokens_seen": 5273984, + "step": 24995 + }, + { + "epoch": 2.7502750275027505, + "grad_norm": 0.07440493255853653, + "learning_rate": 4.97860275425549e-05, + "loss": 0.0165, + "num_input_tokens_seen": 5274976, + "step": 25000 + }, + { + "epoch": 2.7508250825082508, + "grad_norm": 0.19565002620220184, + "learning_rate": 4.978571408810088e-05, + "loss": 0.0416, + "num_input_tokens_seen": 5276000, + "step": 25005 + }, + { + "epoch": 2.7513751375137514, + "grad_norm": 0.18626593053340912, + "learning_rate": 4.978540040520882e-05, + "loss": 0.1023, + "num_input_tokens_seen": 5277088, + "step": 25010 + }, + { + "epoch": 2.751925192519252, + "grad_norm": 0.41729968786239624, + "learning_rate": 4.978508649388161e-05, + "loss": 0.0795, + "num_input_tokens_seen": 5278112, + "step": 25015 + }, + { + "epoch": 2.7524752475247523, + "grad_norm": 0.13831758499145508, + "learning_rate": 4.978477235412216e-05, + "loss": 0.0851, + "num_input_tokens_seen": 5279104, + "step": 25020 + }, + { + "epoch": 2.753025302530253, + "grad_norm": 0.3398101031780243, + "learning_rate": 4.9784457985933354e-05, + "loss": 0.0662, + "num_input_tokens_seen": 5280224, + "step": 25025 + }, + { + "epoch": 2.7535753575357536, + "grad_norm": 0.22392182052135468, + "learning_rate": 4.9784143389318095e-05, + "loss": 0.0458, + "num_input_tokens_seen": 5281248, + "step": 25030 + }, + { + "epoch": 2.754125412541254, + "grad_norm": 1.6282989978790283, + "learning_rate": 4.9783828564279274e-05, + "loss": 0.1426, + "num_input_tokens_seen": 5282336, + "step": 25035 + }, + { + "epoch": 2.754675467546755, + "grad_norm": 0.4409515857696533, + "learning_rate": 4.97835135108198e-05, + "loss": 0.0475, + "num_input_tokens_seen": 5283392, + "step": 25040 + }, + { + "epoch": 2.755225522552255, + "grad_norm": 0.3495544195175171, + "learning_rate": 4.978319822894257e-05, + "loss": 0.0427, + "num_input_tokens_seen": 5284384, + "step": 25045 + }, + { + "epoch": 2.7557755775577557, + "grad_norm": 0.08114578574895859, + "learning_rate": 4.97828827186505e-05, + "loss": 0.0255, + "num_input_tokens_seen": 5285440, + "step": 25050 + }, + { + "epoch": 2.7563256325632564, + "grad_norm": 1.2198195457458496, + "learning_rate": 4.9782566979946485e-05, + "loss": 0.1411, + "num_input_tokens_seen": 5286560, + "step": 25055 + }, + { + "epoch": 2.756875687568757, + "grad_norm": 1.2424111366271973, + "learning_rate": 4.978225101283345e-05, + "loss": 0.0729, + "num_input_tokens_seen": 5287648, + "step": 25060 + }, + { + "epoch": 2.7574257425742577, + "grad_norm": 0.08792226761579514, + "learning_rate": 4.978193481731429e-05, + "loss": 0.0366, + "num_input_tokens_seen": 5288704, + "step": 25065 + }, + { + "epoch": 2.757975797579758, + "grad_norm": 0.05555720999836922, + "learning_rate": 4.978161839339194e-05, + "loss": 0.0903, + "num_input_tokens_seen": 5289728, + "step": 25070 + }, + { + "epoch": 2.7585258525852585, + "grad_norm": 0.7030182480812073, + "learning_rate": 4.9781301741069295e-05, + "loss": 0.0295, + "num_input_tokens_seen": 5290816, + "step": 25075 + }, + { + "epoch": 2.759075907590759, + "grad_norm": 0.030337253585457802, + "learning_rate": 4.978098486034929e-05, + "loss": 0.0776, + "num_input_tokens_seen": 5291936, + "step": 25080 + }, + { + "epoch": 2.7596259625962594, + "grad_norm": 1.4465020895004272, + "learning_rate": 4.9780667751234835e-05, + "loss": 0.083, + "num_input_tokens_seen": 5293024, + "step": 25085 + }, + { + "epoch": 2.76017601760176, + "grad_norm": 0.7437642812728882, + "learning_rate": 4.978035041372885e-05, + "loss": 0.0947, + "num_input_tokens_seen": 5294048, + "step": 25090 + }, + { + "epoch": 2.7607260726072607, + "grad_norm": 0.652286946773529, + "learning_rate": 4.9780032847834276e-05, + "loss": 0.0559, + "num_input_tokens_seen": 5295072, + "step": 25095 + }, + { + "epoch": 2.7612761276127613, + "grad_norm": 0.15524718165397644, + "learning_rate": 4.977971505355402e-05, + "loss": 0.0413, + "num_input_tokens_seen": 5296096, + "step": 25100 + }, + { + "epoch": 2.761826182618262, + "grad_norm": 0.22239123284816742, + "learning_rate": 4.9779397030891026e-05, + "loss": 0.054, + "num_input_tokens_seen": 5297248, + "step": 25105 + }, + { + "epoch": 2.762376237623762, + "grad_norm": 0.042601440101861954, + "learning_rate": 4.9779078779848224e-05, + "loss": 0.0494, + "num_input_tokens_seen": 5298240, + "step": 25110 + }, + { + "epoch": 2.762926292629263, + "grad_norm": 0.3441423177719116, + "learning_rate": 4.977876030042854e-05, + "loss": 0.0601, + "num_input_tokens_seen": 5299296, + "step": 25115 + }, + { + "epoch": 2.7634763476347635, + "grad_norm": 0.21403174102306366, + "learning_rate": 4.977844159263492e-05, + "loss": 0.1209, + "num_input_tokens_seen": 5300288, + "step": 25120 + }, + { + "epoch": 2.764026402640264, + "grad_norm": 0.03883808106184006, + "learning_rate": 4.9778122656470284e-05, + "loss": 0.037, + "num_input_tokens_seen": 5301312, + "step": 25125 + }, + { + "epoch": 2.7645764576457648, + "grad_norm": 0.5914686918258667, + "learning_rate": 4.977780349193758e-05, + "loss": 0.2704, + "num_input_tokens_seen": 5302336, + "step": 25130 + }, + { + "epoch": 2.765126512651265, + "grad_norm": 0.06219866871833801, + "learning_rate": 4.977748409903975e-05, + "loss": 0.0149, + "num_input_tokens_seen": 5303424, + "step": 25135 + }, + { + "epoch": 2.7656765676567656, + "grad_norm": 0.02931712009012699, + "learning_rate": 4.9777164477779745e-05, + "loss": 0.0951, + "num_input_tokens_seen": 5304448, + "step": 25140 + }, + { + "epoch": 2.7662266226622663, + "grad_norm": 0.12696418166160583, + "learning_rate": 4.97768446281605e-05, + "loss": 0.0323, + "num_input_tokens_seen": 5305408, + "step": 25145 + }, + { + "epoch": 2.766776677667767, + "grad_norm": 0.3336341381072998, + "learning_rate": 4.9776524550184965e-05, + "loss": 0.0569, + "num_input_tokens_seen": 5306464, + "step": 25150 + }, + { + "epoch": 2.7673267326732676, + "grad_norm": 0.20056799054145813, + "learning_rate": 4.97762042438561e-05, + "loss": 0.0262, + "num_input_tokens_seen": 5307552, + "step": 25155 + }, + { + "epoch": 2.7678767876787678, + "grad_norm": 0.6846528053283691, + "learning_rate": 4.977588370917684e-05, + "loss": 0.1481, + "num_input_tokens_seen": 5308608, + "step": 25160 + }, + { + "epoch": 2.7684268426842684, + "grad_norm": 0.026800870895385742, + "learning_rate": 4.9775562946150155e-05, + "loss": 0.0325, + "num_input_tokens_seen": 5309600, + "step": 25165 + }, + { + "epoch": 2.768976897689769, + "grad_norm": 0.4636041224002838, + "learning_rate": 4.9775241954779e-05, + "loss": 0.0326, + "num_input_tokens_seen": 5310656, + "step": 25170 + }, + { + "epoch": 2.7695269526952693, + "grad_norm": 0.1180294081568718, + "learning_rate": 4.977492073506632e-05, + "loss": 0.0667, + "num_input_tokens_seen": 5311680, + "step": 25175 + }, + { + "epoch": 2.77007700770077, + "grad_norm": 1.5910930633544922, + "learning_rate": 4.977459928701508e-05, + "loss": 0.1175, + "num_input_tokens_seen": 5312768, + "step": 25180 + }, + { + "epoch": 2.7706270627062706, + "grad_norm": 0.2850627601146698, + "learning_rate": 4.977427761062825e-05, + "loss": 0.0439, + "num_input_tokens_seen": 5313760, + "step": 25185 + }, + { + "epoch": 2.771177117711771, + "grad_norm": 0.08444952964782715, + "learning_rate": 4.97739557059088e-05, + "loss": 0.0842, + "num_input_tokens_seen": 5314848, + "step": 25190 + }, + { + "epoch": 2.771727172717272, + "grad_norm": 0.5363585352897644, + "learning_rate": 4.977363357285968e-05, + "loss": 0.1787, + "num_input_tokens_seen": 5315872, + "step": 25195 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.4831162393093109, + "learning_rate": 4.977331121148386e-05, + "loss": 0.0802, + "num_input_tokens_seen": 5316896, + "step": 25200 + }, + { + "epoch": 2.7728272827282727, + "grad_norm": 0.06737649440765381, + "learning_rate": 4.9772988621784335e-05, + "loss": 0.0211, + "num_input_tokens_seen": 5317984, + "step": 25205 + }, + { + "epoch": 2.7733773377337734, + "grad_norm": 0.027236362919211388, + "learning_rate": 4.9772665803764054e-05, + "loss": 0.0837, + "num_input_tokens_seen": 5318976, + "step": 25210 + }, + { + "epoch": 2.773927392739274, + "grad_norm": 0.142800435423851, + "learning_rate": 4.9772342757425995e-05, + "loss": 0.1383, + "num_input_tokens_seen": 5320064, + "step": 25215 + }, + { + "epoch": 2.7744774477447747, + "grad_norm": 0.020160188898444176, + "learning_rate": 4.977201948277313e-05, + "loss": 0.0271, + "num_input_tokens_seen": 5321088, + "step": 25220 + }, + { + "epoch": 2.775027502750275, + "grad_norm": 0.8924676775932312, + "learning_rate": 4.977169597980847e-05, + "loss": 0.1058, + "num_input_tokens_seen": 5322112, + "step": 25225 + }, + { + "epoch": 2.7755775577557755, + "grad_norm": 1.3472415208816528, + "learning_rate": 4.977137224853496e-05, + "loss": 0.2017, + "num_input_tokens_seen": 5323104, + "step": 25230 + }, + { + "epoch": 2.776127612761276, + "grad_norm": 0.1553880125284195, + "learning_rate": 4.9771048288955606e-05, + "loss": 0.0845, + "num_input_tokens_seen": 5324128, + "step": 25235 + }, + { + "epoch": 2.776677667766777, + "grad_norm": 0.885155439376831, + "learning_rate": 4.9770724101073386e-05, + "loss": 0.0624, + "num_input_tokens_seen": 5325216, + "step": 25240 + }, + { + "epoch": 2.7772277227722775, + "grad_norm": 0.0659344419836998, + "learning_rate": 4.977039968489128e-05, + "loss": 0.0195, + "num_input_tokens_seen": 5326304, + "step": 25245 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.9676098823547363, + "learning_rate": 4.977007504041229e-05, + "loss": 0.0734, + "num_input_tokens_seen": 5327456, + "step": 25250 + }, + { + "epoch": 2.7783278327832783, + "grad_norm": 0.24444082379341125, + "learning_rate": 4.976975016763941e-05, + "loss": 0.1037, + "num_input_tokens_seen": 5328544, + "step": 25255 + }, + { + "epoch": 2.778877887788779, + "grad_norm": 1.2599189281463623, + "learning_rate": 4.9769425066575624e-05, + "loss": 0.1184, + "num_input_tokens_seen": 5329504, + "step": 25260 + }, + { + "epoch": 2.779427942794279, + "grad_norm": 0.10125745087862015, + "learning_rate": 4.9769099737223936e-05, + "loss": 0.049, + "num_input_tokens_seen": 5330592, + "step": 25265 + }, + { + "epoch": 2.77997799779978, + "grad_norm": 0.17324960231781006, + "learning_rate": 4.976877417958734e-05, + "loss": 0.0355, + "num_input_tokens_seen": 5331712, + "step": 25270 + }, + { + "epoch": 2.7805280528052805, + "grad_norm": 0.37396660447120667, + "learning_rate": 4.976844839366883e-05, + "loss": 0.0213, + "num_input_tokens_seen": 5332736, + "step": 25275 + }, + { + "epoch": 2.781078107810781, + "grad_norm": 0.16167747974395752, + "learning_rate": 4.976812237947143e-05, + "loss": 0.0446, + "num_input_tokens_seen": 5333792, + "step": 25280 + }, + { + "epoch": 2.781628162816282, + "grad_norm": 0.2015698254108429, + "learning_rate": 4.976779613699812e-05, + "loss": 0.0228, + "num_input_tokens_seen": 5334912, + "step": 25285 + }, + { + "epoch": 2.782178217821782, + "grad_norm": 0.7518303990364075, + "learning_rate": 4.976746966625192e-05, + "loss": 0.0679, + "num_input_tokens_seen": 5336000, + "step": 25290 + }, + { + "epoch": 2.7827282728272826, + "grad_norm": 0.6918964982032776, + "learning_rate": 4.9767142967235835e-05, + "loss": 0.0853, + "num_input_tokens_seen": 5337088, + "step": 25295 + }, + { + "epoch": 2.7832783278327833, + "grad_norm": 0.37511447072029114, + "learning_rate": 4.976681603995288e-05, + "loss": 0.0249, + "num_input_tokens_seen": 5338080, + "step": 25300 + }, + { + "epoch": 2.783828382838284, + "grad_norm": 0.5214822888374329, + "learning_rate": 4.9766488884406066e-05, + "loss": 0.0722, + "num_input_tokens_seen": 5339200, + "step": 25305 + }, + { + "epoch": 2.7843784378437846, + "grad_norm": 0.6205403208732605, + "learning_rate": 4.976616150059841e-05, + "loss": 0.063, + "num_input_tokens_seen": 5340288, + "step": 25310 + }, + { + "epoch": 2.784928492849285, + "grad_norm": 1.2756085395812988, + "learning_rate": 4.976583388853292e-05, + "loss": 0.1509, + "num_input_tokens_seen": 5341312, + "step": 25315 + }, + { + "epoch": 2.7854785478547854, + "grad_norm": 0.05531010776758194, + "learning_rate": 4.976550604821263e-05, + "loss": 0.0343, + "num_input_tokens_seen": 5342368, + "step": 25320 + }, + { + "epoch": 2.786028602860286, + "grad_norm": 2.2628769874572754, + "learning_rate": 4.9765177979640556e-05, + "loss": 0.1798, + "num_input_tokens_seen": 5343456, + "step": 25325 + }, + { + "epoch": 2.7865786578657867, + "grad_norm": 0.6676655411720276, + "learning_rate": 4.9764849682819716e-05, + "loss": 0.0611, + "num_input_tokens_seen": 5344512, + "step": 25330 + }, + { + "epoch": 2.7871287128712874, + "grad_norm": 0.1314377784729004, + "learning_rate": 4.976452115775314e-05, + "loss": 0.0511, + "num_input_tokens_seen": 5345600, + "step": 25335 + }, + { + "epoch": 2.7876787678767876, + "grad_norm": 0.9478818774223328, + "learning_rate": 4.976419240444385e-05, + "loss": 0.1475, + "num_input_tokens_seen": 5346592, + "step": 25340 + }, + { + "epoch": 2.7882288228822882, + "grad_norm": 0.05801237002015114, + "learning_rate": 4.976386342289489e-05, + "loss": 0.0155, + "num_input_tokens_seen": 5347680, + "step": 25345 + }, + { + "epoch": 2.788778877887789, + "grad_norm": 0.06809424608945847, + "learning_rate": 4.976353421310929e-05, + "loss": 0.0349, + "num_input_tokens_seen": 5348832, + "step": 25350 + }, + { + "epoch": 2.789328932893289, + "grad_norm": 0.2550862729549408, + "learning_rate": 4.9763204775090064e-05, + "loss": 0.1222, + "num_input_tokens_seen": 5349888, + "step": 25355 + }, + { + "epoch": 2.7898789878987897, + "grad_norm": 0.8687294721603394, + "learning_rate": 4.976287510884027e-05, + "loss": 0.1283, + "num_input_tokens_seen": 5350912, + "step": 25360 + }, + { + "epoch": 2.7904290429042904, + "grad_norm": 0.05403241887688637, + "learning_rate": 4.976254521436293e-05, + "loss": 0.1039, + "num_input_tokens_seen": 5351968, + "step": 25365 + }, + { + "epoch": 2.790979097909791, + "grad_norm": 1.3377025127410889, + "learning_rate": 4.9762215091661105e-05, + "loss": 0.0892, + "num_input_tokens_seen": 5353088, + "step": 25370 + }, + { + "epoch": 2.7915291529152917, + "grad_norm": 0.4938792288303375, + "learning_rate": 4.976188474073781e-05, + "loss": 0.0658, + "num_input_tokens_seen": 5354176, + "step": 25375 + }, + { + "epoch": 2.792079207920792, + "grad_norm": 1.1277791261672974, + "learning_rate": 4.976155416159612e-05, + "loss": 0.1372, + "num_input_tokens_seen": 5355264, + "step": 25380 + }, + { + "epoch": 2.7926292629262925, + "grad_norm": 0.42393720149993896, + "learning_rate": 4.976122335423906e-05, + "loss": 0.0944, + "num_input_tokens_seen": 5356288, + "step": 25385 + }, + { + "epoch": 2.793179317931793, + "grad_norm": 0.25353753566741943, + "learning_rate": 4.976089231866969e-05, + "loss": 0.0274, + "num_input_tokens_seen": 5357312, + "step": 25390 + }, + { + "epoch": 2.793729372937294, + "grad_norm": 0.05514710396528244, + "learning_rate": 4.9760561054891055e-05, + "loss": 0.0555, + "num_input_tokens_seen": 5358368, + "step": 25395 + }, + { + "epoch": 2.7942794279427945, + "grad_norm": 0.9657339453697205, + "learning_rate": 4.9760229562906205e-05, + "loss": 0.084, + "num_input_tokens_seen": 5359392, + "step": 25400 + }, + { + "epoch": 2.7948294829482947, + "grad_norm": 0.5559417009353638, + "learning_rate": 4.9759897842718205e-05, + "loss": 0.1185, + "num_input_tokens_seen": 5360416, + "step": 25405 + }, + { + "epoch": 2.7953795379537953, + "grad_norm": 0.09286173433065414, + "learning_rate": 4.975956589433011e-05, + "loss": 0.035, + "num_input_tokens_seen": 5361472, + "step": 25410 + }, + { + "epoch": 2.795929592959296, + "grad_norm": 0.16953343152999878, + "learning_rate": 4.975923371774497e-05, + "loss": 0.0937, + "num_input_tokens_seen": 5362528, + "step": 25415 + }, + { + "epoch": 2.7964796479647966, + "grad_norm": 0.19482341408729553, + "learning_rate": 4.975890131296586e-05, + "loss": 0.0172, + "num_input_tokens_seen": 5363584, + "step": 25420 + }, + { + "epoch": 2.7970297029702973, + "grad_norm": 0.18430842459201813, + "learning_rate": 4.9758568679995835e-05, + "loss": 0.0761, + "num_input_tokens_seen": 5364672, + "step": 25425 + }, + { + "epoch": 2.7975797579757975, + "grad_norm": 1.024680495262146, + "learning_rate": 4.975823581883796e-05, + "loss": 0.0393, + "num_input_tokens_seen": 5365792, + "step": 25430 + }, + { + "epoch": 2.798129812981298, + "grad_norm": 0.17316928505897522, + "learning_rate": 4.975790272949531e-05, + "loss": 0.0646, + "num_input_tokens_seen": 5366848, + "step": 25435 + }, + { + "epoch": 2.798679867986799, + "grad_norm": 1.479766607284546, + "learning_rate": 4.975756941197095e-05, + "loss": 0.0838, + "num_input_tokens_seen": 5367936, + "step": 25440 + }, + { + "epoch": 2.799229922992299, + "grad_norm": 0.13828127086162567, + "learning_rate": 4.975723586626795e-05, + "loss": 0.0426, + "num_input_tokens_seen": 5369056, + "step": 25445 + }, + { + "epoch": 2.7997799779977997, + "grad_norm": 0.0918058454990387, + "learning_rate": 4.975690209238939e-05, + "loss": 0.0579, + "num_input_tokens_seen": 5370144, + "step": 25450 + }, + { + "epoch": 2.8003300330033003, + "grad_norm": 0.09355199337005615, + "learning_rate": 4.975656809033834e-05, + "loss": 0.0951, + "num_input_tokens_seen": 5371264, + "step": 25455 + }, + { + "epoch": 2.800880088008801, + "grad_norm": 2.3474812507629395, + "learning_rate": 4.9756233860117885e-05, + "loss": 0.0935, + "num_input_tokens_seen": 5372352, + "step": 25460 + }, + { + "epoch": 2.8014301430143016, + "grad_norm": 0.11597947031259537, + "learning_rate": 4.97558994017311e-05, + "loss": 0.1185, + "num_input_tokens_seen": 5373408, + "step": 25465 + }, + { + "epoch": 2.801980198019802, + "grad_norm": 0.33150461316108704, + "learning_rate": 4.9755564715181064e-05, + "loss": 0.0355, + "num_input_tokens_seen": 5374496, + "step": 25470 + }, + { + "epoch": 2.8025302530253025, + "grad_norm": 0.5037468075752258, + "learning_rate": 4.975522980047087e-05, + "loss": 0.0632, + "num_input_tokens_seen": 5375552, + "step": 25475 + }, + { + "epoch": 2.803080308030803, + "grad_norm": 0.23195114731788635, + "learning_rate": 4.97548946576036e-05, + "loss": 0.1216, + "num_input_tokens_seen": 5376576, + "step": 25480 + }, + { + "epoch": 2.8036303630363038, + "grad_norm": 1.1145282983779907, + "learning_rate": 4.975455928658235e-05, + "loss": 0.1802, + "num_input_tokens_seen": 5377600, + "step": 25485 + }, + { + "epoch": 2.8041804180418044, + "grad_norm": 0.7791829705238342, + "learning_rate": 4.975422368741021e-05, + "loss": 0.0723, + "num_input_tokens_seen": 5378688, + "step": 25490 + }, + { + "epoch": 2.8047304730473046, + "grad_norm": 0.06278074532747269, + "learning_rate": 4.9753887860090254e-05, + "loss": 0.0164, + "num_input_tokens_seen": 5379776, + "step": 25495 + }, + { + "epoch": 2.8052805280528053, + "grad_norm": 0.6816359758377075, + "learning_rate": 4.9753551804625596e-05, + "loss": 0.1216, + "num_input_tokens_seen": 5380800, + "step": 25500 + }, + { + "epoch": 2.805830583058306, + "grad_norm": 0.18646730482578278, + "learning_rate": 4.9753215521019335e-05, + "loss": 0.0423, + "num_input_tokens_seen": 5381856, + "step": 25505 + }, + { + "epoch": 2.806380638063806, + "grad_norm": 0.04913213849067688, + "learning_rate": 4.975287900927456e-05, + "loss": 0.0982, + "num_input_tokens_seen": 5382880, + "step": 25510 + }, + { + "epoch": 2.806930693069307, + "grad_norm": 0.9815525412559509, + "learning_rate": 4.9752542269394373e-05, + "loss": 0.0691, + "num_input_tokens_seen": 5384000, + "step": 25515 + }, + { + "epoch": 2.8074807480748074, + "grad_norm": 0.04766189306974411, + "learning_rate": 4.975220530138189e-05, + "loss": 0.0326, + "num_input_tokens_seen": 5385024, + "step": 25520 + }, + { + "epoch": 2.808030803080308, + "grad_norm": 0.459060400724411, + "learning_rate": 4.9751868105240204e-05, + "loss": 0.0793, + "num_input_tokens_seen": 5386112, + "step": 25525 + }, + { + "epoch": 2.8085808580858087, + "grad_norm": 0.6318235993385315, + "learning_rate": 4.9751530680972424e-05, + "loss": 0.0846, + "num_input_tokens_seen": 5387200, + "step": 25530 + }, + { + "epoch": 2.809130913091309, + "grad_norm": 0.322715699672699, + "learning_rate": 4.975119302858166e-05, + "loss": 0.1173, + "num_input_tokens_seen": 5388256, + "step": 25535 + }, + { + "epoch": 2.8096809680968096, + "grad_norm": 0.83998703956604, + "learning_rate": 4.975085514807104e-05, + "loss": 0.108, + "num_input_tokens_seen": 5389248, + "step": 25540 + }, + { + "epoch": 2.81023102310231, + "grad_norm": 0.2741723358631134, + "learning_rate": 4.975051703944366e-05, + "loss": 0.0882, + "num_input_tokens_seen": 5390272, + "step": 25545 + }, + { + "epoch": 2.810781078107811, + "grad_norm": 0.3661878705024719, + "learning_rate": 4.975017870270263e-05, + "loss": 0.0699, + "num_input_tokens_seen": 5391360, + "step": 25550 + }, + { + "epoch": 2.8113311331133115, + "grad_norm": 0.5634315013885498, + "learning_rate": 4.974984013785109e-05, + "loss": 0.1604, + "num_input_tokens_seen": 5392384, + "step": 25555 + }, + { + "epoch": 2.8118811881188117, + "grad_norm": 0.3994324803352356, + "learning_rate": 4.974950134489215e-05, + "loss": 0.0394, + "num_input_tokens_seen": 5393376, + "step": 25560 + }, + { + "epoch": 2.8124312431243124, + "grad_norm": 0.9754000902175903, + "learning_rate": 4.9749162323828926e-05, + "loss": 0.0743, + "num_input_tokens_seen": 5394400, + "step": 25565 + }, + { + "epoch": 2.812981298129813, + "grad_norm": 0.2911984920501709, + "learning_rate": 4.9748823074664555e-05, + "loss": 0.0336, + "num_input_tokens_seen": 5395424, + "step": 25570 + }, + { + "epoch": 2.8135313531353137, + "grad_norm": 0.9132826924324036, + "learning_rate": 4.974848359740215e-05, + "loss": 0.0873, + "num_input_tokens_seen": 5396416, + "step": 25575 + }, + { + "epoch": 2.8140814081408143, + "grad_norm": 0.41078412532806396, + "learning_rate": 4.974814389204485e-05, + "loss": 0.0459, + "num_input_tokens_seen": 5397536, + "step": 25580 + }, + { + "epoch": 2.8146314631463145, + "grad_norm": 0.06888385862112045, + "learning_rate": 4.9747803958595785e-05, + "loss": 0.1255, + "num_input_tokens_seen": 5398592, + "step": 25585 + }, + { + "epoch": 2.815181518151815, + "grad_norm": 0.5157177448272705, + "learning_rate": 4.974746379705809e-05, + "loss": 0.0742, + "num_input_tokens_seen": 5399616, + "step": 25590 + }, + { + "epoch": 2.815731573157316, + "grad_norm": 1.7071017026901245, + "learning_rate": 4.974712340743489e-05, + "loss": 0.2232, + "num_input_tokens_seen": 5400672, + "step": 25595 + }, + { + "epoch": 2.816281628162816, + "grad_norm": 0.4104034900665283, + "learning_rate": 4.974678278972933e-05, + "loss": 0.0442, + "num_input_tokens_seen": 5401632, + "step": 25600 + }, + { + "epoch": 2.8168316831683167, + "grad_norm": 0.7806342244148254, + "learning_rate": 4.974644194394454e-05, + "loss": 0.0648, + "num_input_tokens_seen": 5402688, + "step": 25605 + }, + { + "epoch": 2.8173817381738173, + "grad_norm": 0.12175866961479187, + "learning_rate": 4.9746100870083676e-05, + "loss": 0.0183, + "num_input_tokens_seen": 5403712, + "step": 25610 + }, + { + "epoch": 2.817931793179318, + "grad_norm": 0.055753856897354126, + "learning_rate": 4.974575956814988e-05, + "loss": 0.0438, + "num_input_tokens_seen": 5404768, + "step": 25615 + }, + { + "epoch": 2.8184818481848186, + "grad_norm": 0.1869223415851593, + "learning_rate": 4.974541803814629e-05, + "loss": 0.0898, + "num_input_tokens_seen": 5405792, + "step": 25620 + }, + { + "epoch": 2.819031903190319, + "grad_norm": 0.09168125689029694, + "learning_rate": 4.974507628007605e-05, + "loss": 0.0387, + "num_input_tokens_seen": 5406848, + "step": 25625 + }, + { + "epoch": 2.8195819581958195, + "grad_norm": 0.13001912832260132, + "learning_rate": 4.974473429394232e-05, + "loss": 0.0718, + "num_input_tokens_seen": 5407936, + "step": 25630 + }, + { + "epoch": 2.82013201320132, + "grad_norm": 0.09856778383255005, + "learning_rate": 4.974439207974825e-05, + "loss": 0.0398, + "num_input_tokens_seen": 5409024, + "step": 25635 + }, + { + "epoch": 2.8206820682068208, + "grad_norm": 0.8677005767822266, + "learning_rate": 4.9744049637496984e-05, + "loss": 0.0991, + "num_input_tokens_seen": 5410048, + "step": 25640 + }, + { + "epoch": 2.8212321232123214, + "grad_norm": 0.32503148913383484, + "learning_rate": 4.974370696719169e-05, + "loss": 0.0678, + "num_input_tokens_seen": 5411072, + "step": 25645 + }, + { + "epoch": 2.8217821782178216, + "grad_norm": 3.2053136825561523, + "learning_rate": 4.974336406883552e-05, + "loss": 0.1935, + "num_input_tokens_seen": 5412064, + "step": 25650 + }, + { + "epoch": 2.8223322332233223, + "grad_norm": 0.054875556379556656, + "learning_rate": 4.974302094243164e-05, + "loss": 0.0332, + "num_input_tokens_seen": 5413056, + "step": 25655 + }, + { + "epoch": 2.822882288228823, + "grad_norm": 0.44258368015289307, + "learning_rate": 4.9742677587983205e-05, + "loss": 0.0608, + "num_input_tokens_seen": 5414208, + "step": 25660 + }, + { + "epoch": 2.8234323432343236, + "grad_norm": 0.33071354031562805, + "learning_rate": 4.974233400549339e-05, + "loss": 0.1347, + "num_input_tokens_seen": 5415296, + "step": 25665 + }, + { + "epoch": 2.823982398239824, + "grad_norm": 0.3929176926612854, + "learning_rate": 4.9741990194965346e-05, + "loss": 0.1426, + "num_input_tokens_seen": 5416352, + "step": 25670 + }, + { + "epoch": 2.8245324532453244, + "grad_norm": 0.2964581847190857, + "learning_rate": 4.974164615640225e-05, + "loss": 0.0591, + "num_input_tokens_seen": 5417408, + "step": 25675 + }, + { + "epoch": 2.825082508250825, + "grad_norm": 0.614841878414154, + "learning_rate": 4.974130188980729e-05, + "loss": 0.1007, + "num_input_tokens_seen": 5418496, + "step": 25680 + }, + { + "epoch": 2.8256325632563257, + "grad_norm": 0.1692558377981186, + "learning_rate": 4.974095739518361e-05, + "loss": 0.013, + "num_input_tokens_seen": 5419520, + "step": 25685 + }, + { + "epoch": 2.826182618261826, + "grad_norm": 0.17827682197093964, + "learning_rate": 4.97406126725344e-05, + "loss": 0.1244, + "num_input_tokens_seen": 5420576, + "step": 25690 + }, + { + "epoch": 2.8267326732673266, + "grad_norm": 0.10890559107065201, + "learning_rate": 4.9740267721862835e-05, + "loss": 0.0694, + "num_input_tokens_seen": 5421632, + "step": 25695 + }, + { + "epoch": 2.8272827282728272, + "grad_norm": 0.26887527108192444, + "learning_rate": 4.973992254317209e-05, + "loss": 0.1158, + "num_input_tokens_seen": 5422688, + "step": 25700 + }, + { + "epoch": 2.827832783278328, + "grad_norm": 0.18167950212955475, + "learning_rate": 4.9739577136465354e-05, + "loss": 0.0169, + "num_input_tokens_seen": 5423808, + "step": 25705 + }, + { + "epoch": 2.8283828382838285, + "grad_norm": 0.17200897634029388, + "learning_rate": 4.9739231501745805e-05, + "loss": 0.0365, + "num_input_tokens_seen": 5424832, + "step": 25710 + }, + { + "epoch": 2.8289328932893287, + "grad_norm": 1.3178293704986572, + "learning_rate": 4.973888563901663e-05, + "loss": 0.097, + "num_input_tokens_seen": 5425856, + "step": 25715 + }, + { + "epoch": 2.8294829482948294, + "grad_norm": 0.19356898963451385, + "learning_rate": 4.9738539548281017e-05, + "loss": 0.0537, + "num_input_tokens_seen": 5426912, + "step": 25720 + }, + { + "epoch": 2.83003300330033, + "grad_norm": 0.33705902099609375, + "learning_rate": 4.973819322954216e-05, + "loss": 0.0179, + "num_input_tokens_seen": 5428000, + "step": 25725 + }, + { + "epoch": 2.8305830583058307, + "grad_norm": 0.03167625516653061, + "learning_rate": 4.973784668280324e-05, + "loss": 0.0978, + "num_input_tokens_seen": 5428992, + "step": 25730 + }, + { + "epoch": 2.8311331133113313, + "grad_norm": 0.12140297144651413, + "learning_rate": 4.9737499908067465e-05, + "loss": 0.0682, + "num_input_tokens_seen": 5430048, + "step": 25735 + }, + { + "epoch": 2.8316831683168315, + "grad_norm": 0.036536604166030884, + "learning_rate": 4.973715290533801e-05, + "loss": 0.0324, + "num_input_tokens_seen": 5431136, + "step": 25740 + }, + { + "epoch": 2.832233223322332, + "grad_norm": 0.697650671005249, + "learning_rate": 4.9736805674618105e-05, + "loss": 0.0824, + "num_input_tokens_seen": 5432192, + "step": 25745 + }, + { + "epoch": 2.832783278327833, + "grad_norm": 0.7337601780891418, + "learning_rate": 4.973645821591092e-05, + "loss": 0.1087, + "num_input_tokens_seen": 5433248, + "step": 25750 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.6197208762168884, + "learning_rate": 4.9736110529219674e-05, + "loss": 0.0646, + "num_input_tokens_seen": 5434304, + "step": 25755 + }, + { + "epoch": 2.833883388338834, + "grad_norm": 0.7421062588691711, + "learning_rate": 4.9735762614547565e-05, + "loss": 0.063, + "num_input_tokens_seen": 5435360, + "step": 25760 + }, + { + "epoch": 2.8344334433443343, + "grad_norm": 1.5440871715545654, + "learning_rate": 4.97354144718978e-05, + "loss": 0.0925, + "num_input_tokens_seen": 5436416, + "step": 25765 + }, + { + "epoch": 2.834983498349835, + "grad_norm": 1.262001633644104, + "learning_rate": 4.973506610127359e-05, + "loss": 0.1701, + "num_input_tokens_seen": 5437408, + "step": 25770 + }, + { + "epoch": 2.8355335533553356, + "grad_norm": 1.9110145568847656, + "learning_rate": 4.973471750267814e-05, + "loss": 0.1338, + "num_input_tokens_seen": 5438496, + "step": 25775 + }, + { + "epoch": 2.836083608360836, + "grad_norm": 0.08371990919113159, + "learning_rate": 4.973436867611467e-05, + "loss": 0.1044, + "num_input_tokens_seen": 5439488, + "step": 25780 + }, + { + "epoch": 2.8366336633663365, + "grad_norm": 0.26770710945129395, + "learning_rate": 4.97340196215864e-05, + "loss": 0.1273, + "num_input_tokens_seen": 5440512, + "step": 25785 + }, + { + "epoch": 2.837183718371837, + "grad_norm": 0.024510379880666733, + "learning_rate": 4.973367033909654e-05, + "loss": 0.0401, + "num_input_tokens_seen": 5441536, + "step": 25790 + }, + { + "epoch": 2.837733773377338, + "grad_norm": 0.3101192116737366, + "learning_rate": 4.97333208286483e-05, + "loss": 0.0322, + "num_input_tokens_seen": 5442560, + "step": 25795 + }, + { + "epoch": 2.8382838283828384, + "grad_norm": 1.1584885120391846, + "learning_rate": 4.973297109024492e-05, + "loss": 0.2041, + "num_input_tokens_seen": 5443616, + "step": 25800 + }, + { + "epoch": 2.8388338833883386, + "grad_norm": 0.23538701236248016, + "learning_rate": 4.97326211238896e-05, + "loss": 0.0909, + "num_input_tokens_seen": 5444672, + "step": 25805 + }, + { + "epoch": 2.8393839383938393, + "grad_norm": 0.14036786556243896, + "learning_rate": 4.973227092958559e-05, + "loss": 0.083, + "num_input_tokens_seen": 5445792, + "step": 25810 + }, + { + "epoch": 2.83993399339934, + "grad_norm": 0.043477561324834824, + "learning_rate": 4.97319205073361e-05, + "loss": 0.0259, + "num_input_tokens_seen": 5446880, + "step": 25815 + }, + { + "epoch": 2.8404840484048406, + "grad_norm": 0.30273860692977905, + "learning_rate": 4.973156985714437e-05, + "loss": 0.0633, + "num_input_tokens_seen": 5447936, + "step": 25820 + }, + { + "epoch": 2.8410341034103412, + "grad_norm": 0.06306290626525879, + "learning_rate": 4.973121897901363e-05, + "loss": 0.1331, + "num_input_tokens_seen": 5448960, + "step": 25825 + }, + { + "epoch": 2.8415841584158414, + "grad_norm": 0.2249348908662796, + "learning_rate": 4.9730867872947105e-05, + "loss": 0.0232, + "num_input_tokens_seen": 5450016, + "step": 25830 + }, + { + "epoch": 2.842134213421342, + "grad_norm": 0.1641799658536911, + "learning_rate": 4.973051653894805e-05, + "loss": 0.0685, + "num_input_tokens_seen": 5451104, + "step": 25835 + }, + { + "epoch": 2.8426842684268427, + "grad_norm": 0.06030518561601639, + "learning_rate": 4.9730164977019676e-05, + "loss": 0.0274, + "num_input_tokens_seen": 5452192, + "step": 25840 + }, + { + "epoch": 2.8432343234323434, + "grad_norm": 0.4472825527191162, + "learning_rate": 4.972981318716525e-05, + "loss": 0.0839, + "num_input_tokens_seen": 5453216, + "step": 25845 + }, + { + "epoch": 2.843784378437844, + "grad_norm": 0.843646228313446, + "learning_rate": 4.9729461169387994e-05, + "loss": 0.0529, + "num_input_tokens_seen": 5454240, + "step": 25850 + }, + { + "epoch": 2.8443344334433442, + "grad_norm": 0.3515302240848541, + "learning_rate": 4.9729108923691164e-05, + "loss": 0.0358, + "num_input_tokens_seen": 5455296, + "step": 25855 + }, + { + "epoch": 2.844884488448845, + "grad_norm": 0.26660916209220886, + "learning_rate": 4.972875645007801e-05, + "loss": 0.0376, + "num_input_tokens_seen": 5456352, + "step": 25860 + }, + { + "epoch": 2.8454345434543455, + "grad_norm": 0.21519704163074493, + "learning_rate": 4.972840374855176e-05, + "loss": 0.1321, + "num_input_tokens_seen": 5457472, + "step": 25865 + }, + { + "epoch": 2.8459845984598457, + "grad_norm": 0.8033871650695801, + "learning_rate": 4.972805081911569e-05, + "loss": 0.0711, + "num_input_tokens_seen": 5458496, + "step": 25870 + }, + { + "epoch": 2.8465346534653464, + "grad_norm": 0.7763295769691467, + "learning_rate": 4.972769766177303e-05, + "loss": 0.0533, + "num_input_tokens_seen": 5459520, + "step": 25875 + }, + { + "epoch": 2.847084708470847, + "grad_norm": 0.622879683971405, + "learning_rate": 4.972734427652705e-05, + "loss": 0.0437, + "num_input_tokens_seen": 5460608, + "step": 25880 + }, + { + "epoch": 2.8476347634763477, + "grad_norm": 0.4562959671020508, + "learning_rate": 4.972699066338101e-05, + "loss": 0.055, + "num_input_tokens_seen": 5461728, + "step": 25885 + }, + { + "epoch": 2.8481848184818483, + "grad_norm": 0.17802053689956665, + "learning_rate": 4.972663682233816e-05, + "loss": 0.0548, + "num_input_tokens_seen": 5462720, + "step": 25890 + }, + { + "epoch": 2.8487348734873486, + "grad_norm": 0.31643909215927124, + "learning_rate": 4.9726282753401765e-05, + "loss": 0.0747, + "num_input_tokens_seen": 5463808, + "step": 25895 + }, + { + "epoch": 2.849284928492849, + "grad_norm": 0.03203657642006874, + "learning_rate": 4.972592845657508e-05, + "loss": 0.1654, + "num_input_tokens_seen": 5464832, + "step": 25900 + }, + { + "epoch": 2.84983498349835, + "grad_norm": 0.4249213933944702, + "learning_rate": 4.972557393186138e-05, + "loss": 0.1678, + "num_input_tokens_seen": 5465888, + "step": 25905 + }, + { + "epoch": 2.8503850385038505, + "grad_norm": 0.09095200151205063, + "learning_rate": 4.972521917926394e-05, + "loss": 0.1218, + "num_input_tokens_seen": 5466912, + "step": 25910 + }, + { + "epoch": 2.850935093509351, + "grad_norm": 0.328981876373291, + "learning_rate": 4.9724864198786006e-05, + "loss": 0.0574, + "num_input_tokens_seen": 5467904, + "step": 25915 + }, + { + "epoch": 2.8514851485148514, + "grad_norm": 1.1326189041137695, + "learning_rate": 4.9724508990430865e-05, + "loss": 0.0764, + "num_input_tokens_seen": 5468928, + "step": 25920 + }, + { + "epoch": 2.852035203520352, + "grad_norm": 0.360953688621521, + "learning_rate": 4.972415355420179e-05, + "loss": 0.0942, + "num_input_tokens_seen": 5469952, + "step": 25925 + }, + { + "epoch": 2.8525852585258527, + "grad_norm": 0.2535398602485657, + "learning_rate": 4.972379789010205e-05, + "loss": 0.1048, + "num_input_tokens_seen": 5471008, + "step": 25930 + }, + { + "epoch": 2.8531353135313533, + "grad_norm": 0.34582239389419556, + "learning_rate": 4.9723441998134936e-05, + "loss": 0.094, + "num_input_tokens_seen": 5472096, + "step": 25935 + }, + { + "epoch": 2.853685368536854, + "grad_norm": 0.07410380989313126, + "learning_rate": 4.972308587830372e-05, + "loss": 0.0879, + "num_input_tokens_seen": 5473152, + "step": 25940 + }, + { + "epoch": 2.854235423542354, + "grad_norm": 1.3081440925598145, + "learning_rate": 4.9722729530611684e-05, + "loss": 0.0614, + "num_input_tokens_seen": 5474240, + "step": 25945 + }, + { + "epoch": 2.854785478547855, + "grad_norm": 0.6905416250228882, + "learning_rate": 4.972237295506211e-05, + "loss": 0.0587, + "num_input_tokens_seen": 5475360, + "step": 25950 + }, + { + "epoch": 2.8553355335533555, + "grad_norm": 0.41013821959495544, + "learning_rate": 4.972201615165829e-05, + "loss": 0.0417, + "num_input_tokens_seen": 5476416, + "step": 25955 + }, + { + "epoch": 2.8558855885588557, + "grad_norm": 0.04019542038440704, + "learning_rate": 4.972165912040351e-05, + "loss": 0.0277, + "num_input_tokens_seen": 5477472, + "step": 25960 + }, + { + "epoch": 2.8564356435643563, + "grad_norm": 0.25634896755218506, + "learning_rate": 4.972130186130106e-05, + "loss": 0.0287, + "num_input_tokens_seen": 5478592, + "step": 25965 + }, + { + "epoch": 2.856985698569857, + "grad_norm": 0.41877004504203796, + "learning_rate": 4.9720944374354235e-05, + "loss": 0.0479, + "num_input_tokens_seen": 5479680, + "step": 25970 + }, + { + "epoch": 2.8575357535753576, + "grad_norm": 0.08666858077049255, + "learning_rate": 4.972058665956633e-05, + "loss": 0.0272, + "num_input_tokens_seen": 5480736, + "step": 25975 + }, + { + "epoch": 2.8580858085808583, + "grad_norm": 0.18297025561332703, + "learning_rate": 4.972022871694063e-05, + "loss": 0.0957, + "num_input_tokens_seen": 5481760, + "step": 25980 + }, + { + "epoch": 2.8586358635863585, + "grad_norm": 0.3146939277648926, + "learning_rate": 4.971987054648045e-05, + "loss": 0.0642, + "num_input_tokens_seen": 5482784, + "step": 25985 + }, + { + "epoch": 2.859185918591859, + "grad_norm": 0.05412369593977928, + "learning_rate": 4.971951214818908e-05, + "loss": 0.0953, + "num_input_tokens_seen": 5483872, + "step": 25990 + }, + { + "epoch": 2.8597359735973598, + "grad_norm": 0.746235728263855, + "learning_rate": 4.9719153522069836e-05, + "loss": 0.0855, + "num_input_tokens_seen": 5484960, + "step": 25995 + }, + { + "epoch": 2.8602860286028604, + "grad_norm": 0.044830843806266785, + "learning_rate": 4.9718794668126015e-05, + "loss": 0.0302, + "num_input_tokens_seen": 5486016, + "step": 26000 + }, + { + "epoch": 2.860836083608361, + "grad_norm": 0.048406314104795456, + "learning_rate": 4.971843558636092e-05, + "loss": 0.0645, + "num_input_tokens_seen": 5487072, + "step": 26005 + }, + { + "epoch": 2.8613861386138613, + "grad_norm": 0.20682089030742645, + "learning_rate": 4.971807627677787e-05, + "loss": 0.0852, + "num_input_tokens_seen": 5488192, + "step": 26010 + }, + { + "epoch": 2.861936193619362, + "grad_norm": 0.05016538128256798, + "learning_rate": 4.9717716739380166e-05, + "loss": 0.0728, + "num_input_tokens_seen": 5489248, + "step": 26015 + }, + { + "epoch": 2.8624862486248626, + "grad_norm": 0.14047937095165253, + "learning_rate": 4.971735697417113e-05, + "loss": 0.0504, + "num_input_tokens_seen": 5490304, + "step": 26020 + }, + { + "epoch": 2.8630363036303628, + "grad_norm": 0.9695435762405396, + "learning_rate": 4.9716996981154076e-05, + "loss": 0.1129, + "num_input_tokens_seen": 5491360, + "step": 26025 + }, + { + "epoch": 2.863586358635864, + "grad_norm": 2.148393392562866, + "learning_rate": 4.9716636760332325e-05, + "loss": 0.0546, + "num_input_tokens_seen": 5492416, + "step": 26030 + }, + { + "epoch": 2.864136413641364, + "grad_norm": 0.6549693942070007, + "learning_rate": 4.9716276311709186e-05, + "loss": 0.0985, + "num_input_tokens_seen": 5493504, + "step": 26035 + }, + { + "epoch": 2.8646864686468647, + "grad_norm": 0.11063448339700699, + "learning_rate": 4.971591563528799e-05, + "loss": 0.0389, + "num_input_tokens_seen": 5494560, + "step": 26040 + }, + { + "epoch": 2.8652365236523654, + "grad_norm": 0.10323425382375717, + "learning_rate": 4.971555473107206e-05, + "loss": 0.0389, + "num_input_tokens_seen": 5495584, + "step": 26045 + }, + { + "epoch": 2.8657865786578656, + "grad_norm": 2.282648801803589, + "learning_rate": 4.9715193599064725e-05, + "loss": 0.0913, + "num_input_tokens_seen": 5496608, + "step": 26050 + }, + { + "epoch": 2.866336633663366, + "grad_norm": 0.050175637006759644, + "learning_rate": 4.9714832239269306e-05, + "loss": 0.0306, + "num_input_tokens_seen": 5497632, + "step": 26055 + }, + { + "epoch": 2.866886688668867, + "grad_norm": 0.2054222822189331, + "learning_rate": 4.9714470651689136e-05, + "loss": 0.0156, + "num_input_tokens_seen": 5498624, + "step": 26060 + }, + { + "epoch": 2.8674367436743675, + "grad_norm": 0.03376399353146553, + "learning_rate": 4.9714108836327556e-05, + "loss": 0.1553, + "num_input_tokens_seen": 5499584, + "step": 26065 + }, + { + "epoch": 2.867986798679868, + "grad_norm": 0.5262383222579956, + "learning_rate": 4.971374679318789e-05, + "loss": 0.0561, + "num_input_tokens_seen": 5500704, + "step": 26070 + }, + { + "epoch": 2.8685368536853684, + "grad_norm": 0.5699571371078491, + "learning_rate": 4.971338452227348e-05, + "loss": 0.0585, + "num_input_tokens_seen": 5501856, + "step": 26075 + }, + { + "epoch": 2.869086908690869, + "grad_norm": 0.6762867569923401, + "learning_rate": 4.971302202358766e-05, + "loss": 0.0699, + "num_input_tokens_seen": 5502880, + "step": 26080 + }, + { + "epoch": 2.8696369636963697, + "grad_norm": 0.444498747587204, + "learning_rate": 4.971265929713378e-05, + "loss": 0.0592, + "num_input_tokens_seen": 5503936, + "step": 26085 + }, + { + "epoch": 2.8701870187018703, + "grad_norm": 0.19441749155521393, + "learning_rate": 4.9712296342915174e-05, + "loss": 0.0647, + "num_input_tokens_seen": 5504992, + "step": 26090 + }, + { + "epoch": 2.870737073707371, + "grad_norm": 0.020481359213590622, + "learning_rate": 4.9711933160935194e-05, + "loss": 0.0355, + "num_input_tokens_seen": 5506080, + "step": 26095 + }, + { + "epoch": 2.871287128712871, + "grad_norm": 0.22193071246147156, + "learning_rate": 4.971156975119718e-05, + "loss": 0.0771, + "num_input_tokens_seen": 5507168, + "step": 26100 + }, + { + "epoch": 2.871837183718372, + "grad_norm": 0.08357261121273041, + "learning_rate": 4.971120611370449e-05, + "loss": 0.1192, + "num_input_tokens_seen": 5508224, + "step": 26105 + }, + { + "epoch": 2.8723872387238725, + "grad_norm": 1.069779872894287, + "learning_rate": 4.9710842248460466e-05, + "loss": 0.0434, + "num_input_tokens_seen": 5509248, + "step": 26110 + }, + { + "epoch": 2.8729372937293727, + "grad_norm": 0.19607067108154297, + "learning_rate": 4.9710478155468463e-05, + "loss": 0.0744, + "num_input_tokens_seen": 5510304, + "step": 26115 + }, + { + "epoch": 2.8734873487348733, + "grad_norm": 0.02793087437748909, + "learning_rate": 4.9710113834731846e-05, + "loss": 0.1002, + "num_input_tokens_seen": 5511296, + "step": 26120 + }, + { + "epoch": 2.874037403740374, + "grad_norm": 0.9877232909202576, + "learning_rate": 4.970974928625397e-05, + "loss": 0.0717, + "num_input_tokens_seen": 5512352, + "step": 26125 + }, + { + "epoch": 2.8745874587458746, + "grad_norm": 0.8604146838188171, + "learning_rate": 4.9709384510038184e-05, + "loss": 0.0732, + "num_input_tokens_seen": 5513408, + "step": 26130 + }, + { + "epoch": 2.8751375137513753, + "grad_norm": 1.296219825744629, + "learning_rate": 4.970901950608786e-05, + "loss": 0.1355, + "num_input_tokens_seen": 5514464, + "step": 26135 + }, + { + "epoch": 2.8756875687568755, + "grad_norm": 0.42864885926246643, + "learning_rate": 4.970865427440636e-05, + "loss": 0.0646, + "num_input_tokens_seen": 5515520, + "step": 26140 + }, + { + "epoch": 2.876237623762376, + "grad_norm": 0.4450298845767975, + "learning_rate": 4.970828881499705e-05, + "loss": 0.0471, + "num_input_tokens_seen": 5516608, + "step": 26145 + }, + { + "epoch": 2.8767876787678768, + "grad_norm": 0.13822147250175476, + "learning_rate": 4.97079231278633e-05, + "loss": 0.0872, + "num_input_tokens_seen": 5517696, + "step": 26150 + }, + { + "epoch": 2.8773377337733774, + "grad_norm": 0.2399526834487915, + "learning_rate": 4.9707557213008485e-05, + "loss": 0.0379, + "num_input_tokens_seen": 5518752, + "step": 26155 + }, + { + "epoch": 2.877887788778878, + "grad_norm": 1.3552870750427246, + "learning_rate": 4.970719107043595e-05, + "loss": 0.134, + "num_input_tokens_seen": 5519776, + "step": 26160 + }, + { + "epoch": 2.8784378437843783, + "grad_norm": 0.1379285603761673, + "learning_rate": 4.9706824700149115e-05, + "loss": 0.0674, + "num_input_tokens_seen": 5520832, + "step": 26165 + }, + { + "epoch": 2.878987898789879, + "grad_norm": 0.32694879174232483, + "learning_rate": 4.970645810215132e-05, + "loss": 0.0311, + "num_input_tokens_seen": 5521888, + "step": 26170 + }, + { + "epoch": 2.8795379537953796, + "grad_norm": 0.5913417339324951, + "learning_rate": 4.970609127644596e-05, + "loss": 0.095, + "num_input_tokens_seen": 5522912, + "step": 26175 + }, + { + "epoch": 2.8800880088008802, + "grad_norm": 0.2111162543296814, + "learning_rate": 4.9705724223036406e-05, + "loss": 0.0436, + "num_input_tokens_seen": 5523904, + "step": 26180 + }, + { + "epoch": 2.880638063806381, + "grad_norm": 0.08336509764194489, + "learning_rate": 4.970535694192605e-05, + "loss": 0.0342, + "num_input_tokens_seen": 5524960, + "step": 26185 + }, + { + "epoch": 2.881188118811881, + "grad_norm": 0.24591386318206787, + "learning_rate": 4.970498943311827e-05, + "loss": 0.0193, + "num_input_tokens_seen": 5525984, + "step": 26190 + }, + { + "epoch": 2.8817381738173817, + "grad_norm": 0.037276919931173325, + "learning_rate": 4.970462169661646e-05, + "loss": 0.0405, + "num_input_tokens_seen": 5527072, + "step": 26195 + }, + { + "epoch": 2.8822882288228824, + "grad_norm": 0.023230955004692078, + "learning_rate": 4.970425373242401e-05, + "loss": 0.0234, + "num_input_tokens_seen": 5528064, + "step": 26200 + }, + { + "epoch": 2.8828382838283826, + "grad_norm": 1.2402769327163696, + "learning_rate": 4.97038855405443e-05, + "loss": 0.0372, + "num_input_tokens_seen": 5529120, + "step": 26205 + }, + { + "epoch": 2.8833883388338832, + "grad_norm": 0.05402589961886406, + "learning_rate": 4.970351712098074e-05, + "loss": 0.045, + "num_input_tokens_seen": 5530208, + "step": 26210 + }, + { + "epoch": 2.883938393839384, + "grad_norm": 0.5660664439201355, + "learning_rate": 4.970314847373672e-05, + "loss": 0.0412, + "num_input_tokens_seen": 5531232, + "step": 26215 + }, + { + "epoch": 2.8844884488448845, + "grad_norm": 0.09509351849555969, + "learning_rate": 4.970277959881562e-05, + "loss": 0.0346, + "num_input_tokens_seen": 5532352, + "step": 26220 + }, + { + "epoch": 2.885038503850385, + "grad_norm": 0.10176992416381836, + "learning_rate": 4.970241049622086e-05, + "loss": 0.0217, + "num_input_tokens_seen": 5533376, + "step": 26225 + }, + { + "epoch": 2.8855885588558854, + "grad_norm": 0.031840596348047256, + "learning_rate": 4.9702041165955837e-05, + "loss": 0.1167, + "num_input_tokens_seen": 5534464, + "step": 26230 + }, + { + "epoch": 2.886138613861386, + "grad_norm": 1.9745553731918335, + "learning_rate": 4.970167160802395e-05, + "loss": 0.157, + "num_input_tokens_seen": 5535552, + "step": 26235 + }, + { + "epoch": 2.8866886688668867, + "grad_norm": 0.12866701185703278, + "learning_rate": 4.970130182242861e-05, + "loss": 0.0727, + "num_input_tokens_seen": 5536576, + "step": 26240 + }, + { + "epoch": 2.8872387238723873, + "grad_norm": 0.06321003288030624, + "learning_rate": 4.9700931809173226e-05, + "loss": 0.0752, + "num_input_tokens_seen": 5537632, + "step": 26245 + }, + { + "epoch": 2.887788778877888, + "grad_norm": 0.6795685291290283, + "learning_rate": 4.970056156826121e-05, + "loss": 0.1301, + "num_input_tokens_seen": 5538688, + "step": 26250 + }, + { + "epoch": 2.888338833883388, + "grad_norm": 0.9934577941894531, + "learning_rate": 4.970019109969597e-05, + "loss": 0.0914, + "num_input_tokens_seen": 5539712, + "step": 26255 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.4529394805431366, + "learning_rate": 4.9699820403480915e-05, + "loss": 0.0663, + "num_input_tokens_seen": 5540800, + "step": 26260 + }, + { + "epoch": 2.8894389438943895, + "grad_norm": 0.014843137934803963, + "learning_rate": 4.969944947961947e-05, + "loss": 0.0558, + "num_input_tokens_seen": 5541888, + "step": 26265 + }, + { + "epoch": 2.88998899889989, + "grad_norm": 0.16127869486808777, + "learning_rate": 4.969907832811505e-05, + "loss": 0.052, + "num_input_tokens_seen": 5542976, + "step": 26270 + }, + { + "epoch": 2.890539053905391, + "grad_norm": 1.6210906505584717, + "learning_rate": 4.9698706948971074e-05, + "loss": 0.0756, + "num_input_tokens_seen": 5544032, + "step": 26275 + }, + { + "epoch": 2.891089108910891, + "grad_norm": 1.2607650756835938, + "learning_rate": 4.969833534219097e-05, + "loss": 0.0437, + "num_input_tokens_seen": 5545056, + "step": 26280 + }, + { + "epoch": 2.8916391639163916, + "grad_norm": 1.328313946723938, + "learning_rate": 4.969796350777817e-05, + "loss": 0.0681, + "num_input_tokens_seen": 5546112, + "step": 26285 + }, + { + "epoch": 2.8921892189218923, + "grad_norm": 0.8560015559196472, + "learning_rate": 4.969759144573608e-05, + "loss": 0.0819, + "num_input_tokens_seen": 5547200, + "step": 26290 + }, + { + "epoch": 2.8927392739273925, + "grad_norm": 0.10098792612552643, + "learning_rate": 4.969721915606814e-05, + "loss": 0.0571, + "num_input_tokens_seen": 5548256, + "step": 26295 + }, + { + "epoch": 2.893289328932893, + "grad_norm": 0.9017660021781921, + "learning_rate": 4.9696846638777796e-05, + "loss": 0.0674, + "num_input_tokens_seen": 5549344, + "step": 26300 + }, + { + "epoch": 2.893839383938394, + "grad_norm": 0.09838826209306717, + "learning_rate": 4.9696473893868455e-05, + "loss": 0.0639, + "num_input_tokens_seen": 5550368, + "step": 26305 + }, + { + "epoch": 2.8943894389438944, + "grad_norm": 0.20775575935840607, + "learning_rate": 4.969610092134357e-05, + "loss": 0.0537, + "num_input_tokens_seen": 5551360, + "step": 26310 + }, + { + "epoch": 2.894939493949395, + "grad_norm": 0.2493581622838974, + "learning_rate": 4.969572772120656e-05, + "loss": 0.0338, + "num_input_tokens_seen": 5552416, + "step": 26315 + }, + { + "epoch": 2.8954895489548953, + "grad_norm": 1.0673651695251465, + "learning_rate": 4.969535429346089e-05, + "loss": 0.1816, + "num_input_tokens_seen": 5553472, + "step": 26320 + }, + { + "epoch": 2.896039603960396, + "grad_norm": 0.06920870393514633, + "learning_rate": 4.969498063810999e-05, + "loss": 0.0393, + "num_input_tokens_seen": 5554464, + "step": 26325 + }, + { + "epoch": 2.8965896589658966, + "grad_norm": 0.31284213066101074, + "learning_rate": 4.96946067551573e-05, + "loss": 0.0538, + "num_input_tokens_seen": 5555456, + "step": 26330 + }, + { + "epoch": 2.8971397139713972, + "grad_norm": 0.16546352207660675, + "learning_rate": 4.969423264460627e-05, + "loss": 0.0513, + "num_input_tokens_seen": 5556448, + "step": 26335 + }, + { + "epoch": 2.897689768976898, + "grad_norm": 0.07825221121311188, + "learning_rate": 4.969385830646035e-05, + "loss": 0.0536, + "num_input_tokens_seen": 5557536, + "step": 26340 + }, + { + "epoch": 2.898239823982398, + "grad_norm": 0.10146958380937576, + "learning_rate": 4.969348374072298e-05, + "loss": 0.0487, + "num_input_tokens_seen": 5558592, + "step": 26345 + }, + { + "epoch": 2.8987898789878987, + "grad_norm": 0.11140080541372299, + "learning_rate": 4.9693108947397626e-05, + "loss": 0.032, + "num_input_tokens_seen": 5559680, + "step": 26350 + }, + { + "epoch": 2.8993399339933994, + "grad_norm": 0.028664959594607353, + "learning_rate": 4.969273392648773e-05, + "loss": 0.0961, + "num_input_tokens_seen": 5560800, + "step": 26355 + }, + { + "epoch": 2.8998899889989, + "grad_norm": 0.1498212218284607, + "learning_rate": 4.9692358677996766e-05, + "loss": 0.0151, + "num_input_tokens_seen": 5561856, + "step": 26360 + }, + { + "epoch": 2.9004400440044007, + "grad_norm": 0.05863005667924881, + "learning_rate": 4.9691983201928174e-05, + "loss": 0.0477, + "num_input_tokens_seen": 5562912, + "step": 26365 + }, + { + "epoch": 2.900990099009901, + "grad_norm": 0.32750651240348816, + "learning_rate": 4.9691607498285417e-05, + "loss": 0.1292, + "num_input_tokens_seen": 5564000, + "step": 26370 + }, + { + "epoch": 2.9015401540154016, + "grad_norm": 1.503659725189209, + "learning_rate": 4.9691231567071964e-05, + "loss": 0.0596, + "num_input_tokens_seen": 5564992, + "step": 26375 + }, + { + "epoch": 2.902090209020902, + "grad_norm": 0.6605795621871948, + "learning_rate": 4.969085540829128e-05, + "loss": 0.0355, + "num_input_tokens_seen": 5566048, + "step": 26380 + }, + { + "epoch": 2.9026402640264024, + "grad_norm": 1.0934842824935913, + "learning_rate": 4.9690479021946824e-05, + "loss": 0.1384, + "num_input_tokens_seen": 5567040, + "step": 26385 + }, + { + "epoch": 2.903190319031903, + "grad_norm": 0.07400981336832047, + "learning_rate": 4.969010240804207e-05, + "loss": 0.0176, + "num_input_tokens_seen": 5568160, + "step": 26390 + }, + { + "epoch": 2.9037403740374037, + "grad_norm": 0.14881110191345215, + "learning_rate": 4.9689725566580496e-05, + "loss": 0.1247, + "num_input_tokens_seen": 5569216, + "step": 26395 + }, + { + "epoch": 2.9042904290429044, + "grad_norm": 1.2342549562454224, + "learning_rate": 4.968934849756557e-05, + "loss": 0.0748, + "num_input_tokens_seen": 5570304, + "step": 26400 + }, + { + "epoch": 2.904840484048405, + "grad_norm": 0.7098202705383301, + "learning_rate": 4.9688971201000755e-05, + "loss": 0.1395, + "num_input_tokens_seen": 5571392, + "step": 26405 + }, + { + "epoch": 2.905390539053905, + "grad_norm": 0.03278685733675957, + "learning_rate": 4.968859367688955e-05, + "loss": 0.0084, + "num_input_tokens_seen": 5572416, + "step": 26410 + }, + { + "epoch": 2.905940594059406, + "grad_norm": 0.05137041211128235, + "learning_rate": 4.968821592523542e-05, + "loss": 0.046, + "num_input_tokens_seen": 5573632, + "step": 26415 + }, + { + "epoch": 2.9064906490649065, + "grad_norm": 0.15214641392230988, + "learning_rate": 4.9687837946041846e-05, + "loss": 0.0768, + "num_input_tokens_seen": 5574688, + "step": 26420 + }, + { + "epoch": 2.907040704070407, + "grad_norm": 0.0870063379406929, + "learning_rate": 4.968745973931233e-05, + "loss": 0.069, + "num_input_tokens_seen": 5575744, + "step": 26425 + }, + { + "epoch": 2.907590759075908, + "grad_norm": 0.24019289016723633, + "learning_rate": 4.968708130505033e-05, + "loss": 0.073, + "num_input_tokens_seen": 5576832, + "step": 26430 + }, + { + "epoch": 2.908140814081408, + "grad_norm": 0.4049709737300873, + "learning_rate": 4.968670264325935e-05, + "loss": 0.0225, + "num_input_tokens_seen": 5577888, + "step": 26435 + }, + { + "epoch": 2.9086908690869087, + "grad_norm": 0.4192514717578888, + "learning_rate": 4.968632375394289e-05, + "loss": 0.0261, + "num_input_tokens_seen": 5578944, + "step": 26440 + }, + { + "epoch": 2.9092409240924093, + "grad_norm": 0.30343252420425415, + "learning_rate": 4.9685944637104415e-05, + "loss": 0.0511, + "num_input_tokens_seen": 5580032, + "step": 26445 + }, + { + "epoch": 2.9097909790979095, + "grad_norm": 0.033924754709005356, + "learning_rate": 4.9685565292747436e-05, + "loss": 0.0367, + "num_input_tokens_seen": 5581056, + "step": 26450 + }, + { + "epoch": 2.9103410341034106, + "grad_norm": 0.34558501839637756, + "learning_rate": 4.968518572087545e-05, + "loss": 0.1127, + "num_input_tokens_seen": 5582208, + "step": 26455 + }, + { + "epoch": 2.910891089108911, + "grad_norm": 0.10896945744752884, + "learning_rate": 4.968480592149195e-05, + "loss": 0.0623, + "num_input_tokens_seen": 5583264, + "step": 26460 + }, + { + "epoch": 2.9114411441144115, + "grad_norm": 0.7835853695869446, + "learning_rate": 4.968442589460044e-05, + "loss": 0.0745, + "num_input_tokens_seen": 5584288, + "step": 26465 + }, + { + "epoch": 2.911991199119912, + "grad_norm": 0.1722385585308075, + "learning_rate": 4.9684045640204426e-05, + "loss": 0.0573, + "num_input_tokens_seen": 5585376, + "step": 26470 + }, + { + "epoch": 2.9125412541254123, + "grad_norm": 1.3316787481307983, + "learning_rate": 4.96836651583074e-05, + "loss": 0.0864, + "num_input_tokens_seen": 5586528, + "step": 26475 + }, + { + "epoch": 2.913091309130913, + "grad_norm": 0.11089414358139038, + "learning_rate": 4.968328444891288e-05, + "loss": 0.0255, + "num_input_tokens_seen": 5587616, + "step": 26480 + }, + { + "epoch": 2.9136413641364136, + "grad_norm": 0.23944194614887238, + "learning_rate": 4.968290351202437e-05, + "loss": 0.0396, + "num_input_tokens_seen": 5588640, + "step": 26485 + }, + { + "epoch": 2.9141914191419143, + "grad_norm": 0.24025572836399078, + "learning_rate": 4.9682522347645386e-05, + "loss": 0.0481, + "num_input_tokens_seen": 5589600, + "step": 26490 + }, + { + "epoch": 2.914741474147415, + "grad_norm": 3.56103253364563, + "learning_rate": 4.9682140955779434e-05, + "loss": 0.0528, + "num_input_tokens_seen": 5590624, + "step": 26495 + }, + { + "epoch": 2.915291529152915, + "grad_norm": 0.005596804898232222, + "learning_rate": 4.968175933643003e-05, + "loss": 0.083, + "num_input_tokens_seen": 5591712, + "step": 26500 + }, + { + "epoch": 2.9158415841584158, + "grad_norm": 0.7417492866516113, + "learning_rate": 4.96813774896007e-05, + "loss": 0.0241, + "num_input_tokens_seen": 5592768, + "step": 26505 + }, + { + "epoch": 2.9163916391639164, + "grad_norm": 0.020353252068161964, + "learning_rate": 4.9680995415294956e-05, + "loss": 0.0218, + "num_input_tokens_seen": 5593856, + "step": 26510 + }, + { + "epoch": 2.916941694169417, + "grad_norm": 1.5132853984832764, + "learning_rate": 4.9680613113516315e-05, + "loss": 0.1281, + "num_input_tokens_seen": 5594880, + "step": 26515 + }, + { + "epoch": 2.9174917491749177, + "grad_norm": 0.01823445037007332, + "learning_rate": 4.968023058426832e-05, + "loss": 0.091, + "num_input_tokens_seen": 5595904, + "step": 26520 + }, + { + "epoch": 2.918041804180418, + "grad_norm": 0.23733676970005035, + "learning_rate": 4.967984782755446e-05, + "loss": 0.164, + "num_input_tokens_seen": 5596960, + "step": 26525 + }, + { + "epoch": 2.9185918591859186, + "grad_norm": 0.07057017087936401, + "learning_rate": 4.96794648433783e-05, + "loss": 0.0244, + "num_input_tokens_seen": 5597984, + "step": 26530 + }, + { + "epoch": 2.919141914191419, + "grad_norm": 0.47605857253074646, + "learning_rate": 4.9679081631743354e-05, + "loss": 0.0234, + "num_input_tokens_seen": 5599072, + "step": 26535 + }, + { + "epoch": 2.9196919691969194, + "grad_norm": 0.8811553716659546, + "learning_rate": 4.967869819265315e-05, + "loss": 0.1043, + "num_input_tokens_seen": 5600128, + "step": 26540 + }, + { + "epoch": 2.9202420242024205, + "grad_norm": 0.11037174612283707, + "learning_rate": 4.967831452611124e-05, + "loss": 0.0376, + "num_input_tokens_seen": 5601152, + "step": 26545 + }, + { + "epoch": 2.9207920792079207, + "grad_norm": 0.05672647804021835, + "learning_rate": 4.967793063212113e-05, + "loss": 0.0989, + "num_input_tokens_seen": 5602208, + "step": 26550 + }, + { + "epoch": 2.9213421342134214, + "grad_norm": 1.079758644104004, + "learning_rate": 4.967754651068638e-05, + "loss": 0.1667, + "num_input_tokens_seen": 5603296, + "step": 26555 + }, + { + "epoch": 2.921892189218922, + "grad_norm": 0.24469049274921417, + "learning_rate": 4.967716216181052e-05, + "loss": 0.0918, + "num_input_tokens_seen": 5604320, + "step": 26560 + }, + { + "epoch": 2.9224422442244222, + "grad_norm": 1.3445740938186646, + "learning_rate": 4.967677758549711e-05, + "loss": 0.0643, + "num_input_tokens_seen": 5605408, + "step": 26565 + }, + { + "epoch": 2.922992299229923, + "grad_norm": 0.0670255795121193, + "learning_rate": 4.9676392781749674e-05, + "loss": 0.054, + "num_input_tokens_seen": 5606464, + "step": 26570 + }, + { + "epoch": 2.9235423542354235, + "grad_norm": 0.03859784081578255, + "learning_rate": 4.9676007750571774e-05, + "loss": 0.0079, + "num_input_tokens_seen": 5607520, + "step": 26575 + }, + { + "epoch": 2.924092409240924, + "grad_norm": 0.10268880426883698, + "learning_rate": 4.9675622491966944e-05, + "loss": 0.0317, + "num_input_tokens_seen": 5608576, + "step": 26580 + }, + { + "epoch": 2.924642464246425, + "grad_norm": 1.0372213125228882, + "learning_rate": 4.967523700593875e-05, + "loss": 0.041, + "num_input_tokens_seen": 5609632, + "step": 26585 + }, + { + "epoch": 2.925192519251925, + "grad_norm": 0.7089337706565857, + "learning_rate": 4.967485129249072e-05, + "loss": 0.0716, + "num_input_tokens_seen": 5610720, + "step": 26590 + }, + { + "epoch": 2.9257425742574257, + "grad_norm": 0.7142333388328552, + "learning_rate": 4.967446535162644e-05, + "loss": 0.0739, + "num_input_tokens_seen": 5611776, + "step": 26595 + }, + { + "epoch": 2.9262926292629263, + "grad_norm": 0.11111350357532501, + "learning_rate": 4.967407918334945e-05, + "loss": 0.0161, + "num_input_tokens_seen": 5612768, + "step": 26600 + }, + { + "epoch": 2.926842684268427, + "grad_norm": 0.1719740331172943, + "learning_rate": 4.9673692787663314e-05, + "loss": 0.0648, + "num_input_tokens_seen": 5613792, + "step": 26605 + }, + { + "epoch": 2.9273927392739276, + "grad_norm": 0.04277997463941574, + "learning_rate": 4.9673306164571584e-05, + "loss": 0.0789, + "num_input_tokens_seen": 5614816, + "step": 26610 + }, + { + "epoch": 2.927942794279428, + "grad_norm": 1.5663788318634033, + "learning_rate": 4.967291931407784e-05, + "loss": 0.1427, + "num_input_tokens_seen": 5615872, + "step": 26615 + }, + { + "epoch": 2.9284928492849285, + "grad_norm": 0.2947978079319, + "learning_rate": 4.9672532236185634e-05, + "loss": 0.0345, + "num_input_tokens_seen": 5616928, + "step": 26620 + }, + { + "epoch": 2.929042904290429, + "grad_norm": 0.32878682017326355, + "learning_rate": 4.9672144930898535e-05, + "loss": 0.0816, + "num_input_tokens_seen": 5617952, + "step": 26625 + }, + { + "epoch": 2.9295929592959293, + "grad_norm": 1.1052862405776978, + "learning_rate": 4.967175739822011e-05, + "loss": 0.1372, + "num_input_tokens_seen": 5619040, + "step": 26630 + }, + { + "epoch": 2.93014301430143, + "grad_norm": 0.18993599712848663, + "learning_rate": 4.967136963815394e-05, + "loss": 0.0453, + "num_input_tokens_seen": 5620128, + "step": 26635 + }, + { + "epoch": 2.9306930693069306, + "grad_norm": 0.2896038889884949, + "learning_rate": 4.96709816507036e-05, + "loss": 0.0285, + "num_input_tokens_seen": 5621184, + "step": 26640 + }, + { + "epoch": 2.9312431243124313, + "grad_norm": 0.06633894145488739, + "learning_rate": 4.967059343587266e-05, + "loss": 0.1094, + "num_input_tokens_seen": 5622336, + "step": 26645 + }, + { + "epoch": 2.931793179317932, + "grad_norm": 0.2308030128479004, + "learning_rate": 4.967020499366469e-05, + "loss": 0.0364, + "num_input_tokens_seen": 5623360, + "step": 26650 + }, + { + "epoch": 2.932343234323432, + "grad_norm": 0.10568951070308685, + "learning_rate": 4.966981632408328e-05, + "loss": 0.0852, + "num_input_tokens_seen": 5624448, + "step": 26655 + }, + { + "epoch": 2.932893289328933, + "grad_norm": 0.06611903756856918, + "learning_rate": 4.966942742713202e-05, + "loss": 0.02, + "num_input_tokens_seen": 5625536, + "step": 26660 + }, + { + "epoch": 2.9334433443344334, + "grad_norm": 0.061155833303928375, + "learning_rate": 4.966903830281449e-05, + "loss": 0.026, + "num_input_tokens_seen": 5626560, + "step": 26665 + }, + { + "epoch": 2.933993399339934, + "grad_norm": 0.04893150180578232, + "learning_rate": 4.966864895113426e-05, + "loss": 0.0735, + "num_input_tokens_seen": 5627520, + "step": 26670 + }, + { + "epoch": 2.9345434543454347, + "grad_norm": 0.0727972537279129, + "learning_rate": 4.966825937209493e-05, + "loss": 0.0433, + "num_input_tokens_seen": 5628640, + "step": 26675 + }, + { + "epoch": 2.935093509350935, + "grad_norm": 0.044043850153684616, + "learning_rate": 4.966786956570009e-05, + "loss": 0.1104, + "num_input_tokens_seen": 5629696, + "step": 26680 + }, + { + "epoch": 2.9356435643564356, + "grad_norm": 1.4650408029556274, + "learning_rate": 4.9667479531953334e-05, + "loss": 0.104, + "num_input_tokens_seen": 5630720, + "step": 26685 + }, + { + "epoch": 2.9361936193619362, + "grad_norm": 0.11829817295074463, + "learning_rate": 4.966708927085825e-05, + "loss": 0.0768, + "num_input_tokens_seen": 5631872, + "step": 26690 + }, + { + "epoch": 2.936743674367437, + "grad_norm": 0.4069957435131073, + "learning_rate": 4.9666698782418446e-05, + "loss": 0.0898, + "num_input_tokens_seen": 5632864, + "step": 26695 + }, + { + "epoch": 2.9372937293729375, + "grad_norm": 1.2022285461425781, + "learning_rate": 4.966630806663751e-05, + "loss": 0.1246, + "num_input_tokens_seen": 5633984, + "step": 26700 + }, + { + "epoch": 2.9378437843784377, + "grad_norm": 0.27548491954803467, + "learning_rate": 4.966591712351906e-05, + "loss": 0.1029, + "num_input_tokens_seen": 5635008, + "step": 26705 + }, + { + "epoch": 2.9383938393839384, + "grad_norm": 0.26346245408058167, + "learning_rate": 4.966552595306668e-05, + "loss": 0.042, + "num_input_tokens_seen": 5636064, + "step": 26710 + }, + { + "epoch": 2.938943894389439, + "grad_norm": 0.18776407837867737, + "learning_rate": 4.966513455528399e-05, + "loss": 0.0467, + "num_input_tokens_seen": 5637120, + "step": 26715 + }, + { + "epoch": 2.9394939493949392, + "grad_norm": 0.1683935970067978, + "learning_rate": 4.9664742930174585e-05, + "loss": 0.0198, + "num_input_tokens_seen": 5638176, + "step": 26720 + }, + { + "epoch": 2.94004400440044, + "grad_norm": 1.0820778608322144, + "learning_rate": 4.966435107774208e-05, + "loss": 0.0314, + "num_input_tokens_seen": 5639232, + "step": 26725 + }, + { + "epoch": 2.9405940594059405, + "grad_norm": 1.3843803405761719, + "learning_rate": 4.9663958997990084e-05, + "loss": 0.1451, + "num_input_tokens_seen": 5640288, + "step": 26730 + }, + { + "epoch": 2.941144114411441, + "grad_norm": 0.4297286570072174, + "learning_rate": 4.966356669092222e-05, + "loss": 0.0512, + "num_input_tokens_seen": 5641344, + "step": 26735 + }, + { + "epoch": 2.941694169416942, + "grad_norm": 0.06788501143455505, + "learning_rate": 4.9663174156542095e-05, + "loss": 0.0911, + "num_input_tokens_seen": 5642400, + "step": 26740 + }, + { + "epoch": 2.942244224422442, + "grad_norm": 0.04057222604751587, + "learning_rate": 4.966278139485332e-05, + "loss": 0.0794, + "num_input_tokens_seen": 5643456, + "step": 26745 + }, + { + "epoch": 2.9427942794279427, + "grad_norm": 0.11115086078643799, + "learning_rate": 4.966238840585953e-05, + "loss": 0.0311, + "num_input_tokens_seen": 5644448, + "step": 26750 + }, + { + "epoch": 2.9433443344334433, + "grad_norm": 0.6814870834350586, + "learning_rate": 4.966199518956433e-05, + "loss": 0.0598, + "num_input_tokens_seen": 5645536, + "step": 26755 + }, + { + "epoch": 2.943894389438944, + "grad_norm": 0.8901759386062622, + "learning_rate": 4.966160174597137e-05, + "loss": 0.1018, + "num_input_tokens_seen": 5646560, + "step": 26760 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 1.4312695264816284, + "learning_rate": 4.966120807508425e-05, + "loss": 0.1277, + "num_input_tokens_seen": 5647520, + "step": 26765 + }, + { + "epoch": 2.944994499449945, + "grad_norm": 0.33757951855659485, + "learning_rate": 4.9660814176906614e-05, + "loss": 0.1207, + "num_input_tokens_seen": 5648608, + "step": 26770 + }, + { + "epoch": 2.9455445544554455, + "grad_norm": 0.10019151121377945, + "learning_rate": 4.9660420051442084e-05, + "loss": 0.0321, + "num_input_tokens_seen": 5649664, + "step": 26775 + }, + { + "epoch": 2.946094609460946, + "grad_norm": 0.11312151700258255, + "learning_rate": 4.96600256986943e-05, + "loss": 0.0704, + "num_input_tokens_seen": 5650720, + "step": 26780 + }, + { + "epoch": 2.946644664466447, + "grad_norm": 0.8365734219551086, + "learning_rate": 4.965963111866689e-05, + "loss": 0.1292, + "num_input_tokens_seen": 5651776, + "step": 26785 + }, + { + "epoch": 2.9471947194719474, + "grad_norm": 0.5646108984947205, + "learning_rate": 4.965923631136349e-05, + "loss": 0.0842, + "num_input_tokens_seen": 5652768, + "step": 26790 + }, + { + "epoch": 2.9477447744774476, + "grad_norm": 1.12444269657135, + "learning_rate": 4.9658841276787746e-05, + "loss": 0.0611, + "num_input_tokens_seen": 5653856, + "step": 26795 + }, + { + "epoch": 2.9482948294829483, + "grad_norm": 0.8308984637260437, + "learning_rate": 4.9658446014943285e-05, + "loss": 0.0588, + "num_input_tokens_seen": 5654944, + "step": 26800 + }, + { + "epoch": 2.948844884488449, + "grad_norm": 0.03548278659582138, + "learning_rate": 4.965805052583377e-05, + "loss": 0.0812, + "num_input_tokens_seen": 5655968, + "step": 26805 + }, + { + "epoch": 2.949394939493949, + "grad_norm": 0.2641603946685791, + "learning_rate": 4.9657654809462826e-05, + "loss": 0.0348, + "num_input_tokens_seen": 5657056, + "step": 26810 + }, + { + "epoch": 2.94994499449945, + "grad_norm": 0.34708863496780396, + "learning_rate": 4.965725886583412e-05, + "loss": 0.0811, + "num_input_tokens_seen": 5658144, + "step": 26815 + }, + { + "epoch": 2.9504950495049505, + "grad_norm": 0.1110222190618515, + "learning_rate": 4.965686269495129e-05, + "loss": 0.0429, + "num_input_tokens_seen": 5659136, + "step": 26820 + }, + { + "epoch": 2.951045104510451, + "grad_norm": 1.2043386697769165, + "learning_rate": 4.9656466296817985e-05, + "loss": 0.1054, + "num_input_tokens_seen": 5660160, + "step": 26825 + }, + { + "epoch": 2.9515951595159517, + "grad_norm": 0.08321657031774521, + "learning_rate": 4.9656069671437856e-05, + "loss": 0.0584, + "num_input_tokens_seen": 5661184, + "step": 26830 + }, + { + "epoch": 2.952145214521452, + "grad_norm": 1.511614441871643, + "learning_rate": 4.965567281881457e-05, + "loss": 0.1664, + "num_input_tokens_seen": 5662240, + "step": 26835 + }, + { + "epoch": 2.9526952695269526, + "grad_norm": 0.2079167366027832, + "learning_rate": 4.965527573895178e-05, + "loss": 0.0785, + "num_input_tokens_seen": 5663328, + "step": 26840 + }, + { + "epoch": 2.9532453245324533, + "grad_norm": 0.897367537021637, + "learning_rate": 4.965487843185314e-05, + "loss": 0.1506, + "num_input_tokens_seen": 5664352, + "step": 26845 + }, + { + "epoch": 2.953795379537954, + "grad_norm": 0.07454477250576019, + "learning_rate": 4.9654480897522314e-05, + "loss": 0.0451, + "num_input_tokens_seen": 5665376, + "step": 26850 + }, + { + "epoch": 2.9543454345434546, + "grad_norm": 0.1732872724533081, + "learning_rate": 4.965408313596297e-05, + "loss": 0.0597, + "num_input_tokens_seen": 5666368, + "step": 26855 + }, + { + "epoch": 2.9548954895489548, + "grad_norm": 1.705388069152832, + "learning_rate": 4.965368514717877e-05, + "loss": 0.0302, + "num_input_tokens_seen": 5667424, + "step": 26860 + }, + { + "epoch": 2.9554455445544554, + "grad_norm": 0.06586379557847977, + "learning_rate": 4.9653286931173395e-05, + "loss": 0.0424, + "num_input_tokens_seen": 5668480, + "step": 26865 + }, + { + "epoch": 2.955995599559956, + "grad_norm": 0.6495676040649414, + "learning_rate": 4.9652888487950494e-05, + "loss": 0.0361, + "num_input_tokens_seen": 5669536, + "step": 26870 + }, + { + "epoch": 2.9565456545654567, + "grad_norm": 0.1466655284166336, + "learning_rate": 4.965248981751375e-05, + "loss": 0.1005, + "num_input_tokens_seen": 5670528, + "step": 26875 + }, + { + "epoch": 2.9570957095709574, + "grad_norm": 0.5606462955474854, + "learning_rate": 4.965209091986684e-05, + "loss": 0.0928, + "num_input_tokens_seen": 5671616, + "step": 26880 + }, + { + "epoch": 2.9576457645764576, + "grad_norm": 0.05670854076743126, + "learning_rate": 4.9651691795013435e-05, + "loss": 0.029, + "num_input_tokens_seen": 5672736, + "step": 26885 + }, + { + "epoch": 2.958195819581958, + "grad_norm": 0.14585189521312714, + "learning_rate": 4.9651292442957216e-05, + "loss": 0.0224, + "num_input_tokens_seen": 5673760, + "step": 26890 + }, + { + "epoch": 2.958745874587459, + "grad_norm": 0.19167937338352203, + "learning_rate": 4.9650892863701865e-05, + "loss": 0.025, + "num_input_tokens_seen": 5674784, + "step": 26895 + }, + { + "epoch": 2.959295929592959, + "grad_norm": 0.2108403444290161, + "learning_rate": 4.9650493057251054e-05, + "loss": 0.0577, + "num_input_tokens_seen": 5675808, + "step": 26900 + }, + { + "epoch": 2.9598459845984597, + "grad_norm": 0.3949767053127289, + "learning_rate": 4.965009302360849e-05, + "loss": 0.1089, + "num_input_tokens_seen": 5676864, + "step": 26905 + }, + { + "epoch": 2.9603960396039604, + "grad_norm": 0.16016915440559387, + "learning_rate": 4.9649692762777846e-05, + "loss": 0.0555, + "num_input_tokens_seen": 5677952, + "step": 26910 + }, + { + "epoch": 2.960946094609461, + "grad_norm": 0.5280354619026184, + "learning_rate": 4.96492922747628e-05, + "loss": 0.0437, + "num_input_tokens_seen": 5679008, + "step": 26915 + }, + { + "epoch": 2.9614961496149617, + "grad_norm": 0.23407618701457977, + "learning_rate": 4.964889155956707e-05, + "loss": 0.0182, + "num_input_tokens_seen": 5680064, + "step": 26920 + }, + { + "epoch": 2.962046204620462, + "grad_norm": 0.14232103526592255, + "learning_rate": 4.964849061719432e-05, + "loss": 0.0543, + "num_input_tokens_seen": 5681088, + "step": 26925 + }, + { + "epoch": 2.9625962596259625, + "grad_norm": 0.14585281908512115, + "learning_rate": 4.964808944764827e-05, + "loss": 0.1465, + "num_input_tokens_seen": 5682080, + "step": 26930 + }, + { + "epoch": 2.963146314631463, + "grad_norm": 0.7096620202064514, + "learning_rate": 4.964768805093261e-05, + "loss": 0.0462, + "num_input_tokens_seen": 5683200, + "step": 26935 + }, + { + "epoch": 2.963696369636964, + "grad_norm": 0.13576821982860565, + "learning_rate": 4.9647286427051023e-05, + "loss": 0.0178, + "num_input_tokens_seen": 5684288, + "step": 26940 + }, + { + "epoch": 2.9642464246424645, + "grad_norm": 0.03640870377421379, + "learning_rate": 4.964688457600723e-05, + "loss": 0.0242, + "num_input_tokens_seen": 5685344, + "step": 26945 + }, + { + "epoch": 2.9647964796479647, + "grad_norm": 0.27503880858421326, + "learning_rate": 4.964648249780493e-05, + "loss": 0.1076, + "num_input_tokens_seen": 5686432, + "step": 26950 + }, + { + "epoch": 2.9653465346534653, + "grad_norm": 0.1903766691684723, + "learning_rate": 4.964608019244783e-05, + "loss": 0.0886, + "num_input_tokens_seen": 5687488, + "step": 26955 + }, + { + "epoch": 2.965896589658966, + "grad_norm": 0.15169550478458405, + "learning_rate": 4.964567765993963e-05, + "loss": 0.0729, + "num_input_tokens_seen": 5688512, + "step": 26960 + }, + { + "epoch": 2.966446644664466, + "grad_norm": 0.6697326898574829, + "learning_rate": 4.9645274900284045e-05, + "loss": 0.0412, + "num_input_tokens_seen": 5689568, + "step": 26965 + }, + { + "epoch": 2.9669966996699673, + "grad_norm": 0.658597469329834, + "learning_rate": 4.964487191348479e-05, + "loss": 0.0581, + "num_input_tokens_seen": 5690624, + "step": 26970 + }, + { + "epoch": 2.9675467546754675, + "grad_norm": 0.3108977973461151, + "learning_rate": 4.964446869954558e-05, + "loss": 0.0372, + "num_input_tokens_seen": 5691680, + "step": 26975 + }, + { + "epoch": 2.968096809680968, + "grad_norm": 0.006267583463340998, + "learning_rate": 4.964406525847012e-05, + "loss": 0.0426, + "num_input_tokens_seen": 5692800, + "step": 26980 + }, + { + "epoch": 2.9686468646864688, + "grad_norm": 0.9889190196990967, + "learning_rate": 4.964366159026213e-05, + "loss": 0.1096, + "num_input_tokens_seen": 5693792, + "step": 26985 + }, + { + "epoch": 2.969196919691969, + "grad_norm": 0.9434036016464233, + "learning_rate": 4.964325769492535e-05, + "loss": 0.0369, + "num_input_tokens_seen": 5694816, + "step": 26990 + }, + { + "epoch": 2.9697469746974696, + "grad_norm": 2.0386619567871094, + "learning_rate": 4.964285357246348e-05, + "loss": 0.0996, + "num_input_tokens_seen": 5695840, + "step": 26995 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.07803234457969666, + "learning_rate": 4.964244922288026e-05, + "loss": 0.1279, + "num_input_tokens_seen": 5696896, + "step": 27000 + }, + { + "epoch": 2.970847084708471, + "grad_norm": 0.03929632157087326, + "learning_rate": 4.964204464617941e-05, + "loss": 0.0484, + "num_input_tokens_seen": 5697984, + "step": 27005 + }, + { + "epoch": 2.9713971397139716, + "grad_norm": 0.7531137466430664, + "learning_rate": 4.964163984236465e-05, + "loss": 0.0855, + "num_input_tokens_seen": 5698944, + "step": 27010 + }, + { + "epoch": 2.9719471947194718, + "grad_norm": 0.6148043870925903, + "learning_rate": 4.964123481143973e-05, + "loss": 0.0351, + "num_input_tokens_seen": 5699968, + "step": 27015 + }, + { + "epoch": 2.9724972497249724, + "grad_norm": 0.08297079801559448, + "learning_rate": 4.9640829553408366e-05, + "loss": 0.0579, + "num_input_tokens_seen": 5700960, + "step": 27020 + }, + { + "epoch": 2.973047304730473, + "grad_norm": 0.2046501338481903, + "learning_rate": 4.9640424068274304e-05, + "loss": 0.0642, + "num_input_tokens_seen": 5701984, + "step": 27025 + }, + { + "epoch": 2.9735973597359737, + "grad_norm": 0.1333741396665573, + "learning_rate": 4.9640018356041275e-05, + "loss": 0.0509, + "num_input_tokens_seen": 5703072, + "step": 27030 + }, + { + "epoch": 2.9741474147414744, + "grad_norm": 0.16763924062252045, + "learning_rate": 4.963961241671302e-05, + "loss": 0.1226, + "num_input_tokens_seen": 5704064, + "step": 27035 + }, + { + "epoch": 2.9746974697469746, + "grad_norm": 1.0558319091796875, + "learning_rate": 4.963920625029328e-05, + "loss": 0.0903, + "num_input_tokens_seen": 5705088, + "step": 27040 + }, + { + "epoch": 2.9752475247524752, + "grad_norm": 0.14234644174575806, + "learning_rate": 4.963879985678581e-05, + "loss": 0.0744, + "num_input_tokens_seen": 5706176, + "step": 27045 + }, + { + "epoch": 2.975797579757976, + "grad_norm": 0.07239079475402832, + "learning_rate": 4.963839323619433e-05, + "loss": 0.1576, + "num_input_tokens_seen": 5707264, + "step": 27050 + }, + { + "epoch": 2.976347634763476, + "grad_norm": 1.5464404821395874, + "learning_rate": 4.9637986388522605e-05, + "loss": 0.0775, + "num_input_tokens_seen": 5708256, + "step": 27055 + }, + { + "epoch": 2.976897689768977, + "grad_norm": 0.2022341638803482, + "learning_rate": 4.963757931377439e-05, + "loss": 0.0792, + "num_input_tokens_seen": 5709344, + "step": 27060 + }, + { + "epoch": 2.9774477447744774, + "grad_norm": 0.42303386330604553, + "learning_rate": 4.963717201195342e-05, + "loss": 0.0426, + "num_input_tokens_seen": 5710336, + "step": 27065 + }, + { + "epoch": 2.977997799779978, + "grad_norm": 1.5873026847839355, + "learning_rate": 4.963676448306346e-05, + "loss": 0.0898, + "num_input_tokens_seen": 5711360, + "step": 27070 + }, + { + "epoch": 2.9785478547854787, + "grad_norm": 0.7974417209625244, + "learning_rate": 4.9636356727108265e-05, + "loss": 0.0404, + "num_input_tokens_seen": 5712416, + "step": 27075 + }, + { + "epoch": 2.979097909790979, + "grad_norm": 0.9729480743408203, + "learning_rate": 4.9635948744091585e-05, + "loss": 0.1112, + "num_input_tokens_seen": 5713408, + "step": 27080 + }, + { + "epoch": 2.9796479647964795, + "grad_norm": 0.6417766213417053, + "learning_rate": 4.96355405340172e-05, + "loss": 0.021, + "num_input_tokens_seen": 5714400, + "step": 27085 + }, + { + "epoch": 2.98019801980198, + "grad_norm": 0.04866637662053108, + "learning_rate": 4.963513209688885e-05, + "loss": 0.1546, + "num_input_tokens_seen": 5715424, + "step": 27090 + }, + { + "epoch": 2.980748074807481, + "grad_norm": 1.013150930404663, + "learning_rate": 4.963472343271031e-05, + "loss": 0.1493, + "num_input_tokens_seen": 5716512, + "step": 27095 + }, + { + "epoch": 2.9812981298129815, + "grad_norm": 0.029505938291549683, + "learning_rate": 4.9634314541485346e-05, + "loss": 0.0388, + "num_input_tokens_seen": 5717536, + "step": 27100 + }, + { + "epoch": 2.9818481848184817, + "grad_norm": 0.9999393820762634, + "learning_rate": 4.963390542321773e-05, + "loss": 0.0531, + "num_input_tokens_seen": 5718656, + "step": 27105 + }, + { + "epoch": 2.9823982398239823, + "grad_norm": 0.05845151096582413, + "learning_rate": 4.9633496077911216e-05, + "loss": 0.0328, + "num_input_tokens_seen": 5719744, + "step": 27110 + }, + { + "epoch": 2.982948294829483, + "grad_norm": 0.06870125979185104, + "learning_rate": 4.96330865055696e-05, + "loss": 0.0933, + "num_input_tokens_seen": 5720800, + "step": 27115 + }, + { + "epoch": 2.9834983498349836, + "grad_norm": 0.11458390951156616, + "learning_rate": 4.963267670619664e-05, + "loss": 0.0881, + "num_input_tokens_seen": 5721824, + "step": 27120 + }, + { + "epoch": 2.9840484048404843, + "grad_norm": 0.22616729140281677, + "learning_rate": 4.963226667979612e-05, + "loss": 0.0901, + "num_input_tokens_seen": 5722944, + "step": 27125 + }, + { + "epoch": 2.9845984598459845, + "grad_norm": 0.12494955956935883, + "learning_rate": 4.9631856426371816e-05, + "loss": 0.0119, + "num_input_tokens_seen": 5724032, + "step": 27130 + }, + { + "epoch": 2.985148514851485, + "grad_norm": 0.07382932305335999, + "learning_rate": 4.963144594592751e-05, + "loss": 0.0493, + "num_input_tokens_seen": 5725056, + "step": 27135 + }, + { + "epoch": 2.985698569856986, + "grad_norm": 0.5918796062469482, + "learning_rate": 4.9631035238467e-05, + "loss": 0.0784, + "num_input_tokens_seen": 5726080, + "step": 27140 + }, + { + "epoch": 2.986248624862486, + "grad_norm": 0.9851495623588562, + "learning_rate": 4.9630624303994044e-05, + "loss": 0.0344, + "num_input_tokens_seen": 5727136, + "step": 27145 + }, + { + "epoch": 2.9867986798679866, + "grad_norm": 0.5001277327537537, + "learning_rate": 4.9630213142512447e-05, + "loss": 0.0544, + "num_input_tokens_seen": 5728160, + "step": 27150 + }, + { + "epoch": 2.9873487348734873, + "grad_norm": 0.16762644052505493, + "learning_rate": 4.9629801754025985e-05, + "loss": 0.0463, + "num_input_tokens_seen": 5729184, + "step": 27155 + }, + { + "epoch": 2.987898789878988, + "grad_norm": 0.9627442955970764, + "learning_rate": 4.962939013853847e-05, + "loss": 0.1245, + "num_input_tokens_seen": 5730208, + "step": 27160 + }, + { + "epoch": 2.9884488448844886, + "grad_norm": 0.3104441165924072, + "learning_rate": 4.962897829605368e-05, + "loss": 0.0516, + "num_input_tokens_seen": 5731232, + "step": 27165 + }, + { + "epoch": 2.988998899889989, + "grad_norm": 0.6225072145462036, + "learning_rate": 4.962856622657541e-05, + "loss": 0.0452, + "num_input_tokens_seen": 5732288, + "step": 27170 + }, + { + "epoch": 2.9895489548954894, + "grad_norm": 0.33897319436073303, + "learning_rate": 4.962815393010747e-05, + "loss": 0.038, + "num_input_tokens_seen": 5733344, + "step": 27175 + }, + { + "epoch": 2.99009900990099, + "grad_norm": 0.4688735604286194, + "learning_rate": 4.962774140665366e-05, + "loss": 0.0685, + "num_input_tokens_seen": 5734336, + "step": 27180 + }, + { + "epoch": 2.9906490649064907, + "grad_norm": 0.6921869516372681, + "learning_rate": 4.9627328656217764e-05, + "loss": 0.0558, + "num_input_tokens_seen": 5735424, + "step": 27185 + }, + { + "epoch": 2.9911991199119914, + "grad_norm": 1.0764859914779663, + "learning_rate": 4.9626915678803597e-05, + "loss": 0.0607, + "num_input_tokens_seen": 5736512, + "step": 27190 + }, + { + "epoch": 2.9917491749174916, + "grad_norm": 0.16234949231147766, + "learning_rate": 4.962650247441497e-05, + "loss": 0.0357, + "num_input_tokens_seen": 5737536, + "step": 27195 + }, + { + "epoch": 2.9922992299229922, + "grad_norm": 0.5403395891189575, + "learning_rate": 4.9626089043055687e-05, + "loss": 0.1342, + "num_input_tokens_seen": 5738592, + "step": 27200 + }, + { + "epoch": 2.992849284928493, + "grad_norm": 0.02160225808620453, + "learning_rate": 4.9625675384729554e-05, + "loss": 0.0418, + "num_input_tokens_seen": 5739648, + "step": 27205 + }, + { + "epoch": 2.9933993399339935, + "grad_norm": 0.06440620124340057, + "learning_rate": 4.962526149944039e-05, + "loss": 0.1384, + "num_input_tokens_seen": 5740640, + "step": 27210 + }, + { + "epoch": 2.993949394939494, + "grad_norm": 0.11703934520483017, + "learning_rate": 4.962484738719201e-05, + "loss": 0.031, + "num_input_tokens_seen": 5741728, + "step": 27215 + }, + { + "epoch": 2.9944994499449944, + "grad_norm": 0.21670810878276825, + "learning_rate": 4.9624433047988214e-05, + "loss": 0.0253, + "num_input_tokens_seen": 5742784, + "step": 27220 + }, + { + "epoch": 2.995049504950495, + "grad_norm": 0.3262707591056824, + "learning_rate": 4.962401848183285e-05, + "loss": 0.078, + "num_input_tokens_seen": 5743872, + "step": 27225 + }, + { + "epoch": 2.9955995599559957, + "grad_norm": 0.47775837779045105, + "learning_rate": 4.962360368872972e-05, + "loss": 0.0525, + "num_input_tokens_seen": 5744928, + "step": 27230 + }, + { + "epoch": 2.996149614961496, + "grad_norm": 0.6552488207817078, + "learning_rate": 4.962318866868265e-05, + "loss": 0.0404, + "num_input_tokens_seen": 5745984, + "step": 27235 + }, + { + "epoch": 2.9966996699669965, + "grad_norm": 0.09694567322731018, + "learning_rate": 4.962277342169546e-05, + "loss": 0.0307, + "num_input_tokens_seen": 5747040, + "step": 27240 + }, + { + "epoch": 2.997249724972497, + "grad_norm": 0.37906426191329956, + "learning_rate": 4.962235794777199e-05, + "loss": 0.0662, + "num_input_tokens_seen": 5748064, + "step": 27245 + }, + { + "epoch": 2.997799779977998, + "grad_norm": 0.17142826318740845, + "learning_rate": 4.962194224691606e-05, + "loss": 0.0255, + "num_input_tokens_seen": 5749120, + "step": 27250 + }, + { + "epoch": 2.9983498349834985, + "grad_norm": 0.49419865012168884, + "learning_rate": 4.96215263191315e-05, + "loss": 0.0804, + "num_input_tokens_seen": 5750176, + "step": 27255 + }, + { + "epoch": 2.9988998899889987, + "grad_norm": 0.7147830724716187, + "learning_rate": 4.9621110164422145e-05, + "loss": 0.0567, + "num_input_tokens_seen": 5751232, + "step": 27260 + }, + { + "epoch": 2.9994499449944994, + "grad_norm": 0.6725450158119202, + "learning_rate": 4.962069378279184e-05, + "loss": 0.0275, + "num_input_tokens_seen": 5752256, + "step": 27265 + }, + { + "epoch": 3.0, + "grad_norm": 0.08383341133594513, + "learning_rate": 4.962027717424441e-05, + "loss": 0.0395, + "num_input_tokens_seen": 5753152, + "step": 27270 + }, + { + "epoch": 3.0, + "eval_loss": 0.07973591983318329, + "eval_runtime": 37.0358, + "eval_samples_per_second": 109.084, + "eval_steps_per_second": 27.271, + "num_input_tokens_seen": 5753152, + "step": 27270 + }, + { + "epoch": 3.0005500550055006, + "grad_norm": 0.03480329364538193, + "learning_rate": 4.9619860338783695e-05, + "loss": 0.0387, + "num_input_tokens_seen": 5754176, + "step": 27275 + }, + { + "epoch": 3.0011001100110013, + "grad_norm": 0.8185605406761169, + "learning_rate": 4.961944327641355e-05, + "loss": 0.0607, + "num_input_tokens_seen": 5755232, + "step": 27280 + }, + { + "epoch": 3.0016501650165015, + "grad_norm": 0.18465100228786469, + "learning_rate": 4.96190259871378e-05, + "loss": 0.0206, + "num_input_tokens_seen": 5756384, + "step": 27285 + }, + { + "epoch": 3.002200220022002, + "grad_norm": 0.24760845303535461, + "learning_rate": 4.96186084709603e-05, + "loss": 0.034, + "num_input_tokens_seen": 5757504, + "step": 27290 + }, + { + "epoch": 3.002750275027503, + "grad_norm": 0.5417699813842773, + "learning_rate": 4.961819072788491e-05, + "loss": 0.0389, + "num_input_tokens_seen": 5758624, + "step": 27295 + }, + { + "epoch": 3.0033003300330035, + "grad_norm": 0.1270134001970291, + "learning_rate": 4.961777275791546e-05, + "loss": 0.0144, + "num_input_tokens_seen": 5759712, + "step": 27300 + }, + { + "epoch": 3.0038503850385037, + "grad_norm": 0.06555131822824478, + "learning_rate": 4.961735456105582e-05, + "loss": 0.1227, + "num_input_tokens_seen": 5760768, + "step": 27305 + }, + { + "epoch": 3.0044004400440043, + "grad_norm": 0.0301112812012434, + "learning_rate": 4.961693613730983e-05, + "loss": 0.0155, + "num_input_tokens_seen": 5761920, + "step": 27310 + }, + { + "epoch": 3.004950495049505, + "grad_norm": 0.03642262518405914, + "learning_rate": 4.961651748668135e-05, + "loss": 0.118, + "num_input_tokens_seen": 5763008, + "step": 27315 + }, + { + "epoch": 3.0055005500550056, + "grad_norm": 0.07001281529664993, + "learning_rate": 4.9616098609174246e-05, + "loss": 0.0444, + "num_input_tokens_seen": 5764096, + "step": 27320 + }, + { + "epoch": 3.0060506050605063, + "grad_norm": 1.9014490842819214, + "learning_rate": 4.9615679504792364e-05, + "loss": 0.1804, + "num_input_tokens_seen": 5765216, + "step": 27325 + }, + { + "epoch": 3.0066006600660065, + "grad_norm": 0.11930253356695175, + "learning_rate": 4.961526017353958e-05, + "loss": 0.0067, + "num_input_tokens_seen": 5766336, + "step": 27330 + }, + { + "epoch": 3.007150715071507, + "grad_norm": 0.7095158696174622, + "learning_rate": 4.961484061541976e-05, + "loss": 0.107, + "num_input_tokens_seen": 5767360, + "step": 27335 + }, + { + "epoch": 3.0077007700770078, + "grad_norm": 0.17984305322170258, + "learning_rate": 4.961442083043676e-05, + "loss": 0.0982, + "num_input_tokens_seen": 5768480, + "step": 27340 + }, + { + "epoch": 3.0082508250825084, + "grad_norm": 0.519156277179718, + "learning_rate": 4.9614000818594456e-05, + "loss": 0.0932, + "num_input_tokens_seen": 5769504, + "step": 27345 + }, + { + "epoch": 3.0088008800880086, + "grad_norm": 0.5302866101264954, + "learning_rate": 4.961358057989672e-05, + "loss": 0.0372, + "num_input_tokens_seen": 5770624, + "step": 27350 + }, + { + "epoch": 3.0093509350935093, + "grad_norm": 0.19628868997097015, + "learning_rate": 4.961316011434742e-05, + "loss": 0.1357, + "num_input_tokens_seen": 5771648, + "step": 27355 + }, + { + "epoch": 3.00990099009901, + "grad_norm": 0.11687330156564713, + "learning_rate": 4.9612739421950425e-05, + "loss": 0.1483, + "num_input_tokens_seen": 5772672, + "step": 27360 + }, + { + "epoch": 3.0104510451045106, + "grad_norm": 0.04219627007842064, + "learning_rate": 4.961231850270963e-05, + "loss": 0.1967, + "num_input_tokens_seen": 5773696, + "step": 27365 + }, + { + "epoch": 3.011001100110011, + "grad_norm": 0.5042234063148499, + "learning_rate": 4.9611897356628915e-05, + "loss": 0.1124, + "num_input_tokens_seen": 5774848, + "step": 27370 + }, + { + "epoch": 3.0115511551155114, + "grad_norm": 0.3140057623386383, + "learning_rate": 4.961147598371214e-05, + "loss": 0.0186, + "num_input_tokens_seen": 5775936, + "step": 27375 + }, + { + "epoch": 3.012101210121012, + "grad_norm": 0.10701008886098862, + "learning_rate": 4.96110543839632e-05, + "loss": 0.0303, + "num_input_tokens_seen": 5776960, + "step": 27380 + }, + { + "epoch": 3.0126512651265127, + "grad_norm": 0.038866762071847916, + "learning_rate": 4.961063255738598e-05, + "loss": 0.1004, + "num_input_tokens_seen": 5777984, + "step": 27385 + }, + { + "epoch": 3.0132013201320134, + "grad_norm": 0.21404694020748138, + "learning_rate": 4.9610210503984375e-05, + "loss": 0.0692, + "num_input_tokens_seen": 5779040, + "step": 27390 + }, + { + "epoch": 3.0137513751375136, + "grad_norm": 0.30189597606658936, + "learning_rate": 4.960978822376227e-05, + "loss": 0.1187, + "num_input_tokens_seen": 5780128, + "step": 27395 + }, + { + "epoch": 3.014301430143014, + "grad_norm": 0.2926478385925293, + "learning_rate": 4.9609365716723546e-05, + "loss": 0.0383, + "num_input_tokens_seen": 5781216, + "step": 27400 + }, + { + "epoch": 3.014851485148515, + "grad_norm": 0.27448445558547974, + "learning_rate": 4.960894298287212e-05, + "loss": 0.057, + "num_input_tokens_seen": 5782336, + "step": 27405 + }, + { + "epoch": 3.0154015401540155, + "grad_norm": 0.2705551087856293, + "learning_rate": 4.9608520022211866e-05, + "loss": 0.0548, + "num_input_tokens_seen": 5783360, + "step": 27410 + }, + { + "epoch": 3.015951595159516, + "grad_norm": 0.3693429231643677, + "learning_rate": 4.96080968347467e-05, + "loss": 0.0262, + "num_input_tokens_seen": 5784352, + "step": 27415 + }, + { + "epoch": 3.0165016501650164, + "grad_norm": 0.39821702241897583, + "learning_rate": 4.9607673420480507e-05, + "loss": 0.0332, + "num_input_tokens_seen": 5785440, + "step": 27420 + }, + { + "epoch": 3.017051705170517, + "grad_norm": 1.7641810178756714, + "learning_rate": 4.960724977941719e-05, + "loss": 0.1057, + "num_input_tokens_seen": 5786528, + "step": 27425 + }, + { + "epoch": 3.0176017601760177, + "grad_norm": 0.0545242577791214, + "learning_rate": 4.9606825911560665e-05, + "loss": 0.0235, + "num_input_tokens_seen": 5787584, + "step": 27430 + }, + { + "epoch": 3.0181518151815183, + "grad_norm": 0.043354421854019165, + "learning_rate": 4.960640181691484e-05, + "loss": 0.0347, + "num_input_tokens_seen": 5788704, + "step": 27435 + }, + { + "epoch": 3.0187018701870185, + "grad_norm": 0.5041795969009399, + "learning_rate": 4.960597749548361e-05, + "loss": 0.1231, + "num_input_tokens_seen": 5789760, + "step": 27440 + }, + { + "epoch": 3.019251925192519, + "grad_norm": 0.08721218258142471, + "learning_rate": 4.9605552947270894e-05, + "loss": 0.1666, + "num_input_tokens_seen": 5790784, + "step": 27445 + }, + { + "epoch": 3.01980198019802, + "grad_norm": 0.1541200429201126, + "learning_rate": 4.96051281722806e-05, + "loss": 0.094, + "num_input_tokens_seen": 5791808, + "step": 27450 + }, + { + "epoch": 3.0203520352035205, + "grad_norm": 0.01422492042183876, + "learning_rate": 4.960470317051665e-05, + "loss": 0.0523, + "num_input_tokens_seen": 5792896, + "step": 27455 + }, + { + "epoch": 3.020902090209021, + "grad_norm": 0.3133222460746765, + "learning_rate": 4.9604277941982955e-05, + "loss": 0.0319, + "num_input_tokens_seen": 5793888, + "step": 27460 + }, + { + "epoch": 3.0214521452145213, + "grad_norm": 0.0494631826877594, + "learning_rate": 4.960385248668344e-05, + "loss": 0.1036, + "num_input_tokens_seen": 5794976, + "step": 27465 + }, + { + "epoch": 3.022002200220022, + "grad_norm": 0.3186737298965454, + "learning_rate": 4.960342680462202e-05, + "loss": 0.031, + "num_input_tokens_seen": 5795968, + "step": 27470 + }, + { + "epoch": 3.0225522552255226, + "grad_norm": 0.1475069522857666, + "learning_rate": 4.9603000895802623e-05, + "loss": 0.1098, + "num_input_tokens_seen": 5797056, + "step": 27475 + }, + { + "epoch": 3.0231023102310233, + "grad_norm": 0.6528930068016052, + "learning_rate": 4.960257476022917e-05, + "loss": 0.0556, + "num_input_tokens_seen": 5798080, + "step": 27480 + }, + { + "epoch": 3.0236523652365235, + "grad_norm": 0.05026673898100853, + "learning_rate": 4.960214839790559e-05, + "loss": 0.0312, + "num_input_tokens_seen": 5799104, + "step": 27485 + }, + { + "epoch": 3.024202420242024, + "grad_norm": 0.5743812918663025, + "learning_rate": 4.9601721808835814e-05, + "loss": 0.0335, + "num_input_tokens_seen": 5800160, + "step": 27490 + }, + { + "epoch": 3.0247524752475248, + "grad_norm": 0.04693243280053139, + "learning_rate": 4.960129499302377e-05, + "loss": 0.0542, + "num_input_tokens_seen": 5801280, + "step": 27495 + }, + { + "epoch": 3.0253025302530254, + "grad_norm": 0.026974469423294067, + "learning_rate": 4.960086795047341e-05, + "loss": 0.0625, + "num_input_tokens_seen": 5802336, + "step": 27500 + }, + { + "epoch": 3.0258525852585256, + "grad_norm": 0.28989657759666443, + "learning_rate": 4.960044068118864e-05, + "loss": 0.0068, + "num_input_tokens_seen": 5803456, + "step": 27505 + }, + { + "epoch": 3.0264026402640263, + "grad_norm": 0.8999572396278381, + "learning_rate": 4.960001318517342e-05, + "loss": 0.0748, + "num_input_tokens_seen": 5804480, + "step": 27510 + }, + { + "epoch": 3.026952695269527, + "grad_norm": 1.0836904048919678, + "learning_rate": 4.959958546243167e-05, + "loss": 0.0841, + "num_input_tokens_seen": 5805536, + "step": 27515 + }, + { + "epoch": 3.0275027502750276, + "grad_norm": 0.609768271446228, + "learning_rate": 4.959915751296736e-05, + "loss": 0.0777, + "num_input_tokens_seen": 5806560, + "step": 27520 + }, + { + "epoch": 3.0280528052805282, + "grad_norm": 0.04910701885819435, + "learning_rate": 4.959872933678441e-05, + "loss": 0.0245, + "num_input_tokens_seen": 5807584, + "step": 27525 + }, + { + "epoch": 3.0286028602860284, + "grad_norm": 0.15542317926883698, + "learning_rate": 4.959830093388678e-05, + "loss": 0.0428, + "num_input_tokens_seen": 5808672, + "step": 27530 + }, + { + "epoch": 3.029152915291529, + "grad_norm": 1.269457221031189, + "learning_rate": 4.959787230427842e-05, + "loss": 0.0286, + "num_input_tokens_seen": 5809728, + "step": 27535 + }, + { + "epoch": 3.0297029702970297, + "grad_norm": 0.08460573852062225, + "learning_rate": 4.959744344796326e-05, + "loss": 0.1415, + "num_input_tokens_seen": 5810720, + "step": 27540 + }, + { + "epoch": 3.0302530253025304, + "grad_norm": 0.2821883261203766, + "learning_rate": 4.959701436494527e-05, + "loss": 0.1026, + "num_input_tokens_seen": 5811808, + "step": 27545 + }, + { + "epoch": 3.0308030803080306, + "grad_norm": 0.24630486965179443, + "learning_rate": 4.959658505522841e-05, + "loss": 0.0537, + "num_input_tokens_seen": 5812864, + "step": 27550 + }, + { + "epoch": 3.0313531353135312, + "grad_norm": 0.3439083397388458, + "learning_rate": 4.959615551881662e-05, + "loss": 0.0346, + "num_input_tokens_seen": 5813856, + "step": 27555 + }, + { + "epoch": 3.031903190319032, + "grad_norm": 0.9465562105178833, + "learning_rate": 4.9595725755713877e-05, + "loss": 0.0529, + "num_input_tokens_seen": 5815008, + "step": 27560 + }, + { + "epoch": 3.0324532453245325, + "grad_norm": 0.5371885895729065, + "learning_rate": 4.9595295765924124e-05, + "loss": 0.0479, + "num_input_tokens_seen": 5816160, + "step": 27565 + }, + { + "epoch": 3.033003300330033, + "grad_norm": 0.21020977199077606, + "learning_rate": 4.959486554945133e-05, + "loss": 0.1833, + "num_input_tokens_seen": 5817280, + "step": 27570 + }, + { + "epoch": 3.0335533553355334, + "grad_norm": 0.11506211757659912, + "learning_rate": 4.959443510629947e-05, + "loss": 0.0655, + "num_input_tokens_seen": 5818304, + "step": 27575 + }, + { + "epoch": 3.034103410341034, + "grad_norm": 0.061046041548252106, + "learning_rate": 4.959400443647249e-05, + "loss": 0.0395, + "num_input_tokens_seen": 5819328, + "step": 27580 + }, + { + "epoch": 3.0346534653465347, + "grad_norm": 0.053307365626096725, + "learning_rate": 4.959357353997438e-05, + "loss": 0.0915, + "num_input_tokens_seen": 5820384, + "step": 27585 + }, + { + "epoch": 3.0352035203520353, + "grad_norm": 0.8606882691383362, + "learning_rate": 4.9593142416809104e-05, + "loss": 0.0821, + "num_input_tokens_seen": 5821440, + "step": 27590 + }, + { + "epoch": 3.0357535753575355, + "grad_norm": 0.33140090107917786, + "learning_rate": 4.9592711066980635e-05, + "loss": 0.062, + "num_input_tokens_seen": 5822560, + "step": 27595 + }, + { + "epoch": 3.036303630363036, + "grad_norm": 1.6835020780563354, + "learning_rate": 4.959227949049295e-05, + "loss": 0.2575, + "num_input_tokens_seen": 5823680, + "step": 27600 + }, + { + "epoch": 3.036853685368537, + "grad_norm": 0.4055185616016388, + "learning_rate": 4.959184768735002e-05, + "loss": 0.049, + "num_input_tokens_seen": 5824736, + "step": 27605 + }, + { + "epoch": 3.0374037403740375, + "grad_norm": 1.3322862386703491, + "learning_rate": 4.959141565755584e-05, + "loss": 0.0901, + "num_input_tokens_seen": 5825792, + "step": 27610 + }, + { + "epoch": 3.037953795379538, + "grad_norm": 0.01621525175869465, + "learning_rate": 4.959098340111438e-05, + "loss": 0.0468, + "num_input_tokens_seen": 5826816, + "step": 27615 + }, + { + "epoch": 3.0385038503850383, + "grad_norm": 0.1820191591978073, + "learning_rate": 4.959055091802962e-05, + "loss": 0.0095, + "num_input_tokens_seen": 5827904, + "step": 27620 + }, + { + "epoch": 3.039053905390539, + "grad_norm": 0.6479875445365906, + "learning_rate": 4.9590118208305555e-05, + "loss": 0.0444, + "num_input_tokens_seen": 5828928, + "step": 27625 + }, + { + "epoch": 3.0396039603960396, + "grad_norm": 0.08295833319425583, + "learning_rate": 4.9589685271946165e-05, + "loss": 0.068, + "num_input_tokens_seen": 5829952, + "step": 27630 + }, + { + "epoch": 3.0401540154015403, + "grad_norm": 0.10398060083389282, + "learning_rate": 4.958925210895545e-05, + "loss": 0.1055, + "num_input_tokens_seen": 5831072, + "step": 27635 + }, + { + "epoch": 3.0407040704070405, + "grad_norm": 0.6243674159049988, + "learning_rate": 4.95888187193374e-05, + "loss": 0.034, + "num_input_tokens_seen": 5832128, + "step": 27640 + }, + { + "epoch": 3.041254125412541, + "grad_norm": 0.19006085395812988, + "learning_rate": 4.9588385103096005e-05, + "loss": 0.024, + "num_input_tokens_seen": 5833248, + "step": 27645 + }, + { + "epoch": 3.041804180418042, + "grad_norm": 0.2680676579475403, + "learning_rate": 4.958795126023526e-05, + "loss": 0.1324, + "num_input_tokens_seen": 5834304, + "step": 27650 + }, + { + "epoch": 3.0423542354235424, + "grad_norm": 0.20645619928836823, + "learning_rate": 4.958751719075917e-05, + "loss": 0.0314, + "num_input_tokens_seen": 5835328, + "step": 27655 + }, + { + "epoch": 3.042904290429043, + "grad_norm": 0.6154187321662903, + "learning_rate": 4.958708289467173e-05, + "loss": 0.1862, + "num_input_tokens_seen": 5836352, + "step": 27660 + }, + { + "epoch": 3.0434543454345433, + "grad_norm": 0.4051264524459839, + "learning_rate": 4.9586648371976947e-05, + "loss": 0.1177, + "num_input_tokens_seen": 5837440, + "step": 27665 + }, + { + "epoch": 3.044004400440044, + "grad_norm": 1.4641780853271484, + "learning_rate": 4.958621362267882e-05, + "loss": 0.0729, + "num_input_tokens_seen": 5838432, + "step": 27670 + }, + { + "epoch": 3.0445544554455446, + "grad_norm": 0.4511278569698334, + "learning_rate": 4.9585778646781364e-05, + "loss": 0.1184, + "num_input_tokens_seen": 5839520, + "step": 27675 + }, + { + "epoch": 3.0451045104510452, + "grad_norm": 0.15984418988227844, + "learning_rate": 4.958534344428858e-05, + "loss": 0.0395, + "num_input_tokens_seen": 5840576, + "step": 27680 + }, + { + "epoch": 3.0456545654565454, + "grad_norm": 0.04369412362575531, + "learning_rate": 4.958490801520449e-05, + "loss": 0.0494, + "num_input_tokens_seen": 5841568, + "step": 27685 + }, + { + "epoch": 3.046204620462046, + "grad_norm": 0.8956900238990784, + "learning_rate": 4.95844723595331e-05, + "loss": 0.0397, + "num_input_tokens_seen": 5842624, + "step": 27690 + }, + { + "epoch": 3.0467546754675467, + "grad_norm": 0.6212251782417297, + "learning_rate": 4.958403647727841e-05, + "loss": 0.1071, + "num_input_tokens_seen": 5843712, + "step": 27695 + }, + { + "epoch": 3.0473047304730474, + "grad_norm": 0.3808050751686096, + "learning_rate": 4.958360036844446e-05, + "loss": 0.0613, + "num_input_tokens_seen": 5844832, + "step": 27700 + }, + { + "epoch": 3.047854785478548, + "grad_norm": 0.18678772449493408, + "learning_rate": 4.9583164033035266e-05, + "loss": 0.2558, + "num_input_tokens_seen": 5845952, + "step": 27705 + }, + { + "epoch": 3.0484048404840483, + "grad_norm": 0.850342869758606, + "learning_rate": 4.9582727471054845e-05, + "loss": 0.0985, + "num_input_tokens_seen": 5846976, + "step": 27710 + }, + { + "epoch": 3.048954895489549, + "grad_norm": 0.030850384384393692, + "learning_rate": 4.958229068250721e-05, + "loss": 0.023, + "num_input_tokens_seen": 5848032, + "step": 27715 + }, + { + "epoch": 3.0495049504950495, + "grad_norm": 0.6842910647392273, + "learning_rate": 4.9581853667396414e-05, + "loss": 0.0436, + "num_input_tokens_seen": 5849120, + "step": 27720 + }, + { + "epoch": 3.05005500550055, + "grad_norm": 0.4370766580104828, + "learning_rate": 4.9581416425726454e-05, + "loss": 0.1164, + "num_input_tokens_seen": 5850112, + "step": 27725 + }, + { + "epoch": 3.0506050605060504, + "grad_norm": 0.05117131397128105, + "learning_rate": 4.958097895750139e-05, + "loss": 0.025, + "num_input_tokens_seen": 5851200, + "step": 27730 + }, + { + "epoch": 3.051155115511551, + "grad_norm": 0.10673848539590836, + "learning_rate": 4.958054126272522e-05, + "loss": 0.0741, + "num_input_tokens_seen": 5852224, + "step": 27735 + }, + { + "epoch": 3.0517051705170517, + "grad_norm": 0.3290109634399414, + "learning_rate": 4.958010334140201e-05, + "loss": 0.0789, + "num_input_tokens_seen": 5853280, + "step": 27740 + }, + { + "epoch": 3.0522552255225524, + "grad_norm": 0.033927466720342636, + "learning_rate": 4.957966519353577e-05, + "loss": 0.0455, + "num_input_tokens_seen": 5854368, + "step": 27745 + }, + { + "epoch": 3.052805280528053, + "grad_norm": 0.1433836966753006, + "learning_rate": 4.957922681913056e-05, + "loss": 0.0659, + "num_input_tokens_seen": 5855392, + "step": 27750 + }, + { + "epoch": 3.053355335533553, + "grad_norm": 1.6494296789169312, + "learning_rate": 4.9578788218190416e-05, + "loss": 0.0642, + "num_input_tokens_seen": 5856384, + "step": 27755 + }, + { + "epoch": 3.053905390539054, + "grad_norm": 0.47160789370536804, + "learning_rate": 4.9578349390719365e-05, + "loss": 0.0253, + "num_input_tokens_seen": 5857408, + "step": 27760 + }, + { + "epoch": 3.0544554455445545, + "grad_norm": 0.5204247236251831, + "learning_rate": 4.957791033672146e-05, + "loss": 0.0558, + "num_input_tokens_seen": 5858464, + "step": 27765 + }, + { + "epoch": 3.055005500550055, + "grad_norm": 0.21756340563297272, + "learning_rate": 4.957747105620075e-05, + "loss": 0.0865, + "num_input_tokens_seen": 5859520, + "step": 27770 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.41575342416763306, + "learning_rate": 4.957703154916129e-05, + "loss": 0.0624, + "num_input_tokens_seen": 5860576, + "step": 27775 + }, + { + "epoch": 3.056105610561056, + "grad_norm": 0.558371901512146, + "learning_rate": 4.957659181560712e-05, + "loss": 0.0916, + "num_input_tokens_seen": 5861632, + "step": 27780 + }, + { + "epoch": 3.0566556655665567, + "grad_norm": 0.23972035944461823, + "learning_rate": 4.9576151855542295e-05, + "loss": 0.0199, + "num_input_tokens_seen": 5862656, + "step": 27785 + }, + { + "epoch": 3.0572057205720573, + "grad_norm": 0.07832790911197662, + "learning_rate": 4.957571166897087e-05, + "loss": 0.0693, + "num_input_tokens_seen": 5863712, + "step": 27790 + }, + { + "epoch": 3.057755775577558, + "grad_norm": 1.5260944366455078, + "learning_rate": 4.957527125589691e-05, + "loss": 0.125, + "num_input_tokens_seen": 5864704, + "step": 27795 + }, + { + "epoch": 3.058305830583058, + "grad_norm": 0.1592337191104889, + "learning_rate": 4.957483061632445e-05, + "loss": 0.1394, + "num_input_tokens_seen": 5865728, + "step": 27800 + }, + { + "epoch": 3.058855885588559, + "grad_norm": 0.8792175054550171, + "learning_rate": 4.957438975025759e-05, + "loss": 0.0721, + "num_input_tokens_seen": 5866752, + "step": 27805 + }, + { + "epoch": 3.0594059405940595, + "grad_norm": 1.8749946355819702, + "learning_rate": 4.9573948657700356e-05, + "loss": 0.0791, + "num_input_tokens_seen": 5867808, + "step": 27810 + }, + { + "epoch": 3.05995599559956, + "grad_norm": 0.4021093547344208, + "learning_rate": 4.957350733865683e-05, + "loss": 0.038, + "num_input_tokens_seen": 5868832, + "step": 27815 + }, + { + "epoch": 3.0605060506050603, + "grad_norm": 0.5783820748329163, + "learning_rate": 4.957306579313109e-05, + "loss": 0.0332, + "num_input_tokens_seen": 5869856, + "step": 27820 + }, + { + "epoch": 3.061056105610561, + "grad_norm": 0.3136327564716339, + "learning_rate": 4.957262402112718e-05, + "loss": 0.0396, + "num_input_tokens_seen": 5870944, + "step": 27825 + }, + { + "epoch": 3.0616061606160616, + "grad_norm": 0.5036725997924805, + "learning_rate": 4.95721820226492e-05, + "loss": 0.0775, + "num_input_tokens_seen": 5872000, + "step": 27830 + }, + { + "epoch": 3.0621562156215623, + "grad_norm": 1.1245856285095215, + "learning_rate": 4.95717397977012e-05, + "loss": 0.1335, + "num_input_tokens_seen": 5873120, + "step": 27835 + }, + { + "epoch": 3.062706270627063, + "grad_norm": 0.18587137758731842, + "learning_rate": 4.957129734628726e-05, + "loss": 0.0131, + "num_input_tokens_seen": 5874144, + "step": 27840 + }, + { + "epoch": 3.063256325632563, + "grad_norm": 0.12305611371994019, + "learning_rate": 4.957085466841147e-05, + "loss": 0.0949, + "num_input_tokens_seen": 5875232, + "step": 27845 + }, + { + "epoch": 3.0638063806380638, + "grad_norm": 0.6712906360626221, + "learning_rate": 4.957041176407791e-05, + "loss": 0.0761, + "num_input_tokens_seen": 5876256, + "step": 27850 + }, + { + "epoch": 3.0643564356435644, + "grad_norm": 0.17951957881450653, + "learning_rate": 4.956996863329064e-05, + "loss": 0.0323, + "num_input_tokens_seen": 5877312, + "step": 27855 + }, + { + "epoch": 3.064906490649065, + "grad_norm": 0.9904127717018127, + "learning_rate": 4.9569525276053766e-05, + "loss": 0.0847, + "num_input_tokens_seen": 5878336, + "step": 27860 + }, + { + "epoch": 3.0654565456545653, + "grad_norm": 0.780198872089386, + "learning_rate": 4.9569081692371366e-05, + "loss": 0.0838, + "num_input_tokens_seen": 5879424, + "step": 27865 + }, + { + "epoch": 3.066006600660066, + "grad_norm": 0.020557861775159836, + "learning_rate": 4.956863788224753e-05, + "loss": 0.0069, + "num_input_tokens_seen": 5880512, + "step": 27870 + }, + { + "epoch": 3.0665566556655666, + "grad_norm": 0.48450154066085815, + "learning_rate": 4.956819384568635e-05, + "loss": 0.067, + "num_input_tokens_seen": 5881504, + "step": 27875 + }, + { + "epoch": 3.067106710671067, + "grad_norm": 1.1242942810058594, + "learning_rate": 4.956774958269191e-05, + "loss": 0.1717, + "num_input_tokens_seen": 5882496, + "step": 27880 + }, + { + "epoch": 3.067656765676568, + "grad_norm": 0.40859365463256836, + "learning_rate": 4.956730509326832e-05, + "loss": 0.0345, + "num_input_tokens_seen": 5883584, + "step": 27885 + }, + { + "epoch": 3.068206820682068, + "grad_norm": 0.0703565776348114, + "learning_rate": 4.956686037741966e-05, + "loss": 0.0189, + "num_input_tokens_seen": 5884672, + "step": 27890 + }, + { + "epoch": 3.0687568756875687, + "grad_norm": 0.18381571769714355, + "learning_rate": 4.956641543515004e-05, + "loss": 0.1615, + "num_input_tokens_seen": 5885696, + "step": 27895 + }, + { + "epoch": 3.0693069306930694, + "grad_norm": 1.844146728515625, + "learning_rate": 4.956597026646356e-05, + "loss": 0.027, + "num_input_tokens_seen": 5886784, + "step": 27900 + }, + { + "epoch": 3.06985698569857, + "grad_norm": 0.1374485343694687, + "learning_rate": 4.956552487136431e-05, + "loss": 0.0335, + "num_input_tokens_seen": 5887872, + "step": 27905 + }, + { + "epoch": 3.0704070407040702, + "grad_norm": 0.11179434508085251, + "learning_rate": 4.956507924985641e-05, + "loss": 0.0365, + "num_input_tokens_seen": 5888896, + "step": 27910 + }, + { + "epoch": 3.070957095709571, + "grad_norm": 0.0863257497549057, + "learning_rate": 4.956463340194396e-05, + "loss": 0.0544, + "num_input_tokens_seen": 5889888, + "step": 27915 + }, + { + "epoch": 3.0715071507150715, + "grad_norm": 1.2131942510604858, + "learning_rate": 4.956418732763107e-05, + "loss": 0.0735, + "num_input_tokens_seen": 5890912, + "step": 27920 + }, + { + "epoch": 3.072057205720572, + "grad_norm": 0.08782430738210678, + "learning_rate": 4.9563741026921865e-05, + "loss": 0.0191, + "num_input_tokens_seen": 5892000, + "step": 27925 + }, + { + "epoch": 3.072607260726073, + "grad_norm": 0.03733550384640694, + "learning_rate": 4.956329449982043e-05, + "loss": 0.0756, + "num_input_tokens_seen": 5893024, + "step": 27930 + }, + { + "epoch": 3.073157315731573, + "grad_norm": 0.3907138407230377, + "learning_rate": 4.956284774633091e-05, + "loss": 0.1937, + "num_input_tokens_seen": 5894048, + "step": 27935 + }, + { + "epoch": 3.0737073707370737, + "grad_norm": 0.532471776008606, + "learning_rate": 4.95624007664574e-05, + "loss": 0.0393, + "num_input_tokens_seen": 5895104, + "step": 27940 + }, + { + "epoch": 3.0742574257425743, + "grad_norm": 1.0242843627929688, + "learning_rate": 4.956195356020403e-05, + "loss": 0.147, + "num_input_tokens_seen": 5896096, + "step": 27945 + }, + { + "epoch": 3.074807480748075, + "grad_norm": 0.1521298736333847, + "learning_rate": 4.956150612757492e-05, + "loss": 0.0353, + "num_input_tokens_seen": 5897088, + "step": 27950 + }, + { + "epoch": 3.075357535753575, + "grad_norm": 0.7255919575691223, + "learning_rate": 4.9561058468574195e-05, + "loss": 0.0884, + "num_input_tokens_seen": 5898208, + "step": 27955 + }, + { + "epoch": 3.075907590759076, + "grad_norm": 0.12242353707551956, + "learning_rate": 4.956061058320598e-05, + "loss": 0.0516, + "num_input_tokens_seen": 5899264, + "step": 27960 + }, + { + "epoch": 3.0764576457645765, + "grad_norm": 1.7745001316070557, + "learning_rate": 4.9560162471474405e-05, + "loss": 0.0752, + "num_input_tokens_seen": 5900352, + "step": 27965 + }, + { + "epoch": 3.077007700770077, + "grad_norm": 0.03310103714466095, + "learning_rate": 4.955971413338359e-05, + "loss": 0.0374, + "num_input_tokens_seen": 5901376, + "step": 27970 + }, + { + "epoch": 3.0775577557755778, + "grad_norm": 0.13459555804729462, + "learning_rate": 4.955926556893768e-05, + "loss": 0.0425, + "num_input_tokens_seen": 5902400, + "step": 27975 + }, + { + "epoch": 3.078107810781078, + "grad_norm": 0.007889961823821068, + "learning_rate": 4.9558816778140804e-05, + "loss": 0.0474, + "num_input_tokens_seen": 5903488, + "step": 27980 + }, + { + "epoch": 3.0786578657865786, + "grad_norm": 0.07765969634056091, + "learning_rate": 4.955836776099709e-05, + "loss": 0.0678, + "num_input_tokens_seen": 5904576, + "step": 27985 + }, + { + "epoch": 3.0792079207920793, + "grad_norm": 0.27581000328063965, + "learning_rate": 4.9557918517510694e-05, + "loss": 0.0565, + "num_input_tokens_seen": 5905600, + "step": 27990 + }, + { + "epoch": 3.07975797579758, + "grad_norm": 0.19555526971817017, + "learning_rate": 4.955746904768574e-05, + "loss": 0.0167, + "num_input_tokens_seen": 5906560, + "step": 27995 + }, + { + "epoch": 3.08030803080308, + "grad_norm": 0.14051318168640137, + "learning_rate": 4.955701935152639e-05, + "loss": 0.0592, + "num_input_tokens_seen": 5907552, + "step": 28000 + }, + { + "epoch": 3.080858085808581, + "grad_norm": 0.16485492885112762, + "learning_rate": 4.955656942903677e-05, + "loss": 0.0452, + "num_input_tokens_seen": 5908640, + "step": 28005 + }, + { + "epoch": 3.0814081408140814, + "grad_norm": 0.08030997961759567, + "learning_rate": 4.955611928022103e-05, + "loss": 0.0275, + "num_input_tokens_seen": 5909760, + "step": 28010 + }, + { + "epoch": 3.081958195819582, + "grad_norm": 0.30050116777420044, + "learning_rate": 4.9555668905083317e-05, + "loss": 0.0886, + "num_input_tokens_seen": 5910880, + "step": 28015 + }, + { + "epoch": 3.0825082508250823, + "grad_norm": 0.07599478960037231, + "learning_rate": 4.9555218303627795e-05, + "loss": 0.013, + "num_input_tokens_seen": 5911904, + "step": 28020 + }, + { + "epoch": 3.083058305830583, + "grad_norm": 0.7588995099067688, + "learning_rate": 4.955476747585861e-05, + "loss": 0.0802, + "num_input_tokens_seen": 5912960, + "step": 28025 + }, + { + "epoch": 3.0836083608360836, + "grad_norm": 0.41379135847091675, + "learning_rate": 4.955431642177991e-05, + "loss": 0.057, + "num_input_tokens_seen": 5914016, + "step": 28030 + }, + { + "epoch": 3.0841584158415842, + "grad_norm": 0.45486563444137573, + "learning_rate": 4.955386514139586e-05, + "loss": 0.0306, + "num_input_tokens_seen": 5915136, + "step": 28035 + }, + { + "epoch": 3.084708470847085, + "grad_norm": 0.04919989034533501, + "learning_rate": 4.9553413634710615e-05, + "loss": 0.1282, + "num_input_tokens_seen": 5916192, + "step": 28040 + }, + { + "epoch": 3.085258525852585, + "grad_norm": 0.6715981960296631, + "learning_rate": 4.955296190172834e-05, + "loss": 0.055, + "num_input_tokens_seen": 5917280, + "step": 28045 + }, + { + "epoch": 3.0858085808580857, + "grad_norm": 0.07448241114616394, + "learning_rate": 4.9552509942453196e-05, + "loss": 0.0649, + "num_input_tokens_seen": 5918400, + "step": 28050 + }, + { + "epoch": 3.0863586358635864, + "grad_norm": 0.6041737794876099, + "learning_rate": 4.9552057756889355e-05, + "loss": 0.1205, + "num_input_tokens_seen": 5919424, + "step": 28055 + }, + { + "epoch": 3.086908690869087, + "grad_norm": 0.18620504438877106, + "learning_rate": 4.9551605345040974e-05, + "loss": 0.023, + "num_input_tokens_seen": 5920480, + "step": 28060 + }, + { + "epoch": 3.0874587458745877, + "grad_norm": 0.08730890601873398, + "learning_rate": 4.955115270691223e-05, + "loss": 0.0315, + "num_input_tokens_seen": 5921536, + "step": 28065 + }, + { + "epoch": 3.088008800880088, + "grad_norm": 1.5651881694793701, + "learning_rate": 4.955069984250729e-05, + "loss": 0.1124, + "num_input_tokens_seen": 5922656, + "step": 28070 + }, + { + "epoch": 3.0885588558855885, + "grad_norm": 0.23288585245609283, + "learning_rate": 4.955024675183033e-05, + "loss": 0.02, + "num_input_tokens_seen": 5923712, + "step": 28075 + }, + { + "epoch": 3.089108910891089, + "grad_norm": 1.1244924068450928, + "learning_rate": 4.9549793434885525e-05, + "loss": 0.0578, + "num_input_tokens_seen": 5924768, + "step": 28080 + }, + { + "epoch": 3.08965896589659, + "grad_norm": 1.058410882949829, + "learning_rate": 4.9549339891677056e-05, + "loss": 0.0406, + "num_input_tokens_seen": 5925792, + "step": 28085 + }, + { + "epoch": 3.09020902090209, + "grad_norm": 0.10356007516384125, + "learning_rate": 4.95488861222091e-05, + "loss": 0.0286, + "num_input_tokens_seen": 5926816, + "step": 28090 + }, + { + "epoch": 3.0907590759075907, + "grad_norm": 0.23397910594940186, + "learning_rate": 4.954843212648584e-05, + "loss": 0.1401, + "num_input_tokens_seen": 5927808, + "step": 28095 + }, + { + "epoch": 3.0913091309130913, + "grad_norm": 0.0906284749507904, + "learning_rate": 4.954797790451146e-05, + "loss": 0.024, + "num_input_tokens_seen": 5928928, + "step": 28100 + }, + { + "epoch": 3.091859185918592, + "grad_norm": 0.6263504028320312, + "learning_rate": 4.9547523456290146e-05, + "loss": 0.0526, + "num_input_tokens_seen": 5930016, + "step": 28105 + }, + { + "epoch": 3.092409240924092, + "grad_norm": 1.0375888347625732, + "learning_rate": 4.954706878182609e-05, + "loss": 0.0572, + "num_input_tokens_seen": 5931040, + "step": 28110 + }, + { + "epoch": 3.092959295929593, + "grad_norm": 0.6537671089172363, + "learning_rate": 4.954661388112348e-05, + "loss": 0.0363, + "num_input_tokens_seen": 5932096, + "step": 28115 + }, + { + "epoch": 3.0935093509350935, + "grad_norm": 0.4553493559360504, + "learning_rate": 4.954615875418651e-05, + "loss": 0.0767, + "num_input_tokens_seen": 5933120, + "step": 28120 + }, + { + "epoch": 3.094059405940594, + "grad_norm": 0.512245237827301, + "learning_rate": 4.954570340101937e-05, + "loss": 0.082, + "num_input_tokens_seen": 5934208, + "step": 28125 + }, + { + "epoch": 3.094609460946095, + "grad_norm": 0.5318773984909058, + "learning_rate": 4.9545247821626265e-05, + "loss": 0.1086, + "num_input_tokens_seen": 5935232, + "step": 28130 + }, + { + "epoch": 3.095159515951595, + "grad_norm": 1.6274827718734741, + "learning_rate": 4.954479201601138e-05, + "loss": 0.1296, + "num_input_tokens_seen": 5936288, + "step": 28135 + }, + { + "epoch": 3.0957095709570956, + "grad_norm": 0.08857816457748413, + "learning_rate": 4.9544335984178933e-05, + "loss": 0.0844, + "num_input_tokens_seen": 5937312, + "step": 28140 + }, + { + "epoch": 3.0962596259625963, + "grad_norm": 1.1450717449188232, + "learning_rate": 4.954387972613311e-05, + "loss": 0.1931, + "num_input_tokens_seen": 5938336, + "step": 28145 + }, + { + "epoch": 3.096809680968097, + "grad_norm": 0.04224705323576927, + "learning_rate": 4.954342324187814e-05, + "loss": 0.1005, + "num_input_tokens_seen": 5939392, + "step": 28150 + }, + { + "epoch": 3.097359735973597, + "grad_norm": 0.1285633146762848, + "learning_rate": 4.95429665314182e-05, + "loss": 0.035, + "num_input_tokens_seen": 5940448, + "step": 28155 + }, + { + "epoch": 3.097909790979098, + "grad_norm": 0.5173105001449585, + "learning_rate": 4.9542509594757526e-05, + "loss": 0.0798, + "num_input_tokens_seen": 5941440, + "step": 28160 + }, + { + "epoch": 3.0984598459845984, + "grad_norm": 0.7033933401107788, + "learning_rate": 4.954205243190031e-05, + "loss": 0.1201, + "num_input_tokens_seen": 5942496, + "step": 28165 + }, + { + "epoch": 3.099009900990099, + "grad_norm": 1.2558271884918213, + "learning_rate": 4.9541595042850766e-05, + "loss": 0.203, + "num_input_tokens_seen": 5943616, + "step": 28170 + }, + { + "epoch": 3.0995599559955997, + "grad_norm": 0.09383974969387054, + "learning_rate": 4.9541137427613125e-05, + "loss": 0.0103, + "num_input_tokens_seen": 5944736, + "step": 28175 + }, + { + "epoch": 3.1001100110011, + "grad_norm": 0.08565175533294678, + "learning_rate": 4.9540679586191605e-05, + "loss": 0.0251, + "num_input_tokens_seen": 5945824, + "step": 28180 + }, + { + "epoch": 3.1006600660066006, + "grad_norm": 0.13285087049007416, + "learning_rate": 4.95402215185904e-05, + "loss": 0.0232, + "num_input_tokens_seen": 5946880, + "step": 28185 + }, + { + "epoch": 3.1012101210121013, + "grad_norm": 0.48992079496383667, + "learning_rate": 4.953976322481376e-05, + "loss": 0.0458, + "num_input_tokens_seen": 5947936, + "step": 28190 + }, + { + "epoch": 3.101760176017602, + "grad_norm": 0.306907057762146, + "learning_rate": 4.953930470486589e-05, + "loss": 0.0374, + "num_input_tokens_seen": 5948896, + "step": 28195 + }, + { + "epoch": 3.102310231023102, + "grad_norm": 0.14460739493370056, + "learning_rate": 4.9538845958751034e-05, + "loss": 0.0962, + "num_input_tokens_seen": 5949984, + "step": 28200 + }, + { + "epoch": 3.1028602860286028, + "grad_norm": 0.011136055923998356, + "learning_rate": 4.95383869864734e-05, + "loss": 0.0176, + "num_input_tokens_seen": 5951104, + "step": 28205 + }, + { + "epoch": 3.1034103410341034, + "grad_norm": 0.10175364464521408, + "learning_rate": 4.9537927788037244e-05, + "loss": 0.0581, + "num_input_tokens_seen": 5952128, + "step": 28210 + }, + { + "epoch": 3.103960396039604, + "grad_norm": 0.530182421207428, + "learning_rate": 4.9537468363446764e-05, + "loss": 0.0948, + "num_input_tokens_seen": 5953184, + "step": 28215 + }, + { + "epoch": 3.1045104510451047, + "grad_norm": 0.244759663939476, + "learning_rate": 4.953700871270622e-05, + "loss": 0.0814, + "num_input_tokens_seen": 5954240, + "step": 28220 + }, + { + "epoch": 3.105060506050605, + "grad_norm": 1.038770079612732, + "learning_rate": 4.953654883581984e-05, + "loss": 0.083, + "num_input_tokens_seen": 5955264, + "step": 28225 + }, + { + "epoch": 3.1056105610561056, + "grad_norm": 0.30934154987335205, + "learning_rate": 4.953608873279186e-05, + "loss": 0.118, + "num_input_tokens_seen": 5956384, + "step": 28230 + }, + { + "epoch": 3.106160616061606, + "grad_norm": 0.07246610522270203, + "learning_rate": 4.9535628403626524e-05, + "loss": 0.0349, + "num_input_tokens_seen": 5957408, + "step": 28235 + }, + { + "epoch": 3.106710671067107, + "grad_norm": 0.3219904601573944, + "learning_rate": 4.953516784832808e-05, + "loss": 0.0904, + "num_input_tokens_seen": 5958432, + "step": 28240 + }, + { + "epoch": 3.107260726072607, + "grad_norm": 0.03858689218759537, + "learning_rate": 4.953470706690077e-05, + "loss": 0.1109, + "num_input_tokens_seen": 5959456, + "step": 28245 + }, + { + "epoch": 3.1078107810781077, + "grad_norm": 0.4829949140548706, + "learning_rate": 4.9534246059348824e-05, + "loss": 0.035, + "num_input_tokens_seen": 5960576, + "step": 28250 + }, + { + "epoch": 3.1083608360836084, + "grad_norm": 0.09458039700984955, + "learning_rate": 4.953378482567651e-05, + "loss": 0.042, + "num_input_tokens_seen": 5961632, + "step": 28255 + }, + { + "epoch": 3.108910891089109, + "grad_norm": 0.2720424234867096, + "learning_rate": 4.953332336588808e-05, + "loss": 0.0758, + "num_input_tokens_seen": 5962688, + "step": 28260 + }, + { + "epoch": 3.1094609460946097, + "grad_norm": 0.2862129509449005, + "learning_rate": 4.953286167998777e-05, + "loss": 0.0561, + "num_input_tokens_seen": 5963776, + "step": 28265 + }, + { + "epoch": 3.11001100110011, + "grad_norm": 0.2302274852991104, + "learning_rate": 4.9532399767979854e-05, + "loss": 0.1356, + "num_input_tokens_seen": 5964800, + "step": 28270 + }, + { + "epoch": 3.1105610561056105, + "grad_norm": 0.18887099623680115, + "learning_rate": 4.953193762986859e-05, + "loss": 0.0838, + "num_input_tokens_seen": 5965824, + "step": 28275 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.8469340205192566, + "learning_rate": 4.953147526565821e-05, + "loss": 0.0705, + "num_input_tokens_seen": 5966912, + "step": 28280 + }, + { + "epoch": 3.111661166116612, + "grad_norm": 0.039473630487918854, + "learning_rate": 4.9531012675353006e-05, + "loss": 0.0565, + "num_input_tokens_seen": 5967904, + "step": 28285 + }, + { + "epoch": 3.112211221122112, + "grad_norm": 0.05091697350144386, + "learning_rate": 4.953054985895722e-05, + "loss": 0.0392, + "num_input_tokens_seen": 5968992, + "step": 28290 + }, + { + "epoch": 3.1127612761276127, + "grad_norm": 0.29069045186042786, + "learning_rate": 4.953008681647513e-05, + "loss": 0.0687, + "num_input_tokens_seen": 5970048, + "step": 28295 + }, + { + "epoch": 3.1133113311331133, + "grad_norm": 0.0589662566781044, + "learning_rate": 4.952962354791101e-05, + "loss": 0.0155, + "num_input_tokens_seen": 5971104, + "step": 28300 + }, + { + "epoch": 3.113861386138614, + "grad_norm": 0.486197829246521, + "learning_rate": 4.9529160053269106e-05, + "loss": 0.108, + "num_input_tokens_seen": 5972160, + "step": 28305 + }, + { + "epoch": 3.1144114411441146, + "grad_norm": 1.345855474472046, + "learning_rate": 4.9528696332553715e-05, + "loss": 0.0627, + "num_input_tokens_seen": 5973280, + "step": 28310 + }, + { + "epoch": 3.114961496149615, + "grad_norm": 0.25850677490234375, + "learning_rate": 4.952823238576909e-05, + "loss": 0.0626, + "num_input_tokens_seen": 5974400, + "step": 28315 + }, + { + "epoch": 3.1155115511551155, + "grad_norm": 1.0113513469696045, + "learning_rate": 4.952776821291952e-05, + "loss": 0.0807, + "num_input_tokens_seen": 5975424, + "step": 28320 + }, + { + "epoch": 3.116061606160616, + "grad_norm": 0.26387670636177063, + "learning_rate": 4.952730381400929e-05, + "loss": 0.0679, + "num_input_tokens_seen": 5976480, + "step": 28325 + }, + { + "epoch": 3.1166116611661168, + "grad_norm": 0.13005809485912323, + "learning_rate": 4.9526839189042665e-05, + "loss": 0.07, + "num_input_tokens_seen": 5977472, + "step": 28330 + }, + { + "epoch": 3.117161716171617, + "grad_norm": 0.804564356803894, + "learning_rate": 4.952637433802393e-05, + "loss": 0.1577, + "num_input_tokens_seen": 5978624, + "step": 28335 + }, + { + "epoch": 3.1177117711771176, + "grad_norm": 0.8068821430206299, + "learning_rate": 4.952590926095737e-05, + "loss": 0.0833, + "num_input_tokens_seen": 5979648, + "step": 28340 + }, + { + "epoch": 3.1182618261826183, + "grad_norm": 0.3914298117160797, + "learning_rate": 4.9525443957847276e-05, + "loss": 0.0313, + "num_input_tokens_seen": 5980704, + "step": 28345 + }, + { + "epoch": 3.118811881188119, + "grad_norm": 0.13567142188549042, + "learning_rate": 4.9524978428697934e-05, + "loss": 0.0109, + "num_input_tokens_seen": 5981728, + "step": 28350 + }, + { + "epoch": 3.1193619361936196, + "grad_norm": 0.48718011379241943, + "learning_rate": 4.952451267351362e-05, + "loss": 0.0609, + "num_input_tokens_seen": 5982752, + "step": 28355 + }, + { + "epoch": 3.1199119911991198, + "grad_norm": 0.17173506319522858, + "learning_rate": 4.952404669229866e-05, + "loss": 0.0954, + "num_input_tokens_seen": 5983776, + "step": 28360 + }, + { + "epoch": 3.1204620462046204, + "grad_norm": 1.6236990690231323, + "learning_rate": 4.952358048505732e-05, + "loss": 0.1997, + "num_input_tokens_seen": 5984864, + "step": 28365 + }, + { + "epoch": 3.121012101210121, + "grad_norm": 0.24789567291736603, + "learning_rate": 4.952311405179391e-05, + "loss": 0.022, + "num_input_tokens_seen": 5985920, + "step": 28370 + }, + { + "epoch": 3.1215621562156217, + "grad_norm": 0.08415774255990982, + "learning_rate": 4.9522647392512724e-05, + "loss": 0.0559, + "num_input_tokens_seen": 5987072, + "step": 28375 + }, + { + "epoch": 3.122112211221122, + "grad_norm": 0.03740360215306282, + "learning_rate": 4.9522180507218064e-05, + "loss": 0.0388, + "num_input_tokens_seen": 5988128, + "step": 28380 + }, + { + "epoch": 3.1226622662266226, + "grad_norm": 0.04829926788806915, + "learning_rate": 4.952171339591423e-05, + "loss": 0.1056, + "num_input_tokens_seen": 5989184, + "step": 28385 + }, + { + "epoch": 3.1232123212321232, + "grad_norm": 0.0331929586827755, + "learning_rate": 4.9521246058605534e-05, + "loss": 0.1067, + "num_input_tokens_seen": 5990272, + "step": 28390 + }, + { + "epoch": 3.123762376237624, + "grad_norm": 0.10023927688598633, + "learning_rate": 4.9520778495296284e-05, + "loss": 0.0566, + "num_input_tokens_seen": 5991328, + "step": 28395 + }, + { + "epoch": 3.1243124312431245, + "grad_norm": 0.13364577293395996, + "learning_rate": 4.9520310705990775e-05, + "loss": 0.052, + "num_input_tokens_seen": 5992352, + "step": 28400 + }, + { + "epoch": 3.1248624862486247, + "grad_norm": 1.1216182708740234, + "learning_rate": 4.951984269069333e-05, + "loss": 0.071, + "num_input_tokens_seen": 5993344, + "step": 28405 + }, + { + "epoch": 3.1254125412541254, + "grad_norm": 0.7637099027633667, + "learning_rate": 4.9519374449408266e-05, + "loss": 0.0769, + "num_input_tokens_seen": 5994400, + "step": 28410 + }, + { + "epoch": 3.125962596259626, + "grad_norm": 0.04638155177235603, + "learning_rate": 4.951890598213989e-05, + "loss": 0.0623, + "num_input_tokens_seen": 5995520, + "step": 28415 + }, + { + "epoch": 3.1265126512651267, + "grad_norm": 0.29645922780036926, + "learning_rate": 4.9518437288892524e-05, + "loss": 0.0615, + "num_input_tokens_seen": 5996608, + "step": 28420 + }, + { + "epoch": 3.127062706270627, + "grad_norm": 0.2364242523908615, + "learning_rate": 4.951796836967049e-05, + "loss": 0.0956, + "num_input_tokens_seen": 5997600, + "step": 28425 + }, + { + "epoch": 3.1276127612761275, + "grad_norm": 0.10422489047050476, + "learning_rate": 4.9517499224478095e-05, + "loss": 0.0846, + "num_input_tokens_seen": 5998688, + "step": 28430 + }, + { + "epoch": 3.128162816281628, + "grad_norm": 0.04159943759441376, + "learning_rate": 4.9517029853319687e-05, + "loss": 0.1543, + "num_input_tokens_seen": 5999776, + "step": 28435 + }, + { + "epoch": 3.128712871287129, + "grad_norm": 1.1171177625656128, + "learning_rate": 4.951656025619957e-05, + "loss": 0.0999, + "num_input_tokens_seen": 6000832, + "step": 28440 + }, + { + "epoch": 3.129262926292629, + "grad_norm": 0.48305076360702515, + "learning_rate": 4.951609043312209e-05, + "loss": 0.0767, + "num_input_tokens_seen": 6001824, + "step": 28445 + }, + { + "epoch": 3.1298129812981297, + "grad_norm": 0.2252238243818283, + "learning_rate": 4.951562038409157e-05, + "loss": 0.0442, + "num_input_tokens_seen": 6002912, + "step": 28450 + }, + { + "epoch": 3.1303630363036303, + "grad_norm": 1.6127111911773682, + "learning_rate": 4.951515010911233e-05, + "loss": 0.1476, + "num_input_tokens_seen": 6004000, + "step": 28455 + }, + { + "epoch": 3.130913091309131, + "grad_norm": 0.36448684334754944, + "learning_rate": 4.9514679608188716e-05, + "loss": 0.0175, + "num_input_tokens_seen": 6005024, + "step": 28460 + }, + { + "epoch": 3.1314631463146316, + "grad_norm": 0.10294846445322037, + "learning_rate": 4.9514208881325066e-05, + "loss": 0.0719, + "num_input_tokens_seen": 6006080, + "step": 28465 + }, + { + "epoch": 3.132013201320132, + "grad_norm": 0.13764435052871704, + "learning_rate": 4.951373792852572e-05, + "loss": 0.0257, + "num_input_tokens_seen": 6007104, + "step": 28470 + }, + { + "epoch": 3.1325632563256325, + "grad_norm": 0.570868730545044, + "learning_rate": 4.951326674979501e-05, + "loss": 0.0369, + "num_input_tokens_seen": 6008160, + "step": 28475 + }, + { + "epoch": 3.133113311331133, + "grad_norm": 0.14947976171970367, + "learning_rate": 4.951279534513728e-05, + "loss": 0.0386, + "num_input_tokens_seen": 6009216, + "step": 28480 + }, + { + "epoch": 3.133663366336634, + "grad_norm": 0.3563413918018341, + "learning_rate": 4.9512323714556886e-05, + "loss": 0.0296, + "num_input_tokens_seen": 6010336, + "step": 28485 + }, + { + "epoch": 3.1342134213421344, + "grad_norm": 0.36650174856185913, + "learning_rate": 4.9511851858058154e-05, + "loss": 0.0314, + "num_input_tokens_seen": 6011424, + "step": 28490 + }, + { + "epoch": 3.1347634763476346, + "grad_norm": 0.04307932406663895, + "learning_rate": 4.951137977564546e-05, + "loss": 0.0203, + "num_input_tokens_seen": 6012448, + "step": 28495 + }, + { + "epoch": 3.1353135313531353, + "grad_norm": 0.10959891974925995, + "learning_rate": 4.9510907467323134e-05, + "loss": 0.0911, + "num_input_tokens_seen": 6013472, + "step": 28500 + }, + { + "epoch": 3.135863586358636, + "grad_norm": 0.16480949521064758, + "learning_rate": 4.951043493309553e-05, + "loss": 0.0415, + "num_input_tokens_seen": 6014528, + "step": 28505 + }, + { + "epoch": 3.1364136413641366, + "grad_norm": 0.4940902590751648, + "learning_rate": 4.9509962172967016e-05, + "loss": 0.0306, + "num_input_tokens_seen": 6015520, + "step": 28510 + }, + { + "epoch": 3.136963696369637, + "grad_norm": 0.6311994194984436, + "learning_rate": 4.950948918694194e-05, + "loss": 0.0592, + "num_input_tokens_seen": 6016544, + "step": 28515 + }, + { + "epoch": 3.1375137513751374, + "grad_norm": 0.4674943685531616, + "learning_rate": 4.9509015975024665e-05, + "loss": 0.1006, + "num_input_tokens_seen": 6017632, + "step": 28520 + }, + { + "epoch": 3.138063806380638, + "grad_norm": 0.0631929412484169, + "learning_rate": 4.9508542537219544e-05, + "loss": 0.0567, + "num_input_tokens_seen": 6018752, + "step": 28525 + }, + { + "epoch": 3.1386138613861387, + "grad_norm": 0.28799718618392944, + "learning_rate": 4.950806887353095e-05, + "loss": 0.0137, + "num_input_tokens_seen": 6019776, + "step": 28530 + }, + { + "epoch": 3.139163916391639, + "grad_norm": 0.03379069268703461, + "learning_rate": 4.950759498396325e-05, + "loss": 0.0465, + "num_input_tokens_seen": 6020896, + "step": 28535 + }, + { + "epoch": 3.1397139713971396, + "grad_norm": 0.4119161069393158, + "learning_rate": 4.95071208685208e-05, + "loss": 0.0663, + "num_input_tokens_seen": 6021952, + "step": 28540 + }, + { + "epoch": 3.1402640264026402, + "grad_norm": 0.04609093442559242, + "learning_rate": 4.950664652720798e-05, + "loss": 0.1103, + "num_input_tokens_seen": 6023104, + "step": 28545 + }, + { + "epoch": 3.140814081408141, + "grad_norm": 0.421888530254364, + "learning_rate": 4.950617196002916e-05, + "loss": 0.0224, + "num_input_tokens_seen": 6024128, + "step": 28550 + }, + { + "epoch": 3.1413641364136415, + "grad_norm": 0.016322841867804527, + "learning_rate": 4.9505697166988705e-05, + "loss": 0.0127, + "num_input_tokens_seen": 6025152, + "step": 28555 + }, + { + "epoch": 3.1419141914191417, + "grad_norm": 0.6416897773742676, + "learning_rate": 4.9505222148091e-05, + "loss": 0.0728, + "num_input_tokens_seen": 6026240, + "step": 28560 + }, + { + "epoch": 3.1424642464246424, + "grad_norm": 0.46339160203933716, + "learning_rate": 4.9504746903340424e-05, + "loss": 0.0583, + "num_input_tokens_seen": 6027328, + "step": 28565 + }, + { + "epoch": 3.143014301430143, + "grad_norm": 0.6505302786827087, + "learning_rate": 4.9504271432741354e-05, + "loss": 0.068, + "num_input_tokens_seen": 6028384, + "step": 28570 + }, + { + "epoch": 3.1435643564356437, + "grad_norm": 0.3483484089374542, + "learning_rate": 4.9503795736298173e-05, + "loss": 0.0194, + "num_input_tokens_seen": 6029440, + "step": 28575 + }, + { + "epoch": 3.1441144114411443, + "grad_norm": 0.05630716308951378, + "learning_rate": 4.950331981401527e-05, + "loss": 0.1352, + "num_input_tokens_seen": 6030496, + "step": 28580 + }, + { + "epoch": 3.1446644664466445, + "grad_norm": 1.114284634590149, + "learning_rate": 4.950284366589702e-05, + "loss": 0.0399, + "num_input_tokens_seen": 6031584, + "step": 28585 + }, + { + "epoch": 3.145214521452145, + "grad_norm": 0.07952886074781418, + "learning_rate": 4.9502367291947814e-05, + "loss": 0.0216, + "num_input_tokens_seen": 6032672, + "step": 28590 + }, + { + "epoch": 3.145764576457646, + "grad_norm": 0.7169684767723083, + "learning_rate": 4.9501890692172055e-05, + "loss": 0.0505, + "num_input_tokens_seen": 6033664, + "step": 28595 + }, + { + "epoch": 3.1463146314631465, + "grad_norm": 1.354157567024231, + "learning_rate": 4.950141386657412e-05, + "loss": 0.057, + "num_input_tokens_seen": 6034752, + "step": 28600 + }, + { + "epoch": 3.1468646864686467, + "grad_norm": 0.02268999256193638, + "learning_rate": 4.950093681515842e-05, + "loss": 0.1799, + "num_input_tokens_seen": 6035776, + "step": 28605 + }, + { + "epoch": 3.1474147414741473, + "grad_norm": 0.30490434169769287, + "learning_rate": 4.9500459537929326e-05, + "loss": 0.0248, + "num_input_tokens_seen": 6036832, + "step": 28610 + }, + { + "epoch": 3.147964796479648, + "grad_norm": 0.8712652325630188, + "learning_rate": 4.949998203489126e-05, + "loss": 0.0993, + "num_input_tokens_seen": 6037952, + "step": 28615 + }, + { + "epoch": 3.1485148514851486, + "grad_norm": 0.9762800931930542, + "learning_rate": 4.949950430604862e-05, + "loss": 0.1403, + "num_input_tokens_seen": 6039040, + "step": 28620 + }, + { + "epoch": 3.149064906490649, + "grad_norm": 0.16332776844501495, + "learning_rate": 4.94990263514058e-05, + "loss": 0.0602, + "num_input_tokens_seen": 6040128, + "step": 28625 + }, + { + "epoch": 3.1496149614961495, + "grad_norm": 0.027721457183361053, + "learning_rate": 4.9498548170967216e-05, + "loss": 0.0692, + "num_input_tokens_seen": 6041248, + "step": 28630 + }, + { + "epoch": 3.15016501650165, + "grad_norm": 0.27132299542427063, + "learning_rate": 4.949806976473727e-05, + "loss": 0.0913, + "num_input_tokens_seen": 6042336, + "step": 28635 + }, + { + "epoch": 3.150715071507151, + "grad_norm": 0.028554873540997505, + "learning_rate": 4.949759113272035e-05, + "loss": 0.0373, + "num_input_tokens_seen": 6043424, + "step": 28640 + }, + { + "epoch": 3.1512651265126514, + "grad_norm": 0.04296666756272316, + "learning_rate": 4.949711227492091e-05, + "loss": 0.0053, + "num_input_tokens_seen": 6044448, + "step": 28645 + }, + { + "epoch": 3.1518151815181517, + "grad_norm": 0.45170894265174866, + "learning_rate": 4.949663319134333e-05, + "loss": 0.0971, + "num_input_tokens_seen": 6045504, + "step": 28650 + }, + { + "epoch": 3.1523652365236523, + "grad_norm": 0.3352065682411194, + "learning_rate": 4.9496153881992044e-05, + "loss": 0.0251, + "num_input_tokens_seen": 6046592, + "step": 28655 + }, + { + "epoch": 3.152915291529153, + "grad_norm": 0.04568164795637131, + "learning_rate": 4.9495674346871454e-05, + "loss": 0.0248, + "num_input_tokens_seen": 6047680, + "step": 28660 + }, + { + "epoch": 3.1534653465346536, + "grad_norm": 0.1413293033838272, + "learning_rate": 4.9495194585985996e-05, + "loss": 0.0145, + "num_input_tokens_seen": 6048768, + "step": 28665 + }, + { + "epoch": 3.1540154015401543, + "grad_norm": 0.08704841136932373, + "learning_rate": 4.949471459934008e-05, + "loss": 0.0489, + "num_input_tokens_seen": 6049824, + "step": 28670 + }, + { + "epoch": 3.1545654565456545, + "grad_norm": 1.2116669416427612, + "learning_rate": 4.949423438693813e-05, + "loss": 0.1098, + "num_input_tokens_seen": 6050848, + "step": 28675 + }, + { + "epoch": 3.155115511551155, + "grad_norm": 0.8177232146263123, + "learning_rate": 4.9493753948784584e-05, + "loss": 0.0858, + "num_input_tokens_seen": 6051936, + "step": 28680 + }, + { + "epoch": 3.1556655665566558, + "grad_norm": 0.16732831299304962, + "learning_rate": 4.9493273284883854e-05, + "loss": 0.0412, + "num_input_tokens_seen": 6052928, + "step": 28685 + }, + { + "epoch": 3.1562156215621564, + "grad_norm": 0.021292004734277725, + "learning_rate": 4.949279239524037e-05, + "loss": 0.1528, + "num_input_tokens_seen": 6054016, + "step": 28690 + }, + { + "epoch": 3.1567656765676566, + "grad_norm": 0.6828209757804871, + "learning_rate": 4.9492311279858585e-05, + "loss": 0.1065, + "num_input_tokens_seen": 6055040, + "step": 28695 + }, + { + "epoch": 3.1573157315731573, + "grad_norm": 0.08919735252857208, + "learning_rate": 4.949182993874292e-05, + "loss": 0.0216, + "num_input_tokens_seen": 6056096, + "step": 28700 + }, + { + "epoch": 3.157865786578658, + "grad_norm": 0.09455118328332901, + "learning_rate": 4.94913483718978e-05, + "loss": 0.0544, + "num_input_tokens_seen": 6057152, + "step": 28705 + }, + { + "epoch": 3.1584158415841586, + "grad_norm": 0.07945549488067627, + "learning_rate": 4.9490866579327674e-05, + "loss": 0.0619, + "num_input_tokens_seen": 6058240, + "step": 28710 + }, + { + "epoch": 3.1589658965896588, + "grad_norm": 0.03588779270648956, + "learning_rate": 4.9490384561036995e-05, + "loss": 0.0391, + "num_input_tokens_seen": 6059264, + "step": 28715 + }, + { + "epoch": 3.1595159515951594, + "grad_norm": 0.8092262148857117, + "learning_rate": 4.9489902317030176e-05, + "loss": 0.0998, + "num_input_tokens_seen": 6060256, + "step": 28720 + }, + { + "epoch": 3.16006600660066, + "grad_norm": 0.07259154319763184, + "learning_rate": 4.948941984731169e-05, + "loss": 0.0165, + "num_input_tokens_seen": 6061280, + "step": 28725 + }, + { + "epoch": 3.1606160616061607, + "grad_norm": 0.9927155375480652, + "learning_rate": 4.948893715188597e-05, + "loss": 0.1208, + "num_input_tokens_seen": 6062336, + "step": 28730 + }, + { + "epoch": 3.1611661166116614, + "grad_norm": 1.3453857898712158, + "learning_rate": 4.948845423075748e-05, + "loss": 0.1295, + "num_input_tokens_seen": 6063328, + "step": 28735 + }, + { + "epoch": 3.1617161716171616, + "grad_norm": 0.44224876165390015, + "learning_rate": 4.9487971083930637e-05, + "loss": 0.0279, + "num_input_tokens_seen": 6064352, + "step": 28740 + }, + { + "epoch": 3.162266226622662, + "grad_norm": 1.4719505310058594, + "learning_rate": 4.948748771140993e-05, + "loss": 0.0729, + "num_input_tokens_seen": 6065408, + "step": 28745 + }, + { + "epoch": 3.162816281628163, + "grad_norm": 1.026228904724121, + "learning_rate": 4.948700411319979e-05, + "loss": 0.0693, + "num_input_tokens_seen": 6066496, + "step": 28750 + }, + { + "epoch": 3.1633663366336635, + "grad_norm": 0.4369416832923889, + "learning_rate": 4.948652028930468e-05, + "loss": 0.0208, + "num_input_tokens_seen": 6067616, + "step": 28755 + }, + { + "epoch": 3.1639163916391637, + "grad_norm": 0.6871055960655212, + "learning_rate": 4.948603623972907e-05, + "loss": 0.0935, + "num_input_tokens_seen": 6068672, + "step": 28760 + }, + { + "epoch": 3.1644664466446644, + "grad_norm": 0.9185709953308105, + "learning_rate": 4.948555196447742e-05, + "loss": 0.109, + "num_input_tokens_seen": 6069760, + "step": 28765 + }, + { + "epoch": 3.165016501650165, + "grad_norm": 0.7745792865753174, + "learning_rate": 4.948506746355418e-05, + "loss": 0.0748, + "num_input_tokens_seen": 6070816, + "step": 28770 + }, + { + "epoch": 3.1655665566556657, + "grad_norm": 1.1179678440093994, + "learning_rate": 4.9484582736963816e-05, + "loss": 0.0734, + "num_input_tokens_seen": 6071936, + "step": 28775 + }, + { + "epoch": 3.1661166116611663, + "grad_norm": 0.2742502689361572, + "learning_rate": 4.9484097784710805e-05, + "loss": 0.1004, + "num_input_tokens_seen": 6073024, + "step": 28780 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.9643514156341553, + "learning_rate": 4.948361260679961e-05, + "loss": 0.1141, + "num_input_tokens_seen": 6074048, + "step": 28785 + }, + { + "epoch": 3.167216721672167, + "grad_norm": 0.16527147591114044, + "learning_rate": 4.9483127203234715e-05, + "loss": 0.0905, + "num_input_tokens_seen": 6075072, + "step": 28790 + }, + { + "epoch": 3.167766776677668, + "grad_norm": 0.34788742661476135, + "learning_rate": 4.9482641574020586e-05, + "loss": 0.1376, + "num_input_tokens_seen": 6076064, + "step": 28795 + }, + { + "epoch": 3.1683168316831685, + "grad_norm": 0.09717297554016113, + "learning_rate": 4.948215571916169e-05, + "loss": 0.051, + "num_input_tokens_seen": 6077120, + "step": 28800 + }, + { + "epoch": 3.1688668866886687, + "grad_norm": 0.041366223245859146, + "learning_rate": 4.9481669638662516e-05, + "loss": 0.0763, + "num_input_tokens_seen": 6078144, + "step": 28805 + }, + { + "epoch": 3.1694169416941693, + "grad_norm": 0.06182554364204407, + "learning_rate": 4.9481183332527534e-05, + "loss": 0.0229, + "num_input_tokens_seen": 6079296, + "step": 28810 + }, + { + "epoch": 3.16996699669967, + "grad_norm": 0.19719073176383972, + "learning_rate": 4.9480696800761236e-05, + "loss": 0.0196, + "num_input_tokens_seen": 6080352, + "step": 28815 + }, + { + "epoch": 3.1705170517051706, + "grad_norm": 0.06037362292408943, + "learning_rate": 4.948021004336811e-05, + "loss": 0.0536, + "num_input_tokens_seen": 6081408, + "step": 28820 + }, + { + "epoch": 3.1710671067106713, + "grad_norm": 0.5143716335296631, + "learning_rate": 4.947972306035263e-05, + "loss": 0.0909, + "num_input_tokens_seen": 6082496, + "step": 28825 + }, + { + "epoch": 3.1716171617161715, + "grad_norm": 0.318901002407074, + "learning_rate": 4.9479235851719284e-05, + "loss": 0.1469, + "num_input_tokens_seen": 6083584, + "step": 28830 + }, + { + "epoch": 3.172167216721672, + "grad_norm": 0.32611405849456787, + "learning_rate": 4.947874841747258e-05, + "loss": 0.0744, + "num_input_tokens_seen": 6084608, + "step": 28835 + }, + { + "epoch": 3.1727172717271728, + "grad_norm": 0.1462930291891098, + "learning_rate": 4.947826075761698e-05, + "loss": 0.0518, + "num_input_tokens_seen": 6085696, + "step": 28840 + }, + { + "epoch": 3.1732673267326734, + "grad_norm": 0.06686592102050781, + "learning_rate": 4.947777287215701e-05, + "loss": 0.0483, + "num_input_tokens_seen": 6086784, + "step": 28845 + }, + { + "epoch": 3.1738173817381736, + "grad_norm": 0.28770849108695984, + "learning_rate": 4.947728476109715e-05, + "loss": 0.0684, + "num_input_tokens_seen": 6087872, + "step": 28850 + }, + { + "epoch": 3.1743674367436743, + "grad_norm": 0.08201979100704193, + "learning_rate": 4.94767964244419e-05, + "loss": 0.0601, + "num_input_tokens_seen": 6088928, + "step": 28855 + }, + { + "epoch": 3.174917491749175, + "grad_norm": 0.1202753558754921, + "learning_rate": 4.947630786219576e-05, + "loss": 0.0304, + "num_input_tokens_seen": 6089952, + "step": 28860 + }, + { + "epoch": 3.1754675467546756, + "grad_norm": 0.37411972880363464, + "learning_rate": 4.947581907436324e-05, + "loss": 0.1192, + "num_input_tokens_seen": 6091008, + "step": 28865 + }, + { + "epoch": 3.1760176017601762, + "grad_norm": 0.4166939854621887, + "learning_rate": 4.947533006094884e-05, + "loss": 0.0833, + "num_input_tokens_seen": 6092096, + "step": 28870 + }, + { + "epoch": 3.1765676567656764, + "grad_norm": 0.07344408333301544, + "learning_rate": 4.9474840821957064e-05, + "loss": 0.0104, + "num_input_tokens_seen": 6093152, + "step": 28875 + }, + { + "epoch": 3.177117711771177, + "grad_norm": 0.7214216589927673, + "learning_rate": 4.947435135739242e-05, + "loss": 0.054, + "num_input_tokens_seen": 6094176, + "step": 28880 + }, + { + "epoch": 3.1776677667766777, + "grad_norm": 0.17512059211730957, + "learning_rate": 4.9473861667259434e-05, + "loss": 0.0286, + "num_input_tokens_seen": 6095296, + "step": 28885 + }, + { + "epoch": 3.1782178217821784, + "grad_norm": 0.029928341507911682, + "learning_rate": 4.9473371751562604e-05, + "loss": 0.0843, + "num_input_tokens_seen": 6096352, + "step": 28890 + }, + { + "epoch": 3.1787678767876786, + "grad_norm": 0.8453541994094849, + "learning_rate": 4.9472881610306456e-05, + "loss": 0.0705, + "num_input_tokens_seen": 6097408, + "step": 28895 + }, + { + "epoch": 3.1793179317931792, + "grad_norm": 0.06042332947254181, + "learning_rate": 4.947239124349549e-05, + "loss": 0.0868, + "num_input_tokens_seen": 6098400, + "step": 28900 + }, + { + "epoch": 3.17986798679868, + "grad_norm": 0.029449183493852615, + "learning_rate": 4.947190065113424e-05, + "loss": 0.0137, + "num_input_tokens_seen": 6099456, + "step": 28905 + }, + { + "epoch": 3.1804180418041805, + "grad_norm": 0.07780519127845764, + "learning_rate": 4.947140983322723e-05, + "loss": 0.0324, + "num_input_tokens_seen": 6100544, + "step": 28910 + }, + { + "epoch": 3.180968096809681, + "grad_norm": 0.3929796516895294, + "learning_rate": 4.947091878977897e-05, + "loss": 0.0569, + "num_input_tokens_seen": 6101568, + "step": 28915 + }, + { + "epoch": 3.1815181518151814, + "grad_norm": 0.1609378159046173, + "learning_rate": 4.947042752079401e-05, + "loss": 0.0227, + "num_input_tokens_seen": 6102656, + "step": 28920 + }, + { + "epoch": 3.182068206820682, + "grad_norm": 0.7394979596138, + "learning_rate": 4.946993602627684e-05, + "loss": 0.0442, + "num_input_tokens_seen": 6103680, + "step": 28925 + }, + { + "epoch": 3.1826182618261827, + "grad_norm": 0.08690708875656128, + "learning_rate": 4.946944430623203e-05, + "loss": 0.015, + "num_input_tokens_seen": 6104704, + "step": 28930 + }, + { + "epoch": 3.1831683168316833, + "grad_norm": 0.033241160213947296, + "learning_rate": 4.946895236066408e-05, + "loss": 0.024, + "num_input_tokens_seen": 6105728, + "step": 28935 + }, + { + "epoch": 3.1837183718371835, + "grad_norm": 1.4195775985717773, + "learning_rate": 4.946846018957755e-05, + "loss": 0.0949, + "num_input_tokens_seen": 6106816, + "step": 28940 + }, + { + "epoch": 3.184268426842684, + "grad_norm": 0.0401199571788311, + "learning_rate": 4.9467967792976956e-05, + "loss": 0.0645, + "num_input_tokens_seen": 6107808, + "step": 28945 + }, + { + "epoch": 3.184818481848185, + "grad_norm": 0.18212930858135223, + "learning_rate": 4.946747517086684e-05, + "loss": 0.0719, + "num_input_tokens_seen": 6108896, + "step": 28950 + }, + { + "epoch": 3.1853685368536855, + "grad_norm": 0.03734135627746582, + "learning_rate": 4.946698232325175e-05, + "loss": 0.0118, + "num_input_tokens_seen": 6109920, + "step": 28955 + }, + { + "epoch": 3.1859185918591857, + "grad_norm": 0.06155630946159363, + "learning_rate": 4.9466489250136224e-05, + "loss": 0.0475, + "num_input_tokens_seen": 6110880, + "step": 28960 + }, + { + "epoch": 3.1864686468646863, + "grad_norm": 0.27420517802238464, + "learning_rate": 4.9465995951524805e-05, + "loss": 0.0365, + "num_input_tokens_seen": 6111936, + "step": 28965 + }, + { + "epoch": 3.187018701870187, + "grad_norm": 0.4210094213485718, + "learning_rate": 4.946550242742204e-05, + "loss": 0.055, + "num_input_tokens_seen": 6112992, + "step": 28970 + }, + { + "epoch": 3.1875687568756876, + "grad_norm": 0.3494087755680084, + "learning_rate": 4.946500867783249e-05, + "loss": 0.0206, + "num_input_tokens_seen": 6114080, + "step": 28975 + }, + { + "epoch": 3.1881188118811883, + "grad_norm": 0.09615550190210342, + "learning_rate": 4.9464514702760684e-05, + "loss": 0.1232, + "num_input_tokens_seen": 6115200, + "step": 28980 + }, + { + "epoch": 3.1886688668866885, + "grad_norm": 0.5506872534751892, + "learning_rate": 4.946402050221118e-05, + "loss": 0.0526, + "num_input_tokens_seen": 6116256, + "step": 28985 + }, + { + "epoch": 3.189218921892189, + "grad_norm": 1.1238398551940918, + "learning_rate": 4.9463526076188556e-05, + "loss": 0.0665, + "num_input_tokens_seen": 6117280, + "step": 28990 + }, + { + "epoch": 3.18976897689769, + "grad_norm": 0.3273780941963196, + "learning_rate": 4.9463031424697335e-05, + "loss": 0.0893, + "num_input_tokens_seen": 6118336, + "step": 28995 + }, + { + "epoch": 3.1903190319031904, + "grad_norm": 0.3865019381046295, + "learning_rate": 4.94625365477421e-05, + "loss": 0.0257, + "num_input_tokens_seen": 6119456, + "step": 29000 + }, + { + "epoch": 3.190869086908691, + "grad_norm": 0.11593495309352875, + "learning_rate": 4.94620414453274e-05, + "loss": 0.0067, + "num_input_tokens_seen": 6120448, + "step": 29005 + }, + { + "epoch": 3.1914191419141913, + "grad_norm": 0.1702641248703003, + "learning_rate": 4.94615461174578e-05, + "loss": 0.0781, + "num_input_tokens_seen": 6121536, + "step": 29010 + }, + { + "epoch": 3.191969196919692, + "grad_norm": 0.13715201616287231, + "learning_rate": 4.946105056413788e-05, + "loss": 0.0581, + "num_input_tokens_seen": 6122496, + "step": 29015 + }, + { + "epoch": 3.1925192519251926, + "grad_norm": 0.42454761266708374, + "learning_rate": 4.946055478537218e-05, + "loss": 0.0425, + "num_input_tokens_seen": 6123584, + "step": 29020 + }, + { + "epoch": 3.1930693069306932, + "grad_norm": 0.04349809139966965, + "learning_rate": 4.946005878116529e-05, + "loss": 0.0224, + "num_input_tokens_seen": 6124672, + "step": 29025 + }, + { + "epoch": 3.1936193619361934, + "grad_norm": 1.0787489414215088, + "learning_rate": 4.945956255152178e-05, + "loss": 0.0844, + "num_input_tokens_seen": 6125728, + "step": 29030 + }, + { + "epoch": 3.194169416941694, + "grad_norm": 0.03211421146988869, + "learning_rate": 4.94590660964462e-05, + "loss": 0.0135, + "num_input_tokens_seen": 6126752, + "step": 29035 + }, + { + "epoch": 3.1947194719471947, + "grad_norm": 0.08483931422233582, + "learning_rate": 4.945856941594317e-05, + "loss": 0.0381, + "num_input_tokens_seen": 6127936, + "step": 29040 + }, + { + "epoch": 3.1952695269526954, + "grad_norm": 0.011804123409092426, + "learning_rate": 4.945807251001723e-05, + "loss": 0.0395, + "num_input_tokens_seen": 6129024, + "step": 29045 + }, + { + "epoch": 3.1958195819581956, + "grad_norm": 0.6243996620178223, + "learning_rate": 4.9457575378672974e-05, + "loss": 0.0419, + "num_input_tokens_seen": 6130080, + "step": 29050 + }, + { + "epoch": 3.1963696369636962, + "grad_norm": 0.10570204257965088, + "learning_rate": 4.945707802191497e-05, + "loss": 0.0827, + "num_input_tokens_seen": 6131072, + "step": 29055 + }, + { + "epoch": 3.196919691969197, + "grad_norm": 0.6483085751533508, + "learning_rate": 4.945658043974782e-05, + "loss": 0.0483, + "num_input_tokens_seen": 6132128, + "step": 29060 + }, + { + "epoch": 3.1974697469746975, + "grad_norm": 0.7341450452804565, + "learning_rate": 4.9456082632176115e-05, + "loss": 0.0745, + "num_input_tokens_seen": 6133216, + "step": 29065 + }, + { + "epoch": 3.198019801980198, + "grad_norm": 0.033084750175476074, + "learning_rate": 4.945558459920442e-05, + "loss": 0.0865, + "num_input_tokens_seen": 6134240, + "step": 29070 + }, + { + "epoch": 3.1985698569856984, + "grad_norm": 0.037096843123435974, + "learning_rate": 4.945508634083734e-05, + "loss": 0.0915, + "num_input_tokens_seen": 6135296, + "step": 29075 + }, + { + "epoch": 3.199119911991199, + "grad_norm": 0.07656657695770264, + "learning_rate": 4.945458785707946e-05, + "loss": 0.0278, + "num_input_tokens_seen": 6136416, + "step": 29080 + }, + { + "epoch": 3.1996699669966997, + "grad_norm": 1.125339388847351, + "learning_rate": 4.945408914793538e-05, + "loss": 0.0486, + "num_input_tokens_seen": 6137408, + "step": 29085 + }, + { + "epoch": 3.2002200220022003, + "grad_norm": 1.3319861888885498, + "learning_rate": 4.9453590213409697e-05, + "loss": 0.055, + "num_input_tokens_seen": 6138496, + "step": 29090 + }, + { + "epoch": 3.200770077007701, + "grad_norm": 0.5166297554969788, + "learning_rate": 4.945309105350701e-05, + "loss": 0.0353, + "num_input_tokens_seen": 6139584, + "step": 29095 + }, + { + "epoch": 3.201320132013201, + "grad_norm": 0.08758755773305893, + "learning_rate": 4.9452591668231905e-05, + "loss": 0.2, + "num_input_tokens_seen": 6140608, + "step": 29100 + }, + { + "epoch": 3.201870187018702, + "grad_norm": 0.49708086252212524, + "learning_rate": 4.9452092057589e-05, + "loss": 0.1517, + "num_input_tokens_seen": 6141632, + "step": 29105 + }, + { + "epoch": 3.2024202420242025, + "grad_norm": 0.03051835671067238, + "learning_rate": 4.945159222158291e-05, + "loss": 0.0897, + "num_input_tokens_seen": 6142656, + "step": 29110 + }, + { + "epoch": 3.202970297029703, + "grad_norm": 0.024043608456850052, + "learning_rate": 4.9451092160218216e-05, + "loss": 0.0113, + "num_input_tokens_seen": 6143712, + "step": 29115 + }, + { + "epoch": 3.2035203520352034, + "grad_norm": 0.012677072547376156, + "learning_rate": 4.945059187349954e-05, + "loss": 0.0525, + "num_input_tokens_seen": 6144800, + "step": 29120 + }, + { + "epoch": 3.204070407040704, + "grad_norm": 0.7060203552246094, + "learning_rate": 4.945009136143149e-05, + "loss": 0.1425, + "num_input_tokens_seen": 6145792, + "step": 29125 + }, + { + "epoch": 3.2046204620462047, + "grad_norm": 0.5112256407737732, + "learning_rate": 4.944959062401868e-05, + "loss": 0.0708, + "num_input_tokens_seen": 6146912, + "step": 29130 + }, + { + "epoch": 3.2051705170517053, + "grad_norm": 0.39416196942329407, + "learning_rate": 4.9449089661265724e-05, + "loss": 0.0463, + "num_input_tokens_seen": 6147904, + "step": 29135 + }, + { + "epoch": 3.2057205720572055, + "grad_norm": 0.30174002051353455, + "learning_rate": 4.944858847317724e-05, + "loss": 0.1397, + "num_input_tokens_seen": 6148992, + "step": 29140 + }, + { + "epoch": 3.206270627062706, + "grad_norm": 0.15009404718875885, + "learning_rate": 4.944808705975785e-05, + "loss": 0.0667, + "num_input_tokens_seen": 6149984, + "step": 29145 + }, + { + "epoch": 3.206820682068207, + "grad_norm": 0.16464489698410034, + "learning_rate": 4.944758542101218e-05, + "loss": 0.0352, + "num_input_tokens_seen": 6151072, + "step": 29150 + }, + { + "epoch": 3.2073707370737075, + "grad_norm": 0.5571454167366028, + "learning_rate": 4.9447083556944834e-05, + "loss": 0.0531, + "num_input_tokens_seen": 6152096, + "step": 29155 + }, + { + "epoch": 3.207920792079208, + "grad_norm": 0.9360046982765198, + "learning_rate": 4.9446581467560455e-05, + "loss": 0.033, + "num_input_tokens_seen": 6153216, + "step": 29160 + }, + { + "epoch": 3.2084708470847083, + "grad_norm": 0.03762911260128021, + "learning_rate": 4.944607915286367e-05, + "loss": 0.0127, + "num_input_tokens_seen": 6154240, + "step": 29165 + }, + { + "epoch": 3.209020902090209, + "grad_norm": 0.3439352810382843, + "learning_rate": 4.944557661285909e-05, + "loss": 0.1721, + "num_input_tokens_seen": 6155232, + "step": 29170 + }, + { + "epoch": 3.2095709570957096, + "grad_norm": 0.22941674292087555, + "learning_rate": 4.944507384755137e-05, + "loss": 0.124, + "num_input_tokens_seen": 6156288, + "step": 29175 + }, + { + "epoch": 3.2101210121012103, + "grad_norm": 0.17685148119926453, + "learning_rate": 4.944457085694514e-05, + "loss": 0.1064, + "num_input_tokens_seen": 6157344, + "step": 29180 + }, + { + "epoch": 3.2106710671067105, + "grad_norm": 0.6266752481460571, + "learning_rate": 4.9444067641045024e-05, + "loss": 0.072, + "num_input_tokens_seen": 6158304, + "step": 29185 + }, + { + "epoch": 3.211221122112211, + "grad_norm": 0.93361896276474, + "learning_rate": 4.9443564199855666e-05, + "loss": 0.0327, + "num_input_tokens_seen": 6159296, + "step": 29190 + }, + { + "epoch": 3.2117711771177118, + "grad_norm": 0.0407063253223896, + "learning_rate": 4.94430605333817e-05, + "loss": 0.0273, + "num_input_tokens_seen": 6160352, + "step": 29195 + }, + { + "epoch": 3.2123212321232124, + "grad_norm": 0.10298207402229309, + "learning_rate": 4.944255664162778e-05, + "loss": 0.0681, + "num_input_tokens_seen": 6161440, + "step": 29200 + }, + { + "epoch": 3.212871287128713, + "grad_norm": 0.8109502196311951, + "learning_rate": 4.944205252459855e-05, + "loss": 0.094, + "num_input_tokens_seen": 6162528, + "step": 29205 + }, + { + "epoch": 3.2134213421342133, + "grad_norm": 1.170009970664978, + "learning_rate": 4.9441548182298636e-05, + "loss": 0.1349, + "num_input_tokens_seen": 6163520, + "step": 29210 + }, + { + "epoch": 3.213971397139714, + "grad_norm": 0.03738449513912201, + "learning_rate": 4.9441043614732705e-05, + "loss": 0.0103, + "num_input_tokens_seen": 6164512, + "step": 29215 + }, + { + "epoch": 3.2145214521452146, + "grad_norm": 0.17957282066345215, + "learning_rate": 4.9440538821905416e-05, + "loss": 0.1286, + "num_input_tokens_seen": 6165536, + "step": 29220 + }, + { + "epoch": 3.215071507150715, + "grad_norm": 0.0760415717959404, + "learning_rate": 4.94400338038214e-05, + "loss": 0.0205, + "num_input_tokens_seen": 6166592, + "step": 29225 + }, + { + "epoch": 3.2156215621562154, + "grad_norm": 0.3262209892272949, + "learning_rate": 4.9439528560485316e-05, + "loss": 0.0646, + "num_input_tokens_seen": 6167616, + "step": 29230 + }, + { + "epoch": 3.216171617161716, + "grad_norm": 0.07278238981962204, + "learning_rate": 4.943902309190182e-05, + "loss": 0.0141, + "num_input_tokens_seen": 6168672, + "step": 29235 + }, + { + "epoch": 3.2167216721672167, + "grad_norm": 0.9264947772026062, + "learning_rate": 4.943851739807559e-05, + "loss": 0.0761, + "num_input_tokens_seen": 6169696, + "step": 29240 + }, + { + "epoch": 3.2172717271727174, + "grad_norm": 1.749800443649292, + "learning_rate": 4.943801147901126e-05, + "loss": 0.1185, + "num_input_tokens_seen": 6170720, + "step": 29245 + }, + { + "epoch": 3.217821782178218, + "grad_norm": 0.3459201157093048, + "learning_rate": 4.943750533471351e-05, + "loss": 0.1035, + "num_input_tokens_seen": 6171776, + "step": 29250 + }, + { + "epoch": 3.218371837183718, + "grad_norm": 0.9109892845153809, + "learning_rate": 4.9436998965187e-05, + "loss": 0.066, + "num_input_tokens_seen": 6172864, + "step": 29255 + }, + { + "epoch": 3.218921892189219, + "grad_norm": 0.8218509554862976, + "learning_rate": 4.943649237043639e-05, + "loss": 0.0698, + "num_input_tokens_seen": 6173888, + "step": 29260 + }, + { + "epoch": 3.2194719471947195, + "grad_norm": 0.009789945557713509, + "learning_rate": 4.943598555046637e-05, + "loss": 0.0134, + "num_input_tokens_seen": 6174944, + "step": 29265 + }, + { + "epoch": 3.22002200220022, + "grad_norm": 0.12397689372301102, + "learning_rate": 4.9435478505281585e-05, + "loss": 0.0548, + "num_input_tokens_seen": 6175968, + "step": 29270 + }, + { + "epoch": 3.2205720572057204, + "grad_norm": 0.23303106427192688, + "learning_rate": 4.943497123488672e-05, + "loss": 0.0683, + "num_input_tokens_seen": 6176992, + "step": 29275 + }, + { + "epoch": 3.221122112211221, + "grad_norm": 0.20063374936580658, + "learning_rate": 4.9434463739286454e-05, + "loss": 0.0496, + "num_input_tokens_seen": 6178080, + "step": 29280 + }, + { + "epoch": 3.2216721672167217, + "grad_norm": 0.08695241808891296, + "learning_rate": 4.943395601848546e-05, + "loss": 0.0277, + "num_input_tokens_seen": 6179104, + "step": 29285 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 1.1610710620880127, + "learning_rate": 4.9433448072488414e-05, + "loss": 0.0604, + "num_input_tokens_seen": 6180224, + "step": 29290 + }, + { + "epoch": 3.222772277227723, + "grad_norm": 0.4167872369289398, + "learning_rate": 4.94329399013e-05, + "loss": 0.075, + "num_input_tokens_seen": 6181216, + "step": 29295 + }, + { + "epoch": 3.223322332233223, + "grad_norm": 0.10820285975933075, + "learning_rate": 4.9432431504924915e-05, + "loss": 0.0214, + "num_input_tokens_seen": 6182272, + "step": 29300 + }, + { + "epoch": 3.223872387238724, + "grad_norm": 0.9877858757972717, + "learning_rate": 4.943192288336782e-05, + "loss": 0.0547, + "num_input_tokens_seen": 6183296, + "step": 29305 + }, + { + "epoch": 3.2244224422442245, + "grad_norm": 0.887795627117157, + "learning_rate": 4.9431414036633424e-05, + "loss": 0.1228, + "num_input_tokens_seen": 6184384, + "step": 29310 + }, + { + "epoch": 3.224972497249725, + "grad_norm": 0.30673110485076904, + "learning_rate": 4.94309049647264e-05, + "loss": 0.0374, + "num_input_tokens_seen": 6185472, + "step": 29315 + }, + { + "epoch": 3.2255225522552253, + "grad_norm": 0.023736845701932907, + "learning_rate": 4.9430395667651454e-05, + "loss": 0.1028, + "num_input_tokens_seen": 6186528, + "step": 29320 + }, + { + "epoch": 3.226072607260726, + "grad_norm": 0.5108729004859924, + "learning_rate": 4.942988614541327e-05, + "loss": 0.0317, + "num_input_tokens_seen": 6187616, + "step": 29325 + }, + { + "epoch": 3.2266226622662266, + "grad_norm": 0.16547657549381256, + "learning_rate": 4.942937639801655e-05, + "loss": 0.0219, + "num_input_tokens_seen": 6188704, + "step": 29330 + }, + { + "epoch": 3.2271727172717273, + "grad_norm": 0.8501033186912537, + "learning_rate": 4.942886642546599e-05, + "loss": 0.2203, + "num_input_tokens_seen": 6189728, + "step": 29335 + }, + { + "epoch": 3.227722772277228, + "grad_norm": 0.14110015332698822, + "learning_rate": 4.9428356227766294e-05, + "loss": 0.1011, + "num_input_tokens_seen": 6190752, + "step": 29340 + }, + { + "epoch": 3.228272827282728, + "grad_norm": 0.11193510890007019, + "learning_rate": 4.942784580492216e-05, + "loss": 0.0935, + "num_input_tokens_seen": 6191808, + "step": 29345 + }, + { + "epoch": 3.228822882288229, + "grad_norm": 0.4568471312522888, + "learning_rate": 4.942733515693829e-05, + "loss": 0.1468, + "num_input_tokens_seen": 6192864, + "step": 29350 + }, + { + "epoch": 3.2293729372937294, + "grad_norm": 0.2122693806886673, + "learning_rate": 4.942682428381939e-05, + "loss": 0.0669, + "num_input_tokens_seen": 6193888, + "step": 29355 + }, + { + "epoch": 3.22992299229923, + "grad_norm": 0.9529705047607422, + "learning_rate": 4.9426313185570174e-05, + "loss": 0.0993, + "num_input_tokens_seen": 6194880, + "step": 29360 + }, + { + "epoch": 3.2304730473047303, + "grad_norm": 0.2651783525943756, + "learning_rate": 4.942580186219535e-05, + "loss": 0.0984, + "num_input_tokens_seen": 6195936, + "step": 29365 + }, + { + "epoch": 3.231023102310231, + "grad_norm": 0.5931733250617981, + "learning_rate": 4.9425290313699637e-05, + "loss": 0.1101, + "num_input_tokens_seen": 6197024, + "step": 29370 + }, + { + "epoch": 3.2315731573157316, + "grad_norm": 0.061981137841939926, + "learning_rate": 4.942477854008774e-05, + "loss": 0.0654, + "num_input_tokens_seen": 6198048, + "step": 29375 + }, + { + "epoch": 3.2321232123212322, + "grad_norm": 0.02883118763566017, + "learning_rate": 4.942426654136437e-05, + "loss": 0.0566, + "num_input_tokens_seen": 6199136, + "step": 29380 + }, + { + "epoch": 3.232673267326733, + "grad_norm": 0.32481849193573, + "learning_rate": 4.942375431753427e-05, + "loss": 0.0634, + "num_input_tokens_seen": 6200192, + "step": 29385 + }, + { + "epoch": 3.233223322332233, + "grad_norm": 0.29622647166252136, + "learning_rate": 4.942324186860214e-05, + "loss": 0.042, + "num_input_tokens_seen": 6201248, + "step": 29390 + }, + { + "epoch": 3.2337733773377337, + "grad_norm": 1.2069814205169678, + "learning_rate": 4.94227291945727e-05, + "loss": 0.0948, + "num_input_tokens_seen": 6202304, + "step": 29395 + }, + { + "epoch": 3.2343234323432344, + "grad_norm": 1.2093485593795776, + "learning_rate": 4.9422216295450696e-05, + "loss": 0.0836, + "num_input_tokens_seen": 6203360, + "step": 29400 + }, + { + "epoch": 3.234873487348735, + "grad_norm": 0.048812247812747955, + "learning_rate": 4.942170317124084e-05, + "loss": 0.0366, + "num_input_tokens_seen": 6204352, + "step": 29405 + }, + { + "epoch": 3.2354235423542352, + "grad_norm": 0.3067646324634552, + "learning_rate": 4.942118982194787e-05, + "loss": 0.09, + "num_input_tokens_seen": 6205312, + "step": 29410 + }, + { + "epoch": 3.235973597359736, + "grad_norm": 0.07468432188034058, + "learning_rate": 4.94206762475765e-05, + "loss": 0.0231, + "num_input_tokens_seen": 6206400, + "step": 29415 + }, + { + "epoch": 3.2365236523652365, + "grad_norm": 0.042524389922618866, + "learning_rate": 4.942016244813148e-05, + "loss": 0.047, + "num_input_tokens_seen": 6207488, + "step": 29420 + }, + { + "epoch": 3.237073707370737, + "grad_norm": 0.489024817943573, + "learning_rate": 4.941964842361755e-05, + "loss": 0.0907, + "num_input_tokens_seen": 6208512, + "step": 29425 + }, + { + "epoch": 3.237623762376238, + "grad_norm": 0.0627373531460762, + "learning_rate": 4.941913417403943e-05, + "loss": 0.08, + "num_input_tokens_seen": 6209632, + "step": 29430 + }, + { + "epoch": 3.238173817381738, + "grad_norm": 0.04147151857614517, + "learning_rate": 4.941861969940187e-05, + "loss": 0.0438, + "num_input_tokens_seen": 6210656, + "step": 29435 + }, + { + "epoch": 3.2387238723872387, + "grad_norm": 0.33248835802078247, + "learning_rate": 4.9418104999709604e-05, + "loss": 0.1635, + "num_input_tokens_seen": 6211648, + "step": 29440 + }, + { + "epoch": 3.2392739273927393, + "grad_norm": 0.4825722873210907, + "learning_rate": 4.941759007496738e-05, + "loss": 0.056, + "num_input_tokens_seen": 6212768, + "step": 29445 + }, + { + "epoch": 3.23982398239824, + "grad_norm": 0.13001765310764313, + "learning_rate": 4.941707492517995e-05, + "loss": 0.0135, + "num_input_tokens_seen": 6213824, + "step": 29450 + }, + { + "epoch": 3.24037403740374, + "grad_norm": 0.09494322538375854, + "learning_rate": 4.941655955035206e-05, + "loss": 0.0788, + "num_input_tokens_seen": 6214912, + "step": 29455 + }, + { + "epoch": 3.240924092409241, + "grad_norm": 0.5008897185325623, + "learning_rate": 4.9416043950488454e-05, + "loss": 0.1063, + "num_input_tokens_seen": 6215968, + "step": 29460 + }, + { + "epoch": 3.2414741474147415, + "grad_norm": 0.4233980178833008, + "learning_rate": 4.941552812559389e-05, + "loss": 0.0266, + "num_input_tokens_seen": 6216992, + "step": 29465 + }, + { + "epoch": 3.242024202420242, + "grad_norm": 0.029339363798499107, + "learning_rate": 4.941501207567311e-05, + "loss": 0.0058, + "num_input_tokens_seen": 6218080, + "step": 29470 + }, + { + "epoch": 3.2425742574257423, + "grad_norm": 1.02633798122406, + "learning_rate": 4.9414495800730886e-05, + "loss": 0.1369, + "num_input_tokens_seen": 6219136, + "step": 29475 + }, + { + "epoch": 3.243124312431243, + "grad_norm": 0.6220920085906982, + "learning_rate": 4.9413979300771975e-05, + "loss": 0.0654, + "num_input_tokens_seen": 6220256, + "step": 29480 + }, + { + "epoch": 3.2436743674367436, + "grad_norm": 0.09481921046972275, + "learning_rate": 4.941346257580112e-05, + "loss": 0.0345, + "num_input_tokens_seen": 6221376, + "step": 29485 + }, + { + "epoch": 3.2442244224422443, + "grad_norm": 0.04388884827494621, + "learning_rate": 4.9412945625823096e-05, + "loss": 0.0381, + "num_input_tokens_seen": 6222432, + "step": 29490 + }, + { + "epoch": 3.244774477447745, + "grad_norm": 0.7171007990837097, + "learning_rate": 4.9412428450842676e-05, + "loss": 0.1208, + "num_input_tokens_seen": 6223520, + "step": 29495 + }, + { + "epoch": 3.245324532453245, + "grad_norm": 0.5360695719718933, + "learning_rate": 4.941191105086461e-05, + "loss": 0.0975, + "num_input_tokens_seen": 6224576, + "step": 29500 + }, + { + "epoch": 3.245874587458746, + "grad_norm": 0.12411320209503174, + "learning_rate": 4.941139342589367e-05, + "loss": 0.0393, + "num_input_tokens_seen": 6225664, + "step": 29505 + }, + { + "epoch": 3.2464246424642464, + "grad_norm": 0.8269591331481934, + "learning_rate": 4.941087557593464e-05, + "loss": 0.0804, + "num_input_tokens_seen": 6226752, + "step": 29510 + }, + { + "epoch": 3.246974697469747, + "grad_norm": 0.7804696559906006, + "learning_rate": 4.941035750099228e-05, + "loss": 0.0754, + "num_input_tokens_seen": 6227744, + "step": 29515 + }, + { + "epoch": 3.2475247524752477, + "grad_norm": 0.2700584828853607, + "learning_rate": 4.940983920107137e-05, + "loss": 0.0248, + "num_input_tokens_seen": 6228768, + "step": 29520 + }, + { + "epoch": 3.248074807480748, + "grad_norm": 0.4984608590602875, + "learning_rate": 4.940932067617668e-05, + "loss": 0.0323, + "num_input_tokens_seen": 6229824, + "step": 29525 + }, + { + "epoch": 3.2486248624862486, + "grad_norm": 0.04266531020402908, + "learning_rate": 4.9408801926313e-05, + "loss": 0.1076, + "num_input_tokens_seen": 6230816, + "step": 29530 + }, + { + "epoch": 3.2491749174917492, + "grad_norm": 0.6663475036621094, + "learning_rate": 4.9408282951485096e-05, + "loss": 0.1004, + "num_input_tokens_seen": 6231904, + "step": 29535 + }, + { + "epoch": 3.24972497249725, + "grad_norm": 0.082212433218956, + "learning_rate": 4.9407763751697766e-05, + "loss": 0.0805, + "num_input_tokens_seen": 6232928, + "step": 29540 + }, + { + "epoch": 3.25027502750275, + "grad_norm": 0.21818643808364868, + "learning_rate": 4.940724432695579e-05, + "loss": 0.0947, + "num_input_tokens_seen": 6233952, + "step": 29545 + }, + { + "epoch": 3.2508250825082508, + "grad_norm": 0.8892743587493896, + "learning_rate": 4.940672467726396e-05, + "loss": 0.0452, + "num_input_tokens_seen": 6234944, + "step": 29550 + }, + { + "epoch": 3.2513751375137514, + "grad_norm": 0.3896802067756653, + "learning_rate": 4.940620480262705e-05, + "loss": 0.0999, + "num_input_tokens_seen": 6235936, + "step": 29555 + }, + { + "epoch": 3.251925192519252, + "grad_norm": 0.12383395433425903, + "learning_rate": 4.9405684703049865e-05, + "loss": 0.0709, + "num_input_tokens_seen": 6237088, + "step": 29560 + }, + { + "epoch": 3.2524752475247523, + "grad_norm": 0.0984208807349205, + "learning_rate": 4.9405164378537195e-05, + "loss": 0.0848, + "num_input_tokens_seen": 6238176, + "step": 29565 + }, + { + "epoch": 3.253025302530253, + "grad_norm": 0.11406460404396057, + "learning_rate": 4.9404643829093835e-05, + "loss": 0.0309, + "num_input_tokens_seen": 6239200, + "step": 29570 + }, + { + "epoch": 3.2535753575357536, + "grad_norm": 0.14293919503688812, + "learning_rate": 4.940412305472459e-05, + "loss": 0.0832, + "num_input_tokens_seen": 6240288, + "step": 29575 + }, + { + "epoch": 3.254125412541254, + "grad_norm": 0.1403973400592804, + "learning_rate": 4.940360205543424e-05, + "loss": 0.0432, + "num_input_tokens_seen": 6241344, + "step": 29580 + }, + { + "epoch": 3.254675467546755, + "grad_norm": 0.15658342838287354, + "learning_rate": 4.940308083122761e-05, + "loss": 0.194, + "num_input_tokens_seen": 6242368, + "step": 29585 + }, + { + "epoch": 3.255225522552255, + "grad_norm": 0.09790980815887451, + "learning_rate": 4.940255938210949e-05, + "loss": 0.0173, + "num_input_tokens_seen": 6243392, + "step": 29590 + }, + { + "epoch": 3.2557755775577557, + "grad_norm": 0.0943593829870224, + "learning_rate": 4.9402037708084694e-05, + "loss": 0.0359, + "num_input_tokens_seen": 6244480, + "step": 29595 + }, + { + "epoch": 3.2563256325632564, + "grad_norm": 0.11053353548049927, + "learning_rate": 4.940151580915803e-05, + "loss": 0.1682, + "num_input_tokens_seen": 6245600, + "step": 29600 + }, + { + "epoch": 3.256875687568757, + "grad_norm": 0.18557330965995789, + "learning_rate": 4.940099368533429e-05, + "loss": 0.0328, + "num_input_tokens_seen": 6246656, + "step": 29605 + }, + { + "epoch": 3.2574257425742577, + "grad_norm": 0.6590675115585327, + "learning_rate": 4.940047133661831e-05, + "loss": 0.074, + "num_input_tokens_seen": 6247744, + "step": 29610 + }, + { + "epoch": 3.257975797579758, + "grad_norm": 0.11514270305633545, + "learning_rate": 4.9399948763014894e-05, + "loss": 0.0524, + "num_input_tokens_seen": 6248832, + "step": 29615 + }, + { + "epoch": 3.2585258525852585, + "grad_norm": 0.8046231865882874, + "learning_rate": 4.939942596452885e-05, + "loss": 0.1581, + "num_input_tokens_seen": 6249888, + "step": 29620 + }, + { + "epoch": 3.259075907590759, + "grad_norm": 1.0660573244094849, + "learning_rate": 4.939890294116501e-05, + "loss": 0.0765, + "num_input_tokens_seen": 6250912, + "step": 29625 + }, + { + "epoch": 3.25962596259626, + "grad_norm": 0.5920246243476868, + "learning_rate": 4.9398379692928185e-05, + "loss": 0.0337, + "num_input_tokens_seen": 6251936, + "step": 29630 + }, + { + "epoch": 3.26017601760176, + "grad_norm": 0.08365805447101593, + "learning_rate": 4.939785621982321e-05, + "loss": 0.0144, + "num_input_tokens_seen": 6253056, + "step": 29635 + }, + { + "epoch": 3.2607260726072607, + "grad_norm": 0.19039975106716156, + "learning_rate": 4.93973325218549e-05, + "loss": 0.0224, + "num_input_tokens_seen": 6254144, + "step": 29640 + }, + { + "epoch": 3.2612761276127613, + "grad_norm": 0.7363401651382446, + "learning_rate": 4.939680859902808e-05, + "loss": 0.0605, + "num_input_tokens_seen": 6255200, + "step": 29645 + }, + { + "epoch": 3.261826182618262, + "grad_norm": 0.6878530979156494, + "learning_rate": 4.939628445134759e-05, + "loss": 0.0418, + "num_input_tokens_seen": 6256288, + "step": 29650 + }, + { + "epoch": 3.262376237623762, + "grad_norm": 0.06759044528007507, + "learning_rate": 4.9395760078818235e-05, + "loss": 0.0397, + "num_input_tokens_seen": 6257312, + "step": 29655 + }, + { + "epoch": 3.262926292629263, + "grad_norm": 0.9868612289428711, + "learning_rate": 4.939523548144488e-05, + "loss": 0.0498, + "num_input_tokens_seen": 6258368, + "step": 29660 + }, + { + "epoch": 3.2634763476347635, + "grad_norm": 0.30761682987213135, + "learning_rate": 4.9394710659232343e-05, + "loss": 0.0357, + "num_input_tokens_seen": 6259392, + "step": 29665 + }, + { + "epoch": 3.264026402640264, + "grad_norm": 0.4114011526107788, + "learning_rate": 4.939418561218546e-05, + "loss": 0.0431, + "num_input_tokens_seen": 6260384, + "step": 29670 + }, + { + "epoch": 3.2645764576457648, + "grad_norm": 0.040663715451955795, + "learning_rate": 4.939366034030908e-05, + "loss": 0.0214, + "num_input_tokens_seen": 6261376, + "step": 29675 + }, + { + "epoch": 3.265126512651265, + "grad_norm": 0.4466766119003296, + "learning_rate": 4.9393134843608027e-05, + "loss": 0.0534, + "num_input_tokens_seen": 6262400, + "step": 29680 + }, + { + "epoch": 3.2656765676567656, + "grad_norm": 0.018718816339969635, + "learning_rate": 4.9392609122087165e-05, + "loss": 0.0152, + "num_input_tokens_seen": 6263488, + "step": 29685 + }, + { + "epoch": 3.2662266226622663, + "grad_norm": 0.02362157218158245, + "learning_rate": 4.9392083175751325e-05, + "loss": 0.06, + "num_input_tokens_seen": 6264544, + "step": 29690 + }, + { + "epoch": 3.266776677667767, + "grad_norm": 0.892661988735199, + "learning_rate": 4.939155700460536e-05, + "loss": 0.0543, + "num_input_tokens_seen": 6265568, + "step": 29695 + }, + { + "epoch": 3.2673267326732676, + "grad_norm": 1.1514016389846802, + "learning_rate": 4.939103060865412e-05, + "loss": 0.0579, + "num_input_tokens_seen": 6266560, + "step": 29700 + }, + { + "epoch": 3.2678767876787678, + "grad_norm": 0.028829116374254227, + "learning_rate": 4.939050398790245e-05, + "loss": 0.0697, + "num_input_tokens_seen": 6267584, + "step": 29705 + }, + { + "epoch": 3.2684268426842684, + "grad_norm": 0.8632187843322754, + "learning_rate": 4.9389977142355214e-05, + "loss": 0.0943, + "num_input_tokens_seen": 6268608, + "step": 29710 + }, + { + "epoch": 3.268976897689769, + "grad_norm": 0.08822024613618851, + "learning_rate": 4.9389450072017264e-05, + "loss": 0.0834, + "num_input_tokens_seen": 6269632, + "step": 29715 + }, + { + "epoch": 3.2695269526952697, + "grad_norm": 0.8596160411834717, + "learning_rate": 4.938892277689344e-05, + "loss": 0.0528, + "num_input_tokens_seen": 6270752, + "step": 29720 + }, + { + "epoch": 3.27007700770077, + "grad_norm": 0.49438542127609253, + "learning_rate": 4.938839525698863e-05, + "loss": 0.1015, + "num_input_tokens_seen": 6271808, + "step": 29725 + }, + { + "epoch": 3.2706270627062706, + "grad_norm": 0.03958214446902275, + "learning_rate": 4.938786751230769e-05, + "loss": 0.0438, + "num_input_tokens_seen": 6272864, + "step": 29730 + }, + { + "epoch": 3.271177117711771, + "grad_norm": 0.06788498908281326, + "learning_rate": 4.938733954285547e-05, + "loss": 0.034, + "num_input_tokens_seen": 6273920, + "step": 29735 + }, + { + "epoch": 3.271727172717272, + "grad_norm": 0.050409622490406036, + "learning_rate": 4.938681134863684e-05, + "loss": 0.0276, + "num_input_tokens_seen": 6274976, + "step": 29740 + }, + { + "epoch": 3.272277227722772, + "grad_norm": 0.0724492222070694, + "learning_rate": 4.938628292965668e-05, + "loss": 0.0359, + "num_input_tokens_seen": 6276096, + "step": 29745 + }, + { + "epoch": 3.2728272827282727, + "grad_norm": 0.06238600239157677, + "learning_rate": 4.938575428591985e-05, + "loss": 0.0208, + "num_input_tokens_seen": 6277088, + "step": 29750 + }, + { + "epoch": 3.2733773377337734, + "grad_norm": 0.22690576314926147, + "learning_rate": 4.938522541743122e-05, + "loss": 0.0579, + "num_input_tokens_seen": 6278176, + "step": 29755 + }, + { + "epoch": 3.273927392739274, + "grad_norm": 0.27946338057518005, + "learning_rate": 4.9384696324195664e-05, + "loss": 0.0343, + "num_input_tokens_seen": 6279296, + "step": 29760 + }, + { + "epoch": 3.2744774477447747, + "grad_norm": 0.18868261575698853, + "learning_rate": 4.938416700621807e-05, + "loss": 0.0398, + "num_input_tokens_seen": 6280352, + "step": 29765 + }, + { + "epoch": 3.275027502750275, + "grad_norm": 0.014729839749634266, + "learning_rate": 4.9383637463503314e-05, + "loss": 0.0671, + "num_input_tokens_seen": 6281440, + "step": 29770 + }, + { + "epoch": 3.2755775577557755, + "grad_norm": 0.07520736008882523, + "learning_rate": 4.938310769605626e-05, + "loss": 0.0153, + "num_input_tokens_seen": 6282496, + "step": 29775 + }, + { + "epoch": 3.276127612761276, + "grad_norm": 0.4828885793685913, + "learning_rate": 4.938257770388182e-05, + "loss": 0.0912, + "num_input_tokens_seen": 6283520, + "step": 29780 + }, + { + "epoch": 3.276677667766777, + "grad_norm": 0.15336738526821136, + "learning_rate": 4.9382047486984847e-05, + "loss": 0.1056, + "num_input_tokens_seen": 6284672, + "step": 29785 + }, + { + "epoch": 3.2772277227722775, + "grad_norm": 0.44673749804496765, + "learning_rate": 4.938151704537024e-05, + "loss": 0.0536, + "num_input_tokens_seen": 6285664, + "step": 29790 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.058120205998420715, + "learning_rate": 4.9380986379042896e-05, + "loss": 0.0532, + "num_input_tokens_seen": 6286688, + "step": 29795 + }, + { + "epoch": 3.2783278327832783, + "grad_norm": 1.2546590566635132, + "learning_rate": 4.93804554880077e-05, + "loss": 0.2417, + "num_input_tokens_seen": 6287712, + "step": 29800 + }, + { + "epoch": 3.278877887788779, + "grad_norm": 0.033755477517843246, + "learning_rate": 4.937992437226955e-05, + "loss": 0.0313, + "num_input_tokens_seen": 6288800, + "step": 29805 + }, + { + "epoch": 3.279427942794279, + "grad_norm": 1.6658859252929688, + "learning_rate": 4.937939303183332e-05, + "loss": 0.0742, + "num_input_tokens_seen": 6289888, + "step": 29810 + }, + { + "epoch": 3.27997799779978, + "grad_norm": 0.08530577272176743, + "learning_rate": 4.9378861466703935e-05, + "loss": 0.0489, + "num_input_tokens_seen": 6290880, + "step": 29815 + }, + { + "epoch": 3.2805280528052805, + "grad_norm": 0.3155209720134735, + "learning_rate": 4.937832967688628e-05, + "loss": 0.0722, + "num_input_tokens_seen": 6291872, + "step": 29820 + }, + { + "epoch": 3.281078107810781, + "grad_norm": 0.03575904294848442, + "learning_rate": 4.937779766238526e-05, + "loss": 0.0436, + "num_input_tokens_seen": 6292928, + "step": 29825 + }, + { + "epoch": 3.281628162816282, + "grad_norm": 0.028952883556485176, + "learning_rate": 4.937726542320577e-05, + "loss": 0.0255, + "num_input_tokens_seen": 6294016, + "step": 29830 + }, + { + "epoch": 3.282178217821782, + "grad_norm": 1.0681391954421997, + "learning_rate": 4.937673295935272e-05, + "loss": 0.0626, + "num_input_tokens_seen": 6295040, + "step": 29835 + }, + { + "epoch": 3.2827282728272826, + "grad_norm": 0.03384501859545708, + "learning_rate": 4.937620027083103e-05, + "loss": 0.0179, + "num_input_tokens_seen": 6296096, + "step": 29840 + }, + { + "epoch": 3.2832783278327833, + "grad_norm": 0.08045529574155807, + "learning_rate": 4.937566735764559e-05, + "loss": 0.0803, + "num_input_tokens_seen": 6297152, + "step": 29845 + }, + { + "epoch": 3.283828382838284, + "grad_norm": 0.14323769509792328, + "learning_rate": 4.937513421980133e-05, + "loss": 0.0264, + "num_input_tokens_seen": 6298144, + "step": 29850 + }, + { + "epoch": 3.2843784378437846, + "grad_norm": 0.0219330545514822, + "learning_rate": 4.937460085730315e-05, + "loss": 0.0641, + "num_input_tokens_seen": 6299200, + "step": 29855 + }, + { + "epoch": 3.284928492849285, + "grad_norm": 0.012868747115135193, + "learning_rate": 4.937406727015596e-05, + "loss": 0.0356, + "num_input_tokens_seen": 6300256, + "step": 29860 + }, + { + "epoch": 3.2854785478547854, + "grad_norm": 0.5635441541671753, + "learning_rate": 4.9373533458364704e-05, + "loss": 0.0508, + "num_input_tokens_seen": 6301248, + "step": 29865 + }, + { + "epoch": 3.286028602860286, + "grad_norm": 0.05246094986796379, + "learning_rate": 4.937299942193427e-05, + "loss": 0.0058, + "num_input_tokens_seen": 6302336, + "step": 29870 + }, + { + "epoch": 3.2865786578657867, + "grad_norm": 0.03495827689766884, + "learning_rate": 4.937246516086961e-05, + "loss": 0.1086, + "num_input_tokens_seen": 6303392, + "step": 29875 + }, + { + "epoch": 3.287128712871287, + "grad_norm": 0.4727918803691864, + "learning_rate": 4.937193067517562e-05, + "loss": 0.019, + "num_input_tokens_seen": 6304448, + "step": 29880 + }, + { + "epoch": 3.2876787678767876, + "grad_norm": 0.3513128459453583, + "learning_rate": 4.9371395964857256e-05, + "loss": 0.0313, + "num_input_tokens_seen": 6305568, + "step": 29885 + }, + { + "epoch": 3.2882288228822882, + "grad_norm": 0.3477259576320648, + "learning_rate": 4.9370861029919425e-05, + "loss": 0.1421, + "num_input_tokens_seen": 6306592, + "step": 29890 + }, + { + "epoch": 3.288778877887789, + "grad_norm": 0.05618960037827492, + "learning_rate": 4.937032587036705e-05, + "loss": 0.1439, + "num_input_tokens_seen": 6307648, + "step": 29895 + }, + { + "epoch": 3.289328932893289, + "grad_norm": 0.5083011984825134, + "learning_rate": 4.9369790486205086e-05, + "loss": 0.0639, + "num_input_tokens_seen": 6308736, + "step": 29900 + }, + { + "epoch": 3.2898789878987897, + "grad_norm": 0.4353386461734772, + "learning_rate": 4.936925487743845e-05, + "loss": 0.0959, + "num_input_tokens_seen": 6309792, + "step": 29905 + }, + { + "epoch": 3.2904290429042904, + "grad_norm": 0.6693413257598877, + "learning_rate": 4.93687190440721e-05, + "loss": 0.0635, + "num_input_tokens_seen": 6310816, + "step": 29910 + }, + { + "epoch": 3.290979097909791, + "grad_norm": 0.4319276213645935, + "learning_rate": 4.936818298611094e-05, + "loss": 0.04, + "num_input_tokens_seen": 6311872, + "step": 29915 + }, + { + "epoch": 3.2915291529152917, + "grad_norm": 0.05228602886199951, + "learning_rate": 4.936764670355994e-05, + "loss": 0.0323, + "num_input_tokens_seen": 6312928, + "step": 29920 + }, + { + "epoch": 3.292079207920792, + "grad_norm": 0.18509428203105927, + "learning_rate": 4.936711019642404e-05, + "loss": 0.0593, + "num_input_tokens_seen": 6313952, + "step": 29925 + }, + { + "epoch": 3.2926292629262925, + "grad_norm": 0.033895980566740036, + "learning_rate": 4.936657346470816e-05, + "loss": 0.0414, + "num_input_tokens_seen": 6314944, + "step": 29930 + }, + { + "epoch": 3.293179317931793, + "grad_norm": 0.25376835465431213, + "learning_rate": 4.936603650841728e-05, + "loss": 0.0483, + "num_input_tokens_seen": 6315968, + "step": 29935 + }, + { + "epoch": 3.293729372937294, + "grad_norm": 1.0069509744644165, + "learning_rate": 4.936549932755633e-05, + "loss": 0.0659, + "num_input_tokens_seen": 6316928, + "step": 29940 + }, + { + "epoch": 3.2942794279427945, + "grad_norm": 0.4142099618911743, + "learning_rate": 4.936496192213027e-05, + "loss": 0.0281, + "num_input_tokens_seen": 6317984, + "step": 29945 + }, + { + "epoch": 3.2948294829482947, + "grad_norm": 1.7015759944915771, + "learning_rate": 4.936442429214404e-05, + "loss": 0.0405, + "num_input_tokens_seen": 6319040, + "step": 29950 + }, + { + "epoch": 3.2953795379537953, + "grad_norm": 0.4765954613685608, + "learning_rate": 4.93638864376026e-05, + "loss": 0.0942, + "num_input_tokens_seen": 6320064, + "step": 29955 + }, + { + "epoch": 3.295929592959296, + "grad_norm": 0.054377391934394836, + "learning_rate": 4.936334835851092e-05, + "loss": 0.0206, + "num_input_tokens_seen": 6321088, + "step": 29960 + }, + { + "epoch": 3.2964796479647966, + "grad_norm": 0.14372067153453827, + "learning_rate": 4.936281005487394e-05, + "loss": 0.0443, + "num_input_tokens_seen": 6322112, + "step": 29965 + }, + { + "epoch": 3.297029702970297, + "grad_norm": 0.8070692420005798, + "learning_rate": 4.936227152669663e-05, + "loss": 0.037, + "num_input_tokens_seen": 6323136, + "step": 29970 + }, + { + "epoch": 3.2975797579757975, + "grad_norm": 0.7909713983535767, + "learning_rate": 4.936173277398395e-05, + "loss": 0.0542, + "num_input_tokens_seen": 6324160, + "step": 29975 + }, + { + "epoch": 3.298129812981298, + "grad_norm": 1.9803129434585571, + "learning_rate": 4.936119379674088e-05, + "loss": 0.1548, + "num_input_tokens_seen": 6325248, + "step": 29980 + }, + { + "epoch": 3.298679867986799, + "grad_norm": 0.3252840042114258, + "learning_rate": 4.9360654594972365e-05, + "loss": 0.0345, + "num_input_tokens_seen": 6326240, + "step": 29985 + }, + { + "epoch": 3.299229922992299, + "grad_norm": 0.027730325236916542, + "learning_rate": 4.9360115168683396e-05, + "loss": 0.044, + "num_input_tokens_seen": 6327360, + "step": 29990 + }, + { + "epoch": 3.2997799779977997, + "grad_norm": 0.7488242983818054, + "learning_rate": 4.935957551787893e-05, + "loss": 0.0332, + "num_input_tokens_seen": 6328448, + "step": 29995 + }, + { + "epoch": 3.3003300330033003, + "grad_norm": 0.6341339945793152, + "learning_rate": 4.9359035642563946e-05, + "loss": 0.0653, + "num_input_tokens_seen": 6329504, + "step": 30000 + }, + { + "epoch": 3.300880088008801, + "grad_norm": 0.7118903398513794, + "learning_rate": 4.935849554274342e-05, + "loss": 0.0811, + "num_input_tokens_seen": 6330560, + "step": 30005 + }, + { + "epoch": 3.3014301430143016, + "grad_norm": 0.10852974653244019, + "learning_rate": 4.935795521842233e-05, + "loss": 0.0936, + "num_input_tokens_seen": 6331648, + "step": 30010 + }, + { + "epoch": 3.301980198019802, + "grad_norm": 0.7197789549827576, + "learning_rate": 4.935741466960565e-05, + "loss": 0.0388, + "num_input_tokens_seen": 6332736, + "step": 30015 + }, + { + "epoch": 3.3025302530253025, + "grad_norm": 0.05078168958425522, + "learning_rate": 4.935687389629837e-05, + "loss": 0.0094, + "num_input_tokens_seen": 6333824, + "step": 30020 + }, + { + "epoch": 3.303080308030803, + "grad_norm": 0.8315606713294983, + "learning_rate": 4.935633289850547e-05, + "loss": 0.0482, + "num_input_tokens_seen": 6334816, + "step": 30025 + }, + { + "epoch": 3.3036303630363038, + "grad_norm": 0.037579648196697235, + "learning_rate": 4.935579167623193e-05, + "loss": 0.0129, + "num_input_tokens_seen": 6335936, + "step": 30030 + }, + { + "epoch": 3.3041804180418044, + "grad_norm": 0.7353687286376953, + "learning_rate": 4.9355250229482755e-05, + "loss": 0.0617, + "num_input_tokens_seen": 6337024, + "step": 30035 + }, + { + "epoch": 3.3047304730473046, + "grad_norm": 0.03035540133714676, + "learning_rate": 4.935470855826292e-05, + "loss": 0.0941, + "num_input_tokens_seen": 6338112, + "step": 30040 + }, + { + "epoch": 3.3052805280528053, + "grad_norm": 1.0245543718338013, + "learning_rate": 4.935416666257743e-05, + "loss": 0.0594, + "num_input_tokens_seen": 6339168, + "step": 30045 + }, + { + "epoch": 3.305830583058306, + "grad_norm": 0.17553287744522095, + "learning_rate": 4.935362454243127e-05, + "loss": 0.0281, + "num_input_tokens_seen": 6340192, + "step": 30050 + }, + { + "epoch": 3.3063806380638066, + "grad_norm": 0.051518868654966354, + "learning_rate": 4.9353082197829435e-05, + "loss": 0.1429, + "num_input_tokens_seen": 6341216, + "step": 30055 + }, + { + "epoch": 3.3069306930693068, + "grad_norm": 1.7678743600845337, + "learning_rate": 4.935253962877693e-05, + "loss": 0.132, + "num_input_tokens_seen": 6342272, + "step": 30060 + }, + { + "epoch": 3.3074807480748074, + "grad_norm": 1.4481837749481201, + "learning_rate": 4.9351996835278746e-05, + "loss": 0.0996, + "num_input_tokens_seen": 6343328, + "step": 30065 + }, + { + "epoch": 3.308030803080308, + "grad_norm": 0.03392248973250389, + "learning_rate": 4.93514538173399e-05, + "loss": 0.0278, + "num_input_tokens_seen": 6344352, + "step": 30070 + }, + { + "epoch": 3.3085808580858087, + "grad_norm": 0.5743772387504578, + "learning_rate": 4.9350910574965385e-05, + "loss": 0.025, + "num_input_tokens_seen": 6345344, + "step": 30075 + }, + { + "epoch": 3.309130913091309, + "grad_norm": 0.8297425508499146, + "learning_rate": 4.935036710816021e-05, + "loss": 0.0554, + "num_input_tokens_seen": 6346304, + "step": 30080 + }, + { + "epoch": 3.3096809680968096, + "grad_norm": 0.09115409106016159, + "learning_rate": 4.93498234169294e-05, + "loss": 0.1256, + "num_input_tokens_seen": 6347360, + "step": 30085 + }, + { + "epoch": 3.31023102310231, + "grad_norm": 0.0740852802991867, + "learning_rate": 4.9349279501277935e-05, + "loss": 0.011, + "num_input_tokens_seen": 6348448, + "step": 30090 + }, + { + "epoch": 3.310781078107811, + "grad_norm": 1.2607817649841309, + "learning_rate": 4.934873536121085e-05, + "loss": 0.1107, + "num_input_tokens_seen": 6349536, + "step": 30095 + }, + { + "epoch": 3.3113311331133115, + "grad_norm": 0.05417655408382416, + "learning_rate": 4.9348190996733155e-05, + "loss": 0.0476, + "num_input_tokens_seen": 6350560, + "step": 30100 + }, + { + "epoch": 3.3118811881188117, + "grad_norm": 1.6570719480514526, + "learning_rate": 4.934764640784987e-05, + "loss": 0.0764, + "num_input_tokens_seen": 6351584, + "step": 30105 + }, + { + "epoch": 3.3124312431243124, + "grad_norm": 0.1420852541923523, + "learning_rate": 4.9347101594566005e-05, + "loss": 0.0291, + "num_input_tokens_seen": 6352640, + "step": 30110 + }, + { + "epoch": 3.312981298129813, + "grad_norm": 0.18566180765628815, + "learning_rate": 4.9346556556886594e-05, + "loss": 0.0378, + "num_input_tokens_seen": 6353792, + "step": 30115 + }, + { + "epoch": 3.3135313531353137, + "grad_norm": 0.19092953205108643, + "learning_rate": 4.934601129481665e-05, + "loss": 0.051, + "num_input_tokens_seen": 6354816, + "step": 30120 + }, + { + "epoch": 3.3140814081408143, + "grad_norm": 0.29085084795951843, + "learning_rate": 4.9345465808361205e-05, + "loss": 0.0671, + "num_input_tokens_seen": 6355872, + "step": 30125 + }, + { + "epoch": 3.3146314631463145, + "grad_norm": 0.08400076627731323, + "learning_rate": 4.9344920097525284e-05, + "loss": 0.068, + "num_input_tokens_seen": 6356896, + "step": 30130 + }, + { + "epoch": 3.315181518151815, + "grad_norm": 0.8929714560508728, + "learning_rate": 4.934437416231391e-05, + "loss": 0.0302, + "num_input_tokens_seen": 6357952, + "step": 30135 + }, + { + "epoch": 3.315731573157316, + "grad_norm": 0.10472533851861954, + "learning_rate": 4.934382800273213e-05, + "loss": 0.0381, + "num_input_tokens_seen": 6359072, + "step": 30140 + }, + { + "epoch": 3.3162816281628165, + "grad_norm": 0.11040901392698288, + "learning_rate": 4.9343281618784965e-05, + "loss": 0.1105, + "num_input_tokens_seen": 6360128, + "step": 30145 + }, + { + "epoch": 3.3168316831683167, + "grad_norm": 1.1418297290802002, + "learning_rate": 4.9342735010477445e-05, + "loss": 0.039, + "num_input_tokens_seen": 6361152, + "step": 30150 + }, + { + "epoch": 3.3173817381738173, + "grad_norm": 1.36910879611969, + "learning_rate": 4.934218817781463e-05, + "loss": 0.138, + "num_input_tokens_seen": 6362208, + "step": 30155 + }, + { + "epoch": 3.317931793179318, + "grad_norm": 0.4185540974140167, + "learning_rate": 4.934164112080154e-05, + "loss": 0.0359, + "num_input_tokens_seen": 6363232, + "step": 30160 + }, + { + "epoch": 3.3184818481848186, + "grad_norm": 0.030179854482412338, + "learning_rate": 4.934109383944323e-05, + "loss": 0.0073, + "num_input_tokens_seen": 6364288, + "step": 30165 + }, + { + "epoch": 3.319031903190319, + "grad_norm": 0.03204780071973801, + "learning_rate": 4.9340546333744736e-05, + "loss": 0.0497, + "num_input_tokens_seen": 6365280, + "step": 30170 + }, + { + "epoch": 3.3195819581958195, + "grad_norm": 0.36556559801101685, + "learning_rate": 4.9339998603711105e-05, + "loss": 0.0189, + "num_input_tokens_seen": 6366336, + "step": 30175 + }, + { + "epoch": 3.32013201320132, + "grad_norm": 0.03800184279680252, + "learning_rate": 4.9339450649347384e-05, + "loss": 0.0533, + "num_input_tokens_seen": 6367360, + "step": 30180 + }, + { + "epoch": 3.3206820682068208, + "grad_norm": 1.3296936750411987, + "learning_rate": 4.933890247065863e-05, + "loss": 0.0946, + "num_input_tokens_seen": 6368320, + "step": 30185 + }, + { + "epoch": 3.3212321232123214, + "grad_norm": 0.15436704456806183, + "learning_rate": 4.933835406764989e-05, + "loss": 0.1112, + "num_input_tokens_seen": 6369440, + "step": 30190 + }, + { + "epoch": 3.3217821782178216, + "grad_norm": 0.24123336374759674, + "learning_rate": 4.933780544032621e-05, + "loss": 0.021, + "num_input_tokens_seen": 6370560, + "step": 30195 + }, + { + "epoch": 3.3223322332233223, + "grad_norm": 0.18558357656002045, + "learning_rate": 4.933725658869267e-05, + "loss": 0.0167, + "num_input_tokens_seen": 6371648, + "step": 30200 + }, + { + "epoch": 3.322882288228823, + "grad_norm": 0.05135392025113106, + "learning_rate": 4.933670751275431e-05, + "loss": 0.0497, + "num_input_tokens_seen": 6372736, + "step": 30205 + }, + { + "epoch": 3.3234323432343236, + "grad_norm": 0.3001079559326172, + "learning_rate": 4.933615821251619e-05, + "loss": 0.0086, + "num_input_tokens_seen": 6373792, + "step": 30210 + }, + { + "epoch": 3.323982398239824, + "grad_norm": 2.381901979446411, + "learning_rate": 4.933560868798338e-05, + "loss": 0.1301, + "num_input_tokens_seen": 6374784, + "step": 30215 + }, + { + "epoch": 3.3245324532453244, + "grad_norm": 1.0293110609054565, + "learning_rate": 4.933505893916094e-05, + "loss": 0.0534, + "num_input_tokens_seen": 6375872, + "step": 30220 + }, + { + "epoch": 3.325082508250825, + "grad_norm": 0.04901180788874626, + "learning_rate": 4.9334508966053946e-05, + "loss": 0.0508, + "num_input_tokens_seen": 6376928, + "step": 30225 + }, + { + "epoch": 3.3256325632563257, + "grad_norm": 0.06814918667078018, + "learning_rate": 4.9333958768667455e-05, + "loss": 0.0054, + "num_input_tokens_seen": 6377952, + "step": 30230 + }, + { + "epoch": 3.3261826182618264, + "grad_norm": 0.3029877245426178, + "learning_rate": 4.9333408347006535e-05, + "loss": 0.0824, + "num_input_tokens_seen": 6379008, + "step": 30235 + }, + { + "epoch": 3.3267326732673266, + "grad_norm": 1.8065693378448486, + "learning_rate": 4.9332857701076274e-05, + "loss": 0.1372, + "num_input_tokens_seen": 6380032, + "step": 30240 + }, + { + "epoch": 3.3272827282728272, + "grad_norm": 0.8779808282852173, + "learning_rate": 4.933230683088174e-05, + "loss": 0.1109, + "num_input_tokens_seen": 6381120, + "step": 30245 + }, + { + "epoch": 3.327832783278328, + "grad_norm": 4.206660747528076, + "learning_rate": 4.9331755736428006e-05, + "loss": 0.059, + "num_input_tokens_seen": 6382208, + "step": 30250 + }, + { + "epoch": 3.3283828382838285, + "grad_norm": 0.8253380060195923, + "learning_rate": 4.933120441772017e-05, + "loss": 0.0588, + "num_input_tokens_seen": 6383232, + "step": 30255 + }, + { + "epoch": 3.3289328932893287, + "grad_norm": 0.25194165110588074, + "learning_rate": 4.9330652874763275e-05, + "loss": 0.0518, + "num_input_tokens_seen": 6384256, + "step": 30260 + }, + { + "epoch": 3.3294829482948294, + "grad_norm": 0.9838925004005432, + "learning_rate": 4.9330101107562444e-05, + "loss": 0.1609, + "num_input_tokens_seen": 6385344, + "step": 30265 + }, + { + "epoch": 3.33003300330033, + "grad_norm": 0.03744430094957352, + "learning_rate": 4.9329549116122745e-05, + "loss": 0.0113, + "num_input_tokens_seen": 6386336, + "step": 30270 + }, + { + "epoch": 3.3305830583058307, + "grad_norm": 1.5226163864135742, + "learning_rate": 4.9328996900449254e-05, + "loss": 0.1329, + "num_input_tokens_seen": 6387424, + "step": 30275 + }, + { + "epoch": 3.3311331133113313, + "grad_norm": 0.7600492238998413, + "learning_rate": 4.932844446054709e-05, + "loss": 0.1117, + "num_input_tokens_seen": 6388416, + "step": 30280 + }, + { + "epoch": 3.3316831683168315, + "grad_norm": 0.8062810897827148, + "learning_rate": 4.9327891796421314e-05, + "loss": 0.0201, + "num_input_tokens_seen": 6389504, + "step": 30285 + }, + { + "epoch": 3.332233223322332, + "grad_norm": 0.027132673189044, + "learning_rate": 4.932733890807703e-05, + "loss": 0.0856, + "num_input_tokens_seen": 6390560, + "step": 30290 + }, + { + "epoch": 3.332783278327833, + "grad_norm": 0.17342419922351837, + "learning_rate": 4.932678579551935e-05, + "loss": 0.0183, + "num_input_tokens_seen": 6391584, + "step": 30295 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.9743140339851379, + "learning_rate": 4.9326232458753345e-05, + "loss": 0.125, + "num_input_tokens_seen": 6392640, + "step": 30300 + }, + { + "epoch": 3.333883388338834, + "grad_norm": 0.1724480837583542, + "learning_rate": 4.932567889778413e-05, + "loss": 0.1215, + "num_input_tokens_seen": 6393696, + "step": 30305 + }, + { + "epoch": 3.3344334433443343, + "grad_norm": 0.051947418600320816, + "learning_rate": 4.932512511261681e-05, + "loss": 0.0544, + "num_input_tokens_seen": 6394688, + "step": 30310 + }, + { + "epoch": 3.334983498349835, + "grad_norm": 1.1283769607543945, + "learning_rate": 4.932457110325649e-05, + "loss": 0.1732, + "num_input_tokens_seen": 6395808, + "step": 30315 + }, + { + "epoch": 3.3355335533553356, + "grad_norm": 0.1674806922674179, + "learning_rate": 4.9324016869708264e-05, + "loss": 0.1007, + "num_input_tokens_seen": 6396832, + "step": 30320 + }, + { + "epoch": 3.336083608360836, + "grad_norm": 0.022609282284975052, + "learning_rate": 4.932346241197725e-05, + "loss": 0.0652, + "num_input_tokens_seen": 6397888, + "step": 30325 + }, + { + "epoch": 3.3366336633663365, + "grad_norm": 0.30897486209869385, + "learning_rate": 4.932290773006855e-05, + "loss": 0.0224, + "num_input_tokens_seen": 6398944, + "step": 30330 + }, + { + "epoch": 3.337183718371837, + "grad_norm": 0.04978257417678833, + "learning_rate": 4.9322352823987284e-05, + "loss": 0.1465, + "num_input_tokens_seen": 6400032, + "step": 30335 + }, + { + "epoch": 3.337733773377338, + "grad_norm": 0.09714334458112717, + "learning_rate": 4.9321797693738555e-05, + "loss": 0.0596, + "num_input_tokens_seen": 6401088, + "step": 30340 + }, + { + "epoch": 3.3382838283828384, + "grad_norm": 0.6681616902351379, + "learning_rate": 4.9321242339327504e-05, + "loss": 0.0327, + "num_input_tokens_seen": 6402144, + "step": 30345 + }, + { + "epoch": 3.3388338833883386, + "grad_norm": 0.2790820896625519, + "learning_rate": 4.932068676075922e-05, + "loss": 0.0457, + "num_input_tokens_seen": 6403200, + "step": 30350 + }, + { + "epoch": 3.3393839383938393, + "grad_norm": 0.18452683091163635, + "learning_rate": 4.9320130958038835e-05, + "loss": 0.1052, + "num_input_tokens_seen": 6404288, + "step": 30355 + }, + { + "epoch": 3.33993399339934, + "grad_norm": 0.20399680733680725, + "learning_rate": 4.931957493117148e-05, + "loss": 0.0477, + "num_input_tokens_seen": 6405312, + "step": 30360 + }, + { + "epoch": 3.3404840484048406, + "grad_norm": 0.08018314838409424, + "learning_rate": 4.931901868016227e-05, + "loss": 0.0978, + "num_input_tokens_seen": 6406432, + "step": 30365 + }, + { + "epoch": 3.3410341034103412, + "grad_norm": 0.3528279960155487, + "learning_rate": 4.9318462205016324e-05, + "loss": 0.032, + "num_input_tokens_seen": 6407520, + "step": 30370 + }, + { + "epoch": 3.3415841584158414, + "grad_norm": 0.756486177444458, + "learning_rate": 4.931790550573879e-05, + "loss": 0.1003, + "num_input_tokens_seen": 6408544, + "step": 30375 + }, + { + "epoch": 3.342134213421342, + "grad_norm": 0.25966137647628784, + "learning_rate": 4.931734858233479e-05, + "loss": 0.1884, + "num_input_tokens_seen": 6409568, + "step": 30380 + }, + { + "epoch": 3.3426842684268427, + "grad_norm": 0.34502434730529785, + "learning_rate": 4.931679143480946e-05, + "loss": 0.0637, + "num_input_tokens_seen": 6410624, + "step": 30385 + }, + { + "epoch": 3.3432343234323434, + "grad_norm": 0.39792507886886597, + "learning_rate": 4.931623406316793e-05, + "loss": 0.0602, + "num_input_tokens_seen": 6411680, + "step": 30390 + }, + { + "epoch": 3.3437843784378436, + "grad_norm": 0.16419968008995056, + "learning_rate": 4.931567646741534e-05, + "loss": 0.0421, + "num_input_tokens_seen": 6412672, + "step": 30395 + }, + { + "epoch": 3.3443344334433442, + "grad_norm": 0.09633566439151764, + "learning_rate": 4.9315118647556824e-05, + "loss": 0.0211, + "num_input_tokens_seen": 6413792, + "step": 30400 + }, + { + "epoch": 3.344884488448845, + "grad_norm": 0.8393880724906921, + "learning_rate": 4.931456060359753e-05, + "loss": 0.2001, + "num_input_tokens_seen": 6414880, + "step": 30405 + }, + { + "epoch": 3.3454345434543455, + "grad_norm": 0.19746480882167816, + "learning_rate": 4.931400233554259e-05, + "loss": 0.0394, + "num_input_tokens_seen": 6415936, + "step": 30410 + }, + { + "epoch": 3.3459845984598457, + "grad_norm": 0.6020165681838989, + "learning_rate": 4.931344384339717e-05, + "loss": 0.1082, + "num_input_tokens_seen": 6417024, + "step": 30415 + }, + { + "epoch": 3.3465346534653464, + "grad_norm": 0.6016717553138733, + "learning_rate": 4.93128851271664e-05, + "loss": 0.0635, + "num_input_tokens_seen": 6418080, + "step": 30420 + }, + { + "epoch": 3.347084708470847, + "grad_norm": 0.5879265666007996, + "learning_rate": 4.931232618685543e-05, + "loss": 0.041, + "num_input_tokens_seen": 6419168, + "step": 30425 + }, + { + "epoch": 3.3476347634763477, + "grad_norm": 0.3248685300350189, + "learning_rate": 4.931176702246941e-05, + "loss": 0.0275, + "num_input_tokens_seen": 6420192, + "step": 30430 + }, + { + "epoch": 3.3481848184818483, + "grad_norm": 0.017646322026848793, + "learning_rate": 4.9311207634013515e-05, + "loss": 0.0547, + "num_input_tokens_seen": 6421216, + "step": 30435 + }, + { + "epoch": 3.3487348734873486, + "grad_norm": 0.025104397907853127, + "learning_rate": 4.931064802149288e-05, + "loss": 0.0566, + "num_input_tokens_seen": 6422272, + "step": 30440 + }, + { + "epoch": 3.349284928492849, + "grad_norm": 0.026558298617601395, + "learning_rate": 4.931008818491266e-05, + "loss": 0.0374, + "num_input_tokens_seen": 6423296, + "step": 30445 + }, + { + "epoch": 3.34983498349835, + "grad_norm": 0.3778541088104248, + "learning_rate": 4.9309528124278025e-05, + "loss": 0.0589, + "num_input_tokens_seen": 6424352, + "step": 30450 + }, + { + "epoch": 3.3503850385038505, + "grad_norm": 0.07454104721546173, + "learning_rate": 4.9308967839594136e-05, + "loss": 0.0344, + "num_input_tokens_seen": 6425344, + "step": 30455 + }, + { + "epoch": 3.350935093509351, + "grad_norm": 0.0823221281170845, + "learning_rate": 4.9308407330866156e-05, + "loss": 0.0475, + "num_input_tokens_seen": 6426464, + "step": 30460 + }, + { + "epoch": 3.3514851485148514, + "grad_norm": 0.45505356788635254, + "learning_rate": 4.930784659809925e-05, + "loss": 0.0697, + "num_input_tokens_seen": 6427488, + "step": 30465 + }, + { + "epoch": 3.352035203520352, + "grad_norm": 0.06342148035764694, + "learning_rate": 4.9307285641298585e-05, + "loss": 0.0444, + "num_input_tokens_seen": 6428512, + "step": 30470 + }, + { + "epoch": 3.3525852585258527, + "grad_norm": 0.2638693153858185, + "learning_rate": 4.9306724460469335e-05, + "loss": 0.0423, + "num_input_tokens_seen": 6429536, + "step": 30475 + }, + { + "epoch": 3.3531353135313533, + "grad_norm": 0.48958662152290344, + "learning_rate": 4.930616305561667e-05, + "loss": 0.0185, + "num_input_tokens_seen": 6430656, + "step": 30480 + }, + { + "epoch": 3.3536853685368535, + "grad_norm": 0.3596709072589874, + "learning_rate": 4.930560142674576e-05, + "loss": 0.0364, + "num_input_tokens_seen": 6431712, + "step": 30485 + }, + { + "epoch": 3.354235423542354, + "grad_norm": 1.7063084840774536, + "learning_rate": 4.9305039573861776e-05, + "loss": 0.1195, + "num_input_tokens_seen": 6432768, + "step": 30490 + }, + { + "epoch": 3.354785478547855, + "grad_norm": 0.030184004455804825, + "learning_rate": 4.9304477496969916e-05, + "loss": 0.038, + "num_input_tokens_seen": 6433856, + "step": 30495 + }, + { + "epoch": 3.3553355335533555, + "grad_norm": 1.060636043548584, + "learning_rate": 4.930391519607534e-05, + "loss": 0.0688, + "num_input_tokens_seen": 6434976, + "step": 30500 + }, + { + "epoch": 3.3558855885588557, + "grad_norm": 0.029847798869013786, + "learning_rate": 4.930335267118326e-05, + "loss": 0.0517, + "num_input_tokens_seen": 6436000, + "step": 30505 + }, + { + "epoch": 3.3564356435643563, + "grad_norm": 0.2291102111339569, + "learning_rate": 4.930278992229882e-05, + "loss": 0.0326, + "num_input_tokens_seen": 6437088, + "step": 30510 + }, + { + "epoch": 3.356985698569857, + "grad_norm": 0.5633700489997864, + "learning_rate": 4.930222694942723e-05, + "loss": 0.0381, + "num_input_tokens_seen": 6438080, + "step": 30515 + }, + { + "epoch": 3.3575357535753576, + "grad_norm": 0.01598775014281273, + "learning_rate": 4.9301663752573684e-05, + "loss": 0.023, + "num_input_tokens_seen": 6439168, + "step": 30520 + }, + { + "epoch": 3.3580858085808583, + "grad_norm": 0.032628919929265976, + "learning_rate": 4.930110033174336e-05, + "loss": 0.0263, + "num_input_tokens_seen": 6440160, + "step": 30525 + }, + { + "epoch": 3.3586358635863585, + "grad_norm": 0.049491994082927704, + "learning_rate": 4.9300536686941455e-05, + "loss": 0.0917, + "num_input_tokens_seen": 6441280, + "step": 30530 + }, + { + "epoch": 3.359185918591859, + "grad_norm": 0.09039817750453949, + "learning_rate": 4.929997281817317e-05, + "loss": 0.0894, + "num_input_tokens_seen": 6442368, + "step": 30535 + }, + { + "epoch": 3.3597359735973598, + "grad_norm": 0.24998049437999725, + "learning_rate": 4.9299408725443685e-05, + "loss": 0.1072, + "num_input_tokens_seen": 6443456, + "step": 30540 + }, + { + "epoch": 3.3602860286028604, + "grad_norm": 0.20666959881782532, + "learning_rate": 4.9298844408758215e-05, + "loss": 0.0637, + "num_input_tokens_seen": 6444480, + "step": 30545 + }, + { + "epoch": 3.360836083608361, + "grad_norm": 1.5905941724777222, + "learning_rate": 4.929827986812196e-05, + "loss": 0.0611, + "num_input_tokens_seen": 6445568, + "step": 30550 + }, + { + "epoch": 3.3613861386138613, + "grad_norm": 1.2101167440414429, + "learning_rate": 4.929771510354011e-05, + "loss": 0.1402, + "num_input_tokens_seen": 6446656, + "step": 30555 + }, + { + "epoch": 3.361936193619362, + "grad_norm": 0.2603108882904053, + "learning_rate": 4.9297150115017875e-05, + "loss": 0.0357, + "num_input_tokens_seen": 6447712, + "step": 30560 + }, + { + "epoch": 3.3624862486248626, + "grad_norm": 1.283022165298462, + "learning_rate": 4.929658490256048e-05, + "loss": 0.1019, + "num_input_tokens_seen": 6448736, + "step": 30565 + }, + { + "epoch": 3.363036303630363, + "grad_norm": 0.05780898034572601, + "learning_rate": 4.9296019466173116e-05, + "loss": 0.1263, + "num_input_tokens_seen": 6449760, + "step": 30570 + }, + { + "epoch": 3.3635863586358634, + "grad_norm": 0.05686675012111664, + "learning_rate": 4.9295453805860994e-05, + "loss": 0.0738, + "num_input_tokens_seen": 6450816, + "step": 30575 + }, + { + "epoch": 3.364136413641364, + "grad_norm": 0.14623741805553436, + "learning_rate": 4.929488792162934e-05, + "loss": 0.0388, + "num_input_tokens_seen": 6451904, + "step": 30580 + }, + { + "epoch": 3.3646864686468647, + "grad_norm": 0.07679769396781921, + "learning_rate": 4.929432181348336e-05, + "loss": 0.0651, + "num_input_tokens_seen": 6452960, + "step": 30585 + }, + { + "epoch": 3.3652365236523654, + "grad_norm": 0.9056196808815002, + "learning_rate": 4.929375548142827e-05, + "loss": 0.0242, + "num_input_tokens_seen": 6454080, + "step": 30590 + }, + { + "epoch": 3.3657865786578656, + "grad_norm": 0.242056205868721, + "learning_rate": 4.9293188925469294e-05, + "loss": 0.0258, + "num_input_tokens_seen": 6455168, + "step": 30595 + }, + { + "epoch": 3.366336633663366, + "grad_norm": 0.1376166194677353, + "learning_rate": 4.929262214561165e-05, + "loss": 0.0634, + "num_input_tokens_seen": 6456160, + "step": 30600 + }, + { + "epoch": 3.366886688668867, + "grad_norm": 0.06995498389005661, + "learning_rate": 4.929205514186057e-05, + "loss": 0.0486, + "num_input_tokens_seen": 6457216, + "step": 30605 + }, + { + "epoch": 3.3674367436743675, + "grad_norm": 1.5246094465255737, + "learning_rate": 4.9291487914221266e-05, + "loss": 0.0797, + "num_input_tokens_seen": 6458240, + "step": 30610 + }, + { + "epoch": 3.367986798679868, + "grad_norm": 0.902762770652771, + "learning_rate": 4.9290920462698985e-05, + "loss": 0.208, + "num_input_tokens_seen": 6459296, + "step": 30615 + }, + { + "epoch": 3.3685368536853684, + "grad_norm": 0.4907777011394501, + "learning_rate": 4.929035278729894e-05, + "loss": 0.1496, + "num_input_tokens_seen": 6460384, + "step": 30620 + }, + { + "epoch": 3.369086908690869, + "grad_norm": 0.527398943901062, + "learning_rate": 4.928978488802637e-05, + "loss": 0.1159, + "num_input_tokens_seen": 6461408, + "step": 30625 + }, + { + "epoch": 3.3696369636963697, + "grad_norm": 0.3275330066680908, + "learning_rate": 4.9289216764886506e-05, + "loss": 0.0895, + "num_input_tokens_seen": 6462432, + "step": 30630 + }, + { + "epoch": 3.3701870187018703, + "grad_norm": 0.6328462958335876, + "learning_rate": 4.9288648417884584e-05, + "loss": 0.0511, + "num_input_tokens_seen": 6463424, + "step": 30635 + }, + { + "epoch": 3.370737073707371, + "grad_norm": 1.0571274757385254, + "learning_rate": 4.9288079847025855e-05, + "loss": 0.094, + "num_input_tokens_seen": 6464512, + "step": 30640 + }, + { + "epoch": 3.371287128712871, + "grad_norm": 0.29491567611694336, + "learning_rate": 4.9287511052315535e-05, + "loss": 0.0832, + "num_input_tokens_seen": 6465504, + "step": 30645 + }, + { + "epoch": 3.371837183718372, + "grad_norm": 0.10427217930555344, + "learning_rate": 4.928694203375889e-05, + "loss": 0.1089, + "num_input_tokens_seen": 6466592, + "step": 30650 + }, + { + "epoch": 3.3723872387238725, + "grad_norm": 0.09729436784982681, + "learning_rate": 4.928637279136115e-05, + "loss": 0.0191, + "num_input_tokens_seen": 6467648, + "step": 30655 + }, + { + "epoch": 3.372937293729373, + "grad_norm": 0.24124060571193695, + "learning_rate": 4.928580332512757e-05, + "loss": 0.111, + "num_input_tokens_seen": 6468704, + "step": 30660 + }, + { + "epoch": 3.3734873487348733, + "grad_norm": 0.42968565225601196, + "learning_rate": 4.928523363506339e-05, + "loss": 0.0572, + "num_input_tokens_seen": 6469760, + "step": 30665 + }, + { + "epoch": 3.374037403740374, + "grad_norm": 0.5165342092514038, + "learning_rate": 4.9284663721173865e-05, + "loss": 0.0514, + "num_input_tokens_seen": 6470848, + "step": 30670 + }, + { + "epoch": 3.3745874587458746, + "grad_norm": 1.024749517440796, + "learning_rate": 4.928409358346425e-05, + "loss": 0.0894, + "num_input_tokens_seen": 6471936, + "step": 30675 + }, + { + "epoch": 3.3751375137513753, + "grad_norm": 0.029011109843850136, + "learning_rate": 4.9283523221939796e-05, + "loss": 0.0868, + "num_input_tokens_seen": 6472992, + "step": 30680 + }, + { + "epoch": 3.3756875687568755, + "grad_norm": 0.7063316106796265, + "learning_rate": 4.928295263660576e-05, + "loss": 0.0484, + "num_input_tokens_seen": 6474048, + "step": 30685 + }, + { + "epoch": 3.376237623762376, + "grad_norm": 0.5387053489685059, + "learning_rate": 4.9282381827467397e-05, + "loss": 0.0853, + "num_input_tokens_seen": 6475200, + "step": 30690 + }, + { + "epoch": 3.3767876787678768, + "grad_norm": 0.4525936543941498, + "learning_rate": 4.928181079452998e-05, + "loss": 0.0668, + "num_input_tokens_seen": 6476320, + "step": 30695 + }, + { + "epoch": 3.3773377337733774, + "grad_norm": 0.5667585134506226, + "learning_rate": 4.928123953779876e-05, + "loss": 0.0773, + "num_input_tokens_seen": 6477344, + "step": 30700 + }, + { + "epoch": 3.377887788778878, + "grad_norm": 0.362237811088562, + "learning_rate": 4.9280668057279014e-05, + "loss": 0.0592, + "num_input_tokens_seen": 6478368, + "step": 30705 + }, + { + "epoch": 3.3784378437843783, + "grad_norm": 0.0993577167391777, + "learning_rate": 4.928009635297599e-05, + "loss": 0.0699, + "num_input_tokens_seen": 6479392, + "step": 30710 + }, + { + "epoch": 3.378987898789879, + "grad_norm": 0.18476635217666626, + "learning_rate": 4.9279524424894976e-05, + "loss": 0.082, + "num_input_tokens_seen": 6480448, + "step": 30715 + }, + { + "epoch": 3.3795379537953796, + "grad_norm": 0.2542438209056854, + "learning_rate": 4.9278952273041236e-05, + "loss": 0.02, + "num_input_tokens_seen": 6481504, + "step": 30720 + }, + { + "epoch": 3.3800880088008802, + "grad_norm": 0.018556220456957817, + "learning_rate": 4.927837989742004e-05, + "loss": 0.0459, + "num_input_tokens_seen": 6482624, + "step": 30725 + }, + { + "epoch": 3.380638063806381, + "grad_norm": 0.07920154184103012, + "learning_rate": 4.927780729803667e-05, + "loss": 0.0901, + "num_input_tokens_seen": 6483712, + "step": 30730 + }, + { + "epoch": 3.381188118811881, + "grad_norm": 1.3950812816619873, + "learning_rate": 4.92772344748964e-05, + "loss": 0.1331, + "num_input_tokens_seen": 6484832, + "step": 30735 + }, + { + "epoch": 3.3817381738173817, + "grad_norm": 0.03178144618868828, + "learning_rate": 4.9276661428004503e-05, + "loss": 0.1442, + "num_input_tokens_seen": 6485920, + "step": 30740 + }, + { + "epoch": 3.3822882288228824, + "grad_norm": 0.06720077991485596, + "learning_rate": 4.927608815736627e-05, + "loss": 0.036, + "num_input_tokens_seen": 6487008, + "step": 30745 + }, + { + "epoch": 3.382838283828383, + "grad_norm": 0.11306507885456085, + "learning_rate": 4.927551466298698e-05, + "loss": 0.0087, + "num_input_tokens_seen": 6488032, + "step": 30750 + }, + { + "epoch": 3.3833883388338832, + "grad_norm": 0.3526788353919983, + "learning_rate": 4.927494094487192e-05, + "loss": 0.0697, + "num_input_tokens_seen": 6489120, + "step": 30755 + }, + { + "epoch": 3.383938393839384, + "grad_norm": 0.24165064096450806, + "learning_rate": 4.927436700302638e-05, + "loss": 0.0419, + "num_input_tokens_seen": 6490240, + "step": 30760 + }, + { + "epoch": 3.3844884488448845, + "grad_norm": 0.8853172063827515, + "learning_rate": 4.927379283745565e-05, + "loss": 0.0386, + "num_input_tokens_seen": 6491328, + "step": 30765 + }, + { + "epoch": 3.385038503850385, + "grad_norm": 0.316308856010437, + "learning_rate": 4.9273218448165014e-05, + "loss": 0.0121, + "num_input_tokens_seen": 6492320, + "step": 30770 + }, + { + "epoch": 3.3855885588558854, + "grad_norm": 0.6745229959487915, + "learning_rate": 4.927264383515977e-05, + "loss": 0.0655, + "num_input_tokens_seen": 6493408, + "step": 30775 + }, + { + "epoch": 3.386138613861386, + "grad_norm": 1.0500136613845825, + "learning_rate": 4.927206899844522e-05, + "loss": 0.0795, + "num_input_tokens_seen": 6494464, + "step": 30780 + }, + { + "epoch": 3.3866886688668867, + "grad_norm": 0.029948094859719276, + "learning_rate": 4.927149393802665e-05, + "loss": 0.0226, + "num_input_tokens_seen": 6495520, + "step": 30785 + }, + { + "epoch": 3.3872387238723873, + "grad_norm": 0.059538375586271286, + "learning_rate": 4.927091865390937e-05, + "loss": 0.0156, + "num_input_tokens_seen": 6496544, + "step": 30790 + }, + { + "epoch": 3.387788778877888, + "grad_norm": 0.4290587306022644, + "learning_rate": 4.927034314609869e-05, + "loss": 0.0773, + "num_input_tokens_seen": 6497568, + "step": 30795 + }, + { + "epoch": 3.388338833883388, + "grad_norm": 0.479765921831131, + "learning_rate": 4.926976741459989e-05, + "loss": 0.0643, + "num_input_tokens_seen": 6498592, + "step": 30800 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.06915892660617828, + "learning_rate": 4.926919145941829e-05, + "loss": 0.1055, + "num_input_tokens_seen": 6499584, + "step": 30805 + }, + { + "epoch": 3.3894389438943895, + "grad_norm": 0.19158516824245453, + "learning_rate": 4.9268615280559205e-05, + "loss": 0.0746, + "num_input_tokens_seen": 6500640, + "step": 30810 + }, + { + "epoch": 3.38998899889989, + "grad_norm": 0.7366333603858948, + "learning_rate": 4.9268038878027937e-05, + "loss": 0.1679, + "num_input_tokens_seen": 6501728, + "step": 30815 + }, + { + "epoch": 3.390539053905391, + "grad_norm": 0.7672715187072754, + "learning_rate": 4.9267462251829785e-05, + "loss": 0.0954, + "num_input_tokens_seen": 6502720, + "step": 30820 + }, + { + "epoch": 3.391089108910891, + "grad_norm": 0.13747768104076385, + "learning_rate": 4.9266885401970097e-05, + "loss": 0.0279, + "num_input_tokens_seen": 6503808, + "step": 30825 + }, + { + "epoch": 3.3916391639163916, + "grad_norm": 0.4532677233219147, + "learning_rate": 4.926630832845416e-05, + "loss": 0.075, + "num_input_tokens_seen": 6504832, + "step": 30830 + }, + { + "epoch": 3.3921892189218923, + "grad_norm": 1.0077565908432007, + "learning_rate": 4.926573103128731e-05, + "loss": 0.1026, + "num_input_tokens_seen": 6505856, + "step": 30835 + }, + { + "epoch": 3.3927392739273925, + "grad_norm": 0.7901499271392822, + "learning_rate": 4.926515351047486e-05, + "loss": 0.0624, + "num_input_tokens_seen": 6506912, + "step": 30840 + }, + { + "epoch": 3.393289328932893, + "grad_norm": 0.15015731751918793, + "learning_rate": 4.926457576602213e-05, + "loss": 0.1057, + "num_input_tokens_seen": 6507936, + "step": 30845 + }, + { + "epoch": 3.393839383938394, + "grad_norm": 1.1546353101730347, + "learning_rate": 4.9263997797934445e-05, + "loss": 0.0758, + "num_input_tokens_seen": 6509024, + "step": 30850 + }, + { + "epoch": 3.3943894389438944, + "grad_norm": 0.050266336649656296, + "learning_rate": 4.926341960621714e-05, + "loss": 0.0531, + "num_input_tokens_seen": 6510080, + "step": 30855 + }, + { + "epoch": 3.394939493949395, + "grad_norm": 0.6061245203018188, + "learning_rate": 4.9262841190875545e-05, + "loss": 0.0305, + "num_input_tokens_seen": 6511104, + "step": 30860 + }, + { + "epoch": 3.3954895489548953, + "grad_norm": 0.019079383462667465, + "learning_rate": 4.926226255191497e-05, + "loss": 0.0176, + "num_input_tokens_seen": 6512064, + "step": 30865 + }, + { + "epoch": 3.396039603960396, + "grad_norm": 0.28524693846702576, + "learning_rate": 4.926168368934078e-05, + "loss": 0.0535, + "num_input_tokens_seen": 6513152, + "step": 30870 + }, + { + "epoch": 3.3965896589658966, + "grad_norm": 0.057310234755277634, + "learning_rate": 4.926110460315828e-05, + "loss": 0.0866, + "num_input_tokens_seen": 6514240, + "step": 30875 + }, + { + "epoch": 3.3971397139713972, + "grad_norm": 1.153663158416748, + "learning_rate": 4.9260525293372825e-05, + "loss": 0.1657, + "num_input_tokens_seen": 6515296, + "step": 30880 + }, + { + "epoch": 3.397689768976898, + "grad_norm": 0.17672070860862732, + "learning_rate": 4.925994575998976e-05, + "loss": 0.0406, + "num_input_tokens_seen": 6516352, + "step": 30885 + }, + { + "epoch": 3.398239823982398, + "grad_norm": 0.13272185623645782, + "learning_rate": 4.92593660030144e-05, + "loss": 0.0383, + "num_input_tokens_seen": 6517408, + "step": 30890 + }, + { + "epoch": 3.3987898789878987, + "grad_norm": 1.5419552326202393, + "learning_rate": 4.925878602245211e-05, + "loss": 0.1194, + "num_input_tokens_seen": 6518400, + "step": 30895 + }, + { + "epoch": 3.3993399339933994, + "grad_norm": 1.1419206857681274, + "learning_rate": 4.925820581830823e-05, + "loss": 0.0922, + "num_input_tokens_seen": 6519488, + "step": 30900 + }, + { + "epoch": 3.3998899889989, + "grad_norm": 0.11567455530166626, + "learning_rate": 4.925762539058811e-05, + "loss": 0.0656, + "num_input_tokens_seen": 6520544, + "step": 30905 + }, + { + "epoch": 3.4004400440044003, + "grad_norm": 0.030258994549512863, + "learning_rate": 4.92570447392971e-05, + "loss": 0.0434, + "num_input_tokens_seen": 6521600, + "step": 30910 + }, + { + "epoch": 3.400990099009901, + "grad_norm": 0.3803362548351288, + "learning_rate": 4.9256463864440535e-05, + "loss": 0.0688, + "num_input_tokens_seen": 6522592, + "step": 30915 + }, + { + "epoch": 3.4015401540154016, + "grad_norm": 0.10619843006134033, + "learning_rate": 4.9255882766023795e-05, + "loss": 0.0799, + "num_input_tokens_seen": 6523584, + "step": 30920 + }, + { + "epoch": 3.402090209020902, + "grad_norm": 0.10121241956949234, + "learning_rate": 4.925530144405222e-05, + "loss": 0.0572, + "num_input_tokens_seen": 6524576, + "step": 30925 + }, + { + "epoch": 3.4026402640264024, + "grad_norm": 0.2880103886127472, + "learning_rate": 4.925471989853117e-05, + "loss": 0.0513, + "num_input_tokens_seen": 6525696, + "step": 30930 + }, + { + "epoch": 3.403190319031903, + "grad_norm": 0.03656885772943497, + "learning_rate": 4.9254138129466e-05, + "loss": 0.0498, + "num_input_tokens_seen": 6526784, + "step": 30935 + }, + { + "epoch": 3.4037403740374037, + "grad_norm": 0.07172273844480515, + "learning_rate": 4.925355613686208e-05, + "loss": 0.0323, + "num_input_tokens_seen": 6527904, + "step": 30940 + }, + { + "epoch": 3.4042904290429044, + "grad_norm": 0.022593844681978226, + "learning_rate": 4.925297392072478e-05, + "loss": 0.0103, + "num_input_tokens_seen": 6528928, + "step": 30945 + }, + { + "epoch": 3.404840484048405, + "grad_norm": 0.09411750733852386, + "learning_rate": 4.9252391481059445e-05, + "loss": 0.1354, + "num_input_tokens_seen": 6529984, + "step": 30950 + }, + { + "epoch": 3.405390539053905, + "grad_norm": 0.368685245513916, + "learning_rate": 4.9251808817871456e-05, + "loss": 0.0777, + "num_input_tokens_seen": 6531072, + "step": 30955 + }, + { + "epoch": 3.405940594059406, + "grad_norm": 0.261398047208786, + "learning_rate": 4.9251225931166184e-05, + "loss": 0.0478, + "num_input_tokens_seen": 6532128, + "step": 30960 + }, + { + "epoch": 3.4064906490649065, + "grad_norm": 1.4272130727767944, + "learning_rate": 4.9250642820949e-05, + "loss": 0.0956, + "num_input_tokens_seen": 6533152, + "step": 30965 + }, + { + "epoch": 3.407040704070407, + "grad_norm": 0.14667634665966034, + "learning_rate": 4.9250059487225283e-05, + "loss": 0.0212, + "num_input_tokens_seen": 6534208, + "step": 30970 + }, + { + "epoch": 3.407590759075908, + "grad_norm": 0.2080196738243103, + "learning_rate": 4.92494759300004e-05, + "loss": 0.0697, + "num_input_tokens_seen": 6535232, + "step": 30975 + }, + { + "epoch": 3.408140814081408, + "grad_norm": 0.11718335747718811, + "learning_rate": 4.924889214927973e-05, + "loss": 0.0716, + "num_input_tokens_seen": 6536288, + "step": 30980 + }, + { + "epoch": 3.4086908690869087, + "grad_norm": 0.2626570463180542, + "learning_rate": 4.924830814506866e-05, + "loss": 0.0868, + "num_input_tokens_seen": 6537344, + "step": 30985 + }, + { + "epoch": 3.4092409240924093, + "grad_norm": 0.7406039834022522, + "learning_rate": 4.9247723917372565e-05, + "loss": 0.1135, + "num_input_tokens_seen": 6538400, + "step": 30990 + }, + { + "epoch": 3.40979097909791, + "grad_norm": 0.02833172120153904, + "learning_rate": 4.924713946619684e-05, + "loss": 0.0886, + "num_input_tokens_seen": 6539488, + "step": 30995 + }, + { + "epoch": 3.41034103410341, + "grad_norm": 0.1451382339000702, + "learning_rate": 4.924655479154686e-05, + "loss": 0.0153, + "num_input_tokens_seen": 6540544, + "step": 31000 + }, + { + "epoch": 3.410891089108911, + "grad_norm": 0.20018166303634644, + "learning_rate": 4.924596989342802e-05, + "loss": 0.0506, + "num_input_tokens_seen": 6541632, + "step": 31005 + }, + { + "epoch": 3.4114411441144115, + "grad_norm": 1.1332203149795532, + "learning_rate": 4.924538477184571e-05, + "loss": 0.1286, + "num_input_tokens_seen": 6542656, + "step": 31010 + }, + { + "epoch": 3.411991199119912, + "grad_norm": 0.2733840048313141, + "learning_rate": 4.924479942680533e-05, + "loss": 0.0494, + "num_input_tokens_seen": 6543744, + "step": 31015 + }, + { + "epoch": 3.4125412541254123, + "grad_norm": 0.9893098473548889, + "learning_rate": 4.924421385831227e-05, + "loss": 0.0604, + "num_input_tokens_seen": 6544736, + "step": 31020 + }, + { + "epoch": 3.413091309130913, + "grad_norm": 1.0194333791732788, + "learning_rate": 4.924362806637191e-05, + "loss": 0.0593, + "num_input_tokens_seen": 6545760, + "step": 31025 + }, + { + "epoch": 3.4136413641364136, + "grad_norm": 0.03783312812447548, + "learning_rate": 4.924304205098967e-05, + "loss": 0.0408, + "num_input_tokens_seen": 6546880, + "step": 31030 + }, + { + "epoch": 3.4141914191419143, + "grad_norm": 1.097248911857605, + "learning_rate": 4.924245581217095e-05, + "loss": 0.0691, + "num_input_tokens_seen": 6547968, + "step": 31035 + }, + { + "epoch": 3.414741474147415, + "grad_norm": 0.5460391640663147, + "learning_rate": 4.924186934992114e-05, + "loss": 0.0352, + "num_input_tokens_seen": 6549024, + "step": 31040 + }, + { + "epoch": 3.415291529152915, + "grad_norm": 0.043010346591472626, + "learning_rate": 4.924128266424565e-05, + "loss": 0.0192, + "num_input_tokens_seen": 6550016, + "step": 31045 + }, + { + "epoch": 3.4158415841584158, + "grad_norm": 0.12088636308908463, + "learning_rate": 4.9240695755149894e-05, + "loss": 0.039, + "num_input_tokens_seen": 6551072, + "step": 31050 + }, + { + "epoch": 3.4163916391639164, + "grad_norm": 0.7546489834785461, + "learning_rate": 4.924010862263928e-05, + "loss": 0.0354, + "num_input_tokens_seen": 6552064, + "step": 31055 + }, + { + "epoch": 3.416941694169417, + "grad_norm": 0.10369981080293655, + "learning_rate": 4.9239521266719216e-05, + "loss": 0.0169, + "num_input_tokens_seen": 6553120, + "step": 31060 + }, + { + "epoch": 3.4174917491749177, + "grad_norm": 0.7022213935852051, + "learning_rate": 4.923893368739511e-05, + "loss": 0.0674, + "num_input_tokens_seen": 6554208, + "step": 31065 + }, + { + "epoch": 3.418041804180418, + "grad_norm": 0.26982739567756653, + "learning_rate": 4.923834588467239e-05, + "loss": 0.0181, + "num_input_tokens_seen": 6555264, + "step": 31070 + }, + { + "epoch": 3.4185918591859186, + "grad_norm": 0.2155054211616516, + "learning_rate": 4.923775785855646e-05, + "loss": 0.0351, + "num_input_tokens_seen": 6556320, + "step": 31075 + }, + { + "epoch": 3.419141914191419, + "grad_norm": 0.24639172852039337, + "learning_rate": 4.9237169609052744e-05, + "loss": 0.0501, + "num_input_tokens_seen": 6557344, + "step": 31080 + }, + { + "epoch": 3.41969196919692, + "grad_norm": 0.045261844992637634, + "learning_rate": 4.9236581136166676e-05, + "loss": 0.0118, + "num_input_tokens_seen": 6558368, + "step": 31085 + }, + { + "epoch": 3.42024202420242, + "grad_norm": 0.026277052238583565, + "learning_rate": 4.923599243990367e-05, + "loss": 0.0234, + "num_input_tokens_seen": 6559456, + "step": 31090 + }, + { + "epoch": 3.4207920792079207, + "grad_norm": 0.4666725993156433, + "learning_rate": 4.923540352026914e-05, + "loss": 0.0943, + "num_input_tokens_seen": 6560544, + "step": 31095 + }, + { + "epoch": 3.4213421342134214, + "grad_norm": 0.06928811222314835, + "learning_rate": 4.923481437726853e-05, + "loss": 0.0813, + "num_input_tokens_seen": 6561632, + "step": 31100 + }, + { + "epoch": 3.421892189218922, + "grad_norm": 0.2660559117794037, + "learning_rate": 4.923422501090726e-05, + "loss": 0.0311, + "num_input_tokens_seen": 6562720, + "step": 31105 + }, + { + "epoch": 3.4224422442244222, + "grad_norm": 0.30661535263061523, + "learning_rate": 4.9233635421190784e-05, + "loss": 0.1382, + "num_input_tokens_seen": 6563808, + "step": 31110 + }, + { + "epoch": 3.422992299229923, + "grad_norm": 0.20519113540649414, + "learning_rate": 4.92330456081245e-05, + "loss": 0.075, + "num_input_tokens_seen": 6564928, + "step": 31115 + }, + { + "epoch": 3.4235423542354235, + "grad_norm": 0.07719280570745468, + "learning_rate": 4.923245557171388e-05, + "loss": 0.0359, + "num_input_tokens_seen": 6565984, + "step": 31120 + }, + { + "epoch": 3.424092409240924, + "grad_norm": 0.15196345746517181, + "learning_rate": 4.923186531196433e-05, + "loss": 0.0846, + "num_input_tokens_seen": 6567040, + "step": 31125 + }, + { + "epoch": 3.424642464246425, + "grad_norm": 0.3731841742992401, + "learning_rate": 4.9231274828881315e-05, + "loss": 0.125, + "num_input_tokens_seen": 6568096, + "step": 31130 + }, + { + "epoch": 3.425192519251925, + "grad_norm": 0.17151151597499847, + "learning_rate": 4.923068412247026e-05, + "loss": 0.0717, + "num_input_tokens_seen": 6569088, + "step": 31135 + }, + { + "epoch": 3.4257425742574257, + "grad_norm": 0.1868213415145874, + "learning_rate": 4.923009319273662e-05, + "loss": 0.2156, + "num_input_tokens_seen": 6570208, + "step": 31140 + }, + { + "epoch": 3.4262926292629263, + "grad_norm": 0.5958669185638428, + "learning_rate": 4.922950203968583e-05, + "loss": 0.0628, + "num_input_tokens_seen": 6571264, + "step": 31145 + }, + { + "epoch": 3.426842684268427, + "grad_norm": 0.04847276955842972, + "learning_rate": 4.922891066332336e-05, + "loss": 0.0695, + "num_input_tokens_seen": 6572320, + "step": 31150 + }, + { + "epoch": 3.4273927392739276, + "grad_norm": 0.07883863896131516, + "learning_rate": 4.922831906365463e-05, + "loss": 0.0258, + "num_input_tokens_seen": 6573344, + "step": 31155 + }, + { + "epoch": 3.427942794279428, + "grad_norm": 0.060497816652059555, + "learning_rate": 4.922772724068512e-05, + "loss": 0.0258, + "num_input_tokens_seen": 6574432, + "step": 31160 + }, + { + "epoch": 3.4284928492849285, + "grad_norm": 0.038012661039829254, + "learning_rate": 4.922713519442027e-05, + "loss": 0.066, + "num_input_tokens_seen": 6575488, + "step": 31165 + }, + { + "epoch": 3.429042904290429, + "grad_norm": 0.020941555500030518, + "learning_rate": 4.9226542924865546e-05, + "loss": 0.0072, + "num_input_tokens_seen": 6576576, + "step": 31170 + }, + { + "epoch": 3.4295929592959298, + "grad_norm": 0.07381405681371689, + "learning_rate": 4.92259504320264e-05, + "loss": 0.0373, + "num_input_tokens_seen": 6577632, + "step": 31175 + }, + { + "epoch": 3.43014301430143, + "grad_norm": 1.6684783697128296, + "learning_rate": 4.922535771590829e-05, + "loss": 0.111, + "num_input_tokens_seen": 6578720, + "step": 31180 + }, + { + "epoch": 3.4306930693069306, + "grad_norm": 0.03181789442896843, + "learning_rate": 4.922476477651668e-05, + "loss": 0.0435, + "num_input_tokens_seen": 6579776, + "step": 31185 + }, + { + "epoch": 3.4312431243124313, + "grad_norm": 0.04369939863681793, + "learning_rate": 4.922417161385704e-05, + "loss": 0.0699, + "num_input_tokens_seen": 6580832, + "step": 31190 + }, + { + "epoch": 3.431793179317932, + "grad_norm": 0.3161245882511139, + "learning_rate": 4.922357822793483e-05, + "loss": 0.0365, + "num_input_tokens_seen": 6581888, + "step": 31195 + }, + { + "epoch": 3.432343234323432, + "grad_norm": 0.27838656306266785, + "learning_rate": 4.922298461875553e-05, + "loss": 0.0656, + "num_input_tokens_seen": 6582976, + "step": 31200 + }, + { + "epoch": 3.432893289328933, + "grad_norm": 0.05952116847038269, + "learning_rate": 4.92223907863246e-05, + "loss": 0.0256, + "num_input_tokens_seen": 6584064, + "step": 31205 + }, + { + "epoch": 3.4334433443344334, + "grad_norm": 0.027320660650730133, + "learning_rate": 4.9221796730647516e-05, + "loss": 0.0387, + "num_input_tokens_seen": 6585184, + "step": 31210 + }, + { + "epoch": 3.433993399339934, + "grad_norm": 0.09530860185623169, + "learning_rate": 4.922120245172977e-05, + "loss": 0.0283, + "num_input_tokens_seen": 6586304, + "step": 31215 + }, + { + "epoch": 3.4345434543454347, + "grad_norm": 1.647099256515503, + "learning_rate": 4.92206079495768e-05, + "loss": 0.1147, + "num_input_tokens_seen": 6587360, + "step": 31220 + }, + { + "epoch": 3.435093509350935, + "grad_norm": 0.32528650760650635, + "learning_rate": 4.9220013224194126e-05, + "loss": 0.0161, + "num_input_tokens_seen": 6588416, + "step": 31225 + }, + { + "epoch": 3.4356435643564356, + "grad_norm": 0.8352306485176086, + "learning_rate": 4.9219418275587205e-05, + "loss": 0.1621, + "num_input_tokens_seen": 6589504, + "step": 31230 + }, + { + "epoch": 3.4361936193619362, + "grad_norm": 0.09266303479671478, + "learning_rate": 4.921882310376152e-05, + "loss": 0.1073, + "num_input_tokens_seen": 6590656, + "step": 31235 + }, + { + "epoch": 3.436743674367437, + "grad_norm": 0.741703987121582, + "learning_rate": 4.9218227708722573e-05, + "loss": 0.1033, + "num_input_tokens_seen": 6591648, + "step": 31240 + }, + { + "epoch": 3.4372937293729375, + "grad_norm": 0.12662571668624878, + "learning_rate": 4.921763209047584e-05, + "loss": 0.0232, + "num_input_tokens_seen": 6592736, + "step": 31245 + }, + { + "epoch": 3.4378437843784377, + "grad_norm": 0.06374476850032806, + "learning_rate": 4.921703624902682e-05, + "loss": 0.0118, + "num_input_tokens_seen": 6593760, + "step": 31250 + }, + { + "epoch": 3.4383938393839384, + "grad_norm": 0.41949963569641113, + "learning_rate": 4.921644018438099e-05, + "loss": 0.0574, + "num_input_tokens_seen": 6594816, + "step": 31255 + }, + { + "epoch": 3.438943894389439, + "grad_norm": 0.11602649837732315, + "learning_rate": 4.921584389654385e-05, + "loss": 0.1075, + "num_input_tokens_seen": 6595840, + "step": 31260 + }, + { + "epoch": 3.4394939493949397, + "grad_norm": 0.09463232755661011, + "learning_rate": 4.92152473855209e-05, + "loss": 0.0962, + "num_input_tokens_seen": 6596864, + "step": 31265 + }, + { + "epoch": 3.44004400440044, + "grad_norm": 1.3112061023712158, + "learning_rate": 4.921465065131763e-05, + "loss": 0.1319, + "num_input_tokens_seen": 6597920, + "step": 31270 + }, + { + "epoch": 3.4405940594059405, + "grad_norm": 0.6943945288658142, + "learning_rate": 4.921405369393955e-05, + "loss": 0.0476, + "num_input_tokens_seen": 6599008, + "step": 31275 + }, + { + "epoch": 3.441144114411441, + "grad_norm": 0.8156154155731201, + "learning_rate": 4.921345651339215e-05, + "loss": 0.0502, + "num_input_tokens_seen": 6600064, + "step": 31280 + }, + { + "epoch": 3.441694169416942, + "grad_norm": 0.22249828279018402, + "learning_rate": 4.921285910968094e-05, + "loss": 0.0352, + "num_input_tokens_seen": 6601120, + "step": 31285 + }, + { + "epoch": 3.442244224422442, + "grad_norm": 0.13251005113124847, + "learning_rate": 4.9212261482811426e-05, + "loss": 0.0811, + "num_input_tokens_seen": 6602208, + "step": 31290 + }, + { + "epoch": 3.4427942794279427, + "grad_norm": 0.13892517983913422, + "learning_rate": 4.921166363278912e-05, + "loss": 0.0425, + "num_input_tokens_seen": 6603264, + "step": 31295 + }, + { + "epoch": 3.4433443344334433, + "grad_norm": 0.04377652704715729, + "learning_rate": 4.921106555961953e-05, + "loss": 0.0378, + "num_input_tokens_seen": 6604320, + "step": 31300 + }, + { + "epoch": 3.443894389438944, + "grad_norm": 0.061124809086322784, + "learning_rate": 4.921046726330817e-05, + "loss": 0.0928, + "num_input_tokens_seen": 6605344, + "step": 31305 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.049483522772789, + "learning_rate": 4.920986874386054e-05, + "loss": 0.1152, + "num_input_tokens_seen": 6606464, + "step": 31310 + }, + { + "epoch": 3.444994499449945, + "grad_norm": 0.049031391739845276, + "learning_rate": 4.920927000128217e-05, + "loss": 0.0533, + "num_input_tokens_seen": 6607552, + "step": 31315 + }, + { + "epoch": 3.4455445544554455, + "grad_norm": 0.09023302793502808, + "learning_rate": 4.920867103557858e-05, + "loss": 0.0217, + "num_input_tokens_seen": 6608576, + "step": 31320 + }, + { + "epoch": 3.446094609460946, + "grad_norm": 0.03917283937335014, + "learning_rate": 4.920807184675528e-05, + "loss": 0.055, + "num_input_tokens_seen": 6609632, + "step": 31325 + }, + { + "epoch": 3.446644664466447, + "grad_norm": 0.18551692366600037, + "learning_rate": 4.92074724348178e-05, + "loss": 0.0426, + "num_input_tokens_seen": 6610752, + "step": 31330 + }, + { + "epoch": 3.4471947194719474, + "grad_norm": 0.6717285513877869, + "learning_rate": 4.920687279977166e-05, + "loss": 0.0448, + "num_input_tokens_seen": 6611776, + "step": 31335 + }, + { + "epoch": 3.4477447744774476, + "grad_norm": 0.5656999349594116, + "learning_rate": 4.920627294162239e-05, + "loss": 0.0694, + "num_input_tokens_seen": 6612864, + "step": 31340 + }, + { + "epoch": 3.4482948294829483, + "grad_norm": 0.01853390969336033, + "learning_rate": 4.920567286037552e-05, + "loss": 0.1182, + "num_input_tokens_seen": 6613952, + "step": 31345 + }, + { + "epoch": 3.448844884488449, + "grad_norm": 0.009470228105783463, + "learning_rate": 4.920507255603658e-05, + "loss": 0.0442, + "num_input_tokens_seen": 6614976, + "step": 31350 + }, + { + "epoch": 3.449394939493949, + "grad_norm": 0.1985747218132019, + "learning_rate": 4.9204472028611096e-05, + "loss": 0.0334, + "num_input_tokens_seen": 6616032, + "step": 31355 + }, + { + "epoch": 3.44994499449945, + "grad_norm": 0.10922878235578537, + "learning_rate": 4.9203871278104605e-05, + "loss": 0.0393, + "num_input_tokens_seen": 6617152, + "step": 31360 + }, + { + "epoch": 3.4504950495049505, + "grad_norm": 0.04339471831917763, + "learning_rate": 4.920327030452265e-05, + "loss": 0.0771, + "num_input_tokens_seen": 6618240, + "step": 31365 + }, + { + "epoch": 3.451045104510451, + "grad_norm": 0.8752534985542297, + "learning_rate": 4.9202669107870775e-05, + "loss": 0.0437, + "num_input_tokens_seen": 6619296, + "step": 31370 + }, + { + "epoch": 3.4515951595159517, + "grad_norm": 0.02112528309226036, + "learning_rate": 4.920206768815451e-05, + "loss": 0.026, + "num_input_tokens_seen": 6620352, + "step": 31375 + }, + { + "epoch": 3.452145214521452, + "grad_norm": 0.07295017689466476, + "learning_rate": 4.920146604537939e-05, + "loss": 0.0225, + "num_input_tokens_seen": 6621408, + "step": 31380 + }, + { + "epoch": 3.4526952695269526, + "grad_norm": 1.1101343631744385, + "learning_rate": 4.9200864179550975e-05, + "loss": 0.0654, + "num_input_tokens_seen": 6622432, + "step": 31385 + }, + { + "epoch": 3.4532453245324533, + "grad_norm": 1.0979422330856323, + "learning_rate": 4.920026209067481e-05, + "loss": 0.0398, + "num_input_tokens_seen": 6623520, + "step": 31390 + }, + { + "epoch": 3.453795379537954, + "grad_norm": 0.44832223653793335, + "learning_rate": 4.919965977875644e-05, + "loss": 0.0533, + "num_input_tokens_seen": 6624512, + "step": 31395 + }, + { + "epoch": 3.4543454345434546, + "grad_norm": 0.16263322532176971, + "learning_rate": 4.9199057243801416e-05, + "loss": 0.0119, + "num_input_tokens_seen": 6625504, + "step": 31400 + }, + { + "epoch": 3.4548954895489548, + "grad_norm": 0.28893277049064636, + "learning_rate": 4.91984544858153e-05, + "loss": 0.0158, + "num_input_tokens_seen": 6626560, + "step": 31405 + }, + { + "epoch": 3.4554455445544554, + "grad_norm": 0.30347108840942383, + "learning_rate": 4.9197851504803635e-05, + "loss": 0.0413, + "num_input_tokens_seen": 6627616, + "step": 31410 + }, + { + "epoch": 3.455995599559956, + "grad_norm": 0.07362000644207001, + "learning_rate": 4.919724830077199e-05, + "loss": 0.0285, + "num_input_tokens_seen": 6628768, + "step": 31415 + }, + { + "epoch": 3.4565456545654567, + "grad_norm": 0.32307079434394836, + "learning_rate": 4.9196644873725906e-05, + "loss": 0.1155, + "num_input_tokens_seen": 6629792, + "step": 31420 + }, + { + "epoch": 3.457095709570957, + "grad_norm": 0.8825664520263672, + "learning_rate": 4.919604122367096e-05, + "loss": 0.1337, + "num_input_tokens_seen": 6630848, + "step": 31425 + }, + { + "epoch": 3.4576457645764576, + "grad_norm": 0.19199346005916595, + "learning_rate": 4.919543735061273e-05, + "loss": 0.1113, + "num_input_tokens_seen": 6631904, + "step": 31430 + }, + { + "epoch": 3.458195819581958, + "grad_norm": 1.0865776538848877, + "learning_rate": 4.919483325455674e-05, + "loss": 0.072, + "num_input_tokens_seen": 6632992, + "step": 31435 + }, + { + "epoch": 3.458745874587459, + "grad_norm": 0.02261139266192913, + "learning_rate": 4.9194228935508595e-05, + "loss": 0.0395, + "num_input_tokens_seen": 6634016, + "step": 31440 + }, + { + "epoch": 3.459295929592959, + "grad_norm": 0.3956356942653656, + "learning_rate": 4.919362439347385e-05, + "loss": 0.0461, + "num_input_tokens_seen": 6635040, + "step": 31445 + }, + { + "epoch": 3.4598459845984597, + "grad_norm": 0.01850615255534649, + "learning_rate": 4.919301962845808e-05, + "loss": 0.0169, + "num_input_tokens_seen": 6636096, + "step": 31450 + }, + { + "epoch": 3.4603960396039604, + "grad_norm": 0.28206685185432434, + "learning_rate": 4.919241464046686e-05, + "loss": 0.0322, + "num_input_tokens_seen": 6637120, + "step": 31455 + }, + { + "epoch": 3.460946094609461, + "grad_norm": 0.19733405113220215, + "learning_rate": 4.919180942950575e-05, + "loss": 0.1329, + "num_input_tokens_seen": 6638144, + "step": 31460 + }, + { + "epoch": 3.4614961496149617, + "grad_norm": 0.19324323534965515, + "learning_rate": 4.9191203995580347e-05, + "loss": 0.0829, + "num_input_tokens_seen": 6639232, + "step": 31465 + }, + { + "epoch": 3.462046204620462, + "grad_norm": 0.1971232146024704, + "learning_rate": 4.9190598338696225e-05, + "loss": 0.0472, + "num_input_tokens_seen": 6640224, + "step": 31470 + }, + { + "epoch": 3.4625962596259625, + "grad_norm": 0.3667930066585541, + "learning_rate": 4.918999245885897e-05, + "loss": 0.0899, + "num_input_tokens_seen": 6641312, + "step": 31475 + }, + { + "epoch": 3.463146314631463, + "grad_norm": 0.05207981914281845, + "learning_rate": 4.9189386356074154e-05, + "loss": 0.1008, + "num_input_tokens_seen": 6642336, + "step": 31480 + }, + { + "epoch": 3.463696369636964, + "grad_norm": 0.4470517039299011, + "learning_rate": 4.918878003034738e-05, + "loss": 0.0626, + "num_input_tokens_seen": 6643424, + "step": 31485 + }, + { + "epoch": 3.4642464246424645, + "grad_norm": 0.16385036706924438, + "learning_rate": 4.9188173481684216e-05, + "loss": 0.0544, + "num_input_tokens_seen": 6644480, + "step": 31490 + }, + { + "epoch": 3.4647964796479647, + "grad_norm": 0.03427400812506676, + "learning_rate": 4.9187566710090275e-05, + "loss": 0.0553, + "num_input_tokens_seen": 6645536, + "step": 31495 + }, + { + "epoch": 3.4653465346534653, + "grad_norm": 1.0597857236862183, + "learning_rate": 4.918695971557113e-05, + "loss": 0.119, + "num_input_tokens_seen": 6646560, + "step": 31500 + }, + { + "epoch": 3.465896589658966, + "grad_norm": 0.045391373336315155, + "learning_rate": 4.9186352498132395e-05, + "loss": 0.0801, + "num_input_tokens_seen": 6647584, + "step": 31505 + }, + { + "epoch": 3.4664466446644666, + "grad_norm": 0.03292318433523178, + "learning_rate": 4.918574505777964e-05, + "loss": 0.0732, + "num_input_tokens_seen": 6648608, + "step": 31510 + }, + { + "epoch": 3.466996699669967, + "grad_norm": 0.22717218101024628, + "learning_rate": 4.918513739451849e-05, + "loss": 0.0346, + "num_input_tokens_seen": 6649632, + "step": 31515 + }, + { + "epoch": 3.4675467546754675, + "grad_norm": 0.914793848991394, + "learning_rate": 4.9184529508354524e-05, + "loss": 0.1201, + "num_input_tokens_seen": 6650720, + "step": 31520 + }, + { + "epoch": 3.468096809680968, + "grad_norm": 0.1389092355966568, + "learning_rate": 4.9183921399293363e-05, + "loss": 0.0304, + "num_input_tokens_seen": 6651776, + "step": 31525 + }, + { + "epoch": 3.4686468646864688, + "grad_norm": 0.7378307580947876, + "learning_rate": 4.91833130673406e-05, + "loss": 0.1203, + "num_input_tokens_seen": 6652832, + "step": 31530 + }, + { + "epoch": 3.469196919691969, + "grad_norm": 0.3129393756389618, + "learning_rate": 4.918270451250184e-05, + "loss": 0.0759, + "num_input_tokens_seen": 6653920, + "step": 31535 + }, + { + "epoch": 3.4697469746974696, + "grad_norm": 0.21359120309352875, + "learning_rate": 4.918209573478271e-05, + "loss": 0.0472, + "num_input_tokens_seen": 6654944, + "step": 31540 + }, + { + "epoch": 3.4702970297029703, + "grad_norm": 0.593985378742218, + "learning_rate": 4.918148673418879e-05, + "loss": 0.0585, + "num_input_tokens_seen": 6655968, + "step": 31545 + }, + { + "epoch": 3.470847084708471, + "grad_norm": 0.02810223028063774, + "learning_rate": 4.918087751072572e-05, + "loss": 0.0287, + "num_input_tokens_seen": 6657024, + "step": 31550 + }, + { + "epoch": 3.4713971397139716, + "grad_norm": 0.12483242899179459, + "learning_rate": 4.91802680643991e-05, + "loss": 0.0128, + "num_input_tokens_seen": 6658112, + "step": 31555 + }, + { + "epoch": 3.4719471947194718, + "grad_norm": 0.22196340560913086, + "learning_rate": 4.917965839521455e-05, + "loss": 0.0251, + "num_input_tokens_seen": 6659168, + "step": 31560 + }, + { + "epoch": 3.4724972497249724, + "grad_norm": 1.5897315740585327, + "learning_rate": 4.91790485031777e-05, + "loss": 0.0928, + "num_input_tokens_seen": 6660320, + "step": 31565 + }, + { + "epoch": 3.473047304730473, + "grad_norm": 0.08347246795892715, + "learning_rate": 4.917843838829416e-05, + "loss": 0.0162, + "num_input_tokens_seen": 6661376, + "step": 31570 + }, + { + "epoch": 3.4735973597359737, + "grad_norm": 1.5546172857284546, + "learning_rate": 4.917782805056955e-05, + "loss": 0.0898, + "num_input_tokens_seen": 6662432, + "step": 31575 + }, + { + "epoch": 3.4741474147414744, + "grad_norm": 0.5099920034408569, + "learning_rate": 4.917721749000951e-05, + "loss": 0.1226, + "num_input_tokens_seen": 6663488, + "step": 31580 + }, + { + "epoch": 3.4746974697469746, + "grad_norm": 0.4180869460105896, + "learning_rate": 4.9176606706619654e-05, + "loss": 0.1098, + "num_input_tokens_seen": 6664544, + "step": 31585 + }, + { + "epoch": 3.4752475247524752, + "grad_norm": 0.17175786197185516, + "learning_rate": 4.9175995700405606e-05, + "loss": 0.0307, + "num_input_tokens_seen": 6665664, + "step": 31590 + }, + { + "epoch": 3.475797579757976, + "grad_norm": 0.22957423329353333, + "learning_rate": 4.917538447137302e-05, + "loss": 0.0559, + "num_input_tokens_seen": 6666720, + "step": 31595 + }, + { + "epoch": 3.4763476347634765, + "grad_norm": 0.013841011561453342, + "learning_rate": 4.9174773019527506e-05, + "loss": 0.0957, + "num_input_tokens_seen": 6667808, + "step": 31600 + }, + { + "epoch": 3.4768976897689767, + "grad_norm": 0.07211373746395111, + "learning_rate": 4.917416134487472e-05, + "loss": 0.0517, + "num_input_tokens_seen": 6668928, + "step": 31605 + }, + { + "epoch": 3.4774477447744774, + "grad_norm": 0.025930043309926987, + "learning_rate": 4.917354944742027e-05, + "loss": 0.067, + "num_input_tokens_seen": 6669984, + "step": 31610 + }, + { + "epoch": 3.477997799779978, + "grad_norm": 0.27273643016815186, + "learning_rate": 4.917293732716983e-05, + "loss": 0.1089, + "num_input_tokens_seen": 6671008, + "step": 31615 + }, + { + "epoch": 3.4785478547854787, + "grad_norm": 0.023363368585705757, + "learning_rate": 4.917232498412903e-05, + "loss": 0.0801, + "num_input_tokens_seen": 6672096, + "step": 31620 + }, + { + "epoch": 3.479097909790979, + "grad_norm": 0.02996460348367691, + "learning_rate": 4.9171712418303496e-05, + "loss": 0.0176, + "num_input_tokens_seen": 6673088, + "step": 31625 + }, + { + "epoch": 3.4796479647964795, + "grad_norm": 1.3467435836791992, + "learning_rate": 4.9171099629698894e-05, + "loss": 0.1817, + "num_input_tokens_seen": 6674112, + "step": 31630 + }, + { + "epoch": 3.48019801980198, + "grad_norm": 0.644514262676239, + "learning_rate": 4.917048661832086e-05, + "loss": 0.0301, + "num_input_tokens_seen": 6675200, + "step": 31635 + }, + { + "epoch": 3.480748074807481, + "grad_norm": 0.6620039343833923, + "learning_rate": 4.916987338417505e-05, + "loss": 0.0183, + "num_input_tokens_seen": 6676256, + "step": 31640 + }, + { + "epoch": 3.4812981298129815, + "grad_norm": 0.29199615120887756, + "learning_rate": 4.916925992726712e-05, + "loss": 0.074, + "num_input_tokens_seen": 6677248, + "step": 31645 + }, + { + "epoch": 3.4818481848184817, + "grad_norm": 0.3799235224723816, + "learning_rate": 4.916864624760272e-05, + "loss": 0.0594, + "num_input_tokens_seen": 6678336, + "step": 31650 + }, + { + "epoch": 3.4823982398239823, + "grad_norm": 0.05351470038294792, + "learning_rate": 4.9168032345187497e-05, + "loss": 0.0237, + "num_input_tokens_seen": 6679392, + "step": 31655 + }, + { + "epoch": 3.482948294829483, + "grad_norm": 0.5651573538780212, + "learning_rate": 4.9167418220027124e-05, + "loss": 0.0359, + "num_input_tokens_seen": 6680448, + "step": 31660 + }, + { + "epoch": 3.4834983498349836, + "grad_norm": 0.36154910922050476, + "learning_rate": 4.9166803872127254e-05, + "loss": 0.0699, + "num_input_tokens_seen": 6681568, + "step": 31665 + }, + { + "epoch": 3.4840484048404843, + "grad_norm": 0.8398702144622803, + "learning_rate": 4.9166189301493544e-05, + "loss": 0.0768, + "num_input_tokens_seen": 6682624, + "step": 31670 + }, + { + "epoch": 3.4845984598459845, + "grad_norm": 0.6803544163703918, + "learning_rate": 4.916557450813166e-05, + "loss": 0.112, + "num_input_tokens_seen": 6683680, + "step": 31675 + }, + { + "epoch": 3.485148514851485, + "grad_norm": 0.11718403548002243, + "learning_rate": 4.916495949204728e-05, + "loss": 0.0366, + "num_input_tokens_seen": 6684736, + "step": 31680 + }, + { + "epoch": 3.485698569856986, + "grad_norm": 0.3114849030971527, + "learning_rate": 4.9164344253246066e-05, + "loss": 0.0715, + "num_input_tokens_seen": 6685856, + "step": 31685 + }, + { + "epoch": 3.4862486248624864, + "grad_norm": 0.8850411772727966, + "learning_rate": 4.916372879173368e-05, + "loss": 0.0427, + "num_input_tokens_seen": 6686976, + "step": 31690 + }, + { + "epoch": 3.4867986798679866, + "grad_norm": 0.29332491755485535, + "learning_rate": 4.9163113107515795e-05, + "loss": 0.0178, + "num_input_tokens_seen": 6688000, + "step": 31695 + }, + { + "epoch": 3.4873487348734873, + "grad_norm": 0.4237574636936188, + "learning_rate": 4.9162497200598104e-05, + "loss": 0.0659, + "num_input_tokens_seen": 6689152, + "step": 31700 + }, + { + "epoch": 3.487898789878988, + "grad_norm": 0.41934505105018616, + "learning_rate": 4.916188107098626e-05, + "loss": 0.1267, + "num_input_tokens_seen": 6690208, + "step": 31705 + }, + { + "epoch": 3.4884488448844886, + "grad_norm": 0.7421970367431641, + "learning_rate": 4.916126471868595e-05, + "loss": 0.1392, + "num_input_tokens_seen": 6691296, + "step": 31710 + }, + { + "epoch": 3.488998899889989, + "grad_norm": 0.7009329199790955, + "learning_rate": 4.916064814370287e-05, + "loss": 0.0554, + "num_input_tokens_seen": 6692352, + "step": 31715 + }, + { + "epoch": 3.4895489548954894, + "grad_norm": 0.4236189126968384, + "learning_rate": 4.916003134604268e-05, + "loss": 0.0578, + "num_input_tokens_seen": 6693408, + "step": 31720 + }, + { + "epoch": 3.49009900990099, + "grad_norm": 0.7513123154640198, + "learning_rate": 4.915941432571108e-05, + "loss": 0.077, + "num_input_tokens_seen": 6694464, + "step": 31725 + }, + { + "epoch": 3.4906490649064907, + "grad_norm": 0.03325505182147026, + "learning_rate": 4.915879708271375e-05, + "loss": 0.0598, + "num_input_tokens_seen": 6695520, + "step": 31730 + }, + { + "epoch": 3.4911991199119914, + "grad_norm": 0.6069605350494385, + "learning_rate": 4.915817961705637e-05, + "loss": 0.0296, + "num_input_tokens_seen": 6696512, + "step": 31735 + }, + { + "epoch": 3.4917491749174916, + "grad_norm": 0.19944997131824493, + "learning_rate": 4.915756192874465e-05, + "loss": 0.0239, + "num_input_tokens_seen": 6697568, + "step": 31740 + }, + { + "epoch": 3.4922992299229922, + "grad_norm": 0.10162569582462311, + "learning_rate": 4.9156944017784274e-05, + "loss": 0.0306, + "num_input_tokens_seen": 6698624, + "step": 31745 + }, + { + "epoch": 3.492849284928493, + "grad_norm": 0.15385572612285614, + "learning_rate": 4.915632588418093e-05, + "loss": 0.0336, + "num_input_tokens_seen": 6699648, + "step": 31750 + }, + { + "epoch": 3.4933993399339935, + "grad_norm": 0.08480949699878693, + "learning_rate": 4.915570752794033e-05, + "loss": 0.06, + "num_input_tokens_seen": 6700736, + "step": 31755 + }, + { + "epoch": 3.493949394939494, + "grad_norm": 0.03447933867573738, + "learning_rate": 4.915508894906816e-05, + "loss": 0.0454, + "num_input_tokens_seen": 6701760, + "step": 31760 + }, + { + "epoch": 3.4944994499449944, + "grad_norm": 0.17923404276371002, + "learning_rate": 4.9154470147570124e-05, + "loss": 0.0152, + "num_input_tokens_seen": 6702848, + "step": 31765 + }, + { + "epoch": 3.495049504950495, + "grad_norm": 0.028993643820285797, + "learning_rate": 4.915385112345193e-05, + "loss": 0.0684, + "num_input_tokens_seen": 6704032, + "step": 31770 + }, + { + "epoch": 3.4955995599559957, + "grad_norm": 0.8368478417396545, + "learning_rate": 4.9153231876719284e-05, + "loss": 0.0602, + "num_input_tokens_seen": 6705152, + "step": 31775 + }, + { + "epoch": 3.4961496149614963, + "grad_norm": 0.663429319858551, + "learning_rate": 4.915261240737789e-05, + "loss": 0.0444, + "num_input_tokens_seen": 6706240, + "step": 31780 + }, + { + "epoch": 3.4966996699669965, + "grad_norm": 0.38959282636642456, + "learning_rate": 4.915199271543345e-05, + "loss": 0.1162, + "num_input_tokens_seen": 6707296, + "step": 31785 + }, + { + "epoch": 3.497249724972497, + "grad_norm": 0.7383068203926086, + "learning_rate": 4.915137280089168e-05, + "loss": 0.0836, + "num_input_tokens_seen": 6708384, + "step": 31790 + }, + { + "epoch": 3.497799779977998, + "grad_norm": 0.27121636271476746, + "learning_rate": 4.9150752663758307e-05, + "loss": 0.0408, + "num_input_tokens_seen": 6709408, + "step": 31795 + }, + { + "epoch": 3.4983498349834985, + "grad_norm": 1.0496671199798584, + "learning_rate": 4.9150132304039024e-05, + "loss": 0.0477, + "num_input_tokens_seen": 6710496, + "step": 31800 + }, + { + "epoch": 3.4988998899889987, + "grad_norm": 0.19719867408275604, + "learning_rate": 4.914951172173957e-05, + "loss": 0.0803, + "num_input_tokens_seen": 6711552, + "step": 31805 + }, + { + "epoch": 3.4994499449944994, + "grad_norm": 0.26345211267471313, + "learning_rate": 4.914889091686565e-05, + "loss": 0.0453, + "num_input_tokens_seen": 6712608, + "step": 31810 + }, + { + "epoch": 3.5, + "grad_norm": 0.9993818998336792, + "learning_rate": 4.914826988942299e-05, + "loss": 0.1203, + "num_input_tokens_seen": 6713664, + "step": 31815 + }, + { + "epoch": 3.5005500550055006, + "grad_norm": 0.4921537935733795, + "learning_rate": 4.914764863941731e-05, + "loss": 0.0726, + "num_input_tokens_seen": 6714752, + "step": 31820 + }, + { + "epoch": 3.5011001100110013, + "grad_norm": 1.6426528692245483, + "learning_rate": 4.914702716685434e-05, + "loss": 0.1413, + "num_input_tokens_seen": 6715776, + "step": 31825 + }, + { + "epoch": 3.5016501650165015, + "grad_norm": 0.07745851576328278, + "learning_rate": 4.9146405471739816e-05, + "loss": 0.0611, + "num_input_tokens_seen": 6716800, + "step": 31830 + }, + { + "epoch": 3.502200220022002, + "grad_norm": 0.032200559973716736, + "learning_rate": 4.9145783554079447e-05, + "loss": 0.0179, + "num_input_tokens_seen": 6717888, + "step": 31835 + }, + { + "epoch": 3.502750275027503, + "grad_norm": 0.5842743515968323, + "learning_rate": 4.914516141387899e-05, + "loss": 0.0411, + "num_input_tokens_seen": 6719008, + "step": 31840 + }, + { + "epoch": 3.5033003300330035, + "grad_norm": 1.6515862941741943, + "learning_rate": 4.9144539051144165e-05, + "loss": 0.0834, + "num_input_tokens_seen": 6720064, + "step": 31845 + }, + { + "epoch": 3.503850385038504, + "grad_norm": 0.1057516559958458, + "learning_rate": 4.9143916465880704e-05, + "loss": 0.0641, + "num_input_tokens_seen": 6721216, + "step": 31850 + }, + { + "epoch": 3.5044004400440043, + "grad_norm": 0.19775639474391937, + "learning_rate": 4.914329365809435e-05, + "loss": 0.1508, + "num_input_tokens_seen": 6722304, + "step": 31855 + }, + { + "epoch": 3.504950495049505, + "grad_norm": 0.30448848009109497, + "learning_rate": 4.9142670627790847e-05, + "loss": 0.0773, + "num_input_tokens_seen": 6723328, + "step": 31860 + }, + { + "epoch": 3.5055005500550056, + "grad_norm": 1.7599467039108276, + "learning_rate": 4.914204737497593e-05, + "loss": 0.0933, + "num_input_tokens_seen": 6724416, + "step": 31865 + }, + { + "epoch": 3.506050605060506, + "grad_norm": 0.10927886515855789, + "learning_rate": 4.914142389965535e-05, + "loss": 0.0918, + "num_input_tokens_seen": 6725472, + "step": 31870 + }, + { + "epoch": 3.5066006600660065, + "grad_norm": 0.015114722773432732, + "learning_rate": 4.914080020183485e-05, + "loss": 0.0793, + "num_input_tokens_seen": 6726560, + "step": 31875 + }, + { + "epoch": 3.507150715071507, + "grad_norm": 0.19852356612682343, + "learning_rate": 4.914017628152017e-05, + "loss": 0.1245, + "num_input_tokens_seen": 6727584, + "step": 31880 + }, + { + "epoch": 3.5077007700770078, + "grad_norm": 0.3630710244178772, + "learning_rate": 4.9139552138717076e-05, + "loss": 0.0818, + "num_input_tokens_seen": 6728640, + "step": 31885 + }, + { + "epoch": 3.5082508250825084, + "grad_norm": 0.020892728120088577, + "learning_rate": 4.913892777343131e-05, + "loss": 0.0598, + "num_input_tokens_seen": 6729728, + "step": 31890 + }, + { + "epoch": 3.5088008800880086, + "grad_norm": 1.2580907344818115, + "learning_rate": 4.913830318566864e-05, + "loss": 0.0474, + "num_input_tokens_seen": 6730848, + "step": 31895 + }, + { + "epoch": 3.5093509350935093, + "grad_norm": 0.07803384959697723, + "learning_rate": 4.9137678375434806e-05, + "loss": 0.0118, + "num_input_tokens_seen": 6731904, + "step": 31900 + }, + { + "epoch": 3.50990099009901, + "grad_norm": 0.46894991397857666, + "learning_rate": 4.913705334273557e-05, + "loss": 0.0494, + "num_input_tokens_seen": 6732928, + "step": 31905 + }, + { + "epoch": 3.5104510451045106, + "grad_norm": 1.0722997188568115, + "learning_rate": 4.913642808757669e-05, + "loss": 0.0618, + "num_input_tokens_seen": 6733952, + "step": 31910 + }, + { + "epoch": 3.511001100110011, + "grad_norm": 1.7612379789352417, + "learning_rate": 4.9135802609963947e-05, + "loss": 0.1197, + "num_input_tokens_seen": 6735040, + "step": 31915 + }, + { + "epoch": 3.5115511551155114, + "grad_norm": 0.38300177454948425, + "learning_rate": 4.9135176909903085e-05, + "loss": 0.0825, + "num_input_tokens_seen": 6736096, + "step": 31920 + }, + { + "epoch": 3.512101210121012, + "grad_norm": 0.6218224167823792, + "learning_rate": 4.913455098739988e-05, + "loss": 0.0898, + "num_input_tokens_seen": 6737152, + "step": 31925 + }, + { + "epoch": 3.5126512651265127, + "grad_norm": 0.20269984006881714, + "learning_rate": 4.91339248424601e-05, + "loss": 0.0239, + "num_input_tokens_seen": 6738208, + "step": 31930 + }, + { + "epoch": 3.5132013201320134, + "grad_norm": 0.4305364191532135, + "learning_rate": 4.913329847508952e-05, + "loss": 0.0315, + "num_input_tokens_seen": 6739232, + "step": 31935 + }, + { + "epoch": 3.513751375137514, + "grad_norm": 0.041960492730140686, + "learning_rate": 4.91326718852939e-05, + "loss": 0.0403, + "num_input_tokens_seen": 6740320, + "step": 31940 + }, + { + "epoch": 3.514301430143014, + "grad_norm": 0.06934197247028351, + "learning_rate": 4.913204507307903e-05, + "loss": 0.0124, + "num_input_tokens_seen": 6741312, + "step": 31945 + }, + { + "epoch": 3.514851485148515, + "grad_norm": 1.045638918876648, + "learning_rate": 4.913141803845067e-05, + "loss": 0.0502, + "num_input_tokens_seen": 6742336, + "step": 31950 + }, + { + "epoch": 3.5154015401540155, + "grad_norm": 0.24326875805854797, + "learning_rate": 4.913079078141462e-05, + "loss": 0.0744, + "num_input_tokens_seen": 6743392, + "step": 31955 + }, + { + "epoch": 3.5159515951595157, + "grad_norm": 0.04264223948121071, + "learning_rate": 4.913016330197664e-05, + "loss": 0.0855, + "num_input_tokens_seen": 6744352, + "step": 31960 + }, + { + "epoch": 3.5165016501650164, + "grad_norm": 0.04302958771586418, + "learning_rate": 4.912953560014254e-05, + "loss": 0.0202, + "num_input_tokens_seen": 6745440, + "step": 31965 + }, + { + "epoch": 3.517051705170517, + "grad_norm": 0.16137711703777313, + "learning_rate": 4.912890767591808e-05, + "loss": 0.0506, + "num_input_tokens_seen": 6746464, + "step": 31970 + }, + { + "epoch": 3.5176017601760177, + "grad_norm": 0.17817866802215576, + "learning_rate": 4.912827952930905e-05, + "loss": 0.0788, + "num_input_tokens_seen": 6747584, + "step": 31975 + }, + { + "epoch": 3.5181518151815183, + "grad_norm": 0.1958692967891693, + "learning_rate": 4.912765116032125e-05, + "loss": 0.0656, + "num_input_tokens_seen": 6748608, + "step": 31980 + }, + { + "epoch": 3.5187018701870185, + "grad_norm": 0.040920861065387726, + "learning_rate": 4.9127022568960466e-05, + "loss": 0.0433, + "num_input_tokens_seen": 6749664, + "step": 31985 + }, + { + "epoch": 3.519251925192519, + "grad_norm": 0.10057950019836426, + "learning_rate": 4.91263937552325e-05, + "loss": 0.0629, + "num_input_tokens_seen": 6750720, + "step": 31990 + }, + { + "epoch": 3.51980198019802, + "grad_norm": 0.2329111248254776, + "learning_rate": 4.9125764719143134e-05, + "loss": 0.0903, + "num_input_tokens_seen": 6751808, + "step": 31995 + }, + { + "epoch": 3.5203520352035205, + "grad_norm": 0.12285587191581726, + "learning_rate": 4.912513546069817e-05, + "loss": 0.0202, + "num_input_tokens_seen": 6752864, + "step": 32000 + }, + { + "epoch": 3.520902090209021, + "grad_norm": 0.047072116285562515, + "learning_rate": 4.912450597990341e-05, + "loss": 0.0473, + "num_input_tokens_seen": 6753984, + "step": 32005 + }, + { + "epoch": 3.5214521452145213, + "grad_norm": 1.1664433479309082, + "learning_rate": 4.9123876276764655e-05, + "loss": 0.0746, + "num_input_tokens_seen": 6755008, + "step": 32010 + }, + { + "epoch": 3.522002200220022, + "grad_norm": 0.44139760732650757, + "learning_rate": 4.91232463512877e-05, + "loss": 0.0269, + "num_input_tokens_seen": 6756096, + "step": 32015 + }, + { + "epoch": 3.5225522552255226, + "grad_norm": 2.1775341033935547, + "learning_rate": 4.912261620347837e-05, + "loss": 0.1869, + "num_input_tokens_seen": 6757184, + "step": 32020 + }, + { + "epoch": 3.523102310231023, + "grad_norm": 0.3472650945186615, + "learning_rate": 4.9121985833342456e-05, + "loss": 0.1414, + "num_input_tokens_seen": 6758304, + "step": 32025 + }, + { + "epoch": 3.523652365236524, + "grad_norm": 0.06294526904821396, + "learning_rate": 4.912135524088577e-05, + "loss": 0.031, + "num_input_tokens_seen": 6759360, + "step": 32030 + }, + { + "epoch": 3.524202420242024, + "grad_norm": 1.0094650983810425, + "learning_rate": 4.912072442611414e-05, + "loss": 0.1761, + "num_input_tokens_seen": 6760416, + "step": 32035 + }, + { + "epoch": 3.5247524752475248, + "grad_norm": 0.0862717553973198, + "learning_rate": 4.912009338903336e-05, + "loss": 0.0596, + "num_input_tokens_seen": 6761472, + "step": 32040 + }, + { + "epoch": 3.5253025302530254, + "grad_norm": 1.0470823049545288, + "learning_rate": 4.9119462129649254e-05, + "loss": 0.0894, + "num_input_tokens_seen": 6762432, + "step": 32045 + }, + { + "epoch": 3.5258525852585256, + "grad_norm": 0.8105860948562622, + "learning_rate": 4.911883064796763e-05, + "loss": 0.0792, + "num_input_tokens_seen": 6763456, + "step": 32050 + }, + { + "epoch": 3.5264026402640263, + "grad_norm": 0.17730988562107086, + "learning_rate": 4.911819894399433e-05, + "loss": 0.0713, + "num_input_tokens_seen": 6764512, + "step": 32055 + }, + { + "epoch": 3.526952695269527, + "grad_norm": 0.22925157845020294, + "learning_rate": 4.911756701773516e-05, + "loss": 0.0189, + "num_input_tokens_seen": 6765568, + "step": 32060 + }, + { + "epoch": 3.5275027502750276, + "grad_norm": 0.06516709178686142, + "learning_rate": 4.911693486919594e-05, + "loss": 0.0834, + "num_input_tokens_seen": 6766656, + "step": 32065 + }, + { + "epoch": 3.5280528052805282, + "grad_norm": 0.3181614577770233, + "learning_rate": 4.911630249838252e-05, + "loss": 0.0442, + "num_input_tokens_seen": 6767712, + "step": 32070 + }, + { + "epoch": 3.5286028602860284, + "grad_norm": 0.0649905875325203, + "learning_rate": 4.91156699053007e-05, + "loss": 0.0213, + "num_input_tokens_seen": 6768768, + "step": 32075 + }, + { + "epoch": 3.529152915291529, + "grad_norm": 0.523231029510498, + "learning_rate": 4.9115037089956316e-05, + "loss": 0.0333, + "num_input_tokens_seen": 6769760, + "step": 32080 + }, + { + "epoch": 3.5297029702970297, + "grad_norm": 0.6880151629447937, + "learning_rate": 4.9114404052355225e-05, + "loss": 0.0563, + "num_input_tokens_seen": 6770784, + "step": 32085 + }, + { + "epoch": 3.5302530253025304, + "grad_norm": 0.12228953093290329, + "learning_rate": 4.9113770792503225e-05, + "loss": 0.0123, + "num_input_tokens_seen": 6771904, + "step": 32090 + }, + { + "epoch": 3.530803080308031, + "grad_norm": 0.3165457844734192, + "learning_rate": 4.911313731040618e-05, + "loss": 0.0875, + "num_input_tokens_seen": 6772928, + "step": 32095 + }, + { + "epoch": 3.5313531353135312, + "grad_norm": 1.230556845664978, + "learning_rate": 4.911250360606992e-05, + "loss": 0.1053, + "num_input_tokens_seen": 6774016, + "step": 32100 + }, + { + "epoch": 3.531903190319032, + "grad_norm": 0.2032836675643921, + "learning_rate": 4.9111869679500274e-05, + "loss": 0.211, + "num_input_tokens_seen": 6775040, + "step": 32105 + }, + { + "epoch": 3.5324532453245325, + "grad_norm": 0.1136513277888298, + "learning_rate": 4.911123553070311e-05, + "loss": 0.0336, + "num_input_tokens_seen": 6776096, + "step": 32110 + }, + { + "epoch": 3.5330033003300327, + "grad_norm": 0.07694821804761887, + "learning_rate": 4.911060115968425e-05, + "loss": 0.0128, + "num_input_tokens_seen": 6777120, + "step": 32115 + }, + { + "epoch": 3.533553355335534, + "grad_norm": 0.026573527604341507, + "learning_rate": 4.910996656644955e-05, + "loss": 0.03, + "num_input_tokens_seen": 6778176, + "step": 32120 + }, + { + "epoch": 3.534103410341034, + "grad_norm": 0.35271984338760376, + "learning_rate": 4.9109331751004855e-05, + "loss": 0.0724, + "num_input_tokens_seen": 6779168, + "step": 32125 + }, + { + "epoch": 3.5346534653465347, + "grad_norm": 0.04206756129860878, + "learning_rate": 4.910869671335602e-05, + "loss": 0.0807, + "num_input_tokens_seen": 6780256, + "step": 32130 + }, + { + "epoch": 3.5352035203520353, + "grad_norm": 0.025793341919779778, + "learning_rate": 4.91080614535089e-05, + "loss": 0.0169, + "num_input_tokens_seen": 6781344, + "step": 32135 + }, + { + "epoch": 3.5357535753575355, + "grad_norm": 0.08442076295614243, + "learning_rate": 4.9107425971469336e-05, + "loss": 0.1022, + "num_input_tokens_seen": 6782432, + "step": 32140 + }, + { + "epoch": 3.536303630363036, + "grad_norm": 0.1277027726173401, + "learning_rate": 4.91067902672432e-05, + "loss": 0.0856, + "num_input_tokens_seen": 6783456, + "step": 32145 + }, + { + "epoch": 3.536853685368537, + "grad_norm": 0.01888570375740528, + "learning_rate": 4.910615434083635e-05, + "loss": 0.1126, + "num_input_tokens_seen": 6784480, + "step": 32150 + }, + { + "epoch": 3.5374037403740375, + "grad_norm": 0.04691823199391365, + "learning_rate": 4.9105518192254634e-05, + "loss": 0.0719, + "num_input_tokens_seen": 6785600, + "step": 32155 + }, + { + "epoch": 3.537953795379538, + "grad_norm": 0.4797431528568268, + "learning_rate": 4.910488182150393e-05, + "loss": 0.043, + "num_input_tokens_seen": 6786784, + "step": 32160 + }, + { + "epoch": 3.5385038503850383, + "grad_norm": 0.21531985700130463, + "learning_rate": 4.9104245228590096e-05, + "loss": 0.0507, + "num_input_tokens_seen": 6787904, + "step": 32165 + }, + { + "epoch": 3.539053905390539, + "grad_norm": 0.03143324702978134, + "learning_rate": 4.9103608413518995e-05, + "loss": 0.1006, + "num_input_tokens_seen": 6788928, + "step": 32170 + }, + { + "epoch": 3.5396039603960396, + "grad_norm": 0.09959995746612549, + "learning_rate": 4.910297137629651e-05, + "loss": 0.1207, + "num_input_tokens_seen": 6789952, + "step": 32175 + }, + { + "epoch": 3.5401540154015403, + "grad_norm": 0.24200588464736938, + "learning_rate": 4.910233411692849e-05, + "loss": 0.034, + "num_input_tokens_seen": 6791040, + "step": 32180 + }, + { + "epoch": 3.540704070407041, + "grad_norm": 0.00523368688300252, + "learning_rate": 4.9101696635420834e-05, + "loss": 0.0271, + "num_input_tokens_seen": 6792128, + "step": 32185 + }, + { + "epoch": 3.541254125412541, + "grad_norm": 0.495389461517334, + "learning_rate": 4.910105893177941e-05, + "loss": 0.1947, + "num_input_tokens_seen": 6793152, + "step": 32190 + }, + { + "epoch": 3.541804180418042, + "grad_norm": 0.21983161568641663, + "learning_rate": 4.910042100601008e-05, + "loss": 0.073, + "num_input_tokens_seen": 6794272, + "step": 32195 + }, + { + "epoch": 3.5423542354235424, + "grad_norm": 0.05532464757561684, + "learning_rate": 4.9099782858118736e-05, + "loss": 0.0811, + "num_input_tokens_seen": 6795392, + "step": 32200 + }, + { + "epoch": 3.5429042904290426, + "grad_norm": 0.46309545636177063, + "learning_rate": 4.909914448811126e-05, + "loss": 0.0884, + "num_input_tokens_seen": 6796448, + "step": 32205 + }, + { + "epoch": 3.5434543454345433, + "grad_norm": 0.09252730756998062, + "learning_rate": 4.909850589599353e-05, + "loss": 0.0507, + "num_input_tokens_seen": 6797440, + "step": 32210 + }, + { + "epoch": 3.544004400440044, + "grad_norm": 0.04150521755218506, + "learning_rate": 4.9097867081771435e-05, + "loss": 0.0418, + "num_input_tokens_seen": 6798464, + "step": 32215 + }, + { + "epoch": 3.5445544554455446, + "grad_norm": 1.7003486156463623, + "learning_rate": 4.9097228045450864e-05, + "loss": 0.0631, + "num_input_tokens_seen": 6799520, + "step": 32220 + }, + { + "epoch": 3.5451045104510452, + "grad_norm": 0.9242626428604126, + "learning_rate": 4.9096588787037706e-05, + "loss": 0.0635, + "num_input_tokens_seen": 6800608, + "step": 32225 + }, + { + "epoch": 3.5456545654565454, + "grad_norm": 0.07443051040172577, + "learning_rate": 4.909594930653785e-05, + "loss": 0.0366, + "num_input_tokens_seen": 6801664, + "step": 32230 + }, + { + "epoch": 3.546204620462046, + "grad_norm": 0.3458097279071808, + "learning_rate": 4.9095309603957194e-05, + "loss": 0.0156, + "num_input_tokens_seen": 6802688, + "step": 32235 + }, + { + "epoch": 3.5467546754675467, + "grad_norm": 0.09264305979013443, + "learning_rate": 4.909466967930163e-05, + "loss": 0.0542, + "num_input_tokens_seen": 6803744, + "step": 32240 + }, + { + "epoch": 3.5473047304730474, + "grad_norm": 0.36737579107284546, + "learning_rate": 4.9094029532577055e-05, + "loss": 0.0783, + "num_input_tokens_seen": 6804864, + "step": 32245 + }, + { + "epoch": 3.547854785478548, + "grad_norm": 0.8074241876602173, + "learning_rate": 4.909338916378937e-05, + "loss": 0.0618, + "num_input_tokens_seen": 6805888, + "step": 32250 + }, + { + "epoch": 3.5484048404840483, + "grad_norm": 0.3605304956436157, + "learning_rate": 4.9092748572944486e-05, + "loss": 0.0811, + "num_input_tokens_seen": 6807008, + "step": 32255 + }, + { + "epoch": 3.548954895489549, + "grad_norm": 1.0145765542984009, + "learning_rate": 4.90921077600483e-05, + "loss": 0.0536, + "num_input_tokens_seen": 6808032, + "step": 32260 + }, + { + "epoch": 3.5495049504950495, + "grad_norm": 0.01265117060393095, + "learning_rate": 4.9091466725106704e-05, + "loss": 0.0567, + "num_input_tokens_seen": 6809120, + "step": 32265 + }, + { + "epoch": 3.55005500550055, + "grad_norm": 0.602063775062561, + "learning_rate": 4.909082546812563e-05, + "loss": 0.0525, + "num_input_tokens_seen": 6810208, + "step": 32270 + }, + { + "epoch": 3.550605060506051, + "grad_norm": 0.10293000191450119, + "learning_rate": 4.9090183989110974e-05, + "loss": 0.0445, + "num_input_tokens_seen": 6811264, + "step": 32275 + }, + { + "epoch": 3.551155115511551, + "grad_norm": 2.9701805114746094, + "learning_rate": 4.9089542288068654e-05, + "loss": 0.1337, + "num_input_tokens_seen": 6812320, + "step": 32280 + }, + { + "epoch": 3.5517051705170517, + "grad_norm": 0.32421359419822693, + "learning_rate": 4.908890036500457e-05, + "loss": 0.0393, + "num_input_tokens_seen": 6813344, + "step": 32285 + }, + { + "epoch": 3.5522552255225524, + "grad_norm": 0.26199567317962646, + "learning_rate": 4.908825821992467e-05, + "loss": 0.044, + "num_input_tokens_seen": 6814368, + "step": 32290 + }, + { + "epoch": 3.5528052805280526, + "grad_norm": 0.18594437837600708, + "learning_rate": 4.908761585283485e-05, + "loss": 0.0202, + "num_input_tokens_seen": 6815424, + "step": 32295 + }, + { + "epoch": 3.553355335533553, + "grad_norm": 0.9111419916152954, + "learning_rate": 4.908697326374102e-05, + "loss": 0.1629, + "num_input_tokens_seen": 6816512, + "step": 32300 + }, + { + "epoch": 3.553905390539054, + "grad_norm": 1.1611250638961792, + "learning_rate": 4.908633045264912e-05, + "loss": 0.102, + "num_input_tokens_seen": 6817632, + "step": 32305 + }, + { + "epoch": 3.5544554455445545, + "grad_norm": 0.30747634172439575, + "learning_rate": 4.908568741956507e-05, + "loss": 0.0273, + "num_input_tokens_seen": 6818688, + "step": 32310 + }, + { + "epoch": 3.555005500550055, + "grad_norm": 0.9322883486747742, + "learning_rate": 4.9085044164494803e-05, + "loss": 0.2097, + "num_input_tokens_seen": 6819744, + "step": 32315 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.014684181660413742, + "learning_rate": 4.9084400687444236e-05, + "loss": 0.0635, + "num_input_tokens_seen": 6820768, + "step": 32320 + }, + { + "epoch": 3.556105610561056, + "grad_norm": 1.8187230825424194, + "learning_rate": 4.9083756988419306e-05, + "loss": 0.2017, + "num_input_tokens_seen": 6821824, + "step": 32325 + }, + { + "epoch": 3.5566556655665567, + "grad_norm": 0.455818235874176, + "learning_rate": 4.9083113067425944e-05, + "loss": 0.0525, + "num_input_tokens_seen": 6822976, + "step": 32330 + }, + { + "epoch": 3.5572057205720573, + "grad_norm": 0.15092207491397858, + "learning_rate": 4.9082468924470084e-05, + "loss": 0.0714, + "num_input_tokens_seen": 6824064, + "step": 32335 + }, + { + "epoch": 3.557755775577558, + "grad_norm": 0.05213095247745514, + "learning_rate": 4.908182455955766e-05, + "loss": 0.0686, + "num_input_tokens_seen": 6825056, + "step": 32340 + }, + { + "epoch": 3.558305830583058, + "grad_norm": 0.45127400755882263, + "learning_rate": 4.9081179972694625e-05, + "loss": 0.0938, + "num_input_tokens_seen": 6826112, + "step": 32345 + }, + { + "epoch": 3.558855885588559, + "grad_norm": 0.25198495388031006, + "learning_rate": 4.9080535163886904e-05, + "loss": 0.1329, + "num_input_tokens_seen": 6827104, + "step": 32350 + }, + { + "epoch": 3.5594059405940595, + "grad_norm": 0.09322798997163773, + "learning_rate": 4.907989013314045e-05, + "loss": 0.3146, + "num_input_tokens_seen": 6828192, + "step": 32355 + }, + { + "epoch": 3.55995599559956, + "grad_norm": 0.08041033148765564, + "learning_rate": 4.9079244880461194e-05, + "loss": 0.0237, + "num_input_tokens_seen": 6829184, + "step": 32360 + }, + { + "epoch": 3.5605060506050608, + "grad_norm": 0.05181010439991951, + "learning_rate": 4.90785994058551e-05, + "loss": 0.0332, + "num_input_tokens_seen": 6830272, + "step": 32365 + }, + { + "epoch": 3.561056105610561, + "grad_norm": 0.13500037789344788, + "learning_rate": 4.907795370932811e-05, + "loss": 0.0711, + "num_input_tokens_seen": 6831296, + "step": 32370 + }, + { + "epoch": 3.5616061606160616, + "grad_norm": 0.22111628949642181, + "learning_rate": 4.907730779088616e-05, + "loss": 0.0809, + "num_input_tokens_seen": 6832352, + "step": 32375 + }, + { + "epoch": 3.5621562156215623, + "grad_norm": 0.31187647581100464, + "learning_rate": 4.9076661650535226e-05, + "loss": 0.0507, + "num_input_tokens_seen": 6833440, + "step": 32380 + }, + { + "epoch": 3.5627062706270625, + "grad_norm": 0.8287284970283508, + "learning_rate": 4.9076015288281264e-05, + "loss": 0.0486, + "num_input_tokens_seen": 6834592, + "step": 32385 + }, + { + "epoch": 3.563256325632563, + "grad_norm": 0.16850224137306213, + "learning_rate": 4.907536870413021e-05, + "loss": 0.0986, + "num_input_tokens_seen": 6835648, + "step": 32390 + }, + { + "epoch": 3.5638063806380638, + "grad_norm": 0.24201612174510956, + "learning_rate": 4.907472189808804e-05, + "loss": 0.1383, + "num_input_tokens_seen": 6836672, + "step": 32395 + }, + { + "epoch": 3.5643564356435644, + "grad_norm": 0.7603124976158142, + "learning_rate": 4.9074074870160715e-05, + "loss": 0.0666, + "num_input_tokens_seen": 6837696, + "step": 32400 + }, + { + "epoch": 3.564906490649065, + "grad_norm": 0.8121964335441589, + "learning_rate": 4.907342762035418e-05, + "loss": 0.111, + "num_input_tokens_seen": 6838752, + "step": 32405 + }, + { + "epoch": 3.5654565456545653, + "grad_norm": 0.44429531693458557, + "learning_rate": 4.907278014867442e-05, + "loss": 0.0546, + "num_input_tokens_seen": 6839840, + "step": 32410 + }, + { + "epoch": 3.566006600660066, + "grad_norm": 0.7392260432243347, + "learning_rate": 4.9072132455127406e-05, + "loss": 0.0479, + "num_input_tokens_seen": 6840832, + "step": 32415 + }, + { + "epoch": 3.5665566556655666, + "grad_norm": 0.506218671798706, + "learning_rate": 4.907148453971909e-05, + "loss": 0.0982, + "num_input_tokens_seen": 6841888, + "step": 32420 + }, + { + "epoch": 3.567106710671067, + "grad_norm": 0.8187053203582764, + "learning_rate": 4.9070836402455456e-05, + "loss": 0.1023, + "num_input_tokens_seen": 6842944, + "step": 32425 + }, + { + "epoch": 3.567656765676568, + "grad_norm": 0.07896719872951508, + "learning_rate": 4.907018804334247e-05, + "loss": 0.0215, + "num_input_tokens_seen": 6844032, + "step": 32430 + }, + { + "epoch": 3.568206820682068, + "grad_norm": 0.08819522708654404, + "learning_rate": 4.906953946238611e-05, + "loss": 0.0294, + "num_input_tokens_seen": 6845120, + "step": 32435 + }, + { + "epoch": 3.5687568756875687, + "grad_norm": 0.06021348759531975, + "learning_rate": 4.906889065959236e-05, + "loss": 0.028, + "num_input_tokens_seen": 6846144, + "step": 32440 + }, + { + "epoch": 3.5693069306930694, + "grad_norm": 0.34045344591140747, + "learning_rate": 4.906824163496719e-05, + "loss": 0.025, + "num_input_tokens_seen": 6847232, + "step": 32445 + }, + { + "epoch": 3.56985698569857, + "grad_norm": 0.6366111040115356, + "learning_rate": 4.906759238851658e-05, + "loss": 0.117, + "num_input_tokens_seen": 6848288, + "step": 32450 + }, + { + "epoch": 3.5704070407040707, + "grad_norm": 0.2822965085506439, + "learning_rate": 4.906694292024653e-05, + "loss": 0.0795, + "num_input_tokens_seen": 6849280, + "step": 32455 + }, + { + "epoch": 3.570957095709571, + "grad_norm": 0.30233660340309143, + "learning_rate": 4.9066293230163005e-05, + "loss": 0.1262, + "num_input_tokens_seen": 6850240, + "step": 32460 + }, + { + "epoch": 3.5715071507150715, + "grad_norm": 0.5458672046661377, + "learning_rate": 4.906564331827202e-05, + "loss": 0.0216, + "num_input_tokens_seen": 6851232, + "step": 32465 + }, + { + "epoch": 3.572057205720572, + "grad_norm": 0.08064150810241699, + "learning_rate": 4.906499318457953e-05, + "loss": 0.0819, + "num_input_tokens_seen": 6852288, + "step": 32470 + }, + { + "epoch": 3.5726072607260724, + "grad_norm": 1.6195811033248901, + "learning_rate": 4.906434282909156e-05, + "loss": 0.1921, + "num_input_tokens_seen": 6853344, + "step": 32475 + }, + { + "epoch": 3.573157315731573, + "grad_norm": 0.29071757197380066, + "learning_rate": 4.906369225181407e-05, + "loss": 0.0256, + "num_input_tokens_seen": 6854400, + "step": 32480 + }, + { + "epoch": 3.5737073707370737, + "grad_norm": 0.26890698075294495, + "learning_rate": 4.9063041452753096e-05, + "loss": 0.0806, + "num_input_tokens_seen": 6855392, + "step": 32485 + }, + { + "epoch": 3.5742574257425743, + "grad_norm": 0.12271631509065628, + "learning_rate": 4.906239043191461e-05, + "loss": 0.0502, + "num_input_tokens_seen": 6856416, + "step": 32490 + }, + { + "epoch": 3.574807480748075, + "grad_norm": 0.5452298521995544, + "learning_rate": 4.906173918930461e-05, + "loss": 0.028, + "num_input_tokens_seen": 6857504, + "step": 32495 + }, + { + "epoch": 3.575357535753575, + "grad_norm": 0.242996484041214, + "learning_rate": 4.906108772492911e-05, + "loss": 0.0943, + "num_input_tokens_seen": 6858496, + "step": 32500 + }, + { + "epoch": 3.575907590759076, + "grad_norm": 0.07311161607503891, + "learning_rate": 4.906043603879411e-05, + "loss": 0.0473, + "num_input_tokens_seen": 6859552, + "step": 32505 + }, + { + "epoch": 3.5764576457645765, + "grad_norm": 0.45387813448905945, + "learning_rate": 4.905978413090562e-05, + "loss": 0.0536, + "num_input_tokens_seen": 6860608, + "step": 32510 + }, + { + "epoch": 3.577007700770077, + "grad_norm": 0.0530383475124836, + "learning_rate": 4.905913200126964e-05, + "loss": 0.0511, + "num_input_tokens_seen": 6861632, + "step": 32515 + }, + { + "epoch": 3.5775577557755778, + "grad_norm": 1.1912819147109985, + "learning_rate": 4.905847964989219e-05, + "loss": 0.0967, + "num_input_tokens_seen": 6862720, + "step": 32520 + }, + { + "epoch": 3.578107810781078, + "grad_norm": 0.9194698333740234, + "learning_rate": 4.905782707677927e-05, + "loss": 0.055, + "num_input_tokens_seen": 6863808, + "step": 32525 + }, + { + "epoch": 3.5786578657865786, + "grad_norm": 1.2332377433776855, + "learning_rate": 4.905717428193691e-05, + "loss": 0.1056, + "num_input_tokens_seen": 6864864, + "step": 32530 + }, + { + "epoch": 3.5792079207920793, + "grad_norm": 0.5853351354598999, + "learning_rate": 4.905652126537111e-05, + "loss": 0.0641, + "num_input_tokens_seen": 6865920, + "step": 32535 + }, + { + "epoch": 3.5797579757975795, + "grad_norm": 0.7452439665794373, + "learning_rate": 4.90558680270879e-05, + "loss": 0.0283, + "num_input_tokens_seen": 6866944, + "step": 32540 + }, + { + "epoch": 3.5803080308030806, + "grad_norm": 0.13865680992603302, + "learning_rate": 4.905521456709331e-05, + "loss": 0.0978, + "num_input_tokens_seen": 6867968, + "step": 32545 + }, + { + "epoch": 3.580858085808581, + "grad_norm": 0.21075712144374847, + "learning_rate": 4.9054560885393335e-05, + "loss": 0.0359, + "num_input_tokens_seen": 6869024, + "step": 32550 + }, + { + "epoch": 3.5814081408140814, + "grad_norm": 0.03166989982128143, + "learning_rate": 4.9053906981994025e-05, + "loss": 0.0977, + "num_input_tokens_seen": 6870112, + "step": 32555 + }, + { + "epoch": 3.581958195819582, + "grad_norm": 0.3536205589771271, + "learning_rate": 4.905325285690139e-05, + "loss": 0.0157, + "num_input_tokens_seen": 6871200, + "step": 32560 + }, + { + "epoch": 3.5825082508250823, + "grad_norm": 0.20704270899295807, + "learning_rate": 4.905259851012146e-05, + "loss": 0.0881, + "num_input_tokens_seen": 6872288, + "step": 32565 + }, + { + "epoch": 3.583058305830583, + "grad_norm": 1.0431724786758423, + "learning_rate": 4.9051943941660285e-05, + "loss": 0.0739, + "num_input_tokens_seen": 6873312, + "step": 32570 + }, + { + "epoch": 3.5836083608360836, + "grad_norm": 0.02316007763147354, + "learning_rate": 4.905128915152387e-05, + "loss": 0.0869, + "num_input_tokens_seen": 6874432, + "step": 32575 + }, + { + "epoch": 3.5841584158415842, + "grad_norm": 0.49932780861854553, + "learning_rate": 4.905063413971827e-05, + "loss": 0.0181, + "num_input_tokens_seen": 6875488, + "step": 32580 + }, + { + "epoch": 3.584708470847085, + "grad_norm": 0.09474191069602966, + "learning_rate": 4.9049978906249515e-05, + "loss": 0.0323, + "num_input_tokens_seen": 6876512, + "step": 32585 + }, + { + "epoch": 3.585258525852585, + "grad_norm": 0.19072163105010986, + "learning_rate": 4.904932345112364e-05, + "loss": 0.0395, + "num_input_tokens_seen": 6877568, + "step": 32590 + }, + { + "epoch": 3.5858085808580857, + "grad_norm": 0.804142951965332, + "learning_rate": 4.90486677743467e-05, + "loss": 0.0568, + "num_input_tokens_seen": 6878656, + "step": 32595 + }, + { + "epoch": 3.5863586358635864, + "grad_norm": 0.777791440486908, + "learning_rate": 4.9048011875924725e-05, + "loss": 0.0513, + "num_input_tokens_seen": 6879808, + "step": 32600 + }, + { + "epoch": 3.586908690869087, + "grad_norm": 1.0424660444259644, + "learning_rate": 4.9047355755863766e-05, + "loss": 0.1247, + "num_input_tokens_seen": 6880832, + "step": 32605 + }, + { + "epoch": 3.5874587458745877, + "grad_norm": 0.029828252270817757, + "learning_rate": 4.904669941416987e-05, + "loss": 0.046, + "num_input_tokens_seen": 6881856, + "step": 32610 + }, + { + "epoch": 3.588008800880088, + "grad_norm": 1.1295404434204102, + "learning_rate": 4.9046042850849074e-05, + "loss": 0.0814, + "num_input_tokens_seen": 6882912, + "step": 32615 + }, + { + "epoch": 3.5885588558855885, + "grad_norm": 0.45979970693588257, + "learning_rate": 4.904538606590745e-05, + "loss": 0.0152, + "num_input_tokens_seen": 6884000, + "step": 32620 + }, + { + "epoch": 3.589108910891089, + "grad_norm": 0.2414785921573639, + "learning_rate": 4.904472905935104e-05, + "loss": 0.0368, + "num_input_tokens_seen": 6885056, + "step": 32625 + }, + { + "epoch": 3.5896589658965894, + "grad_norm": 0.2690165936946869, + "learning_rate": 4.904407183118589e-05, + "loss": 0.0357, + "num_input_tokens_seen": 6886112, + "step": 32630 + }, + { + "epoch": 3.5902090209020905, + "grad_norm": 0.17859622836112976, + "learning_rate": 4.904341438141807e-05, + "loss": 0.0611, + "num_input_tokens_seen": 6887136, + "step": 32635 + }, + { + "epoch": 3.5907590759075907, + "grad_norm": 0.2620832324028015, + "learning_rate": 4.904275671005364e-05, + "loss": 0.0893, + "num_input_tokens_seen": 6888160, + "step": 32640 + }, + { + "epoch": 3.5913091309130913, + "grad_norm": 0.2813132703304291, + "learning_rate": 4.904209881709866e-05, + "loss": 0.0421, + "num_input_tokens_seen": 6889248, + "step": 32645 + }, + { + "epoch": 3.591859185918592, + "grad_norm": 0.02317824587225914, + "learning_rate": 4.90414407025592e-05, + "loss": 0.0354, + "num_input_tokens_seen": 6890272, + "step": 32650 + }, + { + "epoch": 3.592409240924092, + "grad_norm": 0.045887626707553864, + "learning_rate": 4.9040782366441304e-05, + "loss": 0.0854, + "num_input_tokens_seen": 6891392, + "step": 32655 + }, + { + "epoch": 3.592959295929593, + "grad_norm": 0.033613745123147964, + "learning_rate": 4.904012380875106e-05, + "loss": 0.0066, + "num_input_tokens_seen": 6892384, + "step": 32660 + }, + { + "epoch": 3.5935093509350935, + "grad_norm": 1.32626211643219, + "learning_rate": 4.903946502949452e-05, + "loss": 0.0382, + "num_input_tokens_seen": 6893472, + "step": 32665 + }, + { + "epoch": 3.594059405940594, + "grad_norm": 0.07202179729938507, + "learning_rate": 4.9038806028677776e-05, + "loss": 0.0264, + "num_input_tokens_seen": 6894496, + "step": 32670 + }, + { + "epoch": 3.594609460946095, + "grad_norm": 1.2249183654785156, + "learning_rate": 4.9038146806306885e-05, + "loss": 0.0385, + "num_input_tokens_seen": 6895552, + "step": 32675 + }, + { + "epoch": 3.595159515951595, + "grad_norm": 0.05303589999675751, + "learning_rate": 4.903748736238793e-05, + "loss": 0.1418, + "num_input_tokens_seen": 6896640, + "step": 32680 + }, + { + "epoch": 3.5957095709570956, + "grad_norm": 0.0818859189748764, + "learning_rate": 4.9036827696927e-05, + "loss": 0.0216, + "num_input_tokens_seen": 6897696, + "step": 32685 + }, + { + "epoch": 3.5962596259625963, + "grad_norm": 0.3243362307548523, + "learning_rate": 4.903616780993015e-05, + "loss": 0.0336, + "num_input_tokens_seen": 6898688, + "step": 32690 + }, + { + "epoch": 3.596809680968097, + "grad_norm": 0.32634618878364563, + "learning_rate": 4.903550770140348e-05, + "loss": 0.0146, + "num_input_tokens_seen": 6899744, + "step": 32695 + }, + { + "epoch": 3.5973597359735976, + "grad_norm": 0.17266225814819336, + "learning_rate": 4.903484737135306e-05, + "loss": 0.0217, + "num_input_tokens_seen": 6900768, + "step": 32700 + }, + { + "epoch": 3.597909790979098, + "grad_norm": 1.2557169198989868, + "learning_rate": 4.9034186819785e-05, + "loss": 0.1003, + "num_input_tokens_seen": 6901824, + "step": 32705 + }, + { + "epoch": 3.5984598459845984, + "grad_norm": 0.7701632976531982, + "learning_rate": 4.903352604670537e-05, + "loss": 0.1292, + "num_input_tokens_seen": 6902912, + "step": 32710 + }, + { + "epoch": 3.599009900990099, + "grad_norm": 0.031169086694717407, + "learning_rate": 4.9032865052120255e-05, + "loss": 0.0327, + "num_input_tokens_seen": 6903968, + "step": 32715 + }, + { + "epoch": 3.5995599559955993, + "grad_norm": 0.12787143886089325, + "learning_rate": 4.903220383603576e-05, + "loss": 0.0084, + "num_input_tokens_seen": 6905024, + "step": 32720 + }, + { + "epoch": 3.6001100110011, + "grad_norm": 0.6007446646690369, + "learning_rate": 4.9031542398457974e-05, + "loss": 0.1069, + "num_input_tokens_seen": 6906048, + "step": 32725 + }, + { + "epoch": 3.6006600660066006, + "grad_norm": 0.054993148893117905, + "learning_rate": 4.9030880739392993e-05, + "loss": 0.1121, + "num_input_tokens_seen": 6907072, + "step": 32730 + }, + { + "epoch": 3.6012101210121013, + "grad_norm": 0.12972015142440796, + "learning_rate": 4.903021885884691e-05, + "loss": 0.0977, + "num_input_tokens_seen": 6908160, + "step": 32735 + }, + { + "epoch": 3.601760176017602, + "grad_norm": 0.5494881272315979, + "learning_rate": 4.902955675682584e-05, + "loss": 0.0462, + "num_input_tokens_seen": 6909216, + "step": 32740 + }, + { + "epoch": 3.602310231023102, + "grad_norm": 0.26761987805366516, + "learning_rate": 4.9028894433335874e-05, + "loss": 0.0328, + "num_input_tokens_seen": 6910272, + "step": 32745 + }, + { + "epoch": 3.6028602860286028, + "grad_norm": 0.7056902050971985, + "learning_rate": 4.9028231888383114e-05, + "loss": 0.0314, + "num_input_tokens_seen": 6911264, + "step": 32750 + }, + { + "epoch": 3.6034103410341034, + "grad_norm": 0.08199409395456314, + "learning_rate": 4.902756912197367e-05, + "loss": 0.0963, + "num_input_tokens_seen": 6912384, + "step": 32755 + }, + { + "epoch": 3.603960396039604, + "grad_norm": 0.051586952060461044, + "learning_rate": 4.902690613411365e-05, + "loss": 0.0069, + "num_input_tokens_seen": 6913440, + "step": 32760 + }, + { + "epoch": 3.6045104510451047, + "grad_norm": 1.4486405849456787, + "learning_rate": 4.902624292480917e-05, + "loss": 0.1283, + "num_input_tokens_seen": 6914432, + "step": 32765 + }, + { + "epoch": 3.605060506050605, + "grad_norm": 0.030674520879983902, + "learning_rate": 4.902557949406633e-05, + "loss": 0.0267, + "num_input_tokens_seen": 6915456, + "step": 32770 + }, + { + "epoch": 3.6056105610561056, + "grad_norm": 1.3642207384109497, + "learning_rate": 4.902491584189126e-05, + "loss": 0.1003, + "num_input_tokens_seen": 6916544, + "step": 32775 + }, + { + "epoch": 3.606160616061606, + "grad_norm": 0.06679736822843552, + "learning_rate": 4.902425196829007e-05, + "loss": 0.0189, + "num_input_tokens_seen": 6917600, + "step": 32780 + }, + { + "epoch": 3.606710671067107, + "grad_norm": 0.3690261244773865, + "learning_rate": 4.902358787326887e-05, + "loss": 0.0747, + "num_input_tokens_seen": 6918624, + "step": 32785 + }, + { + "epoch": 3.6072607260726075, + "grad_norm": 0.017684871330857277, + "learning_rate": 4.90229235568338e-05, + "loss": 0.0752, + "num_input_tokens_seen": 6919744, + "step": 32790 + }, + { + "epoch": 3.6078107810781077, + "grad_norm": 0.12739579379558563, + "learning_rate": 4.902225901899096e-05, + "loss": 0.2578, + "num_input_tokens_seen": 6920800, + "step": 32795 + }, + { + "epoch": 3.6083608360836084, + "grad_norm": 0.4322388172149658, + "learning_rate": 4.90215942597465e-05, + "loss": 0.0878, + "num_input_tokens_seen": 6921952, + "step": 32800 + }, + { + "epoch": 3.608910891089109, + "grad_norm": 0.08320781588554382, + "learning_rate": 4.9020929279106514e-05, + "loss": 0.0246, + "num_input_tokens_seen": 6923040, + "step": 32805 + }, + { + "epoch": 3.609460946094609, + "grad_norm": 0.6215873956680298, + "learning_rate": 4.902026407707716e-05, + "loss": 0.0788, + "num_input_tokens_seen": 6924160, + "step": 32810 + }, + { + "epoch": 3.61001100110011, + "grad_norm": 0.18231721222400665, + "learning_rate": 4.901959865366456e-05, + "loss": 0.063, + "num_input_tokens_seen": 6925152, + "step": 32815 + }, + { + "epoch": 3.6105610561056105, + "grad_norm": 0.8479587435722351, + "learning_rate": 4.9018933008874835e-05, + "loss": 0.0763, + "num_input_tokens_seen": 6926304, + "step": 32820 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.10514889657497406, + "learning_rate": 4.9018267142714136e-05, + "loss": 0.1341, + "num_input_tokens_seen": 6927328, + "step": 32825 + }, + { + "epoch": 3.611661166116612, + "grad_norm": 0.31252261996269226, + "learning_rate": 4.901760105518859e-05, + "loss": 0.0489, + "num_input_tokens_seen": 6928320, + "step": 32830 + }, + { + "epoch": 3.612211221122112, + "grad_norm": 0.01964610256254673, + "learning_rate": 4.9016934746304346e-05, + "loss": 0.109, + "num_input_tokens_seen": 6929408, + "step": 32835 + }, + { + "epoch": 3.6127612761276127, + "grad_norm": 0.6050879955291748, + "learning_rate": 4.9016268216067526e-05, + "loss": 0.0494, + "num_input_tokens_seen": 6930464, + "step": 32840 + }, + { + "epoch": 3.6133113311331133, + "grad_norm": 0.06613455712795258, + "learning_rate": 4.90156014644843e-05, + "loss": 0.0653, + "num_input_tokens_seen": 6931552, + "step": 32845 + }, + { + "epoch": 3.613861386138614, + "grad_norm": 0.015018184669315815, + "learning_rate": 4.9014934491560796e-05, + "loss": 0.0601, + "num_input_tokens_seen": 6932704, + "step": 32850 + }, + { + "epoch": 3.6144114411441146, + "grad_norm": 1.2156431674957275, + "learning_rate": 4.901426729730316e-05, + "loss": 0.1024, + "num_input_tokens_seen": 6933728, + "step": 32855 + }, + { + "epoch": 3.614961496149615, + "grad_norm": 0.08477208018302917, + "learning_rate": 4.9013599881717556e-05, + "loss": 0.0706, + "num_input_tokens_seen": 6934752, + "step": 32860 + }, + { + "epoch": 3.6155115511551155, + "grad_norm": 0.9447011947631836, + "learning_rate": 4.9012932244810113e-05, + "loss": 0.0456, + "num_input_tokens_seen": 6935840, + "step": 32865 + }, + { + "epoch": 3.616061606160616, + "grad_norm": 0.8163653612136841, + "learning_rate": 4.9012264386586996e-05, + "loss": 0.0607, + "num_input_tokens_seen": 6936864, + "step": 32870 + }, + { + "epoch": 3.6166116611661168, + "grad_norm": 0.13438718020915985, + "learning_rate": 4.901159630705436e-05, + "loss": 0.0123, + "num_input_tokens_seen": 6937920, + "step": 32875 + }, + { + "epoch": 3.6171617161716174, + "grad_norm": 0.9121230244636536, + "learning_rate": 4.901092800621837e-05, + "loss": 0.0914, + "num_input_tokens_seen": 6938976, + "step": 32880 + }, + { + "epoch": 3.6177117711771176, + "grad_norm": 0.8954194188117981, + "learning_rate": 4.901025948408518e-05, + "loss": 0.0901, + "num_input_tokens_seen": 6940000, + "step": 32885 + }, + { + "epoch": 3.6182618261826183, + "grad_norm": 1.4834336042404175, + "learning_rate": 4.900959074066094e-05, + "loss": 0.1886, + "num_input_tokens_seen": 6941056, + "step": 32890 + }, + { + "epoch": 3.618811881188119, + "grad_norm": 0.05269942432641983, + "learning_rate": 4.9008921775951835e-05, + "loss": 0.0264, + "num_input_tokens_seen": 6942080, + "step": 32895 + }, + { + "epoch": 3.619361936193619, + "grad_norm": 0.05181889981031418, + "learning_rate": 4.900825258996401e-05, + "loss": 0.0454, + "num_input_tokens_seen": 6943072, + "step": 32900 + }, + { + "epoch": 3.6199119911991198, + "grad_norm": 0.03502492234110832, + "learning_rate": 4.900758318270364e-05, + "loss": 0.0239, + "num_input_tokens_seen": 6944096, + "step": 32905 + }, + { + "epoch": 3.6204620462046204, + "grad_norm": 0.7972841858863831, + "learning_rate": 4.9006913554176905e-05, + "loss": 0.0969, + "num_input_tokens_seen": 6945184, + "step": 32910 + }, + { + "epoch": 3.621012101210121, + "grad_norm": 0.14157356321811676, + "learning_rate": 4.900624370438995e-05, + "loss": 0.027, + "num_input_tokens_seen": 6946208, + "step": 32915 + }, + { + "epoch": 3.6215621562156217, + "grad_norm": 0.12232654541730881, + "learning_rate": 4.900557363334898e-05, + "loss": 0.0482, + "num_input_tokens_seen": 6947264, + "step": 32920 + }, + { + "epoch": 3.622112211221122, + "grad_norm": 0.1623794138431549, + "learning_rate": 4.900490334106016e-05, + "loss": 0.0391, + "num_input_tokens_seen": 6948352, + "step": 32925 + }, + { + "epoch": 3.6226622662266226, + "grad_norm": 0.2727380692958832, + "learning_rate": 4.9004232827529656e-05, + "loss": 0.096, + "num_input_tokens_seen": 6949408, + "step": 32930 + }, + { + "epoch": 3.6232123212321232, + "grad_norm": 0.43065765500068665, + "learning_rate": 4.9003562092763656e-05, + "loss": 0.024, + "num_input_tokens_seen": 6950464, + "step": 32935 + }, + { + "epoch": 3.623762376237624, + "grad_norm": 1.4725956916809082, + "learning_rate": 4.900289113676835e-05, + "loss": 0.1232, + "num_input_tokens_seen": 6951552, + "step": 32940 + }, + { + "epoch": 3.6243124312431245, + "grad_norm": 0.7052592635154724, + "learning_rate": 4.9002219959549906e-05, + "loss": 0.0522, + "num_input_tokens_seen": 6952640, + "step": 32945 + }, + { + "epoch": 3.6248624862486247, + "grad_norm": 0.17101339995861053, + "learning_rate": 4.9001548561114516e-05, + "loss": 0.0978, + "num_input_tokens_seen": 6953760, + "step": 32950 + }, + { + "epoch": 3.6254125412541254, + "grad_norm": 0.06120172515511513, + "learning_rate": 4.900087694146838e-05, + "loss": 0.0549, + "num_input_tokens_seen": 6954848, + "step": 32955 + }, + { + "epoch": 3.625962596259626, + "grad_norm": 0.1323753446340561, + "learning_rate": 4.9000205100617674e-05, + "loss": 0.0405, + "num_input_tokens_seen": 6955968, + "step": 32960 + }, + { + "epoch": 3.6265126512651267, + "grad_norm": 0.03820401802659035, + "learning_rate": 4.899953303856859e-05, + "loss": 0.0502, + "num_input_tokens_seen": 6957024, + "step": 32965 + }, + { + "epoch": 3.6270627062706273, + "grad_norm": 0.03050103224813938, + "learning_rate": 4.899886075532733e-05, + "loss": 0.047, + "num_input_tokens_seen": 6958016, + "step": 32970 + }, + { + "epoch": 3.6276127612761275, + "grad_norm": 0.9181691408157349, + "learning_rate": 4.899818825090009e-05, + "loss": 0.0476, + "num_input_tokens_seen": 6959072, + "step": 32975 + }, + { + "epoch": 3.628162816281628, + "grad_norm": 4.398846626281738, + "learning_rate": 4.899751552529306e-05, + "loss": 0.0811, + "num_input_tokens_seen": 6960096, + "step": 32980 + }, + { + "epoch": 3.628712871287129, + "grad_norm": 0.066465362906456, + "learning_rate": 4.8996842578512444e-05, + "loss": 0.0799, + "num_input_tokens_seen": 6961120, + "step": 32985 + }, + { + "epoch": 3.629262926292629, + "grad_norm": 0.054715052247047424, + "learning_rate": 4.8996169410564444e-05, + "loss": 0.0685, + "num_input_tokens_seen": 6962144, + "step": 32990 + }, + { + "epoch": 3.6298129812981297, + "grad_norm": 0.06941426545381546, + "learning_rate": 4.899549602145527e-05, + "loss": 0.0497, + "num_input_tokens_seen": 6963264, + "step": 32995 + }, + { + "epoch": 3.6303630363036303, + "grad_norm": 0.10031745582818985, + "learning_rate": 4.899482241119111e-05, + "loss": 0.012, + "num_input_tokens_seen": 6964320, + "step": 33000 + }, + { + "epoch": 3.630913091309131, + "grad_norm": 0.08049843460321426, + "learning_rate": 4.89941485797782e-05, + "loss": 0.1509, + "num_input_tokens_seen": 6965376, + "step": 33005 + }, + { + "epoch": 3.6314631463146316, + "grad_norm": 0.10050390660762787, + "learning_rate": 4.899347452722273e-05, + "loss": 0.0784, + "num_input_tokens_seen": 6966400, + "step": 33010 + }, + { + "epoch": 3.632013201320132, + "grad_norm": 0.196999654173851, + "learning_rate": 4.8992800253530916e-05, + "loss": 0.1205, + "num_input_tokens_seen": 6967424, + "step": 33015 + }, + { + "epoch": 3.6325632563256325, + "grad_norm": 1.1213338375091553, + "learning_rate": 4.899212575870898e-05, + "loss": 0.1238, + "num_input_tokens_seen": 6968416, + "step": 33020 + }, + { + "epoch": 3.633113311331133, + "grad_norm": 0.24993862211704254, + "learning_rate": 4.8991451042763135e-05, + "loss": 0.1189, + "num_input_tokens_seen": 6969408, + "step": 33025 + }, + { + "epoch": 3.633663366336634, + "grad_norm": 1.0051276683807373, + "learning_rate": 4.8990776105699594e-05, + "loss": 0.052, + "num_input_tokens_seen": 6970464, + "step": 33030 + }, + { + "epoch": 3.6342134213421344, + "grad_norm": 0.14407584071159363, + "learning_rate": 4.899010094752458e-05, + "loss": 0.0723, + "num_input_tokens_seen": 6971552, + "step": 33035 + }, + { + "epoch": 3.6347634763476346, + "grad_norm": 0.3334486186504364, + "learning_rate": 4.8989425568244315e-05, + "loss": 0.1005, + "num_input_tokens_seen": 6972640, + "step": 33040 + }, + { + "epoch": 3.6353135313531353, + "grad_norm": 1.6341255903244019, + "learning_rate": 4.898874996786503e-05, + "loss": 0.088, + "num_input_tokens_seen": 6973792, + "step": 33045 + }, + { + "epoch": 3.635863586358636, + "grad_norm": 0.5586223602294922, + "learning_rate": 4.898807414639296e-05, + "loss": 0.1703, + "num_input_tokens_seen": 6974848, + "step": 33050 + }, + { + "epoch": 3.636413641364136, + "grad_norm": 0.3331091105937958, + "learning_rate": 4.8987398103834305e-05, + "loss": 0.0351, + "num_input_tokens_seen": 6975904, + "step": 33055 + }, + { + "epoch": 3.6369636963696372, + "grad_norm": 0.2876132130622864, + "learning_rate": 4.898672184019532e-05, + "loss": 0.0681, + "num_input_tokens_seen": 6977056, + "step": 33060 + }, + { + "epoch": 3.6375137513751374, + "grad_norm": 0.15470242500305176, + "learning_rate": 4.8986045355482216e-05, + "loss": 0.0399, + "num_input_tokens_seen": 6978048, + "step": 33065 + }, + { + "epoch": 3.638063806380638, + "grad_norm": 0.038879137486219406, + "learning_rate": 4.898536864970126e-05, + "loss": 0.026, + "num_input_tokens_seen": 6979136, + "step": 33070 + }, + { + "epoch": 3.6386138613861387, + "grad_norm": 0.11698536574840546, + "learning_rate": 4.898469172285866e-05, + "loss": 0.0523, + "num_input_tokens_seen": 6980224, + "step": 33075 + }, + { + "epoch": 3.639163916391639, + "grad_norm": 0.44590574502944946, + "learning_rate": 4.898401457496066e-05, + "loss": 0.0552, + "num_input_tokens_seen": 6981312, + "step": 33080 + }, + { + "epoch": 3.6397139713971396, + "grad_norm": 0.07926750183105469, + "learning_rate": 4.898333720601352e-05, + "loss": 0.0439, + "num_input_tokens_seen": 6982368, + "step": 33085 + }, + { + "epoch": 3.6402640264026402, + "grad_norm": 0.8219365477561951, + "learning_rate": 4.898265961602346e-05, + "loss": 0.1328, + "num_input_tokens_seen": 6983456, + "step": 33090 + }, + { + "epoch": 3.640814081408141, + "grad_norm": 1.746597409248352, + "learning_rate": 4.8981981804996736e-05, + "loss": 0.0945, + "num_input_tokens_seen": 6984544, + "step": 33095 + }, + { + "epoch": 3.6413641364136415, + "grad_norm": 0.15487095713615417, + "learning_rate": 4.8981303772939594e-05, + "loss": 0.07, + "num_input_tokens_seen": 6985600, + "step": 33100 + }, + { + "epoch": 3.6419141914191417, + "grad_norm": 0.24830040335655212, + "learning_rate": 4.8980625519858284e-05, + "loss": 0.0483, + "num_input_tokens_seen": 6986624, + "step": 33105 + }, + { + "epoch": 3.6424642464246424, + "grad_norm": 1.0765515565872192, + "learning_rate": 4.8979947045759055e-05, + "loss": 0.0731, + "num_input_tokens_seen": 6987648, + "step": 33110 + }, + { + "epoch": 3.643014301430143, + "grad_norm": 0.7595534324645996, + "learning_rate": 4.897926835064816e-05, + "loss": 0.0638, + "num_input_tokens_seen": 6988672, + "step": 33115 + }, + { + "epoch": 3.6435643564356437, + "grad_norm": 1.0733476877212524, + "learning_rate": 4.8978589434531855e-05, + "loss": 0.069, + "num_input_tokens_seen": 6989696, + "step": 33120 + }, + { + "epoch": 3.6441144114411443, + "grad_norm": 0.04528972506523132, + "learning_rate": 4.897791029741639e-05, + "loss": 0.0876, + "num_input_tokens_seen": 6990720, + "step": 33125 + }, + { + "epoch": 3.6446644664466445, + "grad_norm": 0.526086688041687, + "learning_rate": 4.897723093930805e-05, + "loss": 0.0552, + "num_input_tokens_seen": 6991744, + "step": 33130 + }, + { + "epoch": 3.645214521452145, + "grad_norm": 0.8607520461082458, + "learning_rate": 4.897655136021306e-05, + "loss": 0.0572, + "num_input_tokens_seen": 6992768, + "step": 33135 + }, + { + "epoch": 3.645764576457646, + "grad_norm": 0.07015441358089447, + "learning_rate": 4.897587156013771e-05, + "loss": 0.1424, + "num_input_tokens_seen": 6993760, + "step": 33140 + }, + { + "epoch": 3.646314631463146, + "grad_norm": 0.11907370388507843, + "learning_rate": 4.897519153908826e-05, + "loss": 0.0465, + "num_input_tokens_seen": 6994816, + "step": 33145 + }, + { + "epoch": 3.6468646864686467, + "grad_norm": 0.18096385896205902, + "learning_rate": 4.897451129707096e-05, + "loss": 0.068, + "num_input_tokens_seen": 6995872, + "step": 33150 + }, + { + "epoch": 3.6474147414741473, + "grad_norm": 0.0825629010796547, + "learning_rate": 4.8973830834092104e-05, + "loss": 0.0158, + "num_input_tokens_seen": 6996928, + "step": 33155 + }, + { + "epoch": 3.647964796479648, + "grad_norm": 0.08531370013952255, + "learning_rate": 4.897315015015795e-05, + "loss": 0.0142, + "num_input_tokens_seen": 6998016, + "step": 33160 + }, + { + "epoch": 3.6485148514851486, + "grad_norm": 0.26866284012794495, + "learning_rate": 4.8972469245274776e-05, + "loss": 0.112, + "num_input_tokens_seen": 6999104, + "step": 33165 + }, + { + "epoch": 3.649064906490649, + "grad_norm": 0.0668950155377388, + "learning_rate": 4.8971788119448855e-05, + "loss": 0.0144, + "num_input_tokens_seen": 7000160, + "step": 33170 + }, + { + "epoch": 3.6496149614961495, + "grad_norm": 0.05708944797515869, + "learning_rate": 4.897110677268647e-05, + "loss": 0.0376, + "num_input_tokens_seen": 7001216, + "step": 33175 + }, + { + "epoch": 3.65016501650165, + "grad_norm": 0.03397534415125847, + "learning_rate": 4.897042520499389e-05, + "loss": 0.0144, + "num_input_tokens_seen": 7002272, + "step": 33180 + }, + { + "epoch": 3.650715071507151, + "grad_norm": 0.05995670706033707, + "learning_rate": 4.8969743416377406e-05, + "loss": 0.1168, + "num_input_tokens_seen": 7003328, + "step": 33185 + }, + { + "epoch": 3.6512651265126514, + "grad_norm": 0.8088381886482239, + "learning_rate": 4.89690614068433e-05, + "loss": 0.0839, + "num_input_tokens_seen": 7004384, + "step": 33190 + }, + { + "epoch": 3.6518151815181517, + "grad_norm": 1.1213762760162354, + "learning_rate": 4.8968379176397857e-05, + "loss": 0.1279, + "num_input_tokens_seen": 7005376, + "step": 33195 + }, + { + "epoch": 3.6523652365236523, + "grad_norm": 0.024122927337884903, + "learning_rate": 4.896769672504736e-05, + "loss": 0.0186, + "num_input_tokens_seen": 7006432, + "step": 33200 + }, + { + "epoch": 3.652915291529153, + "grad_norm": 0.043950699269771576, + "learning_rate": 4.8967014052798104e-05, + "loss": 0.013, + "num_input_tokens_seen": 7007456, + "step": 33205 + }, + { + "epoch": 3.6534653465346536, + "grad_norm": 0.04784403741359711, + "learning_rate": 4.8966331159656384e-05, + "loss": 0.0781, + "num_input_tokens_seen": 7008480, + "step": 33210 + }, + { + "epoch": 3.6540154015401543, + "grad_norm": 0.12299997359514236, + "learning_rate": 4.896564804562849e-05, + "loss": 0.0571, + "num_input_tokens_seen": 7009504, + "step": 33215 + }, + { + "epoch": 3.6545654565456545, + "grad_norm": 0.6056292057037354, + "learning_rate": 4.8964964710720705e-05, + "loss": 0.0595, + "num_input_tokens_seen": 7010560, + "step": 33220 + }, + { + "epoch": 3.655115511551155, + "grad_norm": 0.03670322522521019, + "learning_rate": 4.896428115493935e-05, + "loss": 0.0445, + "num_input_tokens_seen": 7011648, + "step": 33225 + }, + { + "epoch": 3.6556655665566558, + "grad_norm": 0.9482324719429016, + "learning_rate": 4.896359737829071e-05, + "loss": 0.0398, + "num_input_tokens_seen": 7012672, + "step": 33230 + }, + { + "epoch": 3.656215621562156, + "grad_norm": 0.22786125540733337, + "learning_rate": 4.896291338078111e-05, + "loss": 0.0683, + "num_input_tokens_seen": 7013664, + "step": 33235 + }, + { + "epoch": 3.6567656765676566, + "grad_norm": 0.11428256332874298, + "learning_rate": 4.8962229162416816e-05, + "loss": 0.0295, + "num_input_tokens_seen": 7014720, + "step": 33240 + }, + { + "epoch": 3.6573157315731573, + "grad_norm": 0.1238178014755249, + "learning_rate": 4.896154472320416e-05, + "loss": 0.0171, + "num_input_tokens_seen": 7015712, + "step": 33245 + }, + { + "epoch": 3.657865786578658, + "grad_norm": 0.12478868663311005, + "learning_rate": 4.896086006314944e-05, + "loss": 0.0378, + "num_input_tokens_seen": 7016736, + "step": 33250 + }, + { + "epoch": 3.6584158415841586, + "grad_norm": 0.026982959359884262, + "learning_rate": 4.8960175182258984e-05, + "loss": 0.0171, + "num_input_tokens_seen": 7017728, + "step": 33255 + }, + { + "epoch": 3.6589658965896588, + "grad_norm": 0.1164412871003151, + "learning_rate": 4.8959490080539074e-05, + "loss": 0.0601, + "num_input_tokens_seen": 7018784, + "step": 33260 + }, + { + "epoch": 3.6595159515951594, + "grad_norm": 0.3007236123085022, + "learning_rate": 4.895880475799605e-05, + "loss": 0.071, + "num_input_tokens_seen": 7019840, + "step": 33265 + }, + { + "epoch": 3.66006600660066, + "grad_norm": 0.043979790061712265, + "learning_rate": 4.895811921463622e-05, + "loss": 0.0114, + "num_input_tokens_seen": 7020896, + "step": 33270 + }, + { + "epoch": 3.6606160616061607, + "grad_norm": 0.1799388825893402, + "learning_rate": 4.8957433450465894e-05, + "loss": 0.0218, + "num_input_tokens_seen": 7021984, + "step": 33275 + }, + { + "epoch": 3.6611661166116614, + "grad_norm": 0.21685117483139038, + "learning_rate": 4.89567474654914e-05, + "loss": 0.0695, + "num_input_tokens_seen": 7023104, + "step": 33280 + }, + { + "epoch": 3.6617161716171616, + "grad_norm": 0.057577189058065414, + "learning_rate": 4.895606125971907e-05, + "loss": 0.0469, + "num_input_tokens_seen": 7024224, + "step": 33285 + }, + { + "epoch": 3.662266226622662, + "grad_norm": 0.8212693333625793, + "learning_rate": 4.895537483315521e-05, + "loss": 0.0864, + "num_input_tokens_seen": 7025280, + "step": 33290 + }, + { + "epoch": 3.662816281628163, + "grad_norm": 1.2564890384674072, + "learning_rate": 4.8954688185806165e-05, + "loss": 0.0763, + "num_input_tokens_seen": 7026304, + "step": 33295 + }, + { + "epoch": 3.6633663366336635, + "grad_norm": 2.0717151165008545, + "learning_rate": 4.8954001317678245e-05, + "loss": 0.1141, + "num_input_tokens_seen": 7027392, + "step": 33300 + }, + { + "epoch": 3.663916391639164, + "grad_norm": 0.3821021020412445, + "learning_rate": 4.8953314228777795e-05, + "loss": 0.0453, + "num_input_tokens_seen": 7028512, + "step": 33305 + }, + { + "epoch": 3.6644664466446644, + "grad_norm": 0.008950884453952312, + "learning_rate": 4.8952626919111135e-05, + "loss": 0.0778, + "num_input_tokens_seen": 7029568, + "step": 33310 + }, + { + "epoch": 3.665016501650165, + "grad_norm": 0.29773831367492676, + "learning_rate": 4.89519393886846e-05, + "loss": 0.0872, + "num_input_tokens_seen": 7030592, + "step": 33315 + }, + { + "epoch": 3.6655665566556657, + "grad_norm": 0.03583782538771629, + "learning_rate": 4.895125163750455e-05, + "loss": 0.1477, + "num_input_tokens_seen": 7031712, + "step": 33320 + }, + { + "epoch": 3.666116611661166, + "grad_norm": 0.018529096618294716, + "learning_rate": 4.895056366557729e-05, + "loss": 0.0756, + "num_input_tokens_seen": 7032768, + "step": 33325 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.32194262742996216, + "learning_rate": 4.894987547290919e-05, + "loss": 0.0609, + "num_input_tokens_seen": 7033824, + "step": 33330 + }, + { + "epoch": 3.667216721672167, + "grad_norm": 0.06709836423397064, + "learning_rate": 4.894918705950657e-05, + "loss": 0.0312, + "num_input_tokens_seen": 7034880, + "step": 33335 + }, + { + "epoch": 3.667766776677668, + "grad_norm": 0.38020825386047363, + "learning_rate": 4.89484984253758e-05, + "loss": 0.0446, + "num_input_tokens_seen": 7035840, + "step": 33340 + }, + { + "epoch": 3.6683168316831685, + "grad_norm": 0.2005992978811264, + "learning_rate": 4.894780957052319e-05, + "loss": 0.0399, + "num_input_tokens_seen": 7036960, + "step": 33345 + }, + { + "epoch": 3.6688668866886687, + "grad_norm": 0.08603842556476593, + "learning_rate": 4.894712049495513e-05, + "loss": 0.0393, + "num_input_tokens_seen": 7038080, + "step": 33350 + }, + { + "epoch": 3.6694169416941693, + "grad_norm": 0.386897474527359, + "learning_rate": 4.894643119867794e-05, + "loss": 0.0339, + "num_input_tokens_seen": 7039200, + "step": 33355 + }, + { + "epoch": 3.66996699669967, + "grad_norm": 0.7479109168052673, + "learning_rate": 4.8945741681697984e-05, + "loss": 0.0425, + "num_input_tokens_seen": 7040288, + "step": 33360 + }, + { + "epoch": 3.6705170517051706, + "grad_norm": 0.039687931537628174, + "learning_rate": 4.894505194402163e-05, + "loss": 0.069, + "num_input_tokens_seen": 7041408, + "step": 33365 + }, + { + "epoch": 3.6710671067106713, + "grad_norm": 1.8478015661239624, + "learning_rate": 4.8944361985655214e-05, + "loss": 0.1241, + "num_input_tokens_seen": 7042464, + "step": 33370 + }, + { + "epoch": 3.6716171617161715, + "grad_norm": 0.9292958378791809, + "learning_rate": 4.89436718066051e-05, + "loss": 0.0751, + "num_input_tokens_seen": 7043424, + "step": 33375 + }, + { + "epoch": 3.672167216721672, + "grad_norm": 0.5854099988937378, + "learning_rate": 4.894298140687765e-05, + "loss": 0.087, + "num_input_tokens_seen": 7044480, + "step": 33380 + }, + { + "epoch": 3.6727172717271728, + "grad_norm": 0.3579900562763214, + "learning_rate": 4.894229078647924e-05, + "loss": 0.0873, + "num_input_tokens_seen": 7045472, + "step": 33385 + }, + { + "epoch": 3.6732673267326734, + "grad_norm": 0.018907519057393074, + "learning_rate": 4.8941599945416213e-05, + "loss": 0.0431, + "num_input_tokens_seen": 7046496, + "step": 33390 + }, + { + "epoch": 3.673817381738174, + "grad_norm": 1.1892203092575073, + "learning_rate": 4.894090888369496e-05, + "loss": 0.0815, + "num_input_tokens_seen": 7047584, + "step": 33395 + }, + { + "epoch": 3.6743674367436743, + "grad_norm": 0.5863766670227051, + "learning_rate": 4.8940217601321824e-05, + "loss": 0.0248, + "num_input_tokens_seen": 7048672, + "step": 33400 + }, + { + "epoch": 3.674917491749175, + "grad_norm": 1.2199379205703735, + "learning_rate": 4.89395260983032e-05, + "loss": 0.072, + "num_input_tokens_seen": 7049664, + "step": 33405 + }, + { + "epoch": 3.6754675467546756, + "grad_norm": 0.6734784245491028, + "learning_rate": 4.893883437464545e-05, + "loss": 0.0583, + "num_input_tokens_seen": 7050688, + "step": 33410 + }, + { + "epoch": 3.676017601760176, + "grad_norm": 0.15628266334533691, + "learning_rate": 4.893814243035495e-05, + "loss": 0.0152, + "num_input_tokens_seen": 7051712, + "step": 33415 + }, + { + "epoch": 3.6765676567656764, + "grad_norm": 0.5051697492599487, + "learning_rate": 4.8937450265438076e-05, + "loss": 0.0487, + "num_input_tokens_seen": 7052736, + "step": 33420 + }, + { + "epoch": 3.677117711771177, + "grad_norm": 0.4464503526687622, + "learning_rate": 4.8936757879901205e-05, + "loss": 0.0226, + "num_input_tokens_seen": 7053824, + "step": 33425 + }, + { + "epoch": 3.6776677667766777, + "grad_norm": 0.09241299331188202, + "learning_rate": 4.893606527375073e-05, + "loss": 0.051, + "num_input_tokens_seen": 7054880, + "step": 33430 + }, + { + "epoch": 3.6782178217821784, + "grad_norm": 0.017089208588004112, + "learning_rate": 4.8935372446993024e-05, + "loss": 0.0697, + "num_input_tokens_seen": 7055872, + "step": 33435 + }, + { + "epoch": 3.6787678767876786, + "grad_norm": 0.7584157586097717, + "learning_rate": 4.8934679399634475e-05, + "loss": 0.0784, + "num_input_tokens_seen": 7056992, + "step": 33440 + }, + { + "epoch": 3.6793179317931792, + "grad_norm": 0.03409041091799736, + "learning_rate": 4.8933986131681474e-05, + "loss": 0.0292, + "num_input_tokens_seen": 7057984, + "step": 33445 + }, + { + "epoch": 3.67986798679868, + "grad_norm": 0.06323611736297607, + "learning_rate": 4.893329264314041e-05, + "loss": 0.1198, + "num_input_tokens_seen": 7059008, + "step": 33450 + }, + { + "epoch": 3.6804180418041805, + "grad_norm": 1.0054229497909546, + "learning_rate": 4.893259893401766e-05, + "loss": 0.1248, + "num_input_tokens_seen": 7060064, + "step": 33455 + }, + { + "epoch": 3.680968096809681, + "grad_norm": 0.8392528295516968, + "learning_rate": 4.893190500431964e-05, + "loss": 0.0432, + "num_input_tokens_seen": 7061152, + "step": 33460 + }, + { + "epoch": 3.6815181518151814, + "grad_norm": 1.119094967842102, + "learning_rate": 4.8931210854052734e-05, + "loss": 0.1099, + "num_input_tokens_seen": 7062272, + "step": 33465 + }, + { + "epoch": 3.682068206820682, + "grad_norm": 0.5646344423294067, + "learning_rate": 4.893051648322334e-05, + "loss": 0.0791, + "num_input_tokens_seen": 7063360, + "step": 33470 + }, + { + "epoch": 3.6826182618261827, + "grad_norm": 0.03858402371406555, + "learning_rate": 4.892982189183786e-05, + "loss": 0.024, + "num_input_tokens_seen": 7064448, + "step": 33475 + }, + { + "epoch": 3.6831683168316833, + "grad_norm": 0.45997074246406555, + "learning_rate": 4.8929127079902684e-05, + "loss": 0.0349, + "num_input_tokens_seen": 7065408, + "step": 33480 + }, + { + "epoch": 3.683718371837184, + "grad_norm": 0.08641098439693451, + "learning_rate": 4.892843204742424e-05, + "loss": 0.0363, + "num_input_tokens_seen": 7066400, + "step": 33485 + }, + { + "epoch": 3.684268426842684, + "grad_norm": 0.15412040054798126, + "learning_rate": 4.8927736794408905e-05, + "loss": 0.1087, + "num_input_tokens_seen": 7067520, + "step": 33490 + }, + { + "epoch": 3.684818481848185, + "grad_norm": 0.05259016528725624, + "learning_rate": 4.892704132086312e-05, + "loss": 0.0347, + "num_input_tokens_seen": 7068512, + "step": 33495 + }, + { + "epoch": 3.6853685368536855, + "grad_norm": 0.6662054657936096, + "learning_rate": 4.8926345626793264e-05, + "loss": 0.0862, + "num_input_tokens_seen": 7069568, + "step": 33500 + }, + { + "epoch": 3.6859185918591857, + "grad_norm": 0.29926860332489014, + "learning_rate": 4.8925649712205765e-05, + "loss": 0.0393, + "num_input_tokens_seen": 7070528, + "step": 33505 + }, + { + "epoch": 3.6864686468646863, + "grad_norm": 0.11215189844369888, + "learning_rate": 4.892495357710703e-05, + "loss": 0.055, + "num_input_tokens_seen": 7071520, + "step": 33510 + }, + { + "epoch": 3.687018701870187, + "grad_norm": 0.6835209131240845, + "learning_rate": 4.892425722150348e-05, + "loss": 0.0752, + "num_input_tokens_seen": 7072640, + "step": 33515 + }, + { + "epoch": 3.6875687568756876, + "grad_norm": 0.8444454669952393, + "learning_rate": 4.8923560645401536e-05, + "loss": 0.111, + "num_input_tokens_seen": 7073728, + "step": 33520 + }, + { + "epoch": 3.6881188118811883, + "grad_norm": 0.19806750118732452, + "learning_rate": 4.892286384880761e-05, + "loss": 0.0283, + "num_input_tokens_seen": 7074752, + "step": 33525 + }, + { + "epoch": 3.6886688668866885, + "grad_norm": 0.28842636942863464, + "learning_rate": 4.892216683172812e-05, + "loss": 0.0098, + "num_input_tokens_seen": 7075776, + "step": 33530 + }, + { + "epoch": 3.689218921892189, + "grad_norm": 0.2833820581436157, + "learning_rate": 4.892146959416951e-05, + "loss": 0.0201, + "num_input_tokens_seen": 7076896, + "step": 33535 + }, + { + "epoch": 3.68976897689769, + "grad_norm": 0.06174285337328911, + "learning_rate": 4.892077213613818e-05, + "loss": 0.0352, + "num_input_tokens_seen": 7077920, + "step": 33540 + }, + { + "epoch": 3.6903190319031904, + "grad_norm": 0.5796499252319336, + "learning_rate": 4.892007445764058e-05, + "loss": 0.0653, + "num_input_tokens_seen": 7078944, + "step": 33545 + }, + { + "epoch": 3.690869086908691, + "grad_norm": 0.058436088263988495, + "learning_rate": 4.891937655868313e-05, + "loss": 0.0308, + "num_input_tokens_seen": 7080000, + "step": 33550 + }, + { + "epoch": 3.6914191419141913, + "grad_norm": 0.7844566106796265, + "learning_rate": 4.8918678439272265e-05, + "loss": 0.0476, + "num_input_tokens_seen": 7080992, + "step": 33555 + }, + { + "epoch": 3.691969196919692, + "grad_norm": 0.7172713875770569, + "learning_rate": 4.891798009941442e-05, + "loss": 0.1007, + "num_input_tokens_seen": 7082048, + "step": 33560 + }, + { + "epoch": 3.6925192519251926, + "grad_norm": 0.4110770523548126, + "learning_rate": 4.891728153911602e-05, + "loss": 0.0813, + "num_input_tokens_seen": 7083136, + "step": 33565 + }, + { + "epoch": 3.693069306930693, + "grad_norm": 0.08555939793586731, + "learning_rate": 4.8916582758383525e-05, + "loss": 0.0972, + "num_input_tokens_seen": 7084160, + "step": 33570 + }, + { + "epoch": 3.693619361936194, + "grad_norm": 0.030014101415872574, + "learning_rate": 4.891588375722335e-05, + "loss": 0.0025, + "num_input_tokens_seen": 7085184, + "step": 33575 + }, + { + "epoch": 3.694169416941694, + "grad_norm": 0.007884783670306206, + "learning_rate": 4.8915184535641964e-05, + "loss": 0.0151, + "num_input_tokens_seen": 7086240, + "step": 33580 + }, + { + "epoch": 3.6947194719471947, + "grad_norm": 0.6666883230209351, + "learning_rate": 4.891448509364579e-05, + "loss": 0.0293, + "num_input_tokens_seen": 7087232, + "step": 33585 + }, + { + "epoch": 3.6952695269526954, + "grad_norm": 0.04720267280936241, + "learning_rate": 4.8913785431241283e-05, + "loss": 0.0933, + "num_input_tokens_seen": 7088352, + "step": 33590 + }, + { + "epoch": 3.6958195819581956, + "grad_norm": 0.2533341348171234, + "learning_rate": 4.891308554843489e-05, + "loss": 0.0799, + "num_input_tokens_seen": 7089312, + "step": 33595 + }, + { + "epoch": 3.6963696369636962, + "grad_norm": 0.15422439575195312, + "learning_rate": 4.8912385445233065e-05, + "loss": 0.0118, + "num_input_tokens_seen": 7090272, + "step": 33600 + }, + { + "epoch": 3.696919691969197, + "grad_norm": 0.050577450543642044, + "learning_rate": 4.891168512164225e-05, + "loss": 0.014, + "num_input_tokens_seen": 7091424, + "step": 33605 + }, + { + "epoch": 3.6974697469746975, + "grad_norm": 0.03223821520805359, + "learning_rate": 4.891098457766892e-05, + "loss": 0.0595, + "num_input_tokens_seen": 7092512, + "step": 33610 + }, + { + "epoch": 3.698019801980198, + "grad_norm": 0.1405160129070282, + "learning_rate": 4.891028381331951e-05, + "loss": 0.0347, + "num_input_tokens_seen": 7093536, + "step": 33615 + }, + { + "epoch": 3.6985698569856984, + "grad_norm": 0.05022304132580757, + "learning_rate": 4.890958282860049e-05, + "loss": 0.0394, + "num_input_tokens_seen": 7094560, + "step": 33620 + }, + { + "epoch": 3.699119911991199, + "grad_norm": 0.16389577090740204, + "learning_rate": 4.890888162351831e-05, + "loss": 0.0307, + "num_input_tokens_seen": 7095616, + "step": 33625 + }, + { + "epoch": 3.6996699669966997, + "grad_norm": 1.3050614595413208, + "learning_rate": 4.890818019807945e-05, + "loss": 0.0197, + "num_input_tokens_seen": 7096704, + "step": 33630 + }, + { + "epoch": 3.7002200220022003, + "grad_norm": 0.40364208817481995, + "learning_rate": 4.890747855229036e-05, + "loss": 0.0802, + "num_input_tokens_seen": 7097856, + "step": 33635 + }, + { + "epoch": 3.700770077007701, + "grad_norm": 0.5266609787940979, + "learning_rate": 4.8906776686157516e-05, + "loss": 0.1205, + "num_input_tokens_seen": 7098944, + "step": 33640 + }, + { + "epoch": 3.701320132013201, + "grad_norm": 0.02950800396502018, + "learning_rate": 4.8906074599687385e-05, + "loss": 0.0244, + "num_input_tokens_seen": 7100000, + "step": 33645 + }, + { + "epoch": 3.701870187018702, + "grad_norm": 0.050614241510629654, + "learning_rate": 4.8905372292886434e-05, + "loss": 0.0193, + "num_input_tokens_seen": 7101024, + "step": 33650 + }, + { + "epoch": 3.7024202420242025, + "grad_norm": 1.7402379512786865, + "learning_rate": 4.8904669765761133e-05, + "loss": 0.1252, + "num_input_tokens_seen": 7102080, + "step": 33655 + }, + { + "epoch": 3.7029702970297027, + "grad_norm": 0.829416036605835, + "learning_rate": 4.890396701831796e-05, + "loss": 0.0799, + "num_input_tokens_seen": 7103104, + "step": 33660 + }, + { + "epoch": 3.7035203520352034, + "grad_norm": 1.06447434425354, + "learning_rate": 4.89032640505634e-05, + "loss": 0.1165, + "num_input_tokens_seen": 7104224, + "step": 33665 + }, + { + "epoch": 3.704070407040704, + "grad_norm": 0.36669087409973145, + "learning_rate": 4.890256086250393e-05, + "loss": 0.0681, + "num_input_tokens_seen": 7105216, + "step": 33670 + }, + { + "epoch": 3.7046204620462047, + "grad_norm": 0.08791103959083557, + "learning_rate": 4.8901857454146014e-05, + "loss": 0.123, + "num_input_tokens_seen": 7106272, + "step": 33675 + }, + { + "epoch": 3.7051705170517053, + "grad_norm": 0.4272569715976715, + "learning_rate": 4.8901153825496155e-05, + "loss": 0.1475, + "num_input_tokens_seen": 7107264, + "step": 33680 + }, + { + "epoch": 3.7057205720572055, + "grad_norm": 0.035786405205726624, + "learning_rate": 4.890044997656082e-05, + "loss": 0.0279, + "num_input_tokens_seen": 7108320, + "step": 33685 + }, + { + "epoch": 3.706270627062706, + "grad_norm": 0.5915351510047913, + "learning_rate": 4.8899745907346516e-05, + "loss": 0.0679, + "num_input_tokens_seen": 7109376, + "step": 33690 + }, + { + "epoch": 3.706820682068207, + "grad_norm": 0.10687559098005295, + "learning_rate": 4.889904161785972e-05, + "loss": 0.0709, + "num_input_tokens_seen": 7110464, + "step": 33695 + }, + { + "epoch": 3.7073707370737075, + "grad_norm": 1.0186488628387451, + "learning_rate": 4.889833710810693e-05, + "loss": 0.0285, + "num_input_tokens_seen": 7111520, + "step": 33700 + }, + { + "epoch": 3.707920792079208, + "grad_norm": 1.0714662075042725, + "learning_rate": 4.889763237809463e-05, + "loss": 0.0952, + "num_input_tokens_seen": 7112544, + "step": 33705 + }, + { + "epoch": 3.7084708470847083, + "grad_norm": 0.008102472871541977, + "learning_rate": 4.889692742782932e-05, + "loss": 0.0199, + "num_input_tokens_seen": 7113632, + "step": 33710 + }, + { + "epoch": 3.709020902090209, + "grad_norm": 0.04853817820549011, + "learning_rate": 4.889622225731749e-05, + "loss": 0.0413, + "num_input_tokens_seen": 7114720, + "step": 33715 + }, + { + "epoch": 3.7095709570957096, + "grad_norm": 0.8688043355941772, + "learning_rate": 4.889551686656565e-05, + "loss": 0.0606, + "num_input_tokens_seen": 7115744, + "step": 33720 + }, + { + "epoch": 3.7101210121012103, + "grad_norm": 0.012316928245127201, + "learning_rate": 4.88948112555803e-05, + "loss": 0.0618, + "num_input_tokens_seen": 7116800, + "step": 33725 + }, + { + "epoch": 3.710671067106711, + "grad_norm": 0.8946797847747803, + "learning_rate": 4.889410542436794e-05, + "loss": 0.0556, + "num_input_tokens_seen": 7117824, + "step": 33730 + }, + { + "epoch": 3.711221122112211, + "grad_norm": 0.03239269182085991, + "learning_rate": 4.889339937293508e-05, + "loss": 0.0134, + "num_input_tokens_seen": 7118880, + "step": 33735 + }, + { + "epoch": 3.7117711771177118, + "grad_norm": 0.20448754727840424, + "learning_rate": 4.8892693101288215e-05, + "loss": 0.0515, + "num_input_tokens_seen": 7119936, + "step": 33740 + }, + { + "epoch": 3.7123212321232124, + "grad_norm": 0.12393301725387573, + "learning_rate": 4.8891986609433863e-05, + "loss": 0.1094, + "num_input_tokens_seen": 7121056, + "step": 33745 + }, + { + "epoch": 3.7128712871287126, + "grad_norm": 0.0362284891307354, + "learning_rate": 4.889127989737854e-05, + "loss": 0.0608, + "num_input_tokens_seen": 7122080, + "step": 33750 + }, + { + "epoch": 3.7134213421342133, + "grad_norm": 0.14362110197544098, + "learning_rate": 4.889057296512875e-05, + "loss": 0.0629, + "num_input_tokens_seen": 7123072, + "step": 33755 + }, + { + "epoch": 3.713971397139714, + "grad_norm": 0.19043180346488953, + "learning_rate": 4.888986581269102e-05, + "loss": 0.0178, + "num_input_tokens_seen": 7124096, + "step": 33760 + }, + { + "epoch": 3.7145214521452146, + "grad_norm": 0.16020794212818146, + "learning_rate": 4.888915844007185e-05, + "loss": 0.0672, + "num_input_tokens_seen": 7125184, + "step": 33765 + }, + { + "epoch": 3.715071507150715, + "grad_norm": 0.08825179934501648, + "learning_rate": 4.8888450847277775e-05, + "loss": 0.0331, + "num_input_tokens_seen": 7126240, + "step": 33770 + }, + { + "epoch": 3.7156215621562154, + "grad_norm": 1.4612241983413696, + "learning_rate": 4.888774303431531e-05, + "loss": 0.0679, + "num_input_tokens_seen": 7127296, + "step": 33775 + }, + { + "epoch": 3.716171617161716, + "grad_norm": 1.463916540145874, + "learning_rate": 4.8887035001190986e-05, + "loss": 0.1331, + "num_input_tokens_seen": 7128352, + "step": 33780 + }, + { + "epoch": 3.7167216721672167, + "grad_norm": 1.2171614170074463, + "learning_rate": 4.888632674791131e-05, + "loss": 0.0846, + "num_input_tokens_seen": 7129376, + "step": 33785 + }, + { + "epoch": 3.7172717271727174, + "grad_norm": 0.052787601947784424, + "learning_rate": 4.888561827448284e-05, + "loss": 0.0202, + "num_input_tokens_seen": 7130432, + "step": 33790 + }, + { + "epoch": 3.717821782178218, + "grad_norm": 0.5453888773918152, + "learning_rate": 4.8884909580912076e-05, + "loss": 0.109, + "num_input_tokens_seen": 7131520, + "step": 33795 + }, + { + "epoch": 3.718371837183718, + "grad_norm": 0.4636537730693817, + "learning_rate": 4.888420066720556e-05, + "loss": 0.047, + "num_input_tokens_seen": 7132608, + "step": 33800 + }, + { + "epoch": 3.718921892189219, + "grad_norm": 0.14467774331569672, + "learning_rate": 4.888349153336983e-05, + "loss": 0.0198, + "num_input_tokens_seen": 7133568, + "step": 33805 + }, + { + "epoch": 3.7194719471947195, + "grad_norm": 1.1557769775390625, + "learning_rate": 4.888278217941142e-05, + "loss": 0.1172, + "num_input_tokens_seen": 7134592, + "step": 33810 + }, + { + "epoch": 3.72002200220022, + "grad_norm": 1.2197778224945068, + "learning_rate": 4.888207260533687e-05, + "loss": 0.0366, + "num_input_tokens_seen": 7135584, + "step": 33815 + }, + { + "epoch": 3.720572057205721, + "grad_norm": 0.11896444857120514, + "learning_rate": 4.888136281115271e-05, + "loss": 0.0907, + "num_input_tokens_seen": 7136640, + "step": 33820 + }, + { + "epoch": 3.721122112211221, + "grad_norm": 0.09934855997562408, + "learning_rate": 4.8880652796865486e-05, + "loss": 0.0251, + "num_input_tokens_seen": 7137696, + "step": 33825 + }, + { + "epoch": 3.7216721672167217, + "grad_norm": 0.06753313541412354, + "learning_rate": 4.887994256248175e-05, + "loss": 0.0076, + "num_input_tokens_seen": 7138752, + "step": 33830 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.05116835981607437, + "learning_rate": 4.887923210800804e-05, + "loss": 0.0293, + "num_input_tokens_seen": 7139840, + "step": 33835 + }, + { + "epoch": 3.7227722772277225, + "grad_norm": 0.26503345370292664, + "learning_rate": 4.887852143345091e-05, + "loss": 0.0332, + "num_input_tokens_seen": 7140832, + "step": 33840 + }, + { + "epoch": 3.723322332233223, + "grad_norm": 0.018855050206184387, + "learning_rate": 4.8877810538816905e-05, + "loss": 0.1004, + "num_input_tokens_seen": 7141824, + "step": 33845 + }, + { + "epoch": 3.723872387238724, + "grad_norm": 0.4143924117088318, + "learning_rate": 4.887709942411257e-05, + "loss": 0.021, + "num_input_tokens_seen": 7142880, + "step": 33850 + }, + { + "epoch": 3.7244224422442245, + "grad_norm": 1.4693348407745361, + "learning_rate": 4.887638808934447e-05, + "loss": 0.0966, + "num_input_tokens_seen": 7144000, + "step": 33855 + }, + { + "epoch": 3.724972497249725, + "grad_norm": 0.15460655093193054, + "learning_rate": 4.8875676534519164e-05, + "loss": 0.0564, + "num_input_tokens_seen": 7145152, + "step": 33860 + }, + { + "epoch": 3.7255225522552253, + "grad_norm": 0.1801174283027649, + "learning_rate": 4.8874964759643207e-05, + "loss": 0.0635, + "num_input_tokens_seen": 7146176, + "step": 33865 + }, + { + "epoch": 3.726072607260726, + "grad_norm": 0.6827272176742554, + "learning_rate": 4.887425276472315e-05, + "loss": 0.0636, + "num_input_tokens_seen": 7147200, + "step": 33870 + }, + { + "epoch": 3.7266226622662266, + "grad_norm": 0.12825456261634827, + "learning_rate": 4.8873540549765564e-05, + "loss": 0.01, + "num_input_tokens_seen": 7148256, + "step": 33875 + }, + { + "epoch": 3.7271727172717273, + "grad_norm": 0.3308165371417999, + "learning_rate": 4.8872828114777004e-05, + "loss": 0.031, + "num_input_tokens_seen": 7149408, + "step": 33880 + }, + { + "epoch": 3.727722772277228, + "grad_norm": 0.06063627079129219, + "learning_rate": 4.8872115459764046e-05, + "loss": 0.0195, + "num_input_tokens_seen": 7150432, + "step": 33885 + }, + { + "epoch": 3.728272827282728, + "grad_norm": 0.04835747182369232, + "learning_rate": 4.887140258473325e-05, + "loss": 0.0261, + "num_input_tokens_seen": 7151456, + "step": 33890 + }, + { + "epoch": 3.728822882288229, + "grad_norm": 0.22726407647132874, + "learning_rate": 4.88706894896912e-05, + "loss": 0.0781, + "num_input_tokens_seen": 7152544, + "step": 33895 + }, + { + "epoch": 3.7293729372937294, + "grad_norm": 0.5889315009117126, + "learning_rate": 4.886997617464446e-05, + "loss": 0.0389, + "num_input_tokens_seen": 7153568, + "step": 33900 + }, + { + "epoch": 3.72992299229923, + "grad_norm": 1.2894690036773682, + "learning_rate": 4.886926263959959e-05, + "loss": 0.097, + "num_input_tokens_seen": 7154560, + "step": 33905 + }, + { + "epoch": 3.7304730473047307, + "grad_norm": 0.6277446746826172, + "learning_rate": 4.8868548884563194e-05, + "loss": 0.028, + "num_input_tokens_seen": 7155584, + "step": 33910 + }, + { + "epoch": 3.731023102310231, + "grad_norm": 0.11628147214651108, + "learning_rate": 4.886783490954183e-05, + "loss": 0.0588, + "num_input_tokens_seen": 7156608, + "step": 33915 + }, + { + "epoch": 3.7315731573157316, + "grad_norm": 0.07696745544672012, + "learning_rate": 4.886712071454208e-05, + "loss": 0.0072, + "num_input_tokens_seen": 7157696, + "step": 33920 + }, + { + "epoch": 3.7321232123212322, + "grad_norm": 0.02165871486067772, + "learning_rate": 4.886640629957054e-05, + "loss": 0.0526, + "num_input_tokens_seen": 7158752, + "step": 33925 + }, + { + "epoch": 3.7326732673267324, + "grad_norm": 0.12505659461021423, + "learning_rate": 4.8865691664633785e-05, + "loss": 0.034, + "num_input_tokens_seen": 7159776, + "step": 33930 + }, + { + "epoch": 3.733223322332233, + "grad_norm": 0.013341093435883522, + "learning_rate": 4.886497680973839e-05, + "loss": 0.0043, + "num_input_tokens_seen": 7160864, + "step": 33935 + }, + { + "epoch": 3.7337733773377337, + "grad_norm": 0.15217261016368866, + "learning_rate": 4.886426173489097e-05, + "loss": 0.0081, + "num_input_tokens_seen": 7161984, + "step": 33940 + }, + { + "epoch": 3.7343234323432344, + "grad_norm": 0.04373796656727791, + "learning_rate": 4.8863546440098095e-05, + "loss": 0.0036, + "num_input_tokens_seen": 7163008, + "step": 33945 + }, + { + "epoch": 3.734873487348735, + "grad_norm": 0.9046661853790283, + "learning_rate": 4.886283092536636e-05, + "loss": 0.0626, + "num_input_tokens_seen": 7164000, + "step": 33950 + }, + { + "epoch": 3.7354235423542352, + "grad_norm": 0.01606430672109127, + "learning_rate": 4.886211519070237e-05, + "loss": 0.0418, + "num_input_tokens_seen": 7165088, + "step": 33955 + }, + { + "epoch": 3.735973597359736, + "grad_norm": 0.14460474252700806, + "learning_rate": 4.88613992361127e-05, + "loss": 0.0083, + "num_input_tokens_seen": 7166112, + "step": 33960 + }, + { + "epoch": 3.7365236523652365, + "grad_norm": 0.3692814111709595, + "learning_rate": 4.8860683061603986e-05, + "loss": 0.0256, + "num_input_tokens_seen": 7167136, + "step": 33965 + }, + { + "epoch": 3.737073707370737, + "grad_norm": 0.1125405803322792, + "learning_rate": 4.885996666718279e-05, + "loss": 0.0151, + "num_input_tokens_seen": 7168192, + "step": 33970 + }, + { + "epoch": 3.737623762376238, + "grad_norm": 1.4283276796340942, + "learning_rate": 4.8859250052855735e-05, + "loss": 0.0754, + "num_input_tokens_seen": 7169280, + "step": 33975 + }, + { + "epoch": 3.738173817381738, + "grad_norm": 0.02211022563278675, + "learning_rate": 4.885853321862941e-05, + "loss": 0.0133, + "num_input_tokens_seen": 7170304, + "step": 33980 + }, + { + "epoch": 3.7387238723872387, + "grad_norm": 0.5003255009651184, + "learning_rate": 4.885781616451045e-05, + "loss": 0.1391, + "num_input_tokens_seen": 7171296, + "step": 33985 + }, + { + "epoch": 3.7392739273927393, + "grad_norm": 0.38546183705329895, + "learning_rate": 4.8857098890505434e-05, + "loss": 0.0219, + "num_input_tokens_seen": 7172416, + "step": 33990 + }, + { + "epoch": 3.73982398239824, + "grad_norm": 0.2567461431026459, + "learning_rate": 4.885638139662099e-05, + "loss": 0.0776, + "num_input_tokens_seen": 7173440, + "step": 33995 + }, + { + "epoch": 3.7403740374037406, + "grad_norm": 0.30158868432044983, + "learning_rate": 4.885566368286373e-05, + "loss": 0.0398, + "num_input_tokens_seen": 7174528, + "step": 34000 + }, + { + "epoch": 3.740924092409241, + "grad_norm": 0.012714393436908722, + "learning_rate": 4.885494574924026e-05, + "loss": 0.0277, + "num_input_tokens_seen": 7175584, + "step": 34005 + }, + { + "epoch": 3.7414741474147415, + "grad_norm": 0.0072592333890497684, + "learning_rate": 4.88542275957572e-05, + "loss": 0.0353, + "num_input_tokens_seen": 7176736, + "step": 34010 + }, + { + "epoch": 3.742024202420242, + "grad_norm": 0.23700980842113495, + "learning_rate": 4.8853509222421176e-05, + "loss": 0.0566, + "num_input_tokens_seen": 7177728, + "step": 34015 + }, + { + "epoch": 3.7425742574257423, + "grad_norm": 1.8240588903427124, + "learning_rate": 4.88527906292388e-05, + "loss": 0.2071, + "num_input_tokens_seen": 7178752, + "step": 34020 + }, + { + "epoch": 3.743124312431243, + "grad_norm": 0.5757493376731873, + "learning_rate": 4.8852071816216696e-05, + "loss": 0.0188, + "num_input_tokens_seen": 7179840, + "step": 34025 + }, + { + "epoch": 3.7436743674367436, + "grad_norm": 0.7405094504356384, + "learning_rate": 4.88513527833615e-05, + "loss": 0.0717, + "num_input_tokens_seen": 7180928, + "step": 34030 + }, + { + "epoch": 3.7442244224422443, + "grad_norm": 0.5303259491920471, + "learning_rate": 4.885063353067982e-05, + "loss": 0.0324, + "num_input_tokens_seen": 7181920, + "step": 34035 + }, + { + "epoch": 3.744774477447745, + "grad_norm": 0.9457950592041016, + "learning_rate": 4.88499140581783e-05, + "loss": 0.1981, + "num_input_tokens_seen": 7182976, + "step": 34040 + }, + { + "epoch": 3.745324532453245, + "grad_norm": 0.05976322293281555, + "learning_rate": 4.884919436586357e-05, + "loss": 0.0385, + "num_input_tokens_seen": 7184064, + "step": 34045 + }, + { + "epoch": 3.745874587458746, + "grad_norm": 0.9900677800178528, + "learning_rate": 4.8848474453742255e-05, + "loss": 0.2335, + "num_input_tokens_seen": 7185120, + "step": 34050 + }, + { + "epoch": 3.7464246424642464, + "grad_norm": 0.07591582089662552, + "learning_rate": 4.884775432182099e-05, + "loss": 0.0694, + "num_input_tokens_seen": 7186176, + "step": 34055 + }, + { + "epoch": 3.746974697469747, + "grad_norm": 0.10676461458206177, + "learning_rate": 4.8847033970106427e-05, + "loss": 0.01, + "num_input_tokens_seen": 7187200, + "step": 34060 + }, + { + "epoch": 3.7475247524752477, + "grad_norm": 0.4497392773628235, + "learning_rate": 4.884631339860519e-05, + "loss": 0.0993, + "num_input_tokens_seen": 7188320, + "step": 34065 + }, + { + "epoch": 3.748074807480748, + "grad_norm": 0.15132904052734375, + "learning_rate": 4.8845592607323935e-05, + "loss": 0.0091, + "num_input_tokens_seen": 7189376, + "step": 34070 + }, + { + "epoch": 3.7486248624862486, + "grad_norm": 0.14293906092643738, + "learning_rate": 4.8844871596269275e-05, + "loss": 0.0433, + "num_input_tokens_seen": 7190400, + "step": 34075 + }, + { + "epoch": 3.7491749174917492, + "grad_norm": 2.017426013946533, + "learning_rate": 4.8844150365447885e-05, + "loss": 0.236, + "num_input_tokens_seen": 7191488, + "step": 34080 + }, + { + "epoch": 3.7497249724972495, + "grad_norm": 0.040921401232481, + "learning_rate": 4.8843428914866406e-05, + "loss": 0.0194, + "num_input_tokens_seen": 7192576, + "step": 34085 + }, + { + "epoch": 3.7502750275027505, + "grad_norm": 0.8982762694358826, + "learning_rate": 4.8842707244531485e-05, + "loss": 0.0456, + "num_input_tokens_seen": 7193632, + "step": 34090 + }, + { + "epoch": 3.7508250825082508, + "grad_norm": 0.01208828017115593, + "learning_rate": 4.884198535444976e-05, + "loss": 0.0807, + "num_input_tokens_seen": 7194656, + "step": 34095 + }, + { + "epoch": 3.7513751375137514, + "grad_norm": 0.21416594088077545, + "learning_rate": 4.884126324462791e-05, + "loss": 0.0201, + "num_input_tokens_seen": 7195776, + "step": 34100 + }, + { + "epoch": 3.751925192519252, + "grad_norm": 0.08103375881910324, + "learning_rate": 4.8840540915072573e-05, + "loss": 0.022, + "num_input_tokens_seen": 7196864, + "step": 34105 + }, + { + "epoch": 3.7524752475247523, + "grad_norm": 1.1968475580215454, + "learning_rate": 4.883981836579041e-05, + "loss": 0.1682, + "num_input_tokens_seen": 7197856, + "step": 34110 + }, + { + "epoch": 3.753025302530253, + "grad_norm": 0.05909888446331024, + "learning_rate": 4.883909559678808e-05, + "loss": 0.0204, + "num_input_tokens_seen": 7198880, + "step": 34115 + }, + { + "epoch": 3.7535753575357536, + "grad_norm": 0.6290124654769897, + "learning_rate": 4.883837260807224e-05, + "loss": 0.0376, + "num_input_tokens_seen": 7200032, + "step": 34120 + }, + { + "epoch": 3.754125412541254, + "grad_norm": 0.024456677958369255, + "learning_rate": 4.883764939964957e-05, + "loss": 0.0206, + "num_input_tokens_seen": 7200992, + "step": 34125 + }, + { + "epoch": 3.754675467546755, + "grad_norm": 0.8006961345672607, + "learning_rate": 4.8836925971526714e-05, + "loss": 0.0997, + "num_input_tokens_seen": 7202016, + "step": 34130 + }, + { + "epoch": 3.755225522552255, + "grad_norm": 0.4339779019355774, + "learning_rate": 4.883620232371035e-05, + "loss": 0.0224, + "num_input_tokens_seen": 7203072, + "step": 34135 + }, + { + "epoch": 3.7557755775577557, + "grad_norm": 0.046678103506565094, + "learning_rate": 4.883547845620715e-05, + "loss": 0.0568, + "num_input_tokens_seen": 7204160, + "step": 34140 + }, + { + "epoch": 3.7563256325632564, + "grad_norm": 0.9479208588600159, + "learning_rate": 4.8834754369023784e-05, + "loss": 0.0726, + "num_input_tokens_seen": 7205280, + "step": 34145 + }, + { + "epoch": 3.756875687568757, + "grad_norm": 0.24112382531166077, + "learning_rate": 4.883403006216692e-05, + "loss": 0.0305, + "num_input_tokens_seen": 7206272, + "step": 34150 + }, + { + "epoch": 3.7574257425742577, + "grad_norm": 0.941194474697113, + "learning_rate": 4.883330553564324e-05, + "loss": 0.0346, + "num_input_tokens_seen": 7207360, + "step": 34155 + }, + { + "epoch": 3.757975797579758, + "grad_norm": 0.0782608836889267, + "learning_rate": 4.8832580789459416e-05, + "loss": 0.0144, + "num_input_tokens_seen": 7208544, + "step": 34160 + }, + { + "epoch": 3.7585258525852585, + "grad_norm": 0.09652628749608994, + "learning_rate": 4.8831855823622135e-05, + "loss": 0.0544, + "num_input_tokens_seen": 7209696, + "step": 34165 + }, + { + "epoch": 3.759075907590759, + "grad_norm": 0.44743239879608154, + "learning_rate": 4.883113063813807e-05, + "loss": 0.0381, + "num_input_tokens_seen": 7210688, + "step": 34170 + }, + { + "epoch": 3.7596259625962594, + "grad_norm": 0.7742711901664734, + "learning_rate": 4.883040523301392e-05, + "loss": 0.0336, + "num_input_tokens_seen": 7211712, + "step": 34175 + }, + { + "epoch": 3.76017601760176, + "grad_norm": 0.04311347007751465, + "learning_rate": 4.8829679608256345e-05, + "loss": 0.0663, + "num_input_tokens_seen": 7212768, + "step": 34180 + }, + { + "epoch": 3.7607260726072607, + "grad_norm": 0.12931859493255615, + "learning_rate": 4.882895376387205e-05, + "loss": 0.0297, + "num_input_tokens_seen": 7213760, + "step": 34185 + }, + { + "epoch": 3.7612761276127613, + "grad_norm": 1.9108773469924927, + "learning_rate": 4.882822769986774e-05, + "loss": 0.0539, + "num_input_tokens_seen": 7214848, + "step": 34190 + }, + { + "epoch": 3.761826182618262, + "grad_norm": 1.5691295862197876, + "learning_rate": 4.8827501416250065e-05, + "loss": 0.1274, + "num_input_tokens_seen": 7215872, + "step": 34195 + }, + { + "epoch": 3.762376237623762, + "grad_norm": 0.26390305161476135, + "learning_rate": 4.882677491302576e-05, + "loss": 0.0605, + "num_input_tokens_seen": 7216864, + "step": 34200 + }, + { + "epoch": 3.762926292629263, + "grad_norm": 0.00493262242525816, + "learning_rate": 4.882604819020149e-05, + "loss": 0.1471, + "num_input_tokens_seen": 7217920, + "step": 34205 + }, + { + "epoch": 3.7634763476347635, + "grad_norm": 1.1765111684799194, + "learning_rate": 4.8825321247783984e-05, + "loss": 0.1043, + "num_input_tokens_seen": 7218944, + "step": 34210 + }, + { + "epoch": 3.764026402640264, + "grad_norm": 0.009947510436177254, + "learning_rate": 4.882459408577991e-05, + "loss": 0.0094, + "num_input_tokens_seen": 7219936, + "step": 34215 + }, + { + "epoch": 3.7645764576457648, + "grad_norm": 2.2098772525787354, + "learning_rate": 4.882386670419599e-05, + "loss": 0.0499, + "num_input_tokens_seen": 7220960, + "step": 34220 + }, + { + "epoch": 3.765126512651265, + "grad_norm": 0.10202783346176147, + "learning_rate": 4.8823139103038926e-05, + "loss": 0.0328, + "num_input_tokens_seen": 7221920, + "step": 34225 + }, + { + "epoch": 3.7656765676567656, + "grad_norm": 0.08530713617801666, + "learning_rate": 4.8822411282315416e-05, + "loss": 0.0923, + "num_input_tokens_seen": 7222944, + "step": 34230 + }, + { + "epoch": 3.7662266226622663, + "grad_norm": 0.027744777500629425, + "learning_rate": 4.882168324203217e-05, + "loss": 0.0045, + "num_input_tokens_seen": 7224032, + "step": 34235 + }, + { + "epoch": 3.766776677667767, + "grad_norm": 0.09684891998767853, + "learning_rate": 4.8820954982195905e-05, + "loss": 0.1108, + "num_input_tokens_seen": 7225088, + "step": 34240 + }, + { + "epoch": 3.7673267326732676, + "grad_norm": 0.3857784867286682, + "learning_rate": 4.882022650281333e-05, + "loss": 0.0242, + "num_input_tokens_seen": 7226112, + "step": 34245 + }, + { + "epoch": 3.7678767876787678, + "grad_norm": 0.014438540674746037, + "learning_rate": 4.881949780389115e-05, + "loss": 0.0765, + "num_input_tokens_seen": 7227072, + "step": 34250 + }, + { + "epoch": 3.7684268426842684, + "grad_norm": 0.27014151215553284, + "learning_rate": 4.881876888543609e-05, + "loss": 0.1228, + "num_input_tokens_seen": 7228160, + "step": 34255 + }, + { + "epoch": 3.768976897689769, + "grad_norm": 0.18480469286441803, + "learning_rate": 4.881803974745487e-05, + "loss": 0.0423, + "num_input_tokens_seen": 7229216, + "step": 34260 + }, + { + "epoch": 3.7695269526952693, + "grad_norm": 0.4058849513530731, + "learning_rate": 4.881731038995421e-05, + "loss": 0.0367, + "num_input_tokens_seen": 7230240, + "step": 34265 + }, + { + "epoch": 3.77007700770077, + "grad_norm": 0.3547903299331665, + "learning_rate": 4.881658081294082e-05, + "loss": 0.0243, + "num_input_tokens_seen": 7231360, + "step": 34270 + }, + { + "epoch": 3.7706270627062706, + "grad_norm": 0.74910968542099, + "learning_rate": 4.8815851016421435e-05, + "loss": 0.0321, + "num_input_tokens_seen": 7232448, + "step": 34275 + }, + { + "epoch": 3.771177117711771, + "grad_norm": 0.05299462750554085, + "learning_rate": 4.8815121000402785e-05, + "loss": 0.0481, + "num_input_tokens_seen": 7233472, + "step": 34280 + }, + { + "epoch": 3.771727172717272, + "grad_norm": 0.057325396686792374, + "learning_rate": 4.8814390764891586e-05, + "loss": 0.0076, + "num_input_tokens_seen": 7234560, + "step": 34285 + }, + { + "epoch": 3.772277227722772, + "grad_norm": 0.7680713534355164, + "learning_rate": 4.881366030989458e-05, + "loss": 0.071, + "num_input_tokens_seen": 7235616, + "step": 34290 + }, + { + "epoch": 3.7728272827282727, + "grad_norm": 0.2035590261220932, + "learning_rate": 4.881292963541849e-05, + "loss": 0.0731, + "num_input_tokens_seen": 7236736, + "step": 34295 + }, + { + "epoch": 3.7733773377337734, + "grad_norm": 0.06766631454229355, + "learning_rate": 4.881219874147006e-05, + "loss": 0.0846, + "num_input_tokens_seen": 7237760, + "step": 34300 + }, + { + "epoch": 3.773927392739274, + "grad_norm": 0.5804831385612488, + "learning_rate": 4.8811467628056006e-05, + "loss": 0.0728, + "num_input_tokens_seen": 7238784, + "step": 34305 + }, + { + "epoch": 3.7744774477447747, + "grad_norm": 0.9333169460296631, + "learning_rate": 4.8810736295183096e-05, + "loss": 0.0642, + "num_input_tokens_seen": 7239872, + "step": 34310 + }, + { + "epoch": 3.775027502750275, + "grad_norm": 0.059667982161045074, + "learning_rate": 4.881000474285805e-05, + "loss": 0.0112, + "num_input_tokens_seen": 7240928, + "step": 34315 + }, + { + "epoch": 3.7755775577557755, + "grad_norm": 0.11064521223306656, + "learning_rate": 4.8809272971087614e-05, + "loss": 0.0279, + "num_input_tokens_seen": 7241952, + "step": 34320 + }, + { + "epoch": 3.776127612761276, + "grad_norm": 0.9557967782020569, + "learning_rate": 4.880854097987853e-05, + "loss": 0.1312, + "num_input_tokens_seen": 7243040, + "step": 34325 + }, + { + "epoch": 3.776677667766777, + "grad_norm": 0.04509221017360687, + "learning_rate": 4.880780876923756e-05, + "loss": 0.054, + "num_input_tokens_seen": 7244064, + "step": 34330 + }, + { + "epoch": 3.7772277227722775, + "grad_norm": 0.7910218834877014, + "learning_rate": 4.880707633917143e-05, + "loss": 0.1226, + "num_input_tokens_seen": 7245088, + "step": 34335 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.08268304914236069, + "learning_rate": 4.8806343689686906e-05, + "loss": 0.056, + "num_input_tokens_seen": 7246112, + "step": 34340 + }, + { + "epoch": 3.7783278327832783, + "grad_norm": 0.04538803920149803, + "learning_rate": 4.8805610820790735e-05, + "loss": 0.0615, + "num_input_tokens_seen": 7247104, + "step": 34345 + }, + { + "epoch": 3.778877887788779, + "grad_norm": 0.3503033518791199, + "learning_rate": 4.880487773248967e-05, + "loss": 0.0292, + "num_input_tokens_seen": 7248224, + "step": 34350 + }, + { + "epoch": 3.779427942794279, + "grad_norm": 0.6601455211639404, + "learning_rate": 4.880414442479047e-05, + "loss": 0.1392, + "num_input_tokens_seen": 7249248, + "step": 34355 + }, + { + "epoch": 3.77997799779978, + "grad_norm": 0.4461381137371063, + "learning_rate": 4.88034108976999e-05, + "loss": 0.0327, + "num_input_tokens_seen": 7250400, + "step": 34360 + }, + { + "epoch": 3.7805280528052805, + "grad_norm": 1.3520658016204834, + "learning_rate": 4.88026771512247e-05, + "loss": 0.142, + "num_input_tokens_seen": 7251456, + "step": 34365 + }, + { + "epoch": 3.781078107810781, + "grad_norm": 1.265360713005066, + "learning_rate": 4.880194318537166e-05, + "loss": 0.1282, + "num_input_tokens_seen": 7252640, + "step": 34370 + }, + { + "epoch": 3.781628162816282, + "grad_norm": 0.7327277064323425, + "learning_rate": 4.880120900014752e-05, + "loss": 0.1586, + "num_input_tokens_seen": 7253728, + "step": 34375 + }, + { + "epoch": 3.782178217821782, + "grad_norm": 0.6594136357307434, + "learning_rate": 4.880047459555907e-05, + "loss": 0.0381, + "num_input_tokens_seen": 7254784, + "step": 34380 + }, + { + "epoch": 3.7827282728272826, + "grad_norm": 0.022695334628224373, + "learning_rate": 4.8799739971613055e-05, + "loss": 0.0824, + "num_input_tokens_seen": 7255776, + "step": 34385 + }, + { + "epoch": 3.7832783278327833, + "grad_norm": 1.1510534286499023, + "learning_rate": 4.8799005128316254e-05, + "loss": 0.0906, + "num_input_tokens_seen": 7256768, + "step": 34390 + }, + { + "epoch": 3.783828382838284, + "grad_norm": 1.4665813446044922, + "learning_rate": 4.879827006567545e-05, + "loss": 0.1864, + "num_input_tokens_seen": 7257728, + "step": 34395 + }, + { + "epoch": 3.7843784378437846, + "grad_norm": 1.8916937112808228, + "learning_rate": 4.8797534783697415e-05, + "loss": 0.1346, + "num_input_tokens_seen": 7258752, + "step": 34400 + }, + { + "epoch": 3.784928492849285, + "grad_norm": 0.048190515488386154, + "learning_rate": 4.879679928238892e-05, + "loss": 0.1137, + "num_input_tokens_seen": 7259776, + "step": 34405 + }, + { + "epoch": 3.7854785478547854, + "grad_norm": 0.2096569985151291, + "learning_rate": 4.8796063561756734e-05, + "loss": 0.009, + "num_input_tokens_seen": 7260736, + "step": 34410 + }, + { + "epoch": 3.786028602860286, + "grad_norm": 0.9087448120117188, + "learning_rate": 4.8795327621807656e-05, + "loss": 0.0326, + "num_input_tokens_seen": 7261792, + "step": 34415 + }, + { + "epoch": 3.7865786578657867, + "grad_norm": 0.7721185088157654, + "learning_rate": 4.879459146254846e-05, + "loss": 0.0414, + "num_input_tokens_seen": 7262848, + "step": 34420 + }, + { + "epoch": 3.7871287128712874, + "grad_norm": 0.23371395468711853, + "learning_rate": 4.8793855083985936e-05, + "loss": 0.0383, + "num_input_tokens_seen": 7263904, + "step": 34425 + }, + { + "epoch": 3.7876787678767876, + "grad_norm": 0.623131275177002, + "learning_rate": 4.879311848612687e-05, + "loss": 0.1571, + "num_input_tokens_seen": 7264992, + "step": 34430 + }, + { + "epoch": 3.7882288228822882, + "grad_norm": 1.0565850734710693, + "learning_rate": 4.8792381668978035e-05, + "loss": 0.0726, + "num_input_tokens_seen": 7266016, + "step": 34435 + }, + { + "epoch": 3.788778877887789, + "grad_norm": 0.33682137727737427, + "learning_rate": 4.8791644632546244e-05, + "loss": 0.0291, + "num_input_tokens_seen": 7267040, + "step": 34440 + }, + { + "epoch": 3.789328932893289, + "grad_norm": 0.32860955595970154, + "learning_rate": 4.879090737683828e-05, + "loss": 0.0822, + "num_input_tokens_seen": 7268064, + "step": 34445 + }, + { + "epoch": 3.7898789878987897, + "grad_norm": 0.333615243434906, + "learning_rate": 4.879016990186094e-05, + "loss": 0.0287, + "num_input_tokens_seen": 7269120, + "step": 34450 + }, + { + "epoch": 3.7904290429042904, + "grad_norm": 2.2849690914154053, + "learning_rate": 4.878943220762101e-05, + "loss": 0.1049, + "num_input_tokens_seen": 7270144, + "step": 34455 + }, + { + "epoch": 3.790979097909791, + "grad_norm": 0.23060379922389984, + "learning_rate": 4.8788694294125306e-05, + "loss": 0.0225, + "num_input_tokens_seen": 7271232, + "step": 34460 + }, + { + "epoch": 3.7915291529152917, + "grad_norm": 0.08412384986877441, + "learning_rate": 4.878795616138062e-05, + "loss": 0.0217, + "num_input_tokens_seen": 7272320, + "step": 34465 + }, + { + "epoch": 3.792079207920792, + "grad_norm": 0.026362614706158638, + "learning_rate": 4.878721780939376e-05, + "loss": 0.0796, + "num_input_tokens_seen": 7273376, + "step": 34470 + }, + { + "epoch": 3.7926292629262925, + "grad_norm": 0.3447265625, + "learning_rate": 4.878647923817152e-05, + "loss": 0.0302, + "num_input_tokens_seen": 7274432, + "step": 34475 + }, + { + "epoch": 3.793179317931793, + "grad_norm": 0.09051936119794846, + "learning_rate": 4.878574044772073e-05, + "loss": 0.0265, + "num_input_tokens_seen": 7275520, + "step": 34480 + }, + { + "epoch": 3.793729372937294, + "grad_norm": 1.2561390399932861, + "learning_rate": 4.8785001438048175e-05, + "loss": 0.0356, + "num_input_tokens_seen": 7276608, + "step": 34485 + }, + { + "epoch": 3.7942794279427945, + "grad_norm": 0.08819787204265594, + "learning_rate": 4.878426220916067e-05, + "loss": 0.0181, + "num_input_tokens_seen": 7277696, + "step": 34490 + }, + { + "epoch": 3.7948294829482947, + "grad_norm": 0.4367865025997162, + "learning_rate": 4.878352276106504e-05, + "loss": 0.0389, + "num_input_tokens_seen": 7278752, + "step": 34495 + }, + { + "epoch": 3.7953795379537953, + "grad_norm": 0.10418140888214111, + "learning_rate": 4.878278309376809e-05, + "loss": 0.0336, + "num_input_tokens_seen": 7279808, + "step": 34500 + }, + { + "epoch": 3.795929592959296, + "grad_norm": 0.0498267263174057, + "learning_rate": 4.878204320727664e-05, + "loss": 0.0504, + "num_input_tokens_seen": 7280832, + "step": 34505 + }, + { + "epoch": 3.7964796479647966, + "grad_norm": 0.05703490599989891, + "learning_rate": 4.878130310159751e-05, + "loss": 0.045, + "num_input_tokens_seen": 7281888, + "step": 34510 + }, + { + "epoch": 3.7970297029702973, + "grad_norm": 0.027309181168675423, + "learning_rate": 4.8780562776737515e-05, + "loss": 0.055, + "num_input_tokens_seen": 7282976, + "step": 34515 + }, + { + "epoch": 3.7975797579757975, + "grad_norm": 0.03596430644392967, + "learning_rate": 4.8779822232703495e-05, + "loss": 0.0054, + "num_input_tokens_seen": 7284064, + "step": 34520 + }, + { + "epoch": 3.798129812981298, + "grad_norm": 0.0221475288271904, + "learning_rate": 4.8779081469502255e-05, + "loss": 0.1399, + "num_input_tokens_seen": 7285184, + "step": 34525 + }, + { + "epoch": 3.798679867986799, + "grad_norm": 0.3778509497642517, + "learning_rate": 4.877834048714064e-05, + "loss": 0.0249, + "num_input_tokens_seen": 7286304, + "step": 34530 + }, + { + "epoch": 3.799229922992299, + "grad_norm": 0.02861642651259899, + "learning_rate": 4.877759928562546e-05, + "loss": 0.0322, + "num_input_tokens_seen": 7287392, + "step": 34535 + }, + { + "epoch": 3.7997799779977997, + "grad_norm": 0.2583392858505249, + "learning_rate": 4.8776857864963564e-05, + "loss": 0.1797, + "num_input_tokens_seen": 7288352, + "step": 34540 + }, + { + "epoch": 3.8003300330033003, + "grad_norm": 0.4309380352497101, + "learning_rate": 4.877611622516178e-05, + "loss": 0.0651, + "num_input_tokens_seen": 7289472, + "step": 34545 + }, + { + "epoch": 3.800880088008801, + "grad_norm": 0.5427894592285156, + "learning_rate": 4.8775374366226936e-05, + "loss": 0.0914, + "num_input_tokens_seen": 7290464, + "step": 34550 + }, + { + "epoch": 3.8014301430143016, + "grad_norm": 0.020025156438350677, + "learning_rate": 4.877463228816588e-05, + "loss": 0.0089, + "num_input_tokens_seen": 7291520, + "step": 34555 + }, + { + "epoch": 3.801980198019802, + "grad_norm": 0.22728225588798523, + "learning_rate": 4.8773889990985445e-05, + "loss": 0.0797, + "num_input_tokens_seen": 7292576, + "step": 34560 + }, + { + "epoch": 3.8025302530253025, + "grad_norm": 0.11538350582122803, + "learning_rate": 4.8773147474692474e-05, + "loss": 0.018, + "num_input_tokens_seen": 7293568, + "step": 34565 + }, + { + "epoch": 3.803080308030803, + "grad_norm": 0.07224580645561218, + "learning_rate": 4.877240473929381e-05, + "loss": 0.0142, + "num_input_tokens_seen": 7294656, + "step": 34570 + }, + { + "epoch": 3.8036303630363038, + "grad_norm": 0.25067588686943054, + "learning_rate": 4.87716617847963e-05, + "loss": 0.0388, + "num_input_tokens_seen": 7295744, + "step": 34575 + }, + { + "epoch": 3.8041804180418044, + "grad_norm": 0.5499544739723206, + "learning_rate": 4.8770918611206784e-05, + "loss": 0.0829, + "num_input_tokens_seen": 7296864, + "step": 34580 + }, + { + "epoch": 3.8047304730473046, + "grad_norm": 0.22519022226333618, + "learning_rate": 4.877017521853212e-05, + "loss": 0.0442, + "num_input_tokens_seen": 7297856, + "step": 34585 + }, + { + "epoch": 3.8052805280528053, + "grad_norm": 1.0489237308502197, + "learning_rate": 4.876943160677916e-05, + "loss": 0.0885, + "num_input_tokens_seen": 7298880, + "step": 34590 + }, + { + "epoch": 3.805830583058306, + "grad_norm": 0.07408388704061508, + "learning_rate": 4.8768687775954744e-05, + "loss": 0.0289, + "num_input_tokens_seen": 7299968, + "step": 34595 + }, + { + "epoch": 3.806380638063806, + "grad_norm": 0.0860656127333641, + "learning_rate": 4.8767943726065755e-05, + "loss": 0.035, + "num_input_tokens_seen": 7300992, + "step": 34600 + }, + { + "epoch": 3.806930693069307, + "grad_norm": 0.3101862370967865, + "learning_rate": 4.876719945711902e-05, + "loss": 0.0706, + "num_input_tokens_seen": 7302048, + "step": 34605 + }, + { + "epoch": 3.8074807480748074, + "grad_norm": 0.13847488164901733, + "learning_rate": 4.876645496912142e-05, + "loss": 0.0205, + "num_input_tokens_seen": 7303168, + "step": 34610 + }, + { + "epoch": 3.808030803080308, + "grad_norm": 0.05634534731507301, + "learning_rate": 4.8765710262079814e-05, + "loss": 0.0635, + "num_input_tokens_seen": 7304224, + "step": 34615 + }, + { + "epoch": 3.8085808580858087, + "grad_norm": 0.446935772895813, + "learning_rate": 4.8764965336001046e-05, + "loss": 0.04, + "num_input_tokens_seen": 7305216, + "step": 34620 + }, + { + "epoch": 3.809130913091309, + "grad_norm": 0.775467574596405, + "learning_rate": 4.8764220190892005e-05, + "loss": 0.1149, + "num_input_tokens_seen": 7306272, + "step": 34625 + }, + { + "epoch": 3.8096809680968096, + "grad_norm": 0.06997177004814148, + "learning_rate": 4.876347482675955e-05, + "loss": 0.1293, + "num_input_tokens_seen": 7307328, + "step": 34630 + }, + { + "epoch": 3.81023102310231, + "grad_norm": 0.02471601963043213, + "learning_rate": 4.8762729243610544e-05, + "loss": 0.0912, + "num_input_tokens_seen": 7308384, + "step": 34635 + }, + { + "epoch": 3.810781078107811, + "grad_norm": 0.3320823311805725, + "learning_rate": 4.8761983441451866e-05, + "loss": 0.0985, + "num_input_tokens_seen": 7309472, + "step": 34640 + }, + { + "epoch": 3.8113311331133115, + "grad_norm": 0.45220762491226196, + "learning_rate": 4.8761237420290394e-05, + "loss": 0.04, + "num_input_tokens_seen": 7310464, + "step": 34645 + }, + { + "epoch": 3.8118811881188117, + "grad_norm": 0.4695313274860382, + "learning_rate": 4.8760491180133e-05, + "loss": 0.0283, + "num_input_tokens_seen": 7311488, + "step": 34650 + }, + { + "epoch": 3.8124312431243124, + "grad_norm": 0.5255032181739807, + "learning_rate": 4.875974472098656e-05, + "loss": 0.0961, + "num_input_tokens_seen": 7312608, + "step": 34655 + }, + { + "epoch": 3.812981298129813, + "grad_norm": 0.9555225372314453, + "learning_rate": 4.875899804285794e-05, + "loss": 0.0635, + "num_input_tokens_seen": 7313600, + "step": 34660 + }, + { + "epoch": 3.8135313531353137, + "grad_norm": 0.041776325553655624, + "learning_rate": 4.875825114575405e-05, + "loss": 0.0399, + "num_input_tokens_seen": 7314592, + "step": 34665 + }, + { + "epoch": 3.8140814081408143, + "grad_norm": 0.6861763596534729, + "learning_rate": 4.8757504029681755e-05, + "loss": 0.0673, + "num_input_tokens_seen": 7315680, + "step": 34670 + }, + { + "epoch": 3.8146314631463145, + "grad_norm": 1.7182546854019165, + "learning_rate": 4.8756756694647946e-05, + "loss": 0.0866, + "num_input_tokens_seen": 7316768, + "step": 34675 + }, + { + "epoch": 3.815181518151815, + "grad_norm": 0.6312527656555176, + "learning_rate": 4.8756009140659514e-05, + "loss": 0.0315, + "num_input_tokens_seen": 7317824, + "step": 34680 + }, + { + "epoch": 3.815731573157316, + "grad_norm": 0.3012927174568176, + "learning_rate": 4.875526136772334e-05, + "loss": 0.0329, + "num_input_tokens_seen": 7318944, + "step": 34685 + }, + { + "epoch": 3.816281628162816, + "grad_norm": 1.1879541873931885, + "learning_rate": 4.8754513375846324e-05, + "loss": 0.1027, + "num_input_tokens_seen": 7320000, + "step": 34690 + }, + { + "epoch": 3.8168316831683167, + "grad_norm": 0.04696204513311386, + "learning_rate": 4.8753765165035356e-05, + "loss": 0.0713, + "num_input_tokens_seen": 7321056, + "step": 34695 + }, + { + "epoch": 3.8173817381738173, + "grad_norm": 1.0374470949172974, + "learning_rate": 4.875301673529733e-05, + "loss": 0.1058, + "num_input_tokens_seen": 7322112, + "step": 34700 + }, + { + "epoch": 3.817931793179318, + "grad_norm": 0.2229926884174347, + "learning_rate": 4.875226808663915e-05, + "loss": 0.0222, + "num_input_tokens_seen": 7323168, + "step": 34705 + }, + { + "epoch": 3.8184818481848186, + "grad_norm": 0.06942174583673477, + "learning_rate": 4.875151921906771e-05, + "loss": 0.0403, + "num_input_tokens_seen": 7324256, + "step": 34710 + }, + { + "epoch": 3.819031903190319, + "grad_norm": 1.3253644704818726, + "learning_rate": 4.8750770132589915e-05, + "loss": 0.1808, + "num_input_tokens_seen": 7325312, + "step": 34715 + }, + { + "epoch": 3.8195819581958195, + "grad_norm": 0.016563454642891884, + "learning_rate": 4.875002082721267e-05, + "loss": 0.0559, + "num_input_tokens_seen": 7326368, + "step": 34720 + }, + { + "epoch": 3.82013201320132, + "grad_norm": 0.2279737889766693, + "learning_rate": 4.874927130294287e-05, + "loss": 0.1275, + "num_input_tokens_seen": 7327328, + "step": 34725 + }, + { + "epoch": 3.8206820682068208, + "grad_norm": 0.07887584716081619, + "learning_rate": 4.874852155978744e-05, + "loss": 0.0203, + "num_input_tokens_seen": 7328352, + "step": 34730 + }, + { + "epoch": 3.8212321232123214, + "grad_norm": 0.324268102645874, + "learning_rate": 4.874777159775329e-05, + "loss": 0.0603, + "num_input_tokens_seen": 7329440, + "step": 34735 + }, + { + "epoch": 3.8217821782178216, + "grad_norm": 0.12734083831310272, + "learning_rate": 4.874702141684732e-05, + "loss": 0.0574, + "num_input_tokens_seen": 7330432, + "step": 34740 + }, + { + "epoch": 3.8223322332233223, + "grad_norm": 0.15417510271072388, + "learning_rate": 4.874627101707644e-05, + "loss": 0.0261, + "num_input_tokens_seen": 7331552, + "step": 34745 + }, + { + "epoch": 3.822882288228823, + "grad_norm": 0.3042553663253784, + "learning_rate": 4.874552039844758e-05, + "loss": 0.0351, + "num_input_tokens_seen": 7332608, + "step": 34750 + }, + { + "epoch": 3.8234323432343236, + "grad_norm": 0.09938018769025803, + "learning_rate": 4.874476956096765e-05, + "loss": 0.0343, + "num_input_tokens_seen": 7333632, + "step": 34755 + }, + { + "epoch": 3.823982398239824, + "grad_norm": 0.676725447177887, + "learning_rate": 4.874401850464358e-05, + "loss": 0.0514, + "num_input_tokens_seen": 7334752, + "step": 34760 + }, + { + "epoch": 3.8245324532453244, + "grad_norm": 1.081019639968872, + "learning_rate": 4.874326722948228e-05, + "loss": 0.0534, + "num_input_tokens_seen": 7335808, + "step": 34765 + }, + { + "epoch": 3.825082508250825, + "grad_norm": 0.02966240420937538, + "learning_rate": 4.874251573549068e-05, + "loss": 0.1108, + "num_input_tokens_seen": 7336928, + "step": 34770 + }, + { + "epoch": 3.8256325632563257, + "grad_norm": 1.3418464660644531, + "learning_rate": 4.8741764022675705e-05, + "loss": 0.1569, + "num_input_tokens_seen": 7337888, + "step": 34775 + }, + { + "epoch": 3.826182618261826, + "grad_norm": 0.1393088698387146, + "learning_rate": 4.874101209104428e-05, + "loss": 0.0841, + "num_input_tokens_seen": 7338976, + "step": 34780 + }, + { + "epoch": 3.8267326732673266, + "grad_norm": 0.7865951657295227, + "learning_rate": 4.874025994060334e-05, + "loss": 0.0422, + "num_input_tokens_seen": 7340032, + "step": 34785 + }, + { + "epoch": 3.8272827282728272, + "grad_norm": 0.13883864879608154, + "learning_rate": 4.873950757135982e-05, + "loss": 0.0635, + "num_input_tokens_seen": 7341088, + "step": 34790 + }, + { + "epoch": 3.827832783278328, + "grad_norm": 1.1610348224639893, + "learning_rate": 4.873875498332065e-05, + "loss": 0.085, + "num_input_tokens_seen": 7342112, + "step": 34795 + }, + { + "epoch": 3.8283828382838285, + "grad_norm": 0.5634425282478333, + "learning_rate": 4.8738002176492764e-05, + "loss": 0.1611, + "num_input_tokens_seen": 7343200, + "step": 34800 + }, + { + "epoch": 3.8289328932893287, + "grad_norm": 0.018427100032567978, + "learning_rate": 4.87372491508831e-05, + "loss": 0.0896, + "num_input_tokens_seen": 7344256, + "step": 34805 + }, + { + "epoch": 3.8294829482948294, + "grad_norm": 0.2561958432197571, + "learning_rate": 4.87364959064986e-05, + "loss": 0.0333, + "num_input_tokens_seen": 7345280, + "step": 34810 + }, + { + "epoch": 3.83003300330033, + "grad_norm": 0.9739768505096436, + "learning_rate": 4.8735742443346216e-05, + "loss": 0.0666, + "num_input_tokens_seen": 7346368, + "step": 34815 + }, + { + "epoch": 3.8305830583058307, + "grad_norm": 0.0918811485171318, + "learning_rate": 4.8734988761432876e-05, + "loss": 0.0249, + "num_input_tokens_seen": 7347392, + "step": 34820 + }, + { + "epoch": 3.8311331133113313, + "grad_norm": 0.9939813613891602, + "learning_rate": 4.873423486076554e-05, + "loss": 0.0863, + "num_input_tokens_seen": 7348512, + "step": 34825 + }, + { + "epoch": 3.8316831683168315, + "grad_norm": 0.07433786988258362, + "learning_rate": 4.873348074135114e-05, + "loss": 0.0443, + "num_input_tokens_seen": 7349600, + "step": 34830 + }, + { + "epoch": 3.832233223322332, + "grad_norm": 1.1423267126083374, + "learning_rate": 4.873272640319665e-05, + "loss": 0.1112, + "num_input_tokens_seen": 7350720, + "step": 34835 + }, + { + "epoch": 3.832783278327833, + "grad_norm": 0.5357503294944763, + "learning_rate": 4.8731971846309e-05, + "loss": 0.0632, + "num_input_tokens_seen": 7351776, + "step": 34840 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.7866601347923279, + "learning_rate": 4.873121707069515e-05, + "loss": 0.0344, + "num_input_tokens_seen": 7352864, + "step": 34845 + }, + { + "epoch": 3.833883388338834, + "grad_norm": 0.10642843693494797, + "learning_rate": 4.8730462076362066e-05, + "loss": 0.1152, + "num_input_tokens_seen": 7353888, + "step": 34850 + }, + { + "epoch": 3.8344334433443343, + "grad_norm": 0.045805200934410095, + "learning_rate": 4.872970686331669e-05, + "loss": 0.0664, + "num_input_tokens_seen": 7354912, + "step": 34855 + }, + { + "epoch": 3.834983498349835, + "grad_norm": 0.6841025352478027, + "learning_rate": 4.8728951431566004e-05, + "loss": 0.1039, + "num_input_tokens_seen": 7355936, + "step": 34860 + }, + { + "epoch": 3.8355335533553356, + "grad_norm": 1.5659555196762085, + "learning_rate": 4.872819578111696e-05, + "loss": 0.0756, + "num_input_tokens_seen": 7357024, + "step": 34865 + }, + { + "epoch": 3.836083608360836, + "grad_norm": 0.055225927382707596, + "learning_rate": 4.872743991197651e-05, + "loss": 0.0299, + "num_input_tokens_seen": 7358112, + "step": 34870 + }, + { + "epoch": 3.8366336633663365, + "grad_norm": 0.14785191416740417, + "learning_rate": 4.872668382415164e-05, + "loss": 0.0709, + "num_input_tokens_seen": 7359264, + "step": 34875 + }, + { + "epoch": 3.837183718371837, + "grad_norm": 0.04862779751420021, + "learning_rate": 4.872592751764931e-05, + "loss": 0.0105, + "num_input_tokens_seen": 7360288, + "step": 34880 + }, + { + "epoch": 3.837733773377338, + "grad_norm": 0.18291720747947693, + "learning_rate": 4.872517099247649e-05, + "loss": 0.0214, + "num_input_tokens_seen": 7361312, + "step": 34885 + }, + { + "epoch": 3.8382838283828384, + "grad_norm": 0.030012041330337524, + "learning_rate": 4.8724414248640154e-05, + "loss": 0.1288, + "num_input_tokens_seen": 7362400, + "step": 34890 + }, + { + "epoch": 3.8388338833883386, + "grad_norm": 0.39986690878868103, + "learning_rate": 4.872365728614727e-05, + "loss": 0.047, + "num_input_tokens_seen": 7363456, + "step": 34895 + }, + { + "epoch": 3.8393839383938393, + "grad_norm": 1.5764516592025757, + "learning_rate": 4.872290010500483e-05, + "loss": 0.0676, + "num_input_tokens_seen": 7364480, + "step": 34900 + }, + { + "epoch": 3.83993399339934, + "grad_norm": 0.14797964692115784, + "learning_rate": 4.87221427052198e-05, + "loss": 0.1001, + "num_input_tokens_seen": 7365536, + "step": 34905 + }, + { + "epoch": 3.8404840484048406, + "grad_norm": 0.04659005254507065, + "learning_rate": 4.872138508679916e-05, + "loss": 0.0522, + "num_input_tokens_seen": 7366528, + "step": 34910 + }, + { + "epoch": 3.8410341034103412, + "grad_norm": 0.1543680876493454, + "learning_rate": 4.8720627249749904e-05, + "loss": 0.0302, + "num_input_tokens_seen": 7367584, + "step": 34915 + }, + { + "epoch": 3.8415841584158414, + "grad_norm": 0.2112496793270111, + "learning_rate": 4.8719869194079e-05, + "loss": 0.1113, + "num_input_tokens_seen": 7368672, + "step": 34920 + }, + { + "epoch": 3.842134213421342, + "grad_norm": 0.71754390001297, + "learning_rate": 4.871911091979345e-05, + "loss": 0.1189, + "num_input_tokens_seen": 7369760, + "step": 34925 + }, + { + "epoch": 3.8426842684268427, + "grad_norm": 0.2706446945667267, + "learning_rate": 4.8718352426900234e-05, + "loss": 0.0603, + "num_input_tokens_seen": 7370816, + "step": 34930 + }, + { + "epoch": 3.8432343234323434, + "grad_norm": 0.4689350426197052, + "learning_rate": 4.871759371540634e-05, + "loss": 0.0649, + "num_input_tokens_seen": 7371936, + "step": 34935 + }, + { + "epoch": 3.843784378437844, + "grad_norm": 0.19516415894031525, + "learning_rate": 4.871683478531877e-05, + "loss": 0.068, + "num_input_tokens_seen": 7373056, + "step": 34940 + }, + { + "epoch": 3.8443344334433442, + "grad_norm": 0.03250719979405403, + "learning_rate": 4.8716075636644516e-05, + "loss": 0.0375, + "num_input_tokens_seen": 7374112, + "step": 34945 + }, + { + "epoch": 3.844884488448845, + "grad_norm": 0.1187262237071991, + "learning_rate": 4.871531626939057e-05, + "loss": 0.1578, + "num_input_tokens_seen": 7375104, + "step": 34950 + }, + { + "epoch": 3.8454345434543455, + "grad_norm": 0.09247241169214249, + "learning_rate": 4.871455668356393e-05, + "loss": 0.0906, + "num_input_tokens_seen": 7376192, + "step": 34955 + }, + { + "epoch": 3.8459845984598457, + "grad_norm": 0.08983556181192398, + "learning_rate": 4.871379687917161e-05, + "loss": 0.0816, + "num_input_tokens_seen": 7377216, + "step": 34960 + }, + { + "epoch": 3.8465346534653464, + "grad_norm": 0.15095244348049164, + "learning_rate": 4.8713036856220596e-05, + "loss": 0.0425, + "num_input_tokens_seen": 7378272, + "step": 34965 + }, + { + "epoch": 3.847084708470847, + "grad_norm": 0.05163788050413132, + "learning_rate": 4.87122766147179e-05, + "loss": 0.0614, + "num_input_tokens_seen": 7379232, + "step": 34970 + }, + { + "epoch": 3.8476347634763477, + "grad_norm": 0.7073361277580261, + "learning_rate": 4.8711516154670533e-05, + "loss": 0.0446, + "num_input_tokens_seen": 7380288, + "step": 34975 + }, + { + "epoch": 3.8481848184818483, + "grad_norm": 1.119729995727539, + "learning_rate": 4.87107554760855e-05, + "loss": 0.0845, + "num_input_tokens_seen": 7381280, + "step": 34980 + }, + { + "epoch": 3.8487348734873486, + "grad_norm": 0.2646920084953308, + "learning_rate": 4.870999457896981e-05, + "loss": 0.0409, + "num_input_tokens_seen": 7382304, + "step": 34985 + }, + { + "epoch": 3.849284928492849, + "grad_norm": 0.27785518765449524, + "learning_rate": 4.870923346333047e-05, + "loss": 0.0246, + "num_input_tokens_seen": 7383296, + "step": 34990 + }, + { + "epoch": 3.84983498349835, + "grad_norm": 0.03482897952198982, + "learning_rate": 4.870847212917451e-05, + "loss": 0.0777, + "num_input_tokens_seen": 7384320, + "step": 34995 + }, + { + "epoch": 3.8503850385038505, + "grad_norm": 2.4764244556427, + "learning_rate": 4.8707710576508935e-05, + "loss": 0.0525, + "num_input_tokens_seen": 7385376, + "step": 35000 + }, + { + "epoch": 3.850935093509351, + "grad_norm": 0.026070570573210716, + "learning_rate": 4.870694880534077e-05, + "loss": 0.0652, + "num_input_tokens_seen": 7386464, + "step": 35005 + }, + { + "epoch": 3.8514851485148514, + "grad_norm": 0.6172319650650024, + "learning_rate": 4.8706186815677034e-05, + "loss": 0.1057, + "num_input_tokens_seen": 7387456, + "step": 35010 + }, + { + "epoch": 3.852035203520352, + "grad_norm": 0.6176477670669556, + "learning_rate": 4.870542460752475e-05, + "loss": 0.0617, + "num_input_tokens_seen": 7388480, + "step": 35015 + }, + { + "epoch": 3.8525852585258527, + "grad_norm": 0.04043826833367348, + "learning_rate": 4.870466218089094e-05, + "loss": 0.072, + "num_input_tokens_seen": 7389536, + "step": 35020 + }, + { + "epoch": 3.8531353135313533, + "grad_norm": 0.32119739055633545, + "learning_rate": 4.8703899535782624e-05, + "loss": 0.032, + "num_input_tokens_seen": 7390656, + "step": 35025 + }, + { + "epoch": 3.853685368536854, + "grad_norm": 0.7519924640655518, + "learning_rate": 4.8703136672206854e-05, + "loss": 0.0598, + "num_input_tokens_seen": 7391680, + "step": 35030 + }, + { + "epoch": 3.854235423542354, + "grad_norm": 0.10239486396312714, + "learning_rate": 4.870237359017065e-05, + "loss": 0.0523, + "num_input_tokens_seen": 7392704, + "step": 35035 + }, + { + "epoch": 3.854785478547855, + "grad_norm": 1.0801353454589844, + "learning_rate": 4.8701610289681035e-05, + "loss": 0.0981, + "num_input_tokens_seen": 7393728, + "step": 35040 + }, + { + "epoch": 3.8553355335533555, + "grad_norm": 0.9781757593154907, + "learning_rate": 4.8700846770745046e-05, + "loss": 0.1298, + "num_input_tokens_seen": 7394784, + "step": 35045 + }, + { + "epoch": 3.8558855885588557, + "grad_norm": 0.1508171409368515, + "learning_rate": 4.870008303336973e-05, + "loss": 0.028, + "num_input_tokens_seen": 7395840, + "step": 35050 + }, + { + "epoch": 3.8564356435643563, + "grad_norm": 0.037805311381816864, + "learning_rate": 4.8699319077562124e-05, + "loss": 0.087, + "num_input_tokens_seen": 7396896, + "step": 35055 + }, + { + "epoch": 3.856985698569857, + "grad_norm": 0.017290320247411728, + "learning_rate": 4.869855490332926e-05, + "loss": 0.0125, + "num_input_tokens_seen": 7397920, + "step": 35060 + }, + { + "epoch": 3.8575357535753576, + "grad_norm": 0.2045324444770813, + "learning_rate": 4.8697790510678196e-05, + "loss": 0.129, + "num_input_tokens_seen": 7398912, + "step": 35065 + }, + { + "epoch": 3.8580858085808583, + "grad_norm": 0.3728044629096985, + "learning_rate": 4.869702589961596e-05, + "loss": 0.0882, + "num_input_tokens_seen": 7400064, + "step": 35070 + }, + { + "epoch": 3.8586358635863585, + "grad_norm": 0.051461588591337204, + "learning_rate": 4.8696261070149605e-05, + "loss": 0.0563, + "num_input_tokens_seen": 7401088, + "step": 35075 + }, + { + "epoch": 3.859185918591859, + "grad_norm": 0.33684688806533813, + "learning_rate": 4.869549602228619e-05, + "loss": 0.0581, + "num_input_tokens_seen": 7402144, + "step": 35080 + }, + { + "epoch": 3.8597359735973598, + "grad_norm": 0.6053227782249451, + "learning_rate": 4.869473075603276e-05, + "loss": 0.0606, + "num_input_tokens_seen": 7403232, + "step": 35085 + }, + { + "epoch": 3.8602860286028604, + "grad_norm": 0.17434677481651306, + "learning_rate": 4.869396527139636e-05, + "loss": 0.0295, + "num_input_tokens_seen": 7404192, + "step": 35090 + }, + { + "epoch": 3.860836083608361, + "grad_norm": 0.05049332231283188, + "learning_rate": 4.869319956838405e-05, + "loss": 0.0626, + "num_input_tokens_seen": 7405248, + "step": 35095 + }, + { + "epoch": 3.8613861386138613, + "grad_norm": 0.3125370144844055, + "learning_rate": 4.8692433647002896e-05, + "loss": 0.0574, + "num_input_tokens_seen": 7406240, + "step": 35100 + }, + { + "epoch": 3.861936193619362, + "grad_norm": 0.206442728638649, + "learning_rate": 4.869166750725995e-05, + "loss": 0.0587, + "num_input_tokens_seen": 7407232, + "step": 35105 + }, + { + "epoch": 3.8624862486248626, + "grad_norm": 0.5576313138008118, + "learning_rate": 4.8690901149162266e-05, + "loss": 0.0865, + "num_input_tokens_seen": 7408320, + "step": 35110 + }, + { + "epoch": 3.8630363036303628, + "grad_norm": 0.16835996508598328, + "learning_rate": 4.869013457271692e-05, + "loss": 0.0952, + "num_input_tokens_seen": 7409344, + "step": 35115 + }, + { + "epoch": 3.863586358635864, + "grad_norm": 0.04844101890921593, + "learning_rate": 4.868936777793097e-05, + "loss": 0.018, + "num_input_tokens_seen": 7410400, + "step": 35120 + }, + { + "epoch": 3.864136413641364, + "grad_norm": 1.131164312362671, + "learning_rate": 4.868860076481148e-05, + "loss": 0.1414, + "num_input_tokens_seen": 7411456, + "step": 35125 + }, + { + "epoch": 3.8646864686468647, + "grad_norm": 0.10318519175052643, + "learning_rate": 4.868783353336552e-05, + "loss": 0.0311, + "num_input_tokens_seen": 7412480, + "step": 35130 + }, + { + "epoch": 3.8652365236523654, + "grad_norm": 0.11884687095880508, + "learning_rate": 4.868706608360018e-05, + "loss": 0.024, + "num_input_tokens_seen": 7413568, + "step": 35135 + }, + { + "epoch": 3.8657865786578656, + "grad_norm": 0.1105644553899765, + "learning_rate": 4.8686298415522505e-05, + "loss": 0.0142, + "num_input_tokens_seen": 7414560, + "step": 35140 + }, + { + "epoch": 3.866336633663366, + "grad_norm": 0.14902891218662262, + "learning_rate": 4.868553052913959e-05, + "loss": 0.0232, + "num_input_tokens_seen": 7415584, + "step": 35145 + }, + { + "epoch": 3.866886688668867, + "grad_norm": 0.692172110080719, + "learning_rate": 4.868476242445851e-05, + "loss": 0.054, + "num_input_tokens_seen": 7416640, + "step": 35150 + }, + { + "epoch": 3.8674367436743675, + "grad_norm": 0.1280910074710846, + "learning_rate": 4.868399410148633e-05, + "loss": 0.0521, + "num_input_tokens_seen": 7417728, + "step": 35155 + }, + { + "epoch": 3.867986798679868, + "grad_norm": 0.0841752216219902, + "learning_rate": 4.8683225560230136e-05, + "loss": 0.0317, + "num_input_tokens_seen": 7418816, + "step": 35160 + }, + { + "epoch": 3.8685368536853684, + "grad_norm": 0.02587210200726986, + "learning_rate": 4.8682456800697026e-05, + "loss": 0.0989, + "num_input_tokens_seen": 7419872, + "step": 35165 + }, + { + "epoch": 3.869086908690869, + "grad_norm": 0.39776933193206787, + "learning_rate": 4.8681687822894077e-05, + "loss": 0.0396, + "num_input_tokens_seen": 7420928, + "step": 35170 + }, + { + "epoch": 3.8696369636963697, + "grad_norm": 0.5607919096946716, + "learning_rate": 4.8680918626828375e-05, + "loss": 0.0366, + "num_input_tokens_seen": 7422016, + "step": 35175 + }, + { + "epoch": 3.8701870187018703, + "grad_norm": 0.2102186679840088, + "learning_rate": 4.8680149212507e-05, + "loss": 0.0923, + "num_input_tokens_seen": 7423008, + "step": 35180 + }, + { + "epoch": 3.870737073707371, + "grad_norm": 0.11529655754566193, + "learning_rate": 4.867937957993707e-05, + "loss": 0.0139, + "num_input_tokens_seen": 7424064, + "step": 35185 + }, + { + "epoch": 3.871287128712871, + "grad_norm": 0.14955322444438934, + "learning_rate": 4.8678609729125646e-05, + "loss": 0.1055, + "num_input_tokens_seen": 7425152, + "step": 35190 + }, + { + "epoch": 3.871837183718372, + "grad_norm": 0.3257471024990082, + "learning_rate": 4.8677839660079837e-05, + "loss": 0.0913, + "num_input_tokens_seen": 7426208, + "step": 35195 + }, + { + "epoch": 3.8723872387238725, + "grad_norm": 0.1432994306087494, + "learning_rate": 4.867706937280675e-05, + "loss": 0.1295, + "num_input_tokens_seen": 7427328, + "step": 35200 + }, + { + "epoch": 3.8729372937293727, + "grad_norm": 0.31375783681869507, + "learning_rate": 4.867629886731347e-05, + "loss": 0.058, + "num_input_tokens_seen": 7428352, + "step": 35205 + }, + { + "epoch": 3.8734873487348733, + "grad_norm": 0.6002492308616638, + "learning_rate": 4.8675528143607106e-05, + "loss": 0.0843, + "num_input_tokens_seen": 7429440, + "step": 35210 + }, + { + "epoch": 3.874037403740374, + "grad_norm": 0.2790611982345581, + "learning_rate": 4.8674757201694766e-05, + "loss": 0.0465, + "num_input_tokens_seen": 7430496, + "step": 35215 + }, + { + "epoch": 3.8745874587458746, + "grad_norm": 0.10852465778589249, + "learning_rate": 4.867398604158354e-05, + "loss": 0.0989, + "num_input_tokens_seen": 7431552, + "step": 35220 + }, + { + "epoch": 3.8751375137513753, + "grad_norm": 0.8880786299705505, + "learning_rate": 4.867321466328055e-05, + "loss": 0.0864, + "num_input_tokens_seen": 7432544, + "step": 35225 + }, + { + "epoch": 3.8756875687568755, + "grad_norm": 0.22030006349086761, + "learning_rate": 4.8672443066792894e-05, + "loss": 0.0564, + "num_input_tokens_seen": 7433536, + "step": 35230 + }, + { + "epoch": 3.876237623762376, + "grad_norm": 0.2655562162399292, + "learning_rate": 4.8671671252127696e-05, + "loss": 0.023, + "num_input_tokens_seen": 7434624, + "step": 35235 + }, + { + "epoch": 3.8767876787678768, + "grad_norm": 0.10753639042377472, + "learning_rate": 4.8670899219292065e-05, + "loss": 0.0167, + "num_input_tokens_seen": 7435744, + "step": 35240 + }, + { + "epoch": 3.8773377337733774, + "grad_norm": 0.9625088572502136, + "learning_rate": 4.8670126968293104e-05, + "loss": 0.111, + "num_input_tokens_seen": 7436768, + "step": 35245 + }, + { + "epoch": 3.877887788778878, + "grad_norm": 1.1174577474594116, + "learning_rate": 4.8669354499137955e-05, + "loss": 0.0875, + "num_input_tokens_seen": 7437824, + "step": 35250 + }, + { + "epoch": 3.8784378437843783, + "grad_norm": 0.0200837180018425, + "learning_rate": 4.866858181183371e-05, + "loss": 0.0916, + "num_input_tokens_seen": 7438880, + "step": 35255 + }, + { + "epoch": 3.878987898789879, + "grad_norm": 0.9190706610679626, + "learning_rate": 4.866780890638751e-05, + "loss": 0.1656, + "num_input_tokens_seen": 7440000, + "step": 35260 + }, + { + "epoch": 3.8795379537953796, + "grad_norm": 0.15580670535564423, + "learning_rate": 4.8667035782806466e-05, + "loss": 0.063, + "num_input_tokens_seen": 7441024, + "step": 35265 + }, + { + "epoch": 3.8800880088008802, + "grad_norm": 0.1290360391139984, + "learning_rate": 4.866626244109772e-05, + "loss": 0.0398, + "num_input_tokens_seen": 7442144, + "step": 35270 + }, + { + "epoch": 3.880638063806381, + "grad_norm": 1.0547573566436768, + "learning_rate": 4.866548888126838e-05, + "loss": 0.0685, + "num_input_tokens_seen": 7443104, + "step": 35275 + }, + { + "epoch": 3.881188118811881, + "grad_norm": 0.1763109266757965, + "learning_rate": 4.866471510332559e-05, + "loss": 0.1108, + "num_input_tokens_seen": 7444128, + "step": 35280 + }, + { + "epoch": 3.8817381738173817, + "grad_norm": 0.20666976273059845, + "learning_rate": 4.866394110727647e-05, + "loss": 0.0448, + "num_input_tokens_seen": 7445184, + "step": 35285 + }, + { + "epoch": 3.8822882288228824, + "grad_norm": 0.11598896980285645, + "learning_rate": 4.866316689312817e-05, + "loss": 0.0799, + "num_input_tokens_seen": 7446208, + "step": 35290 + }, + { + "epoch": 3.8828382838283826, + "grad_norm": 0.9924762845039368, + "learning_rate": 4.8662392460887806e-05, + "loss": 0.0663, + "num_input_tokens_seen": 7447232, + "step": 35295 + }, + { + "epoch": 3.8833883388338832, + "grad_norm": 0.2763696610927582, + "learning_rate": 4.866161781056253e-05, + "loss": 0.0664, + "num_input_tokens_seen": 7448288, + "step": 35300 + }, + { + "epoch": 3.883938393839384, + "grad_norm": 0.07881854474544525, + "learning_rate": 4.866084294215948e-05, + "loss": 0.0443, + "num_input_tokens_seen": 7449408, + "step": 35305 + }, + { + "epoch": 3.8844884488448845, + "grad_norm": 0.09066382795572281, + "learning_rate": 4.866006785568579e-05, + "loss": 0.0645, + "num_input_tokens_seen": 7450464, + "step": 35310 + }, + { + "epoch": 3.885038503850385, + "grad_norm": 0.34589043259620667, + "learning_rate": 4.8659292551148606e-05, + "loss": 0.0342, + "num_input_tokens_seen": 7451488, + "step": 35315 + }, + { + "epoch": 3.8855885588558854, + "grad_norm": 0.25391316413879395, + "learning_rate": 4.865851702855508e-05, + "loss": 0.0492, + "num_input_tokens_seen": 7452544, + "step": 35320 + }, + { + "epoch": 3.886138613861386, + "grad_norm": 0.048379190266132355, + "learning_rate": 4.865774128791235e-05, + "loss": 0.0869, + "num_input_tokens_seen": 7453632, + "step": 35325 + }, + { + "epoch": 3.8866886688668867, + "grad_norm": 0.33678925037384033, + "learning_rate": 4.8656965329227574e-05, + "loss": 0.0104, + "num_input_tokens_seen": 7454688, + "step": 35330 + }, + { + "epoch": 3.8872387238723873, + "grad_norm": 0.47756144404411316, + "learning_rate": 4.8656189152507896e-05, + "loss": 0.0536, + "num_input_tokens_seen": 7455808, + "step": 35335 + }, + { + "epoch": 3.887788778877888, + "grad_norm": 1.735068678855896, + "learning_rate": 4.8655412757760473e-05, + "loss": 0.1194, + "num_input_tokens_seen": 7456928, + "step": 35340 + }, + { + "epoch": 3.888338833883388, + "grad_norm": 0.0872650071978569, + "learning_rate": 4.865463614499247e-05, + "loss": 0.0523, + "num_input_tokens_seen": 7457952, + "step": 35345 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.43804341554641724, + "learning_rate": 4.865385931421104e-05, + "loss": 0.0501, + "num_input_tokens_seen": 7459072, + "step": 35350 + }, + { + "epoch": 3.8894389438943895, + "grad_norm": 0.06589958071708679, + "learning_rate": 4.8653082265423325e-05, + "loss": 0.0348, + "num_input_tokens_seen": 7460064, + "step": 35355 + }, + { + "epoch": 3.88998899889989, + "grad_norm": 0.06426676362752914, + "learning_rate": 4.86523049986365e-05, + "loss": 0.1074, + "num_input_tokens_seen": 7461088, + "step": 35360 + }, + { + "epoch": 3.890539053905391, + "grad_norm": 2.0864381790161133, + "learning_rate": 4.865152751385774e-05, + "loss": 0.0891, + "num_input_tokens_seen": 7462176, + "step": 35365 + }, + { + "epoch": 3.891089108910891, + "grad_norm": 0.8563281893730164, + "learning_rate": 4.86507498110942e-05, + "loss": 0.0666, + "num_input_tokens_seen": 7463232, + "step": 35370 + }, + { + "epoch": 3.8916391639163916, + "grad_norm": 0.16682589054107666, + "learning_rate": 4.864997189035304e-05, + "loss": 0.01, + "num_input_tokens_seen": 7464256, + "step": 35375 + }, + { + "epoch": 3.8921892189218923, + "grad_norm": 0.06144848093390465, + "learning_rate": 4.864919375164144e-05, + "loss": 0.1015, + "num_input_tokens_seen": 7465344, + "step": 35380 + }, + { + "epoch": 3.8927392739273925, + "grad_norm": 0.01858919858932495, + "learning_rate": 4.864841539496657e-05, + "loss": 0.0171, + "num_input_tokens_seen": 7466432, + "step": 35385 + }, + { + "epoch": 3.893289328932893, + "grad_norm": 0.8670295476913452, + "learning_rate": 4.8647636820335604e-05, + "loss": 0.0659, + "num_input_tokens_seen": 7467488, + "step": 35390 + }, + { + "epoch": 3.893839383938394, + "grad_norm": 0.13179118931293488, + "learning_rate": 4.864685802775571e-05, + "loss": 0.0206, + "num_input_tokens_seen": 7468544, + "step": 35395 + }, + { + "epoch": 3.8943894389438944, + "grad_norm": 0.22521156072616577, + "learning_rate": 4.864607901723408e-05, + "loss": 0.0154, + "num_input_tokens_seen": 7469632, + "step": 35400 + }, + { + "epoch": 3.894939493949395, + "grad_norm": 1.7003746032714844, + "learning_rate": 4.864529978877789e-05, + "loss": 0.131, + "num_input_tokens_seen": 7470720, + "step": 35405 + }, + { + "epoch": 3.8954895489548953, + "grad_norm": 0.33450043201446533, + "learning_rate": 4.8644520342394305e-05, + "loss": 0.0551, + "num_input_tokens_seen": 7471712, + "step": 35410 + }, + { + "epoch": 3.896039603960396, + "grad_norm": 1.2078434228897095, + "learning_rate": 4.864374067809053e-05, + "loss": 0.1458, + "num_input_tokens_seen": 7472768, + "step": 35415 + }, + { + "epoch": 3.8965896589658966, + "grad_norm": 1.3327698707580566, + "learning_rate": 4.864296079587374e-05, + "loss": 0.1204, + "num_input_tokens_seen": 7473888, + "step": 35420 + }, + { + "epoch": 3.8971397139713972, + "grad_norm": 0.7323826551437378, + "learning_rate": 4.8642180695751133e-05, + "loss": 0.1044, + "num_input_tokens_seen": 7474880, + "step": 35425 + }, + { + "epoch": 3.897689768976898, + "grad_norm": 0.38338252902030945, + "learning_rate": 4.864140037772988e-05, + "loss": 0.0462, + "num_input_tokens_seen": 7475936, + "step": 35430 + }, + { + "epoch": 3.898239823982398, + "grad_norm": 0.3137533664703369, + "learning_rate": 4.864061984181719e-05, + "loss": 0.0664, + "num_input_tokens_seen": 7477024, + "step": 35435 + }, + { + "epoch": 3.8987898789878987, + "grad_norm": 0.025875605642795563, + "learning_rate": 4.863983908802026e-05, + "loss": 0.0166, + "num_input_tokens_seen": 7478048, + "step": 35440 + }, + { + "epoch": 3.8993399339933994, + "grad_norm": 0.43287307024002075, + "learning_rate": 4.863905811634626e-05, + "loss": 0.0272, + "num_input_tokens_seen": 7479008, + "step": 35445 + }, + { + "epoch": 3.8998899889989, + "grad_norm": 0.28169164061546326, + "learning_rate": 4.86382769268024e-05, + "loss": 0.0333, + "num_input_tokens_seen": 7480064, + "step": 35450 + }, + { + "epoch": 3.9004400440044007, + "grad_norm": 0.05588331073522568, + "learning_rate": 4.86374955193959e-05, + "loss": 0.0492, + "num_input_tokens_seen": 7481120, + "step": 35455 + }, + { + "epoch": 3.900990099009901, + "grad_norm": 0.23421084880828857, + "learning_rate": 4.8636713894133946e-05, + "loss": 0.0322, + "num_input_tokens_seen": 7482176, + "step": 35460 + }, + { + "epoch": 3.9015401540154016, + "grad_norm": 0.05802890658378601, + "learning_rate": 4.8635932051023734e-05, + "loss": 0.0806, + "num_input_tokens_seen": 7483200, + "step": 35465 + }, + { + "epoch": 3.902090209020902, + "grad_norm": 0.03414979949593544, + "learning_rate": 4.8635149990072484e-05, + "loss": 0.0799, + "num_input_tokens_seen": 7484256, + "step": 35470 + }, + { + "epoch": 3.9026402640264024, + "grad_norm": 0.31344106793403625, + "learning_rate": 4.86343677112874e-05, + "loss": 0.0625, + "num_input_tokens_seen": 7485408, + "step": 35475 + }, + { + "epoch": 3.903190319031903, + "grad_norm": 0.2236911952495575, + "learning_rate": 4.863358521467568e-05, + "loss": 0.1066, + "num_input_tokens_seen": 7486464, + "step": 35480 + }, + { + "epoch": 3.9037403740374037, + "grad_norm": 0.014547479338943958, + "learning_rate": 4.863280250024455e-05, + "loss": 0.0277, + "num_input_tokens_seen": 7487520, + "step": 35485 + }, + { + "epoch": 3.9042904290429044, + "grad_norm": 0.6565249562263489, + "learning_rate": 4.863201956800122e-05, + "loss": 0.0895, + "num_input_tokens_seen": 7488544, + "step": 35490 + }, + { + "epoch": 3.904840484048405, + "grad_norm": 0.7214839458465576, + "learning_rate": 4.8631236417952905e-05, + "loss": 0.1767, + "num_input_tokens_seen": 7489568, + "step": 35495 + }, + { + "epoch": 3.905390539053905, + "grad_norm": 0.48828551173210144, + "learning_rate": 4.863045305010682e-05, + "loss": 0.0885, + "num_input_tokens_seen": 7490592, + "step": 35500 + }, + { + "epoch": 3.905940594059406, + "grad_norm": 0.4763168692588806, + "learning_rate": 4.862966946447019e-05, + "loss": 0.1074, + "num_input_tokens_seen": 7491648, + "step": 35505 + }, + { + "epoch": 3.9064906490649065, + "grad_norm": 0.9767311215400696, + "learning_rate": 4.862888566105024e-05, + "loss": 0.0894, + "num_input_tokens_seen": 7492672, + "step": 35510 + }, + { + "epoch": 3.907040704070407, + "grad_norm": 0.7118656039237976, + "learning_rate": 4.862810163985418e-05, + "loss": 0.1276, + "num_input_tokens_seen": 7493728, + "step": 35515 + }, + { + "epoch": 3.907590759075908, + "grad_norm": 0.14500348269939423, + "learning_rate": 4.862731740088926e-05, + "loss": 0.0378, + "num_input_tokens_seen": 7494752, + "step": 35520 + }, + { + "epoch": 3.908140814081408, + "grad_norm": 0.9076429009437561, + "learning_rate": 4.862653294416268e-05, + "loss": 0.0445, + "num_input_tokens_seen": 7495776, + "step": 35525 + }, + { + "epoch": 3.9086908690869087, + "grad_norm": 0.15643641352653503, + "learning_rate": 4.862574826968169e-05, + "loss": 0.0579, + "num_input_tokens_seen": 7496832, + "step": 35530 + }, + { + "epoch": 3.9092409240924093, + "grad_norm": 0.1203705370426178, + "learning_rate": 4.8624963377453516e-05, + "loss": 0.0824, + "num_input_tokens_seen": 7497856, + "step": 35535 + }, + { + "epoch": 3.9097909790979095, + "grad_norm": 0.5266221761703491, + "learning_rate": 4.862417826748538e-05, + "loss": 0.0398, + "num_input_tokens_seen": 7498944, + "step": 35540 + }, + { + "epoch": 3.9103410341034106, + "grad_norm": 0.03617171198129654, + "learning_rate": 4.862339293978454e-05, + "loss": 0.0581, + "num_input_tokens_seen": 7500000, + "step": 35545 + }, + { + "epoch": 3.910891089108911, + "grad_norm": 0.40838783979415894, + "learning_rate": 4.8622607394358215e-05, + "loss": 0.0245, + "num_input_tokens_seen": 7501088, + "step": 35550 + }, + { + "epoch": 3.9114411441144115, + "grad_norm": 0.053313542157411575, + "learning_rate": 4.862182163121366e-05, + "loss": 0.0412, + "num_input_tokens_seen": 7502176, + "step": 35555 + }, + { + "epoch": 3.911991199119912, + "grad_norm": 0.46751296520233154, + "learning_rate": 4.862103565035811e-05, + "loss": 0.0185, + "num_input_tokens_seen": 7503264, + "step": 35560 + }, + { + "epoch": 3.9125412541254123, + "grad_norm": 0.268678218126297, + "learning_rate": 4.86202494517988e-05, + "loss": 0.026, + "num_input_tokens_seen": 7504320, + "step": 35565 + }, + { + "epoch": 3.913091309130913, + "grad_norm": 0.2574843168258667, + "learning_rate": 4.861946303554299e-05, + "loss": 0.0208, + "num_input_tokens_seen": 7505344, + "step": 35570 + }, + { + "epoch": 3.9136413641364136, + "grad_norm": 0.9158257842063904, + "learning_rate": 4.861867640159792e-05, + "loss": 0.1064, + "num_input_tokens_seen": 7506432, + "step": 35575 + }, + { + "epoch": 3.9141914191419143, + "grad_norm": 0.02854250743985176, + "learning_rate": 4.861788954997085e-05, + "loss": 0.1211, + "num_input_tokens_seen": 7507488, + "step": 35580 + }, + { + "epoch": 3.914741474147415, + "grad_norm": 0.45108845829963684, + "learning_rate": 4.861710248066902e-05, + "loss": 0.1182, + "num_input_tokens_seen": 7508544, + "step": 35585 + }, + { + "epoch": 3.915291529152915, + "grad_norm": 0.1772163063287735, + "learning_rate": 4.861631519369969e-05, + "loss": 0.0753, + "num_input_tokens_seen": 7509536, + "step": 35590 + }, + { + "epoch": 3.9158415841584158, + "grad_norm": 0.2279951423406601, + "learning_rate": 4.8615527689070115e-05, + "loss": 0.0283, + "num_input_tokens_seen": 7510656, + "step": 35595 + }, + { + "epoch": 3.9163916391639164, + "grad_norm": 0.36250582337379456, + "learning_rate": 4.861473996678756e-05, + "loss": 0.0186, + "num_input_tokens_seen": 7511680, + "step": 35600 + }, + { + "epoch": 3.916941694169417, + "grad_norm": 1.1294234991073608, + "learning_rate": 4.861395202685927e-05, + "loss": 0.0591, + "num_input_tokens_seen": 7512704, + "step": 35605 + }, + { + "epoch": 3.9174917491749177, + "grad_norm": 0.11652929335832596, + "learning_rate": 4.861316386929251e-05, + "loss": 0.0306, + "num_input_tokens_seen": 7513792, + "step": 35610 + }, + { + "epoch": 3.918041804180418, + "grad_norm": 0.14047513902187347, + "learning_rate": 4.861237549409456e-05, + "loss": 0.0427, + "num_input_tokens_seen": 7514816, + "step": 35615 + }, + { + "epoch": 3.9185918591859186, + "grad_norm": 0.5515344142913818, + "learning_rate": 4.8611586901272664e-05, + "loss": 0.0966, + "num_input_tokens_seen": 7515872, + "step": 35620 + }, + { + "epoch": 3.919141914191419, + "grad_norm": 0.1461753249168396, + "learning_rate": 4.861079809083411e-05, + "loss": 0.0168, + "num_input_tokens_seen": 7516928, + "step": 35625 + }, + { + "epoch": 3.9196919691969194, + "grad_norm": 0.07741200178861618, + "learning_rate": 4.861000906278616e-05, + "loss": 0.0348, + "num_input_tokens_seen": 7517952, + "step": 35630 + }, + { + "epoch": 3.9202420242024205, + "grad_norm": 0.05305492505431175, + "learning_rate": 4.8609219817136087e-05, + "loss": 0.0663, + "num_input_tokens_seen": 7518944, + "step": 35635 + }, + { + "epoch": 3.9207920792079207, + "grad_norm": 0.04524233937263489, + "learning_rate": 4.860843035389116e-05, + "loss": 0.0302, + "num_input_tokens_seen": 7520032, + "step": 35640 + }, + { + "epoch": 3.9213421342134214, + "grad_norm": 0.4103178083896637, + "learning_rate": 4.860764067305866e-05, + "loss": 0.0254, + "num_input_tokens_seen": 7521088, + "step": 35645 + }, + { + "epoch": 3.921892189218922, + "grad_norm": 0.04247896000742912, + "learning_rate": 4.8606850774645864e-05, + "loss": 0.0719, + "num_input_tokens_seen": 7522112, + "step": 35650 + }, + { + "epoch": 3.9224422442244222, + "grad_norm": 0.10298437625169754, + "learning_rate": 4.860606065866005e-05, + "loss": 0.0225, + "num_input_tokens_seen": 7523168, + "step": 35655 + }, + { + "epoch": 3.922992299229923, + "grad_norm": 0.7660414576530457, + "learning_rate": 4.86052703251085e-05, + "loss": 0.0331, + "num_input_tokens_seen": 7524224, + "step": 35660 + }, + { + "epoch": 3.9235423542354235, + "grad_norm": 0.15294523537158966, + "learning_rate": 4.8604479773998515e-05, + "loss": 0.0504, + "num_input_tokens_seen": 7525312, + "step": 35665 + }, + { + "epoch": 3.924092409240924, + "grad_norm": 0.7064430117607117, + "learning_rate": 4.860368900533735e-05, + "loss": 0.0368, + "num_input_tokens_seen": 7526400, + "step": 35670 + }, + { + "epoch": 3.924642464246425, + "grad_norm": 0.15584757924079895, + "learning_rate": 4.8602898019132316e-05, + "loss": 0.0372, + "num_input_tokens_seen": 7527424, + "step": 35675 + }, + { + "epoch": 3.925192519251925, + "grad_norm": 0.08797377347946167, + "learning_rate": 4.860210681539069e-05, + "loss": 0.0287, + "num_input_tokens_seen": 7528512, + "step": 35680 + }, + { + "epoch": 3.9257425742574257, + "grad_norm": 0.07618129253387451, + "learning_rate": 4.860131539411978e-05, + "loss": 0.0233, + "num_input_tokens_seen": 7529536, + "step": 35685 + }, + { + "epoch": 3.9262926292629263, + "grad_norm": 0.005962054710835218, + "learning_rate": 4.8600523755326865e-05, + "loss": 0.1307, + "num_input_tokens_seen": 7530592, + "step": 35690 + }, + { + "epoch": 3.926842684268427, + "grad_norm": 0.17466798424720764, + "learning_rate": 4.859973189901925e-05, + "loss": 0.0185, + "num_input_tokens_seen": 7531584, + "step": 35695 + }, + { + "epoch": 3.9273927392739276, + "grad_norm": 0.03673221543431282, + "learning_rate": 4.859893982520424e-05, + "loss": 0.0267, + "num_input_tokens_seen": 7532576, + "step": 35700 + }, + { + "epoch": 3.927942794279428, + "grad_norm": 0.056860875338315964, + "learning_rate": 4.859814753388911e-05, + "loss": 0.0428, + "num_input_tokens_seen": 7533664, + "step": 35705 + }, + { + "epoch": 3.9284928492849285, + "grad_norm": 1.0821465253829956, + "learning_rate": 4.859735502508118e-05, + "loss": 0.1296, + "num_input_tokens_seen": 7534752, + "step": 35710 + }, + { + "epoch": 3.929042904290429, + "grad_norm": 1.234157919883728, + "learning_rate": 4.859656229878776e-05, + "loss": 0.0574, + "num_input_tokens_seen": 7535776, + "step": 35715 + }, + { + "epoch": 3.9295929592959293, + "grad_norm": 0.4062121510505676, + "learning_rate": 4.859576935501614e-05, + "loss": 0.0117, + "num_input_tokens_seen": 7536832, + "step": 35720 + }, + { + "epoch": 3.93014301430143, + "grad_norm": 0.007311516907066107, + "learning_rate": 4.8594976193773646e-05, + "loss": 0.0116, + "num_input_tokens_seen": 7537856, + "step": 35725 + }, + { + "epoch": 3.9306930693069306, + "grad_norm": 0.1341417282819748, + "learning_rate": 4.859418281506757e-05, + "loss": 0.1272, + "num_input_tokens_seen": 7538976, + "step": 35730 + }, + { + "epoch": 3.9312431243124313, + "grad_norm": 0.4065324068069458, + "learning_rate": 4.8593389218905234e-05, + "loss": 0.0404, + "num_input_tokens_seen": 7540096, + "step": 35735 + }, + { + "epoch": 3.931793179317932, + "grad_norm": 0.4599490165710449, + "learning_rate": 4.859259540529395e-05, + "loss": 0.0388, + "num_input_tokens_seen": 7541120, + "step": 35740 + }, + { + "epoch": 3.932343234323432, + "grad_norm": 0.2922661602497101, + "learning_rate": 4.859180137424104e-05, + "loss": 0.1331, + "num_input_tokens_seen": 7542208, + "step": 35745 + }, + { + "epoch": 3.932893289328933, + "grad_norm": 0.030027950182557106, + "learning_rate": 4.8591007125753807e-05, + "loss": 0.0183, + "num_input_tokens_seen": 7543232, + "step": 35750 + }, + { + "epoch": 3.9334433443344334, + "grad_norm": 0.26251065731048584, + "learning_rate": 4.859021265983959e-05, + "loss": 0.0618, + "num_input_tokens_seen": 7544256, + "step": 35755 + }, + { + "epoch": 3.933993399339934, + "grad_norm": 1.2048243284225464, + "learning_rate": 4.85894179765057e-05, + "loss": 0.0792, + "num_input_tokens_seen": 7545376, + "step": 35760 + }, + { + "epoch": 3.9345434543454347, + "grad_norm": 0.006112139206379652, + "learning_rate": 4.858862307575947e-05, + "loss": 0.0017, + "num_input_tokens_seen": 7546464, + "step": 35765 + }, + { + "epoch": 3.935093509350935, + "grad_norm": 0.1353079378604889, + "learning_rate": 4.858782795760821e-05, + "loss": 0.0312, + "num_input_tokens_seen": 7547520, + "step": 35770 + }, + { + "epoch": 3.9356435643564356, + "grad_norm": 0.636523425579071, + "learning_rate": 4.8587032622059263e-05, + "loss": 0.0609, + "num_input_tokens_seen": 7548544, + "step": 35775 + }, + { + "epoch": 3.9361936193619362, + "grad_norm": 0.11323250085115433, + "learning_rate": 4.8586237069119956e-05, + "loss": 0.0524, + "num_input_tokens_seen": 7549632, + "step": 35780 + }, + { + "epoch": 3.936743674367437, + "grad_norm": 0.28532716631889343, + "learning_rate": 4.858544129879762e-05, + "loss": 0.1745, + "num_input_tokens_seen": 7550656, + "step": 35785 + }, + { + "epoch": 3.9372937293729375, + "grad_norm": 0.7504904270172119, + "learning_rate": 4.858464531109959e-05, + "loss": 0.0492, + "num_input_tokens_seen": 7551712, + "step": 35790 + }, + { + "epoch": 3.9378437843784377, + "grad_norm": 0.24141106009483337, + "learning_rate": 4.858384910603319e-05, + "loss": 0.0948, + "num_input_tokens_seen": 7552768, + "step": 35795 + }, + { + "epoch": 3.9383938393839384, + "grad_norm": 0.08165079355239868, + "learning_rate": 4.858305268360578e-05, + "loss": 0.0416, + "num_input_tokens_seen": 7553792, + "step": 35800 + }, + { + "epoch": 3.938943894389439, + "grad_norm": 0.11781615018844604, + "learning_rate": 4.858225604382469e-05, + "loss": 0.0854, + "num_input_tokens_seen": 7554880, + "step": 35805 + }, + { + "epoch": 3.9394939493949392, + "grad_norm": 0.029546095058321953, + "learning_rate": 4.858145918669725e-05, + "loss": 0.0634, + "num_input_tokens_seen": 7555936, + "step": 35810 + }, + { + "epoch": 3.94004400440044, + "grad_norm": 0.07828821241855621, + "learning_rate": 4.858066211223083e-05, + "loss": 0.094, + "num_input_tokens_seen": 7556928, + "step": 35815 + }, + { + "epoch": 3.9405940594059405, + "grad_norm": 1.2615114450454712, + "learning_rate": 4.857986482043276e-05, + "loss": 0.1515, + "num_input_tokens_seen": 7557984, + "step": 35820 + }, + { + "epoch": 3.941144114411441, + "grad_norm": 0.1710413545370102, + "learning_rate": 4.857906731131039e-05, + "loss": 0.0665, + "num_input_tokens_seen": 7559072, + "step": 35825 + }, + { + "epoch": 3.941694169416942, + "grad_norm": 1.2022243738174438, + "learning_rate": 4.8578269584871063e-05, + "loss": 0.1232, + "num_input_tokens_seen": 7560160, + "step": 35830 + }, + { + "epoch": 3.942244224422442, + "grad_norm": 0.04557320475578308, + "learning_rate": 4.8577471641122146e-05, + "loss": 0.0312, + "num_input_tokens_seen": 7561184, + "step": 35835 + }, + { + "epoch": 3.9427942794279427, + "grad_norm": 1.2767155170440674, + "learning_rate": 4.857667348007099e-05, + "loss": 0.0454, + "num_input_tokens_seen": 7562208, + "step": 35840 + }, + { + "epoch": 3.9433443344334433, + "grad_norm": 0.716244637966156, + "learning_rate": 4.857587510172494e-05, + "loss": 0.0402, + "num_input_tokens_seen": 7563200, + "step": 35845 + }, + { + "epoch": 3.943894389438944, + "grad_norm": 0.5531171560287476, + "learning_rate": 4.857507650609137e-05, + "loss": 0.0263, + "num_input_tokens_seen": 7564256, + "step": 35850 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.016960082575678825, + "learning_rate": 4.857427769317762e-05, + "loss": 0.0231, + "num_input_tokens_seen": 7565312, + "step": 35855 + }, + { + "epoch": 3.944994499449945, + "grad_norm": 0.15648649632930756, + "learning_rate": 4.857347866299108e-05, + "loss": 0.0654, + "num_input_tokens_seen": 7566336, + "step": 35860 + }, + { + "epoch": 3.9455445544554455, + "grad_norm": 0.6364166140556335, + "learning_rate": 4.857267941553908e-05, + "loss": 0.0612, + "num_input_tokens_seen": 7567360, + "step": 35865 + }, + { + "epoch": 3.946094609460946, + "grad_norm": 0.5024822354316711, + "learning_rate": 4.8571879950829026e-05, + "loss": 0.0434, + "num_input_tokens_seen": 7568480, + "step": 35870 + }, + { + "epoch": 3.946644664466447, + "grad_norm": 0.028490541502833366, + "learning_rate": 4.857108026886825e-05, + "loss": 0.082, + "num_input_tokens_seen": 7569536, + "step": 35875 + }, + { + "epoch": 3.9471947194719474, + "grad_norm": 0.1907234638929367, + "learning_rate": 4.857028036966415e-05, + "loss": 0.0989, + "num_input_tokens_seen": 7570560, + "step": 35880 + }, + { + "epoch": 3.9477447744774476, + "grad_norm": 0.5940341949462891, + "learning_rate": 4.856948025322407e-05, + "loss": 0.0848, + "num_input_tokens_seen": 7571584, + "step": 35885 + }, + { + "epoch": 3.9482948294829483, + "grad_norm": 0.07712870836257935, + "learning_rate": 4.856867991955542e-05, + "loss": 0.026, + "num_input_tokens_seen": 7572672, + "step": 35890 + }, + { + "epoch": 3.948844884488449, + "grad_norm": 1.2026958465576172, + "learning_rate": 4.8567879368665536e-05, + "loss": 0.1273, + "num_input_tokens_seen": 7573728, + "step": 35895 + }, + { + "epoch": 3.949394939493949, + "grad_norm": 0.11771538108587265, + "learning_rate": 4.8567078600561834e-05, + "loss": 0.1662, + "num_input_tokens_seen": 7574752, + "step": 35900 + }, + { + "epoch": 3.94994499449945, + "grad_norm": 1.3351322412490845, + "learning_rate": 4.8566277615251665e-05, + "loss": 0.1173, + "num_input_tokens_seen": 7575840, + "step": 35905 + }, + { + "epoch": 3.9504950495049505, + "grad_norm": 0.5669898390769958, + "learning_rate": 4.856547641274243e-05, + "loss": 0.073, + "num_input_tokens_seen": 7576896, + "step": 35910 + }, + { + "epoch": 3.951045104510451, + "grad_norm": 0.21270769834518433, + "learning_rate": 4.856467499304151e-05, + "loss": 0.0338, + "num_input_tokens_seen": 7577952, + "step": 35915 + }, + { + "epoch": 3.9515951595159517, + "grad_norm": 0.2198595553636551, + "learning_rate": 4.856387335615628e-05, + "loss": 0.0123, + "num_input_tokens_seen": 7579008, + "step": 35920 + }, + { + "epoch": 3.952145214521452, + "grad_norm": 0.23067662119865417, + "learning_rate": 4.856307150209414e-05, + "loss": 0.0393, + "num_input_tokens_seen": 7580096, + "step": 35925 + }, + { + "epoch": 3.9526952695269526, + "grad_norm": 1.129990577697754, + "learning_rate": 4.8562269430862475e-05, + "loss": 0.1098, + "num_input_tokens_seen": 7581120, + "step": 35930 + }, + { + "epoch": 3.9532453245324533, + "grad_norm": 0.326744019985199, + "learning_rate": 4.8561467142468674e-05, + "loss": 0.0377, + "num_input_tokens_seen": 7582144, + "step": 35935 + }, + { + "epoch": 3.953795379537954, + "grad_norm": 0.5043174624443054, + "learning_rate": 4.856066463692015e-05, + "loss": 0.0382, + "num_input_tokens_seen": 7583168, + "step": 35940 + }, + { + "epoch": 3.9543454345434546, + "grad_norm": 0.8555014729499817, + "learning_rate": 4.855986191422427e-05, + "loss": 0.0384, + "num_input_tokens_seen": 7584192, + "step": 35945 + }, + { + "epoch": 3.9548954895489548, + "grad_norm": 0.3188510239124298, + "learning_rate": 4.855905897438846e-05, + "loss": 0.0193, + "num_input_tokens_seen": 7585248, + "step": 35950 + }, + { + "epoch": 3.9554455445544554, + "grad_norm": 0.10328482836484909, + "learning_rate": 4.855825581742011e-05, + "loss": 0.0699, + "num_input_tokens_seen": 7586272, + "step": 35955 + }, + { + "epoch": 3.955995599559956, + "grad_norm": 0.014384501613676548, + "learning_rate": 4.8557452443326615e-05, + "loss": 0.0858, + "num_input_tokens_seen": 7587328, + "step": 35960 + }, + { + "epoch": 3.9565456545654567, + "grad_norm": 1.269400715827942, + "learning_rate": 4.8556648852115375e-05, + "loss": 0.1078, + "num_input_tokens_seen": 7588384, + "step": 35965 + }, + { + "epoch": 3.9570957095709574, + "grad_norm": 0.2399905025959015, + "learning_rate": 4.8555845043793816e-05, + "loss": 0.088, + "num_input_tokens_seen": 7589440, + "step": 35970 + }, + { + "epoch": 3.9576457645764576, + "grad_norm": 0.016637573018670082, + "learning_rate": 4.855504101836934e-05, + "loss": 0.0165, + "num_input_tokens_seen": 7590496, + "step": 35975 + }, + { + "epoch": 3.958195819581958, + "grad_norm": 0.1174638494849205, + "learning_rate": 4.855423677584935e-05, + "loss": 0.1204, + "num_input_tokens_seen": 7591584, + "step": 35980 + }, + { + "epoch": 3.958745874587459, + "grad_norm": 0.029968850314617157, + "learning_rate": 4.855343231624125e-05, + "loss": 0.0085, + "num_input_tokens_seen": 7592608, + "step": 35985 + }, + { + "epoch": 3.959295929592959, + "grad_norm": 0.2575068771839142, + "learning_rate": 4.8552627639552485e-05, + "loss": 0.0651, + "num_input_tokens_seen": 7593696, + "step": 35990 + }, + { + "epoch": 3.9598459845984597, + "grad_norm": 0.30878961086273193, + "learning_rate": 4.8551822745790445e-05, + "loss": 0.0342, + "num_input_tokens_seen": 7594816, + "step": 35995 + }, + { + "epoch": 3.9603960396039604, + "grad_norm": 0.2515031099319458, + "learning_rate": 4.855101763496256e-05, + "loss": 0.0368, + "num_input_tokens_seen": 7595808, + "step": 36000 + }, + { + "epoch": 3.960946094609461, + "grad_norm": 0.09108522534370422, + "learning_rate": 4.855021230707624e-05, + "loss": 0.0789, + "num_input_tokens_seen": 7596864, + "step": 36005 + }, + { + "epoch": 3.9614961496149617, + "grad_norm": 0.04758673906326294, + "learning_rate": 4.854940676213892e-05, + "loss": 0.0486, + "num_input_tokens_seen": 7597920, + "step": 36010 + }, + { + "epoch": 3.962046204620462, + "grad_norm": 0.2293912023305893, + "learning_rate": 4.854860100015801e-05, + "loss": 0.0205, + "num_input_tokens_seen": 7598912, + "step": 36015 + }, + { + "epoch": 3.9625962596259625, + "grad_norm": 0.08027460426092148, + "learning_rate": 4.854779502114095e-05, + "loss": 0.0605, + "num_input_tokens_seen": 7599936, + "step": 36020 + }, + { + "epoch": 3.963146314631463, + "grad_norm": 0.5779051780700684, + "learning_rate": 4.854698882509516e-05, + "loss": 0.0729, + "num_input_tokens_seen": 7600992, + "step": 36025 + }, + { + "epoch": 3.963696369636964, + "grad_norm": 1.8135734796524048, + "learning_rate": 4.8546182412028075e-05, + "loss": 0.0709, + "num_input_tokens_seen": 7602016, + "step": 36030 + }, + { + "epoch": 3.9642464246424645, + "grad_norm": 1.806604027748108, + "learning_rate": 4.854537578194712e-05, + "loss": 0.0913, + "num_input_tokens_seen": 7603072, + "step": 36035 + }, + { + "epoch": 3.9647964796479647, + "grad_norm": 0.943705677986145, + "learning_rate": 4.854456893485975e-05, + "loss": 0.125, + "num_input_tokens_seen": 7604032, + "step": 36040 + }, + { + "epoch": 3.9653465346534653, + "grad_norm": 0.28299424052238464, + "learning_rate": 4.854376187077337e-05, + "loss": 0.0109, + "num_input_tokens_seen": 7605120, + "step": 36045 + }, + { + "epoch": 3.965896589658966, + "grad_norm": 0.04915713891386986, + "learning_rate": 4.854295458969544e-05, + "loss": 0.027, + "num_input_tokens_seen": 7606112, + "step": 36050 + }, + { + "epoch": 3.966446644664466, + "grad_norm": 0.3500480651855469, + "learning_rate": 4.854214709163339e-05, + "loss": 0.0316, + "num_input_tokens_seen": 7607168, + "step": 36055 + }, + { + "epoch": 3.9669966996699673, + "grad_norm": 0.07877923548221588, + "learning_rate": 4.854133937659467e-05, + "loss": 0.011, + "num_input_tokens_seen": 7608256, + "step": 36060 + }, + { + "epoch": 3.9675467546754675, + "grad_norm": 0.19934192299842834, + "learning_rate": 4.854053144458672e-05, + "loss": 0.0994, + "num_input_tokens_seen": 7609280, + "step": 36065 + }, + { + "epoch": 3.968096809680968, + "grad_norm": 0.24863247573375702, + "learning_rate": 4.8539723295616994e-05, + "loss": 0.1534, + "num_input_tokens_seen": 7610336, + "step": 36070 + }, + { + "epoch": 3.9686468646864688, + "grad_norm": 0.5774099826812744, + "learning_rate": 4.853891492969293e-05, + "loss": 0.1159, + "num_input_tokens_seen": 7611456, + "step": 36075 + }, + { + "epoch": 3.969196919691969, + "grad_norm": 1.7990020513534546, + "learning_rate": 4.853810634682198e-05, + "loss": 0.1144, + "num_input_tokens_seen": 7612576, + "step": 36080 + }, + { + "epoch": 3.9697469746974696, + "grad_norm": 0.6902977228164673, + "learning_rate": 4.85372975470116e-05, + "loss": 0.23, + "num_input_tokens_seen": 7613696, + "step": 36085 + }, + { + "epoch": 3.9702970297029703, + "grad_norm": 0.02422301471233368, + "learning_rate": 4.853648853026924e-05, + "loss": 0.1003, + "num_input_tokens_seen": 7614720, + "step": 36090 + }, + { + "epoch": 3.970847084708471, + "grad_norm": 0.28627654910087585, + "learning_rate": 4.853567929660237e-05, + "loss": 0.0574, + "num_input_tokens_seen": 7615776, + "step": 36095 + }, + { + "epoch": 3.9713971397139716, + "grad_norm": 0.06901717185974121, + "learning_rate": 4.853486984601843e-05, + "loss": 0.0101, + "num_input_tokens_seen": 7616800, + "step": 36100 + }, + { + "epoch": 3.9719471947194718, + "grad_norm": 0.9955426454544067, + "learning_rate": 4.853406017852488e-05, + "loss": 0.1147, + "num_input_tokens_seen": 7617920, + "step": 36105 + }, + { + "epoch": 3.9724972497249724, + "grad_norm": 0.12686988711357117, + "learning_rate": 4.85332502941292e-05, + "loss": 0.0293, + "num_input_tokens_seen": 7619008, + "step": 36110 + }, + { + "epoch": 3.973047304730473, + "grad_norm": 0.2383802831172943, + "learning_rate": 4.853244019283884e-05, + "loss": 0.0482, + "num_input_tokens_seen": 7620064, + "step": 36115 + }, + { + "epoch": 3.9735973597359737, + "grad_norm": 0.054983798414468765, + "learning_rate": 4.853162987466128e-05, + "loss": 0.0622, + "num_input_tokens_seen": 7621184, + "step": 36120 + }, + { + "epoch": 3.9741474147414744, + "grad_norm": 0.03447617590427399, + "learning_rate": 4.853081933960396e-05, + "loss": 0.0397, + "num_input_tokens_seen": 7622336, + "step": 36125 + }, + { + "epoch": 3.9746974697469746, + "grad_norm": 0.17498604953289032, + "learning_rate": 4.8530008587674386e-05, + "loss": 0.1568, + "num_input_tokens_seen": 7623456, + "step": 36130 + }, + { + "epoch": 3.9752475247524752, + "grad_norm": 0.12640219926834106, + "learning_rate": 4.852919761888001e-05, + "loss": 0.064, + "num_input_tokens_seen": 7624512, + "step": 36135 + }, + { + "epoch": 3.975797579757976, + "grad_norm": 0.021676907315850258, + "learning_rate": 4.8528386433228306e-05, + "loss": 0.0137, + "num_input_tokens_seen": 7625536, + "step": 36140 + }, + { + "epoch": 3.976347634763476, + "grad_norm": 0.06277130544185638, + "learning_rate": 4.8527575030726755e-05, + "loss": 0.0928, + "num_input_tokens_seen": 7626592, + "step": 36145 + }, + { + "epoch": 3.976897689768977, + "grad_norm": 0.031108101829886436, + "learning_rate": 4.852676341138284e-05, + "loss": 0.0768, + "num_input_tokens_seen": 7627680, + "step": 36150 + }, + { + "epoch": 3.9774477447744774, + "grad_norm": 0.06959528475999832, + "learning_rate": 4.852595157520403e-05, + "loss": 0.01, + "num_input_tokens_seen": 7628736, + "step": 36155 + }, + { + "epoch": 3.977997799779978, + "grad_norm": 0.1901085078716278, + "learning_rate": 4.852513952219782e-05, + "loss": 0.1131, + "num_input_tokens_seen": 7629728, + "step": 36160 + }, + { + "epoch": 3.9785478547854787, + "grad_norm": 0.3973475992679596, + "learning_rate": 4.852432725237168e-05, + "loss": 0.0452, + "num_input_tokens_seen": 7630816, + "step": 36165 + }, + { + "epoch": 3.979097909790979, + "grad_norm": 0.15438532829284668, + "learning_rate": 4.85235147657331e-05, + "loss": 0.045, + "num_input_tokens_seen": 7631872, + "step": 36170 + }, + { + "epoch": 3.9796479647964795, + "grad_norm": 0.0593186654150486, + "learning_rate": 4.8522702062289584e-05, + "loss": 0.0978, + "num_input_tokens_seen": 7632992, + "step": 36175 + }, + { + "epoch": 3.98019801980198, + "grad_norm": 0.22890162467956543, + "learning_rate": 4.85218891420486e-05, + "loss": 0.0324, + "num_input_tokens_seen": 7634048, + "step": 36180 + }, + { + "epoch": 3.980748074807481, + "grad_norm": 0.14625594019889832, + "learning_rate": 4.852107600501766e-05, + "loss": 0.0338, + "num_input_tokens_seen": 7635168, + "step": 36185 + }, + { + "epoch": 3.9812981298129815, + "grad_norm": 0.3238672614097595, + "learning_rate": 4.852026265120424e-05, + "loss": 0.0596, + "num_input_tokens_seen": 7636288, + "step": 36190 + }, + { + "epoch": 3.9818481848184817, + "grad_norm": 0.5447947978973389, + "learning_rate": 4.8519449080615855e-05, + "loss": 0.043, + "num_input_tokens_seen": 7637312, + "step": 36195 + }, + { + "epoch": 3.9823982398239823, + "grad_norm": 1.3546292781829834, + "learning_rate": 4.8518635293259987e-05, + "loss": 0.149, + "num_input_tokens_seen": 7638400, + "step": 36200 + }, + { + "epoch": 3.982948294829483, + "grad_norm": 0.201960951089859, + "learning_rate": 4.851782128914414e-05, + "loss": 0.019, + "num_input_tokens_seen": 7639488, + "step": 36205 + }, + { + "epoch": 3.9834983498349836, + "grad_norm": 0.1717589646577835, + "learning_rate": 4.8517007068275825e-05, + "loss": 0.0165, + "num_input_tokens_seen": 7640512, + "step": 36210 + }, + { + "epoch": 3.9840484048404843, + "grad_norm": 0.10323283076286316, + "learning_rate": 4.851619263066254e-05, + "loss": 0.0049, + "num_input_tokens_seen": 7641568, + "step": 36215 + }, + { + "epoch": 3.9845984598459845, + "grad_norm": 0.14582297205924988, + "learning_rate": 4.8515377976311795e-05, + "loss": 0.0231, + "num_input_tokens_seen": 7642592, + "step": 36220 + }, + { + "epoch": 3.985148514851485, + "grad_norm": 0.07683191448450089, + "learning_rate": 4.851456310523108e-05, + "loss": 0.078, + "num_input_tokens_seen": 7643648, + "step": 36225 + }, + { + "epoch": 3.985698569856986, + "grad_norm": 0.2506696283817291, + "learning_rate": 4.8513748017427934e-05, + "loss": 0.1433, + "num_input_tokens_seen": 7644768, + "step": 36230 + }, + { + "epoch": 3.986248624862486, + "grad_norm": 0.27157625555992126, + "learning_rate": 4.8512932712909856e-05, + "loss": 0.0775, + "num_input_tokens_seen": 7645856, + "step": 36235 + }, + { + "epoch": 3.9867986798679866, + "grad_norm": 1.1870640516281128, + "learning_rate": 4.8512117191684355e-05, + "loss": 0.0605, + "num_input_tokens_seen": 7646912, + "step": 36240 + }, + { + "epoch": 3.9873487348734873, + "grad_norm": 0.20590925216674805, + "learning_rate": 4.851130145375895e-05, + "loss": 0.1035, + "num_input_tokens_seen": 7647936, + "step": 36245 + }, + { + "epoch": 3.987898789878988, + "grad_norm": 2.4124717712402344, + "learning_rate": 4.851048549914117e-05, + "loss": 0.1561, + "num_input_tokens_seen": 7648960, + "step": 36250 + }, + { + "epoch": 3.9884488448844886, + "grad_norm": 0.24918517470359802, + "learning_rate": 4.8509669327838514e-05, + "loss": 0.1416, + "num_input_tokens_seen": 7649984, + "step": 36255 + }, + { + "epoch": 3.988998899889989, + "grad_norm": 0.5256307125091553, + "learning_rate": 4.850885293985853e-05, + "loss": 0.0277, + "num_input_tokens_seen": 7651040, + "step": 36260 + }, + { + "epoch": 3.9895489548954894, + "grad_norm": 0.15720264613628387, + "learning_rate": 4.850803633520872e-05, + "loss": 0.0276, + "num_input_tokens_seen": 7652128, + "step": 36265 + }, + { + "epoch": 3.99009900990099, + "grad_norm": 0.1263883113861084, + "learning_rate": 4.850721951389662e-05, + "loss": 0.0147, + "num_input_tokens_seen": 7653216, + "step": 36270 + }, + { + "epoch": 3.9906490649064907, + "grad_norm": 1.8725138902664185, + "learning_rate": 4.850640247592976e-05, + "loss": 0.1222, + "num_input_tokens_seen": 7654272, + "step": 36275 + }, + { + "epoch": 3.9911991199119914, + "grad_norm": 1.5507333278656006, + "learning_rate": 4.8505585221315666e-05, + "loss": 0.1314, + "num_input_tokens_seen": 7655328, + "step": 36280 + }, + { + "epoch": 3.9917491749174916, + "grad_norm": 0.21829847991466522, + "learning_rate": 4.850476775006188e-05, + "loss": 0.0473, + "num_input_tokens_seen": 7656416, + "step": 36285 + }, + { + "epoch": 3.9922992299229922, + "grad_norm": 0.030779486522078514, + "learning_rate": 4.8503950062175916e-05, + "loss": 0.0425, + "num_input_tokens_seen": 7657472, + "step": 36290 + }, + { + "epoch": 3.992849284928493, + "grad_norm": 1.1839408874511719, + "learning_rate": 4.850313215766533e-05, + "loss": 0.0794, + "num_input_tokens_seen": 7658464, + "step": 36295 + }, + { + "epoch": 3.9933993399339935, + "grad_norm": 1.5742789506912231, + "learning_rate": 4.850231403653766e-05, + "loss": 0.0972, + "num_input_tokens_seen": 7659488, + "step": 36300 + }, + { + "epoch": 3.993949394939494, + "grad_norm": 0.08531758189201355, + "learning_rate": 4.850149569880042e-05, + "loss": 0.0582, + "num_input_tokens_seen": 7660608, + "step": 36305 + }, + { + "epoch": 3.9944994499449944, + "grad_norm": 0.35372036695480347, + "learning_rate": 4.850067714446118e-05, + "loss": 0.0627, + "num_input_tokens_seen": 7661664, + "step": 36310 + }, + { + "epoch": 3.995049504950495, + "grad_norm": 0.05898810923099518, + "learning_rate": 4.849985837352749e-05, + "loss": 0.0409, + "num_input_tokens_seen": 7662688, + "step": 36315 + }, + { + "epoch": 3.9955995599559957, + "grad_norm": 0.08965074270963669, + "learning_rate": 4.8499039386006864e-05, + "loss": 0.0473, + "num_input_tokens_seen": 7663744, + "step": 36320 + }, + { + "epoch": 3.996149614961496, + "grad_norm": 0.03432779014110565, + "learning_rate": 4.849822018190687e-05, + "loss": 0.0296, + "num_input_tokens_seen": 7664832, + "step": 36325 + }, + { + "epoch": 3.9966996699669965, + "grad_norm": 1.3356657028198242, + "learning_rate": 4.8497400761235056e-05, + "loss": 0.1328, + "num_input_tokens_seen": 7665856, + "step": 36330 + }, + { + "epoch": 3.997249724972497, + "grad_norm": 0.020573168992996216, + "learning_rate": 4.8496581123998976e-05, + "loss": 0.0137, + "num_input_tokens_seen": 7666912, + "step": 36335 + }, + { + "epoch": 3.997799779977998, + "grad_norm": 0.032637789845466614, + "learning_rate": 4.8495761270206184e-05, + "loss": 0.0184, + "num_input_tokens_seen": 7667904, + "step": 36340 + }, + { + "epoch": 3.9983498349834985, + "grad_norm": 0.18689583241939545, + "learning_rate": 4.8494941199864235e-05, + "loss": 0.015, + "num_input_tokens_seen": 7669024, + "step": 36345 + }, + { + "epoch": 3.9988998899889987, + "grad_norm": 1.2763794660568237, + "learning_rate": 4.849412091298069e-05, + "loss": 0.0423, + "num_input_tokens_seen": 7670016, + "step": 36350 + }, + { + "epoch": 3.9994499449944994, + "grad_norm": 0.28466302156448364, + "learning_rate": 4.84933004095631e-05, + "loss": 0.1273, + "num_input_tokens_seen": 7671072, + "step": 36355 + }, + { + "epoch": 4.0, + "grad_norm": 2.119602680206299, + "learning_rate": 4.8492479689619036e-05, + "loss": 0.2799, + "num_input_tokens_seen": 7672000, + "step": 36360 + }, + { + "epoch": 4.0, + "eval_loss": 0.06972524523735046, + "eval_runtime": 36.9958, + "eval_samples_per_second": 109.202, + "eval_steps_per_second": 27.3, + "num_input_tokens_seen": 7672000, + "step": 36360 + }, + { + "epoch": 4.0005500550055, + "grad_norm": 0.04496987909078598, + "learning_rate": 4.849165875315606e-05, + "loss": 0.0687, + "num_input_tokens_seen": 7673024, + "step": 36365 + }, + { + "epoch": 4.001100110011001, + "grad_norm": 0.5262559056282043, + "learning_rate": 4.849083760018174e-05, + "loss": 0.0794, + "num_input_tokens_seen": 7674144, + "step": 36370 + }, + { + "epoch": 4.0016501650165015, + "grad_norm": 0.08886972069740295, + "learning_rate": 4.849001623070364e-05, + "loss": 0.1, + "num_input_tokens_seen": 7675168, + "step": 36375 + }, + { + "epoch": 4.002200220022003, + "grad_norm": 0.547118067741394, + "learning_rate": 4.848919464472933e-05, + "loss": 0.1086, + "num_input_tokens_seen": 7676192, + "step": 36380 + }, + { + "epoch": 4.002750275027503, + "grad_norm": 0.9582743048667908, + "learning_rate": 4.8488372842266386e-05, + "loss": 0.0462, + "num_input_tokens_seen": 7677312, + "step": 36385 + }, + { + "epoch": 4.003300330033003, + "grad_norm": 0.14140236377716064, + "learning_rate": 4.848755082332238e-05, + "loss": 0.0844, + "num_input_tokens_seen": 7678368, + "step": 36390 + }, + { + "epoch": 4.003850385038504, + "grad_norm": 0.13695397973060608, + "learning_rate": 4.848672858790488e-05, + "loss": 0.0649, + "num_input_tokens_seen": 7679488, + "step": 36395 + }, + { + "epoch": 4.004400440044004, + "grad_norm": 0.9121379852294922, + "learning_rate": 4.848590613602148e-05, + "loss": 0.0681, + "num_input_tokens_seen": 7680448, + "step": 36400 + }, + { + "epoch": 4.0049504950495045, + "grad_norm": 0.038429249078035355, + "learning_rate": 4.8485083467679756e-05, + "loss": 0.0152, + "num_input_tokens_seen": 7681472, + "step": 36405 + }, + { + "epoch": 4.005500550055006, + "grad_norm": 0.09333253651857376, + "learning_rate": 4.8484260582887284e-05, + "loss": 0.0267, + "num_input_tokens_seen": 7682592, + "step": 36410 + }, + { + "epoch": 4.006050605060506, + "grad_norm": 0.18130961060523987, + "learning_rate": 4.848343748165165e-05, + "loss": 0.008, + "num_input_tokens_seen": 7683616, + "step": 36415 + }, + { + "epoch": 4.006600660066007, + "grad_norm": 0.8457877039909363, + "learning_rate": 4.848261416398044e-05, + "loss": 0.0388, + "num_input_tokens_seen": 7684704, + "step": 36420 + }, + { + "epoch": 4.007150715071507, + "grad_norm": 0.3473375737667084, + "learning_rate": 4.848179062988125e-05, + "loss": 0.0133, + "num_input_tokens_seen": 7685760, + "step": 36425 + }, + { + "epoch": 4.007700770077007, + "grad_norm": 0.06129142642021179, + "learning_rate": 4.848096687936165e-05, + "loss": 0.067, + "num_input_tokens_seen": 7686912, + "step": 36430 + }, + { + "epoch": 4.008250825082508, + "grad_norm": 0.09000030905008316, + "learning_rate": 4.848014291242926e-05, + "loss": 0.0493, + "num_input_tokens_seen": 7688096, + "step": 36435 + }, + { + "epoch": 4.008800880088009, + "grad_norm": 1.0315152406692505, + "learning_rate": 4.847931872909166e-05, + "loss": 0.0666, + "num_input_tokens_seen": 7689152, + "step": 36440 + }, + { + "epoch": 4.00935093509351, + "grad_norm": 0.3236318528652191, + "learning_rate": 4.847849432935644e-05, + "loss": 0.0324, + "num_input_tokens_seen": 7690112, + "step": 36445 + }, + { + "epoch": 4.00990099009901, + "grad_norm": 0.7600762844085693, + "learning_rate": 4.84776697132312e-05, + "loss": 0.0838, + "num_input_tokens_seen": 7691200, + "step": 36450 + }, + { + "epoch": 4.01045104510451, + "grad_norm": 1.2114982604980469, + "learning_rate": 4.847684488072355e-05, + "loss": 0.1302, + "num_input_tokens_seen": 7692160, + "step": 36455 + }, + { + "epoch": 4.011001100110011, + "grad_norm": 0.1254706084728241, + "learning_rate": 4.847601983184108e-05, + "loss": 0.0304, + "num_input_tokens_seen": 7693248, + "step": 36460 + }, + { + "epoch": 4.011551155115511, + "grad_norm": 0.16896265745162964, + "learning_rate": 4.847519456659141e-05, + "loss": 0.037, + "num_input_tokens_seen": 7694240, + "step": 36465 + }, + { + "epoch": 4.0121012101210125, + "grad_norm": 0.025650355964899063, + "learning_rate": 4.847436908498213e-05, + "loss": 0.0219, + "num_input_tokens_seen": 7695392, + "step": 36470 + }, + { + "epoch": 4.012651265126513, + "grad_norm": 0.8085517287254333, + "learning_rate": 4.8473543387020846e-05, + "loss": 0.0887, + "num_input_tokens_seen": 7696512, + "step": 36475 + }, + { + "epoch": 4.013201320132013, + "grad_norm": 0.048011135309934616, + "learning_rate": 4.847271747271519e-05, + "loss": 0.087, + "num_input_tokens_seen": 7697536, + "step": 36480 + }, + { + "epoch": 4.013751375137514, + "grad_norm": 0.16164761781692505, + "learning_rate": 4.847189134207275e-05, + "loss": 0.0566, + "num_input_tokens_seen": 7698624, + "step": 36485 + }, + { + "epoch": 4.014301430143014, + "grad_norm": 1.3794152736663818, + "learning_rate": 4.847106499510116e-05, + "loss": 0.13, + "num_input_tokens_seen": 7699776, + "step": 36490 + }, + { + "epoch": 4.014851485148514, + "grad_norm": 0.48300784826278687, + "learning_rate": 4.8470238431808024e-05, + "loss": 0.0334, + "num_input_tokens_seen": 7700800, + "step": 36495 + }, + { + "epoch": 4.0154015401540155, + "grad_norm": 0.781069278717041, + "learning_rate": 4.846941165220096e-05, + "loss": 0.0688, + "num_input_tokens_seen": 7701888, + "step": 36500 + }, + { + "epoch": 4.015951595159516, + "grad_norm": 0.05097315087914467, + "learning_rate": 4.8468584656287594e-05, + "loss": 0.0508, + "num_input_tokens_seen": 7703008, + "step": 36505 + }, + { + "epoch": 4.016501650165017, + "grad_norm": 0.030383436009287834, + "learning_rate": 4.8467757444075535e-05, + "loss": 0.0308, + "num_input_tokens_seen": 7704000, + "step": 36510 + }, + { + "epoch": 4.017051705170517, + "grad_norm": 0.12438145279884338, + "learning_rate": 4.846693001557243e-05, + "loss": 0.0324, + "num_input_tokens_seen": 7705056, + "step": 36515 + }, + { + "epoch": 4.017601760176017, + "grad_norm": 0.675349771976471, + "learning_rate": 4.846610237078588e-05, + "loss": 0.0356, + "num_input_tokens_seen": 7706176, + "step": 36520 + }, + { + "epoch": 4.018151815181518, + "grad_norm": 0.16042640805244446, + "learning_rate": 4.846527450972353e-05, + "loss": 0.0265, + "num_input_tokens_seen": 7707264, + "step": 36525 + }, + { + "epoch": 4.0187018701870185, + "grad_norm": 0.6569885611534119, + "learning_rate": 4.846444643239301e-05, + "loss": 0.0682, + "num_input_tokens_seen": 7708384, + "step": 36530 + }, + { + "epoch": 4.01925192519252, + "grad_norm": 0.053266849368810654, + "learning_rate": 4.846361813880194e-05, + "loss": 0.0586, + "num_input_tokens_seen": 7709408, + "step": 36535 + }, + { + "epoch": 4.01980198019802, + "grad_norm": 1.286417841911316, + "learning_rate": 4.846278962895795e-05, + "loss": 0.0641, + "num_input_tokens_seen": 7710464, + "step": 36540 + }, + { + "epoch": 4.02035203520352, + "grad_norm": 0.04782189801335335, + "learning_rate": 4.84619609028687e-05, + "loss": 0.0606, + "num_input_tokens_seen": 7711520, + "step": 36545 + }, + { + "epoch": 4.020902090209021, + "grad_norm": 0.49463751912117004, + "learning_rate": 4.8461131960541814e-05, + "loss": 0.0358, + "num_input_tokens_seen": 7712608, + "step": 36550 + }, + { + "epoch": 4.021452145214521, + "grad_norm": 0.021257469430565834, + "learning_rate": 4.846030280198493e-05, + "loss": 0.0322, + "num_input_tokens_seen": 7713664, + "step": 36555 + }, + { + "epoch": 4.022002200220022, + "grad_norm": 0.9105276465415955, + "learning_rate": 4.8459473427205694e-05, + "loss": 0.0648, + "num_input_tokens_seen": 7714720, + "step": 36560 + }, + { + "epoch": 4.022552255225523, + "grad_norm": 0.7612142562866211, + "learning_rate": 4.845864383621175e-05, + "loss": 0.1092, + "num_input_tokens_seen": 7715776, + "step": 36565 + }, + { + "epoch": 4.023102310231023, + "grad_norm": 0.31990230083465576, + "learning_rate": 4.8457814029010737e-05, + "loss": 0.0273, + "num_input_tokens_seen": 7716832, + "step": 36570 + }, + { + "epoch": 4.023652365236524, + "grad_norm": 1.1231887340545654, + "learning_rate": 4.845698400561031e-05, + "loss": 0.0749, + "num_input_tokens_seen": 7717824, + "step": 36575 + }, + { + "epoch": 4.024202420242024, + "grad_norm": 0.6252363920211792, + "learning_rate": 4.845615376601812e-05, + "loss": 0.0289, + "num_input_tokens_seen": 7718848, + "step": 36580 + }, + { + "epoch": 4.024752475247524, + "grad_norm": 0.18384331464767456, + "learning_rate": 4.845532331024181e-05, + "loss": 0.0491, + "num_input_tokens_seen": 7719936, + "step": 36585 + }, + { + "epoch": 4.025302530253025, + "grad_norm": 0.04781119525432587, + "learning_rate": 4.845449263828904e-05, + "loss": 0.0534, + "num_input_tokens_seen": 7721024, + "step": 36590 + }, + { + "epoch": 4.025852585258526, + "grad_norm": 0.944206714630127, + "learning_rate": 4.845366175016748e-05, + "loss": 0.1104, + "num_input_tokens_seen": 7722080, + "step": 36595 + }, + { + "epoch": 4.026402640264027, + "grad_norm": 0.31674256920814514, + "learning_rate": 4.845283064588476e-05, + "loss": 0.0449, + "num_input_tokens_seen": 7723168, + "step": 36600 + }, + { + "epoch": 4.026952695269527, + "grad_norm": 1.0712990760803223, + "learning_rate": 4.845199932544856e-05, + "loss": 0.068, + "num_input_tokens_seen": 7724160, + "step": 36605 + }, + { + "epoch": 4.027502750275027, + "grad_norm": 0.032257627695798874, + "learning_rate": 4.845116778886653e-05, + "loss": 0.0543, + "num_input_tokens_seen": 7725184, + "step": 36610 + }, + { + "epoch": 4.028052805280528, + "grad_norm": 1.0560466051101685, + "learning_rate": 4.8450336036146336e-05, + "loss": 0.0681, + "num_input_tokens_seen": 7726240, + "step": 36615 + }, + { + "epoch": 4.028602860286028, + "grad_norm": 1.71182119846344, + "learning_rate": 4.844950406729566e-05, + "loss": 0.1198, + "num_input_tokens_seen": 7727360, + "step": 36620 + }, + { + "epoch": 4.0291529152915295, + "grad_norm": 0.406156063079834, + "learning_rate": 4.844867188232215e-05, + "loss": 0.0594, + "num_input_tokens_seen": 7728416, + "step": 36625 + }, + { + "epoch": 4.02970297029703, + "grad_norm": 1.2061753273010254, + "learning_rate": 4.844783948123348e-05, + "loss": 0.0991, + "num_input_tokens_seen": 7729408, + "step": 36630 + }, + { + "epoch": 4.03025302530253, + "grad_norm": 0.05236577242612839, + "learning_rate": 4.844700686403733e-05, + "loss": 0.0058, + "num_input_tokens_seen": 7730432, + "step": 36635 + }, + { + "epoch": 4.030803080308031, + "grad_norm": 0.78631991147995, + "learning_rate": 4.844617403074137e-05, + "loss": 0.05, + "num_input_tokens_seen": 7731456, + "step": 36640 + }, + { + "epoch": 4.031353135313531, + "grad_norm": 0.07496096938848495, + "learning_rate": 4.844534098135327e-05, + "loss": 0.0666, + "num_input_tokens_seen": 7732512, + "step": 36645 + }, + { + "epoch": 4.031903190319032, + "grad_norm": 0.16185593605041504, + "learning_rate": 4.8444507715880716e-05, + "loss": 0.0594, + "num_input_tokens_seen": 7733536, + "step": 36650 + }, + { + "epoch": 4.0324532453245325, + "grad_norm": 0.025518210604786873, + "learning_rate": 4.844367423433138e-05, + "loss": 0.024, + "num_input_tokens_seen": 7734624, + "step": 36655 + }, + { + "epoch": 4.033003300330033, + "grad_norm": 1.401997447013855, + "learning_rate": 4.844284053671295e-05, + "loss": 0.0404, + "num_input_tokens_seen": 7735712, + "step": 36660 + }, + { + "epoch": 4.033553355335534, + "grad_norm": 0.029871482402086258, + "learning_rate": 4.844200662303311e-05, + "loss": 0.0301, + "num_input_tokens_seen": 7736832, + "step": 36665 + }, + { + "epoch": 4.034103410341034, + "grad_norm": 0.10484012216329575, + "learning_rate": 4.844117249329955e-05, + "loss": 0.0884, + "num_input_tokens_seen": 7737920, + "step": 36670 + }, + { + "epoch": 4.034653465346534, + "grad_norm": 0.35537055134773254, + "learning_rate": 4.844033814751994e-05, + "loss": 0.0131, + "num_input_tokens_seen": 7738976, + "step": 36675 + }, + { + "epoch": 4.035203520352035, + "grad_norm": 0.12142220139503479, + "learning_rate": 4.843950358570198e-05, + "loss": 0.1108, + "num_input_tokens_seen": 7740032, + "step": 36680 + }, + { + "epoch": 4.0357535753575355, + "grad_norm": 0.025061847642064095, + "learning_rate": 4.843866880785337e-05, + "loss": 0.0619, + "num_input_tokens_seen": 7741056, + "step": 36685 + }, + { + "epoch": 4.036303630363037, + "grad_norm": 0.06855930387973785, + "learning_rate": 4.843783381398179e-05, + "loss": 0.0071, + "num_input_tokens_seen": 7742080, + "step": 36690 + }, + { + "epoch": 4.036853685368537, + "grad_norm": 0.0412774458527565, + "learning_rate": 4.8436998604094945e-05, + "loss": 0.0466, + "num_input_tokens_seen": 7743136, + "step": 36695 + }, + { + "epoch": 4.037403740374037, + "grad_norm": 0.25835999846458435, + "learning_rate": 4.8436163178200534e-05, + "loss": 0.0511, + "num_input_tokens_seen": 7744256, + "step": 36700 + }, + { + "epoch": 4.037953795379538, + "grad_norm": 0.06363888829946518, + "learning_rate": 4.843532753630625e-05, + "loss": 0.003, + "num_input_tokens_seen": 7745344, + "step": 36705 + }, + { + "epoch": 4.038503850385038, + "grad_norm": 0.34422212839126587, + "learning_rate": 4.843449167841979e-05, + "loss": 0.033, + "num_input_tokens_seen": 7746336, + "step": 36710 + }, + { + "epoch": 4.039053905390539, + "grad_norm": 0.05519632622599602, + "learning_rate": 4.8433655604548875e-05, + "loss": 0.0204, + "num_input_tokens_seen": 7747360, + "step": 36715 + }, + { + "epoch": 4.03960396039604, + "grad_norm": 0.1783362776041031, + "learning_rate": 4.84328193147012e-05, + "loss": 0.0416, + "num_input_tokens_seen": 7748384, + "step": 36720 + }, + { + "epoch": 4.04015401540154, + "grad_norm": 0.05315731093287468, + "learning_rate": 4.843198280888447e-05, + "loss": 0.0482, + "num_input_tokens_seen": 7749408, + "step": 36725 + }, + { + "epoch": 4.040704070407041, + "grad_norm": 0.04208224639296532, + "learning_rate": 4.8431146087106405e-05, + "loss": 0.0699, + "num_input_tokens_seen": 7750432, + "step": 36730 + }, + { + "epoch": 4.041254125412541, + "grad_norm": 0.44827401638031006, + "learning_rate": 4.8430309149374695e-05, + "loss": 0.0363, + "num_input_tokens_seen": 7751424, + "step": 36735 + }, + { + "epoch": 4.041804180418042, + "grad_norm": 0.39975735545158386, + "learning_rate": 4.8429471995697084e-05, + "loss": 0.0321, + "num_input_tokens_seen": 7752480, + "step": 36740 + }, + { + "epoch": 4.042354235423542, + "grad_norm": 0.3087228834629059, + "learning_rate": 4.842863462608127e-05, + "loss": 0.0735, + "num_input_tokens_seen": 7753504, + "step": 36745 + }, + { + "epoch": 4.042904290429043, + "grad_norm": 0.031709011644124985, + "learning_rate": 4.842779704053497e-05, + "loss": 0.0552, + "num_input_tokens_seen": 7754624, + "step": 36750 + }, + { + "epoch": 4.043454345434544, + "grad_norm": 0.09125924110412598, + "learning_rate": 4.8426959239065905e-05, + "loss": 0.0075, + "num_input_tokens_seen": 7755680, + "step": 36755 + }, + { + "epoch": 4.044004400440044, + "grad_norm": 0.5877514481544495, + "learning_rate": 4.8426121221681795e-05, + "loss": 0.1002, + "num_input_tokens_seen": 7756736, + "step": 36760 + }, + { + "epoch": 4.044554455445544, + "grad_norm": 0.04664718359708786, + "learning_rate": 4.8425282988390376e-05, + "loss": 0.0753, + "num_input_tokens_seen": 7757760, + "step": 36765 + }, + { + "epoch": 4.045104510451045, + "grad_norm": 0.15040917694568634, + "learning_rate": 4.842444453919935e-05, + "loss": 0.0155, + "num_input_tokens_seen": 7758816, + "step": 36770 + }, + { + "epoch": 4.0456545654565454, + "grad_norm": 0.5175292491912842, + "learning_rate": 4.8423605874116475e-05, + "loss": 0.0239, + "num_input_tokens_seen": 7759904, + "step": 36775 + }, + { + "epoch": 4.0462046204620465, + "grad_norm": 1.8837298154830933, + "learning_rate": 4.842276699314945e-05, + "loss": 0.0555, + "num_input_tokens_seen": 7760960, + "step": 36780 + }, + { + "epoch": 4.046754675467547, + "grad_norm": 0.01879741996526718, + "learning_rate": 4.842192789630603e-05, + "loss": 0.0551, + "num_input_tokens_seen": 7761984, + "step": 36785 + }, + { + "epoch": 4.047304730473047, + "grad_norm": 0.8282031416893005, + "learning_rate": 4.842108858359394e-05, + "loss": 0.0718, + "num_input_tokens_seen": 7763072, + "step": 36790 + }, + { + "epoch": 4.047854785478548, + "grad_norm": 0.6896113157272339, + "learning_rate": 4.842024905502091e-05, + "loss": 0.0212, + "num_input_tokens_seen": 7764160, + "step": 36795 + }, + { + "epoch": 4.048404840484048, + "grad_norm": 0.009807427413761616, + "learning_rate": 4.841940931059469e-05, + "loss": 0.0781, + "num_input_tokens_seen": 7765248, + "step": 36800 + }, + { + "epoch": 4.048954895489549, + "grad_norm": 0.1304619461297989, + "learning_rate": 4.8418569350323e-05, + "loss": 0.0193, + "num_input_tokens_seen": 7766304, + "step": 36805 + }, + { + "epoch": 4.0495049504950495, + "grad_norm": 1.084182620048523, + "learning_rate": 4.84177291742136e-05, + "loss": 0.11, + "num_input_tokens_seen": 7767360, + "step": 36810 + }, + { + "epoch": 4.05005500550055, + "grad_norm": 0.07631640881299973, + "learning_rate": 4.841688878227423e-05, + "loss": 0.0377, + "num_input_tokens_seen": 7768416, + "step": 36815 + }, + { + "epoch": 4.050605060506051, + "grad_norm": 0.17842181026935577, + "learning_rate": 4.841604817451263e-05, + "loss": 0.0387, + "num_input_tokens_seen": 7769440, + "step": 36820 + }, + { + "epoch": 4.051155115511551, + "grad_norm": 0.038545265793800354, + "learning_rate": 4.8415207350936554e-05, + "loss": 0.0336, + "num_input_tokens_seen": 7770528, + "step": 36825 + }, + { + "epoch": 4.051705170517051, + "grad_norm": 0.030440011993050575, + "learning_rate": 4.8414366311553736e-05, + "loss": 0.0188, + "num_input_tokens_seen": 7771552, + "step": 36830 + }, + { + "epoch": 4.052255225522552, + "grad_norm": 0.2546082139015198, + "learning_rate": 4.8413525056371955e-05, + "loss": 0.0225, + "num_input_tokens_seen": 7772608, + "step": 36835 + }, + { + "epoch": 4.052805280528053, + "grad_norm": 0.035465069115161896, + "learning_rate": 4.8412683585398935e-05, + "loss": 0.0873, + "num_input_tokens_seen": 7773664, + "step": 36840 + }, + { + "epoch": 4.053355335533554, + "grad_norm": 0.5537311434745789, + "learning_rate": 4.841184189864245e-05, + "loss": 0.0939, + "num_input_tokens_seen": 7774720, + "step": 36845 + }, + { + "epoch": 4.053905390539054, + "grad_norm": 0.8933922648429871, + "learning_rate": 4.8410999996110256e-05, + "loss": 0.0701, + "num_input_tokens_seen": 7775680, + "step": 36850 + }, + { + "epoch": 4.054455445544554, + "grad_norm": 0.18373779952526093, + "learning_rate": 4.84101578778101e-05, + "loss": 0.0218, + "num_input_tokens_seen": 7776768, + "step": 36855 + }, + { + "epoch": 4.055005500550055, + "grad_norm": 0.01689859852194786, + "learning_rate": 4.840931554374976e-05, + "loss": 0.0814, + "num_input_tokens_seen": 7777824, + "step": 36860 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.0171556044369936, + "learning_rate": 4.8408472993936994e-05, + "loss": 0.1534, + "num_input_tokens_seen": 7778880, + "step": 36865 + }, + { + "epoch": 4.0561056105610565, + "grad_norm": 0.8540180921554565, + "learning_rate": 4.840763022837956e-05, + "loss": 0.0948, + "num_input_tokens_seen": 7779936, + "step": 36870 + }, + { + "epoch": 4.056655665566557, + "grad_norm": 0.5238032341003418, + "learning_rate": 4.8406787247085226e-05, + "loss": 0.0457, + "num_input_tokens_seen": 7781024, + "step": 36875 + }, + { + "epoch": 4.057205720572057, + "grad_norm": 0.06388774514198303, + "learning_rate": 4.840594405006177e-05, + "loss": 0.0648, + "num_input_tokens_seen": 7782080, + "step": 36880 + }, + { + "epoch": 4.057755775577558, + "grad_norm": 0.10634969919919968, + "learning_rate": 4.8405100637316956e-05, + "loss": 0.0747, + "num_input_tokens_seen": 7783232, + "step": 36885 + }, + { + "epoch": 4.058305830583058, + "grad_norm": 1.2486708164215088, + "learning_rate": 4.8404257008858564e-05, + "loss": 0.1225, + "num_input_tokens_seen": 7784256, + "step": 36890 + }, + { + "epoch": 4.058855885588559, + "grad_norm": 0.40838873386383057, + "learning_rate": 4.840341316469437e-05, + "loss": 0.2069, + "num_input_tokens_seen": 7785376, + "step": 36895 + }, + { + "epoch": 4.0594059405940595, + "grad_norm": 0.02196383848786354, + "learning_rate": 4.840256910483214e-05, + "loss": 0.06, + "num_input_tokens_seen": 7786432, + "step": 36900 + }, + { + "epoch": 4.05995599559956, + "grad_norm": 0.1947585493326187, + "learning_rate": 4.840172482927966e-05, + "loss": 0.1058, + "num_input_tokens_seen": 7787424, + "step": 36905 + }, + { + "epoch": 4.060506050605061, + "grad_norm": 0.1582043468952179, + "learning_rate": 4.840088033804472e-05, + "loss": 0.0248, + "num_input_tokens_seen": 7788448, + "step": 36910 + }, + { + "epoch": 4.061056105610561, + "grad_norm": 0.08596961200237274, + "learning_rate": 4.8400035631135084e-05, + "loss": 0.0432, + "num_input_tokens_seen": 7789472, + "step": 36915 + }, + { + "epoch": 4.061606160616061, + "grad_norm": 0.05259429290890694, + "learning_rate": 4.839919070855855e-05, + "loss": 0.0439, + "num_input_tokens_seen": 7790560, + "step": 36920 + }, + { + "epoch": 4.062156215621562, + "grad_norm": 0.11086300015449524, + "learning_rate": 4.8398345570322906e-05, + "loss": 0.031, + "num_input_tokens_seen": 7791616, + "step": 36925 + }, + { + "epoch": 4.0627062706270625, + "grad_norm": 0.4553774297237396, + "learning_rate": 4.839750021643594e-05, + "loss": 0.0223, + "num_input_tokens_seen": 7792608, + "step": 36930 + }, + { + "epoch": 4.063256325632564, + "grad_norm": 0.6035534739494324, + "learning_rate": 4.839665464690544e-05, + "loss": 0.0416, + "num_input_tokens_seen": 7793664, + "step": 36935 + }, + { + "epoch": 4.063806380638064, + "grad_norm": 0.5933160781860352, + "learning_rate": 4.83958088617392e-05, + "loss": 0.0246, + "num_input_tokens_seen": 7794656, + "step": 36940 + }, + { + "epoch": 4.064356435643564, + "grad_norm": 0.8565084934234619, + "learning_rate": 4.839496286094501e-05, + "loss": 0.1206, + "num_input_tokens_seen": 7795680, + "step": 36945 + }, + { + "epoch": 4.064906490649065, + "grad_norm": 0.03594468906521797, + "learning_rate": 4.839411664453068e-05, + "loss": 0.0142, + "num_input_tokens_seen": 7796736, + "step": 36950 + }, + { + "epoch": 4.065456545654565, + "grad_norm": 0.02382517047226429, + "learning_rate": 4.8393270212504006e-05, + "loss": 0.0952, + "num_input_tokens_seen": 7797728, + "step": 36955 + }, + { + "epoch": 4.066006600660066, + "grad_norm": 0.10508637875318527, + "learning_rate": 4.839242356487278e-05, + "loss": 0.0217, + "num_input_tokens_seen": 7798720, + "step": 36960 + }, + { + "epoch": 4.066556655665567, + "grad_norm": 0.015554584562778473, + "learning_rate": 4.839157670164481e-05, + "loss": 0.0682, + "num_input_tokens_seen": 7799744, + "step": 36965 + }, + { + "epoch": 4.067106710671067, + "grad_norm": 1.3104249238967896, + "learning_rate": 4.839072962282791e-05, + "loss": 0.0618, + "num_input_tokens_seen": 7800832, + "step": 36970 + }, + { + "epoch": 4.067656765676568, + "grad_norm": 0.37437111139297485, + "learning_rate": 4.838988232842987e-05, + "loss": 0.0496, + "num_input_tokens_seen": 7801856, + "step": 36975 + }, + { + "epoch": 4.068206820682068, + "grad_norm": 0.301085501909256, + "learning_rate": 4.8389034818458514e-05, + "loss": 0.0345, + "num_input_tokens_seen": 7802912, + "step": 36980 + }, + { + "epoch": 4.068756875687569, + "grad_norm": 0.12351302057504654, + "learning_rate": 4.8388187092921645e-05, + "loss": 0.1839, + "num_input_tokens_seen": 7803872, + "step": 36985 + }, + { + "epoch": 4.069306930693069, + "grad_norm": 0.028123609721660614, + "learning_rate": 4.838733915182708e-05, + "loss": 0.0407, + "num_input_tokens_seen": 7804864, + "step": 36990 + }, + { + "epoch": 4.06985698569857, + "grad_norm": 0.2384689748287201, + "learning_rate": 4.838649099518263e-05, + "loss": 0.0363, + "num_input_tokens_seen": 7805920, + "step": 36995 + }, + { + "epoch": 4.070407040704071, + "grad_norm": 0.19528426229953766, + "learning_rate": 4.838564262299612e-05, + "loss": 0.0786, + "num_input_tokens_seen": 7806944, + "step": 37000 + }, + { + "epoch": 4.070957095709571, + "grad_norm": 0.1268257051706314, + "learning_rate": 4.838479403527535e-05, + "loss": 0.0077, + "num_input_tokens_seen": 7808000, + "step": 37005 + }, + { + "epoch": 4.071507150715071, + "grad_norm": 0.22572177648544312, + "learning_rate": 4.838394523202817e-05, + "loss": 0.0952, + "num_input_tokens_seen": 7809056, + "step": 37010 + }, + { + "epoch": 4.072057205720572, + "grad_norm": 0.18026496469974518, + "learning_rate": 4.838309621326238e-05, + "loss": 0.0092, + "num_input_tokens_seen": 7810176, + "step": 37015 + }, + { + "epoch": 4.072607260726072, + "grad_norm": 0.7035964727401733, + "learning_rate": 4.838224697898581e-05, + "loss": 0.0388, + "num_input_tokens_seen": 7811200, + "step": 37020 + }, + { + "epoch": 4.0731573157315735, + "grad_norm": 1.4988042116165161, + "learning_rate": 4.838139752920629e-05, + "loss": 0.0605, + "num_input_tokens_seen": 7812224, + "step": 37025 + }, + { + "epoch": 4.073707370737074, + "grad_norm": 0.6744740605354309, + "learning_rate": 4.838054786393166e-05, + "loss": 0.0357, + "num_input_tokens_seen": 7813312, + "step": 37030 + }, + { + "epoch": 4.074257425742574, + "grad_norm": 0.1657530963420868, + "learning_rate": 4.837969798316973e-05, + "loss": 0.0173, + "num_input_tokens_seen": 7814368, + "step": 37035 + }, + { + "epoch": 4.074807480748075, + "grad_norm": 0.2345985323190689, + "learning_rate": 4.8378847886928335e-05, + "loss": 0.0674, + "num_input_tokens_seen": 7815424, + "step": 37040 + }, + { + "epoch": 4.075357535753575, + "grad_norm": 0.059758260846138, + "learning_rate": 4.8377997575215326e-05, + "loss": 0.0614, + "num_input_tokens_seen": 7816544, + "step": 37045 + }, + { + "epoch": 4.075907590759076, + "grad_norm": 0.10950944572687149, + "learning_rate": 4.837714704803853e-05, + "loss": 0.015, + "num_input_tokens_seen": 7817536, + "step": 37050 + }, + { + "epoch": 4.0764576457645765, + "grad_norm": 0.19489999115467072, + "learning_rate": 4.837629630540579e-05, + "loss": 0.1291, + "num_input_tokens_seen": 7818560, + "step": 37055 + }, + { + "epoch": 4.077007700770077, + "grad_norm": 0.009168106131255627, + "learning_rate": 4.8375445347324946e-05, + "loss": 0.0279, + "num_input_tokens_seen": 7819712, + "step": 37060 + }, + { + "epoch": 4.077557755775578, + "grad_norm": 0.06993269920349121, + "learning_rate": 4.8374594173803835e-05, + "loss": 0.0267, + "num_input_tokens_seen": 7820800, + "step": 37065 + }, + { + "epoch": 4.078107810781078, + "grad_norm": 0.04738312587141991, + "learning_rate": 4.8373742784850296e-05, + "loss": 0.0389, + "num_input_tokens_seen": 7821856, + "step": 37070 + }, + { + "epoch": 4.078657865786579, + "grad_norm": 0.26922422647476196, + "learning_rate": 4.83728911804722e-05, + "loss": 0.0413, + "num_input_tokens_seen": 7822944, + "step": 37075 + }, + { + "epoch": 4.079207920792079, + "grad_norm": 0.48791569471359253, + "learning_rate": 4.837203936067737e-05, + "loss": 0.081, + "num_input_tokens_seen": 7824000, + "step": 37080 + }, + { + "epoch": 4.0797579757975795, + "grad_norm": 0.12367323040962219, + "learning_rate": 4.837118732547368e-05, + "loss": 0.0248, + "num_input_tokens_seen": 7825024, + "step": 37085 + }, + { + "epoch": 4.080308030803081, + "grad_norm": 0.10410330444574356, + "learning_rate": 4.837033507486897e-05, + "loss": 0.0816, + "num_input_tokens_seen": 7826048, + "step": 37090 + }, + { + "epoch": 4.080858085808581, + "grad_norm": 0.4078981876373291, + "learning_rate": 4.836948260887108e-05, + "loss": 0.0435, + "num_input_tokens_seen": 7827168, + "step": 37095 + }, + { + "epoch": 4.081408140814081, + "grad_norm": 0.7405015826225281, + "learning_rate": 4.836862992748789e-05, + "loss": 0.067, + "num_input_tokens_seen": 7828160, + "step": 37100 + }, + { + "epoch": 4.081958195819582, + "grad_norm": 0.7679916024208069, + "learning_rate": 4.8367777030727264e-05, + "loss": 0.2055, + "num_input_tokens_seen": 7829216, + "step": 37105 + }, + { + "epoch": 4.082508250825082, + "grad_norm": 0.9759681224822998, + "learning_rate": 4.836692391859704e-05, + "loss": 0.0878, + "num_input_tokens_seen": 7830272, + "step": 37110 + }, + { + "epoch": 4.083058305830583, + "grad_norm": 1.096946358680725, + "learning_rate": 4.8366070591105095e-05, + "loss": 0.0766, + "num_input_tokens_seen": 7831328, + "step": 37115 + }, + { + "epoch": 4.083608360836084, + "grad_norm": 0.7935112714767456, + "learning_rate": 4.8365217048259284e-05, + "loss": 0.0759, + "num_input_tokens_seen": 7832448, + "step": 37120 + }, + { + "epoch": 4.084158415841584, + "grad_norm": 0.025732625275850296, + "learning_rate": 4.836436329006748e-05, + "loss": 0.0682, + "num_input_tokens_seen": 7833536, + "step": 37125 + }, + { + "epoch": 4.084708470847085, + "grad_norm": 0.4036450982093811, + "learning_rate": 4.836350931653755e-05, + "loss": 0.0466, + "num_input_tokens_seen": 7834560, + "step": 37130 + }, + { + "epoch": 4.085258525852585, + "grad_norm": 0.8036026358604431, + "learning_rate": 4.836265512767737e-05, + "loss": 0.099, + "num_input_tokens_seen": 7835584, + "step": 37135 + }, + { + "epoch": 4.085808580858086, + "grad_norm": 0.08617441356182098, + "learning_rate": 4.836180072349481e-05, + "loss": 0.0113, + "num_input_tokens_seen": 7836640, + "step": 37140 + }, + { + "epoch": 4.086358635863586, + "grad_norm": 0.05016336962580681, + "learning_rate": 4.8360946103997735e-05, + "loss": 0.0994, + "num_input_tokens_seen": 7837664, + "step": 37145 + }, + { + "epoch": 4.086908690869087, + "grad_norm": 0.82428377866745, + "learning_rate": 4.836009126919403e-05, + "loss": 0.0305, + "num_input_tokens_seen": 7838720, + "step": 37150 + }, + { + "epoch": 4.087458745874588, + "grad_norm": 0.30809712409973145, + "learning_rate": 4.835923621909158e-05, + "loss": 0.0857, + "num_input_tokens_seen": 7839776, + "step": 37155 + }, + { + "epoch": 4.088008800880088, + "grad_norm": 0.16114503145217896, + "learning_rate": 4.8358380953698254e-05, + "loss": 0.0532, + "num_input_tokens_seen": 7840832, + "step": 37160 + }, + { + "epoch": 4.088558855885589, + "grad_norm": 1.1372648477554321, + "learning_rate": 4.835752547302194e-05, + "loss": 0.0811, + "num_input_tokens_seen": 7841952, + "step": 37165 + }, + { + "epoch": 4.089108910891089, + "grad_norm": 0.5364861488342285, + "learning_rate": 4.8356669777070525e-05, + "loss": 0.0157, + "num_input_tokens_seen": 7842944, + "step": 37170 + }, + { + "epoch": 4.089658965896589, + "grad_norm": 0.05156426131725311, + "learning_rate": 4.835581386585188e-05, + "loss": 0.0341, + "num_input_tokens_seen": 7844000, + "step": 37175 + }, + { + "epoch": 4.0902090209020905, + "grad_norm": 0.24312639236450195, + "learning_rate": 4.835495773937392e-05, + "loss": 0.0559, + "num_input_tokens_seen": 7845120, + "step": 37180 + }, + { + "epoch": 4.090759075907591, + "grad_norm": 0.6253484487533569, + "learning_rate": 4.835410139764452e-05, + "loss": 0.0754, + "num_input_tokens_seen": 7846208, + "step": 37185 + }, + { + "epoch": 4.091309130913091, + "grad_norm": 1.66909658908844, + "learning_rate": 4.835324484067157e-05, + "loss": 0.0814, + "num_input_tokens_seen": 7847264, + "step": 37190 + }, + { + "epoch": 4.091859185918592, + "grad_norm": 0.05412314459681511, + "learning_rate": 4.8352388068462965e-05, + "loss": 0.0126, + "num_input_tokens_seen": 7848320, + "step": 37195 + }, + { + "epoch": 4.092409240924092, + "grad_norm": 0.013571259565651417, + "learning_rate": 4.8351531081026614e-05, + "loss": 0.0913, + "num_input_tokens_seen": 7849376, + "step": 37200 + }, + { + "epoch": 4.092959295929593, + "grad_norm": 0.6597194671630859, + "learning_rate": 4.835067387837039e-05, + "loss": 0.0624, + "num_input_tokens_seen": 7850464, + "step": 37205 + }, + { + "epoch": 4.0935093509350935, + "grad_norm": 0.07554785907268524, + "learning_rate": 4.834981646050223e-05, + "loss": 0.0487, + "num_input_tokens_seen": 7851456, + "step": 37210 + }, + { + "epoch": 4.094059405940594, + "grad_norm": 0.12228640168905258, + "learning_rate": 4.834895882743e-05, + "loss": 0.0731, + "num_input_tokens_seen": 7852544, + "step": 37215 + }, + { + "epoch": 4.094609460946095, + "grad_norm": 0.13968908786773682, + "learning_rate": 4.834810097916163e-05, + "loss": 0.017, + "num_input_tokens_seen": 7853600, + "step": 37220 + }, + { + "epoch": 4.095159515951595, + "grad_norm": 1.7129539251327515, + "learning_rate": 4.834724291570501e-05, + "loss": 0.0495, + "num_input_tokens_seen": 7854656, + "step": 37225 + }, + { + "epoch": 4.095709570957096, + "grad_norm": 0.6772172451019287, + "learning_rate": 4.834638463706805e-05, + "loss": 0.065, + "num_input_tokens_seen": 7855744, + "step": 37230 + }, + { + "epoch": 4.096259625962596, + "grad_norm": 0.6105794310569763, + "learning_rate": 4.834552614325868e-05, + "loss": 0.028, + "num_input_tokens_seen": 7856768, + "step": 37235 + }, + { + "epoch": 4.0968096809680965, + "grad_norm": 0.04930697754025459, + "learning_rate": 4.83446674342848e-05, + "loss": 0.0305, + "num_input_tokens_seen": 7857824, + "step": 37240 + }, + { + "epoch": 4.097359735973598, + "grad_norm": 0.07484463602304459, + "learning_rate": 4.834380851015431e-05, + "loss": 0.0104, + "num_input_tokens_seen": 7858848, + "step": 37245 + }, + { + "epoch": 4.097909790979098, + "grad_norm": 0.15234580636024475, + "learning_rate": 4.834294937087514e-05, + "loss": 0.0374, + "num_input_tokens_seen": 7859968, + "step": 37250 + }, + { + "epoch": 4.098459845984599, + "grad_norm": 0.17634305357933044, + "learning_rate": 4.834209001645521e-05, + "loss": 0.0498, + "num_input_tokens_seen": 7861024, + "step": 37255 + }, + { + "epoch": 4.099009900990099, + "grad_norm": 0.5627508163452148, + "learning_rate": 4.834123044690243e-05, + "loss": 0.029, + "num_input_tokens_seen": 7862080, + "step": 37260 + }, + { + "epoch": 4.099559955995599, + "grad_norm": 0.049423884600400925, + "learning_rate": 4.834037066222474e-05, + "loss": 0.0455, + "num_input_tokens_seen": 7863168, + "step": 37265 + }, + { + "epoch": 4.1001100110011, + "grad_norm": 1.6320154666900635, + "learning_rate": 4.8339510662430046e-05, + "loss": 0.1879, + "num_input_tokens_seen": 7864192, + "step": 37270 + }, + { + "epoch": 4.100660066006601, + "grad_norm": 0.0942855104804039, + "learning_rate": 4.833865044752628e-05, + "loss": 0.0206, + "num_input_tokens_seen": 7865280, + "step": 37275 + }, + { + "epoch": 4.101210121012101, + "grad_norm": 0.032619960606098175, + "learning_rate": 4.833779001752138e-05, + "loss": 0.1647, + "num_input_tokens_seen": 7866304, + "step": 37280 + }, + { + "epoch": 4.101760176017602, + "grad_norm": 0.04148659482598305, + "learning_rate": 4.833692937242327e-05, + "loss": 0.0526, + "num_input_tokens_seen": 7867360, + "step": 37285 + }, + { + "epoch": 4.102310231023102, + "grad_norm": 0.15093444287776947, + "learning_rate": 4.833606851223986e-05, + "loss": 0.0059, + "num_input_tokens_seen": 7868384, + "step": 37290 + }, + { + "epoch": 4.102860286028603, + "grad_norm": 0.11818864941596985, + "learning_rate": 4.8335207436979124e-05, + "loss": 0.0994, + "num_input_tokens_seen": 7869440, + "step": 37295 + }, + { + "epoch": 4.103410341034103, + "grad_norm": 0.8787484169006348, + "learning_rate": 4.833434614664897e-05, + "loss": 0.043, + "num_input_tokens_seen": 7870432, + "step": 37300 + }, + { + "epoch": 4.103960396039604, + "grad_norm": 0.2022142857313156, + "learning_rate": 4.8333484641257355e-05, + "loss": 0.0682, + "num_input_tokens_seen": 7871488, + "step": 37305 + }, + { + "epoch": 4.104510451045105, + "grad_norm": 0.3004368841648102, + "learning_rate": 4.83326229208122e-05, + "loss": 0.0464, + "num_input_tokens_seen": 7872448, + "step": 37310 + }, + { + "epoch": 4.105060506050605, + "grad_norm": 0.09343735128641129, + "learning_rate": 4.833176098532145e-05, + "loss": 0.0097, + "num_input_tokens_seen": 7873472, + "step": 37315 + }, + { + "epoch": 4.105610561056106, + "grad_norm": 0.671778678894043, + "learning_rate": 4.833089883479306e-05, + "loss": 0.2284, + "num_input_tokens_seen": 7874528, + "step": 37320 + }, + { + "epoch": 4.106160616061606, + "grad_norm": 0.23653925955295563, + "learning_rate": 4.8330036469234974e-05, + "loss": 0.0221, + "num_input_tokens_seen": 7875552, + "step": 37325 + }, + { + "epoch": 4.106710671067106, + "grad_norm": 0.03397149220108986, + "learning_rate": 4.8329173888655134e-05, + "loss": 0.0629, + "num_input_tokens_seen": 7876608, + "step": 37330 + }, + { + "epoch": 4.1072607260726075, + "grad_norm": 0.920964241027832, + "learning_rate": 4.83283110930615e-05, + "loss": 0.1031, + "num_input_tokens_seen": 7877600, + "step": 37335 + }, + { + "epoch": 4.107810781078108, + "grad_norm": 0.05619807168841362, + "learning_rate": 4.8327448082462004e-05, + "loss": 0.0263, + "num_input_tokens_seen": 7878688, + "step": 37340 + }, + { + "epoch": 4.108360836083609, + "grad_norm": 0.14587529003620148, + "learning_rate": 4.8326584856864617e-05, + "loss": 0.0361, + "num_input_tokens_seen": 7879744, + "step": 37345 + }, + { + "epoch": 4.108910891089109, + "grad_norm": 0.8478748798370361, + "learning_rate": 4.832572141627729e-05, + "loss": 0.0391, + "num_input_tokens_seen": 7880736, + "step": 37350 + }, + { + "epoch": 4.109460946094609, + "grad_norm": 0.08983349055051804, + "learning_rate": 4.832485776070799e-05, + "loss": 0.062, + "num_input_tokens_seen": 7881760, + "step": 37355 + }, + { + "epoch": 4.11001100110011, + "grad_norm": 0.27543526887893677, + "learning_rate": 4.832399389016465e-05, + "loss": 0.1196, + "num_input_tokens_seen": 7882816, + "step": 37360 + }, + { + "epoch": 4.1105610561056105, + "grad_norm": 0.26426246762275696, + "learning_rate": 4.832312980465526e-05, + "loss": 0.0355, + "num_input_tokens_seen": 7883872, + "step": 37365 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.07434643059968948, + "learning_rate": 4.832226550418778e-05, + "loss": 0.0816, + "num_input_tokens_seen": 7884896, + "step": 37370 + }, + { + "epoch": 4.111661166116612, + "grad_norm": 1.20295250415802, + "learning_rate": 4.832140098877017e-05, + "loss": 0.1345, + "num_input_tokens_seen": 7885984, + "step": 37375 + }, + { + "epoch": 4.112211221122112, + "grad_norm": 0.5419124960899353, + "learning_rate": 4.832053625841039e-05, + "loss": 0.1007, + "num_input_tokens_seen": 7887040, + "step": 37380 + }, + { + "epoch": 4.112761276127613, + "grad_norm": 0.11984146386384964, + "learning_rate": 4.831967131311642e-05, + "loss": 0.1133, + "num_input_tokens_seen": 7888096, + "step": 37385 + }, + { + "epoch": 4.113311331133113, + "grad_norm": 0.5285068154335022, + "learning_rate": 4.8318806152896235e-05, + "loss": 0.0527, + "num_input_tokens_seen": 7889152, + "step": 37390 + }, + { + "epoch": 4.1138613861386135, + "grad_norm": 0.2559034526348114, + "learning_rate": 4.83179407777578e-05, + "loss": 0.0913, + "num_input_tokens_seen": 7890208, + "step": 37395 + }, + { + "epoch": 4.114411441144115, + "grad_norm": 0.08569471538066864, + "learning_rate": 4.831707518770909e-05, + "loss": 0.0936, + "num_input_tokens_seen": 7891232, + "step": 37400 + }, + { + "epoch": 4.114961496149615, + "grad_norm": 0.07860082387924194, + "learning_rate": 4.831620938275809e-05, + "loss": 0.0823, + "num_input_tokens_seen": 7892352, + "step": 37405 + }, + { + "epoch": 4.115511551155116, + "grad_norm": 1.0001510381698608, + "learning_rate": 4.8315343362912776e-05, + "loss": 0.2123, + "num_input_tokens_seen": 7893440, + "step": 37410 + }, + { + "epoch": 4.116061606160616, + "grad_norm": 0.44218572974205017, + "learning_rate": 4.831447712818113e-05, + "loss": 0.0418, + "num_input_tokens_seen": 7894528, + "step": 37415 + }, + { + "epoch": 4.116611661166116, + "grad_norm": 0.9604182243347168, + "learning_rate": 4.831361067857113e-05, + "loss": 0.0797, + "num_input_tokens_seen": 7895552, + "step": 37420 + }, + { + "epoch": 4.117161716171617, + "grad_norm": 0.3336714804172516, + "learning_rate": 4.831274401409077e-05, + "loss": 0.0205, + "num_input_tokens_seen": 7896512, + "step": 37425 + }, + { + "epoch": 4.117711771177118, + "grad_norm": 0.11181232333183289, + "learning_rate": 4.831187713474804e-05, + "loss": 0.0389, + "num_input_tokens_seen": 7897504, + "step": 37430 + }, + { + "epoch": 4.118261826182618, + "grad_norm": 0.318220853805542, + "learning_rate": 4.831101004055092e-05, + "loss": 0.0523, + "num_input_tokens_seen": 7898592, + "step": 37435 + }, + { + "epoch": 4.118811881188119, + "grad_norm": 0.5114614367485046, + "learning_rate": 4.8310142731507416e-05, + "loss": 0.051, + "num_input_tokens_seen": 7899680, + "step": 37440 + }, + { + "epoch": 4.119361936193619, + "grad_norm": 0.055559366941452026, + "learning_rate": 4.83092752076255e-05, + "loss": 0.0406, + "num_input_tokens_seen": 7900768, + "step": 37445 + }, + { + "epoch": 4.11991199119912, + "grad_norm": 0.14812244474887848, + "learning_rate": 4.830840746891318e-05, + "loss": 0.0408, + "num_input_tokens_seen": 7901856, + "step": 37450 + }, + { + "epoch": 4.12046204620462, + "grad_norm": 0.3064742386341095, + "learning_rate": 4.830753951537846e-05, + "loss": 0.0336, + "num_input_tokens_seen": 7903008, + "step": 37455 + }, + { + "epoch": 4.121012101210121, + "grad_norm": 0.018105484545230865, + "learning_rate": 4.830667134702933e-05, + "loss": 0.0793, + "num_input_tokens_seen": 7904096, + "step": 37460 + }, + { + "epoch": 4.121562156215622, + "grad_norm": 0.13286742568016052, + "learning_rate": 4.830580296387379e-05, + "loss": 0.1165, + "num_input_tokens_seen": 7905088, + "step": 37465 + }, + { + "epoch": 4.122112211221122, + "grad_norm": 0.2064877152442932, + "learning_rate": 4.830493436591985e-05, + "loss": 0.0793, + "num_input_tokens_seen": 7906048, + "step": 37470 + }, + { + "epoch": 4.122662266226623, + "grad_norm": 0.48576807975769043, + "learning_rate": 4.8304065553175515e-05, + "loss": 0.0489, + "num_input_tokens_seen": 7907072, + "step": 37475 + }, + { + "epoch": 4.123212321232123, + "grad_norm": 0.17731620371341705, + "learning_rate": 4.8303196525648783e-05, + "loss": 0.0231, + "num_input_tokens_seen": 7908160, + "step": 37480 + }, + { + "epoch": 4.123762376237623, + "grad_norm": 0.10163655132055283, + "learning_rate": 4.830232728334767e-05, + "loss": 0.059, + "num_input_tokens_seen": 7909152, + "step": 37485 + }, + { + "epoch": 4.1243124312431245, + "grad_norm": 0.07063333690166473, + "learning_rate": 4.83014578262802e-05, + "loss": 0.011, + "num_input_tokens_seen": 7910144, + "step": 37490 + }, + { + "epoch": 4.124862486248625, + "grad_norm": 0.49911290407180786, + "learning_rate": 4.830058815445437e-05, + "loss": 0.0275, + "num_input_tokens_seen": 7911200, + "step": 37495 + }, + { + "epoch": 4.125412541254126, + "grad_norm": 0.036285340785980225, + "learning_rate": 4.829971826787819e-05, + "loss": 0.1185, + "num_input_tokens_seen": 7912224, + "step": 37500 + }, + { + "epoch": 4.125962596259626, + "grad_norm": 0.046986158937215805, + "learning_rate": 4.8298848166559695e-05, + "loss": 0.0334, + "num_input_tokens_seen": 7913280, + "step": 37505 + }, + { + "epoch": 4.126512651265126, + "grad_norm": 0.39823681116104126, + "learning_rate": 4.8297977850506895e-05, + "loss": 0.02, + "num_input_tokens_seen": 7914368, + "step": 37510 + }, + { + "epoch": 4.127062706270627, + "grad_norm": 0.5182715654373169, + "learning_rate": 4.829710731972782e-05, + "loss": 0.0526, + "num_input_tokens_seen": 7915424, + "step": 37515 + }, + { + "epoch": 4.1276127612761275, + "grad_norm": 0.5175299644470215, + "learning_rate": 4.8296236574230474e-05, + "loss": 0.0529, + "num_input_tokens_seen": 7916512, + "step": 37520 + }, + { + "epoch": 4.128162816281628, + "grad_norm": 0.09897268563508987, + "learning_rate": 4.82953656140229e-05, + "loss": 0.1229, + "num_input_tokens_seen": 7917568, + "step": 37525 + }, + { + "epoch": 4.128712871287129, + "grad_norm": 0.7530443668365479, + "learning_rate": 4.829449443911312e-05, + "loss": 0.0279, + "num_input_tokens_seen": 7918656, + "step": 37530 + }, + { + "epoch": 4.129262926292629, + "grad_norm": 0.07183974236249924, + "learning_rate": 4.829362304950916e-05, + "loss": 0.0084, + "num_input_tokens_seen": 7919680, + "step": 37535 + }, + { + "epoch": 4.12981298129813, + "grad_norm": 0.1362861841917038, + "learning_rate": 4.8292751445219056e-05, + "loss": 0.0179, + "num_input_tokens_seen": 7920768, + "step": 37540 + }, + { + "epoch": 4.13036303630363, + "grad_norm": 0.20648068189620972, + "learning_rate": 4.8291879626250835e-05, + "loss": 0.0157, + "num_input_tokens_seen": 7921888, + "step": 37545 + }, + { + "epoch": 4.1309130913091305, + "grad_norm": 0.28406184911727905, + "learning_rate": 4.8291007592612535e-05, + "loss": 0.0594, + "num_input_tokens_seen": 7923008, + "step": 37550 + }, + { + "epoch": 4.131463146314632, + "grad_norm": 0.053002286702394485, + "learning_rate": 4.82901353443122e-05, + "loss": 0.0431, + "num_input_tokens_seen": 7924064, + "step": 37555 + }, + { + "epoch": 4.132013201320132, + "grad_norm": 0.5778297781944275, + "learning_rate": 4.828926288135787e-05, + "loss": 0.0623, + "num_input_tokens_seen": 7925184, + "step": 37560 + }, + { + "epoch": 4.132563256325633, + "grad_norm": 0.2883136570453644, + "learning_rate": 4.828839020375756e-05, + "loss": 0.0379, + "num_input_tokens_seen": 7926240, + "step": 37565 + }, + { + "epoch": 4.133113311331133, + "grad_norm": 0.496210515499115, + "learning_rate": 4.8287517311519346e-05, + "loss": 0.0278, + "num_input_tokens_seen": 7927296, + "step": 37570 + }, + { + "epoch": 4.133663366336633, + "grad_norm": 0.028089536353945732, + "learning_rate": 4.828664420465126e-05, + "loss": 0.0813, + "num_input_tokens_seen": 7928320, + "step": 37575 + }, + { + "epoch": 4.134213421342134, + "grad_norm": 1.1594082117080688, + "learning_rate": 4.828577088316134e-05, + "loss": 0.081, + "num_input_tokens_seen": 7929344, + "step": 37580 + }, + { + "epoch": 4.134763476347635, + "grad_norm": 0.08450334519147873, + "learning_rate": 4.828489734705764e-05, + "loss": 0.0587, + "num_input_tokens_seen": 7930336, + "step": 37585 + }, + { + "epoch": 4.135313531353136, + "grad_norm": 0.034955818206071854, + "learning_rate": 4.8284023596348224e-05, + "loss": 0.0494, + "num_input_tokens_seen": 7931424, + "step": 37590 + }, + { + "epoch": 4.135863586358636, + "grad_norm": 0.43816012144088745, + "learning_rate": 4.828314963104114e-05, + "loss": 0.02, + "num_input_tokens_seen": 7932448, + "step": 37595 + }, + { + "epoch": 4.136413641364136, + "grad_norm": 0.030967554077506065, + "learning_rate": 4.8282275451144434e-05, + "loss": 0.0116, + "num_input_tokens_seen": 7933440, + "step": 37600 + }, + { + "epoch": 4.136963696369637, + "grad_norm": 0.030196381732821465, + "learning_rate": 4.828140105666616e-05, + "loss": 0.0435, + "num_input_tokens_seen": 7934432, + "step": 37605 + }, + { + "epoch": 4.137513751375137, + "grad_norm": 0.03376597911119461, + "learning_rate": 4.828052644761439e-05, + "loss": 0.0342, + "num_input_tokens_seen": 7935456, + "step": 37610 + }, + { + "epoch": 4.138063806380638, + "grad_norm": 0.3966977894306183, + "learning_rate": 4.827965162399718e-05, + "loss": 0.0232, + "num_input_tokens_seen": 7936544, + "step": 37615 + }, + { + "epoch": 4.138613861386139, + "grad_norm": 0.021648526191711426, + "learning_rate": 4.827877658582258e-05, + "loss": 0.01, + "num_input_tokens_seen": 7937568, + "step": 37620 + }, + { + "epoch": 4.139163916391639, + "grad_norm": 0.08158233761787415, + "learning_rate": 4.827790133309868e-05, + "loss": 0.0221, + "num_input_tokens_seen": 7938624, + "step": 37625 + }, + { + "epoch": 4.13971397139714, + "grad_norm": 0.40919065475463867, + "learning_rate": 4.827702586583353e-05, + "loss": 0.0353, + "num_input_tokens_seen": 7939648, + "step": 37630 + }, + { + "epoch": 4.14026402640264, + "grad_norm": 0.1076890230178833, + "learning_rate": 4.8276150184035194e-05, + "loss": 0.0227, + "num_input_tokens_seen": 7940640, + "step": 37635 + }, + { + "epoch": 4.1408140814081404, + "grad_norm": 0.06174137815833092, + "learning_rate": 4.8275274287711767e-05, + "loss": 0.0456, + "num_input_tokens_seen": 7941696, + "step": 37640 + }, + { + "epoch": 4.1413641364136415, + "grad_norm": 0.0602046363055706, + "learning_rate": 4.8274398176871295e-05, + "loss": 0.0192, + "num_input_tokens_seen": 7942720, + "step": 37645 + }, + { + "epoch": 4.141914191419142, + "grad_norm": 0.3306097388267517, + "learning_rate": 4.8273521851521865e-05, + "loss": 0.1587, + "num_input_tokens_seen": 7943744, + "step": 37650 + }, + { + "epoch": 4.142464246424643, + "grad_norm": 0.03467379882931709, + "learning_rate": 4.827264531167155e-05, + "loss": 0.024, + "num_input_tokens_seen": 7944800, + "step": 37655 + }, + { + "epoch": 4.143014301430143, + "grad_norm": 1.1472609043121338, + "learning_rate": 4.827176855732843e-05, + "loss": 0.0964, + "num_input_tokens_seen": 7945856, + "step": 37660 + }, + { + "epoch": 4.143564356435643, + "grad_norm": 0.4543076753616333, + "learning_rate": 4.827089158850059e-05, + "loss": 0.0822, + "num_input_tokens_seen": 7946944, + "step": 37665 + }, + { + "epoch": 4.144114411441144, + "grad_norm": 0.27156785130500793, + "learning_rate": 4.82700144051961e-05, + "loss": 0.3062, + "num_input_tokens_seen": 7948000, + "step": 37670 + }, + { + "epoch": 4.1446644664466445, + "grad_norm": 0.37655410170555115, + "learning_rate": 4.826913700742306e-05, + "loss": 0.0761, + "num_input_tokens_seen": 7949056, + "step": 37675 + }, + { + "epoch": 4.145214521452146, + "grad_norm": 0.07658462971448898, + "learning_rate": 4.826825939518955e-05, + "loss": 0.0374, + "num_input_tokens_seen": 7950080, + "step": 37680 + }, + { + "epoch": 4.145764576457646, + "grad_norm": 0.021952692419290543, + "learning_rate": 4.826738156850366e-05, + "loss": 0.1279, + "num_input_tokens_seen": 7951136, + "step": 37685 + }, + { + "epoch": 4.146314631463146, + "grad_norm": 0.02677672542631626, + "learning_rate": 4.826650352737347e-05, + "loss": 0.0176, + "num_input_tokens_seen": 7952192, + "step": 37690 + }, + { + "epoch": 4.146864686468647, + "grad_norm": 0.03373902663588524, + "learning_rate": 4.826562527180709e-05, + "loss": 0.1109, + "num_input_tokens_seen": 7953216, + "step": 37695 + }, + { + "epoch": 4.147414741474147, + "grad_norm": 0.22784478962421417, + "learning_rate": 4.826474680181261e-05, + "loss": 0.1081, + "num_input_tokens_seen": 7954272, + "step": 37700 + }, + { + "epoch": 4.1479647964796476, + "grad_norm": 0.037908803671598434, + "learning_rate": 4.826386811739811e-05, + "loss": 0.0415, + "num_input_tokens_seen": 7955328, + "step": 37705 + }, + { + "epoch": 4.148514851485149, + "grad_norm": 0.07787960767745972, + "learning_rate": 4.826298921857171e-05, + "loss": 0.0372, + "num_input_tokens_seen": 7956352, + "step": 37710 + }, + { + "epoch": 4.149064906490649, + "grad_norm": 0.03440781310200691, + "learning_rate": 4.82621101053415e-05, + "loss": 0.0189, + "num_input_tokens_seen": 7957408, + "step": 37715 + }, + { + "epoch": 4.14961496149615, + "grad_norm": 0.5304103493690491, + "learning_rate": 4.826123077771557e-05, + "loss": 0.029, + "num_input_tokens_seen": 7958432, + "step": 37720 + }, + { + "epoch": 4.15016501650165, + "grad_norm": 0.07608331739902496, + "learning_rate": 4.826035123570205e-05, + "loss": 0.051, + "num_input_tokens_seen": 7959488, + "step": 37725 + }, + { + "epoch": 4.15071507150715, + "grad_norm": 0.10241128504276276, + "learning_rate": 4.825947147930904e-05, + "loss": 0.0857, + "num_input_tokens_seen": 7960544, + "step": 37730 + }, + { + "epoch": 4.1512651265126514, + "grad_norm": 1.1711770296096802, + "learning_rate": 4.8258591508544624e-05, + "loss": 0.0611, + "num_input_tokens_seen": 7961600, + "step": 37735 + }, + { + "epoch": 4.151815181518152, + "grad_norm": 1.7548480033874512, + "learning_rate": 4.8257711323416946e-05, + "loss": 0.1069, + "num_input_tokens_seen": 7962656, + "step": 37740 + }, + { + "epoch": 4.152365236523653, + "grad_norm": 0.02374282293021679, + "learning_rate": 4.8256830923934104e-05, + "loss": 0.0161, + "num_input_tokens_seen": 7963712, + "step": 37745 + }, + { + "epoch": 4.152915291529153, + "grad_norm": 0.2399551123380661, + "learning_rate": 4.82559503101042e-05, + "loss": 0.0279, + "num_input_tokens_seen": 7964864, + "step": 37750 + }, + { + "epoch": 4.153465346534653, + "grad_norm": 0.07271473854780197, + "learning_rate": 4.825506948193537e-05, + "loss": 0.0271, + "num_input_tokens_seen": 7965952, + "step": 37755 + }, + { + "epoch": 4.154015401540154, + "grad_norm": 0.05435746908187866, + "learning_rate": 4.825418843943572e-05, + "loss": 0.0601, + "num_input_tokens_seen": 7966976, + "step": 37760 + }, + { + "epoch": 4.1545654565456545, + "grad_norm": 0.041496772319078445, + "learning_rate": 4.825330718261337e-05, + "loss": 0.0275, + "num_input_tokens_seen": 7968032, + "step": 37765 + }, + { + "epoch": 4.1551155115511555, + "grad_norm": 0.02727080136537552, + "learning_rate": 4.825242571147645e-05, + "loss": 0.0761, + "num_input_tokens_seen": 7969088, + "step": 37770 + }, + { + "epoch": 4.155665566556656, + "grad_norm": 1.1945645809173584, + "learning_rate": 4.825154402603308e-05, + "loss": 0.1157, + "num_input_tokens_seen": 7970048, + "step": 37775 + }, + { + "epoch": 4.156215621562156, + "grad_norm": 0.04364858940243721, + "learning_rate": 4.825066212629139e-05, + "loss": 0.0118, + "num_input_tokens_seen": 7971168, + "step": 37780 + }, + { + "epoch": 4.156765676567657, + "grad_norm": 0.5841900110244751, + "learning_rate": 4.824978001225949e-05, + "loss": 0.0255, + "num_input_tokens_seen": 7972224, + "step": 37785 + }, + { + "epoch": 4.157315731573157, + "grad_norm": 0.023051362484693527, + "learning_rate": 4.824889768394554e-05, + "loss": 0.0698, + "num_input_tokens_seen": 7973312, + "step": 37790 + }, + { + "epoch": 4.1578657865786575, + "grad_norm": 0.4786112904548645, + "learning_rate": 4.824801514135765e-05, + "loss": 0.1034, + "num_input_tokens_seen": 7974368, + "step": 37795 + }, + { + "epoch": 4.158415841584159, + "grad_norm": 0.06934206187725067, + "learning_rate": 4.8247132384503954e-05, + "loss": 0.0148, + "num_input_tokens_seen": 7975424, + "step": 37800 + }, + { + "epoch": 4.158965896589659, + "grad_norm": 0.018985815346240997, + "learning_rate": 4.82462494133926e-05, + "loss": 0.007, + "num_input_tokens_seen": 7976448, + "step": 37805 + }, + { + "epoch": 4.15951595159516, + "grad_norm": 0.026721959933638573, + "learning_rate": 4.8245366228031716e-05, + "loss": 0.1717, + "num_input_tokens_seen": 7977440, + "step": 37810 + }, + { + "epoch": 4.16006600660066, + "grad_norm": 1.4731312990188599, + "learning_rate": 4.824448282842945e-05, + "loss": 0.1174, + "num_input_tokens_seen": 7978464, + "step": 37815 + }, + { + "epoch": 4.16061606160616, + "grad_norm": 0.03198901563882828, + "learning_rate": 4.824359921459394e-05, + "loss": 0.1021, + "num_input_tokens_seen": 7979488, + "step": 37820 + }, + { + "epoch": 4.161166116611661, + "grad_norm": 0.09008404612541199, + "learning_rate": 4.824271538653333e-05, + "loss": 0.0089, + "num_input_tokens_seen": 7980544, + "step": 37825 + }, + { + "epoch": 4.161716171617162, + "grad_norm": 0.2371780127286911, + "learning_rate": 4.824183134425576e-05, + "loss": 0.0153, + "num_input_tokens_seen": 7981600, + "step": 37830 + }, + { + "epoch": 4.162266226622663, + "grad_norm": 0.5885151624679565, + "learning_rate": 4.824094708776938e-05, + "loss": 0.1074, + "num_input_tokens_seen": 7982656, + "step": 37835 + }, + { + "epoch": 4.162816281628163, + "grad_norm": 0.7028750777244568, + "learning_rate": 4.8240062617082356e-05, + "loss": 0.0834, + "num_input_tokens_seen": 7983744, + "step": 37840 + }, + { + "epoch": 4.163366336633663, + "grad_norm": 0.14046452939510345, + "learning_rate": 4.8239177932202816e-05, + "loss": 0.0266, + "num_input_tokens_seen": 7984864, + "step": 37845 + }, + { + "epoch": 4.163916391639164, + "grad_norm": 1.185551404953003, + "learning_rate": 4.823829303313893e-05, + "loss": 0.1044, + "num_input_tokens_seen": 7985952, + "step": 37850 + }, + { + "epoch": 4.164466446644664, + "grad_norm": 1.3557273149490356, + "learning_rate": 4.823740791989885e-05, + "loss": 0.0576, + "num_input_tokens_seen": 7987040, + "step": 37855 + }, + { + "epoch": 4.165016501650165, + "grad_norm": 0.10578174144029617, + "learning_rate": 4.8236522592490715e-05, + "loss": 0.0218, + "num_input_tokens_seen": 7988032, + "step": 37860 + }, + { + "epoch": 4.165566556655666, + "grad_norm": 0.6165164113044739, + "learning_rate": 4.8235637050922724e-05, + "loss": 0.0702, + "num_input_tokens_seen": 7989120, + "step": 37865 + }, + { + "epoch": 4.166116611661166, + "grad_norm": 0.08642975986003876, + "learning_rate": 4.8234751295203e-05, + "loss": 0.043, + "num_input_tokens_seen": 7990176, + "step": 37870 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.025842396542429924, + "learning_rate": 4.823386532533973e-05, + "loss": 0.031, + "num_input_tokens_seen": 7991200, + "step": 37875 + }, + { + "epoch": 4.167216721672167, + "grad_norm": 0.8720211982727051, + "learning_rate": 4.823297914134106e-05, + "loss": 0.0679, + "num_input_tokens_seen": 7992288, + "step": 37880 + }, + { + "epoch": 4.167766776677667, + "grad_norm": 0.5768584609031677, + "learning_rate": 4.823209274321518e-05, + "loss": 0.0249, + "num_input_tokens_seen": 7993312, + "step": 37885 + }, + { + "epoch": 4.1683168316831685, + "grad_norm": 0.18492121994495392, + "learning_rate": 4.823120613097025e-05, + "loss": 0.0191, + "num_input_tokens_seen": 7994368, + "step": 37890 + }, + { + "epoch": 4.168866886688669, + "grad_norm": 0.010795462876558304, + "learning_rate": 4.8230319304614444e-05, + "loss": 0.0363, + "num_input_tokens_seen": 7995424, + "step": 37895 + }, + { + "epoch": 4.16941694169417, + "grad_norm": 0.1909824162721634, + "learning_rate": 4.822943226415592e-05, + "loss": 0.0664, + "num_input_tokens_seen": 7996480, + "step": 37900 + }, + { + "epoch": 4.16996699669967, + "grad_norm": 0.011299395002424717, + "learning_rate": 4.822854500960288e-05, + "loss": 0.0179, + "num_input_tokens_seen": 7997568, + "step": 37905 + }, + { + "epoch": 4.17051705170517, + "grad_norm": 0.042159564793109894, + "learning_rate": 4.8227657540963475e-05, + "loss": 0.1062, + "num_input_tokens_seen": 7998656, + "step": 37910 + }, + { + "epoch": 4.171067106710671, + "grad_norm": 0.0747765451669693, + "learning_rate": 4.8226769858245894e-05, + "loss": 0.0332, + "num_input_tokens_seen": 7999776, + "step": 37915 + }, + { + "epoch": 4.1716171617161715, + "grad_norm": 2.820784568786621, + "learning_rate": 4.822588196145833e-05, + "loss": 0.1712, + "num_input_tokens_seen": 8000832, + "step": 37920 + }, + { + "epoch": 4.172167216721673, + "grad_norm": 0.016140658408403397, + "learning_rate": 4.822499385060895e-05, + "loss": 0.0063, + "num_input_tokens_seen": 8001824, + "step": 37925 + }, + { + "epoch": 4.172717271727173, + "grad_norm": 0.707894504070282, + "learning_rate": 4.8224105525705955e-05, + "loss": 0.0252, + "num_input_tokens_seen": 8002848, + "step": 37930 + }, + { + "epoch": 4.173267326732673, + "grad_norm": 0.461046040058136, + "learning_rate": 4.8223216986757517e-05, + "loss": 0.0778, + "num_input_tokens_seen": 8003936, + "step": 37935 + }, + { + "epoch": 4.173817381738174, + "grad_norm": 0.013180720619857311, + "learning_rate": 4.822232823377183e-05, + "loss": 0.0056, + "num_input_tokens_seen": 8004960, + "step": 37940 + }, + { + "epoch": 4.174367436743674, + "grad_norm": 0.4643358886241913, + "learning_rate": 4.822143926675709e-05, + "loss": 0.035, + "num_input_tokens_seen": 8005952, + "step": 37945 + }, + { + "epoch": 4.174917491749175, + "grad_norm": 0.37797096371650696, + "learning_rate": 4.822055008572148e-05, + "loss": 0.1785, + "num_input_tokens_seen": 8007072, + "step": 37950 + }, + { + "epoch": 4.175467546754676, + "grad_norm": 0.08381318300962448, + "learning_rate": 4.8219660690673205e-05, + "loss": 0.0241, + "num_input_tokens_seen": 8008096, + "step": 37955 + }, + { + "epoch": 4.176017601760176, + "grad_norm": 1.240336298942566, + "learning_rate": 4.821877108162047e-05, + "loss": 0.0886, + "num_input_tokens_seen": 8009216, + "step": 37960 + }, + { + "epoch": 4.176567656765677, + "grad_norm": 0.4945850670337677, + "learning_rate": 4.821788125857145e-05, + "loss": 0.0516, + "num_input_tokens_seen": 8010208, + "step": 37965 + }, + { + "epoch": 4.177117711771177, + "grad_norm": 0.43340927362442017, + "learning_rate": 4.821699122153437e-05, + "loss": 0.0201, + "num_input_tokens_seen": 8011264, + "step": 37970 + }, + { + "epoch": 4.177667766776677, + "grad_norm": 0.028849132359027863, + "learning_rate": 4.821610097051742e-05, + "loss": 0.0282, + "num_input_tokens_seen": 8012384, + "step": 37975 + }, + { + "epoch": 4.178217821782178, + "grad_norm": 0.08287008106708527, + "learning_rate": 4.82152105055288e-05, + "loss": 0.1215, + "num_input_tokens_seen": 8013472, + "step": 37980 + }, + { + "epoch": 4.178767876787679, + "grad_norm": 0.35406872630119324, + "learning_rate": 4.821431982657673e-05, + "loss": 0.1032, + "num_input_tokens_seen": 8014496, + "step": 37985 + }, + { + "epoch": 4.17931793179318, + "grad_norm": 0.42492005228996277, + "learning_rate": 4.821342893366941e-05, + "loss": 0.0265, + "num_input_tokens_seen": 8015616, + "step": 37990 + }, + { + "epoch": 4.17986798679868, + "grad_norm": 0.059900350868701935, + "learning_rate": 4.8212537826815064e-05, + "loss": 0.0339, + "num_input_tokens_seen": 8016736, + "step": 37995 + }, + { + "epoch": 4.18041804180418, + "grad_norm": 0.08083714544773102, + "learning_rate": 4.8211646506021884e-05, + "loss": 0.0128, + "num_input_tokens_seen": 8017728, + "step": 38000 + }, + { + "epoch": 4.180968096809681, + "grad_norm": 0.033588383346796036, + "learning_rate": 4.82107549712981e-05, + "loss": 0.0361, + "num_input_tokens_seen": 8018784, + "step": 38005 + }, + { + "epoch": 4.181518151815181, + "grad_norm": 0.5687190890312195, + "learning_rate": 4.820986322265193e-05, + "loss": 0.0555, + "num_input_tokens_seen": 8019840, + "step": 38010 + }, + { + "epoch": 4.1820682068206825, + "grad_norm": 0.3371807336807251, + "learning_rate": 4.820897126009158e-05, + "loss": 0.0178, + "num_input_tokens_seen": 8020896, + "step": 38015 + }, + { + "epoch": 4.182618261826183, + "grad_norm": 0.8495520353317261, + "learning_rate": 4.8208079083625285e-05, + "loss": 0.1161, + "num_input_tokens_seen": 8021952, + "step": 38020 + }, + { + "epoch": 4.183168316831683, + "grad_norm": 0.6301699876785278, + "learning_rate": 4.820718669326126e-05, + "loss": 0.0378, + "num_input_tokens_seen": 8023008, + "step": 38025 + }, + { + "epoch": 4.183718371837184, + "grad_norm": 0.09719076007604599, + "learning_rate": 4.820629408900773e-05, + "loss": 0.0711, + "num_input_tokens_seen": 8024032, + "step": 38030 + }, + { + "epoch": 4.184268426842684, + "grad_norm": 0.4023229777812958, + "learning_rate": 4.820540127087292e-05, + "loss": 0.1315, + "num_input_tokens_seen": 8025056, + "step": 38035 + }, + { + "epoch": 4.184818481848184, + "grad_norm": 0.11030435562133789, + "learning_rate": 4.8204508238865064e-05, + "loss": 0.0116, + "num_input_tokens_seen": 8026080, + "step": 38040 + }, + { + "epoch": 4.1853685368536855, + "grad_norm": 0.7871020436286926, + "learning_rate": 4.8203614992992396e-05, + "loss": 0.0397, + "num_input_tokens_seen": 8027136, + "step": 38045 + }, + { + "epoch": 4.185918591859186, + "grad_norm": 0.011358500458300114, + "learning_rate": 4.8202721533263134e-05, + "loss": 0.2281, + "num_input_tokens_seen": 8028160, + "step": 38050 + }, + { + "epoch": 4.186468646864687, + "grad_norm": 0.0507768951356411, + "learning_rate": 4.820182785968552e-05, + "loss": 0.0252, + "num_input_tokens_seen": 8029216, + "step": 38055 + }, + { + "epoch": 4.187018701870187, + "grad_norm": 0.005090795923024416, + "learning_rate": 4.8200933972267803e-05, + "loss": 0.0463, + "num_input_tokens_seen": 8030208, + "step": 38060 + }, + { + "epoch": 4.187568756875687, + "grad_norm": 0.43355026841163635, + "learning_rate": 4.8200039871018196e-05, + "loss": 0.0826, + "num_input_tokens_seen": 8031232, + "step": 38065 + }, + { + "epoch": 4.188118811881188, + "grad_norm": 0.08076830953359604, + "learning_rate": 4.8199145555944966e-05, + "loss": 0.0292, + "num_input_tokens_seen": 8032256, + "step": 38070 + }, + { + "epoch": 4.1886688668866885, + "grad_norm": 0.0718473568558693, + "learning_rate": 4.819825102705634e-05, + "loss": 0.01, + "num_input_tokens_seen": 8033344, + "step": 38075 + }, + { + "epoch": 4.18921892189219, + "grad_norm": 0.04279331490397453, + "learning_rate": 4.819735628436057e-05, + "loss": 0.0294, + "num_input_tokens_seen": 8034400, + "step": 38080 + }, + { + "epoch": 4.18976897689769, + "grad_norm": 0.043672192841768265, + "learning_rate": 4.819646132786589e-05, + "loss": 0.0548, + "num_input_tokens_seen": 8035456, + "step": 38085 + }, + { + "epoch": 4.19031903190319, + "grad_norm": 0.02342938259243965, + "learning_rate": 4.819556615758056e-05, + "loss": 0.0456, + "num_input_tokens_seen": 8036480, + "step": 38090 + }, + { + "epoch": 4.190869086908691, + "grad_norm": 0.8687965869903564, + "learning_rate": 4.819467077351283e-05, + "loss": 0.0489, + "num_input_tokens_seen": 8037600, + "step": 38095 + }, + { + "epoch": 4.191419141914191, + "grad_norm": 0.14055152237415314, + "learning_rate": 4.8193775175670944e-05, + "loss": 0.1198, + "num_input_tokens_seen": 8038624, + "step": 38100 + }, + { + "epoch": 4.191969196919692, + "grad_norm": 0.7816714644432068, + "learning_rate": 4.819287936406317e-05, + "loss": 0.0479, + "num_input_tokens_seen": 8039712, + "step": 38105 + }, + { + "epoch": 4.192519251925193, + "grad_norm": 0.08719465136528015, + "learning_rate": 4.819198333869774e-05, + "loss": 0.0475, + "num_input_tokens_seen": 8040768, + "step": 38110 + }, + { + "epoch": 4.193069306930693, + "grad_norm": 0.5010629892349243, + "learning_rate": 4.8191087099582944e-05, + "loss": 0.0368, + "num_input_tokens_seen": 8041792, + "step": 38115 + }, + { + "epoch": 4.193619361936194, + "grad_norm": 0.017068179324269295, + "learning_rate": 4.819019064672703e-05, + "loss": 0.01, + "num_input_tokens_seen": 8042944, + "step": 38120 + }, + { + "epoch": 4.194169416941694, + "grad_norm": 0.023128362372517586, + "learning_rate": 4.8189293980138236e-05, + "loss": 0.0148, + "num_input_tokens_seen": 8044000, + "step": 38125 + }, + { + "epoch": 4.194719471947194, + "grad_norm": 0.23167365789413452, + "learning_rate": 4.8188397099824865e-05, + "loss": 0.0431, + "num_input_tokens_seen": 8045088, + "step": 38130 + }, + { + "epoch": 4.195269526952695, + "grad_norm": 0.026039328426122665, + "learning_rate": 4.8187500005795154e-05, + "loss": 0.0184, + "num_input_tokens_seen": 8046144, + "step": 38135 + }, + { + "epoch": 4.195819581958196, + "grad_norm": 1.6912388801574707, + "learning_rate": 4.8186602698057385e-05, + "loss": 0.0833, + "num_input_tokens_seen": 8047168, + "step": 38140 + }, + { + "epoch": 4.196369636963697, + "grad_norm": 0.014451942406594753, + "learning_rate": 4.8185705176619836e-05, + "loss": 0.1183, + "num_input_tokens_seen": 8048288, + "step": 38145 + }, + { + "epoch": 4.196919691969197, + "grad_norm": 0.07145972549915314, + "learning_rate": 4.8184807441490756e-05, + "loss": 0.064, + "num_input_tokens_seen": 8049344, + "step": 38150 + }, + { + "epoch": 4.197469746974697, + "grad_norm": 0.03739600628614426, + "learning_rate": 4.8183909492678424e-05, + "loss": 0.0819, + "num_input_tokens_seen": 8050336, + "step": 38155 + }, + { + "epoch": 4.198019801980198, + "grad_norm": 1.5948432683944702, + "learning_rate": 4.8183011330191135e-05, + "loss": 0.0967, + "num_input_tokens_seen": 8051392, + "step": 38160 + }, + { + "epoch": 4.198569856985698, + "grad_norm": 0.04910269379615784, + "learning_rate": 4.818211295403715e-05, + "loss": 0.0694, + "num_input_tokens_seen": 8052480, + "step": 38165 + }, + { + "epoch": 4.1991199119911995, + "grad_norm": 0.5865067839622498, + "learning_rate": 4.818121436422476e-05, + "loss": 0.0567, + "num_input_tokens_seen": 8053536, + "step": 38170 + }, + { + "epoch": 4.1996699669967, + "grad_norm": 0.2792135179042816, + "learning_rate": 4.8180315560762236e-05, + "loss": 0.0396, + "num_input_tokens_seen": 8054624, + "step": 38175 + }, + { + "epoch": 4.2002200220022, + "grad_norm": 0.07158742845058441, + "learning_rate": 4.8179416543657864e-05, + "loss": 0.0965, + "num_input_tokens_seen": 8055680, + "step": 38180 + }, + { + "epoch": 4.200770077007701, + "grad_norm": 0.013680965639650822, + "learning_rate": 4.817851731291994e-05, + "loss": 0.0266, + "num_input_tokens_seen": 8056736, + "step": 38185 + }, + { + "epoch": 4.201320132013201, + "grad_norm": 0.06474779546260834, + "learning_rate": 4.8177617868556736e-05, + "loss": 0.0716, + "num_input_tokens_seen": 8057792, + "step": 38190 + }, + { + "epoch": 4.201870187018702, + "grad_norm": 0.01562894508242607, + "learning_rate": 4.817671821057655e-05, + "loss": 0.0812, + "num_input_tokens_seen": 8058848, + "step": 38195 + }, + { + "epoch": 4.2024202420242025, + "grad_norm": 0.13688324391841888, + "learning_rate": 4.817581833898768e-05, + "loss": 0.052, + "num_input_tokens_seen": 8059840, + "step": 38200 + }, + { + "epoch": 4.202970297029703, + "grad_norm": 0.43392762541770935, + "learning_rate": 4.817491825379841e-05, + "loss": 0.0256, + "num_input_tokens_seen": 8060896, + "step": 38205 + }, + { + "epoch": 4.203520352035204, + "grad_norm": 0.1246824786067009, + "learning_rate": 4.8174017955017034e-05, + "loss": 0.0129, + "num_input_tokens_seen": 8061920, + "step": 38210 + }, + { + "epoch": 4.204070407040704, + "grad_norm": 0.1627397984266281, + "learning_rate": 4.8173117442651864e-05, + "loss": 0.0751, + "num_input_tokens_seen": 8062944, + "step": 38215 + }, + { + "epoch": 4.204620462046204, + "grad_norm": 0.2142915278673172, + "learning_rate": 4.8172216716711185e-05, + "loss": 0.1302, + "num_input_tokens_seen": 8064000, + "step": 38220 + }, + { + "epoch": 4.205170517051705, + "grad_norm": 1.2909841537475586, + "learning_rate": 4.81713157772033e-05, + "loss": 0.1105, + "num_input_tokens_seen": 8065088, + "step": 38225 + }, + { + "epoch": 4.2057205720572055, + "grad_norm": 0.1681205779314041, + "learning_rate": 4.817041462413652e-05, + "loss": 0.0593, + "num_input_tokens_seen": 8066112, + "step": 38230 + }, + { + "epoch": 4.206270627062707, + "grad_norm": 0.9889879822731018, + "learning_rate": 4.816951325751915e-05, + "loss": 0.0447, + "num_input_tokens_seen": 8067168, + "step": 38235 + }, + { + "epoch": 4.206820682068207, + "grad_norm": 0.0408574640750885, + "learning_rate": 4.816861167735948e-05, + "loss": 0.0294, + "num_input_tokens_seen": 8068256, + "step": 38240 + }, + { + "epoch": 4.207370737073707, + "grad_norm": 0.540460467338562, + "learning_rate": 4.816770988366584e-05, + "loss": 0.1964, + "num_input_tokens_seen": 8069312, + "step": 38245 + }, + { + "epoch": 4.207920792079208, + "grad_norm": 0.21767626702785492, + "learning_rate": 4.8166807876446546e-05, + "loss": 0.0142, + "num_input_tokens_seen": 8070368, + "step": 38250 + }, + { + "epoch": 4.208470847084708, + "grad_norm": 0.16647392511367798, + "learning_rate": 4.8165905655709895e-05, + "loss": 0.0211, + "num_input_tokens_seen": 8071424, + "step": 38255 + }, + { + "epoch": 4.209020902090209, + "grad_norm": 0.3046453297138214, + "learning_rate": 4.81650032214642e-05, + "loss": 0.029, + "num_input_tokens_seen": 8072480, + "step": 38260 + }, + { + "epoch": 4.20957095709571, + "grad_norm": 1.1445002555847168, + "learning_rate": 4.81641005737178e-05, + "loss": 0.1123, + "num_input_tokens_seen": 8073536, + "step": 38265 + }, + { + "epoch": 4.21012101210121, + "grad_norm": 1.0054831504821777, + "learning_rate": 4.816319771247899e-05, + "loss": 0.0668, + "num_input_tokens_seen": 8074560, + "step": 38270 + }, + { + "epoch": 4.210671067106711, + "grad_norm": 0.05871080607175827, + "learning_rate": 4.81622946377561e-05, + "loss": 0.0232, + "num_input_tokens_seen": 8075616, + "step": 38275 + }, + { + "epoch": 4.211221122112211, + "grad_norm": 0.20259720087051392, + "learning_rate": 4.816139134955746e-05, + "loss": 0.0367, + "num_input_tokens_seen": 8076704, + "step": 38280 + }, + { + "epoch": 4.211771177117711, + "grad_norm": 0.19807854294776917, + "learning_rate": 4.816048784789139e-05, + "loss": 0.032, + "num_input_tokens_seen": 8077792, + "step": 38285 + }, + { + "epoch": 4.212321232123212, + "grad_norm": 0.49328893423080444, + "learning_rate": 4.815958413276621e-05, + "loss": 0.0273, + "num_input_tokens_seen": 8078848, + "step": 38290 + }, + { + "epoch": 4.212871287128713, + "grad_norm": 1.0890930891036987, + "learning_rate": 4.8158680204190264e-05, + "loss": 0.0968, + "num_input_tokens_seen": 8079904, + "step": 38295 + }, + { + "epoch": 4.213421342134214, + "grad_norm": 0.060321301221847534, + "learning_rate": 4.8157776062171875e-05, + "loss": 0.0264, + "num_input_tokens_seen": 8080960, + "step": 38300 + }, + { + "epoch": 4.213971397139714, + "grad_norm": 1.0548774003982544, + "learning_rate": 4.815687170671937e-05, + "loss": 0.0379, + "num_input_tokens_seen": 8081984, + "step": 38305 + }, + { + "epoch": 4.214521452145214, + "grad_norm": 2.0963757038116455, + "learning_rate": 4.8155967137841094e-05, + "loss": 0.0645, + "num_input_tokens_seen": 8083072, + "step": 38310 + }, + { + "epoch": 4.215071507150715, + "grad_norm": 0.08445242047309875, + "learning_rate": 4.815506235554538e-05, + "loss": 0.1061, + "num_input_tokens_seen": 8084160, + "step": 38315 + }, + { + "epoch": 4.215621562156215, + "grad_norm": 0.505066454410553, + "learning_rate": 4.815415735984057e-05, + "loss": 0.1685, + "num_input_tokens_seen": 8085152, + "step": 38320 + }, + { + "epoch": 4.2161716171617165, + "grad_norm": 0.017369400709867477, + "learning_rate": 4.8153252150734996e-05, + "loss": 0.0714, + "num_input_tokens_seen": 8086176, + "step": 38325 + }, + { + "epoch": 4.216721672167217, + "grad_norm": 0.12298444658517838, + "learning_rate": 4.8152346728237006e-05, + "loss": 0.0244, + "num_input_tokens_seen": 8087264, + "step": 38330 + }, + { + "epoch": 4.217271727172717, + "grad_norm": 0.11607485264539719, + "learning_rate": 4.815144109235496e-05, + "loss": 0.0228, + "num_input_tokens_seen": 8088352, + "step": 38335 + }, + { + "epoch": 4.217821782178218, + "grad_norm": 0.24704883992671967, + "learning_rate": 4.815053524309717e-05, + "loss": 0.0342, + "num_input_tokens_seen": 8089408, + "step": 38340 + }, + { + "epoch": 4.218371837183718, + "grad_norm": 0.40831610560417175, + "learning_rate": 4.814962918047202e-05, + "loss": 0.0225, + "num_input_tokens_seen": 8090496, + "step": 38345 + }, + { + "epoch": 4.218921892189219, + "grad_norm": 0.058423642069101334, + "learning_rate": 4.8148722904487845e-05, + "loss": 0.0311, + "num_input_tokens_seen": 8091552, + "step": 38350 + }, + { + "epoch": 4.2194719471947195, + "grad_norm": 0.1404985636472702, + "learning_rate": 4.814781641515299e-05, + "loss": 0.0163, + "num_input_tokens_seen": 8092576, + "step": 38355 + }, + { + "epoch": 4.22002200220022, + "grad_norm": 0.05248226225376129, + "learning_rate": 4.814690971247583e-05, + "loss": 0.0411, + "num_input_tokens_seen": 8093600, + "step": 38360 + }, + { + "epoch": 4.220572057205721, + "grad_norm": 0.21817569434642792, + "learning_rate": 4.81460027964647e-05, + "loss": 0.019, + "num_input_tokens_seen": 8094656, + "step": 38365 + }, + { + "epoch": 4.221122112211221, + "grad_norm": 0.3358178734779358, + "learning_rate": 4.814509566712797e-05, + "loss": 0.0142, + "num_input_tokens_seen": 8095712, + "step": 38370 + }, + { + "epoch": 4.221672167216722, + "grad_norm": 0.015520695596933365, + "learning_rate": 4.8144188324474e-05, + "loss": 0.0186, + "num_input_tokens_seen": 8096768, + "step": 38375 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 1.859175205230713, + "learning_rate": 4.814328076851116e-05, + "loss": 0.0305, + "num_input_tokens_seen": 8097856, + "step": 38380 + }, + { + "epoch": 4.2227722772277225, + "grad_norm": 1.666937232017517, + "learning_rate": 4.8142372999247795e-05, + "loss": 0.163, + "num_input_tokens_seen": 8098912, + "step": 38385 + }, + { + "epoch": 4.223322332233224, + "grad_norm": 0.14891043305397034, + "learning_rate": 4.814146501669229e-05, + "loss": 0.1097, + "num_input_tokens_seen": 8100000, + "step": 38390 + }, + { + "epoch": 4.223872387238724, + "grad_norm": 0.04444881156086922, + "learning_rate": 4.814055682085301e-05, + "loss": 0.1376, + "num_input_tokens_seen": 8101088, + "step": 38395 + }, + { + "epoch": 4.224422442244224, + "grad_norm": 0.19552141427993774, + "learning_rate": 4.813964841173831e-05, + "loss": 0.0992, + "num_input_tokens_seen": 8102112, + "step": 38400 + }, + { + "epoch": 4.224972497249725, + "grad_norm": 0.5116785168647766, + "learning_rate": 4.813873978935659e-05, + "loss": 0.0381, + "num_input_tokens_seen": 8103232, + "step": 38405 + }, + { + "epoch": 4.225522552255225, + "grad_norm": 0.016638806089758873, + "learning_rate": 4.81378309537162e-05, + "loss": 0.0304, + "num_input_tokens_seen": 8104288, + "step": 38410 + }, + { + "epoch": 4.226072607260726, + "grad_norm": 0.7670665979385376, + "learning_rate": 4.813692190482552e-05, + "loss": 0.0628, + "num_input_tokens_seen": 8105376, + "step": 38415 + }, + { + "epoch": 4.226622662266227, + "grad_norm": 0.16178785264492035, + "learning_rate": 4.813601264269294e-05, + "loss": 0.0686, + "num_input_tokens_seen": 8106432, + "step": 38420 + }, + { + "epoch": 4.227172717271727, + "grad_norm": 0.6703600287437439, + "learning_rate": 4.813510316732682e-05, + "loss": 0.0383, + "num_input_tokens_seen": 8107424, + "step": 38425 + }, + { + "epoch": 4.227722772277228, + "grad_norm": 0.24102771282196045, + "learning_rate": 4.813419347873557e-05, + "loss": 0.0641, + "num_input_tokens_seen": 8108480, + "step": 38430 + }, + { + "epoch": 4.228272827282728, + "grad_norm": 1.215640664100647, + "learning_rate": 4.813328357692756e-05, + "loss": 0.0465, + "num_input_tokens_seen": 8109536, + "step": 38435 + }, + { + "epoch": 4.228822882288229, + "grad_norm": 0.13066935539245605, + "learning_rate": 4.813237346191117e-05, + "loss": 0.0114, + "num_input_tokens_seen": 8110560, + "step": 38440 + }, + { + "epoch": 4.229372937293729, + "grad_norm": 0.32540029287338257, + "learning_rate": 4.813146313369479e-05, + "loss": 0.0903, + "num_input_tokens_seen": 8111584, + "step": 38445 + }, + { + "epoch": 4.22992299229923, + "grad_norm": 0.9758995771408081, + "learning_rate": 4.813055259228682e-05, + "loss": 0.1077, + "num_input_tokens_seen": 8112640, + "step": 38450 + }, + { + "epoch": 4.230473047304731, + "grad_norm": 0.01436921488493681, + "learning_rate": 4.812964183769564e-05, + "loss": 0.1089, + "num_input_tokens_seen": 8113696, + "step": 38455 + }, + { + "epoch": 4.231023102310231, + "grad_norm": 0.18115003407001495, + "learning_rate": 4.812873086992966e-05, + "loss": 0.0182, + "num_input_tokens_seen": 8114784, + "step": 38460 + }, + { + "epoch": 4.231573157315731, + "grad_norm": 0.24664562940597534, + "learning_rate": 4.8127819688997256e-05, + "loss": 0.1222, + "num_input_tokens_seen": 8115840, + "step": 38465 + }, + { + "epoch": 4.232123212321232, + "grad_norm": 3.1692757606506348, + "learning_rate": 4.8126908294906843e-05, + "loss": 0.1046, + "num_input_tokens_seen": 8116864, + "step": 38470 + }, + { + "epoch": 4.232673267326732, + "grad_norm": 0.11597634851932526, + "learning_rate": 4.812599668766681e-05, + "loss": 0.109, + "num_input_tokens_seen": 8117856, + "step": 38475 + }, + { + "epoch": 4.2332233223322335, + "grad_norm": 0.3432866930961609, + "learning_rate": 4.812508486728557e-05, + "loss": 0.0368, + "num_input_tokens_seen": 8118880, + "step": 38480 + }, + { + "epoch": 4.233773377337734, + "grad_norm": 0.13399124145507812, + "learning_rate": 4.81241728337715e-05, + "loss": 0.0534, + "num_input_tokens_seen": 8119904, + "step": 38485 + }, + { + "epoch": 4.234323432343234, + "grad_norm": 0.24007533490657806, + "learning_rate": 4.812326058713304e-05, + "loss": 0.0154, + "num_input_tokens_seen": 8121024, + "step": 38490 + }, + { + "epoch": 4.234873487348735, + "grad_norm": 0.2412269413471222, + "learning_rate": 4.812234812737858e-05, + "loss": 0.0391, + "num_input_tokens_seen": 8122048, + "step": 38495 + }, + { + "epoch": 4.235423542354235, + "grad_norm": 0.058825649321079254, + "learning_rate": 4.8121435454516537e-05, + "loss": 0.0757, + "num_input_tokens_seen": 8123040, + "step": 38500 + }, + { + "epoch": 4.235973597359736, + "grad_norm": 1.3749849796295166, + "learning_rate": 4.812052256855531e-05, + "loss": 0.0774, + "num_input_tokens_seen": 8124096, + "step": 38505 + }, + { + "epoch": 4.2365236523652365, + "grad_norm": 0.018225140869617462, + "learning_rate": 4.8119609469503326e-05, + "loss": 0.0762, + "num_input_tokens_seen": 8125120, + "step": 38510 + }, + { + "epoch": 4.237073707370737, + "grad_norm": 1.0935263633728027, + "learning_rate": 4.811869615736899e-05, + "loss": 0.0333, + "num_input_tokens_seen": 8126240, + "step": 38515 + }, + { + "epoch": 4.237623762376238, + "grad_norm": 0.21956802904605865, + "learning_rate": 4.811778263216073e-05, + "loss": 0.033, + "num_input_tokens_seen": 8127328, + "step": 38520 + }, + { + "epoch": 4.238173817381738, + "grad_norm": 0.055926110595464706, + "learning_rate": 4.811686889388696e-05, + "loss": 0.0462, + "num_input_tokens_seen": 8128416, + "step": 38525 + }, + { + "epoch": 4.238723872387239, + "grad_norm": 0.021116597577929497, + "learning_rate": 4.81159549425561e-05, + "loss": 0.0239, + "num_input_tokens_seen": 8129408, + "step": 38530 + }, + { + "epoch": 4.239273927392739, + "grad_norm": 1.1492791175842285, + "learning_rate": 4.811504077817658e-05, + "loss": 0.078, + "num_input_tokens_seen": 8130432, + "step": 38535 + }, + { + "epoch": 4.2398239823982395, + "grad_norm": 0.3439245820045471, + "learning_rate": 4.811412640075681e-05, + "loss": 0.0711, + "num_input_tokens_seen": 8131456, + "step": 38540 + }, + { + "epoch": 4.240374037403741, + "grad_norm": 0.0979854017496109, + "learning_rate": 4.8113211810305235e-05, + "loss": 0.1011, + "num_input_tokens_seen": 8132544, + "step": 38545 + }, + { + "epoch": 4.240924092409241, + "grad_norm": 0.24512329697608948, + "learning_rate": 4.811229700683028e-05, + "loss": 0.1399, + "num_input_tokens_seen": 8133568, + "step": 38550 + }, + { + "epoch": 4.241474147414741, + "grad_norm": 0.4192315340042114, + "learning_rate": 4.8111381990340374e-05, + "loss": 0.0563, + "num_input_tokens_seen": 8134656, + "step": 38555 + }, + { + "epoch": 4.242024202420242, + "grad_norm": 0.7892796397209167, + "learning_rate": 4.811046676084395e-05, + "loss": 0.0827, + "num_input_tokens_seen": 8135776, + "step": 38560 + }, + { + "epoch": 4.242574257425742, + "grad_norm": 0.05647876486182213, + "learning_rate": 4.8109551318349436e-05, + "loss": 0.0476, + "num_input_tokens_seen": 8136800, + "step": 38565 + }, + { + "epoch": 4.243124312431243, + "grad_norm": 0.5363205075263977, + "learning_rate": 4.810863566286529e-05, + "loss": 0.0559, + "num_input_tokens_seen": 8137888, + "step": 38570 + }, + { + "epoch": 4.243674367436744, + "grad_norm": 1.0566173791885376, + "learning_rate": 4.810771979439992e-05, + "loss": 0.0723, + "num_input_tokens_seen": 8138976, + "step": 38575 + }, + { + "epoch": 4.244224422442244, + "grad_norm": 0.01154788676649332, + "learning_rate": 4.81068037129618e-05, + "loss": 0.0571, + "num_input_tokens_seen": 8140032, + "step": 38580 + }, + { + "epoch": 4.244774477447745, + "grad_norm": 0.012653294950723648, + "learning_rate": 4.8105887418559346e-05, + "loss": 0.0107, + "num_input_tokens_seen": 8141056, + "step": 38585 + }, + { + "epoch": 4.245324532453245, + "grad_norm": 0.04799129068851471, + "learning_rate": 4.8104970911201025e-05, + "loss": 0.1833, + "num_input_tokens_seen": 8142112, + "step": 38590 + }, + { + "epoch": 4.245874587458746, + "grad_norm": 0.10681046545505524, + "learning_rate": 4.810405419089527e-05, + "loss": 0.0223, + "num_input_tokens_seen": 8143136, + "step": 38595 + }, + { + "epoch": 4.2464246424642464, + "grad_norm": 0.8845773339271545, + "learning_rate": 4.8103137257650534e-05, + "loss": 0.0792, + "num_input_tokens_seen": 8144160, + "step": 38600 + }, + { + "epoch": 4.246974697469747, + "grad_norm": 0.7517310380935669, + "learning_rate": 4.810222011147527e-05, + "loss": 0.0252, + "num_input_tokens_seen": 8145248, + "step": 38605 + }, + { + "epoch": 4.247524752475248, + "grad_norm": 0.1839783936738968, + "learning_rate": 4.810130275237792e-05, + "loss": 0.0115, + "num_input_tokens_seen": 8146304, + "step": 38610 + }, + { + "epoch": 4.248074807480748, + "grad_norm": 0.041662778705358505, + "learning_rate": 4.8100385180366955e-05, + "loss": 0.0223, + "num_input_tokens_seen": 8147392, + "step": 38615 + }, + { + "epoch": 4.248624862486249, + "grad_norm": 0.25447502732276917, + "learning_rate": 4.809946739545083e-05, + "loss": 0.085, + "num_input_tokens_seen": 8148448, + "step": 38620 + }, + { + "epoch": 4.249174917491749, + "grad_norm": 0.3366033136844635, + "learning_rate": 4.8098549397637985e-05, + "loss": 0.1156, + "num_input_tokens_seen": 8149568, + "step": 38625 + }, + { + "epoch": 4.2497249724972495, + "grad_norm": 0.3025222718715668, + "learning_rate": 4.809763118693691e-05, + "loss": 0.0361, + "num_input_tokens_seen": 8150624, + "step": 38630 + }, + { + "epoch": 4.2502750275027505, + "grad_norm": 0.36365291476249695, + "learning_rate": 4.8096712763356036e-05, + "loss": 0.1174, + "num_input_tokens_seen": 8151712, + "step": 38635 + }, + { + "epoch": 4.250825082508251, + "grad_norm": 0.21945108473300934, + "learning_rate": 4.809579412690385e-05, + "loss": 0.0151, + "num_input_tokens_seen": 8152832, + "step": 38640 + }, + { + "epoch": 4.251375137513751, + "grad_norm": 0.0716797262430191, + "learning_rate": 4.8094875277588815e-05, + "loss": 0.0377, + "num_input_tokens_seen": 8153888, + "step": 38645 + }, + { + "epoch": 4.251925192519252, + "grad_norm": 1.212182641029358, + "learning_rate": 4.809395621541939e-05, + "loss": 0.0529, + "num_input_tokens_seen": 8154912, + "step": 38650 + }, + { + "epoch": 4.252475247524752, + "grad_norm": 0.36692264676094055, + "learning_rate": 4.809303694040406e-05, + "loss": 0.046, + "num_input_tokens_seen": 8156032, + "step": 38655 + }, + { + "epoch": 4.253025302530253, + "grad_norm": 1.1981415748596191, + "learning_rate": 4.8092117452551286e-05, + "loss": 0.108, + "num_input_tokens_seen": 8157120, + "step": 38660 + }, + { + "epoch": 4.2535753575357536, + "grad_norm": 0.43125689029693604, + "learning_rate": 4.8091197751869544e-05, + "loss": 0.0692, + "num_input_tokens_seen": 8158208, + "step": 38665 + }, + { + "epoch": 4.254125412541254, + "grad_norm": 0.43597933650016785, + "learning_rate": 4.809027783836731e-05, + "loss": 0.0413, + "num_input_tokens_seen": 8159328, + "step": 38670 + }, + { + "epoch": 4.254675467546755, + "grad_norm": 0.25660592317581177, + "learning_rate": 4.808935771205307e-05, + "loss": 0.0287, + "num_input_tokens_seen": 8160352, + "step": 38675 + }, + { + "epoch": 4.255225522552255, + "grad_norm": 0.04382515326142311, + "learning_rate": 4.8088437372935305e-05, + "loss": 0.013, + "num_input_tokens_seen": 8161440, + "step": 38680 + }, + { + "epoch": 4.255775577557756, + "grad_norm": 0.20505134761333466, + "learning_rate": 4.808751682102248e-05, + "loss": 0.096, + "num_input_tokens_seen": 8162496, + "step": 38685 + }, + { + "epoch": 4.256325632563256, + "grad_norm": 0.01691809855401516, + "learning_rate": 4.8086596056323094e-05, + "loss": 0.0102, + "num_input_tokens_seen": 8163552, + "step": 38690 + }, + { + "epoch": 4.256875687568757, + "grad_norm": 0.0483037605881691, + "learning_rate": 4.808567507884563e-05, + "loss": 0.0795, + "num_input_tokens_seen": 8164608, + "step": 38695 + }, + { + "epoch": 4.257425742574258, + "grad_norm": 0.8051183819770813, + "learning_rate": 4.808475388859858e-05, + "loss": 0.0543, + "num_input_tokens_seen": 8165664, + "step": 38700 + }, + { + "epoch": 4.257975797579758, + "grad_norm": 0.009455753490328789, + "learning_rate": 4.808383248559043e-05, + "loss": 0.0307, + "num_input_tokens_seen": 8166720, + "step": 38705 + }, + { + "epoch": 4.258525852585258, + "grad_norm": 0.5052913427352905, + "learning_rate": 4.808291086982967e-05, + "loss": 0.0712, + "num_input_tokens_seen": 8167712, + "step": 38710 + }, + { + "epoch": 4.259075907590759, + "grad_norm": 0.09955193847417831, + "learning_rate": 4.80819890413248e-05, + "loss": 0.0245, + "num_input_tokens_seen": 8168768, + "step": 38715 + }, + { + "epoch": 4.259625962596259, + "grad_norm": 1.4988154172897339, + "learning_rate": 4.808106700008431e-05, + "loss": 0.1098, + "num_input_tokens_seen": 8169824, + "step": 38720 + }, + { + "epoch": 4.2601760176017605, + "grad_norm": 0.07050051540136337, + "learning_rate": 4.80801447461167e-05, + "loss": 0.1146, + "num_input_tokens_seen": 8170848, + "step": 38725 + }, + { + "epoch": 4.260726072607261, + "grad_norm": 0.365008145570755, + "learning_rate": 4.8079222279430476e-05, + "loss": 0.0128, + "num_input_tokens_seen": 8171968, + "step": 38730 + }, + { + "epoch": 4.261276127612761, + "grad_norm": 0.022864682599902153, + "learning_rate": 4.807829960003413e-05, + "loss": 0.0635, + "num_input_tokens_seen": 8173024, + "step": 38735 + }, + { + "epoch": 4.261826182618262, + "grad_norm": 0.23815162479877472, + "learning_rate": 4.8077376707936174e-05, + "loss": 0.0335, + "num_input_tokens_seen": 8174048, + "step": 38740 + }, + { + "epoch": 4.262376237623762, + "grad_norm": 0.24233664572238922, + "learning_rate": 4.807645360314511e-05, + "loss": 0.0348, + "num_input_tokens_seen": 8175072, + "step": 38745 + }, + { + "epoch": 4.262926292629263, + "grad_norm": 0.03723277896642685, + "learning_rate": 4.8075530285669444e-05, + "loss": 0.1074, + "num_input_tokens_seen": 8176096, + "step": 38750 + }, + { + "epoch": 4.2634763476347635, + "grad_norm": 0.5516993403434753, + "learning_rate": 4.807460675551769e-05, + "loss": 0.0338, + "num_input_tokens_seen": 8177056, + "step": 38755 + }, + { + "epoch": 4.264026402640264, + "grad_norm": 0.06796971708536148, + "learning_rate": 4.807368301269836e-05, + "loss": 0.0262, + "num_input_tokens_seen": 8178112, + "step": 38760 + }, + { + "epoch": 4.264576457645765, + "grad_norm": 0.21549540758132935, + "learning_rate": 4.8072759057219955e-05, + "loss": 0.0395, + "num_input_tokens_seen": 8179168, + "step": 38765 + }, + { + "epoch": 4.265126512651265, + "grad_norm": 0.9738562107086182, + "learning_rate": 4.8071834889091e-05, + "loss": 0.0444, + "num_input_tokens_seen": 8180192, + "step": 38770 + }, + { + "epoch": 4.265676567656766, + "grad_norm": 0.1620384305715561, + "learning_rate": 4.807091050832003e-05, + "loss": 0.0189, + "num_input_tokens_seen": 8181280, + "step": 38775 + }, + { + "epoch": 4.266226622662266, + "grad_norm": 0.04197302088141441, + "learning_rate": 4.806998591491554e-05, + "loss": 0.0324, + "num_input_tokens_seen": 8182336, + "step": 38780 + }, + { + "epoch": 4.2667766776677665, + "grad_norm": 0.08152422308921814, + "learning_rate": 4.806906110888606e-05, + "loss": 0.1236, + "num_input_tokens_seen": 8183360, + "step": 38785 + }, + { + "epoch": 4.267326732673268, + "grad_norm": 1.7228895425796509, + "learning_rate": 4.806813609024011e-05, + "loss": 0.2037, + "num_input_tokens_seen": 8184384, + "step": 38790 + }, + { + "epoch": 4.267876787678768, + "grad_norm": 0.28674864768981934, + "learning_rate": 4.806721085898623e-05, + "loss": 0.0151, + "num_input_tokens_seen": 8185472, + "step": 38795 + }, + { + "epoch": 4.268426842684269, + "grad_norm": 1.2497856616973877, + "learning_rate": 4.806628541513293e-05, + "loss": 0.1175, + "num_input_tokens_seen": 8186528, + "step": 38800 + }, + { + "epoch": 4.268976897689769, + "grad_norm": 0.5451351404190063, + "learning_rate": 4.806535975868874e-05, + "loss": 0.0264, + "num_input_tokens_seen": 8187616, + "step": 38805 + }, + { + "epoch": 4.269526952695269, + "grad_norm": 0.19020555913448334, + "learning_rate": 4.80644338896622e-05, + "loss": 0.0126, + "num_input_tokens_seen": 8188736, + "step": 38810 + }, + { + "epoch": 4.27007700770077, + "grad_norm": 1.6037304401397705, + "learning_rate": 4.8063507808061846e-05, + "loss": 0.0315, + "num_input_tokens_seen": 8189792, + "step": 38815 + }, + { + "epoch": 4.270627062706271, + "grad_norm": 0.08255279064178467, + "learning_rate": 4.80625815138962e-05, + "loss": 0.0774, + "num_input_tokens_seen": 8190848, + "step": 38820 + }, + { + "epoch": 4.271177117711771, + "grad_norm": 0.7890682220458984, + "learning_rate": 4.806165500717381e-05, + "loss": 0.1529, + "num_input_tokens_seen": 8191840, + "step": 38825 + }, + { + "epoch": 4.271727172717272, + "grad_norm": 0.10949226468801498, + "learning_rate": 4.8060728287903214e-05, + "loss": 0.0789, + "num_input_tokens_seen": 8192864, + "step": 38830 + }, + { + "epoch": 4.272277227722772, + "grad_norm": 0.08335939049720764, + "learning_rate": 4.805980135609295e-05, + "loss": 0.0521, + "num_input_tokens_seen": 8193920, + "step": 38835 + }, + { + "epoch": 4.272827282728273, + "grad_norm": 0.024316949769854546, + "learning_rate": 4.805887421175156e-05, + "loss": 0.1166, + "num_input_tokens_seen": 8194944, + "step": 38840 + }, + { + "epoch": 4.273377337733773, + "grad_norm": 0.04910970851778984, + "learning_rate": 4.805794685488759e-05, + "loss": 0.0134, + "num_input_tokens_seen": 8196000, + "step": 38845 + }, + { + "epoch": 4.273927392739274, + "grad_norm": 0.8610523343086243, + "learning_rate": 4.805701928550959e-05, + "loss": 0.0519, + "num_input_tokens_seen": 8197120, + "step": 38850 + }, + { + "epoch": 4.274477447744775, + "grad_norm": 0.040053389966487885, + "learning_rate": 4.80560915036261e-05, + "loss": 0.0637, + "num_input_tokens_seen": 8198144, + "step": 38855 + }, + { + "epoch": 4.275027502750275, + "grad_norm": 0.41435086727142334, + "learning_rate": 4.8055163509245694e-05, + "loss": 0.085, + "num_input_tokens_seen": 8199232, + "step": 38860 + }, + { + "epoch": 4.275577557755776, + "grad_norm": 0.03976843133568764, + "learning_rate": 4.805423530237689e-05, + "loss": 0.0315, + "num_input_tokens_seen": 8200352, + "step": 38865 + }, + { + "epoch": 4.276127612761276, + "grad_norm": 0.05291232466697693, + "learning_rate": 4.805330688302828e-05, + "loss": 0.0567, + "num_input_tokens_seen": 8201376, + "step": 38870 + }, + { + "epoch": 4.276677667766776, + "grad_norm": 1.6958688497543335, + "learning_rate": 4.80523782512084e-05, + "loss": 0.0514, + "num_input_tokens_seen": 8202368, + "step": 38875 + }, + { + "epoch": 4.2772277227722775, + "grad_norm": 0.07947923988103867, + "learning_rate": 4.8051449406925795e-05, + "loss": 0.0661, + "num_input_tokens_seen": 8203424, + "step": 38880 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.07387978583574295, + "learning_rate": 4.805052035018905e-05, + "loss": 0.0116, + "num_input_tokens_seen": 8204480, + "step": 38885 + }, + { + "epoch": 4.278327832783278, + "grad_norm": 0.6174209117889404, + "learning_rate": 4.804959108100673e-05, + "loss": 0.0394, + "num_input_tokens_seen": 8205472, + "step": 38890 + }, + { + "epoch": 4.278877887788779, + "grad_norm": 0.14781753718852997, + "learning_rate": 4.804866159938738e-05, + "loss": 0.02, + "num_input_tokens_seen": 8206496, + "step": 38895 + }, + { + "epoch": 4.279427942794279, + "grad_norm": 0.051224566996097565, + "learning_rate": 4.8047731905339574e-05, + "loss": 0.0967, + "num_input_tokens_seen": 8207616, + "step": 38900 + }, + { + "epoch": 4.27997799779978, + "grad_norm": 0.39709508419036865, + "learning_rate": 4.804680199887189e-05, + "loss": 0.085, + "num_input_tokens_seen": 8208640, + "step": 38905 + }, + { + "epoch": 4.2805280528052805, + "grad_norm": 0.016487162560224533, + "learning_rate": 4.804587187999289e-05, + "loss": 0.0252, + "num_input_tokens_seen": 8209664, + "step": 38910 + }, + { + "epoch": 4.281078107810781, + "grad_norm": 0.10835763812065125, + "learning_rate": 4.804494154871114e-05, + "loss": 0.1085, + "num_input_tokens_seen": 8210688, + "step": 38915 + }, + { + "epoch": 4.281628162816282, + "grad_norm": 0.14473606646060944, + "learning_rate": 4.804401100503523e-05, + "loss": 0.1568, + "num_input_tokens_seen": 8211744, + "step": 38920 + }, + { + "epoch": 4.282178217821782, + "grad_norm": 1.9760971069335938, + "learning_rate": 4.804308024897371e-05, + "loss": 0.0926, + "num_input_tokens_seen": 8212800, + "step": 38925 + }, + { + "epoch": 4.282728272827283, + "grad_norm": 0.04318656027317047, + "learning_rate": 4.80421492805352e-05, + "loss": 0.0556, + "num_input_tokens_seen": 8213888, + "step": 38930 + }, + { + "epoch": 4.283278327832783, + "grad_norm": 0.15063001215457916, + "learning_rate": 4.804121809972825e-05, + "loss": 0.0569, + "num_input_tokens_seen": 8214912, + "step": 38935 + }, + { + "epoch": 4.2838283828382835, + "grad_norm": 0.05579562857747078, + "learning_rate": 4.804028670656145e-05, + "loss": 0.013, + "num_input_tokens_seen": 8215936, + "step": 38940 + }, + { + "epoch": 4.284378437843785, + "grad_norm": 0.035481665283441544, + "learning_rate": 4.803935510104337e-05, + "loss": 0.0541, + "num_input_tokens_seen": 8217024, + "step": 38945 + }, + { + "epoch": 4.284928492849285, + "grad_norm": 1.0441455841064453, + "learning_rate": 4.8038423283182623e-05, + "loss": 0.1482, + "num_input_tokens_seen": 8218144, + "step": 38950 + }, + { + "epoch": 4.285478547854786, + "grad_norm": 0.3366973102092743, + "learning_rate": 4.803749125298777e-05, + "loss": 0.1072, + "num_input_tokens_seen": 8219168, + "step": 38955 + }, + { + "epoch": 4.286028602860286, + "grad_norm": 0.15748730301856995, + "learning_rate": 4.803655901046743e-05, + "loss": 0.0106, + "num_input_tokens_seen": 8220192, + "step": 38960 + }, + { + "epoch": 4.286578657865786, + "grad_norm": 0.7356612086296082, + "learning_rate": 4.803562655563018e-05, + "loss": 0.0352, + "num_input_tokens_seen": 8221312, + "step": 38965 + }, + { + "epoch": 4.287128712871287, + "grad_norm": 1.4034215211868286, + "learning_rate": 4.80346938884846e-05, + "loss": 0.1245, + "num_input_tokens_seen": 8222400, + "step": 38970 + }, + { + "epoch": 4.287678767876788, + "grad_norm": 0.6254772543907166, + "learning_rate": 4.80337610090393e-05, + "loss": 0.0751, + "num_input_tokens_seen": 8223552, + "step": 38975 + }, + { + "epoch": 4.288228822882289, + "grad_norm": 0.040440939366817474, + "learning_rate": 4.8032827917302895e-05, + "loss": 0.0503, + "num_input_tokens_seen": 8224640, + "step": 38980 + }, + { + "epoch": 4.288778877887789, + "grad_norm": 0.06254494935274124, + "learning_rate": 4.803189461328395e-05, + "loss": 0.0562, + "num_input_tokens_seen": 8225728, + "step": 38985 + }, + { + "epoch": 4.289328932893289, + "grad_norm": 0.04263708367943764, + "learning_rate": 4.803096109699109e-05, + "loss": 0.0264, + "num_input_tokens_seen": 8226752, + "step": 38990 + }, + { + "epoch": 4.28987898789879, + "grad_norm": 0.033665549010038376, + "learning_rate": 4.803002736843292e-05, + "loss": 0.0154, + "num_input_tokens_seen": 8227840, + "step": 38995 + }, + { + "epoch": 4.29042904290429, + "grad_norm": 0.10941651463508606, + "learning_rate": 4.802909342761803e-05, + "loss": 0.0419, + "num_input_tokens_seen": 8228928, + "step": 39000 + }, + { + "epoch": 4.290979097909791, + "grad_norm": 0.04392515867948532, + "learning_rate": 4.8028159274555033e-05, + "loss": 0.0153, + "num_input_tokens_seen": 8229984, + "step": 39005 + }, + { + "epoch": 4.291529152915292, + "grad_norm": 0.28257954120635986, + "learning_rate": 4.802722490925255e-05, + "loss": 0.0223, + "num_input_tokens_seen": 8231072, + "step": 39010 + }, + { + "epoch": 4.292079207920792, + "grad_norm": 1.1547554731369019, + "learning_rate": 4.802629033171918e-05, + "loss": 0.077, + "num_input_tokens_seen": 8232096, + "step": 39015 + }, + { + "epoch": 4.292629262926293, + "grad_norm": 1.3414372205734253, + "learning_rate": 4.802535554196355e-05, + "loss": 0.0813, + "num_input_tokens_seen": 8233088, + "step": 39020 + }, + { + "epoch": 4.293179317931793, + "grad_norm": 0.12955324351787567, + "learning_rate": 4.802442053999425e-05, + "loss": 0.0346, + "num_input_tokens_seen": 8234208, + "step": 39025 + }, + { + "epoch": 4.293729372937293, + "grad_norm": 0.022259624674916267, + "learning_rate": 4.802348532581993e-05, + "loss": 0.0568, + "num_input_tokens_seen": 8235264, + "step": 39030 + }, + { + "epoch": 4.2942794279427945, + "grad_norm": 0.6230353713035583, + "learning_rate": 4.8022549899449186e-05, + "loss": 0.0543, + "num_input_tokens_seen": 8236352, + "step": 39035 + }, + { + "epoch": 4.294829482948295, + "grad_norm": 0.5005927681922913, + "learning_rate": 4.8021614260890647e-05, + "loss": 0.0496, + "num_input_tokens_seen": 8237408, + "step": 39040 + }, + { + "epoch": 4.295379537953796, + "grad_norm": 0.6654898524284363, + "learning_rate": 4.8020678410152935e-05, + "loss": 0.1333, + "num_input_tokens_seen": 8238496, + "step": 39045 + }, + { + "epoch": 4.295929592959296, + "grad_norm": 0.07957768440246582, + "learning_rate": 4.801974234724468e-05, + "loss": 0.0172, + "num_input_tokens_seen": 8239616, + "step": 39050 + }, + { + "epoch": 4.296479647964796, + "grad_norm": 0.2250085473060608, + "learning_rate": 4.80188060721745e-05, + "loss": 0.0235, + "num_input_tokens_seen": 8240736, + "step": 39055 + }, + { + "epoch": 4.297029702970297, + "grad_norm": 0.175466850399971, + "learning_rate": 4.8017869584951035e-05, + "loss": 0.053, + "num_input_tokens_seen": 8241824, + "step": 39060 + }, + { + "epoch": 4.2975797579757975, + "grad_norm": 0.6631718873977661, + "learning_rate": 4.801693288558291e-05, + "loss": 0.04, + "num_input_tokens_seen": 8242848, + "step": 39065 + }, + { + "epoch": 4.298129812981298, + "grad_norm": 0.6310617923736572, + "learning_rate": 4.8015995974078764e-05, + "loss": 0.0626, + "num_input_tokens_seen": 8243872, + "step": 39070 + }, + { + "epoch": 4.298679867986799, + "grad_norm": 1.1047266721725464, + "learning_rate": 4.801505885044723e-05, + "loss": 0.0862, + "num_input_tokens_seen": 8244896, + "step": 39075 + }, + { + "epoch": 4.299229922992299, + "grad_norm": 1.4661812782287598, + "learning_rate": 4.801412151469693e-05, + "loss": 0.1842, + "num_input_tokens_seen": 8245952, + "step": 39080 + }, + { + "epoch": 4.2997799779978, + "grad_norm": 1.60382080078125, + "learning_rate": 4.801318396683652e-05, + "loss": 0.1169, + "num_input_tokens_seen": 8246976, + "step": 39085 + }, + { + "epoch": 4.3003300330033, + "grad_norm": 0.14849913120269775, + "learning_rate": 4.801224620687463e-05, + "loss": 0.0438, + "num_input_tokens_seen": 8247936, + "step": 39090 + }, + { + "epoch": 4.3008800880088005, + "grad_norm": 0.09832124412059784, + "learning_rate": 4.8011308234819916e-05, + "loss": 0.073, + "num_input_tokens_seen": 8248992, + "step": 39095 + }, + { + "epoch": 4.301430143014302, + "grad_norm": 0.044320061802864075, + "learning_rate": 4.801037005068102e-05, + "loss": 0.0279, + "num_input_tokens_seen": 8250016, + "step": 39100 + }, + { + "epoch": 4.301980198019802, + "grad_norm": 0.010734978131949902, + "learning_rate": 4.800943165446657e-05, + "loss": 0.0788, + "num_input_tokens_seen": 8251136, + "step": 39105 + }, + { + "epoch": 4.302530253025303, + "grad_norm": 0.30689167976379395, + "learning_rate": 4.800849304618525e-05, + "loss": 0.0405, + "num_input_tokens_seen": 8252192, + "step": 39110 + }, + { + "epoch": 4.303080308030803, + "grad_norm": 0.2353629171848297, + "learning_rate": 4.8007554225845674e-05, + "loss": 0.0797, + "num_input_tokens_seen": 8253216, + "step": 39115 + }, + { + "epoch": 4.303630363036303, + "grad_norm": 0.14065003395080566, + "learning_rate": 4.8006615193456514e-05, + "loss": 0.0369, + "num_input_tokens_seen": 8254240, + "step": 39120 + }, + { + "epoch": 4.304180418041804, + "grad_norm": 0.23263207077980042, + "learning_rate": 4.800567594902643e-05, + "loss": 0.0562, + "num_input_tokens_seen": 8255328, + "step": 39125 + }, + { + "epoch": 4.304730473047305, + "grad_norm": 0.8906576633453369, + "learning_rate": 4.800473649256406e-05, + "loss": 0.1004, + "num_input_tokens_seen": 8256352, + "step": 39130 + }, + { + "epoch": 4.305280528052805, + "grad_norm": 0.14704446494579315, + "learning_rate": 4.800379682407808e-05, + "loss": 0.0557, + "num_input_tokens_seen": 8257472, + "step": 39135 + }, + { + "epoch": 4.305830583058306, + "grad_norm": 0.07856491208076477, + "learning_rate": 4.800285694357714e-05, + "loss": 0.0304, + "num_input_tokens_seen": 8258496, + "step": 39140 + }, + { + "epoch": 4.306380638063806, + "grad_norm": 1.1415716409683228, + "learning_rate": 4.800191685106991e-05, + "loss": 0.0619, + "num_input_tokens_seen": 8259520, + "step": 39145 + }, + { + "epoch": 4.306930693069307, + "grad_norm": 0.14296582341194153, + "learning_rate": 4.8000976546565036e-05, + "loss": 0.0745, + "num_input_tokens_seen": 8260576, + "step": 39150 + }, + { + "epoch": 4.307480748074807, + "grad_norm": 0.25865352153778076, + "learning_rate": 4.800003603007121e-05, + "loss": 0.1612, + "num_input_tokens_seen": 8261600, + "step": 39155 + }, + { + "epoch": 4.3080308030803085, + "grad_norm": 1.7865653038024902, + "learning_rate": 4.7999095301597084e-05, + "loss": 0.1338, + "num_input_tokens_seen": 8262624, + "step": 39160 + }, + { + "epoch": 4.308580858085809, + "grad_norm": 0.13567891716957092, + "learning_rate": 4.799815436115134e-05, + "loss": 0.0771, + "num_input_tokens_seen": 8263648, + "step": 39165 + }, + { + "epoch": 4.309130913091309, + "grad_norm": 0.0697622150182724, + "learning_rate": 4.799721320874263e-05, + "loss": 0.0324, + "num_input_tokens_seen": 8264768, + "step": 39170 + }, + { + "epoch": 4.30968096809681, + "grad_norm": 0.28831467032432556, + "learning_rate": 4.799627184437965e-05, + "loss": 0.0333, + "num_input_tokens_seen": 8265888, + "step": 39175 + }, + { + "epoch": 4.31023102310231, + "grad_norm": 0.6466739177703857, + "learning_rate": 4.799533026807107e-05, + "loss": 0.1214, + "num_input_tokens_seen": 8266976, + "step": 39180 + }, + { + "epoch": 4.31078107810781, + "grad_norm": 0.2417551428079605, + "learning_rate": 4.799438847982555e-05, + "loss": 0.0265, + "num_input_tokens_seen": 8268064, + "step": 39185 + }, + { + "epoch": 4.3113311331133115, + "grad_norm": 0.0406217984855175, + "learning_rate": 4.79934464796518e-05, + "loss": 0.0251, + "num_input_tokens_seen": 8269152, + "step": 39190 + }, + { + "epoch": 4.311881188118812, + "grad_norm": 0.21055471897125244, + "learning_rate": 4.799250426755848e-05, + "loss": 0.0466, + "num_input_tokens_seen": 8270240, + "step": 39195 + }, + { + "epoch": 4.312431243124313, + "grad_norm": 0.04262929409742355, + "learning_rate": 4.799156184355428e-05, + "loss": 0.0776, + "num_input_tokens_seen": 8271360, + "step": 39200 + }, + { + "epoch": 4.312981298129813, + "grad_norm": 0.1380627453327179, + "learning_rate": 4.799061920764789e-05, + "loss": 0.0351, + "num_input_tokens_seen": 8272480, + "step": 39205 + }, + { + "epoch": 4.313531353135313, + "grad_norm": 1.0321770906448364, + "learning_rate": 4.7989676359848e-05, + "loss": 0.0738, + "num_input_tokens_seen": 8273536, + "step": 39210 + }, + { + "epoch": 4.314081408140814, + "grad_norm": 0.03750620782375336, + "learning_rate": 4.798873330016329e-05, + "loss": 0.0076, + "num_input_tokens_seen": 8274592, + "step": 39215 + }, + { + "epoch": 4.3146314631463145, + "grad_norm": 0.03647632151842117, + "learning_rate": 4.7987790028602454e-05, + "loss": 0.0374, + "num_input_tokens_seen": 8275680, + "step": 39220 + }, + { + "epoch": 4.315181518151816, + "grad_norm": 0.29333794116973877, + "learning_rate": 4.798684654517419e-05, + "loss": 0.0443, + "num_input_tokens_seen": 8276736, + "step": 39225 + }, + { + "epoch": 4.315731573157316, + "grad_norm": 1.0291389226913452, + "learning_rate": 4.798590284988719e-05, + "loss": 0.0741, + "num_input_tokens_seen": 8277760, + "step": 39230 + }, + { + "epoch": 4.316281628162816, + "grad_norm": 0.529800295829773, + "learning_rate": 4.798495894275015e-05, + "loss": 0.0826, + "num_input_tokens_seen": 8278848, + "step": 39235 + }, + { + "epoch": 4.316831683168317, + "grad_norm": 0.12552012503147125, + "learning_rate": 4.798401482377178e-05, + "loss": 0.0304, + "num_input_tokens_seen": 8279904, + "step": 39240 + }, + { + "epoch": 4.317381738173817, + "grad_norm": 0.4022986888885498, + "learning_rate": 4.798307049296077e-05, + "loss": 0.0078, + "num_input_tokens_seen": 8280928, + "step": 39245 + }, + { + "epoch": 4.3179317931793175, + "grad_norm": 0.19373340904712677, + "learning_rate": 4.798212595032583e-05, + "loss": 0.0261, + "num_input_tokens_seen": 8282016, + "step": 39250 + }, + { + "epoch": 4.318481848184819, + "grad_norm": 0.06679052859544754, + "learning_rate": 4.798118119587566e-05, + "loss": 0.0127, + "num_input_tokens_seen": 8283104, + "step": 39255 + }, + { + "epoch": 4.319031903190319, + "grad_norm": 0.40723925828933716, + "learning_rate": 4.7980236229618974e-05, + "loss": 0.0681, + "num_input_tokens_seen": 8284128, + "step": 39260 + }, + { + "epoch": 4.31958195819582, + "grad_norm": 0.2500942349433899, + "learning_rate": 4.797929105156448e-05, + "loss": 0.1041, + "num_input_tokens_seen": 8285184, + "step": 39265 + }, + { + "epoch": 4.32013201320132, + "grad_norm": 0.549050509929657, + "learning_rate": 4.797834566172088e-05, + "loss": 0.0912, + "num_input_tokens_seen": 8286272, + "step": 39270 + }, + { + "epoch": 4.32068206820682, + "grad_norm": 0.3270789086818695, + "learning_rate": 4.797740006009689e-05, + "loss": 0.0265, + "num_input_tokens_seen": 8287360, + "step": 39275 + }, + { + "epoch": 4.321232123212321, + "grad_norm": 0.46873268485069275, + "learning_rate": 4.7976454246701244e-05, + "loss": 0.0591, + "num_input_tokens_seen": 8288416, + "step": 39280 + }, + { + "epoch": 4.321782178217822, + "grad_norm": 0.30107900500297546, + "learning_rate": 4.797550822154263e-05, + "loss": 0.1039, + "num_input_tokens_seen": 8289472, + "step": 39285 + }, + { + "epoch": 4.322332233223323, + "grad_norm": 0.014892435632646084, + "learning_rate": 4.797456198462979e-05, + "loss": 0.0081, + "num_input_tokens_seen": 8290528, + "step": 39290 + }, + { + "epoch": 4.322882288228823, + "grad_norm": 0.18619690835475922, + "learning_rate": 4.797361553597143e-05, + "loss": 0.0862, + "num_input_tokens_seen": 8291584, + "step": 39295 + }, + { + "epoch": 4.323432343234323, + "grad_norm": 0.12379462271928787, + "learning_rate": 4.7972668875576285e-05, + "loss": 0.0089, + "num_input_tokens_seen": 8292672, + "step": 39300 + }, + { + "epoch": 4.323982398239824, + "grad_norm": 0.10301479697227478, + "learning_rate": 4.7971722003453076e-05, + "loss": 0.0117, + "num_input_tokens_seen": 8293728, + "step": 39305 + }, + { + "epoch": 4.324532453245324, + "grad_norm": 0.02617192082107067, + "learning_rate": 4.7970774919610525e-05, + "loss": 0.0491, + "num_input_tokens_seen": 8294784, + "step": 39310 + }, + { + "epoch": 4.325082508250825, + "grad_norm": 0.17953413724899292, + "learning_rate": 4.796982762405736e-05, + "loss": 0.0157, + "num_input_tokens_seen": 8295840, + "step": 39315 + }, + { + "epoch": 4.325632563256326, + "grad_norm": 1.200649380683899, + "learning_rate": 4.7968880116802326e-05, + "loss": 0.1354, + "num_input_tokens_seen": 8296960, + "step": 39320 + }, + { + "epoch": 4.326182618261826, + "grad_norm": 0.029566042125225067, + "learning_rate": 4.796793239785414e-05, + "loss": 0.0471, + "num_input_tokens_seen": 8298016, + "step": 39325 + }, + { + "epoch": 4.326732673267327, + "grad_norm": 1.2624605894088745, + "learning_rate": 4.7966984467221544e-05, + "loss": 0.1379, + "num_input_tokens_seen": 8299040, + "step": 39330 + }, + { + "epoch": 4.327282728272827, + "grad_norm": 0.052428144961595535, + "learning_rate": 4.7966036324913266e-05, + "loss": 0.0353, + "num_input_tokens_seen": 8300128, + "step": 39335 + }, + { + "epoch": 4.327832783278327, + "grad_norm": 0.5564770698547363, + "learning_rate": 4.7965087970938055e-05, + "loss": 0.0451, + "num_input_tokens_seen": 8301184, + "step": 39340 + }, + { + "epoch": 4.3283828382838285, + "grad_norm": 0.09195853769779205, + "learning_rate": 4.796413940530465e-05, + "loss": 0.0746, + "num_input_tokens_seen": 8302272, + "step": 39345 + }, + { + "epoch": 4.328932893289329, + "grad_norm": 0.5907503962516785, + "learning_rate": 4.796319062802179e-05, + "loss": 0.0575, + "num_input_tokens_seen": 8303360, + "step": 39350 + }, + { + "epoch": 4.32948294829483, + "grad_norm": 0.09939399361610413, + "learning_rate": 4.796224163909822e-05, + "loss": 0.0133, + "num_input_tokens_seen": 8304416, + "step": 39355 + }, + { + "epoch": 4.33003300330033, + "grad_norm": 0.018872514367103577, + "learning_rate": 4.7961292438542685e-05, + "loss": 0.0437, + "num_input_tokens_seen": 8305408, + "step": 39360 + }, + { + "epoch": 4.33058305830583, + "grad_norm": 0.0825069472193718, + "learning_rate": 4.796034302636394e-05, + "loss": 0.0162, + "num_input_tokens_seen": 8306464, + "step": 39365 + }, + { + "epoch": 4.331133113311331, + "grad_norm": 0.41001150012016296, + "learning_rate": 4.795939340257073e-05, + "loss": 0.0404, + "num_input_tokens_seen": 8307552, + "step": 39370 + }, + { + "epoch": 4.3316831683168315, + "grad_norm": 0.4495946764945984, + "learning_rate": 4.795844356717181e-05, + "loss": 0.0596, + "num_input_tokens_seen": 8308640, + "step": 39375 + }, + { + "epoch": 4.332233223322333, + "grad_norm": 0.05496811494231224, + "learning_rate": 4.795749352017593e-05, + "loss": 0.0586, + "num_input_tokens_seen": 8309728, + "step": 39380 + }, + { + "epoch": 4.332783278327833, + "grad_norm": 0.03568204119801521, + "learning_rate": 4.7956543261591846e-05, + "loss": 0.0797, + "num_input_tokens_seen": 8310784, + "step": 39385 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.12646269798278809, + "learning_rate": 4.7955592791428314e-05, + "loss": 0.0973, + "num_input_tokens_seen": 8311872, + "step": 39390 + }, + { + "epoch": 4.333883388338834, + "grad_norm": 0.07626517117023468, + "learning_rate": 4.795464210969411e-05, + "loss": 0.0674, + "num_input_tokens_seen": 8312896, + "step": 39395 + }, + { + "epoch": 4.334433443344334, + "grad_norm": 0.9747085571289062, + "learning_rate": 4.7953691216397975e-05, + "loss": 0.0987, + "num_input_tokens_seen": 8313920, + "step": 39400 + }, + { + "epoch": 4.334983498349835, + "grad_norm": 0.14426188170909882, + "learning_rate": 4.795274011154869e-05, + "loss": 0.0156, + "num_input_tokens_seen": 8314976, + "step": 39405 + }, + { + "epoch": 4.335533553355336, + "grad_norm": 0.9735906720161438, + "learning_rate": 4.795178879515501e-05, + "loss": 0.033, + "num_input_tokens_seen": 8316032, + "step": 39410 + }, + { + "epoch": 4.336083608360836, + "grad_norm": 0.33174002170562744, + "learning_rate": 4.7950837267225704e-05, + "loss": 0.0372, + "num_input_tokens_seen": 8317056, + "step": 39415 + }, + { + "epoch": 4.336633663366337, + "grad_norm": 0.052122388035058975, + "learning_rate": 4.7949885527769545e-05, + "loss": 0.0121, + "num_input_tokens_seen": 8318112, + "step": 39420 + }, + { + "epoch": 4.337183718371837, + "grad_norm": 0.1759432703256607, + "learning_rate": 4.7948933576795306e-05, + "loss": 0.0466, + "num_input_tokens_seen": 8319168, + "step": 39425 + }, + { + "epoch": 4.337733773377337, + "grad_norm": 0.01625034585595131, + "learning_rate": 4.794798141431176e-05, + "loss": 0.0508, + "num_input_tokens_seen": 8320256, + "step": 39430 + }, + { + "epoch": 4.338283828382838, + "grad_norm": 0.03484374284744263, + "learning_rate": 4.794702904032767e-05, + "loss": 0.1816, + "num_input_tokens_seen": 8321248, + "step": 39435 + }, + { + "epoch": 4.338833883388339, + "grad_norm": 0.5883337259292603, + "learning_rate": 4.794607645485183e-05, + "loss": 0.0247, + "num_input_tokens_seen": 8322272, + "step": 39440 + }, + { + "epoch": 4.33938393839384, + "grad_norm": 0.4912356436252594, + "learning_rate": 4.7945123657893023e-05, + "loss": 0.0933, + "num_input_tokens_seen": 8323328, + "step": 39445 + }, + { + "epoch": 4.33993399339934, + "grad_norm": 0.11337818950414658, + "learning_rate": 4.794417064946001e-05, + "loss": 0.0958, + "num_input_tokens_seen": 8324416, + "step": 39450 + }, + { + "epoch": 4.34048404840484, + "grad_norm": 0.08759557455778122, + "learning_rate": 4.794321742956159e-05, + "loss": 0.0234, + "num_input_tokens_seen": 8325472, + "step": 39455 + }, + { + "epoch": 4.341034103410341, + "grad_norm": 0.04521917179226875, + "learning_rate": 4.794226399820654e-05, + "loss": 0.0511, + "num_input_tokens_seen": 8326592, + "step": 39460 + }, + { + "epoch": 4.341584158415841, + "grad_norm": 0.036787815392017365, + "learning_rate": 4.7941310355403656e-05, + "loss": 0.0343, + "num_input_tokens_seen": 8327584, + "step": 39465 + }, + { + "epoch": 4.3421342134213425, + "grad_norm": 0.1279984712600708, + "learning_rate": 4.794035650116172e-05, + "loss": 0.0687, + "num_input_tokens_seen": 8328576, + "step": 39470 + }, + { + "epoch": 4.342684268426843, + "grad_norm": 0.04137197136878967, + "learning_rate": 4.793940243548953e-05, + "loss": 0.0798, + "num_input_tokens_seen": 8329568, + "step": 39475 + }, + { + "epoch": 4.343234323432343, + "grad_norm": 0.5263304114341736, + "learning_rate": 4.793844815839588e-05, + "loss": 0.0479, + "num_input_tokens_seen": 8330624, + "step": 39480 + }, + { + "epoch": 4.343784378437844, + "grad_norm": 0.1521252691745758, + "learning_rate": 4.7937493669889546e-05, + "loss": 0.0876, + "num_input_tokens_seen": 8331680, + "step": 39485 + }, + { + "epoch": 4.344334433443344, + "grad_norm": 0.10665185004472733, + "learning_rate": 4.793653896997935e-05, + "loss": 0.0228, + "num_input_tokens_seen": 8332768, + "step": 39490 + }, + { + "epoch": 4.3448844884488445, + "grad_norm": 0.04781685769557953, + "learning_rate": 4.793558405867408e-05, + "loss": 0.05, + "num_input_tokens_seen": 8333856, + "step": 39495 + }, + { + "epoch": 4.3454345434543455, + "grad_norm": 0.06972219794988632, + "learning_rate": 4.793462893598253e-05, + "loss": 0.0599, + "num_input_tokens_seen": 8335008, + "step": 39500 + }, + { + "epoch": 4.345984598459846, + "grad_norm": 1.2175960540771484, + "learning_rate": 4.793367360191351e-05, + "loss": 0.0807, + "num_input_tokens_seen": 8336096, + "step": 39505 + }, + { + "epoch": 4.346534653465347, + "grad_norm": 0.3948844373226166, + "learning_rate": 4.793271805647583e-05, + "loss": 0.1011, + "num_input_tokens_seen": 8337088, + "step": 39510 + }, + { + "epoch": 4.347084708470847, + "grad_norm": 1.2645576000213623, + "learning_rate": 4.7931762299678295e-05, + "loss": 0.1204, + "num_input_tokens_seen": 8338176, + "step": 39515 + }, + { + "epoch": 4.347634763476347, + "grad_norm": 0.020744118839502335, + "learning_rate": 4.793080633152971e-05, + "loss": 0.0426, + "num_input_tokens_seen": 8339264, + "step": 39520 + }, + { + "epoch": 4.348184818481848, + "grad_norm": 0.14431343972682953, + "learning_rate": 4.792985015203888e-05, + "loss": 0.041, + "num_input_tokens_seen": 8340352, + "step": 39525 + }, + { + "epoch": 4.3487348734873486, + "grad_norm": 0.6901479959487915, + "learning_rate": 4.792889376121462e-05, + "loss": 0.0598, + "num_input_tokens_seen": 8341376, + "step": 39530 + }, + { + "epoch": 4.34928492849285, + "grad_norm": 0.41125208139419556, + "learning_rate": 4.792793715906576e-05, + "loss": 0.0123, + "num_input_tokens_seen": 8342432, + "step": 39535 + }, + { + "epoch": 4.34983498349835, + "grad_norm": 0.06583521515130997, + "learning_rate": 4.79269803456011e-05, + "loss": 0.0462, + "num_input_tokens_seen": 8343488, + "step": 39540 + }, + { + "epoch": 4.35038503850385, + "grad_norm": 0.2352515608072281, + "learning_rate": 4.792602332082946e-05, + "loss": 0.0589, + "num_input_tokens_seen": 8344576, + "step": 39545 + }, + { + "epoch": 4.350935093509351, + "grad_norm": 0.7940509915351868, + "learning_rate": 4.792506608475967e-05, + "loss": 0.0222, + "num_input_tokens_seen": 8345632, + "step": 39550 + }, + { + "epoch": 4.351485148514851, + "grad_norm": 0.017708154395222664, + "learning_rate": 4.7924108637400546e-05, + "loss": 0.0601, + "num_input_tokens_seen": 8346784, + "step": 39555 + }, + { + "epoch": 4.3520352035203524, + "grad_norm": 0.02288978546857834, + "learning_rate": 4.792315097876091e-05, + "loss": 0.0449, + "num_input_tokens_seen": 8347808, + "step": 39560 + }, + { + "epoch": 4.352585258525853, + "grad_norm": 0.05219075083732605, + "learning_rate": 4.79221931088496e-05, + "loss": 0.0579, + "num_input_tokens_seen": 8348832, + "step": 39565 + }, + { + "epoch": 4.353135313531353, + "grad_norm": 1.2701810598373413, + "learning_rate": 4.7921235027675426e-05, + "loss": 0.1124, + "num_input_tokens_seen": 8349856, + "step": 39570 + }, + { + "epoch": 4.353685368536854, + "grad_norm": 0.7853071689605713, + "learning_rate": 4.792027673524723e-05, + "loss": 0.0321, + "num_input_tokens_seen": 8350816, + "step": 39575 + }, + { + "epoch": 4.354235423542354, + "grad_norm": 0.8416903614997864, + "learning_rate": 4.791931823157384e-05, + "loss": 0.0418, + "num_input_tokens_seen": 8351840, + "step": 39580 + }, + { + "epoch": 4.354785478547855, + "grad_norm": 0.05906695872545242, + "learning_rate": 4.791835951666409e-05, + "loss": 0.0786, + "num_input_tokens_seen": 8352864, + "step": 39585 + }, + { + "epoch": 4.3553355335533555, + "grad_norm": 1.5156283378601074, + "learning_rate": 4.791740059052682e-05, + "loss": 0.1301, + "num_input_tokens_seen": 8353888, + "step": 39590 + }, + { + "epoch": 4.355885588558856, + "grad_norm": 0.4131070077419281, + "learning_rate": 4.7916441453170866e-05, + "loss": 0.0817, + "num_input_tokens_seen": 8354944, + "step": 39595 + }, + { + "epoch": 4.356435643564357, + "grad_norm": 0.3006386160850525, + "learning_rate": 4.791548210460507e-05, + "loss": 0.0959, + "num_input_tokens_seen": 8356032, + "step": 39600 + }, + { + "epoch": 4.356985698569857, + "grad_norm": 0.11563123762607574, + "learning_rate": 4.791452254483827e-05, + "loss": 0.0735, + "num_input_tokens_seen": 8357088, + "step": 39605 + }, + { + "epoch": 4.357535753575357, + "grad_norm": 0.1595098227262497, + "learning_rate": 4.7913562773879303e-05, + "loss": 0.1228, + "num_input_tokens_seen": 8358176, + "step": 39610 + }, + { + "epoch": 4.358085808580858, + "grad_norm": 0.04611380398273468, + "learning_rate": 4.7912602791737035e-05, + "loss": 0.0376, + "num_input_tokens_seen": 8359264, + "step": 39615 + }, + { + "epoch": 4.3586358635863585, + "grad_norm": 1.341788649559021, + "learning_rate": 4.791164259842029e-05, + "loss": 0.0741, + "num_input_tokens_seen": 8360288, + "step": 39620 + }, + { + "epoch": 4.3591859185918596, + "grad_norm": 0.2096668928861618, + "learning_rate": 4.791068219393794e-05, + "loss": 0.0455, + "num_input_tokens_seen": 8361344, + "step": 39625 + }, + { + "epoch": 4.35973597359736, + "grad_norm": 0.35025739669799805, + "learning_rate": 4.790972157829882e-05, + "loss": 0.1071, + "num_input_tokens_seen": 8362432, + "step": 39630 + }, + { + "epoch": 4.36028602860286, + "grad_norm": 0.056146036833524704, + "learning_rate": 4.790876075151179e-05, + "loss": 0.013, + "num_input_tokens_seen": 8363488, + "step": 39635 + }, + { + "epoch": 4.360836083608361, + "grad_norm": 0.7386009097099304, + "learning_rate": 4.7907799713585705e-05, + "loss": 0.0267, + "num_input_tokens_seen": 8364576, + "step": 39640 + }, + { + "epoch": 4.361386138613861, + "grad_norm": 0.016132639721035957, + "learning_rate": 4.790683846452943e-05, + "loss": 0.027, + "num_input_tokens_seen": 8365632, + "step": 39645 + }, + { + "epoch": 4.361936193619362, + "grad_norm": 1.288142442703247, + "learning_rate": 4.79058770043518e-05, + "loss": 0.1117, + "num_input_tokens_seen": 8366688, + "step": 39650 + }, + { + "epoch": 4.362486248624863, + "grad_norm": 0.4573061466217041, + "learning_rate": 4.790491533306171e-05, + "loss": 0.0193, + "num_input_tokens_seen": 8367712, + "step": 39655 + }, + { + "epoch": 4.363036303630363, + "grad_norm": 1.0526551008224487, + "learning_rate": 4.790395345066799e-05, + "loss": 0.1611, + "num_input_tokens_seen": 8368800, + "step": 39660 + }, + { + "epoch": 4.363586358635864, + "grad_norm": 0.41930681467056274, + "learning_rate": 4.790299135717953e-05, + "loss": 0.0439, + "num_input_tokens_seen": 8369856, + "step": 39665 + }, + { + "epoch": 4.364136413641364, + "grad_norm": 0.03390621021389961, + "learning_rate": 4.7902029052605194e-05, + "loss": 0.0438, + "num_input_tokens_seen": 8370880, + "step": 39670 + }, + { + "epoch": 4.364686468646864, + "grad_norm": 0.0231496449559927, + "learning_rate": 4.790106653695384e-05, + "loss": 0.1084, + "num_input_tokens_seen": 8372000, + "step": 39675 + }, + { + "epoch": 4.365236523652365, + "grad_norm": 0.22705259919166565, + "learning_rate": 4.790010381023434e-05, + "loss": 0.0277, + "num_input_tokens_seen": 8373056, + "step": 39680 + }, + { + "epoch": 4.365786578657866, + "grad_norm": 0.03955535590648651, + "learning_rate": 4.7899140872455584e-05, + "loss": 0.0908, + "num_input_tokens_seen": 8374144, + "step": 39685 + }, + { + "epoch": 4.366336633663367, + "grad_norm": 0.39997661113739014, + "learning_rate": 4.7898177723626425e-05, + "loss": 0.2098, + "num_input_tokens_seen": 8375264, + "step": 39690 + }, + { + "epoch": 4.366886688668867, + "grad_norm": 1.3431870937347412, + "learning_rate": 4.789721436375575e-05, + "loss": 0.0446, + "num_input_tokens_seen": 8376256, + "step": 39695 + }, + { + "epoch": 4.367436743674367, + "grad_norm": 0.029186757281422615, + "learning_rate": 4.789625079285244e-05, + "loss": 0.0155, + "num_input_tokens_seen": 8377376, + "step": 39700 + }, + { + "epoch": 4.367986798679868, + "grad_norm": 0.2876865565776825, + "learning_rate": 4.789528701092537e-05, + "loss": 0.0184, + "num_input_tokens_seen": 8378432, + "step": 39705 + }, + { + "epoch": 4.368536853685368, + "grad_norm": 0.2953610122203827, + "learning_rate": 4.789432301798343e-05, + "loss": 0.0386, + "num_input_tokens_seen": 8379520, + "step": 39710 + }, + { + "epoch": 4.3690869086908695, + "grad_norm": 0.18710613250732422, + "learning_rate": 4.78933588140355e-05, + "loss": 0.0167, + "num_input_tokens_seen": 8380512, + "step": 39715 + }, + { + "epoch": 4.36963696369637, + "grad_norm": 0.00986772496253252, + "learning_rate": 4.789239439909047e-05, + "loss": 0.0684, + "num_input_tokens_seen": 8381568, + "step": 39720 + }, + { + "epoch": 4.37018701870187, + "grad_norm": 0.8896377682685852, + "learning_rate": 4.789142977315721e-05, + "loss": 0.0562, + "num_input_tokens_seen": 8382592, + "step": 39725 + }, + { + "epoch": 4.370737073707371, + "grad_norm": 0.03806941211223602, + "learning_rate": 4.789046493624464e-05, + "loss": 0.0623, + "num_input_tokens_seen": 8383584, + "step": 39730 + }, + { + "epoch": 4.371287128712871, + "grad_norm": 0.3044026792049408, + "learning_rate": 4.7889499888361636e-05, + "loss": 0.0755, + "num_input_tokens_seen": 8384576, + "step": 39735 + }, + { + "epoch": 4.371837183718371, + "grad_norm": 0.06554632633924484, + "learning_rate": 4.78885346295171e-05, + "loss": 0.073, + "num_input_tokens_seen": 8385632, + "step": 39740 + }, + { + "epoch": 4.3723872387238725, + "grad_norm": 1.4864673614501953, + "learning_rate": 4.7887569159719914e-05, + "loss": 0.1005, + "num_input_tokens_seen": 8386720, + "step": 39745 + }, + { + "epoch": 4.372937293729373, + "grad_norm": 0.4811498522758484, + "learning_rate": 4.7886603478978996e-05, + "loss": 0.1355, + "num_input_tokens_seen": 8387808, + "step": 39750 + }, + { + "epoch": 4.373487348734874, + "grad_norm": 0.028830094262957573, + "learning_rate": 4.7885637587303225e-05, + "loss": 0.0563, + "num_input_tokens_seen": 8388832, + "step": 39755 + }, + { + "epoch": 4.374037403740374, + "grad_norm": 0.04314570873975754, + "learning_rate": 4.788467148470152e-05, + "loss": 0.0308, + "num_input_tokens_seen": 8389824, + "step": 39760 + }, + { + "epoch": 4.374587458745874, + "grad_norm": 0.691919207572937, + "learning_rate": 4.788370517118278e-05, + "loss": 0.0595, + "num_input_tokens_seen": 8390848, + "step": 39765 + }, + { + "epoch": 4.375137513751375, + "grad_norm": 0.8743782639503479, + "learning_rate": 4.788273864675591e-05, + "loss": 0.0601, + "num_input_tokens_seen": 8391872, + "step": 39770 + }, + { + "epoch": 4.3756875687568755, + "grad_norm": 0.36246562004089355, + "learning_rate": 4.788177191142981e-05, + "loss": 0.0202, + "num_input_tokens_seen": 8392864, + "step": 39775 + }, + { + "epoch": 4.376237623762377, + "grad_norm": 0.39041030406951904, + "learning_rate": 4.788080496521341e-05, + "loss": 0.0209, + "num_input_tokens_seen": 8393952, + "step": 39780 + }, + { + "epoch": 4.376787678767877, + "grad_norm": 0.03603330999612808, + "learning_rate": 4.78798378081156e-05, + "loss": 0.0304, + "num_input_tokens_seen": 8395104, + "step": 39785 + }, + { + "epoch": 4.377337733773377, + "grad_norm": 0.920119047164917, + "learning_rate": 4.787887044014531e-05, + "loss": 0.0628, + "num_input_tokens_seen": 8396192, + "step": 39790 + }, + { + "epoch": 4.377887788778878, + "grad_norm": 0.09516856074333191, + "learning_rate": 4.7877902861311446e-05, + "loss": 0.0494, + "num_input_tokens_seen": 8397280, + "step": 39795 + }, + { + "epoch": 4.378437843784378, + "grad_norm": 0.09542759507894516, + "learning_rate": 4.787693507162293e-05, + "loss": 0.0269, + "num_input_tokens_seen": 8398368, + "step": 39800 + }, + { + "epoch": 4.378987898789879, + "grad_norm": 0.14874570071697235, + "learning_rate": 4.787596707108868e-05, + "loss": 0.0105, + "num_input_tokens_seen": 8399392, + "step": 39805 + }, + { + "epoch": 4.37953795379538, + "grad_norm": 1.2243366241455078, + "learning_rate": 4.7874998859717624e-05, + "loss": 0.14, + "num_input_tokens_seen": 8400512, + "step": 39810 + }, + { + "epoch": 4.38008800880088, + "grad_norm": 0.0550941526889801, + "learning_rate": 4.787403043751867e-05, + "loss": 0.0685, + "num_input_tokens_seen": 8401600, + "step": 39815 + }, + { + "epoch": 4.380638063806381, + "grad_norm": 0.3561939001083374, + "learning_rate": 4.787306180450076e-05, + "loss": 0.0998, + "num_input_tokens_seen": 8402656, + "step": 39820 + }, + { + "epoch": 4.381188118811881, + "grad_norm": 0.08026468008756638, + "learning_rate": 4.787209296067282e-05, + "loss": 0.0062, + "num_input_tokens_seen": 8403712, + "step": 39825 + }, + { + "epoch": 4.381738173817382, + "grad_norm": 0.02826569601893425, + "learning_rate": 4.787112390604377e-05, + "loss": 0.0377, + "num_input_tokens_seen": 8404736, + "step": 39830 + }, + { + "epoch": 4.382288228822882, + "grad_norm": 0.08279518783092499, + "learning_rate": 4.787015464062254e-05, + "loss": 0.1168, + "num_input_tokens_seen": 8405888, + "step": 39835 + }, + { + "epoch": 4.382838283828383, + "grad_norm": 0.08069303631782532, + "learning_rate": 4.786918516441807e-05, + "loss": 0.1071, + "num_input_tokens_seen": 8406976, + "step": 39840 + }, + { + "epoch": 4.383388338833884, + "grad_norm": 0.0718470886349678, + "learning_rate": 4.786821547743931e-05, + "loss": 0.0191, + "num_input_tokens_seen": 8408032, + "step": 39845 + }, + { + "epoch": 4.383938393839384, + "grad_norm": 0.0703977644443512, + "learning_rate": 4.786724557969516e-05, + "loss": 0.0664, + "num_input_tokens_seen": 8409056, + "step": 39850 + }, + { + "epoch": 4.384488448844884, + "grad_norm": 0.9484971165657043, + "learning_rate": 4.786627547119459e-05, + "loss": 0.1042, + "num_input_tokens_seen": 8410176, + "step": 39855 + }, + { + "epoch": 4.385038503850385, + "grad_norm": 0.287001371383667, + "learning_rate": 4.786530515194653e-05, + "loss": 0.0265, + "num_input_tokens_seen": 8411232, + "step": 39860 + }, + { + "epoch": 4.385588558855885, + "grad_norm": 0.03938288986682892, + "learning_rate": 4.7864334621959916e-05, + "loss": 0.019, + "num_input_tokens_seen": 8412256, + "step": 39865 + }, + { + "epoch": 4.3861386138613865, + "grad_norm": 0.4696027636528015, + "learning_rate": 4.786336388124372e-05, + "loss": 0.0495, + "num_input_tokens_seen": 8413376, + "step": 39870 + }, + { + "epoch": 4.386688668866887, + "grad_norm": 0.06397655606269836, + "learning_rate": 4.786239292980685e-05, + "loss": 0.0401, + "num_input_tokens_seen": 8414432, + "step": 39875 + }, + { + "epoch": 4.387238723872387, + "grad_norm": 0.4107729196548462, + "learning_rate": 4.786142176765829e-05, + "loss": 0.0364, + "num_input_tokens_seen": 8415456, + "step": 39880 + }, + { + "epoch": 4.387788778877888, + "grad_norm": 0.04664159193634987, + "learning_rate": 4.7860450394806966e-05, + "loss": 0.0171, + "num_input_tokens_seen": 8416512, + "step": 39885 + }, + { + "epoch": 4.388338833883388, + "grad_norm": 0.3420512080192566, + "learning_rate": 4.785947881126184e-05, + "loss": 0.0682, + "num_input_tokens_seen": 8417664, + "step": 39890 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.8222035765647888, + "learning_rate": 4.7858507017031876e-05, + "loss": 0.0589, + "num_input_tokens_seen": 8418720, + "step": 39895 + }, + { + "epoch": 4.3894389438943895, + "grad_norm": 0.13210268318653107, + "learning_rate": 4.785753501212601e-05, + "loss": 0.0311, + "num_input_tokens_seen": 8419744, + "step": 39900 + }, + { + "epoch": 4.38998899889989, + "grad_norm": 0.05888257920742035, + "learning_rate": 4.785656279655322e-05, + "loss": 0.0074, + "num_input_tokens_seen": 8420832, + "step": 39905 + }, + { + "epoch": 4.390539053905391, + "grad_norm": 0.05407052859663963, + "learning_rate": 4.7855590370322455e-05, + "loss": 0.0534, + "num_input_tokens_seen": 8421888, + "step": 39910 + }, + { + "epoch": 4.391089108910891, + "grad_norm": 0.3243838846683502, + "learning_rate": 4.7854617733442686e-05, + "loss": 0.054, + "num_input_tokens_seen": 8422976, + "step": 39915 + }, + { + "epoch": 4.391639163916391, + "grad_norm": 0.20864051580429077, + "learning_rate": 4.7853644885922865e-05, + "loss": 0.0291, + "num_input_tokens_seen": 8424032, + "step": 39920 + }, + { + "epoch": 4.392189218921892, + "grad_norm": 0.5989189147949219, + "learning_rate": 4.7852671827771964e-05, + "loss": 0.0499, + "num_input_tokens_seen": 8425056, + "step": 39925 + }, + { + "epoch": 4.3927392739273925, + "grad_norm": 0.0329083651304245, + "learning_rate": 4.785169855899896e-05, + "loss": 0.0103, + "num_input_tokens_seen": 8426176, + "step": 39930 + }, + { + "epoch": 4.393289328932894, + "grad_norm": 0.13010728359222412, + "learning_rate": 4.785072507961281e-05, + "loss": 0.0815, + "num_input_tokens_seen": 8427296, + "step": 39935 + }, + { + "epoch": 4.393839383938394, + "grad_norm": 0.01953052170574665, + "learning_rate": 4.78497513896225e-05, + "loss": 0.0405, + "num_input_tokens_seen": 8428352, + "step": 39940 + }, + { + "epoch": 4.394389438943894, + "grad_norm": 0.8822452425956726, + "learning_rate": 4.784877748903699e-05, + "loss": 0.0324, + "num_input_tokens_seen": 8429440, + "step": 39945 + }, + { + "epoch": 4.394939493949395, + "grad_norm": 0.039183877408504486, + "learning_rate": 4.784780337786526e-05, + "loss": 0.0935, + "num_input_tokens_seen": 8430496, + "step": 39950 + }, + { + "epoch": 4.395489548954895, + "grad_norm": 0.921490490436554, + "learning_rate": 4.7846829056116295e-05, + "loss": 0.0411, + "num_input_tokens_seen": 8431520, + "step": 39955 + }, + { + "epoch": 4.396039603960396, + "grad_norm": 0.06592679768800735, + "learning_rate": 4.7845854523799065e-05, + "loss": 0.1238, + "num_input_tokens_seen": 8432608, + "step": 39960 + }, + { + "epoch": 4.396589658965897, + "grad_norm": 0.07807617634534836, + "learning_rate": 4.784487978092256e-05, + "loss": 0.0655, + "num_input_tokens_seen": 8433696, + "step": 39965 + }, + { + "epoch": 4.397139713971397, + "grad_norm": 0.14697077870368958, + "learning_rate": 4.7843904827495755e-05, + "loss": 0.1367, + "num_input_tokens_seen": 8434752, + "step": 39970 + }, + { + "epoch": 4.397689768976898, + "grad_norm": 0.6127001047134399, + "learning_rate": 4.7842929663527645e-05, + "loss": 0.038, + "num_input_tokens_seen": 8435872, + "step": 39975 + }, + { + "epoch": 4.398239823982398, + "grad_norm": 0.0521438866853714, + "learning_rate": 4.784195428902721e-05, + "loss": 0.1114, + "num_input_tokens_seen": 8436928, + "step": 39980 + }, + { + "epoch": 4.398789878987899, + "grad_norm": 0.13645866513252258, + "learning_rate": 4.784097870400345e-05, + "loss": 0.0239, + "num_input_tokens_seen": 8438048, + "step": 39985 + }, + { + "epoch": 4.399339933993399, + "grad_norm": 0.10208872705698013, + "learning_rate": 4.784000290846534e-05, + "loss": 0.0649, + "num_input_tokens_seen": 8439104, + "step": 39990 + }, + { + "epoch": 4.3998899889989, + "grad_norm": 0.32216590642929077, + "learning_rate": 4.7839026902421894e-05, + "loss": 0.0419, + "num_input_tokens_seen": 8440192, + "step": 39995 + }, + { + "epoch": 4.400440044004401, + "grad_norm": 0.03246568515896797, + "learning_rate": 4.7838050685882085e-05, + "loss": 0.0556, + "num_input_tokens_seen": 8441280, + "step": 40000 + }, + { + "epoch": 4.400990099009901, + "grad_norm": 0.23642875254154205, + "learning_rate": 4.783707425885493e-05, + "loss": 0.1145, + "num_input_tokens_seen": 8442304, + "step": 40005 + }, + { + "epoch": 4.401540154015402, + "grad_norm": 0.08362230658531189, + "learning_rate": 4.783609762134941e-05, + "loss": 0.0399, + "num_input_tokens_seen": 8443328, + "step": 40010 + }, + { + "epoch": 4.402090209020902, + "grad_norm": 0.45850634574890137, + "learning_rate": 4.783512077337454e-05, + "loss": 0.034, + "num_input_tokens_seen": 8444320, + "step": 40015 + }, + { + "epoch": 4.402640264026402, + "grad_norm": 0.013687764294445515, + "learning_rate": 4.7834143714939326e-05, + "loss": 0.0896, + "num_input_tokens_seen": 8445344, + "step": 40020 + }, + { + "epoch": 4.4031903190319035, + "grad_norm": 0.185768261551857, + "learning_rate": 4.7833166446052756e-05, + "loss": 0.0281, + "num_input_tokens_seen": 8446368, + "step": 40025 + }, + { + "epoch": 4.403740374037404, + "grad_norm": 0.42316746711730957, + "learning_rate": 4.783218896672386e-05, + "loss": 0.1027, + "num_input_tokens_seen": 8447456, + "step": 40030 + }, + { + "epoch": 4.404290429042904, + "grad_norm": 0.19442641735076904, + "learning_rate": 4.783121127696162e-05, + "loss": 0.0596, + "num_input_tokens_seen": 8448480, + "step": 40035 + }, + { + "epoch": 4.404840484048405, + "grad_norm": 0.13716796040534973, + "learning_rate": 4.7830233376775066e-05, + "loss": 0.0135, + "num_input_tokens_seen": 8449568, + "step": 40040 + }, + { + "epoch": 4.405390539053905, + "grad_norm": 0.09777562320232391, + "learning_rate": 4.782925526617321e-05, + "loss": 0.0515, + "num_input_tokens_seen": 8450528, + "step": 40045 + }, + { + "epoch": 4.405940594059406, + "grad_norm": 0.33902233839035034, + "learning_rate": 4.782827694516506e-05, + "loss": 0.0579, + "num_input_tokens_seen": 8451552, + "step": 40050 + }, + { + "epoch": 4.4064906490649065, + "grad_norm": 0.31558874249458313, + "learning_rate": 4.782729841375963e-05, + "loss": 0.0382, + "num_input_tokens_seen": 8452608, + "step": 40055 + }, + { + "epoch": 4.407040704070407, + "grad_norm": 1.300995945930481, + "learning_rate": 4.782631967196596e-05, + "loss": 0.0993, + "num_input_tokens_seen": 8453632, + "step": 40060 + }, + { + "epoch": 4.407590759075908, + "grad_norm": 0.04743287339806557, + "learning_rate": 4.782534071979304e-05, + "loss": 0.0546, + "num_input_tokens_seen": 8454688, + "step": 40065 + }, + { + "epoch": 4.408140814081408, + "grad_norm": 0.5027390718460083, + "learning_rate": 4.782436155724992e-05, + "loss": 0.0995, + "num_input_tokens_seen": 8455648, + "step": 40070 + }, + { + "epoch": 4.408690869086909, + "grad_norm": 0.38896557688713074, + "learning_rate": 4.78233821843456e-05, + "loss": 0.0742, + "num_input_tokens_seen": 8456672, + "step": 40075 + }, + { + "epoch": 4.409240924092409, + "grad_norm": 1.185820460319519, + "learning_rate": 4.782240260108912e-05, + "loss": 0.1552, + "num_input_tokens_seen": 8457728, + "step": 40080 + }, + { + "epoch": 4.4097909790979095, + "grad_norm": 0.032634131610393524, + "learning_rate": 4.7821422807489515e-05, + "loss": 0.0203, + "num_input_tokens_seen": 8458784, + "step": 40085 + }, + { + "epoch": 4.410341034103411, + "grad_norm": 0.050523217767477036, + "learning_rate": 4.78204428035558e-05, + "loss": 0.0977, + "num_input_tokens_seen": 8459840, + "step": 40090 + }, + { + "epoch": 4.410891089108911, + "grad_norm": 0.18009546399116516, + "learning_rate": 4.781946258929702e-05, + "loss": 0.0424, + "num_input_tokens_seen": 8460896, + "step": 40095 + }, + { + "epoch": 4.411441144114411, + "grad_norm": 0.19335252046585083, + "learning_rate": 4.7818482164722186e-05, + "loss": 0.0752, + "num_input_tokens_seen": 8461888, + "step": 40100 + }, + { + "epoch": 4.411991199119912, + "grad_norm": 0.14525917172431946, + "learning_rate": 4.781750152984037e-05, + "loss": 0.0597, + "num_input_tokens_seen": 8462944, + "step": 40105 + }, + { + "epoch": 4.412541254125412, + "grad_norm": 0.06342102587223053, + "learning_rate": 4.781652068466058e-05, + "loss": 0.1008, + "num_input_tokens_seen": 8464032, + "step": 40110 + }, + { + "epoch": 4.413091309130913, + "grad_norm": 0.2386583834886551, + "learning_rate": 4.781553962919187e-05, + "loss": 0.0993, + "num_input_tokens_seen": 8465088, + "step": 40115 + }, + { + "epoch": 4.413641364136414, + "grad_norm": 1.20539128780365, + "learning_rate": 4.7814558363443284e-05, + "loss": 0.0755, + "num_input_tokens_seen": 8466144, + "step": 40120 + }, + { + "epoch": 4.414191419141914, + "grad_norm": 0.08126102387905121, + "learning_rate": 4.781357688742386e-05, + "loss": 0.0572, + "num_input_tokens_seen": 8467200, + "step": 40125 + }, + { + "epoch": 4.414741474147415, + "grad_norm": 0.1984722763299942, + "learning_rate": 4.781259520114264e-05, + "loss": 0.0269, + "num_input_tokens_seen": 8468256, + "step": 40130 + }, + { + "epoch": 4.415291529152915, + "grad_norm": 0.012782303616404533, + "learning_rate": 4.7811613304608684e-05, + "loss": 0.0357, + "num_input_tokens_seen": 8469248, + "step": 40135 + }, + { + "epoch": 4.415841584158416, + "grad_norm": 1.251617670059204, + "learning_rate": 4.781063119783103e-05, + "loss": 0.047, + "num_input_tokens_seen": 8470304, + "step": 40140 + }, + { + "epoch": 4.416391639163916, + "grad_norm": 0.4816155433654785, + "learning_rate": 4.780964888081872e-05, + "loss": 0.0356, + "num_input_tokens_seen": 8471424, + "step": 40145 + }, + { + "epoch": 4.416941694169417, + "grad_norm": 0.02593032829463482, + "learning_rate": 4.780866635358084e-05, + "loss": 0.0179, + "num_input_tokens_seen": 8472512, + "step": 40150 + }, + { + "epoch": 4.417491749174918, + "grad_norm": 0.23045794665813446, + "learning_rate": 4.7807683616126426e-05, + "loss": 0.0248, + "num_input_tokens_seen": 8473600, + "step": 40155 + }, + { + "epoch": 4.418041804180418, + "grad_norm": 0.0619855597615242, + "learning_rate": 4.7806700668464535e-05, + "loss": 0.0187, + "num_input_tokens_seen": 8474656, + "step": 40160 + }, + { + "epoch": 4.418591859185918, + "grad_norm": 0.9887810945510864, + "learning_rate": 4.780571751060422e-05, + "loss": 0.0533, + "num_input_tokens_seen": 8475776, + "step": 40165 + }, + { + "epoch": 4.419141914191419, + "grad_norm": 0.9699445962905884, + "learning_rate": 4.7804734142554554e-05, + "loss": 0.0289, + "num_input_tokens_seen": 8476832, + "step": 40170 + }, + { + "epoch": 4.419691969196919, + "grad_norm": 1.1409789323806763, + "learning_rate": 4.78037505643246e-05, + "loss": 0.0673, + "num_input_tokens_seen": 8477920, + "step": 40175 + }, + { + "epoch": 4.4202420242024205, + "grad_norm": 0.33961543440818787, + "learning_rate": 4.7802766775923416e-05, + "loss": 0.0196, + "num_input_tokens_seen": 8479072, + "step": 40180 + }, + { + "epoch": 4.420792079207921, + "grad_norm": 0.043507590889930725, + "learning_rate": 4.7801782777360074e-05, + "loss": 0.0762, + "num_input_tokens_seen": 8480160, + "step": 40185 + }, + { + "epoch": 4.421342134213421, + "grad_norm": 0.030506199225783348, + "learning_rate": 4.7800798568643635e-05, + "loss": 0.0358, + "num_input_tokens_seen": 8481152, + "step": 40190 + }, + { + "epoch": 4.421892189218922, + "grad_norm": 0.011786098591983318, + "learning_rate": 4.779981414978318e-05, + "loss": 0.0274, + "num_input_tokens_seen": 8482112, + "step": 40195 + }, + { + "epoch": 4.422442244224422, + "grad_norm": 0.026652447879314423, + "learning_rate": 4.779882952078778e-05, + "loss": 0.0281, + "num_input_tokens_seen": 8483200, + "step": 40200 + }, + { + "epoch": 4.422992299229923, + "grad_norm": 0.5055622458457947, + "learning_rate": 4.779784468166651e-05, + "loss": 0.104, + "num_input_tokens_seen": 8484288, + "step": 40205 + }, + { + "epoch": 4.4235423542354235, + "grad_norm": 0.15346525609493256, + "learning_rate": 4.779685963242844e-05, + "loss": 0.1158, + "num_input_tokens_seen": 8485344, + "step": 40210 + }, + { + "epoch": 4.424092409240924, + "grad_norm": 0.06722386926412582, + "learning_rate": 4.779587437308266e-05, + "loss": 0.0392, + "num_input_tokens_seen": 8486368, + "step": 40215 + }, + { + "epoch": 4.424642464246425, + "grad_norm": 0.06060397997498512, + "learning_rate": 4.779488890363824e-05, + "loss": 0.1385, + "num_input_tokens_seen": 8487424, + "step": 40220 + }, + { + "epoch": 4.425192519251925, + "grad_norm": 0.016552532091736794, + "learning_rate": 4.779390322410427e-05, + "loss": 0.0164, + "num_input_tokens_seen": 8488448, + "step": 40225 + }, + { + "epoch": 4.425742574257426, + "grad_norm": 0.5659486055374146, + "learning_rate": 4.779291733448983e-05, + "loss": 0.1146, + "num_input_tokens_seen": 8489440, + "step": 40230 + }, + { + "epoch": 4.426292629262926, + "grad_norm": 0.13375224173069, + "learning_rate": 4.779193123480401e-05, + "loss": 0.0649, + "num_input_tokens_seen": 8490496, + "step": 40235 + }, + { + "epoch": 4.4268426842684265, + "grad_norm": 0.1584172248840332, + "learning_rate": 4.7790944925055894e-05, + "loss": 0.0363, + "num_input_tokens_seen": 8491552, + "step": 40240 + }, + { + "epoch": 4.427392739273928, + "grad_norm": 0.22567114233970642, + "learning_rate": 4.778995840525458e-05, + "loss": 0.0618, + "num_input_tokens_seen": 8492640, + "step": 40245 + }, + { + "epoch": 4.427942794279428, + "grad_norm": 0.47328078746795654, + "learning_rate": 4.7788971675409144e-05, + "loss": 0.0211, + "num_input_tokens_seen": 8493664, + "step": 40250 + }, + { + "epoch": 4.428492849284929, + "grad_norm": 0.5524643659591675, + "learning_rate": 4.77879847355287e-05, + "loss": 0.1119, + "num_input_tokens_seen": 8494688, + "step": 40255 + }, + { + "epoch": 4.429042904290429, + "grad_norm": 0.1132979616522789, + "learning_rate": 4.778699758562233e-05, + "loss": 0.1371, + "num_input_tokens_seen": 8495680, + "step": 40260 + }, + { + "epoch": 4.429592959295929, + "grad_norm": 0.043935421854257584, + "learning_rate": 4.778601022569914e-05, + "loss": 0.0471, + "num_input_tokens_seen": 8496736, + "step": 40265 + }, + { + "epoch": 4.43014301430143, + "grad_norm": 1.4383091926574707, + "learning_rate": 4.778502265576823e-05, + "loss": 0.1001, + "num_input_tokens_seen": 8497824, + "step": 40270 + }, + { + "epoch": 4.430693069306931, + "grad_norm": 0.0215915534645319, + "learning_rate": 4.7784034875838696e-05, + "loss": 0.0146, + "num_input_tokens_seen": 8498880, + "step": 40275 + }, + { + "epoch": 4.431243124312431, + "grad_norm": 0.9529373049736023, + "learning_rate": 4.7783046885919646e-05, + "loss": 0.0654, + "num_input_tokens_seen": 8499936, + "step": 40280 + }, + { + "epoch": 4.431793179317932, + "grad_norm": 0.5428739786148071, + "learning_rate": 4.7782058686020186e-05, + "loss": 0.0531, + "num_input_tokens_seen": 8501024, + "step": 40285 + }, + { + "epoch": 4.432343234323432, + "grad_norm": 0.019762828946113586, + "learning_rate": 4.778107027614942e-05, + "loss": 0.0153, + "num_input_tokens_seen": 8502080, + "step": 40290 + }, + { + "epoch": 4.432893289328933, + "grad_norm": 0.21095137298107147, + "learning_rate": 4.778008165631647e-05, + "loss": 0.0705, + "num_input_tokens_seen": 8503168, + "step": 40295 + }, + { + "epoch": 4.433443344334433, + "grad_norm": 0.313215047121048, + "learning_rate": 4.777909282653042e-05, + "loss": 0.0152, + "num_input_tokens_seen": 8504192, + "step": 40300 + }, + { + "epoch": 4.433993399339934, + "grad_norm": 0.29917266964912415, + "learning_rate": 4.777810378680042e-05, + "loss": 0.0888, + "num_input_tokens_seen": 8505312, + "step": 40305 + }, + { + "epoch": 4.434543454345435, + "grad_norm": 0.05047742277383804, + "learning_rate": 4.777711453713557e-05, + "loss": 0.0551, + "num_input_tokens_seen": 8506304, + "step": 40310 + }, + { + "epoch": 4.435093509350935, + "grad_norm": 0.010674641467630863, + "learning_rate": 4.777612507754497e-05, + "loss": 0.0094, + "num_input_tokens_seen": 8507360, + "step": 40315 + }, + { + "epoch": 4.435643564356436, + "grad_norm": 0.2964891493320465, + "learning_rate": 4.7775135408037765e-05, + "loss": 0.0349, + "num_input_tokens_seen": 8508448, + "step": 40320 + }, + { + "epoch": 4.436193619361936, + "grad_norm": 1.5587235689163208, + "learning_rate": 4.777414552862306e-05, + "loss": 0.0763, + "num_input_tokens_seen": 8509504, + "step": 40325 + }, + { + "epoch": 4.436743674367436, + "grad_norm": 0.8762340545654297, + "learning_rate": 4.777315543930998e-05, + "loss": 0.1333, + "num_input_tokens_seen": 8510528, + "step": 40330 + }, + { + "epoch": 4.4372937293729375, + "grad_norm": 0.4119114279747009, + "learning_rate": 4.777216514010766e-05, + "loss": 0.0213, + "num_input_tokens_seen": 8511552, + "step": 40335 + }, + { + "epoch": 4.437843784378438, + "grad_norm": 0.02386959083378315, + "learning_rate": 4.777117463102522e-05, + "loss": 0.016, + "num_input_tokens_seen": 8512640, + "step": 40340 + }, + { + "epoch": 4.438393839383938, + "grad_norm": 0.19030170142650604, + "learning_rate": 4.777018391207179e-05, + "loss": 0.0124, + "num_input_tokens_seen": 8513696, + "step": 40345 + }, + { + "epoch": 4.438943894389439, + "grad_norm": 0.8312574625015259, + "learning_rate": 4.77691929832565e-05, + "loss": 0.0349, + "num_input_tokens_seen": 8514816, + "step": 40350 + }, + { + "epoch": 4.439493949394939, + "grad_norm": 0.029456479474902153, + "learning_rate": 4.776820184458848e-05, + "loss": 0.0295, + "num_input_tokens_seen": 8515840, + "step": 40355 + }, + { + "epoch": 4.44004400440044, + "grad_norm": 0.035680048167705536, + "learning_rate": 4.776721049607687e-05, + "loss": 0.0359, + "num_input_tokens_seen": 8516864, + "step": 40360 + }, + { + "epoch": 4.4405940594059405, + "grad_norm": 0.0113362492993474, + "learning_rate": 4.7766218937730816e-05, + "loss": 0.1688, + "num_input_tokens_seen": 8517920, + "step": 40365 + }, + { + "epoch": 4.441144114411441, + "grad_norm": 0.33601751923561096, + "learning_rate": 4.7765227169559444e-05, + "loss": 0.0556, + "num_input_tokens_seen": 8519008, + "step": 40370 + }, + { + "epoch": 4.441694169416942, + "grad_norm": 0.2884197235107422, + "learning_rate": 4.776423519157189e-05, + "loss": 0.0241, + "num_input_tokens_seen": 8520032, + "step": 40375 + }, + { + "epoch": 4.442244224422442, + "grad_norm": 0.02111709676682949, + "learning_rate": 4.77632430037773e-05, + "loss": 0.0279, + "num_input_tokens_seen": 8521056, + "step": 40380 + }, + { + "epoch": 4.442794279427943, + "grad_norm": 0.07835425436496735, + "learning_rate": 4.776225060618483e-05, + "loss": 0.0411, + "num_input_tokens_seen": 8522080, + "step": 40385 + }, + { + "epoch": 4.443344334433443, + "grad_norm": 0.07922571152448654, + "learning_rate": 4.776125799880362e-05, + "loss": 0.0114, + "num_input_tokens_seen": 8523200, + "step": 40390 + }, + { + "epoch": 4.4438943894389435, + "grad_norm": 2.380695343017578, + "learning_rate": 4.7760265181642816e-05, + "loss": 0.0759, + "num_input_tokens_seen": 8524224, + "step": 40395 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.13829833269119263, + "learning_rate": 4.775927215471156e-05, + "loss": 0.015, + "num_input_tokens_seen": 8525312, + "step": 40400 + }, + { + "epoch": 4.444994499449945, + "grad_norm": 0.06200254708528519, + "learning_rate": 4.775827891801903e-05, + "loss": 0.0761, + "num_input_tokens_seen": 8526368, + "step": 40405 + }, + { + "epoch": 4.445544554455446, + "grad_norm": 0.6443361639976501, + "learning_rate": 4.7757285471574355e-05, + "loss": 0.0314, + "num_input_tokens_seen": 8527488, + "step": 40410 + }, + { + "epoch": 4.446094609460946, + "grad_norm": 0.50555819272995, + "learning_rate": 4.77562918153867e-05, + "loss": 0.0766, + "num_input_tokens_seen": 8528512, + "step": 40415 + }, + { + "epoch": 4.446644664466446, + "grad_norm": 0.038863178342580795, + "learning_rate": 4.7755297949465216e-05, + "loss": 0.0374, + "num_input_tokens_seen": 8529536, + "step": 40420 + }, + { + "epoch": 4.447194719471947, + "grad_norm": 0.05315880849957466, + "learning_rate": 4.7754303873819076e-05, + "loss": 0.0512, + "num_input_tokens_seen": 8530592, + "step": 40425 + }, + { + "epoch": 4.447744774477448, + "grad_norm": 0.23175495862960815, + "learning_rate": 4.775330958845744e-05, + "loss": 0.0327, + "num_input_tokens_seen": 8531648, + "step": 40430 + }, + { + "epoch": 4.448294829482949, + "grad_norm": 0.8466717600822449, + "learning_rate": 4.775231509338946e-05, + "loss": 0.1297, + "num_input_tokens_seen": 8532672, + "step": 40435 + }, + { + "epoch": 4.448844884488449, + "grad_norm": 0.1507582813501358, + "learning_rate": 4.775132038862431e-05, + "loss": 0.0753, + "num_input_tokens_seen": 8533728, + "step": 40440 + }, + { + "epoch": 4.449394939493949, + "grad_norm": 0.022597739472985268, + "learning_rate": 4.775032547417116e-05, + "loss": 0.062, + "num_input_tokens_seen": 8534752, + "step": 40445 + }, + { + "epoch": 4.44994499449945, + "grad_norm": 0.1405450999736786, + "learning_rate": 4.774933035003918e-05, + "loss": 0.076, + "num_input_tokens_seen": 8535776, + "step": 40450 + }, + { + "epoch": 4.4504950495049505, + "grad_norm": 0.06477964669466019, + "learning_rate": 4.7748335016237534e-05, + "loss": 0.0258, + "num_input_tokens_seen": 8536832, + "step": 40455 + }, + { + "epoch": 4.451045104510451, + "grad_norm": 0.7313069105148315, + "learning_rate": 4.77473394727754e-05, + "loss": 0.1189, + "num_input_tokens_seen": 8537888, + "step": 40460 + }, + { + "epoch": 4.451595159515952, + "grad_norm": 0.07515710592269897, + "learning_rate": 4.774634371966196e-05, + "loss": 0.0179, + "num_input_tokens_seen": 8538976, + "step": 40465 + }, + { + "epoch": 4.452145214521452, + "grad_norm": 0.25380173325538635, + "learning_rate": 4.774534775690637e-05, + "loss": 0.0494, + "num_input_tokens_seen": 8540128, + "step": 40470 + }, + { + "epoch": 4.452695269526953, + "grad_norm": 0.028148582205176353, + "learning_rate": 4.7744351584517836e-05, + "loss": 0.0596, + "num_input_tokens_seen": 8541216, + "step": 40475 + }, + { + "epoch": 4.453245324532453, + "grad_norm": 0.545625627040863, + "learning_rate": 4.7743355202505525e-05, + "loss": 0.1079, + "num_input_tokens_seen": 8542304, + "step": 40480 + }, + { + "epoch": 4.4537953795379535, + "grad_norm": 0.4650105834007263, + "learning_rate": 4.774235861087862e-05, + "loss": 0.0743, + "num_input_tokens_seen": 8543360, + "step": 40485 + }, + { + "epoch": 4.4543454345434546, + "grad_norm": 0.034973230212926865, + "learning_rate": 4.774136180964631e-05, + "loss": 0.1276, + "num_input_tokens_seen": 8544480, + "step": 40490 + }, + { + "epoch": 4.454895489548955, + "grad_norm": 0.05580627918243408, + "learning_rate": 4.774036479881778e-05, + "loss": 0.0117, + "num_input_tokens_seen": 8545536, + "step": 40495 + }, + { + "epoch": 4.455445544554456, + "grad_norm": 0.08426976948976517, + "learning_rate": 4.7739367578402224e-05, + "loss": 0.042, + "num_input_tokens_seen": 8546688, + "step": 40500 + }, + { + "epoch": 4.455995599559956, + "grad_norm": 0.025212906301021576, + "learning_rate": 4.773837014840882e-05, + "loss": 0.046, + "num_input_tokens_seen": 8547744, + "step": 40505 + }, + { + "epoch": 4.456545654565456, + "grad_norm": 0.052223317325115204, + "learning_rate": 4.773737250884678e-05, + "loss": 0.0683, + "num_input_tokens_seen": 8548736, + "step": 40510 + }, + { + "epoch": 4.457095709570957, + "grad_norm": 0.04358874633908272, + "learning_rate": 4.773637465972528e-05, + "loss": 0.0127, + "num_input_tokens_seen": 8549792, + "step": 40515 + }, + { + "epoch": 4.457645764576458, + "grad_norm": 0.10497526824474335, + "learning_rate": 4.773537660105353e-05, + "loss": 0.0265, + "num_input_tokens_seen": 8550784, + "step": 40520 + }, + { + "epoch": 4.458195819581958, + "grad_norm": 0.024083172902464867, + "learning_rate": 4.773437833284071e-05, + "loss": 0.0686, + "num_input_tokens_seen": 8551872, + "step": 40525 + }, + { + "epoch": 4.458745874587459, + "grad_norm": 0.04522760584950447, + "learning_rate": 4.773337985509605e-05, + "loss": 0.0701, + "num_input_tokens_seen": 8552864, + "step": 40530 + }, + { + "epoch": 4.459295929592959, + "grad_norm": 0.1286356896162033, + "learning_rate": 4.7732381167828735e-05, + "loss": 0.0334, + "num_input_tokens_seen": 8553984, + "step": 40535 + }, + { + "epoch": 4.45984598459846, + "grad_norm": 0.6794852614402771, + "learning_rate": 4.773138227104796e-05, + "loss": 0.0277, + "num_input_tokens_seen": 8555040, + "step": 40540 + }, + { + "epoch": 4.46039603960396, + "grad_norm": 0.08452979475259781, + "learning_rate": 4.773038316476295e-05, + "loss": 0.0335, + "num_input_tokens_seen": 8556096, + "step": 40545 + }, + { + "epoch": 4.460946094609461, + "grad_norm": 0.06770496815443039, + "learning_rate": 4.77293838489829e-05, + "loss": 0.1117, + "num_input_tokens_seen": 8557152, + "step": 40550 + }, + { + "epoch": 4.461496149614962, + "grad_norm": 0.024250390008091927, + "learning_rate": 4.772838432371703e-05, + "loss": 0.024, + "num_input_tokens_seen": 8558240, + "step": 40555 + }, + { + "epoch": 4.462046204620462, + "grad_norm": 0.041862353682518005, + "learning_rate": 4.7727384588974544e-05, + "loss": 0.0673, + "num_input_tokens_seen": 8559296, + "step": 40560 + }, + { + "epoch": 4.462596259625963, + "grad_norm": 1.5593607425689697, + "learning_rate": 4.7726384644764665e-05, + "loss": 0.0217, + "num_input_tokens_seen": 8560384, + "step": 40565 + }, + { + "epoch": 4.463146314631463, + "grad_norm": 0.251696914434433, + "learning_rate": 4.77253844910966e-05, + "loss": 0.1419, + "num_input_tokens_seen": 8561472, + "step": 40570 + }, + { + "epoch": 4.463696369636963, + "grad_norm": 1.6190515756607056, + "learning_rate": 4.772438412797958e-05, + "loss": 0.0439, + "num_input_tokens_seen": 8562528, + "step": 40575 + }, + { + "epoch": 4.4642464246424645, + "grad_norm": 1.542039155960083, + "learning_rate": 4.7723383555422805e-05, + "loss": 0.035, + "num_input_tokens_seen": 8563552, + "step": 40580 + }, + { + "epoch": 4.464796479647965, + "grad_norm": 0.5751222968101501, + "learning_rate": 4.7722382773435515e-05, + "loss": 0.0589, + "num_input_tokens_seen": 8564608, + "step": 40585 + }, + { + "epoch": 4.465346534653466, + "grad_norm": 1.0343669652938843, + "learning_rate": 4.772138178202692e-05, + "loss": 0.0575, + "num_input_tokens_seen": 8565664, + "step": 40590 + }, + { + "epoch": 4.465896589658966, + "grad_norm": 0.4404450058937073, + "learning_rate": 4.7720380581206256e-05, + "loss": 0.0406, + "num_input_tokens_seen": 8566688, + "step": 40595 + }, + { + "epoch": 4.466446644664466, + "grad_norm": 0.2870015501976013, + "learning_rate": 4.771937917098274e-05, + "loss": 0.0436, + "num_input_tokens_seen": 8567744, + "step": 40600 + }, + { + "epoch": 4.466996699669967, + "grad_norm": 0.015539889223873615, + "learning_rate": 4.771837755136561e-05, + "loss": 0.055, + "num_input_tokens_seen": 8568704, + "step": 40605 + }, + { + "epoch": 4.4675467546754675, + "grad_norm": 0.05157230794429779, + "learning_rate": 4.77173757223641e-05, + "loss": 0.0461, + "num_input_tokens_seen": 8569696, + "step": 40610 + }, + { + "epoch": 4.468096809680969, + "grad_norm": 0.6243958473205566, + "learning_rate": 4.7716373683987435e-05, + "loss": 0.159, + "num_input_tokens_seen": 8570784, + "step": 40615 + }, + { + "epoch": 4.468646864686469, + "grad_norm": 0.3900948464870453, + "learning_rate": 4.771537143624486e-05, + "loss": 0.0337, + "num_input_tokens_seen": 8571808, + "step": 40620 + }, + { + "epoch": 4.469196919691969, + "grad_norm": 0.2276971936225891, + "learning_rate": 4.771436897914561e-05, + "loss": 0.0856, + "num_input_tokens_seen": 8572864, + "step": 40625 + }, + { + "epoch": 4.46974697469747, + "grad_norm": 0.31021562218666077, + "learning_rate": 4.7713366312698906e-05, + "loss": 0.0765, + "num_input_tokens_seen": 8573952, + "step": 40630 + }, + { + "epoch": 4.47029702970297, + "grad_norm": 0.3760276436805725, + "learning_rate": 4.771236343691401e-05, + "loss": 0.0267, + "num_input_tokens_seen": 8574976, + "step": 40635 + }, + { + "epoch": 4.4708470847084705, + "grad_norm": 0.1697348803281784, + "learning_rate": 4.771136035180016e-05, + "loss": 0.0083, + "num_input_tokens_seen": 8576000, + "step": 40640 + }, + { + "epoch": 4.471397139713972, + "grad_norm": 0.18299074470996857, + "learning_rate": 4.77103570573666e-05, + "loss": 0.1466, + "num_input_tokens_seen": 8577056, + "step": 40645 + }, + { + "epoch": 4.471947194719472, + "grad_norm": 0.052161090075969696, + "learning_rate": 4.770935355362257e-05, + "loss": 0.0152, + "num_input_tokens_seen": 8578112, + "step": 40650 + }, + { + "epoch": 4.472497249724973, + "grad_norm": 0.27792036533355713, + "learning_rate": 4.7708349840577336e-05, + "loss": 0.1367, + "num_input_tokens_seen": 8579136, + "step": 40655 + }, + { + "epoch": 4.473047304730473, + "grad_norm": 0.3047437369823456, + "learning_rate": 4.7707345918240134e-05, + "loss": 0.1309, + "num_input_tokens_seen": 8580224, + "step": 40660 + }, + { + "epoch": 4.473597359735973, + "grad_norm": 0.20802141726016998, + "learning_rate": 4.770634178662022e-05, + "loss": 0.1418, + "num_input_tokens_seen": 8581280, + "step": 40665 + }, + { + "epoch": 4.474147414741474, + "grad_norm": 0.12221676856279373, + "learning_rate": 4.770533744572685e-05, + "loss": 0.0668, + "num_input_tokens_seen": 8582432, + "step": 40670 + }, + { + "epoch": 4.474697469746975, + "grad_norm": 1.5563756227493286, + "learning_rate": 4.7704332895569274e-05, + "loss": 0.1523, + "num_input_tokens_seen": 8583488, + "step": 40675 + }, + { + "epoch": 4.475247524752476, + "grad_norm": 0.6890806555747986, + "learning_rate": 4.770332813615677e-05, + "loss": 0.0849, + "num_input_tokens_seen": 8584608, + "step": 40680 + }, + { + "epoch": 4.475797579757976, + "grad_norm": 0.08625824749469757, + "learning_rate": 4.7702323167498565e-05, + "loss": 0.0587, + "num_input_tokens_seen": 8585728, + "step": 40685 + }, + { + "epoch": 4.476347634763476, + "grad_norm": 0.039817918092012405, + "learning_rate": 4.770131798960396e-05, + "loss": 0.0055, + "num_input_tokens_seen": 8586720, + "step": 40690 + }, + { + "epoch": 4.476897689768977, + "grad_norm": 0.49440810084342957, + "learning_rate": 4.770031260248219e-05, + "loss": 0.1386, + "num_input_tokens_seen": 8587808, + "step": 40695 + }, + { + "epoch": 4.477447744774477, + "grad_norm": 0.034226350486278534, + "learning_rate": 4.769930700614253e-05, + "loss": 0.0851, + "num_input_tokens_seen": 8588832, + "step": 40700 + }, + { + "epoch": 4.477997799779978, + "grad_norm": 0.05085963383316994, + "learning_rate": 4.7698301200594254e-05, + "loss": 0.0266, + "num_input_tokens_seen": 8589856, + "step": 40705 + }, + { + "epoch": 4.478547854785479, + "grad_norm": 0.31773221492767334, + "learning_rate": 4.7697295185846626e-05, + "loss": 0.0153, + "num_input_tokens_seen": 8591008, + "step": 40710 + }, + { + "epoch": 4.479097909790979, + "grad_norm": 0.4874587059020996, + "learning_rate": 4.769628896190892e-05, + "loss": 0.0535, + "num_input_tokens_seen": 8592032, + "step": 40715 + }, + { + "epoch": 4.47964796479648, + "grad_norm": 0.1872262954711914, + "learning_rate": 4.7695282528790405e-05, + "loss": 0.0558, + "num_input_tokens_seen": 8593056, + "step": 40720 + }, + { + "epoch": 4.48019801980198, + "grad_norm": 0.42396920919418335, + "learning_rate": 4.7694275886500366e-05, + "loss": 0.1263, + "num_input_tokens_seen": 8594176, + "step": 40725 + }, + { + "epoch": 4.48074807480748, + "grad_norm": 0.09717299789190292, + "learning_rate": 4.769326903504808e-05, + "loss": 0.0903, + "num_input_tokens_seen": 8595168, + "step": 40730 + }, + { + "epoch": 4.4812981298129815, + "grad_norm": 0.0812072828412056, + "learning_rate": 4.769226197444282e-05, + "loss": 0.0198, + "num_input_tokens_seen": 8596256, + "step": 40735 + }, + { + "epoch": 4.481848184818482, + "grad_norm": 0.08869358897209167, + "learning_rate": 4.769125470469387e-05, + "loss": 0.0195, + "num_input_tokens_seen": 8597280, + "step": 40740 + }, + { + "epoch": 4.482398239823983, + "grad_norm": 0.5486516356468201, + "learning_rate": 4.76902472258105e-05, + "loss": 0.1071, + "num_input_tokens_seen": 8598336, + "step": 40745 + }, + { + "epoch": 4.482948294829483, + "grad_norm": 0.7355551719665527, + "learning_rate": 4.7689239537802025e-05, + "loss": 0.0488, + "num_input_tokens_seen": 8599392, + "step": 40750 + }, + { + "epoch": 4.483498349834983, + "grad_norm": 0.013071884401142597, + "learning_rate": 4.768823164067772e-05, + "loss": 0.037, + "num_input_tokens_seen": 8600416, + "step": 40755 + }, + { + "epoch": 4.484048404840484, + "grad_norm": 0.28894945979118347, + "learning_rate": 4.768722353444686e-05, + "loss": 0.0234, + "num_input_tokens_seen": 8601504, + "step": 40760 + }, + { + "epoch": 4.4845984598459845, + "grad_norm": 0.01217544637620449, + "learning_rate": 4.768621521911875e-05, + "loss": 0.0056, + "num_input_tokens_seen": 8602592, + "step": 40765 + }, + { + "epoch": 4.485148514851485, + "grad_norm": 0.19128558039665222, + "learning_rate": 4.7685206694702676e-05, + "loss": 0.0192, + "num_input_tokens_seen": 8603616, + "step": 40770 + }, + { + "epoch": 4.485698569856986, + "grad_norm": 0.5119654536247253, + "learning_rate": 4.7684197961207945e-05, + "loss": 0.1024, + "num_input_tokens_seen": 8604736, + "step": 40775 + }, + { + "epoch": 4.486248624862486, + "grad_norm": 0.016781089827418327, + "learning_rate": 4.768318901864384e-05, + "loss": 0.0281, + "num_input_tokens_seen": 8605824, + "step": 40780 + }, + { + "epoch": 4.486798679867987, + "grad_norm": 0.8012826442718506, + "learning_rate": 4.768217986701967e-05, + "loss": 0.0591, + "num_input_tokens_seen": 8606976, + "step": 40785 + }, + { + "epoch": 4.487348734873487, + "grad_norm": 0.19545108079910278, + "learning_rate": 4.768117050634473e-05, + "loss": 0.0513, + "num_input_tokens_seen": 8608000, + "step": 40790 + }, + { + "epoch": 4.4878987898789875, + "grad_norm": 0.05018224939703941, + "learning_rate": 4.7680160936628334e-05, + "loss": 0.0754, + "num_input_tokens_seen": 8609024, + "step": 40795 + }, + { + "epoch": 4.488448844884489, + "grad_norm": 0.46302568912506104, + "learning_rate": 4.767915115787977e-05, + "loss": 0.0226, + "num_input_tokens_seen": 8610080, + "step": 40800 + }, + { + "epoch": 4.488998899889989, + "grad_norm": 0.06698256731033325, + "learning_rate": 4.7678141170108345e-05, + "loss": 0.0105, + "num_input_tokens_seen": 8611072, + "step": 40805 + }, + { + "epoch": 4.48954895489549, + "grad_norm": 0.05108880624175072, + "learning_rate": 4.7677130973323385e-05, + "loss": 0.0078, + "num_input_tokens_seen": 8612128, + "step": 40810 + }, + { + "epoch": 4.49009900990099, + "grad_norm": 0.3326229751110077, + "learning_rate": 4.767612056753419e-05, + "loss": 0.0925, + "num_input_tokens_seen": 8613184, + "step": 40815 + }, + { + "epoch": 4.49064906490649, + "grad_norm": 0.3338058590888977, + "learning_rate": 4.767510995275007e-05, + "loss": 0.0394, + "num_input_tokens_seen": 8614208, + "step": 40820 + }, + { + "epoch": 4.491199119911991, + "grad_norm": 0.0388682596385479, + "learning_rate": 4.7674099128980345e-05, + "loss": 0.046, + "num_input_tokens_seen": 8615232, + "step": 40825 + }, + { + "epoch": 4.491749174917492, + "grad_norm": 0.166523277759552, + "learning_rate": 4.767308809623432e-05, + "loss": 0.1449, + "num_input_tokens_seen": 8616288, + "step": 40830 + }, + { + "epoch": 4.492299229922993, + "grad_norm": 0.057342205196619034, + "learning_rate": 4.767207685452133e-05, + "loss": 0.031, + "num_input_tokens_seen": 8617312, + "step": 40835 + }, + { + "epoch": 4.492849284928493, + "grad_norm": 0.019949516281485558, + "learning_rate": 4.767106540385068e-05, + "loss": 0.0092, + "num_input_tokens_seen": 8618400, + "step": 40840 + }, + { + "epoch": 4.493399339933993, + "grad_norm": 0.44709354639053345, + "learning_rate": 4.767005374423171e-05, + "loss": 0.051, + "num_input_tokens_seen": 8619456, + "step": 40845 + }, + { + "epoch": 4.493949394939494, + "grad_norm": 1.8464795351028442, + "learning_rate": 4.7669041875673726e-05, + "loss": 0.229, + "num_input_tokens_seen": 8620544, + "step": 40850 + }, + { + "epoch": 4.494499449944994, + "grad_norm": 0.04172521457076073, + "learning_rate": 4.7668029798186066e-05, + "loss": 0.0327, + "num_input_tokens_seen": 8621632, + "step": 40855 + }, + { + "epoch": 4.4950495049504955, + "grad_norm": 0.3544113039970398, + "learning_rate": 4.766701751177805e-05, + "loss": 0.0812, + "num_input_tokens_seen": 8622784, + "step": 40860 + }, + { + "epoch": 4.495599559955996, + "grad_norm": 0.764539897441864, + "learning_rate": 4.7666005016459e-05, + "loss": 0.0219, + "num_input_tokens_seen": 8623808, + "step": 40865 + }, + { + "epoch": 4.496149614961496, + "grad_norm": 1.2673132419586182, + "learning_rate": 4.766499231223827e-05, + "loss": 0.0696, + "num_input_tokens_seen": 8624800, + "step": 40870 + }, + { + "epoch": 4.496699669966997, + "grad_norm": 0.009112576954066753, + "learning_rate": 4.766397939912517e-05, + "loss": 0.0381, + "num_input_tokens_seen": 8625920, + "step": 40875 + }, + { + "epoch": 4.497249724972497, + "grad_norm": 0.423418253660202, + "learning_rate": 4.7662966277129056e-05, + "loss": 0.0592, + "num_input_tokens_seen": 8626976, + "step": 40880 + }, + { + "epoch": 4.497799779977997, + "grad_norm": 0.07100025564432144, + "learning_rate": 4.766195294625926e-05, + "loss": 0.0625, + "num_input_tokens_seen": 8628000, + "step": 40885 + }, + { + "epoch": 4.4983498349834985, + "grad_norm": 0.4575658142566681, + "learning_rate": 4.7660939406525104e-05, + "loss": 0.0843, + "num_input_tokens_seen": 8629088, + "step": 40890 + }, + { + "epoch": 4.498899889988999, + "grad_norm": 0.01937090791761875, + "learning_rate": 4.7659925657935955e-05, + "loss": 0.0192, + "num_input_tokens_seen": 8630080, + "step": 40895 + }, + { + "epoch": 4.4994499449945, + "grad_norm": 0.8587784767150879, + "learning_rate": 4.765891170050114e-05, + "loss": 0.1363, + "num_input_tokens_seen": 8631136, + "step": 40900 + }, + { + "epoch": 4.5, + "grad_norm": 0.1713091880083084, + "learning_rate": 4.765789753423001e-05, + "loss": 0.0443, + "num_input_tokens_seen": 8632224, + "step": 40905 + }, + { + "epoch": 4.5005500550055, + "grad_norm": 0.3040390908718109, + "learning_rate": 4.765688315913192e-05, + "loss": 0.0437, + "num_input_tokens_seen": 8633344, + "step": 40910 + }, + { + "epoch": 4.501100110011001, + "grad_norm": 0.2106294482946396, + "learning_rate": 4.7655868575216196e-05, + "loss": 0.0753, + "num_input_tokens_seen": 8634368, + "step": 40915 + }, + { + "epoch": 4.5016501650165015, + "grad_norm": 0.24473224580287933, + "learning_rate": 4.76548537824922e-05, + "loss": 0.0457, + "num_input_tokens_seen": 8635392, + "step": 40920 + }, + { + "epoch": 4.502200220022003, + "grad_norm": 0.1569412648677826, + "learning_rate": 4.76538387809693e-05, + "loss": 0.1559, + "num_input_tokens_seen": 8636448, + "step": 40925 + }, + { + "epoch": 4.502750275027503, + "grad_norm": 0.7340296506881714, + "learning_rate": 4.765282357065683e-05, + "loss": 0.1078, + "num_input_tokens_seen": 8637536, + "step": 40930 + }, + { + "epoch": 4.503300330033003, + "grad_norm": 0.01319101545959711, + "learning_rate": 4.765180815156416e-05, + "loss": 0.0095, + "num_input_tokens_seen": 8638624, + "step": 40935 + }, + { + "epoch": 4.503850385038504, + "grad_norm": 1.4895029067993164, + "learning_rate": 4.765079252370064e-05, + "loss": 0.0204, + "num_input_tokens_seen": 8639648, + "step": 40940 + }, + { + "epoch": 4.504400440044004, + "grad_norm": 0.21045184135437012, + "learning_rate": 4.764977668707564e-05, + "loss": 0.061, + "num_input_tokens_seen": 8640672, + "step": 40945 + }, + { + "epoch": 4.5049504950495045, + "grad_norm": 0.9343061447143555, + "learning_rate": 4.7648760641698514e-05, + "loss": 0.074, + "num_input_tokens_seen": 8641792, + "step": 40950 + }, + { + "epoch": 4.505500550055006, + "grad_norm": 0.41589364409446716, + "learning_rate": 4.764774438757863e-05, + "loss": 0.0469, + "num_input_tokens_seen": 8642880, + "step": 40955 + }, + { + "epoch": 4.506050605060506, + "grad_norm": 1.1063185930252075, + "learning_rate": 4.764672792472535e-05, + "loss": 0.0544, + "num_input_tokens_seen": 8644000, + "step": 40960 + }, + { + "epoch": 4.506600660066007, + "grad_norm": 0.01795605942606926, + "learning_rate": 4.764571125314805e-05, + "loss": 0.0432, + "num_input_tokens_seen": 8645024, + "step": 40965 + }, + { + "epoch": 4.507150715071507, + "grad_norm": 1.1145262718200684, + "learning_rate": 4.764469437285609e-05, + "loss": 0.0584, + "num_input_tokens_seen": 8646112, + "step": 40970 + }, + { + "epoch": 4.507700770077007, + "grad_norm": 0.20803304016590118, + "learning_rate": 4.764367728385885e-05, + "loss": 0.0151, + "num_input_tokens_seen": 8647200, + "step": 40975 + }, + { + "epoch": 4.508250825082508, + "grad_norm": 0.3035951852798462, + "learning_rate": 4.76426599861657e-05, + "loss": 0.0963, + "num_input_tokens_seen": 8648256, + "step": 40980 + }, + { + "epoch": 4.508800880088009, + "grad_norm": 0.7788867950439453, + "learning_rate": 4.764164247978603e-05, + "loss": 0.0198, + "num_input_tokens_seen": 8649280, + "step": 40985 + }, + { + "epoch": 4.50935093509351, + "grad_norm": 0.15220871567726135, + "learning_rate": 4.7640624764729193e-05, + "loss": 0.0282, + "num_input_tokens_seen": 8650304, + "step": 40990 + }, + { + "epoch": 4.50990099009901, + "grad_norm": 0.02683710679411888, + "learning_rate": 4.763960684100458e-05, + "loss": 0.0267, + "num_input_tokens_seen": 8651360, + "step": 40995 + }, + { + "epoch": 4.51045104510451, + "grad_norm": 0.42785534262657166, + "learning_rate": 4.763858870862158e-05, + "loss": 0.182, + "num_input_tokens_seen": 8652416, + "step": 41000 + }, + { + "epoch": 4.511001100110011, + "grad_norm": 0.015190578997135162, + "learning_rate": 4.763757036758957e-05, + "loss": 0.062, + "num_input_tokens_seen": 8653440, + "step": 41005 + }, + { + "epoch": 4.511551155115511, + "grad_norm": 0.16056367754936218, + "learning_rate": 4.763655181791794e-05, + "loss": 0.0269, + "num_input_tokens_seen": 8654464, + "step": 41010 + }, + { + "epoch": 4.512101210121012, + "grad_norm": 0.09770902991294861, + "learning_rate": 4.763553305961607e-05, + "loss": 0.0166, + "num_input_tokens_seen": 8655552, + "step": 41015 + }, + { + "epoch": 4.512651265126513, + "grad_norm": 0.7993180751800537, + "learning_rate": 4.7634514092693354e-05, + "loss": 0.0619, + "num_input_tokens_seen": 8656608, + "step": 41020 + }, + { + "epoch": 4.513201320132013, + "grad_norm": 0.47041794657707214, + "learning_rate": 4.7633494917159185e-05, + "loss": 0.0558, + "num_input_tokens_seen": 8657600, + "step": 41025 + }, + { + "epoch": 4.513751375137514, + "grad_norm": 0.05615273118019104, + "learning_rate": 4.7632475533022946e-05, + "loss": 0.0407, + "num_input_tokens_seen": 8658624, + "step": 41030 + }, + { + "epoch": 4.514301430143014, + "grad_norm": 0.25428953766822815, + "learning_rate": 4.763145594029405e-05, + "loss": 0.1044, + "num_input_tokens_seen": 8659648, + "step": 41035 + }, + { + "epoch": 4.514851485148515, + "grad_norm": 0.2340923696756363, + "learning_rate": 4.763043613898188e-05, + "loss": 0.0653, + "num_input_tokens_seen": 8660608, + "step": 41040 + }, + { + "epoch": 4.5154015401540155, + "grad_norm": 0.253001868724823, + "learning_rate": 4.7629416129095836e-05, + "loss": 0.0119, + "num_input_tokens_seen": 8661632, + "step": 41045 + }, + { + "epoch": 4.515951595159516, + "grad_norm": 0.07336805760860443, + "learning_rate": 4.762839591064533e-05, + "loss": 0.0633, + "num_input_tokens_seen": 8662720, + "step": 41050 + }, + { + "epoch": 4.516501650165017, + "grad_norm": 0.05252543464303017, + "learning_rate": 4.7627375483639756e-05, + "loss": 0.0678, + "num_input_tokens_seen": 8663712, + "step": 41055 + }, + { + "epoch": 4.517051705170517, + "grad_norm": 0.2863490581512451, + "learning_rate": 4.762635484808851e-05, + "loss": 0.0645, + "num_input_tokens_seen": 8664768, + "step": 41060 + }, + { + "epoch": 4.517601760176017, + "grad_norm": 0.08553499728441238, + "learning_rate": 4.762533400400102e-05, + "loss": 0.0585, + "num_input_tokens_seen": 8665760, + "step": 41065 + }, + { + "epoch": 4.518151815181518, + "grad_norm": 0.32749059796333313, + "learning_rate": 4.762431295138667e-05, + "loss": 0.0647, + "num_input_tokens_seen": 8666816, + "step": 41070 + }, + { + "epoch": 4.5187018701870185, + "grad_norm": 0.15251129865646362, + "learning_rate": 4.76232916902549e-05, + "loss": 0.1003, + "num_input_tokens_seen": 8667904, + "step": 41075 + }, + { + "epoch": 4.51925192519252, + "grad_norm": 1.2079344987869263, + "learning_rate": 4.7622270220615096e-05, + "loss": 0.1051, + "num_input_tokens_seen": 8668960, + "step": 41080 + }, + { + "epoch": 4.51980198019802, + "grad_norm": 0.04857667535543442, + "learning_rate": 4.762124854247668e-05, + "loss": 0.031, + "num_input_tokens_seen": 8670016, + "step": 41085 + }, + { + "epoch": 4.52035203520352, + "grad_norm": 0.07070810347795486, + "learning_rate": 4.762022665584908e-05, + "loss": 0.0161, + "num_input_tokens_seen": 8671072, + "step": 41090 + }, + { + "epoch": 4.520902090209021, + "grad_norm": 0.10447092354297638, + "learning_rate": 4.7619204560741704e-05, + "loss": 0.004, + "num_input_tokens_seen": 8672096, + "step": 41095 + }, + { + "epoch": 4.521452145214521, + "grad_norm": 0.09496493637561798, + "learning_rate": 4.761818225716397e-05, + "loss": 0.0479, + "num_input_tokens_seen": 8673152, + "step": 41100 + }, + { + "epoch": 4.522002200220022, + "grad_norm": 0.06808975338935852, + "learning_rate": 4.76171597451253e-05, + "loss": 0.0329, + "num_input_tokens_seen": 8674240, + "step": 41105 + }, + { + "epoch": 4.522552255225523, + "grad_norm": 0.059298768639564514, + "learning_rate": 4.761613702463512e-05, + "loss": 0.0226, + "num_input_tokens_seen": 8675264, + "step": 41110 + }, + { + "epoch": 4.523102310231023, + "grad_norm": 0.040735550224781036, + "learning_rate": 4.761511409570287e-05, + "loss": 0.0466, + "num_input_tokens_seen": 8676288, + "step": 41115 + }, + { + "epoch": 4.523652365236524, + "grad_norm": 0.023429283872246742, + "learning_rate": 4.761409095833796e-05, + "loss": 0.0146, + "num_input_tokens_seen": 8677408, + "step": 41120 + }, + { + "epoch": 4.524202420242024, + "grad_norm": 0.21621781587600708, + "learning_rate": 4.7613067612549816e-05, + "loss": 0.0683, + "num_input_tokens_seen": 8678496, + "step": 41125 + }, + { + "epoch": 4.524752475247524, + "grad_norm": 1.6793193817138672, + "learning_rate": 4.761204405834788e-05, + "loss": 0.1559, + "num_input_tokens_seen": 8679552, + "step": 41130 + }, + { + "epoch": 4.525302530253025, + "grad_norm": 0.06317427009344101, + "learning_rate": 4.761102029574159e-05, + "loss": 0.0069, + "num_input_tokens_seen": 8680608, + "step": 41135 + }, + { + "epoch": 4.525852585258526, + "grad_norm": 0.14784112572669983, + "learning_rate": 4.760999632474038e-05, + "loss": 0.0689, + "num_input_tokens_seen": 8681696, + "step": 41140 + }, + { + "epoch": 4.526402640264027, + "grad_norm": 0.07537820190191269, + "learning_rate": 4.760897214535368e-05, + "loss": 0.0775, + "num_input_tokens_seen": 8682720, + "step": 41145 + }, + { + "epoch": 4.526952695269527, + "grad_norm": 2.56974458694458, + "learning_rate": 4.7607947757590934e-05, + "loss": 0.0686, + "num_input_tokens_seen": 8683808, + "step": 41150 + }, + { + "epoch": 4.527502750275027, + "grad_norm": 0.8329265117645264, + "learning_rate": 4.760692316146157e-05, + "loss": 0.0578, + "num_input_tokens_seen": 8684864, + "step": 41155 + }, + { + "epoch": 4.528052805280528, + "grad_norm": 0.023996472358703613, + "learning_rate": 4.7605898356975055e-05, + "loss": 0.1052, + "num_input_tokens_seen": 8685888, + "step": 41160 + }, + { + "epoch": 4.528602860286028, + "grad_norm": 0.0675431564450264, + "learning_rate": 4.760487334414082e-05, + "loss": 0.0215, + "num_input_tokens_seen": 8686944, + "step": 41165 + }, + { + "epoch": 4.5291529152915295, + "grad_norm": 0.7211551070213318, + "learning_rate": 4.760384812296831e-05, + "loss": 0.0647, + "num_input_tokens_seen": 8687936, + "step": 41170 + }, + { + "epoch": 4.52970297029703, + "grad_norm": 0.7306487560272217, + "learning_rate": 4.760282269346698e-05, + "loss": 0.0328, + "num_input_tokens_seen": 8689024, + "step": 41175 + }, + { + "epoch": 4.53025302530253, + "grad_norm": 0.25451308488845825, + "learning_rate": 4.760179705564628e-05, + "loss": 0.0753, + "num_input_tokens_seen": 8690048, + "step": 41180 + }, + { + "epoch": 4.530803080308031, + "grad_norm": 0.585181474685669, + "learning_rate": 4.760077120951567e-05, + "loss": 0.0484, + "num_input_tokens_seen": 8691104, + "step": 41185 + }, + { + "epoch": 4.531353135313531, + "grad_norm": 1.5660403966903687, + "learning_rate": 4.7599745155084584e-05, + "loss": 0.1192, + "num_input_tokens_seen": 8692224, + "step": 41190 + }, + { + "epoch": 4.531903190319031, + "grad_norm": 1.0342247486114502, + "learning_rate": 4.7598718892362496e-05, + "loss": 0.0676, + "num_input_tokens_seen": 8693184, + "step": 41195 + }, + { + "epoch": 4.5324532453245325, + "grad_norm": 0.14465171098709106, + "learning_rate": 4.759769242135886e-05, + "loss": 0.0261, + "num_input_tokens_seen": 8694240, + "step": 41200 + }, + { + "epoch": 4.533003300330033, + "grad_norm": 0.39613544940948486, + "learning_rate": 4.7596665742083135e-05, + "loss": 0.0249, + "num_input_tokens_seen": 8695360, + "step": 41205 + }, + { + "epoch": 4.533553355335534, + "grad_norm": 0.018286649137735367, + "learning_rate": 4.759563885454479e-05, + "loss": 0.0179, + "num_input_tokens_seen": 8696352, + "step": 41210 + }, + { + "epoch": 4.534103410341034, + "grad_norm": 0.09051280468702316, + "learning_rate": 4.759461175875328e-05, + "loss": 0.0577, + "num_input_tokens_seen": 8697376, + "step": 41215 + }, + { + "epoch": 4.534653465346535, + "grad_norm": 0.8464234471321106, + "learning_rate": 4.759358445471808e-05, + "loss": 0.1191, + "num_input_tokens_seen": 8698464, + "step": 41220 + }, + { + "epoch": 4.535203520352035, + "grad_norm": 0.028275305405259132, + "learning_rate": 4.759255694244865e-05, + "loss": 0.0367, + "num_input_tokens_seen": 8699520, + "step": 41225 + }, + { + "epoch": 4.5357535753575355, + "grad_norm": 0.007893755100667477, + "learning_rate": 4.759152922195445e-05, + "loss": 0.0102, + "num_input_tokens_seen": 8700512, + "step": 41230 + }, + { + "epoch": 4.536303630363037, + "grad_norm": 0.3129385709762573, + "learning_rate": 4.759050129324499e-05, + "loss": 0.044, + "num_input_tokens_seen": 8701536, + "step": 41235 + }, + { + "epoch": 4.536853685368537, + "grad_norm": 0.1947568953037262, + "learning_rate": 4.7589473156329704e-05, + "loss": 0.017, + "num_input_tokens_seen": 8702688, + "step": 41240 + }, + { + "epoch": 4.537403740374037, + "grad_norm": 0.08496860414743423, + "learning_rate": 4.758844481121809e-05, + "loss": 0.0421, + "num_input_tokens_seen": 8703712, + "step": 41245 + }, + { + "epoch": 4.537953795379538, + "grad_norm": 0.017979225143790245, + "learning_rate": 4.7587416257919616e-05, + "loss": 0.0175, + "num_input_tokens_seen": 8704704, + "step": 41250 + }, + { + "epoch": 4.538503850385038, + "grad_norm": 0.02796810492873192, + "learning_rate": 4.758638749644376e-05, + "loss": 0.0096, + "num_input_tokens_seen": 8705760, + "step": 41255 + }, + { + "epoch": 4.539053905390539, + "grad_norm": 0.044559333473443985, + "learning_rate": 4.7585358526800024e-05, + "loss": 0.0776, + "num_input_tokens_seen": 8706784, + "step": 41260 + }, + { + "epoch": 4.53960396039604, + "grad_norm": 0.02015763707458973, + "learning_rate": 4.7584329348997866e-05, + "loss": 0.0815, + "num_input_tokens_seen": 8707840, + "step": 41265 + }, + { + "epoch": 4.54015401540154, + "grad_norm": 0.07254696637392044, + "learning_rate": 4.758329996304678e-05, + "loss": 0.0189, + "num_input_tokens_seen": 8708928, + "step": 41270 + }, + { + "epoch": 4.540704070407041, + "grad_norm": 0.041678301990032196, + "learning_rate": 4.7582270368956254e-05, + "loss": 0.0269, + "num_input_tokens_seen": 8709952, + "step": 41275 + }, + { + "epoch": 4.541254125412541, + "grad_norm": 0.39507952332496643, + "learning_rate": 4.758124056673578e-05, + "loss": 0.0383, + "num_input_tokens_seen": 8711008, + "step": 41280 + }, + { + "epoch": 4.541804180418042, + "grad_norm": 0.21262900531291962, + "learning_rate": 4.758021055639485e-05, + "loss": 0.0623, + "num_input_tokens_seen": 8712064, + "step": 41285 + }, + { + "epoch": 4.542354235423542, + "grad_norm": 0.3483174443244934, + "learning_rate": 4.757918033794295e-05, + "loss": 0.0545, + "num_input_tokens_seen": 8713152, + "step": 41290 + }, + { + "epoch": 4.542904290429043, + "grad_norm": 0.13976964354515076, + "learning_rate": 4.757814991138958e-05, + "loss": 0.0223, + "num_input_tokens_seen": 8714272, + "step": 41295 + }, + { + "epoch": 4.543454345434544, + "grad_norm": 0.12386617809534073, + "learning_rate": 4.7577119276744245e-05, + "loss": 0.0439, + "num_input_tokens_seen": 8715296, + "step": 41300 + }, + { + "epoch": 4.544004400440044, + "grad_norm": 0.19513924419879913, + "learning_rate": 4.7576088434016417e-05, + "loss": 0.0186, + "num_input_tokens_seen": 8716320, + "step": 41305 + }, + { + "epoch": 4.544554455445544, + "grad_norm": 1.5017343759536743, + "learning_rate": 4.757505738321563e-05, + "loss": 0.0749, + "num_input_tokens_seen": 8717344, + "step": 41310 + }, + { + "epoch": 4.545104510451045, + "grad_norm": 1.6151957511901855, + "learning_rate": 4.757402612435137e-05, + "loss": 0.1121, + "num_input_tokens_seen": 8718368, + "step": 41315 + }, + { + "epoch": 4.5456545654565454, + "grad_norm": 0.04144572094082832, + "learning_rate": 4.757299465743314e-05, + "loss": 0.132, + "num_input_tokens_seen": 8719456, + "step": 41320 + }, + { + "epoch": 4.5462046204620465, + "grad_norm": 0.0357479564845562, + "learning_rate": 4.757196298247045e-05, + "loss": 0.0717, + "num_input_tokens_seen": 8720480, + "step": 41325 + }, + { + "epoch": 4.546754675467547, + "grad_norm": 0.051405299454927444, + "learning_rate": 4.757093109947281e-05, + "loss": 0.0084, + "num_input_tokens_seen": 8721600, + "step": 41330 + }, + { + "epoch": 4.547304730473047, + "grad_norm": 0.16820955276489258, + "learning_rate": 4.756989900844973e-05, + "loss": 0.1359, + "num_input_tokens_seen": 8722656, + "step": 41335 + }, + { + "epoch": 4.547854785478548, + "grad_norm": 0.39538177847862244, + "learning_rate": 4.756886670941071e-05, + "loss": 0.0416, + "num_input_tokens_seen": 8723744, + "step": 41340 + }, + { + "epoch": 4.548404840484048, + "grad_norm": 0.4352000951766968, + "learning_rate": 4.7567834202365294e-05, + "loss": 0.0886, + "num_input_tokens_seen": 8724800, + "step": 41345 + }, + { + "epoch": 4.548954895489549, + "grad_norm": 0.8315248489379883, + "learning_rate": 4.756680148732297e-05, + "loss": 0.0801, + "num_input_tokens_seen": 8725856, + "step": 41350 + }, + { + "epoch": 4.5495049504950495, + "grad_norm": 0.9446569085121155, + "learning_rate": 4.756576856429326e-05, + "loss": 0.0327, + "num_input_tokens_seen": 8726848, + "step": 41355 + }, + { + "epoch": 4.55005500550055, + "grad_norm": 1.0420047044754028, + "learning_rate": 4.756473543328569e-05, + "loss": 0.0801, + "num_input_tokens_seen": 8727872, + "step": 41360 + }, + { + "epoch": 4.550605060506051, + "grad_norm": 0.05396890267729759, + "learning_rate": 4.756370209430978e-05, + "loss": 0.0565, + "num_input_tokens_seen": 8728992, + "step": 41365 + }, + { + "epoch": 4.551155115511551, + "grad_norm": 0.33917999267578125, + "learning_rate": 4.756266854737507e-05, + "loss": 0.0496, + "num_input_tokens_seen": 8730016, + "step": 41370 + }, + { + "epoch": 4.551705170517051, + "grad_norm": 0.12951301038265228, + "learning_rate": 4.756163479249105e-05, + "loss": 0.0169, + "num_input_tokens_seen": 8731072, + "step": 41375 + }, + { + "epoch": 4.552255225522552, + "grad_norm": 0.9533755779266357, + "learning_rate": 4.756060082966728e-05, + "loss": 0.0511, + "num_input_tokens_seen": 8732128, + "step": 41380 + }, + { + "epoch": 4.552805280528053, + "grad_norm": 1.063173532485962, + "learning_rate": 4.755956665891328e-05, + "loss": 0.0711, + "num_input_tokens_seen": 8733216, + "step": 41385 + }, + { + "epoch": 4.553355335533554, + "grad_norm": 0.010720227845013142, + "learning_rate": 4.755853228023858e-05, + "loss": 0.0439, + "num_input_tokens_seen": 8734272, + "step": 41390 + }, + { + "epoch": 4.553905390539054, + "grad_norm": 0.285780131816864, + "learning_rate": 4.7557497693652694e-05, + "loss": 0.0318, + "num_input_tokens_seen": 8735360, + "step": 41395 + }, + { + "epoch": 4.554455445544555, + "grad_norm": 0.1217174157500267, + "learning_rate": 4.7556462899165196e-05, + "loss": 0.0176, + "num_input_tokens_seen": 8736480, + "step": 41400 + }, + { + "epoch": 4.555005500550055, + "grad_norm": 0.02660713903605938, + "learning_rate": 4.75554278967856e-05, + "loss": 0.0107, + "num_input_tokens_seen": 8737536, + "step": 41405 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 1.4927047491073608, + "learning_rate": 4.755439268652344e-05, + "loss": 0.2192, + "num_input_tokens_seen": 8738560, + "step": 41410 + }, + { + "epoch": 4.5561056105610565, + "grad_norm": 0.34180107712745667, + "learning_rate": 4.755335726838828e-05, + "loss": 0.0379, + "num_input_tokens_seen": 8739584, + "step": 41415 + }, + { + "epoch": 4.556655665566557, + "grad_norm": 0.11284586042165756, + "learning_rate": 4.755232164238963e-05, + "loss": 0.0324, + "num_input_tokens_seen": 8740672, + "step": 41420 + }, + { + "epoch": 4.557205720572057, + "grad_norm": 0.2311616688966751, + "learning_rate": 4.755128580853706e-05, + "loss": 0.077, + "num_input_tokens_seen": 8741696, + "step": 41425 + }, + { + "epoch": 4.557755775577558, + "grad_norm": 0.008662848733365536, + "learning_rate": 4.7550249766840115e-05, + "loss": 0.0217, + "num_input_tokens_seen": 8742752, + "step": 41430 + }, + { + "epoch": 4.558305830583058, + "grad_norm": 0.679909348487854, + "learning_rate": 4.7549213517308335e-05, + "loss": 0.0697, + "num_input_tokens_seen": 8743872, + "step": 41435 + }, + { + "epoch": 4.558855885588558, + "grad_norm": 0.10006321966648102, + "learning_rate": 4.754817705995127e-05, + "loss": 0.037, + "num_input_tokens_seen": 8744960, + "step": 41440 + }, + { + "epoch": 4.5594059405940595, + "grad_norm": 0.008274094201624393, + "learning_rate": 4.754714039477848e-05, + "loss": 0.0579, + "num_input_tokens_seen": 8746048, + "step": 41445 + }, + { + "epoch": 4.55995599559956, + "grad_norm": 0.07831736654043198, + "learning_rate": 4.7546103521799526e-05, + "loss": 0.0255, + "num_input_tokens_seen": 8747136, + "step": 41450 + }, + { + "epoch": 4.560506050605061, + "grad_norm": 0.6537491083145142, + "learning_rate": 4.754506644102395e-05, + "loss": 0.0223, + "num_input_tokens_seen": 8748256, + "step": 41455 + }, + { + "epoch": 4.561056105610561, + "grad_norm": 0.03735924884676933, + "learning_rate": 4.75440291524613e-05, + "loss": 0.0756, + "num_input_tokens_seen": 8749312, + "step": 41460 + }, + { + "epoch": 4.561606160616062, + "grad_norm": 0.5476725101470947, + "learning_rate": 4.754299165612116e-05, + "loss": 0.04, + "num_input_tokens_seen": 8750400, + "step": 41465 + }, + { + "epoch": 4.562156215621562, + "grad_norm": 0.13588833808898926, + "learning_rate": 4.7541953952013084e-05, + "loss": 0.0621, + "num_input_tokens_seen": 8751392, + "step": 41470 + }, + { + "epoch": 4.5627062706270625, + "grad_norm": 0.08111204206943512, + "learning_rate": 4.754091604014664e-05, + "loss": 0.0544, + "num_input_tokens_seen": 8752480, + "step": 41475 + }, + { + "epoch": 4.563256325632564, + "grad_norm": 0.0164751298725605, + "learning_rate": 4.7539877920531384e-05, + "loss": 0.0264, + "num_input_tokens_seen": 8753504, + "step": 41480 + }, + { + "epoch": 4.563806380638064, + "grad_norm": 0.5806475877761841, + "learning_rate": 4.753883959317689e-05, + "loss": 0.054, + "num_input_tokens_seen": 8754528, + "step": 41485 + }, + { + "epoch": 4.564356435643564, + "grad_norm": 0.06772258877754211, + "learning_rate": 4.7537801058092725e-05, + "loss": 0.0167, + "num_input_tokens_seen": 8755552, + "step": 41490 + }, + { + "epoch": 4.564906490649065, + "grad_norm": 0.5279439687728882, + "learning_rate": 4.753676231528846e-05, + "loss": 0.0809, + "num_input_tokens_seen": 8756608, + "step": 41495 + }, + { + "epoch": 4.565456545654565, + "grad_norm": 0.2574574649333954, + "learning_rate": 4.753572336477368e-05, + "loss": 0.0271, + "num_input_tokens_seen": 8757664, + "step": 41500 + }, + { + "epoch": 4.566006600660066, + "grad_norm": 0.04728948697447777, + "learning_rate": 4.753468420655794e-05, + "loss": 0.0241, + "num_input_tokens_seen": 8758688, + "step": 41505 + }, + { + "epoch": 4.566556655665567, + "grad_norm": 0.020916979759931564, + "learning_rate": 4.7533644840650834e-05, + "loss": 0.0598, + "num_input_tokens_seen": 8759712, + "step": 41510 + }, + { + "epoch": 4.567106710671067, + "grad_norm": 0.12878672778606415, + "learning_rate": 4.753260526706194e-05, + "loss": 0.0759, + "num_input_tokens_seen": 8760768, + "step": 41515 + }, + { + "epoch": 4.567656765676568, + "grad_norm": 0.26096677780151367, + "learning_rate": 4.7531565485800834e-05, + "loss": 0.0679, + "num_input_tokens_seen": 8761792, + "step": 41520 + }, + { + "epoch": 4.568206820682068, + "grad_norm": 1.0280964374542236, + "learning_rate": 4.75305254968771e-05, + "loss": 0.0352, + "num_input_tokens_seen": 8762848, + "step": 41525 + }, + { + "epoch": 4.568756875687569, + "grad_norm": 0.9783772230148315, + "learning_rate": 4.7529485300300324e-05, + "loss": 0.1349, + "num_input_tokens_seen": 8763872, + "step": 41530 + }, + { + "epoch": 4.569306930693069, + "grad_norm": 0.011493350379168987, + "learning_rate": 4.752844489608009e-05, + "loss": 0.0985, + "num_input_tokens_seen": 8764928, + "step": 41535 + }, + { + "epoch": 4.56985698569857, + "grad_norm": 0.3332138657569885, + "learning_rate": 4.7527404284225984e-05, + "loss": 0.0168, + "num_input_tokens_seen": 8766016, + "step": 41540 + }, + { + "epoch": 4.570407040704071, + "grad_norm": 1.6204535961151123, + "learning_rate": 4.752636346474762e-05, + "loss": 0.0813, + "num_input_tokens_seen": 8767104, + "step": 41545 + }, + { + "epoch": 4.570957095709571, + "grad_norm": 0.7565198540687561, + "learning_rate": 4.7525322437654555e-05, + "loss": 0.1093, + "num_input_tokens_seen": 8768192, + "step": 41550 + }, + { + "epoch": 4.571507150715071, + "grad_norm": 0.09702378511428833, + "learning_rate": 4.752428120295641e-05, + "loss": 0.0072, + "num_input_tokens_seen": 8769184, + "step": 41555 + }, + { + "epoch": 4.572057205720572, + "grad_norm": 0.019055774435400963, + "learning_rate": 4.752323976066277e-05, + "loss": 0.1079, + "num_input_tokens_seen": 8770176, + "step": 41560 + }, + { + "epoch": 4.572607260726072, + "grad_norm": 1.5018609762191772, + "learning_rate": 4.752219811078325e-05, + "loss": 0.1065, + "num_input_tokens_seen": 8771232, + "step": 41565 + }, + { + "epoch": 4.5731573157315735, + "grad_norm": 0.028996089473366737, + "learning_rate": 4.7521156253327424e-05, + "loss": 0.0227, + "num_input_tokens_seen": 8772256, + "step": 41570 + }, + { + "epoch": 4.573707370737074, + "grad_norm": 0.3708908259868622, + "learning_rate": 4.752011418830491e-05, + "loss": 0.0822, + "num_input_tokens_seen": 8773344, + "step": 41575 + }, + { + "epoch": 4.574257425742574, + "grad_norm": 1.207228183746338, + "learning_rate": 4.7519071915725304e-05, + "loss": 0.1688, + "num_input_tokens_seen": 8774432, + "step": 41580 + }, + { + "epoch": 4.574807480748075, + "grad_norm": 1.035083293914795, + "learning_rate": 4.751802943559823e-05, + "loss": 0.1213, + "num_input_tokens_seen": 8775456, + "step": 41585 + }, + { + "epoch": 4.575357535753575, + "grad_norm": 0.06526363641023636, + "learning_rate": 4.751698674793327e-05, + "loss": 0.0429, + "num_input_tokens_seen": 8776480, + "step": 41590 + }, + { + "epoch": 4.575907590759076, + "grad_norm": 0.3892184793949127, + "learning_rate": 4.751594385274006e-05, + "loss": 0.0662, + "num_input_tokens_seen": 8777536, + "step": 41595 + }, + { + "epoch": 4.5764576457645765, + "grad_norm": 0.11984800547361374, + "learning_rate": 4.751490075002819e-05, + "loss": 0.0329, + "num_input_tokens_seen": 8778560, + "step": 41600 + }, + { + "epoch": 4.577007700770077, + "grad_norm": 0.02985086292028427, + "learning_rate": 4.75138574398073e-05, + "loss": 0.0078, + "num_input_tokens_seen": 8779552, + "step": 41605 + }, + { + "epoch": 4.577557755775578, + "grad_norm": 0.09757813811302185, + "learning_rate": 4.751281392208698e-05, + "loss": 0.0388, + "num_input_tokens_seen": 8780672, + "step": 41610 + }, + { + "epoch": 4.578107810781078, + "grad_norm": 0.3277796506881714, + "learning_rate": 4.751177019687685e-05, + "loss": 0.0697, + "num_input_tokens_seen": 8781696, + "step": 41615 + }, + { + "epoch": 4.578657865786578, + "grad_norm": 1.742311716079712, + "learning_rate": 4.751072626418653e-05, + "loss": 0.0704, + "num_input_tokens_seen": 8782688, + "step": 41620 + }, + { + "epoch": 4.579207920792079, + "grad_norm": 0.4519003927707672, + "learning_rate": 4.750968212402567e-05, + "loss": 0.0198, + "num_input_tokens_seen": 8783744, + "step": 41625 + }, + { + "epoch": 4.5797579757975795, + "grad_norm": 0.2607654631137848, + "learning_rate": 4.750863777640385e-05, + "loss": 0.079, + "num_input_tokens_seen": 8784832, + "step": 41630 + }, + { + "epoch": 4.580308030803081, + "grad_norm": 0.86269211769104, + "learning_rate": 4.7507593221330725e-05, + "loss": 0.037, + "num_input_tokens_seen": 8785856, + "step": 41635 + }, + { + "epoch": 4.580858085808581, + "grad_norm": 0.2834540903568268, + "learning_rate": 4.750654845881592e-05, + "loss": 0.0604, + "num_input_tokens_seen": 8786944, + "step": 41640 + }, + { + "epoch": 4.581408140814082, + "grad_norm": 0.14648029208183289, + "learning_rate": 4.750550348886905e-05, + "loss": 0.0239, + "num_input_tokens_seen": 8788000, + "step": 41645 + }, + { + "epoch": 4.581958195819582, + "grad_norm": 0.5220646858215332, + "learning_rate": 4.750445831149976e-05, + "loss": 0.015, + "num_input_tokens_seen": 8789088, + "step": 41650 + }, + { + "epoch": 4.582508250825082, + "grad_norm": 0.42748579382896423, + "learning_rate": 4.750341292671767e-05, + "loss": 0.062, + "num_input_tokens_seen": 8790144, + "step": 41655 + }, + { + "epoch": 4.583058305830583, + "grad_norm": 0.3748071491718292, + "learning_rate": 4.750236733453242e-05, + "loss": 0.0427, + "num_input_tokens_seen": 8791232, + "step": 41660 + }, + { + "epoch": 4.583608360836084, + "grad_norm": 0.6354237794876099, + "learning_rate": 4.7501321534953656e-05, + "loss": 0.037, + "num_input_tokens_seen": 8792288, + "step": 41665 + }, + { + "epoch": 4.584158415841584, + "grad_norm": 0.9903833866119385, + "learning_rate": 4.7500275527991e-05, + "loss": 0.0664, + "num_input_tokens_seen": 8793280, + "step": 41670 + }, + { + "epoch": 4.584708470847085, + "grad_norm": 0.12762992084026337, + "learning_rate": 4.7499229313654105e-05, + "loss": 0.0496, + "num_input_tokens_seen": 8794304, + "step": 41675 + }, + { + "epoch": 4.585258525852585, + "grad_norm": 0.21265432238578796, + "learning_rate": 4.749818289195261e-05, + "loss": 0.0205, + "num_input_tokens_seen": 8795296, + "step": 41680 + }, + { + "epoch": 4.585808580858086, + "grad_norm": 0.1006990373134613, + "learning_rate": 4.749713626289616e-05, + "loss": 0.0111, + "num_input_tokens_seen": 8796320, + "step": 41685 + }, + { + "epoch": 4.586358635863586, + "grad_norm": 0.17517778277397156, + "learning_rate": 4.74960894264944e-05, + "loss": 0.0232, + "num_input_tokens_seen": 8797344, + "step": 41690 + }, + { + "epoch": 4.586908690869087, + "grad_norm": 0.09253225475549698, + "learning_rate": 4.749504238275698e-05, + "loss": 0.0258, + "num_input_tokens_seen": 8798400, + "step": 41695 + }, + { + "epoch": 4.587458745874588, + "grad_norm": 0.7356845736503601, + "learning_rate": 4.749399513169354e-05, + "loss": 0.0377, + "num_input_tokens_seen": 8799424, + "step": 41700 + }, + { + "epoch": 4.588008800880088, + "grad_norm": 0.023965071886777878, + "learning_rate": 4.749294767331375e-05, + "loss": 0.0716, + "num_input_tokens_seen": 8800480, + "step": 41705 + }, + { + "epoch": 4.588558855885589, + "grad_norm": 0.2595823109149933, + "learning_rate": 4.749190000762725e-05, + "loss": 0.0203, + "num_input_tokens_seen": 8801504, + "step": 41710 + }, + { + "epoch": 4.589108910891089, + "grad_norm": 0.413013219833374, + "learning_rate": 4.74908521346437e-05, + "loss": 0.0414, + "num_input_tokens_seen": 8802496, + "step": 41715 + }, + { + "epoch": 4.589658965896589, + "grad_norm": 0.07108446210622787, + "learning_rate": 4.748980405437276e-05, + "loss": 0.0729, + "num_input_tokens_seen": 8803488, + "step": 41720 + }, + { + "epoch": 4.5902090209020905, + "grad_norm": 0.1439347267150879, + "learning_rate": 4.7488755766824086e-05, + "loss": 0.0528, + "num_input_tokens_seen": 8804576, + "step": 41725 + }, + { + "epoch": 4.590759075907591, + "grad_norm": 0.07666868716478348, + "learning_rate": 4.748770727200734e-05, + "loss": 0.0722, + "num_input_tokens_seen": 8805568, + "step": 41730 + }, + { + "epoch": 4.591309130913091, + "grad_norm": 1.0390738248825073, + "learning_rate": 4.748665856993219e-05, + "loss": 0.1381, + "num_input_tokens_seen": 8806688, + "step": 41735 + }, + { + "epoch": 4.591859185918592, + "grad_norm": 0.10429824143648148, + "learning_rate": 4.7485609660608296e-05, + "loss": 0.0255, + "num_input_tokens_seen": 8807776, + "step": 41740 + }, + { + "epoch": 4.592409240924092, + "grad_norm": 0.4465246796607971, + "learning_rate": 4.748456054404533e-05, + "loss": 0.0125, + "num_input_tokens_seen": 8808864, + "step": 41745 + }, + { + "epoch": 4.592959295929593, + "grad_norm": 0.06421936303377151, + "learning_rate": 4.748351122025295e-05, + "loss": 0.0278, + "num_input_tokens_seen": 8809920, + "step": 41750 + }, + { + "epoch": 4.5935093509350935, + "grad_norm": 0.06861189007759094, + "learning_rate": 4.748246168924085e-05, + "loss": 0.099, + "num_input_tokens_seen": 8810944, + "step": 41755 + }, + { + "epoch": 4.594059405940594, + "grad_norm": 0.18637827038764954, + "learning_rate": 4.748141195101868e-05, + "loss": 0.0248, + "num_input_tokens_seen": 8812032, + "step": 41760 + }, + { + "epoch": 4.594609460946095, + "grad_norm": 0.276496022939682, + "learning_rate": 4.7480362005596126e-05, + "loss": 0.0637, + "num_input_tokens_seen": 8813088, + "step": 41765 + }, + { + "epoch": 4.595159515951595, + "grad_norm": 0.4821145534515381, + "learning_rate": 4.747931185298286e-05, + "loss": 0.0361, + "num_input_tokens_seen": 8814144, + "step": 41770 + }, + { + "epoch": 4.595709570957096, + "grad_norm": 0.03935279697179794, + "learning_rate": 4.747826149318856e-05, + "loss": 0.0934, + "num_input_tokens_seen": 8815168, + "step": 41775 + }, + { + "epoch": 4.596259625962596, + "grad_norm": 0.1032915860414505, + "learning_rate": 4.747721092622292e-05, + "loss": 0.0062, + "num_input_tokens_seen": 8816224, + "step": 41780 + }, + { + "epoch": 4.5968096809680965, + "grad_norm": 0.00968269631266594, + "learning_rate": 4.74761601520956e-05, + "loss": 0.072, + "num_input_tokens_seen": 8817280, + "step": 41785 + }, + { + "epoch": 4.597359735973598, + "grad_norm": 0.03939440846443176, + "learning_rate": 4.747510917081631e-05, + "loss": 0.0146, + "num_input_tokens_seen": 8818336, + "step": 41790 + }, + { + "epoch": 4.597909790979098, + "grad_norm": 0.06283894181251526, + "learning_rate": 4.747405798239472e-05, + "loss": 0.0131, + "num_input_tokens_seen": 8819360, + "step": 41795 + }, + { + "epoch": 4.598459845984598, + "grad_norm": 0.03189140185713768, + "learning_rate": 4.747300658684052e-05, + "loss": 0.0613, + "num_input_tokens_seen": 8820384, + "step": 41800 + }, + { + "epoch": 4.599009900990099, + "grad_norm": 0.054683711379766464, + "learning_rate": 4.7471954984163404e-05, + "loss": 0.0423, + "num_input_tokens_seen": 8821440, + "step": 41805 + }, + { + "epoch": 4.599559955995599, + "grad_norm": 0.028476418927311897, + "learning_rate": 4.747090317437306e-05, + "loss": 0.0775, + "num_input_tokens_seen": 8822528, + "step": 41810 + }, + { + "epoch": 4.6001100110011, + "grad_norm": 0.5032525658607483, + "learning_rate": 4.7469851157479177e-05, + "loss": 0.0251, + "num_input_tokens_seen": 8823584, + "step": 41815 + }, + { + "epoch": 4.600660066006601, + "grad_norm": 0.4358896315097809, + "learning_rate": 4.746879893349147e-05, + "loss": 0.0832, + "num_input_tokens_seen": 8824640, + "step": 41820 + }, + { + "epoch": 4.601210121012102, + "grad_norm": 0.36836209893226624, + "learning_rate": 4.746774650241961e-05, + "loss": 0.0589, + "num_input_tokens_seen": 8825696, + "step": 41825 + }, + { + "epoch": 4.601760176017602, + "grad_norm": 0.02714953012764454, + "learning_rate": 4.7466693864273325e-05, + "loss": 0.0528, + "num_input_tokens_seen": 8826784, + "step": 41830 + }, + { + "epoch": 4.602310231023102, + "grad_norm": 0.09209315478801727, + "learning_rate": 4.74656410190623e-05, + "loss": 0.0208, + "num_input_tokens_seen": 8827872, + "step": 41835 + }, + { + "epoch": 4.602860286028603, + "grad_norm": 0.8833763003349304, + "learning_rate": 4.7464587966796234e-05, + "loss": 0.0845, + "num_input_tokens_seen": 8828960, + "step": 41840 + }, + { + "epoch": 4.603410341034103, + "grad_norm": 1.3310571908950806, + "learning_rate": 4.746353470748485e-05, + "loss": 0.0675, + "num_input_tokens_seen": 8829952, + "step": 41845 + }, + { + "epoch": 4.603960396039604, + "grad_norm": 0.005237691570073366, + "learning_rate": 4.746248124113785e-05, + "loss": 0.0042, + "num_input_tokens_seen": 8831008, + "step": 41850 + }, + { + "epoch": 4.604510451045105, + "grad_norm": 1.5027040243148804, + "learning_rate": 4.7461427567764924e-05, + "loss": 0.0823, + "num_input_tokens_seen": 8832128, + "step": 41855 + }, + { + "epoch": 4.605060506050605, + "grad_norm": 0.10210365056991577, + "learning_rate": 4.7460373687375805e-05, + "loss": 0.0506, + "num_input_tokens_seen": 8833184, + "step": 41860 + }, + { + "epoch": 4.605610561056105, + "grad_norm": 0.14102526009082794, + "learning_rate": 4.74593195999802e-05, + "loss": 0.0466, + "num_input_tokens_seen": 8834272, + "step": 41865 + }, + { + "epoch": 4.606160616061606, + "grad_norm": 0.6524513959884644, + "learning_rate": 4.745826530558782e-05, + "loss": 0.0167, + "num_input_tokens_seen": 8835264, + "step": 41870 + }, + { + "epoch": 4.606710671067106, + "grad_norm": 1.1004401445388794, + "learning_rate": 4.7457210804208396e-05, + "loss": 0.116, + "num_input_tokens_seen": 8836288, + "step": 41875 + }, + { + "epoch": 4.6072607260726075, + "grad_norm": 1.7864271402359009, + "learning_rate": 4.7456156095851625e-05, + "loss": 0.0684, + "num_input_tokens_seen": 8837312, + "step": 41880 + }, + { + "epoch": 4.607810781078108, + "grad_norm": 0.036229901015758514, + "learning_rate": 4.745510118052724e-05, + "loss": 0.1047, + "num_input_tokens_seen": 8838400, + "step": 41885 + }, + { + "epoch": 4.608360836083609, + "grad_norm": 0.19099269807338715, + "learning_rate": 4.745404605824497e-05, + "loss": 0.2514, + "num_input_tokens_seen": 8839456, + "step": 41890 + }, + { + "epoch": 4.608910891089109, + "grad_norm": 1.3960403203964233, + "learning_rate": 4.7452990729014526e-05, + "loss": 0.0758, + "num_input_tokens_seen": 8840512, + "step": 41895 + }, + { + "epoch": 4.609460946094609, + "grad_norm": 0.039268992841243744, + "learning_rate": 4.745193519284564e-05, + "loss": 0.0331, + "num_input_tokens_seen": 8841504, + "step": 41900 + }, + { + "epoch": 4.61001100110011, + "grad_norm": 0.06723053753376007, + "learning_rate": 4.745087944974804e-05, + "loss": 0.037, + "num_input_tokens_seen": 8842528, + "step": 41905 + }, + { + "epoch": 4.6105610561056105, + "grad_norm": 1.1711397171020508, + "learning_rate": 4.744982349973146e-05, + "loss": 0.1503, + "num_input_tokens_seen": 8843552, + "step": 41910 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.5996670126914978, + "learning_rate": 4.744876734280562e-05, + "loss": 0.0489, + "num_input_tokens_seen": 8844640, + "step": 41915 + }, + { + "epoch": 4.611661166116612, + "grad_norm": 0.07309941947460175, + "learning_rate": 4.7447710978980274e-05, + "loss": 0.028, + "num_input_tokens_seen": 8845728, + "step": 41920 + }, + { + "epoch": 4.612211221122112, + "grad_norm": 0.04463689401745796, + "learning_rate": 4.7446654408265146e-05, + "loss": 0.055, + "num_input_tokens_seen": 8846752, + "step": 41925 + }, + { + "epoch": 4.612761276127613, + "grad_norm": 0.299578458070755, + "learning_rate": 4.744559763066998e-05, + "loss": 0.0298, + "num_input_tokens_seen": 8847808, + "step": 41930 + }, + { + "epoch": 4.613311331133113, + "grad_norm": 0.03251595422625542, + "learning_rate": 4.7444540646204504e-05, + "loss": 0.0316, + "num_input_tokens_seen": 8848896, + "step": 41935 + }, + { + "epoch": 4.6138613861386135, + "grad_norm": 0.014307561330497265, + "learning_rate": 4.7443483454878466e-05, + "loss": 0.021, + "num_input_tokens_seen": 8849952, + "step": 41940 + }, + { + "epoch": 4.614411441144115, + "grad_norm": 0.9541518688201904, + "learning_rate": 4.744242605670161e-05, + "loss": 0.0548, + "num_input_tokens_seen": 8851072, + "step": 41945 + }, + { + "epoch": 4.614961496149615, + "grad_norm": 0.04036996141076088, + "learning_rate": 4.744136845168369e-05, + "loss": 0.0622, + "num_input_tokens_seen": 8852160, + "step": 41950 + }, + { + "epoch": 4.615511551155116, + "grad_norm": 0.17972815036773682, + "learning_rate": 4.744031063983444e-05, + "loss": 0.0841, + "num_input_tokens_seen": 8853152, + "step": 41955 + }, + { + "epoch": 4.616061606160616, + "grad_norm": 0.049681082367897034, + "learning_rate": 4.743925262116361e-05, + "loss": 0.0431, + "num_input_tokens_seen": 8854208, + "step": 41960 + }, + { + "epoch": 4.616611661166116, + "grad_norm": 0.07200966030359268, + "learning_rate": 4.743819439568096e-05, + "loss": 0.0268, + "num_input_tokens_seen": 8855264, + "step": 41965 + }, + { + "epoch": 4.617161716171617, + "grad_norm": 0.11220856755971909, + "learning_rate": 4.743713596339624e-05, + "loss": 0.01, + "num_input_tokens_seen": 8856320, + "step": 41970 + }, + { + "epoch": 4.617711771177118, + "grad_norm": 1.403783917427063, + "learning_rate": 4.743607732431921e-05, + "loss": 0.1279, + "num_input_tokens_seen": 8857408, + "step": 41975 + }, + { + "epoch": 4.618261826182618, + "grad_norm": 0.04942702502012253, + "learning_rate": 4.743501847845961e-05, + "loss": 0.1116, + "num_input_tokens_seen": 8858368, + "step": 41980 + }, + { + "epoch": 4.618811881188119, + "grad_norm": 0.20840923488140106, + "learning_rate": 4.7433959425827215e-05, + "loss": 0.1152, + "num_input_tokens_seen": 8859456, + "step": 41985 + }, + { + "epoch": 4.619361936193619, + "grad_norm": 0.39972802996635437, + "learning_rate": 4.743290016643178e-05, + "loss": 0.0461, + "num_input_tokens_seen": 8860576, + "step": 41990 + }, + { + "epoch": 4.61991199119912, + "grad_norm": 0.42519864439964294, + "learning_rate": 4.743184070028307e-05, + "loss": 0.1175, + "num_input_tokens_seen": 8861600, + "step": 41995 + }, + { + "epoch": 4.62046204620462, + "grad_norm": 1.213462471961975, + "learning_rate": 4.7430781027390845e-05, + "loss": 0.1205, + "num_input_tokens_seen": 8862624, + "step": 42000 + }, + { + "epoch": 4.621012101210121, + "grad_norm": 1.4821683168411255, + "learning_rate": 4.7429721147764886e-05, + "loss": 0.1072, + "num_input_tokens_seen": 8863744, + "step": 42005 + }, + { + "epoch": 4.621562156215622, + "grad_norm": 0.05226895585656166, + "learning_rate": 4.742866106141494e-05, + "loss": 0.0091, + "num_input_tokens_seen": 8864800, + "step": 42010 + }, + { + "epoch": 4.622112211221122, + "grad_norm": 0.10889016836881638, + "learning_rate": 4.742760076835078e-05, + "loss": 0.0179, + "num_input_tokens_seen": 8865888, + "step": 42015 + }, + { + "epoch": 4.622662266226623, + "grad_norm": 0.8160265684127808, + "learning_rate": 4.74265402685822e-05, + "loss": 0.1471, + "num_input_tokens_seen": 8866880, + "step": 42020 + }, + { + "epoch": 4.623212321232123, + "grad_norm": 0.07169582694768906, + "learning_rate": 4.742547956211895e-05, + "loss": 0.0619, + "num_input_tokens_seen": 8867872, + "step": 42025 + }, + { + "epoch": 4.623762376237623, + "grad_norm": 0.4740346372127533, + "learning_rate": 4.742441864897082e-05, + "loss": 0.0998, + "num_input_tokens_seen": 8868928, + "step": 42030 + }, + { + "epoch": 4.6243124312431245, + "grad_norm": 0.1618802398443222, + "learning_rate": 4.742335752914758e-05, + "loss": 0.0462, + "num_input_tokens_seen": 8869920, + "step": 42035 + }, + { + "epoch": 4.624862486248625, + "grad_norm": 0.07760363817214966, + "learning_rate": 4.742229620265902e-05, + "loss": 0.0319, + "num_input_tokens_seen": 8870976, + "step": 42040 + }, + { + "epoch": 4.625412541254125, + "grad_norm": 0.00729386555030942, + "learning_rate": 4.742123466951491e-05, + "loss": 0.0603, + "num_input_tokens_seen": 8872000, + "step": 42045 + }, + { + "epoch": 4.625962596259626, + "grad_norm": 0.07178173959255219, + "learning_rate": 4.742017292972504e-05, + "loss": 0.0208, + "num_input_tokens_seen": 8873056, + "step": 42050 + }, + { + "epoch": 4.626512651265126, + "grad_norm": 0.017125429585576057, + "learning_rate": 4.74191109832992e-05, + "loss": 0.0193, + "num_input_tokens_seen": 8874112, + "step": 42055 + }, + { + "epoch": 4.627062706270627, + "grad_norm": 0.30062127113342285, + "learning_rate": 4.7418048830247164e-05, + "loss": 0.0356, + "num_input_tokens_seen": 8875136, + "step": 42060 + }, + { + "epoch": 4.6276127612761275, + "grad_norm": 1.554445505142212, + "learning_rate": 4.741698647057873e-05, + "loss": 0.0719, + "num_input_tokens_seen": 8876224, + "step": 42065 + }, + { + "epoch": 4.628162816281629, + "grad_norm": 0.06519943475723267, + "learning_rate": 4.7415923904303695e-05, + "loss": 0.0103, + "num_input_tokens_seen": 8877312, + "step": 42070 + }, + { + "epoch": 4.628712871287129, + "grad_norm": 1.4022294282913208, + "learning_rate": 4.741486113143184e-05, + "loss": 0.0976, + "num_input_tokens_seen": 8878336, + "step": 42075 + }, + { + "epoch": 4.629262926292629, + "grad_norm": 0.09924058616161346, + "learning_rate": 4.741379815197297e-05, + "loss": 0.0152, + "num_input_tokens_seen": 8879360, + "step": 42080 + }, + { + "epoch": 4.62981298129813, + "grad_norm": 0.9063827395439148, + "learning_rate": 4.7412734965936865e-05, + "loss": 0.0705, + "num_input_tokens_seen": 8880384, + "step": 42085 + }, + { + "epoch": 4.63036303630363, + "grad_norm": 0.1819324791431427, + "learning_rate": 4.741167157333335e-05, + "loss": 0.0294, + "num_input_tokens_seen": 8881408, + "step": 42090 + }, + { + "epoch": 4.6309130913091305, + "grad_norm": 0.16079603135585785, + "learning_rate": 4.741060797417221e-05, + "loss": 0.0981, + "num_input_tokens_seen": 8882464, + "step": 42095 + }, + { + "epoch": 4.631463146314632, + "grad_norm": 1.2051193714141846, + "learning_rate": 4.740954416846324e-05, + "loss": 0.1245, + "num_input_tokens_seen": 8883488, + "step": 42100 + }, + { + "epoch": 4.632013201320132, + "grad_norm": 0.6808302402496338, + "learning_rate": 4.7408480156216264e-05, + "loss": 0.0769, + "num_input_tokens_seen": 8884576, + "step": 42105 + }, + { + "epoch": 4.632563256325633, + "grad_norm": 0.040570590645074844, + "learning_rate": 4.740741593744108e-05, + "loss": 0.0556, + "num_input_tokens_seen": 8885600, + "step": 42110 + }, + { + "epoch": 4.633113311331133, + "grad_norm": 0.13039913773536682, + "learning_rate": 4.740635151214749e-05, + "loss": 0.0754, + "num_input_tokens_seen": 8886688, + "step": 42115 + }, + { + "epoch": 4.633663366336633, + "grad_norm": 0.36921563744544983, + "learning_rate": 4.740528688034531e-05, + "loss": 0.0107, + "num_input_tokens_seen": 8887712, + "step": 42120 + }, + { + "epoch": 4.634213421342134, + "grad_norm": 0.4942653775215149, + "learning_rate": 4.740422204204435e-05, + "loss": 0.0512, + "num_input_tokens_seen": 8888736, + "step": 42125 + }, + { + "epoch": 4.634763476347635, + "grad_norm": 0.04216423258185387, + "learning_rate": 4.740315699725444e-05, + "loss": 0.1244, + "num_input_tokens_seen": 8889760, + "step": 42130 + }, + { + "epoch": 4.635313531353136, + "grad_norm": 0.09113036096096039, + "learning_rate": 4.7402091745985366e-05, + "loss": 0.1313, + "num_input_tokens_seen": 8890752, + "step": 42135 + }, + { + "epoch": 4.635863586358636, + "grad_norm": 0.30369895696640015, + "learning_rate": 4.740102628824697e-05, + "loss": 0.0918, + "num_input_tokens_seen": 8891840, + "step": 42140 + }, + { + "epoch": 4.636413641364136, + "grad_norm": 0.05559881031513214, + "learning_rate": 4.739996062404907e-05, + "loss": 0.0054, + "num_input_tokens_seen": 8892896, + "step": 42145 + }, + { + "epoch": 4.636963696369637, + "grad_norm": 1.07989501953125, + "learning_rate": 4.7398894753401466e-05, + "loss": 0.1602, + "num_input_tokens_seen": 8893888, + "step": 42150 + }, + { + "epoch": 4.637513751375137, + "grad_norm": 0.2930101752281189, + "learning_rate": 4.7397828676314004e-05, + "loss": 0.0553, + "num_input_tokens_seen": 8894944, + "step": 42155 + }, + { + "epoch": 4.638063806380638, + "grad_norm": 0.403236448764801, + "learning_rate": 4.739676239279651e-05, + "loss": 0.0168, + "num_input_tokens_seen": 8896064, + "step": 42160 + }, + { + "epoch": 4.638613861386139, + "grad_norm": 0.7825961112976074, + "learning_rate": 4.739569590285879e-05, + "loss": 0.0471, + "num_input_tokens_seen": 8897152, + "step": 42165 + }, + { + "epoch": 4.639163916391639, + "grad_norm": 0.21292859315872192, + "learning_rate": 4.73946292065107e-05, + "loss": 0.0194, + "num_input_tokens_seen": 8898240, + "step": 42170 + }, + { + "epoch": 4.63971397139714, + "grad_norm": 0.7939143776893616, + "learning_rate": 4.739356230376205e-05, + "loss": 0.0769, + "num_input_tokens_seen": 8899360, + "step": 42175 + }, + { + "epoch": 4.64026402640264, + "grad_norm": 0.05377858132123947, + "learning_rate": 4.739249519462269e-05, + "loss": 0.0058, + "num_input_tokens_seen": 8900352, + "step": 42180 + }, + { + "epoch": 4.6408140814081404, + "grad_norm": 0.07272467762231827, + "learning_rate": 4.739142787910244e-05, + "loss": 0.0354, + "num_input_tokens_seen": 8901440, + "step": 42185 + }, + { + "epoch": 4.6413641364136415, + "grad_norm": 0.18049097061157227, + "learning_rate": 4.739036035721114e-05, + "loss": 0.0576, + "num_input_tokens_seen": 8902528, + "step": 42190 + }, + { + "epoch": 4.641914191419142, + "grad_norm": 0.05879613012075424, + "learning_rate": 4.738929262895864e-05, + "loss": 0.042, + "num_input_tokens_seen": 8903616, + "step": 42195 + }, + { + "epoch": 4.642464246424643, + "grad_norm": 0.5621775388717651, + "learning_rate": 4.738822469435477e-05, + "loss": 0.071, + "num_input_tokens_seen": 8904640, + "step": 42200 + }, + { + "epoch": 4.643014301430143, + "grad_norm": 0.03206669166684151, + "learning_rate": 4.738715655340938e-05, + "loss": 0.0647, + "num_input_tokens_seen": 8905760, + "step": 42205 + }, + { + "epoch": 4.643564356435643, + "grad_norm": 0.04240800812840462, + "learning_rate": 4.738608820613231e-05, + "loss": 0.0282, + "num_input_tokens_seen": 8906816, + "step": 42210 + }, + { + "epoch": 4.644114411441144, + "grad_norm": 1.308285117149353, + "learning_rate": 4.73850196525334e-05, + "loss": 0.048, + "num_input_tokens_seen": 8907776, + "step": 42215 + }, + { + "epoch": 4.6446644664466445, + "grad_norm": 0.8105915784835815, + "learning_rate": 4.7383950892622506e-05, + "loss": 0.0465, + "num_input_tokens_seen": 8908864, + "step": 42220 + }, + { + "epoch": 4.645214521452145, + "grad_norm": 0.05518604442477226, + "learning_rate": 4.738288192640949e-05, + "loss": 0.1327, + "num_input_tokens_seen": 8909888, + "step": 42225 + }, + { + "epoch": 4.645764576457646, + "grad_norm": 0.048015836626291275, + "learning_rate": 4.7381812753904176e-05, + "loss": 0.0732, + "num_input_tokens_seen": 8910880, + "step": 42230 + }, + { + "epoch": 4.646314631463146, + "grad_norm": 0.016932040452957153, + "learning_rate": 4.7380743375116445e-05, + "loss": 0.0094, + "num_input_tokens_seen": 8911968, + "step": 42235 + }, + { + "epoch": 4.646864686468647, + "grad_norm": 0.07663972675800323, + "learning_rate": 4.7379673790056133e-05, + "loss": 0.0268, + "num_input_tokens_seen": 8913056, + "step": 42240 + }, + { + "epoch": 4.647414741474147, + "grad_norm": 0.8940971493721008, + "learning_rate": 4.7378603998733115e-05, + "loss": 0.0298, + "num_input_tokens_seen": 8914080, + "step": 42245 + }, + { + "epoch": 4.647964796479648, + "grad_norm": 1.127474308013916, + "learning_rate": 4.7377534001157235e-05, + "loss": 0.0592, + "num_input_tokens_seen": 8915136, + "step": 42250 + }, + { + "epoch": 4.648514851485149, + "grad_norm": 0.4046591520309448, + "learning_rate": 4.7376463797338366e-05, + "loss": 0.0557, + "num_input_tokens_seen": 8916224, + "step": 42255 + }, + { + "epoch": 4.649064906490649, + "grad_norm": 2.5259947776794434, + "learning_rate": 4.737539338728637e-05, + "loss": 0.0588, + "num_input_tokens_seen": 8917280, + "step": 42260 + }, + { + "epoch": 4.64961496149615, + "grad_norm": 0.15332427620887756, + "learning_rate": 4.73743227710111e-05, + "loss": 0.0263, + "num_input_tokens_seen": 8918336, + "step": 42265 + }, + { + "epoch": 4.65016501650165, + "grad_norm": 0.00509687652811408, + "learning_rate": 4.737325194852244e-05, + "loss": 0.0917, + "num_input_tokens_seen": 8919392, + "step": 42270 + }, + { + "epoch": 4.65071507150715, + "grad_norm": 0.6877890825271606, + "learning_rate": 4.737218091983026e-05, + "loss": 0.0292, + "num_input_tokens_seen": 8920416, + "step": 42275 + }, + { + "epoch": 4.6512651265126514, + "grad_norm": 1.3427342176437378, + "learning_rate": 4.7371109684944406e-05, + "loss": 0.103, + "num_input_tokens_seen": 8921504, + "step": 42280 + }, + { + "epoch": 4.651815181518152, + "grad_norm": 0.33371701836586, + "learning_rate": 4.7370038243874785e-05, + "loss": 0.0213, + "num_input_tokens_seen": 8922592, + "step": 42285 + }, + { + "epoch": 4.652365236523653, + "grad_norm": 0.1960885226726532, + "learning_rate": 4.7368966596631245e-05, + "loss": 0.0272, + "num_input_tokens_seen": 8923584, + "step": 42290 + }, + { + "epoch": 4.652915291529153, + "grad_norm": 1.693711757659912, + "learning_rate": 4.736789474322368e-05, + "loss": 0.07, + "num_input_tokens_seen": 8924736, + "step": 42295 + }, + { + "epoch": 4.653465346534653, + "grad_norm": 0.8350136280059814, + "learning_rate": 4.736682268366196e-05, + "loss": 0.199, + "num_input_tokens_seen": 8925760, + "step": 42300 + }, + { + "epoch": 4.654015401540154, + "grad_norm": 0.22709998488426208, + "learning_rate": 4.736575041795597e-05, + "loss": 0.0094, + "num_input_tokens_seen": 8926720, + "step": 42305 + }, + { + "epoch": 4.6545654565456545, + "grad_norm": 0.5489059090614319, + "learning_rate": 4.736467794611558e-05, + "loss": 0.0571, + "num_input_tokens_seen": 8927776, + "step": 42310 + }, + { + "epoch": 4.6551155115511555, + "grad_norm": 0.8286339044570923, + "learning_rate": 4.7363605268150704e-05, + "loss": 0.095, + "num_input_tokens_seen": 8928864, + "step": 42315 + }, + { + "epoch": 4.655665566556656, + "grad_norm": 0.09166041016578674, + "learning_rate": 4.736253238407119e-05, + "loss": 0.0442, + "num_input_tokens_seen": 8929984, + "step": 42320 + }, + { + "epoch": 4.656215621562156, + "grad_norm": 0.0551852285861969, + "learning_rate": 4.7361459293886956e-05, + "loss": 0.0094, + "num_input_tokens_seen": 8931008, + "step": 42325 + }, + { + "epoch": 4.656765676567657, + "grad_norm": 0.38969162106513977, + "learning_rate": 4.736038599760788e-05, + "loss": 0.0409, + "num_input_tokens_seen": 8932064, + "step": 42330 + }, + { + "epoch": 4.657315731573157, + "grad_norm": 0.10636133700609207, + "learning_rate": 4.735931249524386e-05, + "loss": 0.0177, + "num_input_tokens_seen": 8933056, + "step": 42335 + }, + { + "epoch": 4.6578657865786575, + "grad_norm": 0.4428882896900177, + "learning_rate": 4.735823878680478e-05, + "loss": 0.1042, + "num_input_tokens_seen": 8934112, + "step": 42340 + }, + { + "epoch": 4.658415841584159, + "grad_norm": 0.15143778920173645, + "learning_rate": 4.735716487230054e-05, + "loss": 0.0127, + "num_input_tokens_seen": 8935168, + "step": 42345 + }, + { + "epoch": 4.658965896589659, + "grad_norm": 0.17334401607513428, + "learning_rate": 4.735609075174105e-05, + "loss": 0.0493, + "num_input_tokens_seen": 8936192, + "step": 42350 + }, + { + "epoch": 4.65951595159516, + "grad_norm": 0.02144746482372284, + "learning_rate": 4.735501642513619e-05, + "loss": 0.0934, + "num_input_tokens_seen": 8937248, + "step": 42355 + }, + { + "epoch": 4.66006600660066, + "grad_norm": 0.11136375367641449, + "learning_rate": 4.735394189249587e-05, + "loss": 0.0889, + "num_input_tokens_seen": 8938368, + "step": 42360 + }, + { + "epoch": 4.66061606160616, + "grad_norm": 1.5107868909835815, + "learning_rate": 4.7352867153829994e-05, + "loss": 0.0784, + "num_input_tokens_seen": 8939392, + "step": 42365 + }, + { + "epoch": 4.661166116611661, + "grad_norm": 0.011141189374029636, + "learning_rate": 4.735179220914847e-05, + "loss": 0.0232, + "num_input_tokens_seen": 8940480, + "step": 42370 + }, + { + "epoch": 4.661716171617162, + "grad_norm": 0.030363664031028748, + "learning_rate": 4.7350717058461205e-05, + "loss": 0.0933, + "num_input_tokens_seen": 8941504, + "step": 42375 + }, + { + "epoch": 4.662266226622663, + "grad_norm": 0.07080145925283432, + "learning_rate": 4.73496417017781e-05, + "loss": 0.0921, + "num_input_tokens_seen": 8942528, + "step": 42380 + }, + { + "epoch": 4.662816281628163, + "grad_norm": 0.7442094087600708, + "learning_rate": 4.734856613910907e-05, + "loss": 0.0414, + "num_input_tokens_seen": 8943616, + "step": 42385 + }, + { + "epoch": 4.663366336633663, + "grad_norm": 0.21647489070892334, + "learning_rate": 4.734749037046404e-05, + "loss": 0.024, + "num_input_tokens_seen": 8944672, + "step": 42390 + }, + { + "epoch": 4.663916391639164, + "grad_norm": 0.2208595722913742, + "learning_rate": 4.734641439585291e-05, + "loss": 0.0573, + "num_input_tokens_seen": 8945664, + "step": 42395 + }, + { + "epoch": 4.664466446644664, + "grad_norm": 0.11886133253574371, + "learning_rate": 4.73453382152856e-05, + "loss": 0.1449, + "num_input_tokens_seen": 8946752, + "step": 42400 + }, + { + "epoch": 4.665016501650165, + "grad_norm": 0.5535987615585327, + "learning_rate": 4.734426182877203e-05, + "loss": 0.0831, + "num_input_tokens_seen": 8947872, + "step": 42405 + }, + { + "epoch": 4.665566556655666, + "grad_norm": 0.7323012351989746, + "learning_rate": 4.734318523632212e-05, + "loss": 0.0371, + "num_input_tokens_seen": 8948896, + "step": 42410 + }, + { + "epoch": 4.666116611661166, + "grad_norm": 0.037362635135650635, + "learning_rate": 4.73421084379458e-05, + "loss": 0.0914, + "num_input_tokens_seen": 8949888, + "step": 42415 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 1.2039223909378052, + "learning_rate": 4.7341031433652976e-05, + "loss": 0.0404, + "num_input_tokens_seen": 8950944, + "step": 42420 + }, + { + "epoch": 4.667216721672167, + "grad_norm": 0.33506515622138977, + "learning_rate": 4.73399542234536e-05, + "loss": 0.0272, + "num_input_tokens_seen": 8952032, + "step": 42425 + }, + { + "epoch": 4.667766776677668, + "grad_norm": 0.052596863359212875, + "learning_rate": 4.733887680735757e-05, + "loss": 0.0815, + "num_input_tokens_seen": 8953056, + "step": 42430 + }, + { + "epoch": 4.6683168316831685, + "grad_norm": 0.8398985862731934, + "learning_rate": 4.7337799185374844e-05, + "loss": 0.0488, + "num_input_tokens_seen": 8954144, + "step": 42435 + }, + { + "epoch": 4.668866886688669, + "grad_norm": 0.46262386441230774, + "learning_rate": 4.7336721357515335e-05, + "loss": 0.0642, + "num_input_tokens_seen": 8955168, + "step": 42440 + }, + { + "epoch": 4.66941694169417, + "grad_norm": 0.12283816188573837, + "learning_rate": 4.733564332378898e-05, + "loss": 0.0739, + "num_input_tokens_seen": 8956192, + "step": 42445 + }, + { + "epoch": 4.66996699669967, + "grad_norm": 0.5300114154815674, + "learning_rate": 4.7334565084205724e-05, + "loss": 0.054, + "num_input_tokens_seen": 8957216, + "step": 42450 + }, + { + "epoch": 4.67051705170517, + "grad_norm": 0.3785826861858368, + "learning_rate": 4.7333486638775496e-05, + "loss": 0.0354, + "num_input_tokens_seen": 8958304, + "step": 42455 + }, + { + "epoch": 4.671067106710671, + "grad_norm": 0.18529477715492249, + "learning_rate": 4.7332407987508246e-05, + "loss": 0.0431, + "num_input_tokens_seen": 8959328, + "step": 42460 + }, + { + "epoch": 4.6716171617161715, + "grad_norm": 0.15015293657779694, + "learning_rate": 4.733132913041389e-05, + "loss": 0.0127, + "num_input_tokens_seen": 8960384, + "step": 42465 + }, + { + "epoch": 4.672167216721672, + "grad_norm": 0.038851454854011536, + "learning_rate": 4.733025006750241e-05, + "loss": 0.0113, + "num_input_tokens_seen": 8961472, + "step": 42470 + }, + { + "epoch": 4.672717271727173, + "grad_norm": 0.8113204836845398, + "learning_rate": 4.732917079878372e-05, + "loss": 0.0377, + "num_input_tokens_seen": 8962528, + "step": 42475 + }, + { + "epoch": 4.673267326732673, + "grad_norm": 2.0155792236328125, + "learning_rate": 4.7328091324267774e-05, + "loss": 0.2108, + "num_input_tokens_seen": 8963616, + "step": 42480 + }, + { + "epoch": 4.673817381738174, + "grad_norm": 0.13405491411685944, + "learning_rate": 4.732701164396453e-05, + "loss": 0.0245, + "num_input_tokens_seen": 8964640, + "step": 42485 + }, + { + "epoch": 4.674367436743674, + "grad_norm": 0.10441163927316666, + "learning_rate": 4.7325931757883934e-05, + "loss": 0.0359, + "num_input_tokens_seen": 8965760, + "step": 42490 + }, + { + "epoch": 4.674917491749175, + "grad_norm": 0.6692001223564148, + "learning_rate": 4.732485166603593e-05, + "loss": 0.1318, + "num_input_tokens_seen": 8966784, + "step": 42495 + }, + { + "epoch": 4.675467546754676, + "grad_norm": 0.29203110933303833, + "learning_rate": 4.732377136843049e-05, + "loss": 0.1104, + "num_input_tokens_seen": 8967872, + "step": 42500 + }, + { + "epoch": 4.676017601760176, + "grad_norm": 0.03056197054684162, + "learning_rate": 4.7322690865077554e-05, + "loss": 0.0134, + "num_input_tokens_seen": 8968960, + "step": 42505 + }, + { + "epoch": 4.676567656765677, + "grad_norm": 1.1553384065628052, + "learning_rate": 4.732161015598709e-05, + "loss": 0.0359, + "num_input_tokens_seen": 8969952, + "step": 42510 + }, + { + "epoch": 4.677117711771177, + "grad_norm": 0.157235786318779, + "learning_rate": 4.7320529241169065e-05, + "loss": 0.0118, + "num_input_tokens_seen": 8971008, + "step": 42515 + }, + { + "epoch": 4.677667766776677, + "grad_norm": 0.022303521633148193, + "learning_rate": 4.731944812063343e-05, + "loss": 0.1233, + "num_input_tokens_seen": 8972096, + "step": 42520 + }, + { + "epoch": 4.678217821782178, + "grad_norm": 0.5764988660812378, + "learning_rate": 4.731836679439014e-05, + "loss": 0.0639, + "num_input_tokens_seen": 8973120, + "step": 42525 + }, + { + "epoch": 4.678767876787679, + "grad_norm": 0.1505679190158844, + "learning_rate": 4.731728526244918e-05, + "loss": 0.0991, + "num_input_tokens_seen": 8974240, + "step": 42530 + }, + { + "epoch": 4.67931793179318, + "grad_norm": 0.04442496970295906, + "learning_rate": 4.7316203524820504e-05, + "loss": 0.0312, + "num_input_tokens_seen": 8975296, + "step": 42535 + }, + { + "epoch": 4.67986798679868, + "grad_norm": 0.2264445722103119, + "learning_rate": 4.73151215815141e-05, + "loss": 0.0481, + "num_input_tokens_seen": 8976256, + "step": 42540 + }, + { + "epoch": 4.68041804180418, + "grad_norm": 0.2014806866645813, + "learning_rate": 4.7314039432539924e-05, + "loss": 0.037, + "num_input_tokens_seen": 8977312, + "step": 42545 + }, + { + "epoch": 4.680968096809681, + "grad_norm": 0.11325329542160034, + "learning_rate": 4.731295707790795e-05, + "loss": 0.015, + "num_input_tokens_seen": 8978336, + "step": 42550 + }, + { + "epoch": 4.681518151815181, + "grad_norm": 0.8502760529518127, + "learning_rate": 4.731187451762816e-05, + "loss": 0.0418, + "num_input_tokens_seen": 8979360, + "step": 42555 + }, + { + "epoch": 4.6820682068206825, + "grad_norm": 0.4588749408721924, + "learning_rate": 4.7310791751710526e-05, + "loss": 0.0344, + "num_input_tokens_seen": 8980416, + "step": 42560 + }, + { + "epoch": 4.682618261826183, + "grad_norm": 0.0815635621547699, + "learning_rate": 4.7309708780165036e-05, + "loss": 0.0413, + "num_input_tokens_seen": 8981440, + "step": 42565 + }, + { + "epoch": 4.683168316831683, + "grad_norm": 0.45719799399375916, + "learning_rate": 4.730862560300166e-05, + "loss": 0.0492, + "num_input_tokens_seen": 8982528, + "step": 42570 + }, + { + "epoch": 4.683718371837184, + "grad_norm": 0.05873989686369896, + "learning_rate": 4.730754222023039e-05, + "loss": 0.0234, + "num_input_tokens_seen": 8983584, + "step": 42575 + }, + { + "epoch": 4.684268426842684, + "grad_norm": 0.049231864511966705, + "learning_rate": 4.7306458631861205e-05, + "loss": 0.0986, + "num_input_tokens_seen": 8984704, + "step": 42580 + }, + { + "epoch": 4.684818481848184, + "grad_norm": 0.13257984817028046, + "learning_rate": 4.730537483790409e-05, + "loss": 0.0525, + "num_input_tokens_seen": 8985728, + "step": 42585 + }, + { + "epoch": 4.6853685368536855, + "grad_norm": 0.23140832781791687, + "learning_rate": 4.730429083836905e-05, + "loss": 0.017, + "num_input_tokens_seen": 8986784, + "step": 42590 + }, + { + "epoch": 4.685918591859186, + "grad_norm": 0.01011404674500227, + "learning_rate": 4.7303206633266064e-05, + "loss": 0.0033, + "num_input_tokens_seen": 8987840, + "step": 42595 + }, + { + "epoch": 4.686468646864687, + "grad_norm": 0.07681744545698166, + "learning_rate": 4.730212222260512e-05, + "loss": 0.0344, + "num_input_tokens_seen": 8988896, + "step": 42600 + }, + { + "epoch": 4.687018701870187, + "grad_norm": 0.8127397298812866, + "learning_rate": 4.730103760639621e-05, + "loss": 0.0742, + "num_input_tokens_seen": 8989984, + "step": 42605 + }, + { + "epoch": 4.687568756875687, + "grad_norm": 0.30890318751335144, + "learning_rate": 4.729995278464934e-05, + "loss": 0.0807, + "num_input_tokens_seen": 8991008, + "step": 42610 + }, + { + "epoch": 4.688118811881188, + "grad_norm": 0.08516772836446762, + "learning_rate": 4.729886775737451e-05, + "loss": 0.029, + "num_input_tokens_seen": 8992000, + "step": 42615 + }, + { + "epoch": 4.6886688668866885, + "grad_norm": 0.0664718747138977, + "learning_rate": 4.729778252458171e-05, + "loss": 0.1147, + "num_input_tokens_seen": 8993056, + "step": 42620 + }, + { + "epoch": 4.68921892189219, + "grad_norm": 0.024529531598091125, + "learning_rate": 4.729669708628096e-05, + "loss": 0.0811, + "num_input_tokens_seen": 8994112, + "step": 42625 + }, + { + "epoch": 4.68976897689769, + "grad_norm": 0.5980114936828613, + "learning_rate": 4.7295611442482244e-05, + "loss": 0.044, + "num_input_tokens_seen": 8995168, + "step": 42630 + }, + { + "epoch": 4.69031903190319, + "grad_norm": 0.04594479501247406, + "learning_rate": 4.729452559319558e-05, + "loss": 0.0259, + "num_input_tokens_seen": 8996192, + "step": 42635 + }, + { + "epoch": 4.690869086908691, + "grad_norm": 0.11144495010375977, + "learning_rate": 4.7293439538430964e-05, + "loss": 0.0099, + "num_input_tokens_seen": 8997248, + "step": 42640 + }, + { + "epoch": 4.691419141914191, + "grad_norm": 0.2846452295780182, + "learning_rate": 4.729235327819842e-05, + "loss": 0.0552, + "num_input_tokens_seen": 8998304, + "step": 42645 + }, + { + "epoch": 4.6919691969196915, + "grad_norm": 0.12406361848115921, + "learning_rate": 4.729126681250795e-05, + "loss": 0.0906, + "num_input_tokens_seen": 8999328, + "step": 42650 + }, + { + "epoch": 4.692519251925193, + "grad_norm": 1.2683402299880981, + "learning_rate": 4.7290180141369564e-05, + "loss": 0.077, + "num_input_tokens_seen": 9000352, + "step": 42655 + }, + { + "epoch": 4.693069306930693, + "grad_norm": 0.17070229351520538, + "learning_rate": 4.7289093264793296e-05, + "loss": 0.0094, + "num_input_tokens_seen": 9001440, + "step": 42660 + }, + { + "epoch": 4.693619361936194, + "grad_norm": 0.16708829998970032, + "learning_rate": 4.728800618278915e-05, + "loss": 0.0254, + "num_input_tokens_seen": 9002464, + "step": 42665 + }, + { + "epoch": 4.694169416941694, + "grad_norm": 0.4442504346370697, + "learning_rate": 4.728691889536713e-05, + "loss": 0.0151, + "num_input_tokens_seen": 9003552, + "step": 42670 + }, + { + "epoch": 4.694719471947195, + "grad_norm": 0.13617686927318573, + "learning_rate": 4.728583140253728e-05, + "loss": 0.0381, + "num_input_tokens_seen": 9004608, + "step": 42675 + }, + { + "epoch": 4.695269526952695, + "grad_norm": 0.8392490148544312, + "learning_rate": 4.7284743704309626e-05, + "loss": 0.0987, + "num_input_tokens_seen": 9005728, + "step": 42680 + }, + { + "epoch": 4.695819581958196, + "grad_norm": 0.07662160694599152, + "learning_rate": 4.728365580069417e-05, + "loss": 0.0828, + "num_input_tokens_seen": 9006848, + "step": 42685 + }, + { + "epoch": 4.696369636963697, + "grad_norm": 0.16890625655651093, + "learning_rate": 4.7282567691700966e-05, + "loss": 0.0113, + "num_input_tokens_seen": 9007872, + "step": 42690 + }, + { + "epoch": 4.696919691969197, + "grad_norm": 0.1266811639070511, + "learning_rate": 4.728147937734001e-05, + "loss": 0.059, + "num_input_tokens_seen": 9008864, + "step": 42695 + }, + { + "epoch": 4.697469746974697, + "grad_norm": 0.06886952370405197, + "learning_rate": 4.728039085762136e-05, + "loss": 0.0608, + "num_input_tokens_seen": 9009920, + "step": 42700 + }, + { + "epoch": 4.698019801980198, + "grad_norm": 1.3269591331481934, + "learning_rate": 4.727930213255504e-05, + "loss": 0.0305, + "num_input_tokens_seen": 9010944, + "step": 42705 + }, + { + "epoch": 4.698569856985698, + "grad_norm": 0.14072787761688232, + "learning_rate": 4.727821320215108e-05, + "loss": 0.072, + "num_input_tokens_seen": 9012032, + "step": 42710 + }, + { + "epoch": 4.6991199119911995, + "grad_norm": 0.30745404958724976, + "learning_rate": 4.727712406641952e-05, + "loss": 0.0291, + "num_input_tokens_seen": 9013184, + "step": 42715 + }, + { + "epoch": 4.6996699669967, + "grad_norm": 0.41960379481315613, + "learning_rate": 4.72760347253704e-05, + "loss": 0.0659, + "num_input_tokens_seen": 9014240, + "step": 42720 + }, + { + "epoch": 4.7002200220022, + "grad_norm": 0.2732478678226471, + "learning_rate": 4.727494517901375e-05, + "loss": 0.0251, + "num_input_tokens_seen": 9015328, + "step": 42725 + }, + { + "epoch": 4.700770077007701, + "grad_norm": 0.08922340720891953, + "learning_rate": 4.727385542735962e-05, + "loss": 0.0586, + "num_input_tokens_seen": 9016352, + "step": 42730 + }, + { + "epoch": 4.701320132013201, + "grad_norm": 0.18677422404289246, + "learning_rate": 4.727276547041805e-05, + "loss": 0.0488, + "num_input_tokens_seen": 9017408, + "step": 42735 + }, + { + "epoch": 4.701870187018702, + "grad_norm": 0.14750276505947113, + "learning_rate": 4.7271675308199106e-05, + "loss": 0.0725, + "num_input_tokens_seen": 9018432, + "step": 42740 + }, + { + "epoch": 4.7024202420242025, + "grad_norm": 0.25490137934684753, + "learning_rate": 4.72705849407128e-05, + "loss": 0.0378, + "num_input_tokens_seen": 9019456, + "step": 42745 + }, + { + "epoch": 4.702970297029703, + "grad_norm": 0.21044686436653137, + "learning_rate": 4.72694943679692e-05, + "loss": 0.059, + "num_input_tokens_seen": 9020480, + "step": 42750 + }, + { + "epoch": 4.703520352035204, + "grad_norm": 0.1358310878276825, + "learning_rate": 4.726840358997836e-05, + "loss": 0.0539, + "num_input_tokens_seen": 9021504, + "step": 42755 + }, + { + "epoch": 4.704070407040704, + "grad_norm": 0.12858399748802185, + "learning_rate": 4.7267312606750334e-05, + "loss": 0.0485, + "num_input_tokens_seen": 9022496, + "step": 42760 + }, + { + "epoch": 4.704620462046204, + "grad_norm": 1.075978398323059, + "learning_rate": 4.726622141829516e-05, + "loss": 0.1899, + "num_input_tokens_seen": 9023520, + "step": 42765 + }, + { + "epoch": 4.705170517051705, + "grad_norm": 0.01818162016570568, + "learning_rate": 4.726513002462292e-05, + "loss": 0.0487, + "num_input_tokens_seen": 9024512, + "step": 42770 + }, + { + "epoch": 4.7057205720572055, + "grad_norm": 0.5267859101295471, + "learning_rate": 4.726403842574366e-05, + "loss": 0.065, + "num_input_tokens_seen": 9025568, + "step": 42775 + }, + { + "epoch": 4.706270627062707, + "grad_norm": 0.06083972379565239, + "learning_rate": 4.7262946621667435e-05, + "loss": 0.0091, + "num_input_tokens_seen": 9026656, + "step": 42780 + }, + { + "epoch": 4.706820682068207, + "grad_norm": 1.771979808807373, + "learning_rate": 4.7261854612404316e-05, + "loss": 0.149, + "num_input_tokens_seen": 9027744, + "step": 42785 + }, + { + "epoch": 4.707370737073707, + "grad_norm": 0.5024795532226562, + "learning_rate": 4.7260762397964365e-05, + "loss": 0.0159, + "num_input_tokens_seen": 9028768, + "step": 42790 + }, + { + "epoch": 4.707920792079208, + "grad_norm": 0.16762253642082214, + "learning_rate": 4.725966997835765e-05, + "loss": 0.0556, + "num_input_tokens_seen": 9029792, + "step": 42795 + }, + { + "epoch": 4.708470847084708, + "grad_norm": 0.23250384628772736, + "learning_rate": 4.7258577353594235e-05, + "loss": 0.0449, + "num_input_tokens_seen": 9030848, + "step": 42800 + }, + { + "epoch": 4.709020902090209, + "grad_norm": 0.1399705410003662, + "learning_rate": 4.72574845236842e-05, + "loss": 0.0291, + "num_input_tokens_seen": 9031904, + "step": 42805 + }, + { + "epoch": 4.70957095709571, + "grad_norm": 0.17179392278194427, + "learning_rate": 4.7256391488637606e-05, + "loss": 0.0883, + "num_input_tokens_seen": 9032992, + "step": 42810 + }, + { + "epoch": 4.71012101210121, + "grad_norm": 0.8084506392478943, + "learning_rate": 4.725529824846453e-05, + "loss": 0.078, + "num_input_tokens_seen": 9034048, + "step": 42815 + }, + { + "epoch": 4.710671067106711, + "grad_norm": 0.14638736844062805, + "learning_rate": 4.725420480317505e-05, + "loss": 0.0124, + "num_input_tokens_seen": 9035072, + "step": 42820 + }, + { + "epoch": 4.711221122112211, + "grad_norm": 0.3126293122768402, + "learning_rate": 4.725311115277924e-05, + "loss": 0.0678, + "num_input_tokens_seen": 9036128, + "step": 42825 + }, + { + "epoch": 4.711771177117711, + "grad_norm": 0.650372326374054, + "learning_rate": 4.7252017297287186e-05, + "loss": 0.0205, + "num_input_tokens_seen": 9037152, + "step": 42830 + }, + { + "epoch": 4.712321232123212, + "grad_norm": 0.04837046563625336, + "learning_rate": 4.725092323670897e-05, + "loss": 0.0198, + "num_input_tokens_seen": 9038208, + "step": 42835 + }, + { + "epoch": 4.712871287128713, + "grad_norm": 0.08974071592092514, + "learning_rate": 4.7249828971054664e-05, + "loss": 0.0375, + "num_input_tokens_seen": 9039264, + "step": 42840 + }, + { + "epoch": 4.713421342134214, + "grad_norm": 0.27645182609558105, + "learning_rate": 4.7248734500334366e-05, + "loss": 0.0106, + "num_input_tokens_seen": 9040288, + "step": 42845 + }, + { + "epoch": 4.713971397139714, + "grad_norm": 0.18488232791423798, + "learning_rate": 4.724763982455815e-05, + "loss": 0.0342, + "num_input_tokens_seen": 9041344, + "step": 42850 + }, + { + "epoch": 4.714521452145215, + "grad_norm": 2.0137434005737305, + "learning_rate": 4.724654494373613e-05, + "loss": 0.0279, + "num_input_tokens_seen": 9042368, + "step": 42855 + }, + { + "epoch": 4.715071507150715, + "grad_norm": 0.11038640886545181, + "learning_rate": 4.7245449857878365e-05, + "loss": 0.0093, + "num_input_tokens_seen": 9043424, + "step": 42860 + }, + { + "epoch": 4.715621562156215, + "grad_norm": 0.6474816799163818, + "learning_rate": 4.724435456699497e-05, + "loss": 0.0715, + "num_input_tokens_seen": 9044448, + "step": 42865 + }, + { + "epoch": 4.7161716171617165, + "grad_norm": 0.021884214133024216, + "learning_rate": 4.724325907109603e-05, + "loss": 0.0639, + "num_input_tokens_seen": 9045504, + "step": 42870 + }, + { + "epoch": 4.716721672167217, + "grad_norm": 0.49366867542266846, + "learning_rate": 4.724216337019165e-05, + "loss": 0.0336, + "num_input_tokens_seen": 9046560, + "step": 42875 + }, + { + "epoch": 4.717271727172717, + "grad_norm": 0.03540852665901184, + "learning_rate": 4.724106746429191e-05, + "loss": 0.0177, + "num_input_tokens_seen": 9047648, + "step": 42880 + }, + { + "epoch": 4.717821782178218, + "grad_norm": 0.4746801257133484, + "learning_rate": 4.7239971353406936e-05, + "loss": 0.0136, + "num_input_tokens_seen": 9048672, + "step": 42885 + }, + { + "epoch": 4.718371837183718, + "grad_norm": 2.6625473499298096, + "learning_rate": 4.723887503754681e-05, + "loss": 0.1322, + "num_input_tokens_seen": 9049728, + "step": 42890 + }, + { + "epoch": 4.718921892189218, + "grad_norm": 1.1852294206619263, + "learning_rate": 4.723777851672165e-05, + "loss": 0.0538, + "num_input_tokens_seen": 9050784, + "step": 42895 + }, + { + "epoch": 4.7194719471947195, + "grad_norm": 0.5684489011764526, + "learning_rate": 4.7236681790941554e-05, + "loss": 0.1245, + "num_input_tokens_seen": 9051872, + "step": 42900 + }, + { + "epoch": 4.72002200220022, + "grad_norm": 0.6846001148223877, + "learning_rate": 4.723558486021663e-05, + "loss": 0.0325, + "num_input_tokens_seen": 9052864, + "step": 42905 + }, + { + "epoch": 4.720572057205721, + "grad_norm": 1.939063310623169, + "learning_rate": 4.7234487724557e-05, + "loss": 0.0948, + "num_input_tokens_seen": 9053856, + "step": 42910 + }, + { + "epoch": 4.721122112211221, + "grad_norm": 0.04540756717324257, + "learning_rate": 4.7233390383972754e-05, + "loss": 0.1433, + "num_input_tokens_seen": 9054912, + "step": 42915 + }, + { + "epoch": 4.721672167216722, + "grad_norm": 1.2121564149856567, + "learning_rate": 4.723229283847402e-05, + "loss": 0.0263, + "num_input_tokens_seen": 9056000, + "step": 42920 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.04408576339483261, + "learning_rate": 4.723119508807091e-05, + "loss": 0.0343, + "num_input_tokens_seen": 9057088, + "step": 42925 + }, + { + "epoch": 4.7227722772277225, + "grad_norm": 0.4521125853061676, + "learning_rate": 4.7230097132773545e-05, + "loss": 0.0657, + "num_input_tokens_seen": 9058144, + "step": 42930 + }, + { + "epoch": 4.723322332233224, + "grad_norm": 0.06742984801530838, + "learning_rate": 4.722899897259204e-05, + "loss": 0.0465, + "num_input_tokens_seen": 9059264, + "step": 42935 + }, + { + "epoch": 4.723872387238724, + "grad_norm": 0.031279757618904114, + "learning_rate": 4.7227900607536525e-05, + "loss": 0.0727, + "num_input_tokens_seen": 9060256, + "step": 42940 + }, + { + "epoch": 4.724422442244224, + "grad_norm": 0.616716742515564, + "learning_rate": 4.7226802037617105e-05, + "loss": 0.1143, + "num_input_tokens_seen": 9061344, + "step": 42945 + }, + { + "epoch": 4.724972497249725, + "grad_norm": 0.25487762689590454, + "learning_rate": 4.7225703262843925e-05, + "loss": 0.0326, + "num_input_tokens_seen": 9062336, + "step": 42950 + }, + { + "epoch": 4.725522552255225, + "grad_norm": 0.12567761540412903, + "learning_rate": 4.7224604283227095e-05, + "loss": 0.016, + "num_input_tokens_seen": 9063328, + "step": 42955 + }, + { + "epoch": 4.726072607260726, + "grad_norm": 0.559309184551239, + "learning_rate": 4.722350509877676e-05, + "loss": 0.163, + "num_input_tokens_seen": 9064384, + "step": 42960 + }, + { + "epoch": 4.726622662266227, + "grad_norm": 1.6324927806854248, + "learning_rate": 4.722240570950304e-05, + "loss": 0.1064, + "num_input_tokens_seen": 9065408, + "step": 42965 + }, + { + "epoch": 4.727172717271727, + "grad_norm": 0.04793215170502663, + "learning_rate": 4.722130611541606e-05, + "loss": 0.0722, + "num_input_tokens_seen": 9066432, + "step": 42970 + }, + { + "epoch": 4.727722772277228, + "grad_norm": 0.12987641990184784, + "learning_rate": 4.7220206316525976e-05, + "loss": 0.0081, + "num_input_tokens_seen": 9067520, + "step": 42975 + }, + { + "epoch": 4.728272827282728, + "grad_norm": 0.5941084623336792, + "learning_rate": 4.721910631284291e-05, + "loss": 0.032, + "num_input_tokens_seen": 9068608, + "step": 42980 + }, + { + "epoch": 4.728822882288229, + "grad_norm": 0.2957055866718292, + "learning_rate": 4.721800610437701e-05, + "loss": 0.0226, + "num_input_tokens_seen": 9069760, + "step": 42985 + }, + { + "epoch": 4.729372937293729, + "grad_norm": 0.09331435710191727, + "learning_rate": 4.7216905691138396e-05, + "loss": 0.0059, + "num_input_tokens_seen": 9070880, + "step": 42990 + }, + { + "epoch": 4.72992299229923, + "grad_norm": 0.09673750400543213, + "learning_rate": 4.721580507313723e-05, + "loss": 0.0679, + "num_input_tokens_seen": 9071968, + "step": 42995 + }, + { + "epoch": 4.730473047304731, + "grad_norm": 0.03869984671473503, + "learning_rate": 4.721470425038365e-05, + "loss": 0.04, + "num_input_tokens_seen": 9072960, + "step": 43000 + }, + { + "epoch": 4.731023102310231, + "grad_norm": 1.4511535167694092, + "learning_rate": 4.7213603222887795e-05, + "loss": 0.1045, + "num_input_tokens_seen": 9074048, + "step": 43005 + }, + { + "epoch": 4.731573157315731, + "grad_norm": 0.4988265335559845, + "learning_rate": 4.7212501990659816e-05, + "loss": 0.0271, + "num_input_tokens_seen": 9075072, + "step": 43010 + }, + { + "epoch": 4.732123212321232, + "grad_norm": 0.5675687193870544, + "learning_rate": 4.721140055370987e-05, + "loss": 0.0237, + "num_input_tokens_seen": 9076096, + "step": 43015 + }, + { + "epoch": 4.732673267326732, + "grad_norm": 0.042606692761182785, + "learning_rate": 4.7210298912048105e-05, + "loss": 0.0641, + "num_input_tokens_seen": 9077184, + "step": 43020 + }, + { + "epoch": 4.7332233223322335, + "grad_norm": 0.15381377935409546, + "learning_rate": 4.7209197065684675e-05, + "loss": 0.0624, + "num_input_tokens_seen": 9078240, + "step": 43025 + }, + { + "epoch": 4.733773377337734, + "grad_norm": 0.01964070461690426, + "learning_rate": 4.720809501462973e-05, + "loss": 0.017, + "num_input_tokens_seen": 9079200, + "step": 43030 + }, + { + "epoch": 4.734323432343234, + "grad_norm": 0.2386712282896042, + "learning_rate": 4.720699275889342e-05, + "loss": 0.0486, + "num_input_tokens_seen": 9080288, + "step": 43035 + }, + { + "epoch": 4.734873487348735, + "grad_norm": 1.162781000137329, + "learning_rate": 4.720589029848592e-05, + "loss": 0.0948, + "num_input_tokens_seen": 9081408, + "step": 43040 + }, + { + "epoch": 4.735423542354235, + "grad_norm": 0.031075619161128998, + "learning_rate": 4.7204787633417394e-05, + "loss": 0.0236, + "num_input_tokens_seen": 9082496, + "step": 43045 + }, + { + "epoch": 4.735973597359736, + "grad_norm": 0.020609157159924507, + "learning_rate": 4.720368476369798e-05, + "loss": 0.1834, + "num_input_tokens_seen": 9083584, + "step": 43050 + }, + { + "epoch": 4.7365236523652365, + "grad_norm": 0.058324508368968964, + "learning_rate": 4.7202581689337865e-05, + "loss": 0.0066, + "num_input_tokens_seen": 9084576, + "step": 43055 + }, + { + "epoch": 4.737073707370737, + "grad_norm": 0.29262417554855347, + "learning_rate": 4.720147841034721e-05, + "loss": 0.056, + "num_input_tokens_seen": 9085632, + "step": 43060 + }, + { + "epoch": 4.737623762376238, + "grad_norm": 0.14519882202148438, + "learning_rate": 4.720037492673618e-05, + "loss": 0.1396, + "num_input_tokens_seen": 9086656, + "step": 43065 + }, + { + "epoch": 4.738173817381738, + "grad_norm": 0.11152853071689606, + "learning_rate": 4.7199271238514945e-05, + "loss": 0.0333, + "num_input_tokens_seen": 9087776, + "step": 43070 + }, + { + "epoch": 4.738723872387238, + "grad_norm": 0.5464486479759216, + "learning_rate": 4.719816734569369e-05, + "loss": 0.0427, + "num_input_tokens_seen": 9088832, + "step": 43075 + }, + { + "epoch": 4.739273927392739, + "grad_norm": 0.1373472958803177, + "learning_rate": 4.719706324828256e-05, + "loss": 0.017, + "num_input_tokens_seen": 9089952, + "step": 43080 + }, + { + "epoch": 4.7398239823982395, + "grad_norm": 0.025704175233840942, + "learning_rate": 4.719595894629176e-05, + "loss": 0.0495, + "num_input_tokens_seen": 9090976, + "step": 43085 + }, + { + "epoch": 4.740374037403741, + "grad_norm": 0.8391005396842957, + "learning_rate": 4.719485443973146e-05, + "loss": 0.0353, + "num_input_tokens_seen": 9092032, + "step": 43090 + }, + { + "epoch": 4.740924092409241, + "grad_norm": 1.1931151151657104, + "learning_rate": 4.7193749728611835e-05, + "loss": 0.1079, + "num_input_tokens_seen": 9093024, + "step": 43095 + }, + { + "epoch": 4.741474147414742, + "grad_norm": 0.295224666595459, + "learning_rate": 4.719264481294306e-05, + "loss": 0.0815, + "num_input_tokens_seen": 9094144, + "step": 43100 + }, + { + "epoch": 4.742024202420242, + "grad_norm": 0.8702100515365601, + "learning_rate": 4.7191539692735336e-05, + "loss": 0.0584, + "num_input_tokens_seen": 9095200, + "step": 43105 + }, + { + "epoch": 4.742574257425742, + "grad_norm": 1.0382411479949951, + "learning_rate": 4.719043436799884e-05, + "loss": 0.133, + "num_input_tokens_seen": 9096288, + "step": 43110 + }, + { + "epoch": 4.743124312431243, + "grad_norm": 0.7478064298629761, + "learning_rate": 4.718932883874375e-05, + "loss": 0.0352, + "num_input_tokens_seen": 9097312, + "step": 43115 + }, + { + "epoch": 4.743674367436744, + "grad_norm": 0.6453801989555359, + "learning_rate": 4.718822310498027e-05, + "loss": 0.0293, + "num_input_tokens_seen": 9098368, + "step": 43120 + }, + { + "epoch": 4.744224422442244, + "grad_norm": 0.384696900844574, + "learning_rate": 4.718711716671859e-05, + "loss": 0.0411, + "num_input_tokens_seen": 9099328, + "step": 43125 + }, + { + "epoch": 4.744774477447745, + "grad_norm": 0.7726055383682251, + "learning_rate": 4.7186011023968886e-05, + "loss": 0.0698, + "num_input_tokens_seen": 9100320, + "step": 43130 + }, + { + "epoch": 4.745324532453245, + "grad_norm": 0.032393645495176315, + "learning_rate": 4.7184904676741365e-05, + "loss": 0.0276, + "num_input_tokens_seen": 9101312, + "step": 43135 + }, + { + "epoch": 4.745874587458746, + "grad_norm": 0.6767167448997498, + "learning_rate": 4.718379812504623e-05, + "loss": 0.0779, + "num_input_tokens_seen": 9102368, + "step": 43140 + }, + { + "epoch": 4.7464246424642464, + "grad_norm": 0.2287270426750183, + "learning_rate": 4.7182691368893664e-05, + "loss": 0.0489, + "num_input_tokens_seen": 9103424, + "step": 43145 + }, + { + "epoch": 4.746974697469747, + "grad_norm": 1.0394748449325562, + "learning_rate": 4.718158440829388e-05, + "loss": 0.1286, + "num_input_tokens_seen": 9104512, + "step": 43150 + }, + { + "epoch": 4.747524752475248, + "grad_norm": 0.181489035487175, + "learning_rate": 4.7180477243257085e-05, + "loss": 0.2594, + "num_input_tokens_seen": 9105600, + "step": 43155 + }, + { + "epoch": 4.748074807480748, + "grad_norm": 4.63646125793457, + "learning_rate": 4.717936987379347e-05, + "loss": 0.108, + "num_input_tokens_seen": 9106688, + "step": 43160 + }, + { + "epoch": 4.748624862486249, + "grad_norm": 0.3080579936504364, + "learning_rate": 4.7178262299913236e-05, + "loss": 0.0657, + "num_input_tokens_seen": 9107808, + "step": 43165 + }, + { + "epoch": 4.749174917491749, + "grad_norm": 0.22711169719696045, + "learning_rate": 4.717715452162661e-05, + "loss": 0.0662, + "num_input_tokens_seen": 9108864, + "step": 43170 + }, + { + "epoch": 4.7497249724972495, + "grad_norm": 0.023616701364517212, + "learning_rate": 4.7176046538943784e-05, + "loss": 0.1061, + "num_input_tokens_seen": 9109920, + "step": 43175 + }, + { + "epoch": 4.7502750275027505, + "grad_norm": 0.7656006813049316, + "learning_rate": 4.717493835187499e-05, + "loss": 0.1083, + "num_input_tokens_seen": 9110944, + "step": 43180 + }, + { + "epoch": 4.750825082508251, + "grad_norm": 0.2281656265258789, + "learning_rate": 4.7173829960430416e-05, + "loss": 0.0168, + "num_input_tokens_seen": 9112000, + "step": 43185 + }, + { + "epoch": 4.751375137513751, + "grad_norm": 0.3234347999095917, + "learning_rate": 4.7172721364620295e-05, + "loss": 0.0419, + "num_input_tokens_seen": 9112992, + "step": 43190 + }, + { + "epoch": 4.751925192519252, + "grad_norm": 1.1597129106521606, + "learning_rate": 4.717161256445485e-05, + "loss": 0.1357, + "num_input_tokens_seen": 9114048, + "step": 43195 + }, + { + "epoch": 4.752475247524752, + "grad_norm": 0.10564141720533371, + "learning_rate": 4.717050355994428e-05, + "loss": 0.0463, + "num_input_tokens_seen": 9115232, + "step": 43200 + }, + { + "epoch": 4.753025302530253, + "grad_norm": 0.16708412766456604, + "learning_rate": 4.716939435109883e-05, + "loss": 0.0445, + "num_input_tokens_seen": 9116288, + "step": 43205 + }, + { + "epoch": 4.7535753575357536, + "grad_norm": 0.07134690880775452, + "learning_rate": 4.71682849379287e-05, + "loss": 0.0214, + "num_input_tokens_seen": 9117344, + "step": 43210 + }, + { + "epoch": 4.754125412541254, + "grad_norm": 0.1811354011297226, + "learning_rate": 4.716717532044412e-05, + "loss": 0.0642, + "num_input_tokens_seen": 9118432, + "step": 43215 + }, + { + "epoch": 4.754675467546755, + "grad_norm": 0.57936692237854, + "learning_rate": 4.7166065498655336e-05, + "loss": 0.0755, + "num_input_tokens_seen": 9119552, + "step": 43220 + }, + { + "epoch": 4.755225522552255, + "grad_norm": 1.2223329544067383, + "learning_rate": 4.716495547257256e-05, + "loss": 0.0418, + "num_input_tokens_seen": 9120576, + "step": 43225 + }, + { + "epoch": 4.755775577557756, + "grad_norm": 1.4852705001831055, + "learning_rate": 4.7163845242206016e-05, + "loss": 0.1039, + "num_input_tokens_seen": 9121664, + "step": 43230 + }, + { + "epoch": 4.756325632563256, + "grad_norm": 0.019784852862358093, + "learning_rate": 4.7162734807565957e-05, + "loss": 0.0246, + "num_input_tokens_seen": 9122688, + "step": 43235 + }, + { + "epoch": 4.756875687568757, + "grad_norm": 0.25536930561065674, + "learning_rate": 4.71616241686626e-05, + "loss": 0.0958, + "num_input_tokens_seen": 9123744, + "step": 43240 + }, + { + "epoch": 4.757425742574258, + "grad_norm": 1.5686825513839722, + "learning_rate": 4.716051332550618e-05, + "loss": 0.0772, + "num_input_tokens_seen": 9124768, + "step": 43245 + }, + { + "epoch": 4.757975797579758, + "grad_norm": 0.6283181309700012, + "learning_rate": 4.715940227810695e-05, + "loss": 0.0926, + "num_input_tokens_seen": 9125888, + "step": 43250 + }, + { + "epoch": 4.758525852585258, + "grad_norm": 0.2650257647037506, + "learning_rate": 4.715829102647514e-05, + "loss": 0.0165, + "num_input_tokens_seen": 9126880, + "step": 43255 + }, + { + "epoch": 4.759075907590759, + "grad_norm": 0.04632003977894783, + "learning_rate": 4.7157179570621e-05, + "loss": 0.0215, + "num_input_tokens_seen": 9127904, + "step": 43260 + }, + { + "epoch": 4.759625962596259, + "grad_norm": 0.35375669598579407, + "learning_rate": 4.715606791055477e-05, + "loss": 0.0199, + "num_input_tokens_seen": 9129024, + "step": 43265 + }, + { + "epoch": 4.7601760176017605, + "grad_norm": 0.17834046483039856, + "learning_rate": 4.715495604628668e-05, + "loss": 0.0116, + "num_input_tokens_seen": 9130080, + "step": 43270 + }, + { + "epoch": 4.760726072607261, + "grad_norm": 0.16291143000125885, + "learning_rate": 4.7153843977827e-05, + "loss": 0.0434, + "num_input_tokens_seen": 9131104, + "step": 43275 + }, + { + "epoch": 4.761276127612762, + "grad_norm": 0.09118811786174774, + "learning_rate": 4.715273170518597e-05, + "loss": 0.0064, + "num_input_tokens_seen": 9132160, + "step": 43280 + }, + { + "epoch": 4.761826182618262, + "grad_norm": 0.040306415408849716, + "learning_rate": 4.715161922837384e-05, + "loss": 0.0719, + "num_input_tokens_seen": 9133216, + "step": 43285 + }, + { + "epoch": 4.762376237623762, + "grad_norm": 0.008403556421399117, + "learning_rate": 4.715050654740086e-05, + "loss": 0.1009, + "num_input_tokens_seen": 9134272, + "step": 43290 + }, + { + "epoch": 4.762926292629263, + "grad_norm": 0.0455569289624691, + "learning_rate": 4.71493936622773e-05, + "loss": 0.0991, + "num_input_tokens_seen": 9135296, + "step": 43295 + }, + { + "epoch": 4.7634763476347635, + "grad_norm": 0.24566665291786194, + "learning_rate": 4.71482805730134e-05, + "loss": 0.0134, + "num_input_tokens_seen": 9136320, + "step": 43300 + }, + { + "epoch": 4.764026402640264, + "grad_norm": 0.26042434573173523, + "learning_rate": 4.714716727961943e-05, + "loss": 0.0179, + "num_input_tokens_seen": 9137408, + "step": 43305 + }, + { + "epoch": 4.764576457645765, + "grad_norm": 0.5838744044303894, + "learning_rate": 4.7146053782105646e-05, + "loss": 0.1576, + "num_input_tokens_seen": 9138432, + "step": 43310 + }, + { + "epoch": 4.765126512651265, + "grad_norm": 0.09730280935764313, + "learning_rate": 4.714494008048231e-05, + "loss": 0.0396, + "num_input_tokens_seen": 9139552, + "step": 43315 + }, + { + "epoch": 4.765676567656766, + "grad_norm": 0.8524450659751892, + "learning_rate": 4.714382617475969e-05, + "loss": 0.044, + "num_input_tokens_seen": 9140576, + "step": 43320 + }, + { + "epoch": 4.766226622662266, + "grad_norm": 0.4297909140586853, + "learning_rate": 4.7142712064948045e-05, + "loss": 0.0961, + "num_input_tokens_seen": 9141632, + "step": 43325 + }, + { + "epoch": 4.7667766776677665, + "grad_norm": 0.28986889123916626, + "learning_rate": 4.714159775105765e-05, + "loss": 0.0683, + "num_input_tokens_seen": 9142688, + "step": 43330 + }, + { + "epoch": 4.767326732673268, + "grad_norm": 0.2240915149450302, + "learning_rate": 4.714048323309878e-05, + "loss": 0.0346, + "num_input_tokens_seen": 9143776, + "step": 43335 + }, + { + "epoch": 4.767876787678768, + "grad_norm": 0.028399577364325523, + "learning_rate": 4.7139368511081686e-05, + "loss": 0.0334, + "num_input_tokens_seen": 9144864, + "step": 43340 + }, + { + "epoch": 4.768426842684269, + "grad_norm": 0.03473633527755737, + "learning_rate": 4.713825358501667e-05, + "loss": 0.0778, + "num_input_tokens_seen": 9145920, + "step": 43345 + }, + { + "epoch": 4.768976897689769, + "grad_norm": 0.08829179406166077, + "learning_rate": 4.713713845491399e-05, + "loss": 0.0969, + "num_input_tokens_seen": 9146912, + "step": 43350 + }, + { + "epoch": 4.769526952695269, + "grad_norm": 0.31545862555503845, + "learning_rate": 4.713602312078392e-05, + "loss": 0.0314, + "num_input_tokens_seen": 9147968, + "step": 43355 + }, + { + "epoch": 4.77007700770077, + "grad_norm": 0.11992410570383072, + "learning_rate": 4.713490758263675e-05, + "loss": 0.0406, + "num_input_tokens_seen": 9149024, + "step": 43360 + }, + { + "epoch": 4.770627062706271, + "grad_norm": 0.721354603767395, + "learning_rate": 4.7133791840482755e-05, + "loss": 0.0329, + "num_input_tokens_seen": 9150048, + "step": 43365 + }, + { + "epoch": 4.771177117711771, + "grad_norm": 0.20980215072631836, + "learning_rate": 4.713267589433223e-05, + "loss": 0.0556, + "num_input_tokens_seen": 9151104, + "step": 43370 + }, + { + "epoch": 4.771727172717272, + "grad_norm": 0.7281913757324219, + "learning_rate": 4.7131559744195445e-05, + "loss": 0.1376, + "num_input_tokens_seen": 9152160, + "step": 43375 + }, + { + "epoch": 4.772277227722772, + "grad_norm": 0.07512117177248001, + "learning_rate": 4.71304433900827e-05, + "loss": 0.1875, + "num_input_tokens_seen": 9153152, + "step": 43380 + }, + { + "epoch": 4.772827282728273, + "grad_norm": 1.3775628805160522, + "learning_rate": 4.712932683200427e-05, + "loss": 0.1542, + "num_input_tokens_seen": 9154240, + "step": 43385 + }, + { + "epoch": 4.773377337733773, + "grad_norm": 0.15702372789382935, + "learning_rate": 4.712821006997046e-05, + "loss": 0.0174, + "num_input_tokens_seen": 9155296, + "step": 43390 + }, + { + "epoch": 4.773927392739274, + "grad_norm": 0.893314003944397, + "learning_rate": 4.712709310399155e-05, + "loss": 0.0789, + "num_input_tokens_seen": 9156384, + "step": 43395 + }, + { + "epoch": 4.774477447744775, + "grad_norm": 0.12408535182476044, + "learning_rate": 4.712597593407784e-05, + "loss": 0.0232, + "num_input_tokens_seen": 9157472, + "step": 43400 + }, + { + "epoch": 4.775027502750275, + "grad_norm": 1.0967152118682861, + "learning_rate": 4.712485856023963e-05, + "loss": 0.0501, + "num_input_tokens_seen": 9158560, + "step": 43405 + }, + { + "epoch": 4.775577557755776, + "grad_norm": 0.029505809769034386, + "learning_rate": 4.712374098248722e-05, + "loss": 0.0495, + "num_input_tokens_seen": 9159680, + "step": 43410 + }, + { + "epoch": 4.776127612761276, + "grad_norm": 0.6836159229278564, + "learning_rate": 4.712262320083089e-05, + "loss": 0.0729, + "num_input_tokens_seen": 9160736, + "step": 43415 + }, + { + "epoch": 4.776677667766776, + "grad_norm": 0.04883578419685364, + "learning_rate": 4.712150521528098e-05, + "loss": 0.0185, + "num_input_tokens_seen": 9161792, + "step": 43420 + }, + { + "epoch": 4.7772277227722775, + "grad_norm": 0.026051674038171768, + "learning_rate": 4.7120387025847754e-05, + "loss": 0.0392, + "num_input_tokens_seen": 9162816, + "step": 43425 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.0177143644541502, + "learning_rate": 4.711926863254154e-05, + "loss": 0.016, + "num_input_tokens_seen": 9163936, + "step": 43430 + }, + { + "epoch": 4.778327832783278, + "grad_norm": 1.5259932279586792, + "learning_rate": 4.711815003537264e-05, + "loss": 0.0421, + "num_input_tokens_seen": 9165056, + "step": 43435 + }, + { + "epoch": 4.778877887788779, + "grad_norm": 0.9319519996643066, + "learning_rate": 4.711703123435137e-05, + "loss": 0.1035, + "num_input_tokens_seen": 9166144, + "step": 43440 + }, + { + "epoch": 4.779427942794279, + "grad_norm": 0.12209673225879669, + "learning_rate": 4.711591222948803e-05, + "loss": 0.0178, + "num_input_tokens_seen": 9167200, + "step": 43445 + }, + { + "epoch": 4.77997799779978, + "grad_norm": 1.0186885595321655, + "learning_rate": 4.7114793020792944e-05, + "loss": 0.0222, + "num_input_tokens_seen": 9168256, + "step": 43450 + }, + { + "epoch": 4.7805280528052805, + "grad_norm": 0.07518190145492554, + "learning_rate": 4.711367360827642e-05, + "loss": 0.1033, + "num_input_tokens_seen": 9169312, + "step": 43455 + }, + { + "epoch": 4.781078107810782, + "grad_norm": 0.1704457402229309, + "learning_rate": 4.711255399194877e-05, + "loss": 0.1235, + "num_input_tokens_seen": 9170464, + "step": 43460 + }, + { + "epoch": 4.781628162816282, + "grad_norm": 0.7107109427452087, + "learning_rate": 4.711143417182033e-05, + "loss": 0.0338, + "num_input_tokens_seen": 9171488, + "step": 43465 + }, + { + "epoch": 4.782178217821782, + "grad_norm": 1.0212947130203247, + "learning_rate": 4.711031414790141e-05, + "loss": 0.1197, + "num_input_tokens_seen": 9172512, + "step": 43470 + }, + { + "epoch": 4.782728272827283, + "grad_norm": 0.43895989656448364, + "learning_rate": 4.710919392020233e-05, + "loss": 0.0736, + "num_input_tokens_seen": 9173504, + "step": 43475 + }, + { + "epoch": 4.783278327832783, + "grad_norm": 0.08328410238027573, + "learning_rate": 4.710807348873342e-05, + "loss": 0.0749, + "num_input_tokens_seen": 9174624, + "step": 43480 + }, + { + "epoch": 4.7838283828382835, + "grad_norm": 0.4374995231628418, + "learning_rate": 4.710695285350501e-05, + "loss": 0.0551, + "num_input_tokens_seen": 9175584, + "step": 43485 + }, + { + "epoch": 4.784378437843785, + "grad_norm": 0.4212195575237274, + "learning_rate": 4.710583201452742e-05, + "loss": 0.0809, + "num_input_tokens_seen": 9176672, + "step": 43490 + }, + { + "epoch": 4.784928492849285, + "grad_norm": 0.09666819125413895, + "learning_rate": 4.710471097181098e-05, + "loss": 0.0249, + "num_input_tokens_seen": 9177760, + "step": 43495 + }, + { + "epoch": 4.785478547854785, + "grad_norm": 0.06629149615764618, + "learning_rate": 4.710358972536603e-05, + "loss": 0.0847, + "num_input_tokens_seen": 9178816, + "step": 43500 + }, + { + "epoch": 4.786028602860286, + "grad_norm": 0.5433341860771179, + "learning_rate": 4.7102468275202906e-05, + "loss": 0.0642, + "num_input_tokens_seen": 9179840, + "step": 43505 + }, + { + "epoch": 4.786578657865786, + "grad_norm": 0.28996631503105164, + "learning_rate": 4.7101346621331926e-05, + "loss": 0.0174, + "num_input_tokens_seen": 9180896, + "step": 43510 + }, + { + "epoch": 4.787128712871287, + "grad_norm": 1.5049386024475098, + "learning_rate": 4.710022476376345e-05, + "loss": 0.0596, + "num_input_tokens_seen": 9181984, + "step": 43515 + }, + { + "epoch": 4.787678767876788, + "grad_norm": 0.22919948399066925, + "learning_rate": 4.70991027025078e-05, + "loss": 0.0537, + "num_input_tokens_seen": 9183008, + "step": 43520 + }, + { + "epoch": 4.788228822882289, + "grad_norm": 0.9789348244667053, + "learning_rate": 4.7097980437575324e-05, + "loss": 0.0964, + "num_input_tokens_seen": 9184000, + "step": 43525 + }, + { + "epoch": 4.788778877887789, + "grad_norm": 0.03546712175011635, + "learning_rate": 4.709685796897637e-05, + "loss": 0.0096, + "num_input_tokens_seen": 9185024, + "step": 43530 + }, + { + "epoch": 4.789328932893289, + "grad_norm": 0.16268517076969147, + "learning_rate": 4.709573529672128e-05, + "loss": 0.0382, + "num_input_tokens_seen": 9186112, + "step": 43535 + }, + { + "epoch": 4.78987898789879, + "grad_norm": 0.6813090443611145, + "learning_rate": 4.7094612420820396e-05, + "loss": 0.0322, + "num_input_tokens_seen": 9187136, + "step": 43540 + }, + { + "epoch": 4.79042904290429, + "grad_norm": 0.39866071939468384, + "learning_rate": 4.709348934128407e-05, + "loss": 0.0335, + "num_input_tokens_seen": 9188128, + "step": 43545 + }, + { + "epoch": 4.790979097909791, + "grad_norm": 0.020764442160725594, + "learning_rate": 4.709236605812266e-05, + "loss": 0.019, + "num_input_tokens_seen": 9189216, + "step": 43550 + }, + { + "epoch": 4.791529152915292, + "grad_norm": 0.45441874861717224, + "learning_rate": 4.709124257134651e-05, + "loss": 0.0322, + "num_input_tokens_seen": 9190176, + "step": 43555 + }, + { + "epoch": 4.792079207920792, + "grad_norm": 0.022305071353912354, + "learning_rate": 4.709011888096597e-05, + "loss": 0.1131, + "num_input_tokens_seen": 9191232, + "step": 43560 + }, + { + "epoch": 4.792629262926293, + "grad_norm": 0.966113805770874, + "learning_rate": 4.708899498699142e-05, + "loss": 0.1252, + "num_input_tokens_seen": 9192288, + "step": 43565 + }, + { + "epoch": 4.793179317931793, + "grad_norm": 0.07574526965618134, + "learning_rate": 4.708787088943319e-05, + "loss": 0.0959, + "num_input_tokens_seen": 9193344, + "step": 43570 + }, + { + "epoch": 4.793729372937293, + "grad_norm": 0.08221647888422012, + "learning_rate": 4.708674658830166e-05, + "loss": 0.012, + "num_input_tokens_seen": 9194400, + "step": 43575 + }, + { + "epoch": 4.7942794279427945, + "grad_norm": 0.15093563497066498, + "learning_rate": 4.708562208360718e-05, + "loss": 0.0459, + "num_input_tokens_seen": 9195456, + "step": 43580 + }, + { + "epoch": 4.794829482948295, + "grad_norm": 1.256521463394165, + "learning_rate": 4.708449737536013e-05, + "loss": 0.0822, + "num_input_tokens_seen": 9196480, + "step": 43585 + }, + { + "epoch": 4.795379537953796, + "grad_norm": 0.004905825015157461, + "learning_rate": 4.708337246357085e-05, + "loss": 0.0275, + "num_input_tokens_seen": 9197536, + "step": 43590 + }, + { + "epoch": 4.795929592959296, + "grad_norm": 0.00979683082550764, + "learning_rate": 4.708224734824973e-05, + "loss": 0.0047, + "num_input_tokens_seen": 9198624, + "step": 43595 + }, + { + "epoch": 4.796479647964796, + "grad_norm": 0.13148191571235657, + "learning_rate": 4.708112202940713e-05, + "loss": 0.0992, + "num_input_tokens_seen": 9199648, + "step": 43600 + }, + { + "epoch": 4.797029702970297, + "grad_norm": 0.03523515537381172, + "learning_rate": 4.707999650705342e-05, + "loss": 0.0188, + "num_input_tokens_seen": 9200768, + "step": 43605 + }, + { + "epoch": 4.7975797579757975, + "grad_norm": 0.4837287664413452, + "learning_rate": 4.7078870781198984e-05, + "loss": 0.0172, + "num_input_tokens_seen": 9201824, + "step": 43610 + }, + { + "epoch": 4.798129812981298, + "grad_norm": 1.5036332607269287, + "learning_rate": 4.707774485185419e-05, + "loss": 0.0664, + "num_input_tokens_seen": 9202880, + "step": 43615 + }, + { + "epoch": 4.798679867986799, + "grad_norm": 0.4531814157962799, + "learning_rate": 4.707661871902942e-05, + "loss": 0.0373, + "num_input_tokens_seen": 9203936, + "step": 43620 + }, + { + "epoch": 4.799229922992299, + "grad_norm": 1.397674322128296, + "learning_rate": 4.707549238273504e-05, + "loss": 0.0254, + "num_input_tokens_seen": 9205088, + "step": 43625 + }, + { + "epoch": 4.7997799779978, + "grad_norm": 1.5637762546539307, + "learning_rate": 4.7074365842981435e-05, + "loss": 0.1468, + "num_input_tokens_seen": 9206144, + "step": 43630 + }, + { + "epoch": 4.8003300330033, + "grad_norm": 0.03828790411353111, + "learning_rate": 4.707323909977901e-05, + "loss": 0.0096, + "num_input_tokens_seen": 9207232, + "step": 43635 + }, + { + "epoch": 4.8008800880088005, + "grad_norm": 0.016539065167307854, + "learning_rate": 4.7072112153138114e-05, + "loss": 0.0988, + "num_input_tokens_seen": 9208320, + "step": 43640 + }, + { + "epoch": 4.801430143014302, + "grad_norm": 0.591378927230835, + "learning_rate": 4.707098500306916e-05, + "loss": 0.0419, + "num_input_tokens_seen": 9209440, + "step": 43645 + }, + { + "epoch": 4.801980198019802, + "grad_norm": 0.15395285189151764, + "learning_rate": 4.706985764958253e-05, + "loss": 0.0114, + "num_input_tokens_seen": 9210464, + "step": 43650 + }, + { + "epoch": 4.802530253025303, + "grad_norm": 0.21417851746082306, + "learning_rate": 4.706873009268861e-05, + "loss": 0.0587, + "num_input_tokens_seen": 9211584, + "step": 43655 + }, + { + "epoch": 4.803080308030803, + "grad_norm": 0.025236409157514572, + "learning_rate": 4.70676023323978e-05, + "loss": 0.081, + "num_input_tokens_seen": 9212704, + "step": 43660 + }, + { + "epoch": 4.803630363036303, + "grad_norm": 0.05137490853667259, + "learning_rate": 4.706647436872048e-05, + "loss": 0.0606, + "num_input_tokens_seen": 9213760, + "step": 43665 + }, + { + "epoch": 4.804180418041804, + "grad_norm": 0.46352508664131165, + "learning_rate": 4.706534620166705e-05, + "loss": 0.0439, + "num_input_tokens_seen": 9214880, + "step": 43670 + }, + { + "epoch": 4.804730473047305, + "grad_norm": 0.03075544349849224, + "learning_rate": 4.706421783124792e-05, + "loss": 0.0171, + "num_input_tokens_seen": 9215872, + "step": 43675 + }, + { + "epoch": 4.805280528052805, + "grad_norm": 0.07058189064264297, + "learning_rate": 4.706308925747348e-05, + "loss": 0.0145, + "num_input_tokens_seen": 9216928, + "step": 43680 + }, + { + "epoch": 4.805830583058306, + "grad_norm": 0.8355627059936523, + "learning_rate": 4.7061960480354134e-05, + "loss": 0.1115, + "num_input_tokens_seen": 9218016, + "step": 43685 + }, + { + "epoch": 4.806380638063806, + "grad_norm": 0.47375479340553284, + "learning_rate": 4.7060831499900285e-05, + "loss": 0.0836, + "num_input_tokens_seen": 9219040, + "step": 43690 + }, + { + "epoch": 4.806930693069307, + "grad_norm": 0.8795232772827148, + "learning_rate": 4.705970231612234e-05, + "loss": 0.0745, + "num_input_tokens_seen": 9220128, + "step": 43695 + }, + { + "epoch": 4.807480748074807, + "grad_norm": 0.049882326275110245, + "learning_rate": 4.70585729290307e-05, + "loss": 0.0807, + "num_input_tokens_seen": 9221120, + "step": 43700 + }, + { + "epoch": 4.8080308030803085, + "grad_norm": 0.7160853147506714, + "learning_rate": 4.7057443338635775e-05, + "loss": 0.0597, + "num_input_tokens_seen": 9222112, + "step": 43705 + }, + { + "epoch": 4.808580858085809, + "grad_norm": 0.5615253448486328, + "learning_rate": 4.7056313544947985e-05, + "loss": 0.0671, + "num_input_tokens_seen": 9223168, + "step": 43710 + }, + { + "epoch": 4.809130913091309, + "grad_norm": 0.07332387566566467, + "learning_rate": 4.705518354797773e-05, + "loss": 0.0122, + "num_input_tokens_seen": 9224256, + "step": 43715 + }, + { + "epoch": 4.80968096809681, + "grad_norm": 1.6065698862075806, + "learning_rate": 4.7054053347735436e-05, + "loss": 0.0851, + "num_input_tokens_seen": 9225312, + "step": 43720 + }, + { + "epoch": 4.81023102310231, + "grad_norm": 0.1307058483362198, + "learning_rate": 4.7052922944231514e-05, + "loss": 0.068, + "num_input_tokens_seen": 9226368, + "step": 43725 + }, + { + "epoch": 4.81078107810781, + "grad_norm": 0.9041957259178162, + "learning_rate": 4.7051792337476385e-05, + "loss": 0.0419, + "num_input_tokens_seen": 9227424, + "step": 43730 + }, + { + "epoch": 4.8113311331133115, + "grad_norm": 0.06943602114915848, + "learning_rate": 4.7050661527480464e-05, + "loss": 0.193, + "num_input_tokens_seen": 9228416, + "step": 43735 + }, + { + "epoch": 4.811881188118812, + "grad_norm": 0.04217718914151192, + "learning_rate": 4.704953051425418e-05, + "loss": 0.0915, + "num_input_tokens_seen": 9229472, + "step": 43740 + }, + { + "epoch": 4.812431243124313, + "grad_norm": 0.05868852138519287, + "learning_rate": 4.7048399297807954e-05, + "loss": 0.0272, + "num_input_tokens_seen": 9230496, + "step": 43745 + }, + { + "epoch": 4.812981298129813, + "grad_norm": 0.3483830988407135, + "learning_rate": 4.7047267878152215e-05, + "loss": 0.0857, + "num_input_tokens_seen": 9231520, + "step": 43750 + }, + { + "epoch": 4.813531353135313, + "grad_norm": 0.12410447746515274, + "learning_rate": 4.704613625529738e-05, + "loss": 0.0371, + "num_input_tokens_seen": 9232576, + "step": 43755 + }, + { + "epoch": 4.814081408140814, + "grad_norm": 0.21974295377731323, + "learning_rate": 4.7045004429253894e-05, + "loss": 0.0174, + "num_input_tokens_seen": 9233632, + "step": 43760 + }, + { + "epoch": 4.8146314631463145, + "grad_norm": 0.028659656643867493, + "learning_rate": 4.7043872400032176e-05, + "loss": 0.016, + "num_input_tokens_seen": 9234688, + "step": 43765 + }, + { + "epoch": 4.815181518151816, + "grad_norm": 0.03738462179899216, + "learning_rate": 4.704274016764266e-05, + "loss": 0.0161, + "num_input_tokens_seen": 9235712, + "step": 43770 + }, + { + "epoch": 4.815731573157316, + "grad_norm": 0.12799595296382904, + "learning_rate": 4.704160773209578e-05, + "loss": 0.1107, + "num_input_tokens_seen": 9236832, + "step": 43775 + }, + { + "epoch": 4.816281628162816, + "grad_norm": 0.6025366187095642, + "learning_rate": 4.7040475093401994e-05, + "loss": 0.0411, + "num_input_tokens_seen": 9237920, + "step": 43780 + }, + { + "epoch": 4.816831683168317, + "grad_norm": 0.01649629883468151, + "learning_rate": 4.703934225157171e-05, + "loss": 0.0244, + "num_input_tokens_seen": 9239008, + "step": 43785 + }, + { + "epoch": 4.817381738173817, + "grad_norm": 0.5238274931907654, + "learning_rate": 4.703820920661539e-05, + "loss": 0.0951, + "num_input_tokens_seen": 9240032, + "step": 43790 + }, + { + "epoch": 4.8179317931793175, + "grad_norm": 0.009616473689675331, + "learning_rate": 4.7037075958543475e-05, + "loss": 0.0256, + "num_input_tokens_seen": 9241120, + "step": 43795 + }, + { + "epoch": 4.818481848184819, + "grad_norm": 0.11645335704088211, + "learning_rate": 4.70359425073664e-05, + "loss": 0.1106, + "num_input_tokens_seen": 9242176, + "step": 43800 + }, + { + "epoch": 4.819031903190319, + "grad_norm": 0.09248317033052444, + "learning_rate": 4.703480885309462e-05, + "loss": 0.0321, + "num_input_tokens_seen": 9243264, + "step": 43805 + }, + { + "epoch": 4.81958195819582, + "grad_norm": 0.04151655733585358, + "learning_rate": 4.703367499573857e-05, + "loss": 0.0066, + "num_input_tokens_seen": 9244352, + "step": 43810 + }, + { + "epoch": 4.82013201320132, + "grad_norm": 0.10369370132684708, + "learning_rate": 4.7032540935308726e-05, + "loss": 0.1029, + "num_input_tokens_seen": 9245408, + "step": 43815 + }, + { + "epoch": 4.82068206820682, + "grad_norm": 0.2268248200416565, + "learning_rate": 4.703140667181552e-05, + "loss": 0.0083, + "num_input_tokens_seen": 9246400, + "step": 43820 + }, + { + "epoch": 4.821232123212321, + "grad_norm": 0.045297134667634964, + "learning_rate": 4.703027220526941e-05, + "loss": 0.0852, + "num_input_tokens_seen": 9247424, + "step": 43825 + }, + { + "epoch": 4.821782178217822, + "grad_norm": 0.7187598347663879, + "learning_rate": 4.702913753568084e-05, + "loss": 0.0477, + "num_input_tokens_seen": 9248448, + "step": 43830 + }, + { + "epoch": 4.822332233223323, + "grad_norm": 0.6049140095710754, + "learning_rate": 4.70280026630603e-05, + "loss": 0.0077, + "num_input_tokens_seen": 9249504, + "step": 43835 + }, + { + "epoch": 4.822882288228823, + "grad_norm": 1.4188830852508545, + "learning_rate": 4.7026867587418215e-05, + "loss": 0.0543, + "num_input_tokens_seen": 9250560, + "step": 43840 + }, + { + "epoch": 4.823432343234323, + "grad_norm": 1.1949297189712524, + "learning_rate": 4.702573230876507e-05, + "loss": 0.1181, + "num_input_tokens_seen": 9251648, + "step": 43845 + }, + { + "epoch": 4.823982398239824, + "grad_norm": 1.188595175743103, + "learning_rate": 4.7024596827111314e-05, + "loss": 0.1124, + "num_input_tokens_seen": 9252704, + "step": 43850 + }, + { + "epoch": 4.824532453245324, + "grad_norm": 0.055892281234264374, + "learning_rate": 4.702346114246742e-05, + "loss": 0.0263, + "num_input_tokens_seen": 9253728, + "step": 43855 + }, + { + "epoch": 4.825082508250825, + "grad_norm": 0.05740790069103241, + "learning_rate": 4.702232525484385e-05, + "loss": 0.0195, + "num_input_tokens_seen": 9254752, + "step": 43860 + }, + { + "epoch": 4.825632563256326, + "grad_norm": 0.08721187710762024, + "learning_rate": 4.702118916425108e-05, + "loss": 0.0909, + "num_input_tokens_seen": 9255840, + "step": 43865 + }, + { + "epoch": 4.826182618261826, + "grad_norm": 1.5255175828933716, + "learning_rate": 4.7020052870699566e-05, + "loss": 0.0644, + "num_input_tokens_seen": 9256960, + "step": 43870 + }, + { + "epoch": 4.826732673267327, + "grad_norm": 1.2576045989990234, + "learning_rate": 4.70189163741998e-05, + "loss": 0.1315, + "num_input_tokens_seen": 9257920, + "step": 43875 + }, + { + "epoch": 4.827282728272827, + "grad_norm": 0.13507221639156342, + "learning_rate": 4.701777967476224e-05, + "loss": 0.0139, + "num_input_tokens_seen": 9259040, + "step": 43880 + }, + { + "epoch": 4.827832783278328, + "grad_norm": 0.43711432814598083, + "learning_rate": 4.701664277239738e-05, + "loss": 0.023, + "num_input_tokens_seen": 9260096, + "step": 43885 + }, + { + "epoch": 4.8283828382838285, + "grad_norm": 0.04600189998745918, + "learning_rate": 4.701550566711568e-05, + "loss": 0.0216, + "num_input_tokens_seen": 9261152, + "step": 43890 + }, + { + "epoch": 4.828932893289329, + "grad_norm": 0.043807562440633774, + "learning_rate": 4.701436835892764e-05, + "loss": 0.0792, + "num_input_tokens_seen": 9262176, + "step": 43895 + }, + { + "epoch": 4.82948294829483, + "grad_norm": 0.06308382004499435, + "learning_rate": 4.7013230847843715e-05, + "loss": 0.0471, + "num_input_tokens_seen": 9263296, + "step": 43900 + }, + { + "epoch": 4.83003300330033, + "grad_norm": 0.7289270758628845, + "learning_rate": 4.7012093133874415e-05, + "loss": 0.1433, + "num_input_tokens_seen": 9264320, + "step": 43905 + }, + { + "epoch": 4.83058305830583, + "grad_norm": 0.2004159837961197, + "learning_rate": 4.701095521703021e-05, + "loss": 0.0264, + "num_input_tokens_seen": 9265344, + "step": 43910 + }, + { + "epoch": 4.831133113311331, + "grad_norm": 0.029208684340119362, + "learning_rate": 4.700981709732159e-05, + "loss": 0.1093, + "num_input_tokens_seen": 9266400, + "step": 43915 + }, + { + "epoch": 4.8316831683168315, + "grad_norm": 0.21781793236732483, + "learning_rate": 4.700867877475905e-05, + "loss": 0.0219, + "num_input_tokens_seen": 9267520, + "step": 43920 + }, + { + "epoch": 4.832233223322332, + "grad_norm": 0.1493101567029953, + "learning_rate": 4.7007540249353086e-05, + "loss": 0.0733, + "num_input_tokens_seen": 9268544, + "step": 43925 + }, + { + "epoch": 4.832783278327833, + "grad_norm": 0.4831086993217468, + "learning_rate": 4.700640152111417e-05, + "loss": 0.0422, + "num_input_tokens_seen": 9269568, + "step": 43930 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.7318242788314819, + "learning_rate": 4.7005262590052826e-05, + "loss": 0.0217, + "num_input_tokens_seen": 9270560, + "step": 43935 + }, + { + "epoch": 4.833883388338834, + "grad_norm": 2.6210520267486572, + "learning_rate": 4.700412345617952e-05, + "loss": 0.0659, + "num_input_tokens_seen": 9271616, + "step": 43940 + }, + { + "epoch": 4.834433443344334, + "grad_norm": 0.3142896294593811, + "learning_rate": 4.7002984119504775e-05, + "loss": 0.027, + "num_input_tokens_seen": 9272640, + "step": 43945 + }, + { + "epoch": 4.834983498349835, + "grad_norm": 0.11535616964101791, + "learning_rate": 4.7001844580039084e-05, + "loss": 0.0451, + "num_input_tokens_seen": 9273728, + "step": 43950 + }, + { + "epoch": 4.835533553355336, + "grad_norm": 0.045172177255153656, + "learning_rate": 4.700070483779295e-05, + "loss": 0.0952, + "num_input_tokens_seen": 9274848, + "step": 43955 + }, + { + "epoch": 4.836083608360836, + "grad_norm": 0.06778153032064438, + "learning_rate": 4.699956489277687e-05, + "loss": 0.0149, + "num_input_tokens_seen": 9275904, + "step": 43960 + }, + { + "epoch": 4.836633663366337, + "grad_norm": 0.6345807313919067, + "learning_rate": 4.699842474500137e-05, + "loss": 0.0259, + "num_input_tokens_seen": 9276960, + "step": 43965 + }, + { + "epoch": 4.837183718371837, + "grad_norm": 0.057339951395988464, + "learning_rate": 4.699728439447693e-05, + "loss": 0.0432, + "num_input_tokens_seen": 9278016, + "step": 43970 + }, + { + "epoch": 4.837733773377337, + "grad_norm": 0.11528995633125305, + "learning_rate": 4.699614384121408e-05, + "loss": 0.1086, + "num_input_tokens_seen": 9279136, + "step": 43975 + }, + { + "epoch": 4.838283828382838, + "grad_norm": 0.1907404214143753, + "learning_rate": 4.6995003085223334e-05, + "loss": 0.038, + "num_input_tokens_seen": 9280224, + "step": 43980 + }, + { + "epoch": 4.838833883388339, + "grad_norm": 0.7818440198898315, + "learning_rate": 4.6993862126515186e-05, + "loss": 0.0686, + "num_input_tokens_seen": 9281216, + "step": 43985 + }, + { + "epoch": 4.83938393839384, + "grad_norm": 0.031059592962265015, + "learning_rate": 4.699272096510018e-05, + "loss": 0.0049, + "num_input_tokens_seen": 9282240, + "step": 43990 + }, + { + "epoch": 4.83993399339934, + "grad_norm": 0.26049941778182983, + "learning_rate": 4.6991579600988804e-05, + "loss": 0.0436, + "num_input_tokens_seen": 9283232, + "step": 43995 + }, + { + "epoch": 4.84048404840484, + "grad_norm": 0.18270930647850037, + "learning_rate": 4.69904380341916e-05, + "loss": 0.0263, + "num_input_tokens_seen": 9284288, + "step": 44000 + }, + { + "epoch": 4.841034103410341, + "grad_norm": 0.07216056436300278, + "learning_rate": 4.698929626471908e-05, + "loss": 0.0327, + "num_input_tokens_seen": 9285312, + "step": 44005 + }, + { + "epoch": 4.841584158415841, + "grad_norm": 0.0474531427025795, + "learning_rate": 4.698815429258177e-05, + "loss": 0.042, + "num_input_tokens_seen": 9286272, + "step": 44010 + }, + { + "epoch": 4.8421342134213425, + "grad_norm": 0.26546210050582886, + "learning_rate": 4.698701211779018e-05, + "loss": 0.0225, + "num_input_tokens_seen": 9287360, + "step": 44015 + }, + { + "epoch": 4.842684268426843, + "grad_norm": 0.6515597701072693, + "learning_rate": 4.698586974035486e-05, + "loss": 0.0311, + "num_input_tokens_seen": 9288352, + "step": 44020 + }, + { + "epoch": 4.843234323432343, + "grad_norm": 0.1342155486345291, + "learning_rate": 4.6984727160286326e-05, + "loss": 0.0224, + "num_input_tokens_seen": 9289408, + "step": 44025 + }, + { + "epoch": 4.843784378437844, + "grad_norm": 0.6141088604927063, + "learning_rate": 4.6983584377595116e-05, + "loss": 0.0393, + "num_input_tokens_seen": 9290464, + "step": 44030 + }, + { + "epoch": 4.844334433443344, + "grad_norm": 0.1730802357196808, + "learning_rate": 4.698244139229175e-05, + "loss": 0.0091, + "num_input_tokens_seen": 9291520, + "step": 44035 + }, + { + "epoch": 4.8448844884488445, + "grad_norm": 0.011181308887898922, + "learning_rate": 4.698129820438677e-05, + "loss": 0.0236, + "num_input_tokens_seen": 9292576, + "step": 44040 + }, + { + "epoch": 4.8454345434543455, + "grad_norm": 0.4059424102306366, + "learning_rate": 4.698015481389072e-05, + "loss": 0.0894, + "num_input_tokens_seen": 9293632, + "step": 44045 + }, + { + "epoch": 4.845984598459846, + "grad_norm": 0.14012742042541504, + "learning_rate": 4.697901122081412e-05, + "loss": 0.0433, + "num_input_tokens_seen": 9294656, + "step": 44050 + }, + { + "epoch": 4.846534653465347, + "grad_norm": 0.01013163197785616, + "learning_rate": 4.697786742516753e-05, + "loss": 0.0273, + "num_input_tokens_seen": 9295680, + "step": 44055 + }, + { + "epoch": 4.847084708470847, + "grad_norm": 0.2694588303565979, + "learning_rate": 4.6976723426961475e-05, + "loss": 0.083, + "num_input_tokens_seen": 9296672, + "step": 44060 + }, + { + "epoch": 4.847634763476347, + "grad_norm": 0.15316687524318695, + "learning_rate": 4.6975579226206505e-05, + "loss": 0.0559, + "num_input_tokens_seen": 9297728, + "step": 44065 + }, + { + "epoch": 4.848184818481848, + "grad_norm": 0.11534883081912994, + "learning_rate": 4.697443482291317e-05, + "loss": 0.0109, + "num_input_tokens_seen": 9298784, + "step": 44070 + }, + { + "epoch": 4.8487348734873486, + "grad_norm": 0.006512445863336325, + "learning_rate": 4.6973290217092016e-05, + "loss": 0.0884, + "num_input_tokens_seen": 9299872, + "step": 44075 + }, + { + "epoch": 4.84928492849285, + "grad_norm": 0.0257098488509655, + "learning_rate": 4.6972145408753584e-05, + "loss": 0.0508, + "num_input_tokens_seen": 9300960, + "step": 44080 + }, + { + "epoch": 4.84983498349835, + "grad_norm": 0.07628072053194046, + "learning_rate": 4.697100039790844e-05, + "loss": 0.0237, + "num_input_tokens_seen": 9302048, + "step": 44085 + }, + { + "epoch": 4.85038503850385, + "grad_norm": 0.0046002306044101715, + "learning_rate": 4.696985518456711e-05, + "loss": 0.0734, + "num_input_tokens_seen": 9303072, + "step": 44090 + }, + { + "epoch": 4.850935093509351, + "grad_norm": 0.019845247268676758, + "learning_rate": 4.696870976874018e-05, + "loss": 0.0163, + "num_input_tokens_seen": 9304096, + "step": 44095 + }, + { + "epoch": 4.851485148514851, + "grad_norm": 0.49446412920951843, + "learning_rate": 4.69675641504382e-05, + "loss": 0.029, + "num_input_tokens_seen": 9305088, + "step": 44100 + }, + { + "epoch": 4.852035203520352, + "grad_norm": 0.5347188115119934, + "learning_rate": 4.6966418329671715e-05, + "loss": 0.0328, + "num_input_tokens_seen": 9306176, + "step": 44105 + }, + { + "epoch": 4.852585258525853, + "grad_norm": 0.01393947284668684, + "learning_rate": 4.696527230645129e-05, + "loss": 0.0674, + "num_input_tokens_seen": 9307200, + "step": 44110 + }, + { + "epoch": 4.853135313531353, + "grad_norm": 0.016092922538518906, + "learning_rate": 4.696412608078749e-05, + "loss": 0.0057, + "num_input_tokens_seen": 9308256, + "step": 44115 + }, + { + "epoch": 4.853685368536854, + "grad_norm": 0.33052361011505127, + "learning_rate": 4.696297965269089e-05, + "loss": 0.1185, + "num_input_tokens_seen": 9309344, + "step": 44120 + }, + { + "epoch": 4.854235423542354, + "grad_norm": 0.2609706223011017, + "learning_rate": 4.696183302217203e-05, + "loss": 0.0094, + "num_input_tokens_seen": 9310400, + "step": 44125 + }, + { + "epoch": 4.854785478547855, + "grad_norm": 0.2969568371772766, + "learning_rate": 4.69606861892415e-05, + "loss": 0.0386, + "num_input_tokens_seen": 9311456, + "step": 44130 + }, + { + "epoch": 4.8553355335533555, + "grad_norm": 0.06426070630550385, + "learning_rate": 4.695953915390987e-05, + "loss": 0.0257, + "num_input_tokens_seen": 9312512, + "step": 44135 + }, + { + "epoch": 4.855885588558856, + "grad_norm": 0.06134539097547531, + "learning_rate": 4.6958391916187695e-05, + "loss": 0.0147, + "num_input_tokens_seen": 9313568, + "step": 44140 + }, + { + "epoch": 4.856435643564357, + "grad_norm": 0.09412029385566711, + "learning_rate": 4.695724447608556e-05, + "loss": 0.0175, + "num_input_tokens_seen": 9314624, + "step": 44145 + }, + { + "epoch": 4.856985698569857, + "grad_norm": 0.839455783367157, + "learning_rate": 4.695609683361404e-05, + "loss": 0.1386, + "num_input_tokens_seen": 9315680, + "step": 44150 + }, + { + "epoch": 4.857535753575357, + "grad_norm": 1.171847939491272, + "learning_rate": 4.695494898878371e-05, + "loss": 0.1238, + "num_input_tokens_seen": 9316736, + "step": 44155 + }, + { + "epoch": 4.858085808580858, + "grad_norm": 0.14548568427562714, + "learning_rate": 4.6953800941605156e-05, + "loss": 0.036, + "num_input_tokens_seen": 9317760, + "step": 44160 + }, + { + "epoch": 4.8586358635863585, + "grad_norm": 1.2341020107269287, + "learning_rate": 4.695265269208895e-05, + "loss": 0.0398, + "num_input_tokens_seen": 9318816, + "step": 44165 + }, + { + "epoch": 4.8591859185918596, + "grad_norm": 0.049763649702072144, + "learning_rate": 4.695150424024567e-05, + "loss": 0.0476, + "num_input_tokens_seen": 9319840, + "step": 44170 + }, + { + "epoch": 4.85973597359736, + "grad_norm": 0.010741045698523521, + "learning_rate": 4.695035558608592e-05, + "loss": 0.0774, + "num_input_tokens_seen": 9320800, + "step": 44175 + }, + { + "epoch": 4.86028602860286, + "grad_norm": 1.2202500104904175, + "learning_rate": 4.694920672962027e-05, + "loss": 0.1008, + "num_input_tokens_seen": 9321888, + "step": 44180 + }, + { + "epoch": 4.860836083608361, + "grad_norm": 1.1960904598236084, + "learning_rate": 4.694805767085932e-05, + "loss": 0.0587, + "num_input_tokens_seen": 9322912, + "step": 44185 + }, + { + "epoch": 4.861386138613861, + "grad_norm": 1.114013671875, + "learning_rate": 4.694690840981365e-05, + "loss": 0.0275, + "num_input_tokens_seen": 9323968, + "step": 44190 + }, + { + "epoch": 4.861936193619362, + "grad_norm": 0.03738558664917946, + "learning_rate": 4.694575894649386e-05, + "loss": 0.0172, + "num_input_tokens_seen": 9325024, + "step": 44195 + }, + { + "epoch": 4.862486248624863, + "grad_norm": 0.32411903142929077, + "learning_rate": 4.694460928091053e-05, + "loss": 0.0147, + "num_input_tokens_seen": 9326080, + "step": 44200 + }, + { + "epoch": 4.863036303630363, + "grad_norm": 1.7701777219772339, + "learning_rate": 4.6943459413074275e-05, + "loss": 0.0932, + "num_input_tokens_seen": 9327104, + "step": 44205 + }, + { + "epoch": 4.863586358635864, + "grad_norm": 0.03225763142108917, + "learning_rate": 4.6942309342995676e-05, + "loss": 0.0112, + "num_input_tokens_seen": 9328224, + "step": 44210 + }, + { + "epoch": 4.864136413641364, + "grad_norm": 0.027358591556549072, + "learning_rate": 4.6941159070685345e-05, + "loss": 0.0432, + "num_input_tokens_seen": 9329280, + "step": 44215 + }, + { + "epoch": 4.864686468646864, + "grad_norm": 0.36086219549179077, + "learning_rate": 4.694000859615389e-05, + "loss": 0.0835, + "num_input_tokens_seen": 9330368, + "step": 44220 + }, + { + "epoch": 4.865236523652365, + "grad_norm": 0.37187132239341736, + "learning_rate": 4.693885791941189e-05, + "loss": 0.044, + "num_input_tokens_seen": 9331456, + "step": 44225 + }, + { + "epoch": 4.865786578657866, + "grad_norm": 0.24548207223415375, + "learning_rate": 4.693770704046997e-05, + "loss": 0.0151, + "num_input_tokens_seen": 9332512, + "step": 44230 + }, + { + "epoch": 4.866336633663367, + "grad_norm": 1.3223446607589722, + "learning_rate": 4.6936555959338726e-05, + "loss": 0.0548, + "num_input_tokens_seen": 9333536, + "step": 44235 + }, + { + "epoch": 4.866886688668867, + "grad_norm": 0.8987883925437927, + "learning_rate": 4.6935404676028775e-05, + "loss": 0.1062, + "num_input_tokens_seen": 9334560, + "step": 44240 + }, + { + "epoch": 4.867436743674367, + "grad_norm": 1.801088571548462, + "learning_rate": 4.693425319055073e-05, + "loss": 0.064, + "num_input_tokens_seen": 9335648, + "step": 44245 + }, + { + "epoch": 4.867986798679868, + "grad_norm": 0.07739720493555069, + "learning_rate": 4.693310150291518e-05, + "loss": 0.0046, + "num_input_tokens_seen": 9336672, + "step": 44250 + }, + { + "epoch": 4.868536853685368, + "grad_norm": 0.35469624400138855, + "learning_rate": 4.693194961313277e-05, + "loss": 0.2001, + "num_input_tokens_seen": 9337696, + "step": 44255 + }, + { + "epoch": 4.8690869086908695, + "grad_norm": 0.10357662290334702, + "learning_rate": 4.693079752121411e-05, + "loss": 0.1478, + "num_input_tokens_seen": 9338752, + "step": 44260 + }, + { + "epoch": 4.86963696369637, + "grad_norm": 0.0747578889131546, + "learning_rate": 4.692964522716981e-05, + "loss": 0.0266, + "num_input_tokens_seen": 9339808, + "step": 44265 + }, + { + "epoch": 4.87018701870187, + "grad_norm": 1.0929653644561768, + "learning_rate": 4.692849273101049e-05, + "loss": 0.2132, + "num_input_tokens_seen": 9340832, + "step": 44270 + }, + { + "epoch": 4.870737073707371, + "grad_norm": 0.024688951671123505, + "learning_rate": 4.692734003274678e-05, + "loss": 0.0605, + "num_input_tokens_seen": 9341888, + "step": 44275 + }, + { + "epoch": 4.871287128712871, + "grad_norm": 0.12158370018005371, + "learning_rate": 4.692618713238929e-05, + "loss": 0.0074, + "num_input_tokens_seen": 9342944, + "step": 44280 + }, + { + "epoch": 4.871837183718371, + "grad_norm": 0.04234086349606514, + "learning_rate": 4.6925034029948654e-05, + "loss": 0.0707, + "num_input_tokens_seen": 9344032, + "step": 44285 + }, + { + "epoch": 4.8723872387238725, + "grad_norm": 0.2039259970188141, + "learning_rate": 4.692388072543551e-05, + "loss": 0.0346, + "num_input_tokens_seen": 9345088, + "step": 44290 + }, + { + "epoch": 4.872937293729373, + "grad_norm": 0.19991470873355865, + "learning_rate": 4.692272721886048e-05, + "loss": 0.0149, + "num_input_tokens_seen": 9346080, + "step": 44295 + }, + { + "epoch": 4.873487348734874, + "grad_norm": 1.093736171722412, + "learning_rate": 4.6921573510234176e-05, + "loss": 0.0496, + "num_input_tokens_seen": 9347200, + "step": 44300 + }, + { + "epoch": 4.874037403740374, + "grad_norm": 0.14548099040985107, + "learning_rate": 4.6920419599567255e-05, + "loss": 0.0203, + "num_input_tokens_seen": 9348320, + "step": 44305 + }, + { + "epoch": 4.874587458745875, + "grad_norm": 0.682869553565979, + "learning_rate": 4.6919265486870344e-05, + "loss": 0.1098, + "num_input_tokens_seen": 9349280, + "step": 44310 + }, + { + "epoch": 4.875137513751375, + "grad_norm": 0.20995746552944183, + "learning_rate": 4.691811117215409e-05, + "loss": 0.0195, + "num_input_tokens_seen": 9350368, + "step": 44315 + }, + { + "epoch": 4.8756875687568755, + "grad_norm": 0.21988919377326965, + "learning_rate": 4.691695665542911e-05, + "loss": 0.0606, + "num_input_tokens_seen": 9351424, + "step": 44320 + }, + { + "epoch": 4.876237623762377, + "grad_norm": 0.12979814410209656, + "learning_rate": 4.691580193670607e-05, + "loss": 0.062, + "num_input_tokens_seen": 9352512, + "step": 44325 + }, + { + "epoch": 4.876787678767877, + "grad_norm": 0.049609992653131485, + "learning_rate": 4.691464701599559e-05, + "loss": 0.0693, + "num_input_tokens_seen": 9353536, + "step": 44330 + }, + { + "epoch": 4.877337733773377, + "grad_norm": 0.0374811626970768, + "learning_rate": 4.691349189330833e-05, + "loss": 0.0471, + "num_input_tokens_seen": 9354624, + "step": 44335 + }, + { + "epoch": 4.877887788778878, + "grad_norm": 0.9552961587905884, + "learning_rate": 4.6912336568654925e-05, + "loss": 0.1223, + "num_input_tokens_seen": 9355776, + "step": 44340 + }, + { + "epoch": 4.878437843784378, + "grad_norm": 0.11450377851724625, + "learning_rate": 4.691118104204603e-05, + "loss": 0.0255, + "num_input_tokens_seen": 9356832, + "step": 44345 + }, + { + "epoch": 4.878987898789879, + "grad_norm": 0.1962164342403412, + "learning_rate": 4.691002531349229e-05, + "loss": 0.1039, + "num_input_tokens_seen": 9357920, + "step": 44350 + }, + { + "epoch": 4.87953795379538, + "grad_norm": 0.07374516129493713, + "learning_rate": 4.6908869383004374e-05, + "loss": 0.1848, + "num_input_tokens_seen": 9358912, + "step": 44355 + }, + { + "epoch": 4.88008800880088, + "grad_norm": 0.05859118327498436, + "learning_rate": 4.690771325059291e-05, + "loss": 0.1037, + "num_input_tokens_seen": 9360000, + "step": 44360 + }, + { + "epoch": 4.880638063806381, + "grad_norm": 0.18888050317764282, + "learning_rate": 4.690655691626856e-05, + "loss": 0.0382, + "num_input_tokens_seen": 9361056, + "step": 44365 + }, + { + "epoch": 4.881188118811881, + "grad_norm": 0.4955455958843231, + "learning_rate": 4.6905400380042006e-05, + "loss": 0.0145, + "num_input_tokens_seen": 9362112, + "step": 44370 + }, + { + "epoch": 4.881738173817382, + "grad_norm": 0.049531567841768265, + "learning_rate": 4.690424364192387e-05, + "loss": 0.0197, + "num_input_tokens_seen": 9363232, + "step": 44375 + }, + { + "epoch": 4.882288228822882, + "grad_norm": 0.02253459393978119, + "learning_rate": 4.6903086701924836e-05, + "loss": 0.0897, + "num_input_tokens_seen": 9364224, + "step": 44380 + }, + { + "epoch": 4.882838283828383, + "grad_norm": 0.0452287383377552, + "learning_rate": 4.690192956005557e-05, + "loss": 0.0145, + "num_input_tokens_seen": 9365312, + "step": 44385 + }, + { + "epoch": 4.883388338833884, + "grad_norm": 1.2967603206634521, + "learning_rate": 4.690077221632672e-05, + "loss": 0.1399, + "num_input_tokens_seen": 9366400, + "step": 44390 + }, + { + "epoch": 4.883938393839384, + "grad_norm": 0.022547731176018715, + "learning_rate": 4.689961467074896e-05, + "loss": 0.0331, + "num_input_tokens_seen": 9367456, + "step": 44395 + }, + { + "epoch": 4.884488448844884, + "grad_norm": 1.5059902667999268, + "learning_rate": 4.689845692333297e-05, + "loss": 0.1037, + "num_input_tokens_seen": 9368512, + "step": 44400 + }, + { + "epoch": 4.885038503850385, + "grad_norm": 0.7008766531944275, + "learning_rate": 4.68972989740894e-05, + "loss": 0.0714, + "num_input_tokens_seen": 9369600, + "step": 44405 + }, + { + "epoch": 4.885588558855885, + "grad_norm": 0.25641074776649475, + "learning_rate": 4.689614082302894e-05, + "loss": 0.0128, + "num_input_tokens_seen": 9370720, + "step": 44410 + }, + { + "epoch": 4.8861386138613865, + "grad_norm": 0.029036356136202812, + "learning_rate": 4.6894982470162254e-05, + "loss": 0.0199, + "num_input_tokens_seen": 9371776, + "step": 44415 + }, + { + "epoch": 4.886688668866887, + "grad_norm": 0.3430361747741699, + "learning_rate": 4.689382391550002e-05, + "loss": 0.0347, + "num_input_tokens_seen": 9372800, + "step": 44420 + }, + { + "epoch": 4.887238723872387, + "grad_norm": 0.7308872938156128, + "learning_rate": 4.6892665159052916e-05, + "loss": 0.094, + "num_input_tokens_seen": 9373856, + "step": 44425 + }, + { + "epoch": 4.887788778877888, + "grad_norm": 0.40638962388038635, + "learning_rate": 4.689150620083163e-05, + "loss": 0.0158, + "num_input_tokens_seen": 9374848, + "step": 44430 + }, + { + "epoch": 4.888338833883388, + "grad_norm": 0.09873056411743164, + "learning_rate": 4.6890347040846825e-05, + "loss": 0.0372, + "num_input_tokens_seen": 9375904, + "step": 44435 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.03231540694832802, + "learning_rate": 4.68891876791092e-05, + "loss": 0.077, + "num_input_tokens_seen": 9376992, + "step": 44440 + }, + { + "epoch": 4.8894389438943895, + "grad_norm": 0.39732933044433594, + "learning_rate": 4.6888028115629436e-05, + "loss": 0.1261, + "num_input_tokens_seen": 9378016, + "step": 44445 + }, + { + "epoch": 4.88998899889989, + "grad_norm": 0.151368647813797, + "learning_rate": 4.6886868350418214e-05, + "loss": 0.0202, + "num_input_tokens_seen": 9379072, + "step": 44450 + }, + { + "epoch": 4.890539053905391, + "grad_norm": 0.058892589062452316, + "learning_rate": 4.688570838348624e-05, + "loss": 0.08, + "num_input_tokens_seen": 9380128, + "step": 44455 + }, + { + "epoch": 4.891089108910891, + "grad_norm": 0.16254712641239166, + "learning_rate": 4.688454821484418e-05, + "loss": 0.0164, + "num_input_tokens_seen": 9381088, + "step": 44460 + }, + { + "epoch": 4.891639163916391, + "grad_norm": 0.10147154331207275, + "learning_rate": 4.6883387844502755e-05, + "loss": 0.0505, + "num_input_tokens_seen": 9382144, + "step": 44465 + }, + { + "epoch": 4.892189218921892, + "grad_norm": 0.04977797344326973, + "learning_rate": 4.6882227272472634e-05, + "loss": 0.0071, + "num_input_tokens_seen": 9383200, + "step": 44470 + }, + { + "epoch": 4.8927392739273925, + "grad_norm": 0.059815384447574615, + "learning_rate": 4.6881066498764524e-05, + "loss": 0.0299, + "num_input_tokens_seen": 9384320, + "step": 44475 + }, + { + "epoch": 4.893289328932894, + "grad_norm": 0.40791943669319153, + "learning_rate": 4.687990552338913e-05, + "loss": 0.0488, + "num_input_tokens_seen": 9385344, + "step": 44480 + }, + { + "epoch": 4.893839383938394, + "grad_norm": 0.031091604381799698, + "learning_rate": 4.687874434635715e-05, + "loss": 0.035, + "num_input_tokens_seen": 9386400, + "step": 44485 + }, + { + "epoch": 4.894389438943895, + "grad_norm": 0.14395803213119507, + "learning_rate": 4.687758296767927e-05, + "loss": 0.0465, + "num_input_tokens_seen": 9387424, + "step": 44490 + }, + { + "epoch": 4.894939493949395, + "grad_norm": 1.5462557077407837, + "learning_rate": 4.687642138736621e-05, + "loss": 0.1937, + "num_input_tokens_seen": 9388480, + "step": 44495 + }, + { + "epoch": 4.895489548954895, + "grad_norm": 0.908544659614563, + "learning_rate": 4.6875259605428674e-05, + "loss": 0.0429, + "num_input_tokens_seen": 9389504, + "step": 44500 + }, + { + "epoch": 4.896039603960396, + "grad_norm": 0.1847739964723587, + "learning_rate": 4.6874097621877364e-05, + "loss": 0.0442, + "num_input_tokens_seen": 9390560, + "step": 44505 + }, + { + "epoch": 4.896589658965897, + "grad_norm": 0.03344198316335678, + "learning_rate": 4.6872935436723e-05, + "loss": 0.0493, + "num_input_tokens_seen": 9391680, + "step": 44510 + }, + { + "epoch": 4.897139713971397, + "grad_norm": 1.4119778871536255, + "learning_rate": 4.687177304997627e-05, + "loss": 0.2077, + "num_input_tokens_seen": 9392768, + "step": 44515 + }, + { + "epoch": 4.897689768976898, + "grad_norm": 0.33313044905662537, + "learning_rate": 4.687061046164792e-05, + "loss": 0.0439, + "num_input_tokens_seen": 9393888, + "step": 44520 + }, + { + "epoch": 4.898239823982398, + "grad_norm": 0.06385277211666107, + "learning_rate": 4.686944767174864e-05, + "loss": 0.0054, + "num_input_tokens_seen": 9395040, + "step": 44525 + }, + { + "epoch": 4.898789878987898, + "grad_norm": 0.07928596436977386, + "learning_rate": 4.6868284680289154e-05, + "loss": 0.0094, + "num_input_tokens_seen": 9396064, + "step": 44530 + }, + { + "epoch": 4.899339933993399, + "grad_norm": 0.8675643801689148, + "learning_rate": 4.686712148728019e-05, + "loss": 0.0455, + "num_input_tokens_seen": 9397120, + "step": 44535 + }, + { + "epoch": 4.8998899889989, + "grad_norm": 0.2520785331726074, + "learning_rate": 4.686595809273244e-05, + "loss": 0.0667, + "num_input_tokens_seen": 9398144, + "step": 44540 + }, + { + "epoch": 4.900440044004401, + "grad_norm": 0.053515490144491196, + "learning_rate": 4.686479449665666e-05, + "loss": 0.0312, + "num_input_tokens_seen": 9399200, + "step": 44545 + }, + { + "epoch": 4.900990099009901, + "grad_norm": 0.10622428357601166, + "learning_rate": 4.686363069906357e-05, + "loss": 0.0156, + "num_input_tokens_seen": 9400256, + "step": 44550 + }, + { + "epoch": 4.901540154015402, + "grad_norm": 0.05826763063669205, + "learning_rate": 4.686246669996387e-05, + "loss": 0.0237, + "num_input_tokens_seen": 9401312, + "step": 44555 + }, + { + "epoch": 4.902090209020902, + "grad_norm": 0.11913695931434631, + "learning_rate": 4.686130249936831e-05, + "loss": 0.0411, + "num_input_tokens_seen": 9402368, + "step": 44560 + }, + { + "epoch": 4.902640264026402, + "grad_norm": 0.20928436517715454, + "learning_rate": 4.686013809728762e-05, + "loss": 0.1011, + "num_input_tokens_seen": 9403360, + "step": 44565 + }, + { + "epoch": 4.9031903190319035, + "grad_norm": 0.4326481521129608, + "learning_rate": 4.685897349373253e-05, + "loss": 0.0823, + "num_input_tokens_seen": 9404448, + "step": 44570 + }, + { + "epoch": 4.903740374037404, + "grad_norm": 0.22941604256629944, + "learning_rate": 4.6857808688713754e-05, + "loss": 0.1207, + "num_input_tokens_seen": 9405536, + "step": 44575 + }, + { + "epoch": 4.904290429042904, + "grad_norm": 0.5070186853408813, + "learning_rate": 4.685664368224206e-05, + "loss": 0.0191, + "num_input_tokens_seen": 9406656, + "step": 44580 + }, + { + "epoch": 4.904840484048405, + "grad_norm": 0.0660887360572815, + "learning_rate": 4.685547847432816e-05, + "loss": 0.0328, + "num_input_tokens_seen": 9407680, + "step": 44585 + }, + { + "epoch": 4.905390539053905, + "grad_norm": 0.06790632009506226, + "learning_rate": 4.685431306498281e-05, + "loss": 0.0479, + "num_input_tokens_seen": 9408736, + "step": 44590 + }, + { + "epoch": 4.905940594059406, + "grad_norm": 0.284150093793869, + "learning_rate": 4.685314745421674e-05, + "loss": 0.0096, + "num_input_tokens_seen": 9409792, + "step": 44595 + }, + { + "epoch": 4.9064906490649065, + "grad_norm": 0.10904623568058014, + "learning_rate": 4.685198164204069e-05, + "loss": 0.0069, + "num_input_tokens_seen": 9410848, + "step": 44600 + }, + { + "epoch": 4.907040704070407, + "grad_norm": 0.13713786005973816, + "learning_rate": 4.6850815628465416e-05, + "loss": 0.0083, + "num_input_tokens_seen": 9411936, + "step": 44605 + }, + { + "epoch": 4.907590759075908, + "grad_norm": 1.0347137451171875, + "learning_rate": 4.6849649413501664e-05, + "loss": 0.0913, + "num_input_tokens_seen": 9412992, + "step": 44610 + }, + { + "epoch": 4.908140814081408, + "grad_norm": 0.034793656319379807, + "learning_rate": 4.684848299716017e-05, + "loss": 0.1388, + "num_input_tokens_seen": 9414112, + "step": 44615 + }, + { + "epoch": 4.908690869086909, + "grad_norm": 0.021347513422369957, + "learning_rate": 4.684731637945169e-05, + "loss": 0.0174, + "num_input_tokens_seen": 9415168, + "step": 44620 + }, + { + "epoch": 4.909240924092409, + "grad_norm": 1.063693881034851, + "learning_rate": 4.684614956038699e-05, + "loss": 0.0229, + "num_input_tokens_seen": 9416256, + "step": 44625 + }, + { + "epoch": 4.9097909790979095, + "grad_norm": 0.10823900997638702, + "learning_rate": 4.68449825399768e-05, + "loss": 0.0721, + "num_input_tokens_seen": 9417280, + "step": 44630 + }, + { + "epoch": 4.910341034103411, + "grad_norm": 1.5149471759796143, + "learning_rate": 4.684381531823189e-05, + "loss": 0.0754, + "num_input_tokens_seen": 9418336, + "step": 44635 + }, + { + "epoch": 4.910891089108911, + "grad_norm": 0.02147563174366951, + "learning_rate": 4.684264789516302e-05, + "loss": 0.0634, + "num_input_tokens_seen": 9419392, + "step": 44640 + }, + { + "epoch": 4.911441144114411, + "grad_norm": 0.3354455530643463, + "learning_rate": 4.684148027078094e-05, + "loss": 0.0353, + "num_input_tokens_seen": 9420384, + "step": 44645 + }, + { + "epoch": 4.911991199119912, + "grad_norm": 0.09253612905740738, + "learning_rate": 4.684031244509643e-05, + "loss": 0.1438, + "num_input_tokens_seen": 9421440, + "step": 44650 + }, + { + "epoch": 4.912541254125412, + "grad_norm": 0.36390843987464905, + "learning_rate": 4.683914441812023e-05, + "loss": 0.0402, + "num_input_tokens_seen": 9422464, + "step": 44655 + }, + { + "epoch": 4.913091309130913, + "grad_norm": 0.07939733564853668, + "learning_rate": 4.683797618986312e-05, + "loss": 0.0177, + "num_input_tokens_seen": 9423488, + "step": 44660 + }, + { + "epoch": 4.913641364136414, + "grad_norm": 0.26988983154296875, + "learning_rate": 4.6836807760335857e-05, + "loss": 0.0197, + "num_input_tokens_seen": 9424576, + "step": 44665 + }, + { + "epoch": 4.914191419141914, + "grad_norm": 0.1677190512418747, + "learning_rate": 4.683563912954921e-05, + "loss": 0.011, + "num_input_tokens_seen": 9425664, + "step": 44670 + }, + { + "epoch": 4.914741474147415, + "grad_norm": 0.2521607577800751, + "learning_rate": 4.683447029751397e-05, + "loss": 0.1025, + "num_input_tokens_seen": 9426688, + "step": 44675 + }, + { + "epoch": 4.915291529152915, + "grad_norm": 0.608882486820221, + "learning_rate": 4.683330126424089e-05, + "loss": 0.072, + "num_input_tokens_seen": 9427712, + "step": 44680 + }, + { + "epoch": 4.915841584158416, + "grad_norm": 0.07069139927625656, + "learning_rate": 4.6832132029740746e-05, + "loss": 0.034, + "num_input_tokens_seen": 9428768, + "step": 44685 + }, + { + "epoch": 4.916391639163916, + "grad_norm": 0.2455042451620102, + "learning_rate": 4.6830962594024316e-05, + "loss": 0.0628, + "num_input_tokens_seen": 9429824, + "step": 44690 + }, + { + "epoch": 4.916941694169417, + "grad_norm": 0.41669607162475586, + "learning_rate": 4.682979295710238e-05, + "loss": 0.0502, + "num_input_tokens_seen": 9430880, + "step": 44695 + }, + { + "epoch": 4.917491749174918, + "grad_norm": 0.47325557470321655, + "learning_rate": 4.682862311898572e-05, + "loss": 0.0205, + "num_input_tokens_seen": 9432000, + "step": 44700 + }, + { + "epoch": 4.918041804180418, + "grad_norm": 0.886934220790863, + "learning_rate": 4.6827453079685116e-05, + "loss": 0.05, + "num_input_tokens_seen": 9432992, + "step": 44705 + }, + { + "epoch": 4.918591859185918, + "grad_norm": 0.1261291652917862, + "learning_rate": 4.682628283921134e-05, + "loss": 0.0127, + "num_input_tokens_seen": 9434016, + "step": 44710 + }, + { + "epoch": 4.919141914191419, + "grad_norm": 0.22342966496944427, + "learning_rate": 4.68251123975752e-05, + "loss": 0.0923, + "num_input_tokens_seen": 9435040, + "step": 44715 + }, + { + "epoch": 4.919691969196919, + "grad_norm": 0.17342554032802582, + "learning_rate": 4.6823941754787464e-05, + "loss": 0.031, + "num_input_tokens_seen": 9436064, + "step": 44720 + }, + { + "epoch": 4.9202420242024205, + "grad_norm": 1.832485318183899, + "learning_rate": 4.682277091085893e-05, + "loss": 0.0847, + "num_input_tokens_seen": 9437088, + "step": 44725 + }, + { + "epoch": 4.920792079207921, + "grad_norm": 1.2470444440841675, + "learning_rate": 4.6821599865800395e-05, + "loss": 0.0277, + "num_input_tokens_seen": 9438144, + "step": 44730 + }, + { + "epoch": 4.921342134213422, + "grad_norm": 0.5532561540603638, + "learning_rate": 4.682042861962264e-05, + "loss": 0.043, + "num_input_tokens_seen": 9439200, + "step": 44735 + }, + { + "epoch": 4.921892189218922, + "grad_norm": 0.17993487417697906, + "learning_rate": 4.6819257172336466e-05, + "loss": 0.0508, + "num_input_tokens_seen": 9440192, + "step": 44740 + }, + { + "epoch": 4.922442244224422, + "grad_norm": 0.41955363750457764, + "learning_rate": 4.681808552395267e-05, + "loss": 0.0999, + "num_input_tokens_seen": 9441280, + "step": 44745 + }, + { + "epoch": 4.922992299229923, + "grad_norm": 0.006543436087667942, + "learning_rate": 4.681691367448204e-05, + "loss": 0.0106, + "num_input_tokens_seen": 9442368, + "step": 44750 + }, + { + "epoch": 4.9235423542354235, + "grad_norm": 0.05142214894294739, + "learning_rate": 4.681574162393539e-05, + "loss": 0.0169, + "num_input_tokens_seen": 9443456, + "step": 44755 + }, + { + "epoch": 4.924092409240924, + "grad_norm": 0.13274934887886047, + "learning_rate": 4.681456937232352e-05, + "loss": 0.0229, + "num_input_tokens_seen": 9444480, + "step": 44760 + }, + { + "epoch": 4.924642464246425, + "grad_norm": 0.14566071331501007, + "learning_rate": 4.681339691965724e-05, + "loss": 0.0214, + "num_input_tokens_seen": 9445568, + "step": 44765 + }, + { + "epoch": 4.925192519251925, + "grad_norm": 0.016436289995908737, + "learning_rate": 4.6812224265947344e-05, + "loss": 0.1002, + "num_input_tokens_seen": 9446688, + "step": 44770 + }, + { + "epoch": 4.925742574257426, + "grad_norm": 1.0668139457702637, + "learning_rate": 4.681105141120463e-05, + "loss": 0.0296, + "num_input_tokens_seen": 9447840, + "step": 44775 + }, + { + "epoch": 4.926292629262926, + "grad_norm": 0.007260996848344803, + "learning_rate": 4.6809878355439935e-05, + "loss": 0.0365, + "num_input_tokens_seen": 9448928, + "step": 44780 + }, + { + "epoch": 4.9268426842684265, + "grad_norm": 0.1812257319688797, + "learning_rate": 4.680870509866405e-05, + "loss": 0.0154, + "num_input_tokens_seen": 9450016, + "step": 44785 + }, + { + "epoch": 4.927392739273928, + "grad_norm": 0.011880417354404926, + "learning_rate": 4.68075316408878e-05, + "loss": 0.0622, + "num_input_tokens_seen": 9451040, + "step": 44790 + }, + { + "epoch": 4.927942794279428, + "grad_norm": 0.026857387274503708, + "learning_rate": 4.680635798212199e-05, + "loss": 0.0424, + "num_input_tokens_seen": 9452128, + "step": 44795 + }, + { + "epoch": 4.928492849284929, + "grad_norm": 0.6774559617042542, + "learning_rate": 4.680518412237744e-05, + "loss": 0.0399, + "num_input_tokens_seen": 9453216, + "step": 44800 + }, + { + "epoch": 4.929042904290429, + "grad_norm": 0.008813438005745411, + "learning_rate": 4.680401006166498e-05, + "loss": 0.0286, + "num_input_tokens_seen": 9454272, + "step": 44805 + }, + { + "epoch": 4.929592959295929, + "grad_norm": 0.8287068605422974, + "learning_rate": 4.680283579999541e-05, + "loss": 0.0229, + "num_input_tokens_seen": 9455296, + "step": 44810 + }, + { + "epoch": 4.93014301430143, + "grad_norm": 0.7250837087631226, + "learning_rate": 4.6801661337379575e-05, + "loss": 0.1071, + "num_input_tokens_seen": 9456288, + "step": 44815 + }, + { + "epoch": 4.930693069306931, + "grad_norm": 0.06674227118492126, + "learning_rate": 4.680048667382829e-05, + "loss": 0.0408, + "num_input_tokens_seen": 9457408, + "step": 44820 + }, + { + "epoch": 4.931243124312431, + "grad_norm": 0.7544296979904175, + "learning_rate": 4.6799311809352376e-05, + "loss": 0.0894, + "num_input_tokens_seen": 9458432, + "step": 44825 + }, + { + "epoch": 4.931793179317932, + "grad_norm": 0.319502055644989, + "learning_rate": 4.6798136743962665e-05, + "loss": 0.0091, + "num_input_tokens_seen": 9459488, + "step": 44830 + }, + { + "epoch": 4.932343234323432, + "grad_norm": 1.6404863595962524, + "learning_rate": 4.6796961477669986e-05, + "loss": 0.1337, + "num_input_tokens_seen": 9460480, + "step": 44835 + }, + { + "epoch": 4.932893289328933, + "grad_norm": 0.842414915561676, + "learning_rate": 4.6795786010485175e-05, + "loss": 0.0277, + "num_input_tokens_seen": 9461504, + "step": 44840 + }, + { + "epoch": 4.933443344334433, + "grad_norm": 0.10390204936265945, + "learning_rate": 4.679461034241906e-05, + "loss": 0.0449, + "num_input_tokens_seen": 9462560, + "step": 44845 + }, + { + "epoch": 4.933993399339934, + "grad_norm": 0.02283315733075142, + "learning_rate": 4.6793434473482484e-05, + "loss": 0.0143, + "num_input_tokens_seen": 9463584, + "step": 44850 + }, + { + "epoch": 4.934543454345435, + "grad_norm": 0.4657764434814453, + "learning_rate": 4.6792258403686275e-05, + "loss": 0.0197, + "num_input_tokens_seen": 9464608, + "step": 44855 + }, + { + "epoch": 4.935093509350935, + "grad_norm": 0.21291057765483856, + "learning_rate": 4.679108213304128e-05, + "loss": 0.0276, + "num_input_tokens_seen": 9465664, + "step": 44860 + }, + { + "epoch": 4.935643564356436, + "grad_norm": 0.18708938360214233, + "learning_rate": 4.6789905661558344e-05, + "loss": 0.0437, + "num_input_tokens_seen": 9466720, + "step": 44865 + }, + { + "epoch": 4.936193619361936, + "grad_norm": 0.18831545114517212, + "learning_rate": 4.6788728989248286e-05, + "loss": 0.0177, + "num_input_tokens_seen": 9467776, + "step": 44870 + }, + { + "epoch": 4.936743674367436, + "grad_norm": 0.04221295565366745, + "learning_rate": 4.678755211612198e-05, + "loss": 0.0335, + "num_input_tokens_seen": 9468864, + "step": 44875 + }, + { + "epoch": 4.9372937293729375, + "grad_norm": 0.03146206587553024, + "learning_rate": 4.678637504219026e-05, + "loss": 0.0633, + "num_input_tokens_seen": 9469984, + "step": 44880 + }, + { + "epoch": 4.937843784378438, + "grad_norm": 0.49626606702804565, + "learning_rate": 4.678519776746397e-05, + "loss": 0.0612, + "num_input_tokens_seen": 9471072, + "step": 44885 + }, + { + "epoch": 4.938393839383938, + "grad_norm": 0.2354564219713211, + "learning_rate": 4.678402029195398e-05, + "loss": 0.0266, + "num_input_tokens_seen": 9472128, + "step": 44890 + }, + { + "epoch": 4.938943894389439, + "grad_norm": 1.6763361692428589, + "learning_rate": 4.678284261567111e-05, + "loss": 0.0941, + "num_input_tokens_seen": 9473216, + "step": 44895 + }, + { + "epoch": 4.939493949394939, + "grad_norm": 0.6322303414344788, + "learning_rate": 4.678166473862624e-05, + "loss": 0.1373, + "num_input_tokens_seen": 9474240, + "step": 44900 + }, + { + "epoch": 4.94004400440044, + "grad_norm": 0.011802378110587597, + "learning_rate": 4.678048666083022e-05, + "loss": 0.0068, + "num_input_tokens_seen": 9475360, + "step": 44905 + }, + { + "epoch": 4.9405940594059405, + "grad_norm": 0.026847606524825096, + "learning_rate": 4.6779308382293904e-05, + "loss": 0.1315, + "num_input_tokens_seen": 9476416, + "step": 44910 + }, + { + "epoch": 4.941144114411442, + "grad_norm": 0.08548500388860703, + "learning_rate": 4.677812990302816e-05, + "loss": 0.0303, + "num_input_tokens_seen": 9477504, + "step": 44915 + }, + { + "epoch": 4.941694169416942, + "grad_norm": 1.5407153367996216, + "learning_rate": 4.677695122304383e-05, + "loss": 0.0486, + "num_input_tokens_seen": 9478592, + "step": 44920 + }, + { + "epoch": 4.942244224422442, + "grad_norm": 0.009473958984017372, + "learning_rate": 4.6775772342351795e-05, + "loss": 0.0041, + "num_input_tokens_seen": 9479680, + "step": 44925 + }, + { + "epoch": 4.942794279427943, + "grad_norm": 0.004682730883359909, + "learning_rate": 4.6774593260962915e-05, + "loss": 0.0247, + "num_input_tokens_seen": 9480704, + "step": 44930 + }, + { + "epoch": 4.943344334433443, + "grad_norm": 0.0038470826111733913, + "learning_rate": 4.677341397888806e-05, + "loss": 0.0204, + "num_input_tokens_seen": 9481760, + "step": 44935 + }, + { + "epoch": 4.9438943894389435, + "grad_norm": 0.477157860994339, + "learning_rate": 4.67722344961381e-05, + "loss": 0.0987, + "num_input_tokens_seen": 9482720, + "step": 44940 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.8516382575035095, + "learning_rate": 4.6771054812723896e-05, + "loss": 0.0951, + "num_input_tokens_seen": 9483776, + "step": 44945 + }, + { + "epoch": 4.944994499449945, + "grad_norm": 0.9658413529396057, + "learning_rate": 4.676987492865633e-05, + "loss": 0.1456, + "num_input_tokens_seen": 9484832, + "step": 44950 + }, + { + "epoch": 4.945544554455445, + "grad_norm": 0.02316289022564888, + "learning_rate": 4.676869484394627e-05, + "loss": 0.0246, + "num_input_tokens_seen": 9485856, + "step": 44955 + }, + { + "epoch": 4.946094609460946, + "grad_norm": 0.17324034869670868, + "learning_rate": 4.6767514558604596e-05, + "loss": 0.0203, + "num_input_tokens_seen": 9486912, + "step": 44960 + }, + { + "epoch": 4.946644664466446, + "grad_norm": 0.011560003273189068, + "learning_rate": 4.676633407264218e-05, + "loss": 0.0885, + "num_input_tokens_seen": 9487968, + "step": 44965 + }, + { + "epoch": 4.947194719471947, + "grad_norm": 0.3033398985862732, + "learning_rate": 4.6765153386069915e-05, + "loss": 0.0208, + "num_input_tokens_seen": 9489088, + "step": 44970 + }, + { + "epoch": 4.947744774477448, + "grad_norm": 0.144570454955101, + "learning_rate": 4.676397249889868e-05, + "loss": 0.0273, + "num_input_tokens_seen": 9490144, + "step": 44975 + }, + { + "epoch": 4.948294829482949, + "grad_norm": 0.7430587410926819, + "learning_rate": 4.676279141113934e-05, + "loss": 0.0573, + "num_input_tokens_seen": 9491200, + "step": 44980 + }, + { + "epoch": 4.948844884488449, + "grad_norm": 0.06199515238404274, + "learning_rate": 4.67616101228028e-05, + "loss": 0.0116, + "num_input_tokens_seen": 9492224, + "step": 44985 + }, + { + "epoch": 4.949394939493949, + "grad_norm": 3.354339122772217, + "learning_rate": 4.6760428633899946e-05, + "loss": 0.0815, + "num_input_tokens_seen": 9493248, + "step": 44990 + }, + { + "epoch": 4.94994499449945, + "grad_norm": 0.13154686987400055, + "learning_rate": 4.675924694444166e-05, + "loss": 0.0617, + "num_input_tokens_seen": 9494304, + "step": 44995 + }, + { + "epoch": 4.9504950495049505, + "grad_norm": 0.8125394582748413, + "learning_rate": 4.6758065054438834e-05, + "loss": 0.0262, + "num_input_tokens_seen": 9495328, + "step": 45000 + }, + { + "epoch": 4.951045104510451, + "grad_norm": 0.5719960927963257, + "learning_rate": 4.675688296390237e-05, + "loss": 0.0096, + "num_input_tokens_seen": 9496320, + "step": 45005 + }, + { + "epoch": 4.951595159515952, + "grad_norm": 0.28545689582824707, + "learning_rate": 4.675570067284315e-05, + "loss": 0.06, + "num_input_tokens_seen": 9497408, + "step": 45010 + }, + { + "epoch": 4.952145214521452, + "grad_norm": 1.097350001335144, + "learning_rate": 4.6754518181272075e-05, + "loss": 0.153, + "num_input_tokens_seen": 9498496, + "step": 45015 + }, + { + "epoch": 4.952695269526953, + "grad_norm": 0.033654067665338516, + "learning_rate": 4.6753335489200044e-05, + "loss": 0.0091, + "num_input_tokens_seen": 9499520, + "step": 45020 + }, + { + "epoch": 4.953245324532453, + "grad_norm": 1.6937060356140137, + "learning_rate": 4.6752152596637965e-05, + "loss": 0.0531, + "num_input_tokens_seen": 9500608, + "step": 45025 + }, + { + "epoch": 4.9537953795379535, + "grad_norm": 0.04887949675321579, + "learning_rate": 4.675096950359673e-05, + "loss": 0.0054, + "num_input_tokens_seen": 9501664, + "step": 45030 + }, + { + "epoch": 4.9543454345434546, + "grad_norm": 0.16903343796730042, + "learning_rate": 4.674978621008724e-05, + "loss": 0.0841, + "num_input_tokens_seen": 9502752, + "step": 45035 + }, + { + "epoch": 4.954895489548955, + "grad_norm": 0.9330942034721375, + "learning_rate": 4.6748602716120414e-05, + "loss": 0.0781, + "num_input_tokens_seen": 9503776, + "step": 45040 + }, + { + "epoch": 4.955445544554456, + "grad_norm": 1.0197807550430298, + "learning_rate": 4.674741902170715e-05, + "loss": 0.0256, + "num_input_tokens_seen": 9504832, + "step": 45045 + }, + { + "epoch": 4.955995599559956, + "grad_norm": 0.5407295823097229, + "learning_rate": 4.674623512685836e-05, + "loss": 0.0208, + "num_input_tokens_seen": 9505856, + "step": 45050 + }, + { + "epoch": 4.956545654565456, + "grad_norm": 0.3443633019924164, + "learning_rate": 4.6745051031584964e-05, + "loss": 0.0139, + "num_input_tokens_seen": 9506976, + "step": 45055 + }, + { + "epoch": 4.957095709570957, + "grad_norm": 0.010829711332917213, + "learning_rate": 4.674386673589786e-05, + "loss": 0.0445, + "num_input_tokens_seen": 9507968, + "step": 45060 + }, + { + "epoch": 4.957645764576458, + "grad_norm": 0.6323581337928772, + "learning_rate": 4.674268223980797e-05, + "loss": 0.0265, + "num_input_tokens_seen": 9509120, + "step": 45065 + }, + { + "epoch": 4.958195819581958, + "grad_norm": 1.1385040283203125, + "learning_rate": 4.6741497543326206e-05, + "loss": 0.1053, + "num_input_tokens_seen": 9510208, + "step": 45070 + }, + { + "epoch": 4.958745874587459, + "grad_norm": 0.1169322282075882, + "learning_rate": 4.67403126464635e-05, + "loss": 0.0325, + "num_input_tokens_seen": 9511296, + "step": 45075 + }, + { + "epoch": 4.959295929592959, + "grad_norm": 0.08364185690879822, + "learning_rate": 4.6739127549230754e-05, + "loss": 0.0193, + "num_input_tokens_seen": 9512448, + "step": 45080 + }, + { + "epoch": 4.95984598459846, + "grad_norm": 0.03281806781888008, + "learning_rate": 4.673794225163891e-05, + "loss": 0.0598, + "num_input_tokens_seen": 9513472, + "step": 45085 + }, + { + "epoch": 4.96039603960396, + "grad_norm": 0.17154155671596527, + "learning_rate": 4.6736756753698884e-05, + "loss": 0.0231, + "num_input_tokens_seen": 9514464, + "step": 45090 + }, + { + "epoch": 4.960946094609461, + "grad_norm": 0.054785508662462234, + "learning_rate": 4.6735571055421604e-05, + "loss": 0.0052, + "num_input_tokens_seen": 9515616, + "step": 45095 + }, + { + "epoch": 4.961496149614962, + "grad_norm": 0.2737433612346649, + "learning_rate": 4.673438515681798e-05, + "loss": 0.0179, + "num_input_tokens_seen": 9516608, + "step": 45100 + }, + { + "epoch": 4.962046204620462, + "grad_norm": 1.5323220491409302, + "learning_rate": 4.673319905789897e-05, + "loss": 0.0977, + "num_input_tokens_seen": 9517632, + "step": 45105 + }, + { + "epoch": 4.962596259625963, + "grad_norm": 0.16239382326602936, + "learning_rate": 4.673201275867549e-05, + "loss": 0.0621, + "num_input_tokens_seen": 9518656, + "step": 45110 + }, + { + "epoch": 4.963146314631463, + "grad_norm": 1.7281068563461304, + "learning_rate": 4.673082625915847e-05, + "loss": 0.1297, + "num_input_tokens_seen": 9519648, + "step": 45115 + }, + { + "epoch": 4.963696369636963, + "grad_norm": 0.08062082529067993, + "learning_rate": 4.672963955935886e-05, + "loss": 0.0234, + "num_input_tokens_seen": 9520672, + "step": 45120 + }, + { + "epoch": 4.9642464246424645, + "grad_norm": 0.005249468609690666, + "learning_rate": 4.672845265928759e-05, + "loss": 0.0319, + "num_input_tokens_seen": 9521760, + "step": 45125 + }, + { + "epoch": 4.964796479647965, + "grad_norm": 0.25365549325942993, + "learning_rate": 4.672726555895559e-05, + "loss": 0.0292, + "num_input_tokens_seen": 9522816, + "step": 45130 + }, + { + "epoch": 4.965346534653465, + "grad_norm": 0.17140105366706848, + "learning_rate": 4.672607825837382e-05, + "loss": 0.1076, + "num_input_tokens_seen": 9523936, + "step": 45135 + }, + { + "epoch": 4.965896589658966, + "grad_norm": 0.0973394587635994, + "learning_rate": 4.67248907575532e-05, + "loss": 0.0699, + "num_input_tokens_seen": 9524928, + "step": 45140 + }, + { + "epoch": 4.966446644664466, + "grad_norm": 0.006954425945878029, + "learning_rate": 4.6723703056504697e-05, + "loss": 0.0799, + "num_input_tokens_seen": 9525888, + "step": 45145 + }, + { + "epoch": 4.966996699669967, + "grad_norm": 0.5031284689903259, + "learning_rate": 4.6722515155239245e-05, + "loss": 0.0427, + "num_input_tokens_seen": 9526944, + "step": 45150 + }, + { + "epoch": 4.9675467546754675, + "grad_norm": 0.5432925820350647, + "learning_rate": 4.672132705376779e-05, + "loss": 0.0249, + "num_input_tokens_seen": 9528032, + "step": 45155 + }, + { + "epoch": 4.968096809680969, + "grad_norm": 0.044373124837875366, + "learning_rate": 4.6720138752101287e-05, + "loss": 0.0228, + "num_input_tokens_seen": 9529056, + "step": 45160 + }, + { + "epoch": 4.968646864686469, + "grad_norm": 1.1685469150543213, + "learning_rate": 4.671895025025069e-05, + "loss": 0.0408, + "num_input_tokens_seen": 9530080, + "step": 45165 + }, + { + "epoch": 4.969196919691969, + "grad_norm": 0.03036479838192463, + "learning_rate": 4.6717761548226946e-05, + "loss": 0.0689, + "num_input_tokens_seen": 9531072, + "step": 45170 + }, + { + "epoch": 4.96974697469747, + "grad_norm": 0.33064720034599304, + "learning_rate": 4.6716572646041024e-05, + "loss": 0.0392, + "num_input_tokens_seen": 9532096, + "step": 45175 + }, + { + "epoch": 4.97029702970297, + "grad_norm": 3.2538797855377197, + "learning_rate": 4.671538354370386e-05, + "loss": 0.0458, + "num_input_tokens_seen": 9533120, + "step": 45180 + }, + { + "epoch": 4.9708470847084705, + "grad_norm": 0.09243372082710266, + "learning_rate": 4.6714194241226436e-05, + "loss": 0.0609, + "num_input_tokens_seen": 9534144, + "step": 45185 + }, + { + "epoch": 4.971397139713972, + "grad_norm": 0.49906203150749207, + "learning_rate": 4.6713004738619705e-05, + "loss": 0.0377, + "num_input_tokens_seen": 9535264, + "step": 45190 + }, + { + "epoch": 4.971947194719472, + "grad_norm": 0.01403302326798439, + "learning_rate": 4.671181503589461e-05, + "loss": 0.0217, + "num_input_tokens_seen": 9536288, + "step": 45195 + }, + { + "epoch": 4.972497249724973, + "grad_norm": 0.9881892800331116, + "learning_rate": 4.6710625133062155e-05, + "loss": 0.0798, + "num_input_tokens_seen": 9537376, + "step": 45200 + }, + { + "epoch": 4.973047304730473, + "grad_norm": 0.03303268924355507, + "learning_rate": 4.670943503013328e-05, + "loss": 0.0098, + "num_input_tokens_seen": 9538432, + "step": 45205 + }, + { + "epoch": 4.973597359735973, + "grad_norm": 0.018314125016331673, + "learning_rate": 4.6708244727118955e-05, + "loss": 0.0232, + "num_input_tokens_seen": 9539488, + "step": 45210 + }, + { + "epoch": 4.974147414741474, + "grad_norm": 2.1280055046081543, + "learning_rate": 4.670705422403016e-05, + "loss": 0.061, + "num_input_tokens_seen": 9540512, + "step": 45215 + }, + { + "epoch": 4.974697469746975, + "grad_norm": 0.03934537619352341, + "learning_rate": 4.670586352087786e-05, + "loss": 0.1276, + "num_input_tokens_seen": 9541600, + "step": 45220 + }, + { + "epoch": 4.975247524752476, + "grad_norm": 0.5532460808753967, + "learning_rate": 4.670467261767303e-05, + "loss": 0.0235, + "num_input_tokens_seen": 9542624, + "step": 45225 + }, + { + "epoch": 4.975797579757976, + "grad_norm": 0.08499474823474884, + "learning_rate": 4.670348151442665e-05, + "loss": 0.0905, + "num_input_tokens_seen": 9543584, + "step": 45230 + }, + { + "epoch": 4.976347634763476, + "grad_norm": 0.14880695939064026, + "learning_rate": 4.670229021114969e-05, + "loss": 0.0425, + "num_input_tokens_seen": 9544608, + "step": 45235 + }, + { + "epoch": 4.976897689768977, + "grad_norm": 0.24727816879749298, + "learning_rate": 4.670109870785314e-05, + "loss": 0.0168, + "num_input_tokens_seen": 9545728, + "step": 45240 + }, + { + "epoch": 4.977447744774477, + "grad_norm": 0.30679425597190857, + "learning_rate": 4.669990700454797e-05, + "loss": 0.0776, + "num_input_tokens_seen": 9546816, + "step": 45245 + }, + { + "epoch": 4.977997799779978, + "grad_norm": 0.6486489176750183, + "learning_rate": 4.6698715101245186e-05, + "loss": 0.0208, + "num_input_tokens_seen": 9547872, + "step": 45250 + }, + { + "epoch": 4.978547854785479, + "grad_norm": 0.9524360299110413, + "learning_rate": 4.669752299795574e-05, + "loss": 0.0885, + "num_input_tokens_seen": 9548864, + "step": 45255 + }, + { + "epoch": 4.979097909790979, + "grad_norm": 0.03948467597365379, + "learning_rate": 4.669633069469064e-05, + "loss": 0.0701, + "num_input_tokens_seen": 9549952, + "step": 45260 + }, + { + "epoch": 4.97964796479648, + "grad_norm": 0.05189003795385361, + "learning_rate": 4.669513819146087e-05, + "loss": 0.114, + "num_input_tokens_seen": 9551040, + "step": 45265 + }, + { + "epoch": 4.98019801980198, + "grad_norm": 0.06754700839519501, + "learning_rate": 4.669394548827742e-05, + "loss": 0.004, + "num_input_tokens_seen": 9552064, + "step": 45270 + }, + { + "epoch": 4.98074807480748, + "grad_norm": 0.05475761741399765, + "learning_rate": 4.66927525851513e-05, + "loss": 0.1026, + "num_input_tokens_seen": 9553120, + "step": 45275 + }, + { + "epoch": 4.9812981298129815, + "grad_norm": 1.4035462141036987, + "learning_rate": 4.669155948209347e-05, + "loss": 0.0704, + "num_input_tokens_seen": 9554240, + "step": 45280 + }, + { + "epoch": 4.981848184818482, + "grad_norm": 0.8682942986488342, + "learning_rate": 4.669036617911495e-05, + "loss": 0.0993, + "num_input_tokens_seen": 9555360, + "step": 45285 + }, + { + "epoch": 4.982398239823983, + "grad_norm": 0.7524265646934509, + "learning_rate": 4.6689172676226744e-05, + "loss": 0.0349, + "num_input_tokens_seen": 9556384, + "step": 45290 + }, + { + "epoch": 4.982948294829483, + "grad_norm": 0.09823606163263321, + "learning_rate": 4.6687978973439825e-05, + "loss": 0.0846, + "num_input_tokens_seen": 9557472, + "step": 45295 + }, + { + "epoch": 4.983498349834983, + "grad_norm": 0.8682717680931091, + "learning_rate": 4.668678507076522e-05, + "loss": 0.088, + "num_input_tokens_seen": 9558528, + "step": 45300 + }, + { + "epoch": 4.984048404840484, + "grad_norm": 0.1141308844089508, + "learning_rate": 4.6685590968213924e-05, + "loss": 0.1247, + "num_input_tokens_seen": 9559616, + "step": 45305 + }, + { + "epoch": 4.9845984598459845, + "grad_norm": 0.2528202533721924, + "learning_rate": 4.668439666579694e-05, + "loss": 0.0496, + "num_input_tokens_seen": 9560672, + "step": 45310 + }, + { + "epoch": 4.985148514851485, + "grad_norm": 0.058206167072057724, + "learning_rate": 4.668320216352527e-05, + "loss": 0.0179, + "num_input_tokens_seen": 9561760, + "step": 45315 + }, + { + "epoch": 4.985698569856986, + "grad_norm": 1.5832791328430176, + "learning_rate": 4.668200746140994e-05, + "loss": 0.0608, + "num_input_tokens_seen": 9562784, + "step": 45320 + }, + { + "epoch": 4.986248624862486, + "grad_norm": 0.022422274574637413, + "learning_rate": 4.668081255946195e-05, + "loss": 0.0492, + "num_input_tokens_seen": 9563808, + "step": 45325 + }, + { + "epoch": 4.986798679867987, + "grad_norm": 0.049418989568948746, + "learning_rate": 4.667961745769231e-05, + "loss": 0.0146, + "num_input_tokens_seen": 9564832, + "step": 45330 + }, + { + "epoch": 4.987348734873487, + "grad_norm": 0.04704835265874863, + "learning_rate": 4.667842215611204e-05, + "loss": 0.0242, + "num_input_tokens_seen": 9565888, + "step": 45335 + }, + { + "epoch": 4.987898789878988, + "grad_norm": 0.1827760487794876, + "learning_rate": 4.6677226654732154e-05, + "loss": 0.0709, + "num_input_tokens_seen": 9566880, + "step": 45340 + }, + { + "epoch": 4.988448844884489, + "grad_norm": 0.03587857261300087, + "learning_rate": 4.667603095356368e-05, + "loss": 0.056, + "num_input_tokens_seen": 9567936, + "step": 45345 + }, + { + "epoch": 4.988998899889989, + "grad_norm": 0.18204186856746674, + "learning_rate": 4.667483505261762e-05, + "loss": 0.0221, + "num_input_tokens_seen": 9569024, + "step": 45350 + }, + { + "epoch": 4.98954895489549, + "grad_norm": 0.16231180727481842, + "learning_rate": 4.667363895190501e-05, + "loss": 0.024, + "num_input_tokens_seen": 9570112, + "step": 45355 + }, + { + "epoch": 4.99009900990099, + "grad_norm": 0.1887415498495102, + "learning_rate": 4.6672442651436864e-05, + "loss": 0.0353, + "num_input_tokens_seen": 9571136, + "step": 45360 + }, + { + "epoch": 4.99064906490649, + "grad_norm": 0.38261011242866516, + "learning_rate": 4.667124615122422e-05, + "loss": 0.0891, + "num_input_tokens_seen": 9572224, + "step": 45365 + }, + { + "epoch": 4.991199119911991, + "grad_norm": 0.10738732665777206, + "learning_rate": 4.66700494512781e-05, + "loss": 0.0344, + "num_input_tokens_seen": 9573312, + "step": 45370 + }, + { + "epoch": 4.991749174917492, + "grad_norm": 0.008248268626630306, + "learning_rate": 4.6668852551609524e-05, + "loss": 0.084, + "num_input_tokens_seen": 9574336, + "step": 45375 + }, + { + "epoch": 4.992299229922993, + "grad_norm": 0.1778036206960678, + "learning_rate": 4.666765545222954e-05, + "loss": 0.0111, + "num_input_tokens_seen": 9575360, + "step": 45380 + }, + { + "epoch": 4.992849284928493, + "grad_norm": 0.029480187222361565, + "learning_rate": 4.666645815314917e-05, + "loss": 0.0068, + "num_input_tokens_seen": 9576416, + "step": 45385 + }, + { + "epoch": 4.993399339933993, + "grad_norm": 0.22732000052928925, + "learning_rate": 4.666526065437945e-05, + "loss": 0.0456, + "num_input_tokens_seen": 9577408, + "step": 45390 + }, + { + "epoch": 4.993949394939494, + "grad_norm": 1.0002574920654297, + "learning_rate": 4.666406295593143e-05, + "loss": 0.0704, + "num_input_tokens_seen": 9578432, + "step": 45395 + }, + { + "epoch": 4.994499449944994, + "grad_norm": 0.7380262613296509, + "learning_rate": 4.6662865057816125e-05, + "loss": 0.0246, + "num_input_tokens_seen": 9579520, + "step": 45400 + }, + { + "epoch": 4.9950495049504955, + "grad_norm": 0.08180281519889832, + "learning_rate": 4.666166696004459e-05, + "loss": 0.1024, + "num_input_tokens_seen": 9580576, + "step": 45405 + }, + { + "epoch": 4.995599559955996, + "grad_norm": 0.25417208671569824, + "learning_rate": 4.666046866262787e-05, + "loss": 0.0076, + "num_input_tokens_seen": 9581632, + "step": 45410 + }, + { + "epoch": 4.996149614961496, + "grad_norm": 1.735480546951294, + "learning_rate": 4.6659270165577e-05, + "loss": 0.1061, + "num_input_tokens_seen": 9582688, + "step": 45415 + }, + { + "epoch": 4.996699669966997, + "grad_norm": 0.5005389451980591, + "learning_rate": 4.665807146890303e-05, + "loss": 0.0237, + "num_input_tokens_seen": 9583744, + "step": 45420 + }, + { + "epoch": 4.997249724972497, + "grad_norm": 0.1780199557542801, + "learning_rate": 4.665687257261701e-05, + "loss": 0.0986, + "num_input_tokens_seen": 9584800, + "step": 45425 + }, + { + "epoch": 4.997799779977997, + "grad_norm": 1.3994706869125366, + "learning_rate": 4.665567347672998e-05, + "loss": 0.0991, + "num_input_tokens_seen": 9585920, + "step": 45430 + }, + { + "epoch": 4.9983498349834985, + "grad_norm": 0.08275965601205826, + "learning_rate": 4.6654474181253e-05, + "loss": 0.0536, + "num_input_tokens_seen": 9587008, + "step": 45435 + }, + { + "epoch": 4.998899889988999, + "grad_norm": 0.32925674319267273, + "learning_rate": 4.665327468619713e-05, + "loss": 0.1271, + "num_input_tokens_seen": 9587968, + "step": 45440 + }, + { + "epoch": 4.9994499449945, + "grad_norm": 0.023754647001624107, + "learning_rate": 4.6652074991573416e-05, + "loss": 0.0884, + "num_input_tokens_seen": 9589056, + "step": 45445 + }, + { + "epoch": 5.0, + "grad_norm": 0.008203749544918537, + "learning_rate": 4.665087509739291e-05, + "loss": 0.0141, + "num_input_tokens_seen": 9590080, + "step": 45450 + }, + { + "epoch": 5.0, + "eval_loss": 0.0657351166009903, + "eval_runtime": 37.0124, + "eval_samples_per_second": 109.153, + "eval_steps_per_second": 27.288, + "num_input_tokens_seen": 9590080, + "step": 45450 + }, + { + "epoch": 5.0005500550055, + "grad_norm": 0.15629874169826508, + "learning_rate": 4.6649675003666684e-05, + "loss": 0.0572, + "num_input_tokens_seen": 9591168, + "step": 45455 + }, + { + "epoch": 5.001100110011001, + "grad_norm": 0.10307160019874573, + "learning_rate": 4.664847471040579e-05, + "loss": 0.0582, + "num_input_tokens_seen": 9592192, + "step": 45460 + }, + { + "epoch": 5.0016501650165015, + "grad_norm": 0.04014645516872406, + "learning_rate": 4.664727421762129e-05, + "loss": 0.091, + "num_input_tokens_seen": 9593248, + "step": 45465 + }, + { + "epoch": 5.002200220022003, + "grad_norm": 0.0606563426554203, + "learning_rate": 4.664607352532425e-05, + "loss": 0.0206, + "num_input_tokens_seen": 9594304, + "step": 45470 + }, + { + "epoch": 5.002750275027503, + "grad_norm": 0.029675407335162163, + "learning_rate": 4.6644872633525745e-05, + "loss": 0.0122, + "num_input_tokens_seen": 9595328, + "step": 45475 + }, + { + "epoch": 5.003300330033003, + "grad_norm": 1.8436146974563599, + "learning_rate": 4.6643671542236824e-05, + "loss": 0.0649, + "num_input_tokens_seen": 9596320, + "step": 45480 + }, + { + "epoch": 5.003850385038504, + "grad_norm": 0.816588819026947, + "learning_rate": 4.6642470251468575e-05, + "loss": 0.0424, + "num_input_tokens_seen": 9597408, + "step": 45485 + }, + { + "epoch": 5.004400440044004, + "grad_norm": 0.054114192724227905, + "learning_rate": 4.664126876123206e-05, + "loss": 0.0779, + "num_input_tokens_seen": 9598464, + "step": 45490 + }, + { + "epoch": 5.0049504950495045, + "grad_norm": 0.04459875077009201, + "learning_rate": 4.664006707153836e-05, + "loss": 0.0628, + "num_input_tokens_seen": 9599520, + "step": 45495 + }, + { + "epoch": 5.005500550055006, + "grad_norm": 0.10183358937501907, + "learning_rate": 4.663886518239855e-05, + "loss": 0.0742, + "num_input_tokens_seen": 9600576, + "step": 45500 + }, + { + "epoch": 5.006050605060506, + "grad_norm": 0.3596498370170593, + "learning_rate": 4.663766309382369e-05, + "loss": 0.09, + "num_input_tokens_seen": 9601664, + "step": 45505 + }, + { + "epoch": 5.006600660066007, + "grad_norm": 0.16608533263206482, + "learning_rate": 4.663646080582488e-05, + "loss": 0.0149, + "num_input_tokens_seen": 9602784, + "step": 45510 + }, + { + "epoch": 5.007150715071507, + "grad_norm": 0.8155417442321777, + "learning_rate": 4.6635258318413186e-05, + "loss": 0.0523, + "num_input_tokens_seen": 9603808, + "step": 45515 + }, + { + "epoch": 5.007700770077007, + "grad_norm": 0.13786178827285767, + "learning_rate": 4.663405563159971e-05, + "loss": 0.0427, + "num_input_tokens_seen": 9604864, + "step": 45520 + }, + { + "epoch": 5.008250825082508, + "grad_norm": 0.5732167959213257, + "learning_rate": 4.663285274539552e-05, + "loss": 0.125, + "num_input_tokens_seen": 9605984, + "step": 45525 + }, + { + "epoch": 5.008800880088009, + "grad_norm": 0.2195339947938919, + "learning_rate": 4.66316496598117e-05, + "loss": 0.0395, + "num_input_tokens_seen": 9607072, + "step": 45530 + }, + { + "epoch": 5.00935093509351, + "grad_norm": 0.6995657086372375, + "learning_rate": 4.663044637485935e-05, + "loss": 0.0713, + "num_input_tokens_seen": 9608096, + "step": 45535 + }, + { + "epoch": 5.00990099009901, + "grad_norm": 1.2748934030532837, + "learning_rate": 4.662924289054955e-05, + "loss": 0.1859, + "num_input_tokens_seen": 9609216, + "step": 45540 + }, + { + "epoch": 5.01045104510451, + "grad_norm": 0.3617134392261505, + "learning_rate": 4.66280392068934e-05, + "loss": 0.0389, + "num_input_tokens_seen": 9610272, + "step": 45545 + }, + { + "epoch": 5.011001100110011, + "grad_norm": 0.09164033085107803, + "learning_rate": 4.6626835323902e-05, + "loss": 0.0492, + "num_input_tokens_seen": 9611328, + "step": 45550 + }, + { + "epoch": 5.011551155115511, + "grad_norm": 0.19943812489509583, + "learning_rate": 4.662563124158642e-05, + "loss": 0.0394, + "num_input_tokens_seen": 9612384, + "step": 45555 + }, + { + "epoch": 5.0121012101210125, + "grad_norm": 0.08625148981809616, + "learning_rate": 4.6624426959957785e-05, + "loss": 0.0283, + "num_input_tokens_seen": 9613504, + "step": 45560 + }, + { + "epoch": 5.012651265126513, + "grad_norm": 0.13801003992557526, + "learning_rate": 4.6623222479027176e-05, + "loss": 0.0949, + "num_input_tokens_seen": 9614560, + "step": 45565 + }, + { + "epoch": 5.013201320132013, + "grad_norm": 0.06845070421695709, + "learning_rate": 4.6622017798805705e-05, + "loss": 0.0472, + "num_input_tokens_seen": 9615680, + "step": 45570 + }, + { + "epoch": 5.013751375137514, + "grad_norm": 0.016969474032521248, + "learning_rate": 4.6620812919304474e-05, + "loss": 0.0262, + "num_input_tokens_seen": 9616704, + "step": 45575 + }, + { + "epoch": 5.014301430143014, + "grad_norm": 0.3795265257358551, + "learning_rate": 4.661960784053459e-05, + "loss": 0.0297, + "num_input_tokens_seen": 9617760, + "step": 45580 + }, + { + "epoch": 5.014851485148514, + "grad_norm": 0.24987660348415375, + "learning_rate": 4.661840256250714e-05, + "loss": 0.0449, + "num_input_tokens_seen": 9618784, + "step": 45585 + }, + { + "epoch": 5.0154015401540155, + "grad_norm": 1.2427829504013062, + "learning_rate": 4.6617197085233256e-05, + "loss": 0.0943, + "num_input_tokens_seen": 9619872, + "step": 45590 + }, + { + "epoch": 5.015951595159516, + "grad_norm": 1.818855881690979, + "learning_rate": 4.661599140872404e-05, + "loss": 0.0847, + "num_input_tokens_seen": 9620896, + "step": 45595 + }, + { + "epoch": 5.016501650165017, + "grad_norm": 0.03393121063709259, + "learning_rate": 4.6614785532990595e-05, + "loss": 0.0109, + "num_input_tokens_seen": 9621920, + "step": 45600 + }, + { + "epoch": 5.017051705170517, + "grad_norm": 0.06732961535453796, + "learning_rate": 4.661357945804405e-05, + "loss": 0.0164, + "num_input_tokens_seen": 9622976, + "step": 45605 + }, + { + "epoch": 5.017601760176017, + "grad_norm": 0.11130048334598541, + "learning_rate": 4.6612373183895516e-05, + "loss": 0.0166, + "num_input_tokens_seen": 9623968, + "step": 45610 + }, + { + "epoch": 5.018151815181518, + "grad_norm": 0.50323486328125, + "learning_rate": 4.6611166710556104e-05, + "loss": 0.0292, + "num_input_tokens_seen": 9625056, + "step": 45615 + }, + { + "epoch": 5.0187018701870185, + "grad_norm": 0.7576644420623779, + "learning_rate": 4.660996003803694e-05, + "loss": 0.0693, + "num_input_tokens_seen": 9626112, + "step": 45620 + }, + { + "epoch": 5.01925192519252, + "grad_norm": 0.47914350032806396, + "learning_rate": 4.660875316634915e-05, + "loss": 0.0494, + "num_input_tokens_seen": 9627168, + "step": 45625 + }, + { + "epoch": 5.01980198019802, + "grad_norm": 0.43054983019828796, + "learning_rate": 4.660754609550384e-05, + "loss": 0.0168, + "num_input_tokens_seen": 9628256, + "step": 45630 + }, + { + "epoch": 5.02035203520352, + "grad_norm": 0.007445377763360739, + "learning_rate": 4.660633882551215e-05, + "loss": 0.0399, + "num_input_tokens_seen": 9629280, + "step": 45635 + }, + { + "epoch": 5.020902090209021, + "grad_norm": 0.9634773135185242, + "learning_rate": 4.660513135638521e-05, + "loss": 0.0427, + "num_input_tokens_seen": 9630304, + "step": 45640 + }, + { + "epoch": 5.021452145214521, + "grad_norm": 0.38073310256004333, + "learning_rate": 4.660392368813414e-05, + "loss": 0.0276, + "num_input_tokens_seen": 9631328, + "step": 45645 + }, + { + "epoch": 5.022002200220022, + "grad_norm": 1.5615850687026978, + "learning_rate": 4.660271582077007e-05, + "loss": 0.0914, + "num_input_tokens_seen": 9632352, + "step": 45650 + }, + { + "epoch": 5.022552255225523, + "grad_norm": 0.09797509759664536, + "learning_rate": 4.660150775430413e-05, + "loss": 0.0221, + "num_input_tokens_seen": 9633408, + "step": 45655 + }, + { + "epoch": 5.023102310231023, + "grad_norm": 0.2280082404613495, + "learning_rate": 4.6600299488747456e-05, + "loss": 0.0266, + "num_input_tokens_seen": 9634528, + "step": 45660 + }, + { + "epoch": 5.023652365236524, + "grad_norm": 0.28185757994651794, + "learning_rate": 4.6599091024111194e-05, + "loss": 0.0549, + "num_input_tokens_seen": 9635616, + "step": 45665 + }, + { + "epoch": 5.024202420242024, + "grad_norm": 0.15088485181331635, + "learning_rate": 4.6597882360406475e-05, + "loss": 0.0138, + "num_input_tokens_seen": 9636672, + "step": 45670 + }, + { + "epoch": 5.024752475247524, + "grad_norm": 0.06534681469202042, + "learning_rate": 4.6596673497644427e-05, + "loss": 0.0067, + "num_input_tokens_seen": 9637664, + "step": 45675 + }, + { + "epoch": 5.025302530253025, + "grad_norm": 0.07043851166963577, + "learning_rate": 4.659546443583621e-05, + "loss": 0.0539, + "num_input_tokens_seen": 9638752, + "step": 45680 + }, + { + "epoch": 5.025852585258526, + "grad_norm": 0.30396080017089844, + "learning_rate": 4.659425517499296e-05, + "loss": 0.0348, + "num_input_tokens_seen": 9639872, + "step": 45685 + }, + { + "epoch": 5.026402640264027, + "grad_norm": 1.6737366914749146, + "learning_rate": 4.659304571512583e-05, + "loss": 0.1143, + "num_input_tokens_seen": 9640960, + "step": 45690 + }, + { + "epoch": 5.026952695269527, + "grad_norm": 0.0205826498568058, + "learning_rate": 4.6591836056245944e-05, + "loss": 0.0976, + "num_input_tokens_seen": 9642016, + "step": 45695 + }, + { + "epoch": 5.027502750275027, + "grad_norm": 0.08066998422145844, + "learning_rate": 4.6590626198364476e-05, + "loss": 0.0197, + "num_input_tokens_seen": 9643072, + "step": 45700 + }, + { + "epoch": 5.028052805280528, + "grad_norm": 0.11522924154996872, + "learning_rate": 4.658941614149256e-05, + "loss": 0.0635, + "num_input_tokens_seen": 9644096, + "step": 45705 + }, + { + "epoch": 5.028602860286028, + "grad_norm": 0.18843095004558563, + "learning_rate": 4.6588205885641355e-05, + "loss": 0.0551, + "num_input_tokens_seen": 9645184, + "step": 45710 + }, + { + "epoch": 5.0291529152915295, + "grad_norm": 0.08781960606575012, + "learning_rate": 4.6586995430822024e-05, + "loss": 0.0558, + "num_input_tokens_seen": 9646208, + "step": 45715 + }, + { + "epoch": 5.02970297029703, + "grad_norm": 0.008127058856189251, + "learning_rate": 4.658578477704571e-05, + "loss": 0.0079, + "num_input_tokens_seen": 9647296, + "step": 45720 + }, + { + "epoch": 5.03025302530253, + "grad_norm": 0.0048417020589113235, + "learning_rate": 4.658457392432357e-05, + "loss": 0.1027, + "num_input_tokens_seen": 9648480, + "step": 45725 + }, + { + "epoch": 5.030803080308031, + "grad_norm": 0.08180835098028183, + "learning_rate": 4.658336287266678e-05, + "loss": 0.063, + "num_input_tokens_seen": 9649568, + "step": 45730 + }, + { + "epoch": 5.031353135313531, + "grad_norm": 0.2572840750217438, + "learning_rate": 4.6582151622086484e-05, + "loss": 0.0781, + "num_input_tokens_seen": 9650656, + "step": 45735 + }, + { + "epoch": 5.031903190319032, + "grad_norm": 0.2754251956939697, + "learning_rate": 4.6580940172593855e-05, + "loss": 0.1097, + "num_input_tokens_seen": 9651712, + "step": 45740 + }, + { + "epoch": 5.0324532453245325, + "grad_norm": 0.5429680347442627, + "learning_rate": 4.657972852420005e-05, + "loss": 0.0628, + "num_input_tokens_seen": 9652768, + "step": 45745 + }, + { + "epoch": 5.033003300330033, + "grad_norm": 0.016631508246064186, + "learning_rate": 4.657851667691625e-05, + "loss": 0.0571, + "num_input_tokens_seen": 9653824, + "step": 45750 + }, + { + "epoch": 5.033553355335534, + "grad_norm": 0.25585848093032837, + "learning_rate": 4.657730463075362e-05, + "loss": 0.0544, + "num_input_tokens_seen": 9654880, + "step": 45755 + }, + { + "epoch": 5.034103410341034, + "grad_norm": 0.01071752980351448, + "learning_rate": 4.6576092385723315e-05, + "loss": 0.0215, + "num_input_tokens_seen": 9656000, + "step": 45760 + }, + { + "epoch": 5.034653465346534, + "grad_norm": 0.22771331667900085, + "learning_rate": 4.657487994183652e-05, + "loss": 0.0742, + "num_input_tokens_seen": 9657024, + "step": 45765 + }, + { + "epoch": 5.035203520352035, + "grad_norm": 0.18420158326625824, + "learning_rate": 4.657366729910442e-05, + "loss": 0.0467, + "num_input_tokens_seen": 9658176, + "step": 45770 + }, + { + "epoch": 5.0357535753575355, + "grad_norm": 0.01107668038457632, + "learning_rate": 4.657245445753817e-05, + "loss": 0.0258, + "num_input_tokens_seen": 9659168, + "step": 45775 + }, + { + "epoch": 5.036303630363037, + "grad_norm": 0.12428862601518631, + "learning_rate": 4.6571241417148966e-05, + "loss": 0.135, + "num_input_tokens_seen": 9660288, + "step": 45780 + }, + { + "epoch": 5.036853685368537, + "grad_norm": 0.20596136152744293, + "learning_rate": 4.6570028177947974e-05, + "loss": 0.0187, + "num_input_tokens_seen": 9661344, + "step": 45785 + }, + { + "epoch": 5.037403740374037, + "grad_norm": 0.049812763929367065, + "learning_rate": 4.6568814739946385e-05, + "loss": 0.0062, + "num_input_tokens_seen": 9662400, + "step": 45790 + }, + { + "epoch": 5.037953795379538, + "grad_norm": 0.3876461982727051, + "learning_rate": 4.6567601103155386e-05, + "loss": 0.0245, + "num_input_tokens_seen": 9663488, + "step": 45795 + }, + { + "epoch": 5.038503850385038, + "grad_norm": 0.8335099816322327, + "learning_rate": 4.656638726758615e-05, + "loss": 0.0492, + "num_input_tokens_seen": 9664576, + "step": 45800 + }, + { + "epoch": 5.039053905390539, + "grad_norm": 0.3526975214481354, + "learning_rate": 4.6565173233249874e-05, + "loss": 0.0612, + "num_input_tokens_seen": 9665632, + "step": 45805 + }, + { + "epoch": 5.03960396039604, + "grad_norm": 1.4304354190826416, + "learning_rate": 4.6563959000157756e-05, + "loss": 0.0291, + "num_input_tokens_seen": 9666656, + "step": 45810 + }, + { + "epoch": 5.04015401540154, + "grad_norm": 0.20475102961063385, + "learning_rate": 4.656274456832096e-05, + "loss": 0.0582, + "num_input_tokens_seen": 9667680, + "step": 45815 + }, + { + "epoch": 5.040704070407041, + "grad_norm": 0.024526357650756836, + "learning_rate": 4.6561529937750695e-05, + "loss": 0.012, + "num_input_tokens_seen": 9668672, + "step": 45820 + }, + { + "epoch": 5.041254125412541, + "grad_norm": 0.7776287198066711, + "learning_rate": 4.6560315108458154e-05, + "loss": 0.0718, + "num_input_tokens_seen": 9669728, + "step": 45825 + }, + { + "epoch": 5.041804180418042, + "grad_norm": 2.3223469257354736, + "learning_rate": 4.6559100080454536e-05, + "loss": 0.1537, + "num_input_tokens_seen": 9670816, + "step": 45830 + }, + { + "epoch": 5.042354235423542, + "grad_norm": 0.41060182452201843, + "learning_rate": 4.655788485375104e-05, + "loss": 0.0125, + "num_input_tokens_seen": 9671936, + "step": 45835 + }, + { + "epoch": 5.042904290429043, + "grad_norm": 0.35427194833755493, + "learning_rate": 4.655666942835887e-05, + "loss": 0.0456, + "num_input_tokens_seen": 9672960, + "step": 45840 + }, + { + "epoch": 5.043454345434544, + "grad_norm": 0.09562774747610092, + "learning_rate": 4.6555453804289215e-05, + "loss": 0.0144, + "num_input_tokens_seen": 9674016, + "step": 45845 + }, + { + "epoch": 5.044004400440044, + "grad_norm": 0.03458615019917488, + "learning_rate": 4.655423798155328e-05, + "loss": 0.0285, + "num_input_tokens_seen": 9675008, + "step": 45850 + }, + { + "epoch": 5.044554455445544, + "grad_norm": 0.08027534186840057, + "learning_rate": 4.655302196016228e-05, + "loss": 0.0281, + "num_input_tokens_seen": 9676032, + "step": 45855 + }, + { + "epoch": 5.045104510451045, + "grad_norm": 0.025233402848243713, + "learning_rate": 4.6551805740127426e-05, + "loss": 0.0118, + "num_input_tokens_seen": 9677120, + "step": 45860 + }, + { + "epoch": 5.0456545654565454, + "grad_norm": 0.2819673717021942, + "learning_rate": 4.655058932145991e-05, + "loss": 0.01, + "num_input_tokens_seen": 9678144, + "step": 45865 + }, + { + "epoch": 5.0462046204620465, + "grad_norm": 0.042788732796907425, + "learning_rate": 4.6549372704170956e-05, + "loss": 0.0801, + "num_input_tokens_seen": 9679232, + "step": 45870 + }, + { + "epoch": 5.046754675467547, + "grad_norm": 1.4629656076431274, + "learning_rate": 4.654815588827178e-05, + "loss": 0.0244, + "num_input_tokens_seen": 9680288, + "step": 45875 + }, + { + "epoch": 5.047304730473047, + "grad_norm": 0.023176252841949463, + "learning_rate": 4.6546938873773585e-05, + "loss": 0.0195, + "num_input_tokens_seen": 9681408, + "step": 45880 + }, + { + "epoch": 5.047854785478548, + "grad_norm": 0.006014527752995491, + "learning_rate": 4.65457216606876e-05, + "loss": 0.0426, + "num_input_tokens_seen": 9682528, + "step": 45885 + }, + { + "epoch": 5.048404840484048, + "grad_norm": 0.01195763610303402, + "learning_rate": 4.654450424902503e-05, + "loss": 0.0108, + "num_input_tokens_seen": 9683584, + "step": 45890 + }, + { + "epoch": 5.048954895489549, + "grad_norm": 0.34843480587005615, + "learning_rate": 4.65432866387971e-05, + "loss": 0.1378, + "num_input_tokens_seen": 9684608, + "step": 45895 + }, + { + "epoch": 5.0495049504950495, + "grad_norm": 0.18480290472507477, + "learning_rate": 4.654206883001504e-05, + "loss": 0.0299, + "num_input_tokens_seen": 9685696, + "step": 45900 + }, + { + "epoch": 5.05005500550055, + "grad_norm": 0.04267840087413788, + "learning_rate": 4.654085082269007e-05, + "loss": 0.0347, + "num_input_tokens_seen": 9686784, + "step": 45905 + }, + { + "epoch": 5.050605060506051, + "grad_norm": 0.024599280208349228, + "learning_rate": 4.653963261683342e-05, + "loss": 0.0295, + "num_input_tokens_seen": 9687840, + "step": 45910 + }, + { + "epoch": 5.051155115511551, + "grad_norm": 0.1674814671278, + "learning_rate": 4.65384142124563e-05, + "loss": 0.0299, + "num_input_tokens_seen": 9688864, + "step": 45915 + }, + { + "epoch": 5.051705170517051, + "grad_norm": 0.07918117195367813, + "learning_rate": 4.653719560956995e-05, + "loss": 0.0423, + "num_input_tokens_seen": 9689920, + "step": 45920 + }, + { + "epoch": 5.052255225522552, + "grad_norm": 1.2421104907989502, + "learning_rate": 4.653597680818561e-05, + "loss": 0.1419, + "num_input_tokens_seen": 9690976, + "step": 45925 + }, + { + "epoch": 5.052805280528053, + "grad_norm": 0.018041366711258888, + "learning_rate": 4.6534757808314505e-05, + "loss": 0.0453, + "num_input_tokens_seen": 9691968, + "step": 45930 + }, + { + "epoch": 5.053355335533554, + "grad_norm": 0.2233828753232956, + "learning_rate": 4.653353860996787e-05, + "loss": 0.0139, + "num_input_tokens_seen": 9693024, + "step": 45935 + }, + { + "epoch": 5.053905390539054, + "grad_norm": 0.08153889328241348, + "learning_rate": 4.653231921315694e-05, + "loss": 0.0081, + "num_input_tokens_seen": 9694144, + "step": 45940 + }, + { + "epoch": 5.054455445544554, + "grad_norm": 0.47827857732772827, + "learning_rate": 4.653109961789296e-05, + "loss": 0.0394, + "num_input_tokens_seen": 9695232, + "step": 45945 + }, + { + "epoch": 5.055005500550055, + "grad_norm": 0.4202873110771179, + "learning_rate": 4.652987982418716e-05, + "loss": 0.1441, + "num_input_tokens_seen": 9696288, + "step": 45950 + }, + { + "epoch": 5.055555555555555, + "grad_norm": 0.12763787806034088, + "learning_rate": 4.65286598320508e-05, + "loss": 0.017, + "num_input_tokens_seen": 9697344, + "step": 45955 + }, + { + "epoch": 5.0561056105610565, + "grad_norm": 0.9487475752830505, + "learning_rate": 4.65274396414951e-05, + "loss": 0.0328, + "num_input_tokens_seen": 9698432, + "step": 45960 + }, + { + "epoch": 5.056655665566557, + "grad_norm": 0.6535781621932983, + "learning_rate": 4.652621925253133e-05, + "loss": 0.1218, + "num_input_tokens_seen": 9699488, + "step": 45965 + }, + { + "epoch": 5.057205720572057, + "grad_norm": 0.25938206911087036, + "learning_rate": 4.652499866517072e-05, + "loss": 0.0177, + "num_input_tokens_seen": 9700544, + "step": 45970 + }, + { + "epoch": 5.057755775577558, + "grad_norm": 1.879573941230774, + "learning_rate": 4.6523777879424524e-05, + "loss": 0.1368, + "num_input_tokens_seen": 9701696, + "step": 45975 + }, + { + "epoch": 5.058305830583058, + "grad_norm": 0.03592430427670479, + "learning_rate": 4.652255689530401e-05, + "loss": 0.0348, + "num_input_tokens_seen": 9702720, + "step": 45980 + }, + { + "epoch": 5.058855885588559, + "grad_norm": 0.4958576560020447, + "learning_rate": 4.6521335712820394e-05, + "loss": 0.0228, + "num_input_tokens_seen": 9703680, + "step": 45985 + }, + { + "epoch": 5.0594059405940595, + "grad_norm": 0.8841380476951599, + "learning_rate": 4.652011433198496e-05, + "loss": 0.0679, + "num_input_tokens_seen": 9704768, + "step": 45990 + }, + { + "epoch": 5.05995599559956, + "grad_norm": 0.6477260589599609, + "learning_rate": 4.6518892752808966e-05, + "loss": 0.1323, + "num_input_tokens_seen": 9705792, + "step": 45995 + }, + { + "epoch": 5.060506050605061, + "grad_norm": 0.4117671549320221, + "learning_rate": 4.651767097530366e-05, + "loss": 0.0468, + "num_input_tokens_seen": 9706880, + "step": 46000 + }, + { + "epoch": 5.061056105610561, + "grad_norm": 0.008999881334602833, + "learning_rate": 4.651644899948031e-05, + "loss": 0.0518, + "num_input_tokens_seen": 9708000, + "step": 46005 + }, + { + "epoch": 5.061606160616061, + "grad_norm": 0.012496382929384708, + "learning_rate": 4.6515226825350174e-05, + "loss": 0.0074, + "num_input_tokens_seen": 9709024, + "step": 46010 + }, + { + "epoch": 5.062156215621562, + "grad_norm": 0.0472293347120285, + "learning_rate": 4.651400445292451e-05, + "loss": 0.036, + "num_input_tokens_seen": 9709984, + "step": 46015 + }, + { + "epoch": 5.0627062706270625, + "grad_norm": 1.209892749786377, + "learning_rate": 4.65127818822146e-05, + "loss": 0.062, + "num_input_tokens_seen": 9710944, + "step": 46020 + }, + { + "epoch": 5.063256325632564, + "grad_norm": 1.663785696029663, + "learning_rate": 4.651155911323168e-05, + "loss": 0.2284, + "num_input_tokens_seen": 9712000, + "step": 46025 + }, + { + "epoch": 5.063806380638064, + "grad_norm": 0.0703829675912857, + "learning_rate": 4.651033614598706e-05, + "loss": 0.038, + "num_input_tokens_seen": 9713024, + "step": 46030 + }, + { + "epoch": 5.064356435643564, + "grad_norm": 0.4425969123840332, + "learning_rate": 4.6509112980492e-05, + "loss": 0.0305, + "num_input_tokens_seen": 9714048, + "step": 46035 + }, + { + "epoch": 5.064906490649065, + "grad_norm": 1.160089373588562, + "learning_rate": 4.6507889616757756e-05, + "loss": 0.0435, + "num_input_tokens_seen": 9715168, + "step": 46040 + }, + { + "epoch": 5.065456545654565, + "grad_norm": 0.6305546760559082, + "learning_rate": 4.650666605479561e-05, + "loss": 0.0903, + "num_input_tokens_seen": 9716256, + "step": 46045 + }, + { + "epoch": 5.066006600660066, + "grad_norm": 1.366120457649231, + "learning_rate": 4.6505442294616855e-05, + "loss": 0.087, + "num_input_tokens_seen": 9717344, + "step": 46050 + }, + { + "epoch": 5.066556655665567, + "grad_norm": 0.05319763347506523, + "learning_rate": 4.650421833623275e-05, + "loss": 0.0105, + "num_input_tokens_seen": 9718432, + "step": 46055 + }, + { + "epoch": 5.067106710671067, + "grad_norm": 0.39942827820777893, + "learning_rate": 4.650299417965458e-05, + "loss": 0.0279, + "num_input_tokens_seen": 9719488, + "step": 46060 + }, + { + "epoch": 5.067656765676568, + "grad_norm": 0.15409629046916962, + "learning_rate": 4.650176982489364e-05, + "loss": 0.0786, + "num_input_tokens_seen": 9720512, + "step": 46065 + }, + { + "epoch": 5.068206820682068, + "grad_norm": 2.1878929138183594, + "learning_rate": 4.65005452719612e-05, + "loss": 0.0617, + "num_input_tokens_seen": 9721632, + "step": 46070 + }, + { + "epoch": 5.068756875687569, + "grad_norm": 0.6149526238441467, + "learning_rate": 4.649932052086855e-05, + "loss": 0.054, + "num_input_tokens_seen": 9722656, + "step": 46075 + }, + { + "epoch": 5.069306930693069, + "grad_norm": 0.7699737548828125, + "learning_rate": 4.6498095571626975e-05, + "loss": 0.1035, + "num_input_tokens_seen": 9723744, + "step": 46080 + }, + { + "epoch": 5.06985698569857, + "grad_norm": 0.04854746162891388, + "learning_rate": 4.649687042424778e-05, + "loss": 0.0267, + "num_input_tokens_seen": 9724832, + "step": 46085 + }, + { + "epoch": 5.070407040704071, + "grad_norm": 0.06257878243923187, + "learning_rate": 4.6495645078742235e-05, + "loss": 0.0615, + "num_input_tokens_seen": 9725984, + "step": 46090 + }, + { + "epoch": 5.070957095709571, + "grad_norm": 0.05289723724126816, + "learning_rate": 4.6494419535121655e-05, + "loss": 0.0822, + "num_input_tokens_seen": 9727104, + "step": 46095 + }, + { + "epoch": 5.071507150715071, + "grad_norm": 0.08636415004730225, + "learning_rate": 4.6493193793397315e-05, + "loss": 0.0287, + "num_input_tokens_seen": 9728096, + "step": 46100 + }, + { + "epoch": 5.072057205720572, + "grad_norm": 0.7072445154190063, + "learning_rate": 4.649196785358052e-05, + "loss": 0.059, + "num_input_tokens_seen": 9729216, + "step": 46105 + }, + { + "epoch": 5.072607260726072, + "grad_norm": 1.0469107627868652, + "learning_rate": 4.6490741715682576e-05, + "loss": 0.0785, + "num_input_tokens_seen": 9730304, + "step": 46110 + }, + { + "epoch": 5.0731573157315735, + "grad_norm": 0.16219475865364075, + "learning_rate": 4.648951537971478e-05, + "loss": 0.0836, + "num_input_tokens_seen": 9731328, + "step": 46115 + }, + { + "epoch": 5.073707370737074, + "grad_norm": 0.6181681752204895, + "learning_rate": 4.648828884568843e-05, + "loss": 0.1005, + "num_input_tokens_seen": 9732320, + "step": 46120 + }, + { + "epoch": 5.074257425742574, + "grad_norm": 0.4080333113670349, + "learning_rate": 4.6487062113614827e-05, + "loss": 0.0636, + "num_input_tokens_seen": 9733312, + "step": 46125 + }, + { + "epoch": 5.074807480748075, + "grad_norm": 0.040069643408060074, + "learning_rate": 4.648583518350529e-05, + "loss": 0.0076, + "num_input_tokens_seen": 9734368, + "step": 46130 + }, + { + "epoch": 5.075357535753575, + "grad_norm": 0.17115890979766846, + "learning_rate": 4.6484608055371114e-05, + "loss": 0.0182, + "num_input_tokens_seen": 9735456, + "step": 46135 + }, + { + "epoch": 5.075907590759076, + "grad_norm": 0.0927344262599945, + "learning_rate": 4.648338072922362e-05, + "loss": 0.0508, + "num_input_tokens_seen": 9736576, + "step": 46140 + }, + { + "epoch": 5.0764576457645765, + "grad_norm": 0.012446293607354164, + "learning_rate": 4.6482153205074115e-05, + "loss": 0.0249, + "num_input_tokens_seen": 9737632, + "step": 46145 + }, + { + "epoch": 5.077007700770077, + "grad_norm": 0.348959743976593, + "learning_rate": 4.648092548293391e-05, + "loss": 0.1353, + "num_input_tokens_seen": 9738720, + "step": 46150 + }, + { + "epoch": 5.077557755775578, + "grad_norm": 0.061861056834459305, + "learning_rate": 4.6479697562814315e-05, + "loss": 0.0139, + "num_input_tokens_seen": 9739744, + "step": 46155 + }, + { + "epoch": 5.078107810781078, + "grad_norm": 0.7787953019142151, + "learning_rate": 4.647846944472667e-05, + "loss": 0.0235, + "num_input_tokens_seen": 9740832, + "step": 46160 + }, + { + "epoch": 5.078657865786579, + "grad_norm": 0.12012789398431778, + "learning_rate": 4.647724112868227e-05, + "loss": 0.0071, + "num_input_tokens_seen": 9741888, + "step": 46165 + }, + { + "epoch": 5.079207920792079, + "grad_norm": 0.17787492275238037, + "learning_rate": 4.6476012614692446e-05, + "loss": 0.0217, + "num_input_tokens_seen": 9742976, + "step": 46170 + }, + { + "epoch": 5.0797579757975795, + "grad_norm": 0.03673049807548523, + "learning_rate": 4.647478390276851e-05, + "loss": 0.0209, + "num_input_tokens_seen": 9744032, + "step": 46175 + }, + { + "epoch": 5.080308030803081, + "grad_norm": 0.13621212542057037, + "learning_rate": 4.6473554992921805e-05, + "loss": 0.0753, + "num_input_tokens_seen": 9745056, + "step": 46180 + }, + { + "epoch": 5.080858085808581, + "grad_norm": 1.1792949438095093, + "learning_rate": 4.647232588516365e-05, + "loss": 0.1226, + "num_input_tokens_seen": 9746208, + "step": 46185 + }, + { + "epoch": 5.081408140814081, + "grad_norm": 0.07322963327169418, + "learning_rate": 4.647109657950536e-05, + "loss": 0.074, + "num_input_tokens_seen": 9747200, + "step": 46190 + }, + { + "epoch": 5.081958195819582, + "grad_norm": 0.49352434277534485, + "learning_rate": 4.6469867075958286e-05, + "loss": 0.0238, + "num_input_tokens_seen": 9748192, + "step": 46195 + }, + { + "epoch": 5.082508250825082, + "grad_norm": 0.027032772079110146, + "learning_rate": 4.646863737453374e-05, + "loss": 0.036, + "num_input_tokens_seen": 9749248, + "step": 46200 + }, + { + "epoch": 5.083058305830583, + "grad_norm": 0.10569979250431061, + "learning_rate": 4.6467407475243076e-05, + "loss": 0.0139, + "num_input_tokens_seen": 9750304, + "step": 46205 + }, + { + "epoch": 5.083608360836084, + "grad_norm": 0.019672518596053123, + "learning_rate": 4.646617737809761e-05, + "loss": 0.0218, + "num_input_tokens_seen": 9751360, + "step": 46210 + }, + { + "epoch": 5.084158415841584, + "grad_norm": 0.4586735665798187, + "learning_rate": 4.646494708310869e-05, + "loss": 0.0469, + "num_input_tokens_seen": 9752384, + "step": 46215 + }, + { + "epoch": 5.084708470847085, + "grad_norm": 0.05641557276248932, + "learning_rate": 4.6463716590287656e-05, + "loss": 0.0922, + "num_input_tokens_seen": 9753472, + "step": 46220 + }, + { + "epoch": 5.085258525852585, + "grad_norm": 0.06299788504838943, + "learning_rate": 4.646248589964584e-05, + "loss": 0.0394, + "num_input_tokens_seen": 9754464, + "step": 46225 + }, + { + "epoch": 5.085808580858086, + "grad_norm": 0.23023298382759094, + "learning_rate": 4.6461255011194596e-05, + "loss": 0.0413, + "num_input_tokens_seen": 9755520, + "step": 46230 + }, + { + "epoch": 5.086358635863586, + "grad_norm": 0.0901569351553917, + "learning_rate": 4.6460023924945256e-05, + "loss": 0.0294, + "num_input_tokens_seen": 9756640, + "step": 46235 + }, + { + "epoch": 5.086908690869087, + "grad_norm": 0.15936031937599182, + "learning_rate": 4.6458792640909175e-05, + "loss": 0.0526, + "num_input_tokens_seen": 9757664, + "step": 46240 + }, + { + "epoch": 5.087458745874588, + "grad_norm": 0.09910618513822556, + "learning_rate": 4.6457561159097704e-05, + "loss": 0.1255, + "num_input_tokens_seen": 9758656, + "step": 46245 + }, + { + "epoch": 5.088008800880088, + "grad_norm": 0.5396925806999207, + "learning_rate": 4.645632947952219e-05, + "loss": 0.0876, + "num_input_tokens_seen": 9759680, + "step": 46250 + }, + { + "epoch": 5.088558855885589, + "grad_norm": 0.09781006723642349, + "learning_rate": 4.645509760219397e-05, + "loss": 0.0348, + "num_input_tokens_seen": 9760736, + "step": 46255 + }, + { + "epoch": 5.089108910891089, + "grad_norm": 0.8573319911956787, + "learning_rate": 4.6453865527124416e-05, + "loss": 0.0962, + "num_input_tokens_seen": 9761792, + "step": 46260 + }, + { + "epoch": 5.089658965896589, + "grad_norm": 0.5786278247833252, + "learning_rate": 4.645263325432488e-05, + "loss": 0.0471, + "num_input_tokens_seen": 9762848, + "step": 46265 + }, + { + "epoch": 5.0902090209020905, + "grad_norm": 0.35261115431785583, + "learning_rate": 4.645140078380672e-05, + "loss": 0.0286, + "num_input_tokens_seen": 9763904, + "step": 46270 + }, + { + "epoch": 5.090759075907591, + "grad_norm": 0.6267970204353333, + "learning_rate": 4.6450168115581295e-05, + "loss": 0.0191, + "num_input_tokens_seen": 9764896, + "step": 46275 + }, + { + "epoch": 5.091309130913091, + "grad_norm": 0.42956793308258057, + "learning_rate": 4.644893524965995e-05, + "loss": 0.0244, + "num_input_tokens_seen": 9765952, + "step": 46280 + }, + { + "epoch": 5.091859185918592, + "grad_norm": 0.20775192975997925, + "learning_rate": 4.644770218605408e-05, + "loss": 0.0115, + "num_input_tokens_seen": 9766976, + "step": 46285 + }, + { + "epoch": 5.092409240924092, + "grad_norm": 0.018214182928204536, + "learning_rate": 4.644646892477501e-05, + "loss": 0.0084, + "num_input_tokens_seen": 9768128, + "step": 46290 + }, + { + "epoch": 5.092959295929593, + "grad_norm": 0.48839133977890015, + "learning_rate": 4.6445235465834134e-05, + "loss": 0.0482, + "num_input_tokens_seen": 9769184, + "step": 46295 + }, + { + "epoch": 5.0935093509350935, + "grad_norm": 0.11249575018882751, + "learning_rate": 4.6444001809242824e-05, + "loss": 0.0467, + "num_input_tokens_seen": 9770240, + "step": 46300 + }, + { + "epoch": 5.094059405940594, + "grad_norm": 0.02211221307516098, + "learning_rate": 4.644276795501242e-05, + "loss": 0.0442, + "num_input_tokens_seen": 9771232, + "step": 46305 + }, + { + "epoch": 5.094609460946095, + "grad_norm": 0.037016112357378006, + "learning_rate": 4.644153390315433e-05, + "loss": 0.0059, + "num_input_tokens_seen": 9772288, + "step": 46310 + }, + { + "epoch": 5.095159515951595, + "grad_norm": 0.9309348464012146, + "learning_rate": 4.64402996536799e-05, + "loss": 0.1428, + "num_input_tokens_seen": 9773312, + "step": 46315 + }, + { + "epoch": 5.095709570957096, + "grad_norm": 1.418413519859314, + "learning_rate": 4.643906520660052e-05, + "loss": 0.1068, + "num_input_tokens_seen": 9774368, + "step": 46320 + }, + { + "epoch": 5.096259625962596, + "grad_norm": 0.0904431939125061, + "learning_rate": 4.643783056192757e-05, + "loss": 0.169, + "num_input_tokens_seen": 9775424, + "step": 46325 + }, + { + "epoch": 5.0968096809680965, + "grad_norm": 0.6335884928703308, + "learning_rate": 4.643659571967242e-05, + "loss": 0.0156, + "num_input_tokens_seen": 9776416, + "step": 46330 + }, + { + "epoch": 5.097359735973598, + "grad_norm": 0.030327238142490387, + "learning_rate": 4.643536067984645e-05, + "loss": 0.0366, + "num_input_tokens_seen": 9777440, + "step": 46335 + }, + { + "epoch": 5.097909790979098, + "grad_norm": 0.16025295853614807, + "learning_rate": 4.643412544246104e-05, + "loss": 0.0441, + "num_input_tokens_seen": 9778432, + "step": 46340 + }, + { + "epoch": 5.098459845984599, + "grad_norm": 0.20939481258392334, + "learning_rate": 4.643289000752759e-05, + "loss": 0.0105, + "num_input_tokens_seen": 9779488, + "step": 46345 + }, + { + "epoch": 5.099009900990099, + "grad_norm": 0.1526506394147873, + "learning_rate": 4.6431654375057474e-05, + "loss": 0.0454, + "num_input_tokens_seen": 9780512, + "step": 46350 + }, + { + "epoch": 5.099559955995599, + "grad_norm": 0.06365735828876495, + "learning_rate": 4.643041854506208e-05, + "loss": 0.0059, + "num_input_tokens_seen": 9781568, + "step": 46355 + }, + { + "epoch": 5.1001100110011, + "grad_norm": 1.3509374856948853, + "learning_rate": 4.642918251755281e-05, + "loss": 0.0964, + "num_input_tokens_seen": 9782592, + "step": 46360 + }, + { + "epoch": 5.100660066006601, + "grad_norm": 0.8039038777351379, + "learning_rate": 4.642794629254104e-05, + "loss": 0.092, + "num_input_tokens_seen": 9783680, + "step": 46365 + }, + { + "epoch": 5.101210121012101, + "grad_norm": 0.11754009872674942, + "learning_rate": 4.642670987003818e-05, + "loss": 0.0308, + "num_input_tokens_seen": 9784768, + "step": 46370 + }, + { + "epoch": 5.101760176017602, + "grad_norm": 0.040499404072761536, + "learning_rate": 4.642547325005561e-05, + "loss": 0.004, + "num_input_tokens_seen": 9785824, + "step": 46375 + }, + { + "epoch": 5.102310231023102, + "grad_norm": 0.07205001264810562, + "learning_rate": 4.642423643260473e-05, + "loss": 0.0165, + "num_input_tokens_seen": 9786880, + "step": 46380 + }, + { + "epoch": 5.102860286028603, + "grad_norm": 0.4970007538795471, + "learning_rate": 4.6422999417696954e-05, + "loss": 0.1676, + "num_input_tokens_seen": 9787968, + "step": 46385 + }, + { + "epoch": 5.103410341034103, + "grad_norm": 0.1924923062324524, + "learning_rate": 4.642176220534367e-05, + "loss": 0.0298, + "num_input_tokens_seen": 9789024, + "step": 46390 + }, + { + "epoch": 5.103960396039604, + "grad_norm": 0.10353701561689377, + "learning_rate": 4.6420524795556274e-05, + "loss": 0.0196, + "num_input_tokens_seen": 9790080, + "step": 46395 + }, + { + "epoch": 5.104510451045105, + "grad_norm": 0.19808614253997803, + "learning_rate": 4.641928718834618e-05, + "loss": 0.0274, + "num_input_tokens_seen": 9791168, + "step": 46400 + }, + { + "epoch": 5.105060506050605, + "grad_norm": 0.3114151656627655, + "learning_rate": 4.6418049383724806e-05, + "loss": 0.0191, + "num_input_tokens_seen": 9792192, + "step": 46405 + }, + { + "epoch": 5.105610561056106, + "grad_norm": 0.22960954904556274, + "learning_rate": 4.641681138170354e-05, + "loss": 0.0605, + "num_input_tokens_seen": 9793312, + "step": 46410 + }, + { + "epoch": 5.106160616061606, + "grad_norm": 1.2655854225158691, + "learning_rate": 4.64155731822938e-05, + "loss": 0.0509, + "num_input_tokens_seen": 9794368, + "step": 46415 + }, + { + "epoch": 5.106710671067106, + "grad_norm": 0.04534200206398964, + "learning_rate": 4.6414334785506995e-05, + "loss": 0.037, + "num_input_tokens_seen": 9795456, + "step": 46420 + }, + { + "epoch": 5.1072607260726075, + "grad_norm": 0.13173983991146088, + "learning_rate": 4.641309619135454e-05, + "loss": 0.0121, + "num_input_tokens_seen": 9796576, + "step": 46425 + }, + { + "epoch": 5.107810781078108, + "grad_norm": 0.034074462950229645, + "learning_rate": 4.641185739984787e-05, + "loss": 0.0745, + "num_input_tokens_seen": 9797632, + "step": 46430 + }, + { + "epoch": 5.108360836083609, + "grad_norm": 0.0625801756978035, + "learning_rate": 4.641061841099837e-05, + "loss": 0.0129, + "num_input_tokens_seen": 9798688, + "step": 46435 + }, + { + "epoch": 5.108910891089109, + "grad_norm": 0.04155087471008301, + "learning_rate": 4.640937922481748e-05, + "loss": 0.037, + "num_input_tokens_seen": 9799680, + "step": 46440 + }, + { + "epoch": 5.109460946094609, + "grad_norm": 0.22065240144729614, + "learning_rate": 4.6408139841316614e-05, + "loss": 0.0088, + "num_input_tokens_seen": 9800800, + "step": 46445 + }, + { + "epoch": 5.11001100110011, + "grad_norm": 0.07442204654216766, + "learning_rate": 4.640690026050719e-05, + "loss": 0.0431, + "num_input_tokens_seen": 9801856, + "step": 46450 + }, + { + "epoch": 5.1105610561056105, + "grad_norm": 0.06929991394281387, + "learning_rate": 4.6405660482400645e-05, + "loss": 0.0111, + "num_input_tokens_seen": 9802912, + "step": 46455 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 0.08003541082143784, + "learning_rate": 4.6404420507008397e-05, + "loss": 0.0083, + "num_input_tokens_seen": 9804000, + "step": 46460 + }, + { + "epoch": 5.111661166116612, + "grad_norm": 0.6744977235794067, + "learning_rate": 4.640318033434187e-05, + "loss": 0.0148, + "num_input_tokens_seen": 9805024, + "step": 46465 + }, + { + "epoch": 5.112211221122112, + "grad_norm": 0.07580479979515076, + "learning_rate": 4.6401939964412503e-05, + "loss": 0.0136, + "num_input_tokens_seen": 9806080, + "step": 46470 + }, + { + "epoch": 5.112761276127613, + "grad_norm": 0.6090216040611267, + "learning_rate": 4.640069939723173e-05, + "loss": 0.1335, + "num_input_tokens_seen": 9807136, + "step": 46475 + }, + { + "epoch": 5.113311331133113, + "grad_norm": 0.008768118917942047, + "learning_rate": 4.6399458632810977e-05, + "loss": 0.0282, + "num_input_tokens_seen": 9808160, + "step": 46480 + }, + { + "epoch": 5.1138613861386135, + "grad_norm": 0.00722155487164855, + "learning_rate": 4.639821767116168e-05, + "loss": 0.1289, + "num_input_tokens_seen": 9809248, + "step": 46485 + }, + { + "epoch": 5.114411441144115, + "grad_norm": 0.5274016261100769, + "learning_rate": 4.639697651229528e-05, + "loss": 0.0917, + "num_input_tokens_seen": 9810240, + "step": 46490 + }, + { + "epoch": 5.114961496149615, + "grad_norm": 0.41835546493530273, + "learning_rate": 4.639573515622322e-05, + "loss": 0.0246, + "num_input_tokens_seen": 9811296, + "step": 46495 + }, + { + "epoch": 5.115511551155116, + "grad_norm": 0.5780714154243469, + "learning_rate": 4.6394493602956926e-05, + "loss": 0.0202, + "num_input_tokens_seen": 9812320, + "step": 46500 + }, + { + "epoch": 5.116061606160616, + "grad_norm": 0.1196330115199089, + "learning_rate": 4.639325185250786e-05, + "loss": 0.0131, + "num_input_tokens_seen": 9813376, + "step": 46505 + }, + { + "epoch": 5.116611661166116, + "grad_norm": 0.03393341228365898, + "learning_rate": 4.639200990488745e-05, + "loss": 0.0292, + "num_input_tokens_seen": 9814400, + "step": 46510 + }, + { + "epoch": 5.117161716171617, + "grad_norm": 0.10386663675308228, + "learning_rate": 4.639076776010716e-05, + "loss": 0.0104, + "num_input_tokens_seen": 9815488, + "step": 46515 + }, + { + "epoch": 5.117711771177118, + "grad_norm": 0.8246883749961853, + "learning_rate": 4.638952541817841e-05, + "loss": 0.0683, + "num_input_tokens_seen": 9816480, + "step": 46520 + }, + { + "epoch": 5.118261826182618, + "grad_norm": 0.04065977409482002, + "learning_rate": 4.638828287911268e-05, + "loss": 0.0374, + "num_input_tokens_seen": 9817568, + "step": 46525 + }, + { + "epoch": 5.118811881188119, + "grad_norm": 0.48702794313430786, + "learning_rate": 4.638704014292141e-05, + "loss": 0.0402, + "num_input_tokens_seen": 9818656, + "step": 46530 + }, + { + "epoch": 5.119361936193619, + "grad_norm": 0.051597315818071365, + "learning_rate": 4.6385797209616054e-05, + "loss": 0.0483, + "num_input_tokens_seen": 9819744, + "step": 46535 + }, + { + "epoch": 5.11991199119912, + "grad_norm": 0.034815434366464615, + "learning_rate": 4.638455407920807e-05, + "loss": 0.0085, + "num_input_tokens_seen": 9820800, + "step": 46540 + }, + { + "epoch": 5.12046204620462, + "grad_norm": 0.04295659065246582, + "learning_rate": 4.63833107517089e-05, + "loss": 0.0036, + "num_input_tokens_seen": 9821856, + "step": 46545 + }, + { + "epoch": 5.121012101210121, + "grad_norm": 0.21947400271892548, + "learning_rate": 4.638206722713002e-05, + "loss": 0.0381, + "num_input_tokens_seen": 9822880, + "step": 46550 + }, + { + "epoch": 5.121562156215622, + "grad_norm": 0.23120518028736115, + "learning_rate": 4.638082350548289e-05, + "loss": 0.0636, + "num_input_tokens_seen": 9823968, + "step": 46555 + }, + { + "epoch": 5.122112211221122, + "grad_norm": 0.046467337757349014, + "learning_rate": 4.6379579586778965e-05, + "loss": 0.0607, + "num_input_tokens_seen": 9825056, + "step": 46560 + }, + { + "epoch": 5.122662266226623, + "grad_norm": 0.14898057281970978, + "learning_rate": 4.637833547102972e-05, + "loss": 0.0777, + "num_input_tokens_seen": 9826144, + "step": 46565 + }, + { + "epoch": 5.123212321232123, + "grad_norm": 0.11284336447715759, + "learning_rate": 4.637709115824661e-05, + "loss": 0.0065, + "num_input_tokens_seen": 9827200, + "step": 46570 + }, + { + "epoch": 5.123762376237623, + "grad_norm": 0.08906667679548264, + "learning_rate": 4.6375846648441116e-05, + "loss": 0.0102, + "num_input_tokens_seen": 9828224, + "step": 46575 + }, + { + "epoch": 5.1243124312431245, + "grad_norm": 1.4546178579330444, + "learning_rate": 4.63746019416247e-05, + "loss": 0.0761, + "num_input_tokens_seen": 9829248, + "step": 46580 + }, + { + "epoch": 5.124862486248625, + "grad_norm": 0.16397009789943695, + "learning_rate": 4.637335703780882e-05, + "loss": 0.0379, + "num_input_tokens_seen": 9830304, + "step": 46585 + }, + { + "epoch": 5.125412541254126, + "grad_norm": 1.5675569772720337, + "learning_rate": 4.6372111937004976e-05, + "loss": 0.0673, + "num_input_tokens_seen": 9831392, + "step": 46590 + }, + { + "epoch": 5.125962596259626, + "grad_norm": 0.03196364641189575, + "learning_rate": 4.637086663922463e-05, + "loss": 0.0701, + "num_input_tokens_seen": 9832384, + "step": 46595 + }, + { + "epoch": 5.126512651265126, + "grad_norm": 0.017118997871875763, + "learning_rate": 4.636962114447926e-05, + "loss": 0.0655, + "num_input_tokens_seen": 9833440, + "step": 46600 + }, + { + "epoch": 5.127062706270627, + "grad_norm": 0.08582670986652374, + "learning_rate": 4.6368375452780344e-05, + "loss": 0.0392, + "num_input_tokens_seen": 9834528, + "step": 46605 + }, + { + "epoch": 5.1276127612761275, + "grad_norm": 0.013219824060797691, + "learning_rate": 4.636712956413937e-05, + "loss": 0.0934, + "num_input_tokens_seen": 9835616, + "step": 46610 + }, + { + "epoch": 5.128162816281628, + "grad_norm": 0.12129922211170197, + "learning_rate": 4.636588347856781e-05, + "loss": 0.0803, + "num_input_tokens_seen": 9836672, + "step": 46615 + }, + { + "epoch": 5.128712871287129, + "grad_norm": 0.015912417322397232, + "learning_rate": 4.6364637196077164e-05, + "loss": 0.0231, + "num_input_tokens_seen": 9837728, + "step": 46620 + }, + { + "epoch": 5.129262926292629, + "grad_norm": 1.1612366437911987, + "learning_rate": 4.63633907166789e-05, + "loss": 0.0418, + "num_input_tokens_seen": 9838688, + "step": 46625 + }, + { + "epoch": 5.12981298129813, + "grad_norm": 0.08196961879730225, + "learning_rate": 4.636214404038451e-05, + "loss": 0.0133, + "num_input_tokens_seen": 9839712, + "step": 46630 + }, + { + "epoch": 5.13036303630363, + "grad_norm": 0.6522573828697205, + "learning_rate": 4.63608971672055e-05, + "loss": 0.0106, + "num_input_tokens_seen": 9840768, + "step": 46635 + }, + { + "epoch": 5.1309130913091305, + "grad_norm": 0.042608484625816345, + "learning_rate": 4.635965009715335e-05, + "loss": 0.054, + "num_input_tokens_seen": 9841824, + "step": 46640 + }, + { + "epoch": 5.131463146314632, + "grad_norm": 0.12765568494796753, + "learning_rate": 4.635840283023955e-05, + "loss": 0.0128, + "num_input_tokens_seen": 9842944, + "step": 46645 + }, + { + "epoch": 5.132013201320132, + "grad_norm": 0.2499467432498932, + "learning_rate": 4.63571553664756e-05, + "loss": 0.0376, + "num_input_tokens_seen": 9844000, + "step": 46650 + }, + { + "epoch": 5.132563256325633, + "grad_norm": 0.23125606775283813, + "learning_rate": 4.6355907705873e-05, + "loss": 0.1091, + "num_input_tokens_seen": 9845024, + "step": 46655 + }, + { + "epoch": 5.133113311331133, + "grad_norm": 0.009262853302061558, + "learning_rate": 4.635465984844325e-05, + "loss": 0.0072, + "num_input_tokens_seen": 9846080, + "step": 46660 + }, + { + "epoch": 5.133663366336633, + "grad_norm": 0.8594698905944824, + "learning_rate": 4.6353411794197837e-05, + "loss": 0.0384, + "num_input_tokens_seen": 9847136, + "step": 46665 + }, + { + "epoch": 5.134213421342134, + "grad_norm": 0.025241496041417122, + "learning_rate": 4.6352163543148275e-05, + "loss": 0.0301, + "num_input_tokens_seen": 9848256, + "step": 46670 + }, + { + "epoch": 5.134763476347635, + "grad_norm": 0.012580810114741325, + "learning_rate": 4.635091509530607e-05, + "loss": 0.0649, + "num_input_tokens_seen": 9849344, + "step": 46675 + }, + { + "epoch": 5.135313531353136, + "grad_norm": 0.8813607096672058, + "learning_rate": 4.6349666450682726e-05, + "loss": 0.0234, + "num_input_tokens_seen": 9850464, + "step": 46680 + }, + { + "epoch": 5.135863586358636, + "grad_norm": 0.09027811884880066, + "learning_rate": 4.634841760928976e-05, + "loss": 0.013, + "num_input_tokens_seen": 9851520, + "step": 46685 + }, + { + "epoch": 5.136413641364136, + "grad_norm": 0.016723720356822014, + "learning_rate": 4.634716857113866e-05, + "loss": 0.0622, + "num_input_tokens_seen": 9852512, + "step": 46690 + }, + { + "epoch": 5.136963696369637, + "grad_norm": 0.009344859980046749, + "learning_rate": 4.6345919336240955e-05, + "loss": 0.036, + "num_input_tokens_seen": 9853504, + "step": 46695 + }, + { + "epoch": 5.137513751375137, + "grad_norm": 0.007831422612071037, + "learning_rate": 4.6344669904608154e-05, + "loss": 0.1013, + "num_input_tokens_seen": 9854592, + "step": 46700 + }, + { + "epoch": 5.138063806380638, + "grad_norm": 0.03447999432682991, + "learning_rate": 4.634342027625177e-05, + "loss": 0.0037, + "num_input_tokens_seen": 9855648, + "step": 46705 + }, + { + "epoch": 5.138613861386139, + "grad_norm": 1.9396933317184448, + "learning_rate": 4.634217045118333e-05, + "loss": 0.2269, + "num_input_tokens_seen": 9856704, + "step": 46710 + }, + { + "epoch": 5.139163916391639, + "grad_norm": 1.0386980772018433, + "learning_rate": 4.634092042941434e-05, + "loss": 0.1386, + "num_input_tokens_seen": 9857728, + "step": 46715 + }, + { + "epoch": 5.13971397139714, + "grad_norm": 0.011871139518916607, + "learning_rate": 4.633967021095633e-05, + "loss": 0.1089, + "num_input_tokens_seen": 9858752, + "step": 46720 + }, + { + "epoch": 5.14026402640264, + "grad_norm": 0.12894830107688904, + "learning_rate": 4.633841979582082e-05, + "loss": 0.0344, + "num_input_tokens_seen": 9859776, + "step": 46725 + }, + { + "epoch": 5.1408140814081404, + "grad_norm": 0.19503457844257355, + "learning_rate": 4.633716918401933e-05, + "loss": 0.0481, + "num_input_tokens_seen": 9860768, + "step": 46730 + }, + { + "epoch": 5.1413641364136415, + "grad_norm": 0.036627594381570816, + "learning_rate": 4.633591837556339e-05, + "loss": 0.0105, + "num_input_tokens_seen": 9861824, + "step": 46735 + }, + { + "epoch": 5.141914191419142, + "grad_norm": 1.944858431816101, + "learning_rate": 4.6334667370464535e-05, + "loss": 0.0586, + "num_input_tokens_seen": 9862912, + "step": 46740 + }, + { + "epoch": 5.142464246424643, + "grad_norm": 1.3008784055709839, + "learning_rate": 4.633341616873428e-05, + "loss": 0.0782, + "num_input_tokens_seen": 9864000, + "step": 46745 + }, + { + "epoch": 5.143014301430143, + "grad_norm": 0.3062739074230194, + "learning_rate": 4.633216477038417e-05, + "loss": 0.1618, + "num_input_tokens_seen": 9864992, + "step": 46750 + }, + { + "epoch": 5.143564356435643, + "grad_norm": 0.7682717442512512, + "learning_rate": 4.633091317542573e-05, + "loss": 0.0792, + "num_input_tokens_seen": 9866016, + "step": 46755 + }, + { + "epoch": 5.144114411441144, + "grad_norm": 0.34907495975494385, + "learning_rate": 4.6329661383870505e-05, + "loss": 0.1814, + "num_input_tokens_seen": 9867104, + "step": 46760 + }, + { + "epoch": 5.1446644664466445, + "grad_norm": 1.268542766571045, + "learning_rate": 4.632840939573002e-05, + "loss": 0.0586, + "num_input_tokens_seen": 9868160, + "step": 46765 + }, + { + "epoch": 5.145214521452146, + "grad_norm": 0.3757374584674835, + "learning_rate": 4.6327157211015824e-05, + "loss": 0.0246, + "num_input_tokens_seen": 9869184, + "step": 46770 + }, + { + "epoch": 5.145764576457646, + "grad_norm": 0.5801289081573486, + "learning_rate": 4.6325904829739456e-05, + "loss": 0.1479, + "num_input_tokens_seen": 9870240, + "step": 46775 + }, + { + "epoch": 5.146314631463146, + "grad_norm": 0.27607595920562744, + "learning_rate": 4.6324652251912446e-05, + "loss": 0.0968, + "num_input_tokens_seen": 9871328, + "step": 46780 + }, + { + "epoch": 5.146864686468647, + "grad_norm": 0.012417049147188663, + "learning_rate": 4.6323399477546356e-05, + "loss": 0.0099, + "num_input_tokens_seen": 9872384, + "step": 46785 + }, + { + "epoch": 5.147414741474147, + "grad_norm": 0.8912261724472046, + "learning_rate": 4.632214650665272e-05, + "loss": 0.0785, + "num_input_tokens_seen": 9873376, + "step": 46790 + }, + { + "epoch": 5.1479647964796476, + "grad_norm": 0.9201136827468872, + "learning_rate": 4.63208933392431e-05, + "loss": 0.0655, + "num_input_tokens_seen": 9874400, + "step": 46795 + }, + { + "epoch": 5.148514851485149, + "grad_norm": 0.019307104870676994, + "learning_rate": 4.6319639975329026e-05, + "loss": 0.0388, + "num_input_tokens_seen": 9875360, + "step": 46800 + }, + { + "epoch": 5.149064906490649, + "grad_norm": 0.06651446223258972, + "learning_rate": 4.6318386414922066e-05, + "loss": 0.0839, + "num_input_tokens_seen": 9876480, + "step": 46805 + }, + { + "epoch": 5.14961496149615, + "grad_norm": 0.03834773972630501, + "learning_rate": 4.6317132658033766e-05, + "loss": 0.0994, + "num_input_tokens_seen": 9877504, + "step": 46810 + }, + { + "epoch": 5.15016501650165, + "grad_norm": 0.05926847830414772, + "learning_rate": 4.6315878704675685e-05, + "loss": 0.0328, + "num_input_tokens_seen": 9878528, + "step": 46815 + }, + { + "epoch": 5.15071507150715, + "grad_norm": 0.028449587523937225, + "learning_rate": 4.6314624554859386e-05, + "loss": 0.0035, + "num_input_tokens_seen": 9879552, + "step": 46820 + }, + { + "epoch": 5.1512651265126514, + "grad_norm": 0.07865133136510849, + "learning_rate": 4.6313370208596406e-05, + "loss": 0.0497, + "num_input_tokens_seen": 9880640, + "step": 46825 + }, + { + "epoch": 5.151815181518152, + "grad_norm": 0.08691538125276566, + "learning_rate": 4.6312115665898334e-05, + "loss": 0.0066, + "num_input_tokens_seen": 9881664, + "step": 46830 + }, + { + "epoch": 5.152365236523653, + "grad_norm": 1.3093723058700562, + "learning_rate": 4.631086092677671e-05, + "loss": 0.1123, + "num_input_tokens_seen": 9882688, + "step": 46835 + }, + { + "epoch": 5.152915291529153, + "grad_norm": 0.06727470457553864, + "learning_rate": 4.630960599124311e-05, + "loss": 0.0545, + "num_input_tokens_seen": 9883744, + "step": 46840 + }, + { + "epoch": 5.153465346534653, + "grad_norm": 0.127113938331604, + "learning_rate": 4.630835085930909e-05, + "loss": 0.0732, + "num_input_tokens_seen": 9884864, + "step": 46845 + }, + { + "epoch": 5.154015401540154, + "grad_norm": 0.20873798429965973, + "learning_rate": 4.6307095530986234e-05, + "loss": 0.0383, + "num_input_tokens_seen": 9885856, + "step": 46850 + }, + { + "epoch": 5.1545654565456545, + "grad_norm": 0.06597355753183365, + "learning_rate": 4.630584000628609e-05, + "loss": 0.058, + "num_input_tokens_seen": 9886944, + "step": 46855 + }, + { + "epoch": 5.1551155115511555, + "grad_norm": 2.4654924869537354, + "learning_rate": 4.630458428522025e-05, + "loss": 0.1093, + "num_input_tokens_seen": 9888000, + "step": 46860 + }, + { + "epoch": 5.155665566556656, + "grad_norm": 0.026629921048879623, + "learning_rate": 4.6303328367800284e-05, + "loss": 0.0094, + "num_input_tokens_seen": 9889120, + "step": 46865 + }, + { + "epoch": 5.156215621562156, + "grad_norm": 0.02287480980157852, + "learning_rate": 4.6302072254037766e-05, + "loss": 0.0376, + "num_input_tokens_seen": 9890208, + "step": 46870 + }, + { + "epoch": 5.156765676567657, + "grad_norm": 1.5909581184387207, + "learning_rate": 4.630081594394425e-05, + "loss": 0.143, + "num_input_tokens_seen": 9891264, + "step": 46875 + }, + { + "epoch": 5.157315731573157, + "grad_norm": 0.19018790125846863, + "learning_rate": 4.629955943753135e-05, + "loss": 0.1292, + "num_input_tokens_seen": 9892256, + "step": 46880 + }, + { + "epoch": 5.1578657865786575, + "grad_norm": 0.013911060057580471, + "learning_rate": 4.629830273481063e-05, + "loss": 0.018, + "num_input_tokens_seen": 9893312, + "step": 46885 + }, + { + "epoch": 5.158415841584159, + "grad_norm": 0.08207070082426071, + "learning_rate": 4.629704583579367e-05, + "loss": 0.1219, + "num_input_tokens_seen": 9894368, + "step": 46890 + }, + { + "epoch": 5.158965896589659, + "grad_norm": 0.28688931465148926, + "learning_rate": 4.6295788740492064e-05, + "loss": 0.0736, + "num_input_tokens_seen": 9895328, + "step": 46895 + }, + { + "epoch": 5.15951595159516, + "grad_norm": 0.03745083883404732, + "learning_rate": 4.6294531448917386e-05, + "loss": 0.0441, + "num_input_tokens_seen": 9896352, + "step": 46900 + }, + { + "epoch": 5.16006600660066, + "grad_norm": 0.13629071414470673, + "learning_rate": 4.6293273961081237e-05, + "loss": 0.0189, + "num_input_tokens_seen": 9897408, + "step": 46905 + }, + { + "epoch": 5.16061606160616, + "grad_norm": 0.05237875506281853, + "learning_rate": 4.629201627699519e-05, + "loss": 0.0422, + "num_input_tokens_seen": 9898464, + "step": 46910 + }, + { + "epoch": 5.161166116611661, + "grad_norm": 0.5741884708404541, + "learning_rate": 4.6290758396670854e-05, + "loss": 0.0863, + "num_input_tokens_seen": 9899488, + "step": 46915 + }, + { + "epoch": 5.161716171617162, + "grad_norm": 1.8386099338531494, + "learning_rate": 4.62895003201198e-05, + "loss": 0.122, + "num_input_tokens_seen": 9900576, + "step": 46920 + }, + { + "epoch": 5.162266226622663, + "grad_norm": 0.24754725396633148, + "learning_rate": 4.628824204735365e-05, + "loss": 0.0984, + "num_input_tokens_seen": 9901600, + "step": 46925 + }, + { + "epoch": 5.162816281628163, + "grad_norm": 0.16821356117725372, + "learning_rate": 4.628698357838399e-05, + "loss": 0.0497, + "num_input_tokens_seen": 9902688, + "step": 46930 + }, + { + "epoch": 5.163366336633663, + "grad_norm": 0.3955376148223877, + "learning_rate": 4.6285724913222415e-05, + "loss": 0.0805, + "num_input_tokens_seen": 9903808, + "step": 46935 + }, + { + "epoch": 5.163916391639164, + "grad_norm": 0.3481671214103699, + "learning_rate": 4.628446605188052e-05, + "loss": 0.055, + "num_input_tokens_seen": 9904864, + "step": 46940 + }, + { + "epoch": 5.164466446644664, + "grad_norm": 0.08991049975156784, + "learning_rate": 4.628320699436992e-05, + "loss": 0.0369, + "num_input_tokens_seen": 9905888, + "step": 46945 + }, + { + "epoch": 5.165016501650165, + "grad_norm": 1.4948585033416748, + "learning_rate": 4.6281947740702215e-05, + "loss": 0.083, + "num_input_tokens_seen": 9906912, + "step": 46950 + }, + { + "epoch": 5.165566556655666, + "grad_norm": 0.9409573674201965, + "learning_rate": 4.628068829088901e-05, + "loss": 0.0398, + "num_input_tokens_seen": 9908000, + "step": 46955 + }, + { + "epoch": 5.166116611661166, + "grad_norm": 0.13236188888549805, + "learning_rate": 4.6279428644941904e-05, + "loss": 0.0169, + "num_input_tokens_seen": 9909088, + "step": 46960 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.9288174510002136, + "learning_rate": 4.6278168802872525e-05, + "loss": 0.0327, + "num_input_tokens_seen": 9910144, + "step": 46965 + }, + { + "epoch": 5.167216721672167, + "grad_norm": 0.02193891629576683, + "learning_rate": 4.6276908764692474e-05, + "loss": 0.0785, + "num_input_tokens_seen": 9911264, + "step": 46970 + }, + { + "epoch": 5.167766776677667, + "grad_norm": 0.765790581703186, + "learning_rate": 4.627564853041336e-05, + "loss": 0.0217, + "num_input_tokens_seen": 9912416, + "step": 46975 + }, + { + "epoch": 5.1683168316831685, + "grad_norm": 1.4294673204421997, + "learning_rate": 4.62743881000468e-05, + "loss": 0.1461, + "num_input_tokens_seen": 9913472, + "step": 46980 + }, + { + "epoch": 5.168866886688669, + "grad_norm": 0.037232302129268646, + "learning_rate": 4.6273127473604415e-05, + "loss": 0.0272, + "num_input_tokens_seen": 9914496, + "step": 46985 + }, + { + "epoch": 5.16941694169417, + "grad_norm": 0.5067039728164673, + "learning_rate": 4.627186665109783e-05, + "loss": 0.0186, + "num_input_tokens_seen": 9915584, + "step": 46990 + }, + { + "epoch": 5.16996699669967, + "grad_norm": 0.4719642996788025, + "learning_rate": 4.627060563253865e-05, + "loss": 0.0854, + "num_input_tokens_seen": 9916608, + "step": 46995 + }, + { + "epoch": 5.17051705170517, + "grad_norm": 0.03167872875928879, + "learning_rate": 4.626934441793851e-05, + "loss": 0.0089, + "num_input_tokens_seen": 9917632, + "step": 47000 + }, + { + "epoch": 5.171067106710671, + "grad_norm": 0.12033659219741821, + "learning_rate": 4.626808300730902e-05, + "loss": 0.0671, + "num_input_tokens_seen": 9918688, + "step": 47005 + }, + { + "epoch": 5.1716171617161715, + "grad_norm": 0.0659608319401741, + "learning_rate": 4.626682140066182e-05, + "loss": 0.0609, + "num_input_tokens_seen": 9919776, + "step": 47010 + }, + { + "epoch": 5.172167216721673, + "grad_norm": 0.0466986708343029, + "learning_rate": 4.626555959800853e-05, + "loss": 0.1121, + "num_input_tokens_seen": 9920864, + "step": 47015 + }, + { + "epoch": 5.172717271727173, + "grad_norm": 0.15636247396469116, + "learning_rate": 4.6264297599360787e-05, + "loss": 0.0353, + "num_input_tokens_seen": 9921888, + "step": 47020 + }, + { + "epoch": 5.173267326732673, + "grad_norm": 0.6800618171691895, + "learning_rate": 4.62630354047302e-05, + "loss": 0.0132, + "num_input_tokens_seen": 9922944, + "step": 47025 + }, + { + "epoch": 5.173817381738174, + "grad_norm": 0.5469658970832825, + "learning_rate": 4.626177301412844e-05, + "loss": 0.0532, + "num_input_tokens_seen": 9924000, + "step": 47030 + }, + { + "epoch": 5.174367436743674, + "grad_norm": 1.1498056650161743, + "learning_rate": 4.62605104275671e-05, + "loss": 0.0513, + "num_input_tokens_seen": 9925120, + "step": 47035 + }, + { + "epoch": 5.174917491749175, + "grad_norm": 0.09264819324016571, + "learning_rate": 4.6259247645057854e-05, + "loss": 0.0966, + "num_input_tokens_seen": 9926176, + "step": 47040 + }, + { + "epoch": 5.175467546754676, + "grad_norm": 0.07268806546926498, + "learning_rate": 4.625798466661232e-05, + "loss": 0.0981, + "num_input_tokens_seen": 9927136, + "step": 47045 + }, + { + "epoch": 5.176017601760176, + "grad_norm": 0.6029097437858582, + "learning_rate": 4.6256721492242136e-05, + "loss": 0.1303, + "num_input_tokens_seen": 9928224, + "step": 47050 + }, + { + "epoch": 5.176567656765677, + "grad_norm": 0.3137069642543793, + "learning_rate": 4.625545812195896e-05, + "loss": 0.0102, + "num_input_tokens_seen": 9929312, + "step": 47055 + }, + { + "epoch": 5.177117711771177, + "grad_norm": 0.5661217570304871, + "learning_rate": 4.625419455577442e-05, + "loss": 0.0717, + "num_input_tokens_seen": 9930304, + "step": 47060 + }, + { + "epoch": 5.177667766776677, + "grad_norm": 0.34588736295700073, + "learning_rate": 4.625293079370017e-05, + "loss": 0.0357, + "num_input_tokens_seen": 9931392, + "step": 47065 + }, + { + "epoch": 5.178217821782178, + "grad_norm": 0.2855546474456787, + "learning_rate": 4.625166683574784e-05, + "loss": 0.1749, + "num_input_tokens_seen": 9932448, + "step": 47070 + }, + { + "epoch": 5.178767876787679, + "grad_norm": 1.0949715375900269, + "learning_rate": 4.6250402681929115e-05, + "loss": 0.0596, + "num_input_tokens_seen": 9933536, + "step": 47075 + }, + { + "epoch": 5.17931793179318, + "grad_norm": 1.1545157432556152, + "learning_rate": 4.6249138332255614e-05, + "loss": 0.0537, + "num_input_tokens_seen": 9934528, + "step": 47080 + }, + { + "epoch": 5.17986798679868, + "grad_norm": 0.037515509873628616, + "learning_rate": 4.6247873786739e-05, + "loss": 0.0112, + "num_input_tokens_seen": 9935616, + "step": 47085 + }, + { + "epoch": 5.18041804180418, + "grad_norm": 0.04162327200174332, + "learning_rate": 4.624660904539094e-05, + "loss": 0.0857, + "num_input_tokens_seen": 9936704, + "step": 47090 + }, + { + "epoch": 5.180968096809681, + "grad_norm": 0.017113113775849342, + "learning_rate": 4.624534410822307e-05, + "loss": 0.009, + "num_input_tokens_seen": 9937728, + "step": 47095 + }, + { + "epoch": 5.181518151815181, + "grad_norm": 0.029194418340921402, + "learning_rate": 4.6244078975247064e-05, + "loss": 0.0381, + "num_input_tokens_seen": 9938848, + "step": 47100 + }, + { + "epoch": 5.1820682068206825, + "grad_norm": 0.07117040455341339, + "learning_rate": 4.624281364647457e-05, + "loss": 0.0392, + "num_input_tokens_seen": 9939872, + "step": 47105 + }, + { + "epoch": 5.182618261826183, + "grad_norm": 0.21483787894248962, + "learning_rate": 4.624154812191725e-05, + "loss": 0.0303, + "num_input_tokens_seen": 9940960, + "step": 47110 + }, + { + "epoch": 5.183168316831683, + "grad_norm": 0.07183094322681427, + "learning_rate": 4.6240282401586786e-05, + "loss": 0.0094, + "num_input_tokens_seen": 9941984, + "step": 47115 + }, + { + "epoch": 5.183718371837184, + "grad_norm": 0.03011901117861271, + "learning_rate": 4.623901648549483e-05, + "loss": 0.0358, + "num_input_tokens_seen": 9943008, + "step": 47120 + }, + { + "epoch": 5.184268426842684, + "grad_norm": 0.07890857011079788, + "learning_rate": 4.623775037365305e-05, + "loss": 0.0062, + "num_input_tokens_seen": 9944064, + "step": 47125 + }, + { + "epoch": 5.184818481848184, + "grad_norm": 1.2150051593780518, + "learning_rate": 4.6236484066073117e-05, + "loss": 0.0809, + "num_input_tokens_seen": 9945120, + "step": 47130 + }, + { + "epoch": 5.1853685368536855, + "grad_norm": 0.09313753992319107, + "learning_rate": 4.623521756276669e-05, + "loss": 0.0075, + "num_input_tokens_seen": 9946176, + "step": 47135 + }, + { + "epoch": 5.185918591859186, + "grad_norm": 0.21805788576602936, + "learning_rate": 4.6233950863745464e-05, + "loss": 0.0402, + "num_input_tokens_seen": 9947168, + "step": 47140 + }, + { + "epoch": 5.186468646864687, + "grad_norm": 0.11649828404188156, + "learning_rate": 4.6232683969021096e-05, + "loss": 0.0391, + "num_input_tokens_seen": 9948192, + "step": 47145 + }, + { + "epoch": 5.187018701870187, + "grad_norm": 0.22159235179424286, + "learning_rate": 4.623141687860527e-05, + "loss": 0.0077, + "num_input_tokens_seen": 9949280, + "step": 47150 + }, + { + "epoch": 5.187568756875687, + "grad_norm": 0.11432897299528122, + "learning_rate": 4.623014959250966e-05, + "loss": 0.1433, + "num_input_tokens_seen": 9950336, + "step": 47155 + }, + { + "epoch": 5.188118811881188, + "grad_norm": 0.5463320016860962, + "learning_rate": 4.622888211074595e-05, + "loss": 0.0403, + "num_input_tokens_seen": 9951360, + "step": 47160 + }, + { + "epoch": 5.1886688668866885, + "grad_norm": 0.7282057404518127, + "learning_rate": 4.622761443332582e-05, + "loss": 0.1107, + "num_input_tokens_seen": 9952416, + "step": 47165 + }, + { + "epoch": 5.18921892189219, + "grad_norm": 0.48775675892829895, + "learning_rate": 4.622634656026096e-05, + "loss": 0.0481, + "num_input_tokens_seen": 9953472, + "step": 47170 + }, + { + "epoch": 5.18976897689769, + "grad_norm": 0.08587702363729477, + "learning_rate": 4.622507849156305e-05, + "loss": 0.0638, + "num_input_tokens_seen": 9954528, + "step": 47175 + }, + { + "epoch": 5.19031903190319, + "grad_norm": 0.11961369961500168, + "learning_rate": 4.622381022724376e-05, + "loss": 0.1676, + "num_input_tokens_seen": 9955584, + "step": 47180 + }, + { + "epoch": 5.190869086908691, + "grad_norm": 0.14928555488586426, + "learning_rate": 4.6222541767314806e-05, + "loss": 0.019, + "num_input_tokens_seen": 9956640, + "step": 47185 + }, + { + "epoch": 5.191419141914191, + "grad_norm": 0.01524713821709156, + "learning_rate": 4.622127311178787e-05, + "loss": 0.0211, + "num_input_tokens_seen": 9957664, + "step": 47190 + }, + { + "epoch": 5.191969196919692, + "grad_norm": 0.5246093273162842, + "learning_rate": 4.622000426067464e-05, + "loss": 0.1468, + "num_input_tokens_seen": 9958784, + "step": 47195 + }, + { + "epoch": 5.192519251925193, + "grad_norm": 0.07026371359825134, + "learning_rate": 4.621873521398681e-05, + "loss": 0.0434, + "num_input_tokens_seen": 9959840, + "step": 47200 + }, + { + "epoch": 5.193069306930693, + "grad_norm": 0.39659401774406433, + "learning_rate": 4.6217465971736086e-05, + "loss": 0.0407, + "num_input_tokens_seen": 9960800, + "step": 47205 + }, + { + "epoch": 5.193619361936194, + "grad_norm": 1.079107403755188, + "learning_rate": 4.621619653393415e-05, + "loss": 0.0398, + "num_input_tokens_seen": 9961888, + "step": 47210 + }, + { + "epoch": 5.194169416941694, + "grad_norm": 0.1851319670677185, + "learning_rate": 4.621492690059272e-05, + "loss": 0.0103, + "num_input_tokens_seen": 9962944, + "step": 47215 + }, + { + "epoch": 5.194719471947194, + "grad_norm": 0.026789182797074318, + "learning_rate": 4.621365707172348e-05, + "loss": 0.0049, + "num_input_tokens_seen": 9964000, + "step": 47220 + }, + { + "epoch": 5.195269526952695, + "grad_norm": 0.07527453452348709, + "learning_rate": 4.621238704733815e-05, + "loss": 0.1483, + "num_input_tokens_seen": 9965120, + "step": 47225 + }, + { + "epoch": 5.195819581958196, + "grad_norm": 0.17527717351913452, + "learning_rate": 4.621111682744843e-05, + "loss": 0.014, + "num_input_tokens_seen": 9966176, + "step": 47230 + }, + { + "epoch": 5.196369636963697, + "grad_norm": 1.1528078317642212, + "learning_rate": 4.6209846412066016e-05, + "loss": 0.0537, + "num_input_tokens_seen": 9967264, + "step": 47235 + }, + { + "epoch": 5.196919691969197, + "grad_norm": 0.2570931017398834, + "learning_rate": 4.6208575801202625e-05, + "loss": 0.0346, + "num_input_tokens_seen": 9968384, + "step": 47240 + }, + { + "epoch": 5.197469746974697, + "grad_norm": 0.06102295219898224, + "learning_rate": 4.620730499486997e-05, + "loss": 0.0419, + "num_input_tokens_seen": 9969408, + "step": 47245 + }, + { + "epoch": 5.198019801980198, + "grad_norm": 0.023461146280169487, + "learning_rate": 4.6206033993079765e-05, + "loss": 0.0203, + "num_input_tokens_seen": 9970432, + "step": 47250 + }, + { + "epoch": 5.198569856985698, + "grad_norm": 0.33201485872268677, + "learning_rate": 4.6204762795843715e-05, + "loss": 0.0258, + "num_input_tokens_seen": 9971488, + "step": 47255 + }, + { + "epoch": 5.1991199119911995, + "grad_norm": 0.15424883365631104, + "learning_rate": 4.620349140317355e-05, + "loss": 0.0399, + "num_input_tokens_seen": 9972576, + "step": 47260 + }, + { + "epoch": 5.1996699669967, + "grad_norm": 0.019770506769418716, + "learning_rate": 4.620221981508097e-05, + "loss": 0.011, + "num_input_tokens_seen": 9973632, + "step": 47265 + }, + { + "epoch": 5.2002200220022, + "grad_norm": 0.27084794640541077, + "learning_rate": 4.6200948031577705e-05, + "loss": 0.0734, + "num_input_tokens_seen": 9974752, + "step": 47270 + }, + { + "epoch": 5.200770077007701, + "grad_norm": 0.02067864127457142, + "learning_rate": 4.619967605267548e-05, + "loss": 0.0181, + "num_input_tokens_seen": 9975840, + "step": 47275 + }, + { + "epoch": 5.201320132013201, + "grad_norm": 0.0729534849524498, + "learning_rate": 4.619840387838601e-05, + "loss": 0.0512, + "num_input_tokens_seen": 9976896, + "step": 47280 + }, + { + "epoch": 5.201870187018702, + "grad_norm": 0.02598540112376213, + "learning_rate": 4.6197131508721026e-05, + "loss": 0.0577, + "num_input_tokens_seen": 9977952, + "step": 47285 + }, + { + "epoch": 5.2024202420242025, + "grad_norm": 0.1540450155735016, + "learning_rate": 4.619585894369225e-05, + "loss": 0.0148, + "num_input_tokens_seen": 9978944, + "step": 47290 + }, + { + "epoch": 5.202970297029703, + "grad_norm": 0.6701503396034241, + "learning_rate": 4.61945861833114e-05, + "loss": 0.0818, + "num_input_tokens_seen": 9980000, + "step": 47295 + }, + { + "epoch": 5.203520352035204, + "grad_norm": 0.09735456854104996, + "learning_rate": 4.619331322759024e-05, + "loss": 0.019, + "num_input_tokens_seen": 9981056, + "step": 47300 + }, + { + "epoch": 5.204070407040704, + "grad_norm": 0.0395762100815773, + "learning_rate": 4.6192040076540476e-05, + "loss": 0.0078, + "num_input_tokens_seen": 9982080, + "step": 47305 + }, + { + "epoch": 5.204620462046204, + "grad_norm": 0.023793019354343414, + "learning_rate": 4.619076673017384e-05, + "loss": 0.0233, + "num_input_tokens_seen": 9983104, + "step": 47310 + }, + { + "epoch": 5.205170517051705, + "grad_norm": 0.9866649508476257, + "learning_rate": 4.618949318850209e-05, + "loss": 0.0445, + "num_input_tokens_seen": 9984128, + "step": 47315 + }, + { + "epoch": 5.2057205720572055, + "grad_norm": 0.022018402814865112, + "learning_rate": 4.618821945153694e-05, + "loss": 0.0105, + "num_input_tokens_seen": 9985152, + "step": 47320 + }, + { + "epoch": 5.206270627062707, + "grad_norm": 0.010258468799293041, + "learning_rate": 4.618694551929013e-05, + "loss": 0.008, + "num_input_tokens_seen": 9986144, + "step": 47325 + }, + { + "epoch": 5.206820682068207, + "grad_norm": 0.9008150100708008, + "learning_rate": 4.618567139177342e-05, + "loss": 0.058, + "num_input_tokens_seen": 9987200, + "step": 47330 + }, + { + "epoch": 5.207370737073707, + "grad_norm": 0.8662551641464233, + "learning_rate": 4.618439706899854e-05, + "loss": 0.0801, + "num_input_tokens_seen": 9988224, + "step": 47335 + }, + { + "epoch": 5.207920792079208, + "grad_norm": 0.07377705723047256, + "learning_rate": 4.6183122550977234e-05, + "loss": 0.0093, + "num_input_tokens_seen": 9989344, + "step": 47340 + }, + { + "epoch": 5.208470847084708, + "grad_norm": 0.09251580387353897, + "learning_rate": 4.618184783772126e-05, + "loss": 0.0205, + "num_input_tokens_seen": 9990368, + "step": 47345 + }, + { + "epoch": 5.209020902090209, + "grad_norm": 0.014777673408389091, + "learning_rate": 4.618057292924235e-05, + "loss": 0.0335, + "num_input_tokens_seen": 9991392, + "step": 47350 + }, + { + "epoch": 5.20957095709571, + "grad_norm": 2.088487386703491, + "learning_rate": 4.617929782555227e-05, + "loss": 0.0903, + "num_input_tokens_seen": 9992416, + "step": 47355 + }, + { + "epoch": 5.21012101210121, + "grad_norm": 0.04161593317985535, + "learning_rate": 4.617802252666276e-05, + "loss": 0.0907, + "num_input_tokens_seen": 9993504, + "step": 47360 + }, + { + "epoch": 5.210671067106711, + "grad_norm": 0.006121935322880745, + "learning_rate": 4.617674703258559e-05, + "loss": 0.0613, + "num_input_tokens_seen": 9994624, + "step": 47365 + }, + { + "epoch": 5.211221122112211, + "grad_norm": 0.17634817957878113, + "learning_rate": 4.6175471343332485e-05, + "loss": 0.0969, + "num_input_tokens_seen": 9995648, + "step": 47370 + }, + { + "epoch": 5.211771177117711, + "grad_norm": 0.08921793848276138, + "learning_rate": 4.6174195458915236e-05, + "loss": 0.0719, + "num_input_tokens_seen": 9996736, + "step": 47375 + }, + { + "epoch": 5.212321232123212, + "grad_norm": 0.6231233477592468, + "learning_rate": 4.617291937934558e-05, + "loss": 0.0373, + "num_input_tokens_seen": 9997824, + "step": 47380 + }, + { + "epoch": 5.212871287128713, + "grad_norm": 2.018695116043091, + "learning_rate": 4.617164310463529e-05, + "loss": 0.0602, + "num_input_tokens_seen": 9998880, + "step": 47385 + }, + { + "epoch": 5.213421342134214, + "grad_norm": 0.06353329867124557, + "learning_rate": 4.6170366634796125e-05, + "loss": 0.021, + "num_input_tokens_seen": 9999936, + "step": 47390 + }, + { + "epoch": 5.213971397139714, + "grad_norm": 0.046879541128873825, + "learning_rate": 4.6169089969839844e-05, + "loss": 0.0204, + "num_input_tokens_seen": 10000960, + "step": 47395 + }, + { + "epoch": 5.214521452145214, + "grad_norm": 0.017238350585103035, + "learning_rate": 4.6167813109778225e-05, + "loss": 0.0049, + "num_input_tokens_seen": 10002016, + "step": 47400 + }, + { + "epoch": 5.215071507150715, + "grad_norm": 0.07479724287986755, + "learning_rate": 4.6166536054623025e-05, + "loss": 0.1109, + "num_input_tokens_seen": 10003040, + "step": 47405 + }, + { + "epoch": 5.215621562156215, + "grad_norm": 2.0833566188812256, + "learning_rate": 4.6165258804386026e-05, + "loss": 0.125, + "num_input_tokens_seen": 10004096, + "step": 47410 + }, + { + "epoch": 5.2161716171617165, + "grad_norm": 0.004698953591287136, + "learning_rate": 4.6163981359078986e-05, + "loss": 0.0099, + "num_input_tokens_seen": 10005216, + "step": 47415 + }, + { + "epoch": 5.216721672167217, + "grad_norm": 0.10371886938810349, + "learning_rate": 4.616270371871368e-05, + "loss": 0.0469, + "num_input_tokens_seen": 10006272, + "step": 47420 + }, + { + "epoch": 5.217271727172717, + "grad_norm": 1.2066903114318848, + "learning_rate": 4.61614258833019e-05, + "loss": 0.1479, + "num_input_tokens_seen": 10007328, + "step": 47425 + }, + { + "epoch": 5.217821782178218, + "grad_norm": 0.1430387645959854, + "learning_rate": 4.6160147852855405e-05, + "loss": 0.1485, + "num_input_tokens_seen": 10008384, + "step": 47430 + }, + { + "epoch": 5.218371837183718, + "grad_norm": 0.43213847279548645, + "learning_rate": 4.6158869627385986e-05, + "loss": 0.0192, + "num_input_tokens_seen": 10009472, + "step": 47435 + }, + { + "epoch": 5.218921892189219, + "grad_norm": 0.3482222259044647, + "learning_rate": 4.615759120690542e-05, + "loss": 0.072, + "num_input_tokens_seen": 10010528, + "step": 47440 + }, + { + "epoch": 5.2194719471947195, + "grad_norm": 0.4642751216888428, + "learning_rate": 4.6156312591425485e-05, + "loss": 0.0253, + "num_input_tokens_seen": 10011616, + "step": 47445 + }, + { + "epoch": 5.22002200220022, + "grad_norm": 0.20252428948879242, + "learning_rate": 4.6155033780957965e-05, + "loss": 0.0371, + "num_input_tokens_seen": 10012640, + "step": 47450 + }, + { + "epoch": 5.220572057205721, + "grad_norm": 0.0666762962937355, + "learning_rate": 4.6153754775514655e-05, + "loss": 0.011, + "num_input_tokens_seen": 10013728, + "step": 47455 + }, + { + "epoch": 5.221122112211221, + "grad_norm": 0.021369026973843575, + "learning_rate": 4.615247557510733e-05, + "loss": 0.0342, + "num_input_tokens_seen": 10014784, + "step": 47460 + }, + { + "epoch": 5.221672167216722, + "grad_norm": 0.03936014324426651, + "learning_rate": 4.61511961797478e-05, + "loss": 0.0135, + "num_input_tokens_seen": 10015904, + "step": 47465 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 2.231816053390503, + "learning_rate": 4.6149916589447836e-05, + "loss": 0.0276, + "num_input_tokens_seen": 10016992, + "step": 47470 + }, + { + "epoch": 5.2227722772277225, + "grad_norm": 0.21789522469043732, + "learning_rate": 4.614863680421924e-05, + "loss": 0.101, + "num_input_tokens_seen": 10018176, + "step": 47475 + }, + { + "epoch": 5.223322332233224, + "grad_norm": 0.05112810432910919, + "learning_rate": 4.6147356824073805e-05, + "loss": 0.0213, + "num_input_tokens_seen": 10019296, + "step": 47480 + }, + { + "epoch": 5.223872387238724, + "grad_norm": 0.21737942099571228, + "learning_rate": 4.614607664902333e-05, + "loss": 0.0154, + "num_input_tokens_seen": 10020352, + "step": 47485 + }, + { + "epoch": 5.224422442244224, + "grad_norm": 0.5194084048271179, + "learning_rate": 4.614479627907962e-05, + "loss": 0.0297, + "num_input_tokens_seen": 10021376, + "step": 47490 + }, + { + "epoch": 5.224972497249725, + "grad_norm": 1.6764602661132812, + "learning_rate": 4.6143515714254465e-05, + "loss": 0.1359, + "num_input_tokens_seen": 10022464, + "step": 47495 + }, + { + "epoch": 5.225522552255225, + "grad_norm": 0.06230902671813965, + "learning_rate": 4.614223495455967e-05, + "loss": 0.0328, + "num_input_tokens_seen": 10023488, + "step": 47500 + }, + { + "epoch": 5.226072607260726, + "grad_norm": 0.23112048208713531, + "learning_rate": 4.614095400000704e-05, + "loss": 0.0153, + "num_input_tokens_seen": 10024544, + "step": 47505 + }, + { + "epoch": 5.226622662266227, + "grad_norm": 0.08420468866825104, + "learning_rate": 4.613967285060839e-05, + "loss": 0.0873, + "num_input_tokens_seen": 10025568, + "step": 47510 + }, + { + "epoch": 5.227172717271727, + "grad_norm": 0.024675779044628143, + "learning_rate": 4.613839150637551e-05, + "loss": 0.0081, + "num_input_tokens_seen": 10026592, + "step": 47515 + }, + { + "epoch": 5.227722772277228, + "grad_norm": 0.16597189009189606, + "learning_rate": 4.613710996732021e-05, + "loss": 0.0102, + "num_input_tokens_seen": 10027648, + "step": 47520 + }, + { + "epoch": 5.228272827282728, + "grad_norm": 0.011551721021533012, + "learning_rate": 4.6135828233454325e-05, + "loss": 0.1098, + "num_input_tokens_seen": 10028704, + "step": 47525 + }, + { + "epoch": 5.228822882288229, + "grad_norm": 0.046683844178915024, + "learning_rate": 4.613454630478965e-05, + "loss": 0.0236, + "num_input_tokens_seen": 10029792, + "step": 47530 + }, + { + "epoch": 5.229372937293729, + "grad_norm": 0.15266665816307068, + "learning_rate": 4.6133264181338006e-05, + "loss": 0.015, + "num_input_tokens_seen": 10030880, + "step": 47535 + }, + { + "epoch": 5.22992299229923, + "grad_norm": 2.133563995361328, + "learning_rate": 4.6131981863111204e-05, + "loss": 0.0591, + "num_input_tokens_seen": 10031968, + "step": 47540 + }, + { + "epoch": 5.230473047304731, + "grad_norm": 1.0215425491333008, + "learning_rate": 4.613069935012106e-05, + "loss": 0.0683, + "num_input_tokens_seen": 10033024, + "step": 47545 + }, + { + "epoch": 5.231023102310231, + "grad_norm": 0.2879922688007355, + "learning_rate": 4.61294166423794e-05, + "loss": 0.0304, + "num_input_tokens_seen": 10034080, + "step": 47550 + }, + { + "epoch": 5.231573157315731, + "grad_norm": 0.46369603276252747, + "learning_rate": 4.612813373989805e-05, + "loss": 0.0182, + "num_input_tokens_seen": 10035168, + "step": 47555 + }, + { + "epoch": 5.232123212321232, + "grad_norm": 0.17182953655719757, + "learning_rate": 4.6126850642688834e-05, + "loss": 0.0059, + "num_input_tokens_seen": 10036288, + "step": 47560 + }, + { + "epoch": 5.232673267326732, + "grad_norm": 1.7780898809432983, + "learning_rate": 4.612556735076357e-05, + "loss": 0.1173, + "num_input_tokens_seen": 10037408, + "step": 47565 + }, + { + "epoch": 5.2332233223322335, + "grad_norm": 0.3336867094039917, + "learning_rate": 4.6124283864134077e-05, + "loss": 0.0426, + "num_input_tokens_seen": 10038528, + "step": 47570 + }, + { + "epoch": 5.233773377337734, + "grad_norm": 0.03373142331838608, + "learning_rate": 4.612300018281221e-05, + "loss": 0.0406, + "num_input_tokens_seen": 10039584, + "step": 47575 + }, + { + "epoch": 5.234323432343234, + "grad_norm": 0.1527780145406723, + "learning_rate": 4.612171630680978e-05, + "loss": 0.0114, + "num_input_tokens_seen": 10040640, + "step": 47580 + }, + { + "epoch": 5.234873487348735, + "grad_norm": 0.020071491599082947, + "learning_rate": 4.612043223613863e-05, + "loss": 0.1001, + "num_input_tokens_seen": 10041696, + "step": 47585 + }, + { + "epoch": 5.235423542354235, + "grad_norm": 0.3593951463699341, + "learning_rate": 4.611914797081059e-05, + "loss": 0.0157, + "num_input_tokens_seen": 10042720, + "step": 47590 + }, + { + "epoch": 5.235973597359736, + "grad_norm": 0.03365075960755348, + "learning_rate": 4.611786351083749e-05, + "loss": 0.0131, + "num_input_tokens_seen": 10043808, + "step": 47595 + }, + { + "epoch": 5.2365236523652365, + "grad_norm": 0.8063982725143433, + "learning_rate": 4.611657885623119e-05, + "loss": 0.0451, + "num_input_tokens_seen": 10044896, + "step": 47600 + }, + { + "epoch": 5.237073707370737, + "grad_norm": 0.6462043523788452, + "learning_rate": 4.61152940070035e-05, + "loss": 0.0684, + "num_input_tokens_seen": 10045920, + "step": 47605 + }, + { + "epoch": 5.237623762376238, + "grad_norm": 0.03388172760605812, + "learning_rate": 4.611400896316628e-05, + "loss": 0.0485, + "num_input_tokens_seen": 10046880, + "step": 47610 + }, + { + "epoch": 5.238173817381738, + "grad_norm": 0.04658487066626549, + "learning_rate": 4.611272372473138e-05, + "loss": 0.0768, + "num_input_tokens_seen": 10048000, + "step": 47615 + }, + { + "epoch": 5.238723872387239, + "grad_norm": 0.5559689998626709, + "learning_rate": 4.611143829171063e-05, + "loss": 0.0351, + "num_input_tokens_seen": 10049056, + "step": 47620 + }, + { + "epoch": 5.239273927392739, + "grad_norm": 0.15447326004505157, + "learning_rate": 4.611015266411588e-05, + "loss": 0.0794, + "num_input_tokens_seen": 10050176, + "step": 47625 + }, + { + "epoch": 5.2398239823982395, + "grad_norm": 0.13474099338054657, + "learning_rate": 4.6108866841958985e-05, + "loss": 0.0165, + "num_input_tokens_seen": 10051232, + "step": 47630 + }, + { + "epoch": 5.240374037403741, + "grad_norm": 0.045989930629730225, + "learning_rate": 4.61075808252518e-05, + "loss": 0.06, + "num_input_tokens_seen": 10052288, + "step": 47635 + }, + { + "epoch": 5.240924092409241, + "grad_norm": 0.010114342905580997, + "learning_rate": 4.610629461400617e-05, + "loss": 0.0059, + "num_input_tokens_seen": 10053312, + "step": 47640 + }, + { + "epoch": 5.241474147414741, + "grad_norm": 0.14928483963012695, + "learning_rate": 4.6105008208233945e-05, + "loss": 0.0353, + "num_input_tokens_seen": 10054368, + "step": 47645 + }, + { + "epoch": 5.242024202420242, + "grad_norm": 0.3660421669483185, + "learning_rate": 4.610372160794699e-05, + "loss": 0.0316, + "num_input_tokens_seen": 10055424, + "step": 47650 + }, + { + "epoch": 5.242574257425742, + "grad_norm": 0.528454601764679, + "learning_rate": 4.610243481315715e-05, + "loss": 0.0186, + "num_input_tokens_seen": 10056512, + "step": 47655 + }, + { + "epoch": 5.243124312431243, + "grad_norm": 0.016272373497486115, + "learning_rate": 4.61011478238763e-05, + "loss": 0.2065, + "num_input_tokens_seen": 10057568, + "step": 47660 + }, + { + "epoch": 5.243674367436744, + "grad_norm": 0.22401954233646393, + "learning_rate": 4.6099860640116296e-05, + "loss": 0.0643, + "num_input_tokens_seen": 10058592, + "step": 47665 + }, + { + "epoch": 5.244224422442244, + "grad_norm": 0.4866335093975067, + "learning_rate": 4.609857326188901e-05, + "loss": 0.0497, + "num_input_tokens_seen": 10059648, + "step": 47670 + }, + { + "epoch": 5.244774477447745, + "grad_norm": 0.3425121605396271, + "learning_rate": 4.6097285689206297e-05, + "loss": 0.0382, + "num_input_tokens_seen": 10060736, + "step": 47675 + }, + { + "epoch": 5.245324532453245, + "grad_norm": 0.8248506188392639, + "learning_rate": 4.609599792208001e-05, + "loss": 0.0387, + "num_input_tokens_seen": 10061728, + "step": 47680 + }, + { + "epoch": 5.245874587458746, + "grad_norm": 0.10343094170093536, + "learning_rate": 4.609470996052204e-05, + "loss": 0.0608, + "num_input_tokens_seen": 10062816, + "step": 47685 + }, + { + "epoch": 5.2464246424642464, + "grad_norm": 0.6231657862663269, + "learning_rate": 4.6093421804544256e-05, + "loss": 0.1528, + "num_input_tokens_seen": 10063840, + "step": 47690 + }, + { + "epoch": 5.246974697469747, + "grad_norm": 0.08244755119085312, + "learning_rate": 4.609213345415852e-05, + "loss": 0.0421, + "num_input_tokens_seen": 10064864, + "step": 47695 + }, + { + "epoch": 5.247524752475248, + "grad_norm": 0.5817788243293762, + "learning_rate": 4.6090844909376716e-05, + "loss": 0.1518, + "num_input_tokens_seen": 10065952, + "step": 47700 + }, + { + "epoch": 5.248074807480748, + "grad_norm": 0.016236918047070503, + "learning_rate": 4.6089556170210715e-05, + "loss": 0.0183, + "num_input_tokens_seen": 10067072, + "step": 47705 + }, + { + "epoch": 5.248624862486249, + "grad_norm": 0.4443546533584595, + "learning_rate": 4.608826723667239e-05, + "loss": 0.0407, + "num_input_tokens_seen": 10068128, + "step": 47710 + }, + { + "epoch": 5.249174917491749, + "grad_norm": 0.6711934208869934, + "learning_rate": 4.608697810877363e-05, + "loss": 0.0903, + "num_input_tokens_seen": 10069184, + "step": 47715 + }, + { + "epoch": 5.2497249724972495, + "grad_norm": 0.5603197813034058, + "learning_rate": 4.60856887865263e-05, + "loss": 0.0735, + "num_input_tokens_seen": 10070240, + "step": 47720 + }, + { + "epoch": 5.2502750275027505, + "grad_norm": 0.1723940223455429, + "learning_rate": 4.6084399269942305e-05, + "loss": 0.0617, + "num_input_tokens_seen": 10071264, + "step": 47725 + }, + { + "epoch": 5.250825082508251, + "grad_norm": 0.007152792066335678, + "learning_rate": 4.608310955903351e-05, + "loss": 0.0429, + "num_input_tokens_seen": 10072288, + "step": 47730 + }, + { + "epoch": 5.251375137513751, + "grad_norm": 0.15889926254749298, + "learning_rate": 4.608181965381182e-05, + "loss": 0.033, + "num_input_tokens_seen": 10073312, + "step": 47735 + }, + { + "epoch": 5.251925192519252, + "grad_norm": 0.8795632719993591, + "learning_rate": 4.608052955428911e-05, + "loss": 0.0434, + "num_input_tokens_seen": 10074432, + "step": 47740 + }, + { + "epoch": 5.252475247524752, + "grad_norm": 0.007885163649916649, + "learning_rate": 4.607923926047727e-05, + "loss": 0.0454, + "num_input_tokens_seen": 10075488, + "step": 47745 + }, + { + "epoch": 5.253025302530253, + "grad_norm": 0.029360780492424965, + "learning_rate": 4.607794877238821e-05, + "loss": 0.0112, + "num_input_tokens_seen": 10076608, + "step": 47750 + }, + { + "epoch": 5.2535753575357536, + "grad_norm": 0.10081798583269119, + "learning_rate": 4.60766580900338e-05, + "loss": 0.0497, + "num_input_tokens_seen": 10077696, + "step": 47755 + }, + { + "epoch": 5.254125412541254, + "grad_norm": 0.0369684100151062, + "learning_rate": 4.607536721342595e-05, + "loss": 0.0054, + "num_input_tokens_seen": 10078720, + "step": 47760 + }, + { + "epoch": 5.254675467546755, + "grad_norm": 1.0331107378005981, + "learning_rate": 4.607407614257655e-05, + "loss": 0.0867, + "num_input_tokens_seen": 10079744, + "step": 47765 + }, + { + "epoch": 5.255225522552255, + "grad_norm": 0.018394455313682556, + "learning_rate": 4.6072784877497504e-05, + "loss": 0.1097, + "num_input_tokens_seen": 10080768, + "step": 47770 + }, + { + "epoch": 5.255775577557756, + "grad_norm": 0.5573519468307495, + "learning_rate": 4.6071493418200714e-05, + "loss": 0.0571, + "num_input_tokens_seen": 10081856, + "step": 47775 + }, + { + "epoch": 5.256325632563256, + "grad_norm": 0.21629558503627777, + "learning_rate": 4.607020176469807e-05, + "loss": 0.0375, + "num_input_tokens_seen": 10082912, + "step": 47780 + }, + { + "epoch": 5.256875687568757, + "grad_norm": 0.1146056056022644, + "learning_rate": 4.60689099170015e-05, + "loss": 0.047, + "num_input_tokens_seen": 10084000, + "step": 47785 + }, + { + "epoch": 5.257425742574258, + "grad_norm": 0.10003804415464401, + "learning_rate": 4.606761787512289e-05, + "loss": 0.0125, + "num_input_tokens_seen": 10085056, + "step": 47790 + }, + { + "epoch": 5.257975797579758, + "grad_norm": 0.44987019896507263, + "learning_rate": 4.606632563907416e-05, + "loss": 0.1356, + "num_input_tokens_seen": 10086080, + "step": 47795 + }, + { + "epoch": 5.258525852585258, + "grad_norm": 0.12075795978307724, + "learning_rate": 4.6065033208867216e-05, + "loss": 0.0348, + "num_input_tokens_seen": 10087136, + "step": 47800 + }, + { + "epoch": 5.259075907590759, + "grad_norm": 0.6837313175201416, + "learning_rate": 4.606374058451396e-05, + "loss": 0.0245, + "num_input_tokens_seen": 10088128, + "step": 47805 + }, + { + "epoch": 5.259625962596259, + "grad_norm": 0.06368991732597351, + "learning_rate": 4.606244776602632e-05, + "loss": 0.0535, + "num_input_tokens_seen": 10089184, + "step": 47810 + }, + { + "epoch": 5.2601760176017605, + "grad_norm": 0.022088875994086266, + "learning_rate": 4.60611547534162e-05, + "loss": 0.196, + "num_input_tokens_seen": 10090272, + "step": 47815 + }, + { + "epoch": 5.260726072607261, + "grad_norm": 0.05782891437411308, + "learning_rate": 4.605986154669554e-05, + "loss": 0.0756, + "num_input_tokens_seen": 10091264, + "step": 47820 + }, + { + "epoch": 5.261276127612761, + "grad_norm": 0.34778663516044617, + "learning_rate": 4.605856814587622e-05, + "loss": 0.139, + "num_input_tokens_seen": 10092288, + "step": 47825 + }, + { + "epoch": 5.261826182618262, + "grad_norm": 0.15489405393600464, + "learning_rate": 4.60572745509702e-05, + "loss": 0.017, + "num_input_tokens_seen": 10093312, + "step": 47830 + }, + { + "epoch": 5.262376237623762, + "grad_norm": 0.6355433464050293, + "learning_rate": 4.605598076198937e-05, + "loss": 0.0402, + "num_input_tokens_seen": 10094336, + "step": 47835 + }, + { + "epoch": 5.262926292629263, + "grad_norm": 0.1014786958694458, + "learning_rate": 4.6054686778945674e-05, + "loss": 0.0342, + "num_input_tokens_seen": 10095392, + "step": 47840 + }, + { + "epoch": 5.2634763476347635, + "grad_norm": 0.023743391036987305, + "learning_rate": 4.605339260185104e-05, + "loss": 0.0523, + "num_input_tokens_seen": 10096416, + "step": 47845 + }, + { + "epoch": 5.264026402640264, + "grad_norm": 0.7258632183074951, + "learning_rate": 4.6052098230717376e-05, + "loss": 0.0587, + "num_input_tokens_seen": 10097440, + "step": 47850 + }, + { + "epoch": 5.264576457645765, + "grad_norm": 0.10411582142114639, + "learning_rate": 4.605080366555664e-05, + "loss": 0.009, + "num_input_tokens_seen": 10098560, + "step": 47855 + }, + { + "epoch": 5.265126512651265, + "grad_norm": 0.9941041469573975, + "learning_rate": 4.604950890638073e-05, + "loss": 0.0737, + "num_input_tokens_seen": 10099584, + "step": 47860 + }, + { + "epoch": 5.265676567656766, + "grad_norm": 0.04510282725095749, + "learning_rate": 4.6048213953201615e-05, + "loss": 0.0038, + "num_input_tokens_seen": 10100640, + "step": 47865 + }, + { + "epoch": 5.266226622662266, + "grad_norm": 0.8447640538215637, + "learning_rate": 4.60469188060312e-05, + "loss": 0.0239, + "num_input_tokens_seen": 10101664, + "step": 47870 + }, + { + "epoch": 5.2667766776677665, + "grad_norm": 0.011529812589287758, + "learning_rate": 4.604562346488144e-05, + "loss": 0.022, + "num_input_tokens_seen": 10102688, + "step": 47875 + }, + { + "epoch": 5.267326732673268, + "grad_norm": 0.011308512650430202, + "learning_rate": 4.6044327929764264e-05, + "loss": 0.0277, + "num_input_tokens_seen": 10103680, + "step": 47880 + }, + { + "epoch": 5.267876787678768, + "grad_norm": 0.5503657460212708, + "learning_rate": 4.6043032200691616e-05, + "loss": 0.0239, + "num_input_tokens_seen": 10104768, + "step": 47885 + }, + { + "epoch": 5.268426842684269, + "grad_norm": 0.1404850035905838, + "learning_rate": 4.6041736277675434e-05, + "loss": 0.0237, + "num_input_tokens_seen": 10105760, + "step": 47890 + }, + { + "epoch": 5.268976897689769, + "grad_norm": 0.3238740861415863, + "learning_rate": 4.604044016072767e-05, + "loss": 0.0246, + "num_input_tokens_seen": 10106816, + "step": 47895 + }, + { + "epoch": 5.269526952695269, + "grad_norm": 0.022227946668863297, + "learning_rate": 4.6039143849860264e-05, + "loss": 0.0364, + "num_input_tokens_seen": 10107808, + "step": 47900 + }, + { + "epoch": 5.27007700770077, + "grad_norm": 1.349094271659851, + "learning_rate": 4.603784734508516e-05, + "loss": 0.1539, + "num_input_tokens_seen": 10108832, + "step": 47905 + }, + { + "epoch": 5.270627062706271, + "grad_norm": 2.9669132232666016, + "learning_rate": 4.603655064641432e-05, + "loss": 0.0658, + "num_input_tokens_seen": 10109920, + "step": 47910 + }, + { + "epoch": 5.271177117711771, + "grad_norm": 0.5797343254089355, + "learning_rate": 4.603525375385968e-05, + "loss": 0.0477, + "num_input_tokens_seen": 10110976, + "step": 47915 + }, + { + "epoch": 5.271727172717272, + "grad_norm": 0.30254605412483215, + "learning_rate": 4.603395666743321e-05, + "loss": 0.0251, + "num_input_tokens_seen": 10112032, + "step": 47920 + }, + { + "epoch": 5.272277227722772, + "grad_norm": 0.08711094409227371, + "learning_rate": 4.603265938714685e-05, + "loss": 0.0036, + "num_input_tokens_seen": 10113024, + "step": 47925 + }, + { + "epoch": 5.272827282728273, + "grad_norm": 0.4362258315086365, + "learning_rate": 4.603136191301256e-05, + "loss": 0.0568, + "num_input_tokens_seen": 10114080, + "step": 47930 + }, + { + "epoch": 5.273377337733773, + "grad_norm": 0.00806147325783968, + "learning_rate": 4.6030064245042295e-05, + "loss": 0.0645, + "num_input_tokens_seen": 10115136, + "step": 47935 + }, + { + "epoch": 5.273927392739274, + "grad_norm": 0.5007501840591431, + "learning_rate": 4.602876638324802e-05, + "loss": 0.0232, + "num_input_tokens_seen": 10116160, + "step": 47940 + }, + { + "epoch": 5.274477447744775, + "grad_norm": 0.06544794887304306, + "learning_rate": 4.60274683276417e-05, + "loss": 0.0125, + "num_input_tokens_seen": 10117248, + "step": 47945 + }, + { + "epoch": 5.275027502750275, + "grad_norm": 1.1931895017623901, + "learning_rate": 4.602617007823529e-05, + "loss": 0.0637, + "num_input_tokens_seen": 10118304, + "step": 47950 + }, + { + "epoch": 5.275577557755776, + "grad_norm": 0.020832715556025505, + "learning_rate": 4.602487163504077e-05, + "loss": 0.0713, + "num_input_tokens_seen": 10119424, + "step": 47955 + }, + { + "epoch": 5.276127612761276, + "grad_norm": 0.12060323357582092, + "learning_rate": 4.602357299807008e-05, + "loss": 0.0431, + "num_input_tokens_seen": 10120512, + "step": 47960 + }, + { + "epoch": 5.276677667766776, + "grad_norm": 0.10136005282402039, + "learning_rate": 4.6022274167335225e-05, + "loss": 0.0892, + "num_input_tokens_seen": 10121536, + "step": 47965 + }, + { + "epoch": 5.2772277227722775, + "grad_norm": 0.06290246546268463, + "learning_rate": 4.6020975142848146e-05, + "loss": 0.034, + "num_input_tokens_seen": 10122656, + "step": 47970 + }, + { + "epoch": 5.277777777777778, + "grad_norm": 0.043132904917001724, + "learning_rate": 4.601967592462082e-05, + "loss": 0.0127, + "num_input_tokens_seen": 10123680, + "step": 47975 + }, + { + "epoch": 5.278327832783278, + "grad_norm": 0.008062445558607578, + "learning_rate": 4.6018376512665235e-05, + "loss": 0.0037, + "num_input_tokens_seen": 10124736, + "step": 47980 + }, + { + "epoch": 5.278877887788779, + "grad_norm": 0.2631624937057495, + "learning_rate": 4.601707690699336e-05, + "loss": 0.0495, + "num_input_tokens_seen": 10125824, + "step": 47985 + }, + { + "epoch": 5.279427942794279, + "grad_norm": 0.12117428332567215, + "learning_rate": 4.601577710761717e-05, + "loss": 0.0321, + "num_input_tokens_seen": 10126880, + "step": 47990 + }, + { + "epoch": 5.27997799779978, + "grad_norm": 0.029871182516217232, + "learning_rate": 4.6014477114548645e-05, + "loss": 0.0406, + "num_input_tokens_seen": 10127904, + "step": 47995 + }, + { + "epoch": 5.2805280528052805, + "grad_norm": 0.04159798100590706, + "learning_rate": 4.6013176927799764e-05, + "loss": 0.0064, + "num_input_tokens_seen": 10129024, + "step": 48000 + }, + { + "epoch": 5.281078107810781, + "grad_norm": 0.03373970463871956, + "learning_rate": 4.6011876547382526e-05, + "loss": 0.0549, + "num_input_tokens_seen": 10130112, + "step": 48005 + }, + { + "epoch": 5.281628162816282, + "grad_norm": 0.9160332083702087, + "learning_rate": 4.60105759733089e-05, + "loss": 0.0116, + "num_input_tokens_seen": 10131200, + "step": 48010 + }, + { + "epoch": 5.282178217821782, + "grad_norm": 0.12189822643995285, + "learning_rate": 4.600927520559087e-05, + "loss": 0.0833, + "num_input_tokens_seen": 10132288, + "step": 48015 + }, + { + "epoch": 5.282728272827283, + "grad_norm": 0.03981584310531616, + "learning_rate": 4.600797424424044e-05, + "loss": 0.0062, + "num_input_tokens_seen": 10133280, + "step": 48020 + }, + { + "epoch": 5.283278327832783, + "grad_norm": 0.013537397608160973, + "learning_rate": 4.600667308926959e-05, + "loss": 0.1359, + "num_input_tokens_seen": 10134336, + "step": 48025 + }, + { + "epoch": 5.2838283828382835, + "grad_norm": 0.0512339249253273, + "learning_rate": 4.6005371740690304e-05, + "loss": 0.005, + "num_input_tokens_seen": 10135360, + "step": 48030 + }, + { + "epoch": 5.284378437843785, + "grad_norm": 0.9906959533691406, + "learning_rate": 4.6004070198514606e-05, + "loss": 0.0748, + "num_input_tokens_seen": 10136480, + "step": 48035 + }, + { + "epoch": 5.284928492849285, + "grad_norm": 0.9620081782341003, + "learning_rate": 4.600276846275446e-05, + "loss": 0.0143, + "num_input_tokens_seen": 10137568, + "step": 48040 + }, + { + "epoch": 5.285478547854786, + "grad_norm": 0.3285142481327057, + "learning_rate": 4.6001466533421875e-05, + "loss": 0.0449, + "num_input_tokens_seen": 10138592, + "step": 48045 + }, + { + "epoch": 5.286028602860286, + "grad_norm": 0.1611122339963913, + "learning_rate": 4.6000164410528845e-05, + "loss": 0.0778, + "num_input_tokens_seen": 10139680, + "step": 48050 + }, + { + "epoch": 5.286578657865786, + "grad_norm": 0.1192583441734314, + "learning_rate": 4.599886209408739e-05, + "loss": 0.0263, + "num_input_tokens_seen": 10140736, + "step": 48055 + }, + { + "epoch": 5.287128712871287, + "grad_norm": 0.34206023812294006, + "learning_rate": 4.599755958410949e-05, + "loss": 0.0206, + "num_input_tokens_seen": 10141760, + "step": 48060 + }, + { + "epoch": 5.287678767876788, + "grad_norm": 0.17315387725830078, + "learning_rate": 4.5996256880607156e-05, + "loss": 0.0657, + "num_input_tokens_seen": 10142912, + "step": 48065 + }, + { + "epoch": 5.288228822882289, + "grad_norm": 0.04990798607468605, + "learning_rate": 4.5994953983592406e-05, + "loss": 0.0315, + "num_input_tokens_seen": 10144000, + "step": 48070 + }, + { + "epoch": 5.288778877887789, + "grad_norm": 0.22090986371040344, + "learning_rate": 4.599365089307724e-05, + "loss": 0.1362, + "num_input_tokens_seen": 10145088, + "step": 48075 + }, + { + "epoch": 5.289328932893289, + "grad_norm": 0.7243814468383789, + "learning_rate": 4.599234760907366e-05, + "loss": 0.0301, + "num_input_tokens_seen": 10146176, + "step": 48080 + }, + { + "epoch": 5.28987898789879, + "grad_norm": 0.48072174191474915, + "learning_rate": 4.599104413159369e-05, + "loss": 0.0406, + "num_input_tokens_seen": 10147264, + "step": 48085 + }, + { + "epoch": 5.29042904290429, + "grad_norm": 0.018721459433436394, + "learning_rate": 4.598974046064934e-05, + "loss": 0.0205, + "num_input_tokens_seen": 10148288, + "step": 48090 + }, + { + "epoch": 5.290979097909791, + "grad_norm": 0.029987074434757233, + "learning_rate": 4.598843659625262e-05, + "loss": 0.0287, + "num_input_tokens_seen": 10149376, + "step": 48095 + }, + { + "epoch": 5.291529152915292, + "grad_norm": 1.728981852531433, + "learning_rate": 4.598713253841556e-05, + "loss": 0.0956, + "num_input_tokens_seen": 10150368, + "step": 48100 + }, + { + "epoch": 5.292079207920792, + "grad_norm": 0.13117077946662903, + "learning_rate": 4.598582828715016e-05, + "loss": 0.0172, + "num_input_tokens_seen": 10151424, + "step": 48105 + }, + { + "epoch": 5.292629262926293, + "grad_norm": 0.23961324989795685, + "learning_rate": 4.598452384246845e-05, + "loss": 0.0465, + "num_input_tokens_seen": 10152384, + "step": 48110 + }, + { + "epoch": 5.293179317931793, + "grad_norm": 0.02371158078312874, + "learning_rate": 4.598321920438245e-05, + "loss": 0.0492, + "num_input_tokens_seen": 10153408, + "step": 48115 + }, + { + "epoch": 5.293729372937293, + "grad_norm": 0.020244378596544266, + "learning_rate": 4.59819143729042e-05, + "loss": 0.0069, + "num_input_tokens_seen": 10154432, + "step": 48120 + }, + { + "epoch": 5.2942794279427945, + "grad_norm": 0.03327883407473564, + "learning_rate": 4.5980609348045705e-05, + "loss": 0.0146, + "num_input_tokens_seen": 10155456, + "step": 48125 + }, + { + "epoch": 5.294829482948295, + "grad_norm": 0.9796680212020874, + "learning_rate": 4.5979304129819e-05, + "loss": 0.052, + "num_input_tokens_seen": 10156512, + "step": 48130 + }, + { + "epoch": 5.295379537953796, + "grad_norm": 1.9282433986663818, + "learning_rate": 4.5977998718236126e-05, + "loss": 0.0794, + "num_input_tokens_seen": 10157664, + "step": 48135 + }, + { + "epoch": 5.295929592959296, + "grad_norm": 0.03863153234124184, + "learning_rate": 4.59766931133091e-05, + "loss": 0.0146, + "num_input_tokens_seen": 10158688, + "step": 48140 + }, + { + "epoch": 5.296479647964796, + "grad_norm": 0.031070100143551826, + "learning_rate": 4.597538731504995e-05, + "loss": 0.0105, + "num_input_tokens_seen": 10159712, + "step": 48145 + }, + { + "epoch": 5.297029702970297, + "grad_norm": 1.245117425918579, + "learning_rate": 4.597408132347073e-05, + "loss": 0.1638, + "num_input_tokens_seen": 10160736, + "step": 48150 + }, + { + "epoch": 5.2975797579757975, + "grad_norm": 0.9031936526298523, + "learning_rate": 4.597277513858346e-05, + "loss": 0.1015, + "num_input_tokens_seen": 10161888, + "step": 48155 + }, + { + "epoch": 5.298129812981298, + "grad_norm": 0.013134804554283619, + "learning_rate": 4.597146876040019e-05, + "loss": 0.0204, + "num_input_tokens_seen": 10162976, + "step": 48160 + }, + { + "epoch": 5.298679867986799, + "grad_norm": 1.5143563747406006, + "learning_rate": 4.5970162188932954e-05, + "loss": 0.1741, + "num_input_tokens_seen": 10164032, + "step": 48165 + }, + { + "epoch": 5.299229922992299, + "grad_norm": 0.03141223266720772, + "learning_rate": 4.596885542419379e-05, + "loss": 0.0042, + "num_input_tokens_seen": 10165056, + "step": 48170 + }, + { + "epoch": 5.2997799779978, + "grad_norm": 0.9784573912620544, + "learning_rate": 4.596754846619475e-05, + "loss": 0.0598, + "num_input_tokens_seen": 10166144, + "step": 48175 + }, + { + "epoch": 5.3003300330033, + "grad_norm": 0.04598992317914963, + "learning_rate": 4.596624131494789e-05, + "loss": 0.0081, + "num_input_tokens_seen": 10167232, + "step": 48180 + }, + { + "epoch": 5.3008800880088005, + "grad_norm": 1.0523985624313354, + "learning_rate": 4.596493397046523e-05, + "loss": 0.0588, + "num_input_tokens_seen": 10168288, + "step": 48185 + }, + { + "epoch": 5.301430143014302, + "grad_norm": 0.04670555144548416, + "learning_rate": 4.5963626432758836e-05, + "loss": 0.0753, + "num_input_tokens_seen": 10169344, + "step": 48190 + }, + { + "epoch": 5.301980198019802, + "grad_norm": 0.012343787588179111, + "learning_rate": 4.5962318701840754e-05, + "loss": 0.0058, + "num_input_tokens_seen": 10170400, + "step": 48195 + }, + { + "epoch": 5.302530253025303, + "grad_norm": 1.0089540481567383, + "learning_rate": 4.596101077772305e-05, + "loss": 0.0402, + "num_input_tokens_seen": 10171424, + "step": 48200 + }, + { + "epoch": 5.303080308030803, + "grad_norm": 2.3982746601104736, + "learning_rate": 4.595970266041776e-05, + "loss": 0.0392, + "num_input_tokens_seen": 10172448, + "step": 48205 + }, + { + "epoch": 5.303630363036303, + "grad_norm": 0.4004497230052948, + "learning_rate": 4.595839434993695e-05, + "loss": 0.0169, + "num_input_tokens_seen": 10173504, + "step": 48210 + }, + { + "epoch": 5.304180418041804, + "grad_norm": 0.07590291649103165, + "learning_rate": 4.5957085846292674e-05, + "loss": 0.0124, + "num_input_tokens_seen": 10174592, + "step": 48215 + }, + { + "epoch": 5.304730473047305, + "grad_norm": 0.30961930751800537, + "learning_rate": 4.5955777149496994e-05, + "loss": 0.0663, + "num_input_tokens_seen": 10175648, + "step": 48220 + }, + { + "epoch": 5.305280528052805, + "grad_norm": 0.4594988226890564, + "learning_rate": 4.595446825956198e-05, + "loss": 0.051, + "num_input_tokens_seen": 10176736, + "step": 48225 + }, + { + "epoch": 5.305830583058306, + "grad_norm": 0.12773874402046204, + "learning_rate": 4.595315917649967e-05, + "loss": 0.0199, + "num_input_tokens_seen": 10177728, + "step": 48230 + }, + { + "epoch": 5.306380638063806, + "grad_norm": 0.037064168602228165, + "learning_rate": 4.5951849900322165e-05, + "loss": 0.0597, + "num_input_tokens_seen": 10178784, + "step": 48235 + }, + { + "epoch": 5.306930693069307, + "grad_norm": 0.05768599361181259, + "learning_rate": 4.59505404310415e-05, + "loss": 0.048, + "num_input_tokens_seen": 10179840, + "step": 48240 + }, + { + "epoch": 5.307480748074807, + "grad_norm": 0.016859175637364388, + "learning_rate": 4.594923076866976e-05, + "loss": 0.0039, + "num_input_tokens_seen": 10180928, + "step": 48245 + }, + { + "epoch": 5.3080308030803085, + "grad_norm": 0.07402335852384567, + "learning_rate": 4.594792091321901e-05, + "loss": 0.0156, + "num_input_tokens_seen": 10182016, + "step": 48250 + }, + { + "epoch": 5.308580858085809, + "grad_norm": 0.18362164497375488, + "learning_rate": 4.5946610864701335e-05, + "loss": 0.0393, + "num_input_tokens_seen": 10183072, + "step": 48255 + }, + { + "epoch": 5.309130913091309, + "grad_norm": 0.021958300843834877, + "learning_rate": 4.594530062312879e-05, + "loss": 0.0559, + "num_input_tokens_seen": 10184096, + "step": 48260 + }, + { + "epoch": 5.30968096809681, + "grad_norm": 0.10005341470241547, + "learning_rate": 4.594399018851346e-05, + "loss": 0.1302, + "num_input_tokens_seen": 10185152, + "step": 48265 + }, + { + "epoch": 5.31023102310231, + "grad_norm": 0.8462108373641968, + "learning_rate": 4.5942679560867425e-05, + "loss": 0.1223, + "num_input_tokens_seen": 10186208, + "step": 48270 + }, + { + "epoch": 5.31078107810781, + "grad_norm": 0.014752448536455631, + "learning_rate": 4.594136874020276e-05, + "loss": 0.0222, + "num_input_tokens_seen": 10187200, + "step": 48275 + }, + { + "epoch": 5.3113311331133115, + "grad_norm": 2.1763036251068115, + "learning_rate": 4.5940057726531546e-05, + "loss": 0.0955, + "num_input_tokens_seen": 10188256, + "step": 48280 + }, + { + "epoch": 5.311881188118812, + "grad_norm": 0.1485612690448761, + "learning_rate": 4.593874651986587e-05, + "loss": 0.007, + "num_input_tokens_seen": 10189312, + "step": 48285 + }, + { + "epoch": 5.312431243124313, + "grad_norm": 0.044501807540655136, + "learning_rate": 4.593743512021782e-05, + "loss": 0.1214, + "num_input_tokens_seen": 10190400, + "step": 48290 + }, + { + "epoch": 5.312981298129813, + "grad_norm": 0.6307691335678101, + "learning_rate": 4.5936123527599474e-05, + "loss": 0.0168, + "num_input_tokens_seen": 10191424, + "step": 48295 + }, + { + "epoch": 5.313531353135313, + "grad_norm": 0.12549777328968048, + "learning_rate": 4.593481174202292e-05, + "loss": 0.0265, + "num_input_tokens_seen": 10192512, + "step": 48300 + }, + { + "epoch": 5.314081408140814, + "grad_norm": 0.8724179863929749, + "learning_rate": 4.593349976350025e-05, + "loss": 0.0512, + "num_input_tokens_seen": 10193600, + "step": 48305 + }, + { + "epoch": 5.3146314631463145, + "grad_norm": 0.20134153962135315, + "learning_rate": 4.593218759204356e-05, + "loss": 0.0328, + "num_input_tokens_seen": 10194624, + "step": 48310 + }, + { + "epoch": 5.315181518151816, + "grad_norm": 0.9158152937889099, + "learning_rate": 4.593087522766495e-05, + "loss": 0.0902, + "num_input_tokens_seen": 10195616, + "step": 48315 + }, + { + "epoch": 5.315731573157316, + "grad_norm": 0.05464452877640724, + "learning_rate": 4.59295626703765e-05, + "loss": 0.0136, + "num_input_tokens_seen": 10196672, + "step": 48320 + }, + { + "epoch": 5.316281628162816, + "grad_norm": 0.7425295114517212, + "learning_rate": 4.5928249920190315e-05, + "loss": 0.0417, + "num_input_tokens_seen": 10197792, + "step": 48325 + }, + { + "epoch": 5.316831683168317, + "grad_norm": 0.021703427657485008, + "learning_rate": 4.5926936977118486e-05, + "loss": 0.1, + "num_input_tokens_seen": 10198784, + "step": 48330 + }, + { + "epoch": 5.317381738173817, + "grad_norm": 0.022192593663930893, + "learning_rate": 4.592562384117313e-05, + "loss": 0.1138, + "num_input_tokens_seen": 10199776, + "step": 48335 + }, + { + "epoch": 5.3179317931793175, + "grad_norm": 1.2727017402648926, + "learning_rate": 4.592431051236633e-05, + "loss": 0.0516, + "num_input_tokens_seen": 10200864, + "step": 48340 + }, + { + "epoch": 5.318481848184819, + "grad_norm": 1.046691656112671, + "learning_rate": 4.592299699071021e-05, + "loss": 0.1036, + "num_input_tokens_seen": 10201824, + "step": 48345 + }, + { + "epoch": 5.319031903190319, + "grad_norm": 2.3926846981048584, + "learning_rate": 4.592168327621686e-05, + "loss": 0.1375, + "num_input_tokens_seen": 10202912, + "step": 48350 + }, + { + "epoch": 5.31958195819582, + "grad_norm": 0.049132563173770905, + "learning_rate": 4.59203693688984e-05, + "loss": 0.0097, + "num_input_tokens_seen": 10203968, + "step": 48355 + }, + { + "epoch": 5.32013201320132, + "grad_norm": 0.2771233916282654, + "learning_rate": 4.591905526876693e-05, + "loss": 0.026, + "num_input_tokens_seen": 10204992, + "step": 48360 + }, + { + "epoch": 5.32068206820682, + "grad_norm": 0.015869304537773132, + "learning_rate": 4.5917740975834565e-05, + "loss": 0.0055, + "num_input_tokens_seen": 10206016, + "step": 48365 + }, + { + "epoch": 5.321232123212321, + "grad_norm": 0.229841947555542, + "learning_rate": 4.591642649011342e-05, + "loss": 0.068, + "num_input_tokens_seen": 10207104, + "step": 48370 + }, + { + "epoch": 5.321782178217822, + "grad_norm": 0.3343541920185089, + "learning_rate": 4.5915111811615607e-05, + "loss": 0.044, + "num_input_tokens_seen": 10208192, + "step": 48375 + }, + { + "epoch": 5.322332233223323, + "grad_norm": 0.10443765670061111, + "learning_rate": 4.591379694035325e-05, + "loss": 0.0356, + "num_input_tokens_seen": 10209216, + "step": 48380 + }, + { + "epoch": 5.322882288228823, + "grad_norm": 0.12880030274391174, + "learning_rate": 4.591248187633845e-05, + "loss": 0.0491, + "num_input_tokens_seen": 10210240, + "step": 48385 + }, + { + "epoch": 5.323432343234323, + "grad_norm": 0.06017250567674637, + "learning_rate": 4.591116661958336e-05, + "loss": 0.0697, + "num_input_tokens_seen": 10211232, + "step": 48390 + }, + { + "epoch": 5.323982398239824, + "grad_norm": 1.3344571590423584, + "learning_rate": 4.5909851170100064e-05, + "loss": 0.0787, + "num_input_tokens_seen": 10212288, + "step": 48395 + }, + { + "epoch": 5.324532453245324, + "grad_norm": 0.2426847517490387, + "learning_rate": 4.5908535527900706e-05, + "loss": 0.0128, + "num_input_tokens_seen": 10213408, + "step": 48400 + }, + { + "epoch": 5.325082508250825, + "grad_norm": 0.7285429239273071, + "learning_rate": 4.5907219692997416e-05, + "loss": 0.0381, + "num_input_tokens_seen": 10214464, + "step": 48405 + }, + { + "epoch": 5.325632563256326, + "grad_norm": 0.03301641345024109, + "learning_rate": 4.590590366540231e-05, + "loss": 0.0118, + "num_input_tokens_seen": 10215488, + "step": 48410 + }, + { + "epoch": 5.326182618261826, + "grad_norm": 0.15901736915111542, + "learning_rate": 4.590458744512752e-05, + "loss": 0.0166, + "num_input_tokens_seen": 10216544, + "step": 48415 + }, + { + "epoch": 5.326732673267327, + "grad_norm": 0.08393687009811401, + "learning_rate": 4.590327103218518e-05, + "loss": 0.0588, + "num_input_tokens_seen": 10217632, + "step": 48420 + }, + { + "epoch": 5.327282728272827, + "grad_norm": 0.010247752070426941, + "learning_rate": 4.5901954426587425e-05, + "loss": 0.0195, + "num_input_tokens_seen": 10218688, + "step": 48425 + }, + { + "epoch": 5.327832783278327, + "grad_norm": 1.0832449197769165, + "learning_rate": 4.5900637628346374e-05, + "loss": 0.0677, + "num_input_tokens_seen": 10219712, + "step": 48430 + }, + { + "epoch": 5.3283828382838285, + "grad_norm": 0.8415932059288025, + "learning_rate": 4.589932063747419e-05, + "loss": 0.0548, + "num_input_tokens_seen": 10220736, + "step": 48435 + }, + { + "epoch": 5.328932893289329, + "grad_norm": 0.282339870929718, + "learning_rate": 4.589800345398298e-05, + "loss": 0.0159, + "num_input_tokens_seen": 10221792, + "step": 48440 + }, + { + "epoch": 5.32948294829483, + "grad_norm": 0.8927667140960693, + "learning_rate": 4.589668607788492e-05, + "loss": 0.0241, + "num_input_tokens_seen": 10222816, + "step": 48445 + }, + { + "epoch": 5.33003300330033, + "grad_norm": 0.02367216721177101, + "learning_rate": 4.5895368509192115e-05, + "loss": 0.0142, + "num_input_tokens_seen": 10223904, + "step": 48450 + }, + { + "epoch": 5.33058305830583, + "grad_norm": 0.388776957988739, + "learning_rate": 4.589405074791674e-05, + "loss": 0.0492, + "num_input_tokens_seen": 10224992, + "step": 48455 + }, + { + "epoch": 5.331133113311331, + "grad_norm": 0.005676655564457178, + "learning_rate": 4.5892732794070914e-05, + "loss": 0.0978, + "num_input_tokens_seen": 10226080, + "step": 48460 + }, + { + "epoch": 5.3316831683168315, + "grad_norm": 0.02318945899605751, + "learning_rate": 4.58914146476668e-05, + "loss": 0.007, + "num_input_tokens_seen": 10227072, + "step": 48465 + }, + { + "epoch": 5.332233223322333, + "grad_norm": 0.11989102512598038, + "learning_rate": 4.589009630871653e-05, + "loss": 0.0104, + "num_input_tokens_seen": 10228128, + "step": 48470 + }, + { + "epoch": 5.332783278327833, + "grad_norm": 0.0850653350353241, + "learning_rate": 4.5888777777232286e-05, + "loss": 0.0348, + "num_input_tokens_seen": 10229248, + "step": 48475 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 2.2166574001312256, + "learning_rate": 4.588745905322619e-05, + "loss": 0.1137, + "num_input_tokens_seen": 10230240, + "step": 48480 + }, + { + "epoch": 5.333883388338834, + "grad_norm": 0.015190238133072853, + "learning_rate": 4.5886140136710396e-05, + "loss": 0.0158, + "num_input_tokens_seen": 10231328, + "step": 48485 + }, + { + "epoch": 5.334433443344334, + "grad_norm": 0.21585290133953094, + "learning_rate": 4.588482102769709e-05, + "loss": 0.0128, + "num_input_tokens_seen": 10232384, + "step": 48490 + }, + { + "epoch": 5.334983498349835, + "grad_norm": 0.018379339948296547, + "learning_rate": 4.5883501726198396e-05, + "loss": 0.0207, + "num_input_tokens_seen": 10233376, + "step": 48495 + }, + { + "epoch": 5.335533553355336, + "grad_norm": 0.35503876209259033, + "learning_rate": 4.588218223222649e-05, + "loss": 0.1173, + "num_input_tokens_seen": 10234400, + "step": 48500 + }, + { + "epoch": 5.336083608360836, + "grad_norm": 0.17801079154014587, + "learning_rate": 4.588086254579353e-05, + "loss": 0.0106, + "num_input_tokens_seen": 10235424, + "step": 48505 + }, + { + "epoch": 5.336633663366337, + "grad_norm": 1.7145088911056519, + "learning_rate": 4.587954266691169e-05, + "loss": 0.1011, + "num_input_tokens_seen": 10236448, + "step": 48510 + }, + { + "epoch": 5.337183718371837, + "grad_norm": 0.24910487234592438, + "learning_rate": 4.587822259559312e-05, + "loss": 0.0119, + "num_input_tokens_seen": 10237472, + "step": 48515 + }, + { + "epoch": 5.337733773377337, + "grad_norm": 1.152724027633667, + "learning_rate": 4.587690233184999e-05, + "loss": 0.11, + "num_input_tokens_seen": 10238496, + "step": 48520 + }, + { + "epoch": 5.338283828382838, + "grad_norm": 0.5917744636535645, + "learning_rate": 4.587558187569446e-05, + "loss": 0.209, + "num_input_tokens_seen": 10239552, + "step": 48525 + }, + { + "epoch": 5.338833883388339, + "grad_norm": 0.09657157957553864, + "learning_rate": 4.587426122713872e-05, + "loss": 0.0053, + "num_input_tokens_seen": 10240576, + "step": 48530 + }, + { + "epoch": 5.33938393839384, + "grad_norm": 0.6524280905723572, + "learning_rate": 4.587294038619493e-05, + "loss": 0.0204, + "num_input_tokens_seen": 10241632, + "step": 48535 + }, + { + "epoch": 5.33993399339934, + "grad_norm": 0.010214456357061863, + "learning_rate": 4.5871619352875256e-05, + "loss": 0.0417, + "num_input_tokens_seen": 10242592, + "step": 48540 + }, + { + "epoch": 5.34048404840484, + "grad_norm": 0.5931288599967957, + "learning_rate": 4.587029812719189e-05, + "loss": 0.0584, + "num_input_tokens_seen": 10243584, + "step": 48545 + }, + { + "epoch": 5.341034103410341, + "grad_norm": 0.12442361563444138, + "learning_rate": 4.5868976709156995e-05, + "loss": 0.0684, + "num_input_tokens_seen": 10244608, + "step": 48550 + }, + { + "epoch": 5.341584158415841, + "grad_norm": 0.3245832026004791, + "learning_rate": 4.5867655098782756e-05, + "loss": 0.0276, + "num_input_tokens_seen": 10245696, + "step": 48555 + }, + { + "epoch": 5.3421342134213425, + "grad_norm": 0.035015083849430084, + "learning_rate": 4.586633329608135e-05, + "loss": 0.0603, + "num_input_tokens_seen": 10246688, + "step": 48560 + }, + { + "epoch": 5.342684268426843, + "grad_norm": 0.2604864835739136, + "learning_rate": 4.586501130106498e-05, + "loss": 0.0677, + "num_input_tokens_seen": 10247744, + "step": 48565 + }, + { + "epoch": 5.343234323432343, + "grad_norm": 0.01736568659543991, + "learning_rate": 4.5863689113745794e-05, + "loss": 0.0506, + "num_input_tokens_seen": 10248800, + "step": 48570 + }, + { + "epoch": 5.343784378437844, + "grad_norm": 1.0192385911941528, + "learning_rate": 4.5862366734136e-05, + "loss": 0.0794, + "num_input_tokens_seen": 10249952, + "step": 48575 + }, + { + "epoch": 5.344334433443344, + "grad_norm": 0.0765167623758316, + "learning_rate": 4.586104416224779e-05, + "loss": 0.0869, + "num_input_tokens_seen": 10251040, + "step": 48580 + }, + { + "epoch": 5.3448844884488445, + "grad_norm": 0.17523007094860077, + "learning_rate": 4.5859721398093344e-05, + "loss": 0.0592, + "num_input_tokens_seen": 10252128, + "step": 48585 + }, + { + "epoch": 5.3454345434543455, + "grad_norm": 0.09091813117265701, + "learning_rate": 4.585839844168485e-05, + "loss": 0.0371, + "num_input_tokens_seen": 10253184, + "step": 48590 + }, + { + "epoch": 5.345984598459846, + "grad_norm": 0.13987679779529572, + "learning_rate": 4.5857075293034514e-05, + "loss": 0.011, + "num_input_tokens_seen": 10254240, + "step": 48595 + }, + { + "epoch": 5.346534653465347, + "grad_norm": 0.02374115213751793, + "learning_rate": 4.5855751952154515e-05, + "loss": 0.0173, + "num_input_tokens_seen": 10255360, + "step": 48600 + }, + { + "epoch": 5.347084708470847, + "grad_norm": 0.15591780841350555, + "learning_rate": 4.585442841905706e-05, + "loss": 0.0295, + "num_input_tokens_seen": 10256448, + "step": 48605 + }, + { + "epoch": 5.347634763476347, + "grad_norm": 0.7207655310630798, + "learning_rate": 4.585310469375435e-05, + "loss": 0.0236, + "num_input_tokens_seen": 10257504, + "step": 48610 + }, + { + "epoch": 5.348184818481848, + "grad_norm": 1.1093453168869019, + "learning_rate": 4.585178077625858e-05, + "loss": 0.0769, + "num_input_tokens_seen": 10258528, + "step": 48615 + }, + { + "epoch": 5.3487348734873486, + "grad_norm": 0.07081051170825958, + "learning_rate": 4.585045666658194e-05, + "loss": 0.0946, + "num_input_tokens_seen": 10259552, + "step": 48620 + }, + { + "epoch": 5.34928492849285, + "grad_norm": 0.4001659154891968, + "learning_rate": 4.584913236473666e-05, + "loss": 0.0948, + "num_input_tokens_seen": 10260544, + "step": 48625 + }, + { + "epoch": 5.34983498349835, + "grad_norm": 0.035414278507232666, + "learning_rate": 4.5847807870734926e-05, + "loss": 0.0556, + "num_input_tokens_seen": 10261568, + "step": 48630 + }, + { + "epoch": 5.35038503850385, + "grad_norm": 0.3463747203350067, + "learning_rate": 4.584648318458895e-05, + "loss": 0.0148, + "num_input_tokens_seen": 10262560, + "step": 48635 + }, + { + "epoch": 5.350935093509351, + "grad_norm": 0.8322550654411316, + "learning_rate": 4.5845158306310944e-05, + "loss": 0.0791, + "num_input_tokens_seen": 10263584, + "step": 48640 + }, + { + "epoch": 5.351485148514851, + "grad_norm": 1.0695223808288574, + "learning_rate": 4.5843833235913116e-05, + "loss": 0.1058, + "num_input_tokens_seen": 10264704, + "step": 48645 + }, + { + "epoch": 5.3520352035203524, + "grad_norm": 0.23301932215690613, + "learning_rate": 4.584250797340768e-05, + "loss": 0.0105, + "num_input_tokens_seen": 10265728, + "step": 48650 + }, + { + "epoch": 5.352585258525853, + "grad_norm": 0.1983199268579483, + "learning_rate": 4.584118251880685e-05, + "loss": 0.0147, + "num_input_tokens_seen": 10266816, + "step": 48655 + }, + { + "epoch": 5.353135313531353, + "grad_norm": 0.68904709815979, + "learning_rate": 4.583985687212283e-05, + "loss": 0.0269, + "num_input_tokens_seen": 10267808, + "step": 48660 + }, + { + "epoch": 5.353685368536854, + "grad_norm": 0.13301879167556763, + "learning_rate": 4.5838531033367856e-05, + "loss": 0.0094, + "num_input_tokens_seen": 10268832, + "step": 48665 + }, + { + "epoch": 5.354235423542354, + "grad_norm": 0.4792577028274536, + "learning_rate": 4.583720500255415e-05, + "loss": 0.0434, + "num_input_tokens_seen": 10269856, + "step": 48670 + }, + { + "epoch": 5.354785478547855, + "grad_norm": 0.137215718626976, + "learning_rate": 4.583587877969392e-05, + "loss": 0.0152, + "num_input_tokens_seen": 10270976, + "step": 48675 + }, + { + "epoch": 5.3553355335533555, + "grad_norm": 0.013348604552447796, + "learning_rate": 4.583455236479939e-05, + "loss": 0.1027, + "num_input_tokens_seen": 10272000, + "step": 48680 + }, + { + "epoch": 5.355885588558856, + "grad_norm": 0.07294129580259323, + "learning_rate": 4.5833225757882786e-05, + "loss": 0.0821, + "num_input_tokens_seen": 10272992, + "step": 48685 + }, + { + "epoch": 5.356435643564357, + "grad_norm": 0.9648712277412415, + "learning_rate": 4.5831898958956335e-05, + "loss": 0.164, + "num_input_tokens_seen": 10273984, + "step": 48690 + }, + { + "epoch": 5.356985698569857, + "grad_norm": 0.013547729700803757, + "learning_rate": 4.5830571968032277e-05, + "loss": 0.0429, + "num_input_tokens_seen": 10275040, + "step": 48695 + }, + { + "epoch": 5.357535753575357, + "grad_norm": 0.039659593254327774, + "learning_rate": 4.5829244785122827e-05, + "loss": 0.0252, + "num_input_tokens_seen": 10276096, + "step": 48700 + }, + { + "epoch": 5.358085808580858, + "grad_norm": 0.8279671669006348, + "learning_rate": 4.582791741024022e-05, + "loss": 0.1976, + "num_input_tokens_seen": 10277216, + "step": 48705 + }, + { + "epoch": 5.3586358635863585, + "grad_norm": 0.11103415489196777, + "learning_rate": 4.582658984339669e-05, + "loss": 0.0509, + "num_input_tokens_seen": 10278304, + "step": 48710 + }, + { + "epoch": 5.3591859185918596, + "grad_norm": 0.04827844724059105, + "learning_rate": 4.582526208460449e-05, + "loss": 0.0432, + "num_input_tokens_seen": 10279392, + "step": 48715 + }, + { + "epoch": 5.35973597359736, + "grad_norm": 0.020702846348285675, + "learning_rate": 4.5823934133875834e-05, + "loss": 0.0435, + "num_input_tokens_seen": 10280512, + "step": 48720 + }, + { + "epoch": 5.36028602860286, + "grad_norm": 0.03548509627580643, + "learning_rate": 4.5822605991222964e-05, + "loss": 0.0187, + "num_input_tokens_seen": 10281536, + "step": 48725 + }, + { + "epoch": 5.360836083608361, + "grad_norm": 1.0110894441604614, + "learning_rate": 4.582127765665813e-05, + "loss": 0.116, + "num_input_tokens_seen": 10282656, + "step": 48730 + }, + { + "epoch": 5.361386138613861, + "grad_norm": 1.0368250608444214, + "learning_rate": 4.581994913019357e-05, + "loss": 0.0601, + "num_input_tokens_seen": 10283744, + "step": 48735 + }, + { + "epoch": 5.361936193619362, + "grad_norm": 0.0298618134111166, + "learning_rate": 4.581862041184154e-05, + "loss": 0.0122, + "num_input_tokens_seen": 10284832, + "step": 48740 + }, + { + "epoch": 5.362486248624863, + "grad_norm": 0.07735767960548401, + "learning_rate": 4.5817291501614254e-05, + "loss": 0.008, + "num_input_tokens_seen": 10285920, + "step": 48745 + }, + { + "epoch": 5.363036303630363, + "grad_norm": 0.03359108790755272, + "learning_rate": 4.5815962399524e-05, + "loss": 0.0386, + "num_input_tokens_seen": 10286944, + "step": 48750 + }, + { + "epoch": 5.363586358635864, + "grad_norm": 0.10085958987474442, + "learning_rate": 4.5814633105583e-05, + "loss": 0.0101, + "num_input_tokens_seen": 10287968, + "step": 48755 + }, + { + "epoch": 5.364136413641364, + "grad_norm": 1.423926591873169, + "learning_rate": 4.581330361980352e-05, + "loss": 0.2083, + "num_input_tokens_seen": 10288992, + "step": 48760 + }, + { + "epoch": 5.364686468646864, + "grad_norm": 0.3598721921443939, + "learning_rate": 4.58119739421978e-05, + "loss": 0.077, + "num_input_tokens_seen": 10290048, + "step": 48765 + }, + { + "epoch": 5.365236523652365, + "grad_norm": 0.3013227880001068, + "learning_rate": 4.5810644072778106e-05, + "loss": 0.0295, + "num_input_tokens_seen": 10291136, + "step": 48770 + }, + { + "epoch": 5.365786578657866, + "grad_norm": 0.0991474911570549, + "learning_rate": 4.5809314011556694e-05, + "loss": 0.0255, + "num_input_tokens_seen": 10292192, + "step": 48775 + }, + { + "epoch": 5.366336633663367, + "grad_norm": 0.2782534956932068, + "learning_rate": 4.580798375854582e-05, + "loss": 0.0432, + "num_input_tokens_seen": 10293248, + "step": 48780 + }, + { + "epoch": 5.366886688668867, + "grad_norm": 0.6755859851837158, + "learning_rate": 4.580665331375774e-05, + "loss": 0.09, + "num_input_tokens_seen": 10294272, + "step": 48785 + }, + { + "epoch": 5.367436743674367, + "grad_norm": 0.04515312984585762, + "learning_rate": 4.5805322677204725e-05, + "loss": 0.0322, + "num_input_tokens_seen": 10295296, + "step": 48790 + }, + { + "epoch": 5.367986798679868, + "grad_norm": 0.014812895096838474, + "learning_rate": 4.580399184889903e-05, + "loss": 0.0996, + "num_input_tokens_seen": 10296416, + "step": 48795 + }, + { + "epoch": 5.368536853685368, + "grad_norm": 0.08042194694280624, + "learning_rate": 4.580266082885293e-05, + "loss": 0.1257, + "num_input_tokens_seen": 10297504, + "step": 48800 + }, + { + "epoch": 5.3690869086908695, + "grad_norm": 0.14678771793842316, + "learning_rate": 4.5801329617078685e-05, + "loss": 0.0226, + "num_input_tokens_seen": 10298560, + "step": 48805 + }, + { + "epoch": 5.36963696369637, + "grad_norm": 0.02167998068034649, + "learning_rate": 4.579999821358857e-05, + "loss": 0.0282, + "num_input_tokens_seen": 10299616, + "step": 48810 + }, + { + "epoch": 5.37018701870187, + "grad_norm": 0.09481001645326614, + "learning_rate": 4.579866661839485e-05, + "loss": 0.0543, + "num_input_tokens_seen": 10300672, + "step": 48815 + }, + { + "epoch": 5.370737073707371, + "grad_norm": 0.15533751249313354, + "learning_rate": 4.57973348315098e-05, + "loss": 0.0755, + "num_input_tokens_seen": 10301824, + "step": 48820 + }, + { + "epoch": 5.371287128712871, + "grad_norm": 0.16648206114768982, + "learning_rate": 4.579600285294568e-05, + "loss": 0.0204, + "num_input_tokens_seen": 10302848, + "step": 48825 + }, + { + "epoch": 5.371837183718371, + "grad_norm": 0.10603663325309753, + "learning_rate": 4.5794670682714805e-05, + "loss": 0.0588, + "num_input_tokens_seen": 10303840, + "step": 48830 + }, + { + "epoch": 5.3723872387238725, + "grad_norm": 0.06258408725261688, + "learning_rate": 4.5793338320829414e-05, + "loss": 0.0072, + "num_input_tokens_seen": 10304832, + "step": 48835 + }, + { + "epoch": 5.372937293729373, + "grad_norm": 0.039914730936288834, + "learning_rate": 4.57920057673018e-05, + "loss": 0.0364, + "num_input_tokens_seen": 10305824, + "step": 48840 + }, + { + "epoch": 5.373487348734874, + "grad_norm": 1.111743450164795, + "learning_rate": 4.5790673022144255e-05, + "loss": 0.0418, + "num_input_tokens_seen": 10306816, + "step": 48845 + }, + { + "epoch": 5.374037403740374, + "grad_norm": 0.07892799377441406, + "learning_rate": 4.578934008536905e-05, + "loss": 0.0629, + "num_input_tokens_seen": 10307872, + "step": 48850 + }, + { + "epoch": 5.374587458745874, + "grad_norm": 0.7914624214172363, + "learning_rate": 4.578800695698847e-05, + "loss": 0.1346, + "num_input_tokens_seen": 10308832, + "step": 48855 + }, + { + "epoch": 5.375137513751375, + "grad_norm": 0.4526178538799286, + "learning_rate": 4.578667363701481e-05, + "loss": 0.0569, + "num_input_tokens_seen": 10309888, + "step": 48860 + }, + { + "epoch": 5.3756875687568755, + "grad_norm": 1.3292421102523804, + "learning_rate": 4.578534012546034e-05, + "loss": 0.0209, + "num_input_tokens_seen": 10311008, + "step": 48865 + }, + { + "epoch": 5.376237623762377, + "grad_norm": 0.0634411945939064, + "learning_rate": 4.578400642233738e-05, + "loss": 0.0073, + "num_input_tokens_seen": 10312096, + "step": 48870 + }, + { + "epoch": 5.376787678767877, + "grad_norm": 0.3352442681789398, + "learning_rate": 4.57826725276582e-05, + "loss": 0.0209, + "num_input_tokens_seen": 10313120, + "step": 48875 + }, + { + "epoch": 5.377337733773377, + "grad_norm": 1.1732854843139648, + "learning_rate": 4.5781338441435095e-05, + "loss": 0.0794, + "num_input_tokens_seen": 10314144, + "step": 48880 + }, + { + "epoch": 5.377887788778878, + "grad_norm": 0.026759283617138863, + "learning_rate": 4.5780004163680365e-05, + "loss": 0.0026, + "num_input_tokens_seen": 10315200, + "step": 48885 + }, + { + "epoch": 5.378437843784378, + "grad_norm": 0.7199617028236389, + "learning_rate": 4.5778669694406315e-05, + "loss": 0.1138, + "num_input_tokens_seen": 10316192, + "step": 48890 + }, + { + "epoch": 5.378987898789879, + "grad_norm": 0.5772742629051208, + "learning_rate": 4.577733503362524e-05, + "loss": 0.1083, + "num_input_tokens_seen": 10317216, + "step": 48895 + }, + { + "epoch": 5.37953795379538, + "grad_norm": 0.4279453754425049, + "learning_rate": 4.577600018134943e-05, + "loss": 0.0496, + "num_input_tokens_seen": 10318272, + "step": 48900 + }, + { + "epoch": 5.38008800880088, + "grad_norm": 0.23873330652713776, + "learning_rate": 4.57746651375912e-05, + "loss": 0.0752, + "num_input_tokens_seen": 10319328, + "step": 48905 + }, + { + "epoch": 5.380638063806381, + "grad_norm": 0.4895905554294586, + "learning_rate": 4.577332990236285e-05, + "loss": 0.0332, + "num_input_tokens_seen": 10320384, + "step": 48910 + }, + { + "epoch": 5.381188118811881, + "grad_norm": 0.08230447769165039, + "learning_rate": 4.577199447567668e-05, + "loss": 0.0279, + "num_input_tokens_seen": 10321472, + "step": 48915 + }, + { + "epoch": 5.381738173817382, + "grad_norm": 0.16043050587177277, + "learning_rate": 4.577065885754501e-05, + "loss": 0.0046, + "num_input_tokens_seen": 10322528, + "step": 48920 + }, + { + "epoch": 5.382288228822882, + "grad_norm": 0.011238999664783478, + "learning_rate": 4.5769323047980146e-05, + "loss": 0.015, + "num_input_tokens_seen": 10323584, + "step": 48925 + }, + { + "epoch": 5.382838283828383, + "grad_norm": 0.5936574339866638, + "learning_rate": 4.5767987046994395e-05, + "loss": 0.0531, + "num_input_tokens_seen": 10324640, + "step": 48930 + }, + { + "epoch": 5.383388338833884, + "grad_norm": 0.8482256531715393, + "learning_rate": 4.576665085460007e-05, + "loss": 0.0537, + "num_input_tokens_seen": 10325728, + "step": 48935 + }, + { + "epoch": 5.383938393839384, + "grad_norm": 0.032846707850694656, + "learning_rate": 4.576531447080949e-05, + "loss": 0.0738, + "num_input_tokens_seen": 10326816, + "step": 48940 + }, + { + "epoch": 5.384488448844884, + "grad_norm": 0.022161051630973816, + "learning_rate": 4.576397789563497e-05, + "loss": 0.0187, + "num_input_tokens_seen": 10327840, + "step": 48945 + }, + { + "epoch": 5.385038503850385, + "grad_norm": 0.6595489382743835, + "learning_rate": 4.576264112908884e-05, + "loss": 0.0645, + "num_input_tokens_seen": 10328960, + "step": 48950 + }, + { + "epoch": 5.385588558855885, + "grad_norm": 0.01957813836634159, + "learning_rate": 4.5761304171183396e-05, + "loss": 0.0158, + "num_input_tokens_seen": 10330016, + "step": 48955 + }, + { + "epoch": 5.3861386138613865, + "grad_norm": 0.05670582503080368, + "learning_rate": 4.5759967021930976e-05, + "loss": 0.0271, + "num_input_tokens_seen": 10331040, + "step": 48960 + }, + { + "epoch": 5.386688668866887, + "grad_norm": 0.3086269795894623, + "learning_rate": 4.575862968134391e-05, + "loss": 0.0278, + "num_input_tokens_seen": 10332032, + "step": 48965 + }, + { + "epoch": 5.387238723872387, + "grad_norm": 0.8623188138008118, + "learning_rate": 4.57572921494345e-05, + "loss": 0.0489, + "num_input_tokens_seen": 10333120, + "step": 48970 + }, + { + "epoch": 5.387788778877888, + "grad_norm": 1.069831371307373, + "learning_rate": 4.57559544262151e-05, + "loss": 0.0809, + "num_input_tokens_seen": 10334144, + "step": 48975 + }, + { + "epoch": 5.388338833883388, + "grad_norm": 0.15178397297859192, + "learning_rate": 4.5754616511698026e-05, + "loss": 0.0229, + "num_input_tokens_seen": 10335168, + "step": 48980 + }, + { + "epoch": 5.388888888888889, + "grad_norm": 0.013318270444869995, + "learning_rate": 4.5753278405895614e-05, + "loss": 0.084, + "num_input_tokens_seen": 10336256, + "step": 48985 + }, + { + "epoch": 5.3894389438943895, + "grad_norm": 0.13392768800258636, + "learning_rate": 4.575194010882019e-05, + "loss": 0.0058, + "num_input_tokens_seen": 10337376, + "step": 48990 + }, + { + "epoch": 5.38998899889989, + "grad_norm": 0.760413408279419, + "learning_rate": 4.575060162048409e-05, + "loss": 0.0544, + "num_input_tokens_seen": 10338464, + "step": 48995 + }, + { + "epoch": 5.390539053905391, + "grad_norm": 0.04869284853339195, + "learning_rate": 4.574926294089965e-05, + "loss": 0.1435, + "num_input_tokens_seen": 10339552, + "step": 49000 + }, + { + "epoch": 5.391089108910891, + "grad_norm": 0.0338854156434536, + "learning_rate": 4.574792407007922e-05, + "loss": 0.0354, + "num_input_tokens_seen": 10340608, + "step": 49005 + }, + { + "epoch": 5.391639163916391, + "grad_norm": 0.32490938901901245, + "learning_rate": 4.5746585008035116e-05, + "loss": 0.0615, + "num_input_tokens_seen": 10341664, + "step": 49010 + }, + { + "epoch": 5.392189218921892, + "grad_norm": 0.012988165952265263, + "learning_rate": 4.5745245754779696e-05, + "loss": 0.0622, + "num_input_tokens_seen": 10342752, + "step": 49015 + }, + { + "epoch": 5.3927392739273925, + "grad_norm": 0.021122492849826813, + "learning_rate": 4.574390631032531e-05, + "loss": 0.0117, + "num_input_tokens_seen": 10343776, + "step": 49020 + }, + { + "epoch": 5.393289328932894, + "grad_norm": 0.20045721530914307, + "learning_rate": 4.5742566674684286e-05, + "loss": 0.0744, + "num_input_tokens_seen": 10344896, + "step": 49025 + }, + { + "epoch": 5.393839383938394, + "grad_norm": 0.8805461525917053, + "learning_rate": 4.574122684786898e-05, + "loss": 0.0861, + "num_input_tokens_seen": 10345920, + "step": 49030 + }, + { + "epoch": 5.394389438943894, + "grad_norm": 0.37815356254577637, + "learning_rate": 4.5739886829891745e-05, + "loss": 0.0377, + "num_input_tokens_seen": 10346912, + "step": 49035 + }, + { + "epoch": 5.394939493949395, + "grad_norm": 0.26447394490242004, + "learning_rate": 4.573854662076491e-05, + "loss": 0.0099, + "num_input_tokens_seen": 10347904, + "step": 49040 + }, + { + "epoch": 5.395489548954895, + "grad_norm": 0.8583385944366455, + "learning_rate": 4.573720622050085e-05, + "loss": 0.059, + "num_input_tokens_seen": 10348992, + "step": 49045 + }, + { + "epoch": 5.396039603960396, + "grad_norm": 0.7491973042488098, + "learning_rate": 4.573586562911192e-05, + "loss": 0.0497, + "num_input_tokens_seen": 10350016, + "step": 49050 + }, + { + "epoch": 5.396589658965897, + "grad_norm": 0.04814045876264572, + "learning_rate": 4.5734524846610454e-05, + "loss": 0.0204, + "num_input_tokens_seen": 10351072, + "step": 49055 + }, + { + "epoch": 5.397139713971397, + "grad_norm": 0.6459094285964966, + "learning_rate": 4.573318387300883e-05, + "loss": 0.0241, + "num_input_tokens_seen": 10352096, + "step": 49060 + }, + { + "epoch": 5.397689768976898, + "grad_norm": 0.05563272535800934, + "learning_rate": 4.573184270831939e-05, + "loss": 0.0182, + "num_input_tokens_seen": 10353152, + "step": 49065 + }, + { + "epoch": 5.398239823982398, + "grad_norm": 0.09035032242536545, + "learning_rate": 4.5730501352554515e-05, + "loss": 0.0182, + "num_input_tokens_seen": 10354272, + "step": 49070 + }, + { + "epoch": 5.398789878987899, + "grad_norm": 0.22329966723918915, + "learning_rate": 4.572915980572655e-05, + "loss": 0.028, + "num_input_tokens_seen": 10355296, + "step": 49075 + }, + { + "epoch": 5.399339933993399, + "grad_norm": 0.06396404653787613, + "learning_rate": 4.572781806784786e-05, + "loss": 0.0582, + "num_input_tokens_seen": 10356288, + "step": 49080 + }, + { + "epoch": 5.3998899889989, + "grad_norm": 0.7651981115341187, + "learning_rate": 4.572647613893083e-05, + "loss": 0.022, + "num_input_tokens_seen": 10357344, + "step": 49085 + }, + { + "epoch": 5.400440044004401, + "grad_norm": 0.2523968517780304, + "learning_rate": 4.57251340189878e-05, + "loss": 0.013, + "num_input_tokens_seen": 10358336, + "step": 49090 + }, + { + "epoch": 5.400990099009901, + "grad_norm": 0.03773493319749832, + "learning_rate": 4.5723791708031174e-05, + "loss": 0.0176, + "num_input_tokens_seen": 10359328, + "step": 49095 + }, + { + "epoch": 5.401540154015402, + "grad_norm": 5.895981788635254, + "learning_rate": 4.572244920607329e-05, + "loss": 0.0486, + "num_input_tokens_seen": 10360352, + "step": 49100 + }, + { + "epoch": 5.402090209020902, + "grad_norm": 0.031728342175483704, + "learning_rate": 4.572110651312654e-05, + "loss": 0.0335, + "num_input_tokens_seen": 10361344, + "step": 49105 + }, + { + "epoch": 5.402640264026402, + "grad_norm": 0.04394392669200897, + "learning_rate": 4.5719763629203295e-05, + "loss": 0.0223, + "num_input_tokens_seen": 10362368, + "step": 49110 + }, + { + "epoch": 5.4031903190319035, + "grad_norm": 1.4141250848770142, + "learning_rate": 4.571842055431592e-05, + "loss": 0.1142, + "num_input_tokens_seen": 10363424, + "step": 49115 + }, + { + "epoch": 5.403740374037404, + "grad_norm": 1.3635741472244263, + "learning_rate": 4.5717077288476816e-05, + "loss": 0.1286, + "num_input_tokens_seen": 10364480, + "step": 49120 + }, + { + "epoch": 5.404290429042904, + "grad_norm": 0.031237978488206863, + "learning_rate": 4.571573383169835e-05, + "loss": 0.0432, + "num_input_tokens_seen": 10365472, + "step": 49125 + }, + { + "epoch": 5.404840484048405, + "grad_norm": 0.19124753773212433, + "learning_rate": 4.571439018399291e-05, + "loss": 0.0171, + "num_input_tokens_seen": 10366528, + "step": 49130 + }, + { + "epoch": 5.405390539053905, + "grad_norm": 0.3177548050880432, + "learning_rate": 4.571304634537286e-05, + "loss": 0.0802, + "num_input_tokens_seen": 10367552, + "step": 49135 + }, + { + "epoch": 5.405940594059406, + "grad_norm": 0.21133974194526672, + "learning_rate": 4.5711702315850614e-05, + "loss": 0.0546, + "num_input_tokens_seen": 10368608, + "step": 49140 + }, + { + "epoch": 5.4064906490649065, + "grad_norm": 0.07003480941057205, + "learning_rate": 4.571035809543854e-05, + "loss": 0.0373, + "num_input_tokens_seen": 10369664, + "step": 49145 + }, + { + "epoch": 5.407040704070407, + "grad_norm": 0.15125137567520142, + "learning_rate": 4.570901368414904e-05, + "loss": 0.0154, + "num_input_tokens_seen": 10370720, + "step": 49150 + }, + { + "epoch": 5.407590759075908, + "grad_norm": 0.046794019639492035, + "learning_rate": 4.5707669081994495e-05, + "loss": 0.0149, + "num_input_tokens_seen": 10371680, + "step": 49155 + }, + { + "epoch": 5.408140814081408, + "grad_norm": 0.016720842570066452, + "learning_rate": 4.57063242889873e-05, + "loss": 0.0058, + "num_input_tokens_seen": 10372704, + "step": 49160 + }, + { + "epoch": 5.408690869086909, + "grad_norm": 0.020035581663250923, + "learning_rate": 4.570497930513985e-05, + "loss": 0.0075, + "num_input_tokens_seen": 10373760, + "step": 49165 + }, + { + "epoch": 5.409240924092409, + "grad_norm": 0.03398147597908974, + "learning_rate": 4.5703634130464535e-05, + "loss": 0.0141, + "num_input_tokens_seen": 10374752, + "step": 49170 + }, + { + "epoch": 5.4097909790979095, + "grad_norm": 0.043411098420619965, + "learning_rate": 4.570228876497377e-05, + "loss": 0.0879, + "num_input_tokens_seen": 10375808, + "step": 49175 + }, + { + "epoch": 5.410341034103411, + "grad_norm": 0.8095825910568237, + "learning_rate": 4.5700943208679935e-05, + "loss": 0.0557, + "num_input_tokens_seen": 10376800, + "step": 49180 + }, + { + "epoch": 5.410891089108911, + "grad_norm": 0.1914464235305786, + "learning_rate": 4.569959746159545e-05, + "loss": 0.0654, + "num_input_tokens_seen": 10377824, + "step": 49185 + }, + { + "epoch": 5.411441144114411, + "grad_norm": 0.23213383555412292, + "learning_rate": 4.569825152373269e-05, + "loss": 0.0141, + "num_input_tokens_seen": 10379008, + "step": 49190 + }, + { + "epoch": 5.411991199119912, + "grad_norm": 0.014098851010203362, + "learning_rate": 4.5696905395104095e-05, + "loss": 0.1224, + "num_input_tokens_seen": 10380032, + "step": 49195 + }, + { + "epoch": 5.412541254125412, + "grad_norm": 0.10335874557495117, + "learning_rate": 4.5695559075722046e-05, + "loss": 0.0541, + "num_input_tokens_seen": 10381056, + "step": 49200 + }, + { + "epoch": 5.413091309130913, + "grad_norm": 0.017871540039777756, + "learning_rate": 4.569421256559896e-05, + "loss": 0.0095, + "num_input_tokens_seen": 10382176, + "step": 49205 + }, + { + "epoch": 5.413641364136414, + "grad_norm": 0.0875869169831276, + "learning_rate": 4.569286586474725e-05, + "loss": 0.0625, + "num_input_tokens_seen": 10383232, + "step": 49210 + }, + { + "epoch": 5.414191419141914, + "grad_norm": 0.1591162383556366, + "learning_rate": 4.569151897317931e-05, + "loss": 0.0467, + "num_input_tokens_seen": 10384288, + "step": 49215 + }, + { + "epoch": 5.414741474147415, + "grad_norm": 0.20727640390396118, + "learning_rate": 4.569017189090759e-05, + "loss": 0.0611, + "num_input_tokens_seen": 10385376, + "step": 49220 + }, + { + "epoch": 5.415291529152915, + "grad_norm": 0.020881412550807, + "learning_rate": 4.568882461794448e-05, + "loss": 0.182, + "num_input_tokens_seen": 10386464, + "step": 49225 + }, + { + "epoch": 5.415841584158416, + "grad_norm": 0.04574909061193466, + "learning_rate": 4.5687477154302396e-05, + "loss": 0.0249, + "num_input_tokens_seen": 10387584, + "step": 49230 + }, + { + "epoch": 5.416391639163916, + "grad_norm": 1.2286192178726196, + "learning_rate": 4.568612949999376e-05, + "loss": 0.0953, + "num_input_tokens_seen": 10388608, + "step": 49235 + }, + { + "epoch": 5.416941694169417, + "grad_norm": 0.1961655467748642, + "learning_rate": 4.5684781655030996e-05, + "loss": 0.0238, + "num_input_tokens_seen": 10389568, + "step": 49240 + }, + { + "epoch": 5.417491749174918, + "grad_norm": 0.02306581661105156, + "learning_rate": 4.568343361942652e-05, + "loss": 0.009, + "num_input_tokens_seen": 10390592, + "step": 49245 + }, + { + "epoch": 5.418041804180418, + "grad_norm": 0.02981092408299446, + "learning_rate": 4.568208539319276e-05, + "loss": 0.0195, + "num_input_tokens_seen": 10391648, + "step": 49250 + }, + { + "epoch": 5.418591859185918, + "grad_norm": 0.8349449634552002, + "learning_rate": 4.568073697634216e-05, + "loss": 0.0785, + "num_input_tokens_seen": 10392768, + "step": 49255 + }, + { + "epoch": 5.419141914191419, + "grad_norm": 0.042754728347063065, + "learning_rate": 4.567938836888712e-05, + "loss": 0.0848, + "num_input_tokens_seen": 10393824, + "step": 49260 + }, + { + "epoch": 5.419691969196919, + "grad_norm": 0.09176013618707657, + "learning_rate": 4.567803957084008e-05, + "loss": 0.0351, + "num_input_tokens_seen": 10394880, + "step": 49265 + }, + { + "epoch": 5.4202420242024205, + "grad_norm": 1.4684807062149048, + "learning_rate": 4.567669058221347e-05, + "loss": 0.0537, + "num_input_tokens_seen": 10396000, + "step": 49270 + }, + { + "epoch": 5.420792079207921, + "grad_norm": 0.12992240488529205, + "learning_rate": 4.5675341403019725e-05, + "loss": 0.0245, + "num_input_tokens_seen": 10397088, + "step": 49275 + }, + { + "epoch": 5.421342134213421, + "grad_norm": 0.056878138333559036, + "learning_rate": 4.5673992033271286e-05, + "loss": 0.0161, + "num_input_tokens_seen": 10398144, + "step": 49280 + }, + { + "epoch": 5.421892189218922, + "grad_norm": 0.1344379186630249, + "learning_rate": 4.567264247298058e-05, + "loss": 0.0316, + "num_input_tokens_seen": 10399136, + "step": 49285 + }, + { + "epoch": 5.422442244224422, + "grad_norm": 1.097827672958374, + "learning_rate": 4.567129272216004e-05, + "loss": 0.1031, + "num_input_tokens_seen": 10400128, + "step": 49290 + }, + { + "epoch": 5.422992299229923, + "grad_norm": 0.1742350310087204, + "learning_rate": 4.566994278082212e-05, + "loss": 0.0247, + "num_input_tokens_seen": 10401184, + "step": 49295 + }, + { + "epoch": 5.4235423542354235, + "grad_norm": 0.02042117901146412, + "learning_rate": 4.566859264897926e-05, + "loss": 0.0067, + "num_input_tokens_seen": 10402272, + "step": 49300 + }, + { + "epoch": 5.424092409240924, + "grad_norm": 0.058164749294519424, + "learning_rate": 4.5667242326643894e-05, + "loss": 0.0992, + "num_input_tokens_seen": 10403360, + "step": 49305 + }, + { + "epoch": 5.424642464246425, + "grad_norm": 0.42035624384880066, + "learning_rate": 4.566589181382847e-05, + "loss": 0.0911, + "num_input_tokens_seen": 10404352, + "step": 49310 + }, + { + "epoch": 5.425192519251925, + "grad_norm": 0.021525274962186813, + "learning_rate": 4.5664541110545445e-05, + "loss": 0.0051, + "num_input_tokens_seen": 10405376, + "step": 49315 + }, + { + "epoch": 5.425742574257426, + "grad_norm": 0.7315972447395325, + "learning_rate": 4.566319021680726e-05, + "loss": 0.1593, + "num_input_tokens_seen": 10406432, + "step": 49320 + }, + { + "epoch": 5.426292629262926, + "grad_norm": 0.16735833883285522, + "learning_rate": 4.566183913262636e-05, + "loss": 0.0091, + "num_input_tokens_seen": 10407520, + "step": 49325 + }, + { + "epoch": 5.4268426842684265, + "grad_norm": 0.03988539054989815, + "learning_rate": 4.56604878580152e-05, + "loss": 0.0602, + "num_input_tokens_seen": 10408512, + "step": 49330 + }, + { + "epoch": 5.427392739273928, + "grad_norm": 0.7793100476264954, + "learning_rate": 4.5659136392986245e-05, + "loss": 0.0706, + "num_input_tokens_seen": 10409536, + "step": 49335 + }, + { + "epoch": 5.427942794279428, + "grad_norm": 0.051656197756528854, + "learning_rate": 4.5657784737551945e-05, + "loss": 0.0139, + "num_input_tokens_seen": 10410528, + "step": 49340 + }, + { + "epoch": 5.428492849284929, + "grad_norm": 0.07161653786897659, + "learning_rate": 4.565643289172475e-05, + "loss": 0.0195, + "num_input_tokens_seen": 10411552, + "step": 49345 + }, + { + "epoch": 5.429042904290429, + "grad_norm": 0.7958306074142456, + "learning_rate": 4.565508085551713e-05, + "loss": 0.0355, + "num_input_tokens_seen": 10412640, + "step": 49350 + }, + { + "epoch": 5.429592959295929, + "grad_norm": 0.15582500398159027, + "learning_rate": 4.5653728628941536e-05, + "loss": 0.1632, + "num_input_tokens_seen": 10413760, + "step": 49355 + }, + { + "epoch": 5.43014301430143, + "grad_norm": 0.04467863216996193, + "learning_rate": 4.5652376212010434e-05, + "loss": 0.0333, + "num_input_tokens_seen": 10414816, + "step": 49360 + }, + { + "epoch": 5.430693069306931, + "grad_norm": 0.14270839095115662, + "learning_rate": 4.5651023604736296e-05, + "loss": 0.0512, + "num_input_tokens_seen": 10415840, + "step": 49365 + }, + { + "epoch": 5.431243124312431, + "grad_norm": 1.5295867919921875, + "learning_rate": 4.5649670807131586e-05, + "loss": 0.0361, + "num_input_tokens_seen": 10416896, + "step": 49370 + }, + { + "epoch": 5.431793179317932, + "grad_norm": 0.07020255923271179, + "learning_rate": 4.5648317819208756e-05, + "loss": 0.0285, + "num_input_tokens_seen": 10417984, + "step": 49375 + }, + { + "epoch": 5.432343234323432, + "grad_norm": 0.05258849263191223, + "learning_rate": 4.56469646409803e-05, + "loss": 0.0338, + "num_input_tokens_seen": 10419008, + "step": 49380 + }, + { + "epoch": 5.432893289328933, + "grad_norm": 0.7383543848991394, + "learning_rate": 4.564561127245868e-05, + "loss": 0.0375, + "num_input_tokens_seen": 10420032, + "step": 49385 + }, + { + "epoch": 5.433443344334433, + "grad_norm": 0.059231411665678024, + "learning_rate": 4.5644257713656356e-05, + "loss": 0.0054, + "num_input_tokens_seen": 10421056, + "step": 49390 + }, + { + "epoch": 5.433993399339934, + "grad_norm": 0.03743019327521324, + "learning_rate": 4.564290396458582e-05, + "loss": 0.0201, + "num_input_tokens_seen": 10422112, + "step": 49395 + }, + { + "epoch": 5.434543454345435, + "grad_norm": 0.20878568291664124, + "learning_rate": 4.564155002525955e-05, + "loss": 0.1285, + "num_input_tokens_seen": 10423136, + "step": 49400 + }, + { + "epoch": 5.435093509350935, + "grad_norm": 0.015110228210687637, + "learning_rate": 4.5640195895690006e-05, + "loss": 0.0213, + "num_input_tokens_seen": 10424128, + "step": 49405 + }, + { + "epoch": 5.435643564356436, + "grad_norm": 0.4846801161766052, + "learning_rate": 4.563884157588969e-05, + "loss": 0.0426, + "num_input_tokens_seen": 10425248, + "step": 49410 + }, + { + "epoch": 5.436193619361936, + "grad_norm": 1.71420419216156, + "learning_rate": 4.563748706587107e-05, + "loss": 0.067, + "num_input_tokens_seen": 10426240, + "step": 49415 + }, + { + "epoch": 5.436743674367436, + "grad_norm": 0.031061075627803802, + "learning_rate": 4.5636132365646636e-05, + "loss": 0.2398, + "num_input_tokens_seen": 10427296, + "step": 49420 + }, + { + "epoch": 5.4372937293729375, + "grad_norm": 0.023996984586119652, + "learning_rate": 4.5634777475228874e-05, + "loss": 0.0612, + "num_input_tokens_seen": 10428352, + "step": 49425 + }, + { + "epoch": 5.437843784378438, + "grad_norm": 0.144234761595726, + "learning_rate": 4.563342239463027e-05, + "loss": 0.0223, + "num_input_tokens_seen": 10429376, + "step": 49430 + }, + { + "epoch": 5.438393839383938, + "grad_norm": 0.045351963490247726, + "learning_rate": 4.563206712386331e-05, + "loss": 0.0437, + "num_input_tokens_seen": 10430400, + "step": 49435 + }, + { + "epoch": 5.438943894389439, + "grad_norm": 0.1072038784623146, + "learning_rate": 4.563071166294049e-05, + "loss": 0.0353, + "num_input_tokens_seen": 10431456, + "step": 49440 + }, + { + "epoch": 5.439493949394939, + "grad_norm": 0.25902920961380005, + "learning_rate": 4.56293560118743e-05, + "loss": 0.093, + "num_input_tokens_seen": 10432512, + "step": 49445 + }, + { + "epoch": 5.44004400440044, + "grad_norm": 0.11571928858757019, + "learning_rate": 4.562800017067723e-05, + "loss": 0.261, + "num_input_tokens_seen": 10433568, + "step": 49450 + }, + { + "epoch": 5.4405940594059405, + "grad_norm": 0.10610591620206833, + "learning_rate": 4.5626644139361784e-05, + "loss": 0.1417, + "num_input_tokens_seen": 10434688, + "step": 49455 + }, + { + "epoch": 5.441144114411441, + "grad_norm": 0.05973219498991966, + "learning_rate": 4.5625287917940457e-05, + "loss": 0.0108, + "num_input_tokens_seen": 10435776, + "step": 49460 + }, + { + "epoch": 5.441694169416942, + "grad_norm": 0.07045316696166992, + "learning_rate": 4.5623931506425746e-05, + "loss": 0.0083, + "num_input_tokens_seen": 10436832, + "step": 49465 + }, + { + "epoch": 5.442244224422442, + "grad_norm": 0.12031333893537521, + "learning_rate": 4.562257490483016e-05, + "loss": 0.0166, + "num_input_tokens_seen": 10437952, + "step": 49470 + }, + { + "epoch": 5.442794279427943, + "grad_norm": 0.01458023488521576, + "learning_rate": 4.562121811316619e-05, + "loss": 0.018, + "num_input_tokens_seen": 10439040, + "step": 49475 + }, + { + "epoch": 5.443344334433443, + "grad_norm": 1.0640872716903687, + "learning_rate": 4.5619861131446354e-05, + "loss": 0.0193, + "num_input_tokens_seen": 10440064, + "step": 49480 + }, + { + "epoch": 5.4438943894389435, + "grad_norm": 0.38803449273109436, + "learning_rate": 4.5618503959683144e-05, + "loss": 0.0768, + "num_input_tokens_seen": 10441152, + "step": 49485 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 0.12503276765346527, + "learning_rate": 4.561714659788908e-05, + "loss": 0.0239, + "num_input_tokens_seen": 10442208, + "step": 49490 + }, + { + "epoch": 5.444994499449945, + "grad_norm": 1.673512578010559, + "learning_rate": 4.5615789046076676e-05, + "loss": 0.0649, + "num_input_tokens_seen": 10443232, + "step": 49495 + }, + { + "epoch": 5.445544554455446, + "grad_norm": 0.07208479940891266, + "learning_rate": 4.561443130425843e-05, + "loss": 0.0936, + "num_input_tokens_seen": 10444320, + "step": 49500 + }, + { + "epoch": 5.446094609460946, + "grad_norm": 0.12238186597824097, + "learning_rate": 4.561307337244687e-05, + "loss": 0.016, + "num_input_tokens_seen": 10445376, + "step": 49505 + }, + { + "epoch": 5.446644664466446, + "grad_norm": 0.1371147632598877, + "learning_rate": 4.5611715250654495e-05, + "loss": 0.069, + "num_input_tokens_seen": 10446400, + "step": 49510 + }, + { + "epoch": 5.447194719471947, + "grad_norm": 1.1490018367767334, + "learning_rate": 4.5610356938893825e-05, + "loss": 0.0437, + "num_input_tokens_seen": 10447456, + "step": 49515 + }, + { + "epoch": 5.447744774477448, + "grad_norm": 0.716625988483429, + "learning_rate": 4.56089984371774e-05, + "loss": 0.0153, + "num_input_tokens_seen": 10448480, + "step": 49520 + }, + { + "epoch": 5.448294829482949, + "grad_norm": 0.526578426361084, + "learning_rate": 4.560763974551772e-05, + "loss": 0.0178, + "num_input_tokens_seen": 10449536, + "step": 49525 + }, + { + "epoch": 5.448844884488449, + "grad_norm": 0.11393538117408752, + "learning_rate": 4.5606280863927313e-05, + "loss": 0.1408, + "num_input_tokens_seen": 10450592, + "step": 49530 + }, + { + "epoch": 5.449394939493949, + "grad_norm": 0.8505216836929321, + "learning_rate": 4.56049217924187e-05, + "loss": 0.0808, + "num_input_tokens_seen": 10451680, + "step": 49535 + }, + { + "epoch": 5.44994499449945, + "grad_norm": 0.31993943452835083, + "learning_rate": 4.560356253100442e-05, + "loss": 0.088, + "num_input_tokens_seen": 10452704, + "step": 49540 + }, + { + "epoch": 5.4504950495049505, + "grad_norm": 0.023932062089443207, + "learning_rate": 4.560220307969698e-05, + "loss": 0.0841, + "num_input_tokens_seen": 10453696, + "step": 49545 + }, + { + "epoch": 5.451045104510451, + "grad_norm": 0.013219093903899193, + "learning_rate": 4.560084343850892e-05, + "loss": 0.0221, + "num_input_tokens_seen": 10454752, + "step": 49550 + }, + { + "epoch": 5.451595159515952, + "grad_norm": 1.7768672704696655, + "learning_rate": 4.559948360745278e-05, + "loss": 0.111, + "num_input_tokens_seen": 10455840, + "step": 49555 + }, + { + "epoch": 5.452145214521452, + "grad_norm": 0.25194141268730164, + "learning_rate": 4.559812358654107e-05, + "loss": 0.0133, + "num_input_tokens_seen": 10456928, + "step": 49560 + }, + { + "epoch": 5.452695269526953, + "grad_norm": 0.6153976917266846, + "learning_rate": 4.559676337578635e-05, + "loss": 0.0541, + "num_input_tokens_seen": 10457920, + "step": 49565 + }, + { + "epoch": 5.453245324532453, + "grad_norm": 0.2023639976978302, + "learning_rate": 4.559540297520114e-05, + "loss": 0.0307, + "num_input_tokens_seen": 10458944, + "step": 49570 + }, + { + "epoch": 5.4537953795379535, + "grad_norm": 0.48355773091316223, + "learning_rate": 4.5594042384797994e-05, + "loss": 0.0888, + "num_input_tokens_seen": 10460000, + "step": 49575 + }, + { + "epoch": 5.4543454345434546, + "grad_norm": 0.04674691706895828, + "learning_rate": 4.559268160458943e-05, + "loss": 0.013, + "num_input_tokens_seen": 10461088, + "step": 49580 + }, + { + "epoch": 5.454895489548955, + "grad_norm": 1.1641210317611694, + "learning_rate": 4.5591320634588004e-05, + "loss": 0.1252, + "num_input_tokens_seen": 10462176, + "step": 49585 + }, + { + "epoch": 5.455445544554456, + "grad_norm": 0.659135639667511, + "learning_rate": 4.558995947480625e-05, + "loss": 0.0394, + "num_input_tokens_seen": 10463232, + "step": 49590 + }, + { + "epoch": 5.455995599559956, + "grad_norm": 0.09440525621175766, + "learning_rate": 4.558859812525673e-05, + "loss": 0.011, + "num_input_tokens_seen": 10464256, + "step": 49595 + }, + { + "epoch": 5.456545654565456, + "grad_norm": 0.40740689635276794, + "learning_rate": 4.558723658595198e-05, + "loss": 0.0199, + "num_input_tokens_seen": 10465312, + "step": 49600 + }, + { + "epoch": 5.457095709570957, + "grad_norm": 1.0059977769851685, + "learning_rate": 4.558587485690454e-05, + "loss": 0.0337, + "num_input_tokens_seen": 10466304, + "step": 49605 + }, + { + "epoch": 5.457645764576458, + "grad_norm": 0.07561126351356506, + "learning_rate": 4.558451293812698e-05, + "loss": 0.0067, + "num_input_tokens_seen": 10467360, + "step": 49610 + }, + { + "epoch": 5.458195819581958, + "grad_norm": 0.05166942626237869, + "learning_rate": 4.558315082963184e-05, + "loss": 0.0219, + "num_input_tokens_seen": 10468352, + "step": 49615 + }, + { + "epoch": 5.458745874587459, + "grad_norm": 0.36848917603492737, + "learning_rate": 4.558178853143167e-05, + "loss": 0.0119, + "num_input_tokens_seen": 10469472, + "step": 49620 + }, + { + "epoch": 5.459295929592959, + "grad_norm": 0.9885224103927612, + "learning_rate": 4.558042604353904e-05, + "loss": 0.1246, + "num_input_tokens_seen": 10470624, + "step": 49625 + }, + { + "epoch": 5.45984598459846, + "grad_norm": 0.08430434763431549, + "learning_rate": 4.557906336596649e-05, + "loss": 0.0801, + "num_input_tokens_seen": 10471648, + "step": 49630 + }, + { + "epoch": 5.46039603960396, + "grad_norm": 0.00804186798632145, + "learning_rate": 4.557770049872659e-05, + "loss": 0.0089, + "num_input_tokens_seen": 10472704, + "step": 49635 + }, + { + "epoch": 5.460946094609461, + "grad_norm": 1.4984022378921509, + "learning_rate": 4.5576337441831895e-05, + "loss": 0.1011, + "num_input_tokens_seen": 10473792, + "step": 49640 + }, + { + "epoch": 5.461496149614962, + "grad_norm": 0.016986500471830368, + "learning_rate": 4.557497419529498e-05, + "loss": 0.0183, + "num_input_tokens_seen": 10474848, + "step": 49645 + }, + { + "epoch": 5.462046204620462, + "grad_norm": 0.026091739535331726, + "learning_rate": 4.557361075912839e-05, + "loss": 0.0382, + "num_input_tokens_seen": 10475936, + "step": 49650 + }, + { + "epoch": 5.462596259625963, + "grad_norm": 0.30110180377960205, + "learning_rate": 4.5572247133344716e-05, + "loss": 0.0167, + "num_input_tokens_seen": 10476960, + "step": 49655 + }, + { + "epoch": 5.463146314631463, + "grad_norm": 1.139609932899475, + "learning_rate": 4.5570883317956505e-05, + "loss": 0.0395, + "num_input_tokens_seen": 10477984, + "step": 49660 + }, + { + "epoch": 5.463696369636963, + "grad_norm": 0.03952005133032799, + "learning_rate": 4.5569519312976326e-05, + "loss": 0.087, + "num_input_tokens_seen": 10479040, + "step": 49665 + }, + { + "epoch": 5.4642464246424645, + "grad_norm": 0.3420577645301819, + "learning_rate": 4.556815511841676e-05, + "loss": 0.0226, + "num_input_tokens_seen": 10480064, + "step": 49670 + }, + { + "epoch": 5.464796479647965, + "grad_norm": 1.1993292570114136, + "learning_rate": 4.556679073429039e-05, + "loss": 0.0336, + "num_input_tokens_seen": 10481088, + "step": 49675 + }, + { + "epoch": 5.465346534653466, + "grad_norm": 0.013633615337312222, + "learning_rate": 4.5565426160609766e-05, + "loss": 0.005, + "num_input_tokens_seen": 10482208, + "step": 49680 + }, + { + "epoch": 5.465896589658966, + "grad_norm": 0.0319468230009079, + "learning_rate": 4.556406139738748e-05, + "loss": 0.0257, + "num_input_tokens_seen": 10483232, + "step": 49685 + }, + { + "epoch": 5.466446644664466, + "grad_norm": 0.09381431341171265, + "learning_rate": 4.556269644463611e-05, + "loss": 0.0299, + "num_input_tokens_seen": 10484256, + "step": 49690 + }, + { + "epoch": 5.466996699669967, + "grad_norm": 0.014502204954624176, + "learning_rate": 4.556133130236823e-05, + "loss": 0.028, + "num_input_tokens_seen": 10485312, + "step": 49695 + }, + { + "epoch": 5.4675467546754675, + "grad_norm": 2.1018919944763184, + "learning_rate": 4.555996597059643e-05, + "loss": 0.0291, + "num_input_tokens_seen": 10486400, + "step": 49700 + }, + { + "epoch": 5.468096809680969, + "grad_norm": 1.419417381286621, + "learning_rate": 4.555860044933329e-05, + "loss": 0.0715, + "num_input_tokens_seen": 10487424, + "step": 49705 + }, + { + "epoch": 5.468646864686469, + "grad_norm": 0.030964210629463196, + "learning_rate": 4.555723473859139e-05, + "loss": 0.0288, + "num_input_tokens_seen": 10488480, + "step": 49710 + }, + { + "epoch": 5.469196919691969, + "grad_norm": 0.019914064556360245, + "learning_rate": 4.5555868838383326e-05, + "loss": 0.1465, + "num_input_tokens_seen": 10489536, + "step": 49715 + }, + { + "epoch": 5.46974697469747, + "grad_norm": 0.03854215145111084, + "learning_rate": 4.5554502748721687e-05, + "loss": 0.0316, + "num_input_tokens_seen": 10490688, + "step": 49720 + }, + { + "epoch": 5.47029702970297, + "grad_norm": 0.3692592978477478, + "learning_rate": 4.555313646961905e-05, + "loss": 0.0095, + "num_input_tokens_seen": 10491712, + "step": 49725 + }, + { + "epoch": 5.4708470847084705, + "grad_norm": 0.6763240098953247, + "learning_rate": 4.5551770001088015e-05, + "loss": 0.1198, + "num_input_tokens_seen": 10492832, + "step": 49730 + }, + { + "epoch": 5.471397139713972, + "grad_norm": 0.15879158675670624, + "learning_rate": 4.555040334314118e-05, + "loss": 0.0277, + "num_input_tokens_seen": 10493888, + "step": 49735 + }, + { + "epoch": 5.471947194719472, + "grad_norm": 0.9099308848381042, + "learning_rate": 4.5549036495791144e-05, + "loss": 0.0271, + "num_input_tokens_seen": 10494912, + "step": 49740 + }, + { + "epoch": 5.472497249724973, + "grad_norm": 0.25393345952033997, + "learning_rate": 4.554766945905049e-05, + "loss": 0.0238, + "num_input_tokens_seen": 10495968, + "step": 49745 + }, + { + "epoch": 5.473047304730473, + "grad_norm": 1.0485137701034546, + "learning_rate": 4.554630223293184e-05, + "loss": 0.054, + "num_input_tokens_seen": 10497056, + "step": 49750 + }, + { + "epoch": 5.473597359735973, + "grad_norm": 0.06217702478170395, + "learning_rate": 4.554493481744777e-05, + "loss": 0.0062, + "num_input_tokens_seen": 10498112, + "step": 49755 + }, + { + "epoch": 5.474147414741474, + "grad_norm": 0.2989291548728943, + "learning_rate": 4.554356721261089e-05, + "loss": 0.0346, + "num_input_tokens_seen": 10499136, + "step": 49760 + }, + { + "epoch": 5.474697469746975, + "grad_norm": 0.4024084806442261, + "learning_rate": 4.5542199418433825e-05, + "loss": 0.0365, + "num_input_tokens_seen": 10500128, + "step": 49765 + }, + { + "epoch": 5.475247524752476, + "grad_norm": 0.09356801211833954, + "learning_rate": 4.554083143492915e-05, + "loss": 0.057, + "num_input_tokens_seen": 10501152, + "step": 49770 + }, + { + "epoch": 5.475797579757976, + "grad_norm": 0.3392218351364136, + "learning_rate": 4.553946326210949e-05, + "loss": 0.0514, + "num_input_tokens_seen": 10502240, + "step": 49775 + }, + { + "epoch": 5.476347634763476, + "grad_norm": 1.0218878984451294, + "learning_rate": 4.553809489998747e-05, + "loss": 0.0747, + "num_input_tokens_seen": 10503264, + "step": 49780 + }, + { + "epoch": 5.476897689768977, + "grad_norm": 0.05140722543001175, + "learning_rate": 4.5536726348575666e-05, + "loss": 0.0178, + "num_input_tokens_seen": 10504384, + "step": 49785 + }, + { + "epoch": 5.477447744774477, + "grad_norm": 1.991697072982788, + "learning_rate": 4.553535760788672e-05, + "loss": 0.0415, + "num_input_tokens_seen": 10505440, + "step": 49790 + }, + { + "epoch": 5.477997799779978, + "grad_norm": 0.17109926044940948, + "learning_rate": 4.553398867793322e-05, + "loss": 0.0763, + "num_input_tokens_seen": 10506496, + "step": 49795 + }, + { + "epoch": 5.478547854785479, + "grad_norm": 1.195159912109375, + "learning_rate": 4.553261955872782e-05, + "loss": 0.0573, + "num_input_tokens_seen": 10507616, + "step": 49800 + }, + { + "epoch": 5.479097909790979, + "grad_norm": 0.06037871539592743, + "learning_rate": 4.553125025028311e-05, + "loss": 0.0527, + "num_input_tokens_seen": 10508704, + "step": 49805 + }, + { + "epoch": 5.47964796479648, + "grad_norm": 0.035653006285429, + "learning_rate": 4.552988075261172e-05, + "loss": 0.0137, + "num_input_tokens_seen": 10509728, + "step": 49810 + }, + { + "epoch": 5.48019801980198, + "grad_norm": 0.6396364569664001, + "learning_rate": 4.5528511065726267e-05, + "loss": 0.0316, + "num_input_tokens_seen": 10510784, + "step": 49815 + }, + { + "epoch": 5.48074807480748, + "grad_norm": 0.03354470804333687, + "learning_rate": 4.552714118963938e-05, + "loss": 0.0388, + "num_input_tokens_seen": 10511808, + "step": 49820 + }, + { + "epoch": 5.4812981298129815, + "grad_norm": 1.4265363216400146, + "learning_rate": 4.552577112436368e-05, + "loss": 0.0328, + "num_input_tokens_seen": 10512864, + "step": 49825 + }, + { + "epoch": 5.481848184818482, + "grad_norm": 0.5368505120277405, + "learning_rate": 4.5524400869911796e-05, + "loss": 0.0158, + "num_input_tokens_seen": 10513920, + "step": 49830 + }, + { + "epoch": 5.482398239823983, + "grad_norm": 0.029320357367396355, + "learning_rate": 4.552303042629637e-05, + "loss": 0.0464, + "num_input_tokens_seen": 10515040, + "step": 49835 + }, + { + "epoch": 5.482948294829483, + "grad_norm": 0.06252686679363251, + "learning_rate": 4.5521659793530006e-05, + "loss": 0.0465, + "num_input_tokens_seen": 10516128, + "step": 49840 + }, + { + "epoch": 5.483498349834983, + "grad_norm": 0.2327103614807129, + "learning_rate": 4.5520288971625354e-05, + "loss": 0.1383, + "num_input_tokens_seen": 10517152, + "step": 49845 + }, + { + "epoch": 5.484048404840484, + "grad_norm": 1.0189454555511475, + "learning_rate": 4.551891796059505e-05, + "loss": 0.0937, + "num_input_tokens_seen": 10518144, + "step": 49850 + }, + { + "epoch": 5.4845984598459845, + "grad_norm": 0.016473831608891487, + "learning_rate": 4.551754676045172e-05, + "loss": 0.0082, + "num_input_tokens_seen": 10519168, + "step": 49855 + }, + { + "epoch": 5.485148514851485, + "grad_norm": 0.08051343262195587, + "learning_rate": 4.5516175371208e-05, + "loss": 0.0505, + "num_input_tokens_seen": 10520256, + "step": 49860 + }, + { + "epoch": 5.485698569856986, + "grad_norm": 0.05095243081450462, + "learning_rate": 4.5514803792876556e-05, + "loss": 0.0115, + "num_input_tokens_seen": 10521344, + "step": 49865 + }, + { + "epoch": 5.486248624862486, + "grad_norm": 0.06377158313989639, + "learning_rate": 4.5513432025469994e-05, + "loss": 0.0067, + "num_input_tokens_seen": 10522368, + "step": 49870 + }, + { + "epoch": 5.486798679867987, + "grad_norm": 1.385521411895752, + "learning_rate": 4.5512060069000976e-05, + "loss": 0.132, + "num_input_tokens_seen": 10523392, + "step": 49875 + }, + { + "epoch": 5.487348734873487, + "grad_norm": 0.16698774695396423, + "learning_rate": 4.5510687923482145e-05, + "loss": 0.1468, + "num_input_tokens_seen": 10524512, + "step": 49880 + }, + { + "epoch": 5.4878987898789875, + "grad_norm": 0.3325038254261017, + "learning_rate": 4.550931558892615e-05, + "loss": 0.0251, + "num_input_tokens_seen": 10525568, + "step": 49885 + }, + { + "epoch": 5.488448844884489, + "grad_norm": 0.4011978209018707, + "learning_rate": 4.550794306534563e-05, + "loss": 0.0254, + "num_input_tokens_seen": 10526656, + "step": 49890 + }, + { + "epoch": 5.488998899889989, + "grad_norm": 0.43924272060394287, + "learning_rate": 4.550657035275323e-05, + "loss": 0.0498, + "num_input_tokens_seen": 10527680, + "step": 49895 + }, + { + "epoch": 5.48954895489549, + "grad_norm": 0.5511963367462158, + "learning_rate": 4.550519745116162e-05, + "loss": 0.0575, + "num_input_tokens_seen": 10528768, + "step": 49900 + }, + { + "epoch": 5.49009900990099, + "grad_norm": 0.02938121184706688, + "learning_rate": 4.5503824360583445e-05, + "loss": 0.1838, + "num_input_tokens_seen": 10529824, + "step": 49905 + }, + { + "epoch": 5.49064906490649, + "grad_norm": 0.055878058075904846, + "learning_rate": 4.550245108103136e-05, + "loss": 0.0201, + "num_input_tokens_seen": 10530880, + "step": 49910 + }, + { + "epoch": 5.491199119911991, + "grad_norm": 0.2455916702747345, + "learning_rate": 4.550107761251802e-05, + "loss": 0.0368, + "num_input_tokens_seen": 10531904, + "step": 49915 + }, + { + "epoch": 5.491749174917492, + "grad_norm": 0.7374818921089172, + "learning_rate": 4.5499703955056085e-05, + "loss": 0.0411, + "num_input_tokens_seen": 10532960, + "step": 49920 + }, + { + "epoch": 5.492299229922993, + "grad_norm": 0.02673473209142685, + "learning_rate": 4.549833010865821e-05, + "loss": 0.0235, + "num_input_tokens_seen": 10534016, + "step": 49925 + }, + { + "epoch": 5.492849284928493, + "grad_norm": 0.08288710564374924, + "learning_rate": 4.549695607333707e-05, + "loss": 0.0567, + "num_input_tokens_seen": 10535104, + "step": 49930 + }, + { + "epoch": 5.493399339933993, + "grad_norm": 1.5166460275650024, + "learning_rate": 4.549558184910532e-05, + "loss": 0.0488, + "num_input_tokens_seen": 10536096, + "step": 49935 + }, + { + "epoch": 5.493949394939494, + "grad_norm": 0.008886726573109627, + "learning_rate": 4.5494207435975625e-05, + "loss": 0.0141, + "num_input_tokens_seen": 10537120, + "step": 49940 + }, + { + "epoch": 5.494499449944994, + "grad_norm": 0.5190383195877075, + "learning_rate": 4.5492832833960654e-05, + "loss": 0.0639, + "num_input_tokens_seen": 10538176, + "step": 49945 + }, + { + "epoch": 5.4950495049504955, + "grad_norm": 0.3165706694126129, + "learning_rate": 4.549145804307308e-05, + "loss": 0.0145, + "num_input_tokens_seen": 10539296, + "step": 49950 + }, + { + "epoch": 5.495599559955996, + "grad_norm": 0.5147654414176941, + "learning_rate": 4.549008306332556e-05, + "loss": 0.0526, + "num_input_tokens_seen": 10540352, + "step": 49955 + }, + { + "epoch": 5.496149614961496, + "grad_norm": 0.025990838184952736, + "learning_rate": 4.5488707894730785e-05, + "loss": 0.0708, + "num_input_tokens_seen": 10541440, + "step": 49960 + }, + { + "epoch": 5.496699669966997, + "grad_norm": 0.020826827734708786, + "learning_rate": 4.548733253730142e-05, + "loss": 0.0779, + "num_input_tokens_seen": 10542496, + "step": 49965 + }, + { + "epoch": 5.497249724972497, + "grad_norm": 0.13424669206142426, + "learning_rate": 4.5485956991050135e-05, + "loss": 0.0979, + "num_input_tokens_seen": 10543552, + "step": 49970 + }, + { + "epoch": 5.497799779977997, + "grad_norm": 0.05773985758423805, + "learning_rate": 4.548458125598963e-05, + "loss": 0.0161, + "num_input_tokens_seen": 10544640, + "step": 49975 + }, + { + "epoch": 5.4983498349834985, + "grad_norm": 0.2584618926048279, + "learning_rate": 4.548320533213256e-05, + "loss": 0.1062, + "num_input_tokens_seen": 10545632, + "step": 49980 + }, + { + "epoch": 5.498899889988999, + "grad_norm": 1.3245583772659302, + "learning_rate": 4.548182921949161e-05, + "loss": 0.0835, + "num_input_tokens_seen": 10546752, + "step": 49985 + }, + { + "epoch": 5.4994499449945, + "grad_norm": 0.23594863712787628, + "learning_rate": 4.548045291807948e-05, + "loss": 0.021, + "num_input_tokens_seen": 10547744, + "step": 49990 + }, + { + "epoch": 5.5, + "grad_norm": 0.06491640210151672, + "learning_rate": 4.547907642790883e-05, + "loss": 0.0369, + "num_input_tokens_seen": 10548768, + "step": 49995 + }, + { + "epoch": 5.5005500550055, + "grad_norm": 0.2784770131111145, + "learning_rate": 4.5477699748992364e-05, + "loss": 0.0305, + "num_input_tokens_seen": 10549856, + "step": 50000 + }, + { + "epoch": 5.501100110011001, + "grad_norm": 1.0369967222213745, + "learning_rate": 4.547632288134277e-05, + "loss": 0.0984, + "num_input_tokens_seen": 10551040, + "step": 50005 + }, + { + "epoch": 5.5016501650165015, + "grad_norm": 1.1645221710205078, + "learning_rate": 4.547494582497272e-05, + "loss": 0.1045, + "num_input_tokens_seen": 10552160, + "step": 50010 + }, + { + "epoch": 5.502200220022003, + "grad_norm": 0.1496550589799881, + "learning_rate": 4.547356857989494e-05, + "loss": 0.0317, + "num_input_tokens_seen": 10553216, + "step": 50015 + }, + { + "epoch": 5.502750275027503, + "grad_norm": 0.027636311948299408, + "learning_rate": 4.547219114612209e-05, + "loss": 0.011, + "num_input_tokens_seen": 10554240, + "step": 50020 + }, + { + "epoch": 5.503300330033003, + "grad_norm": 0.03523097187280655, + "learning_rate": 4.547081352366688e-05, + "loss": 0.0053, + "num_input_tokens_seen": 10555360, + "step": 50025 + }, + { + "epoch": 5.503850385038504, + "grad_norm": 0.02118811570107937, + "learning_rate": 4.546943571254201e-05, + "loss": 0.1303, + "num_input_tokens_seen": 10556448, + "step": 50030 + }, + { + "epoch": 5.504400440044004, + "grad_norm": 0.10629798471927643, + "learning_rate": 4.546805771276016e-05, + "loss": 0.0215, + "num_input_tokens_seen": 10557472, + "step": 50035 + }, + { + "epoch": 5.5049504950495045, + "grad_norm": 0.090069480240345, + "learning_rate": 4.5466679524334054e-05, + "loss": 0.0078, + "num_input_tokens_seen": 10558560, + "step": 50040 + }, + { + "epoch": 5.505500550055006, + "grad_norm": 0.48565807938575745, + "learning_rate": 4.546530114727638e-05, + "loss": 0.0461, + "num_input_tokens_seen": 10559584, + "step": 50045 + }, + { + "epoch": 5.506050605060506, + "grad_norm": 0.4308294355869293, + "learning_rate": 4.546392258159985e-05, + "loss": 0.0753, + "num_input_tokens_seen": 10560608, + "step": 50050 + }, + { + "epoch": 5.506600660066007, + "grad_norm": 0.29684707522392273, + "learning_rate": 4.546254382731716e-05, + "loss": 0.0391, + "num_input_tokens_seen": 10561664, + "step": 50055 + }, + { + "epoch": 5.507150715071507, + "grad_norm": 0.01607418619096279, + "learning_rate": 4.546116488444102e-05, + "loss": 0.0073, + "num_input_tokens_seen": 10562688, + "step": 50060 + }, + { + "epoch": 5.507700770077007, + "grad_norm": 0.048320285975933075, + "learning_rate": 4.545978575298415e-05, + "loss": 0.0399, + "num_input_tokens_seen": 10563776, + "step": 50065 + }, + { + "epoch": 5.508250825082508, + "grad_norm": 0.048962920904159546, + "learning_rate": 4.5458406432959246e-05, + "loss": 0.1349, + "num_input_tokens_seen": 10564864, + "step": 50070 + }, + { + "epoch": 5.508800880088009, + "grad_norm": 0.2620251774787903, + "learning_rate": 4.545702692437903e-05, + "loss": 0.0303, + "num_input_tokens_seen": 10565920, + "step": 50075 + }, + { + "epoch": 5.50935093509351, + "grad_norm": 0.2666383981704712, + "learning_rate": 4.545564722725621e-05, + "loss": 0.0201, + "num_input_tokens_seen": 10567008, + "step": 50080 + }, + { + "epoch": 5.50990099009901, + "grad_norm": 1.0192996263504028, + "learning_rate": 4.545426734160351e-05, + "loss": 0.0383, + "num_input_tokens_seen": 10568064, + "step": 50085 + }, + { + "epoch": 5.51045104510451, + "grad_norm": 0.10392143577337265, + "learning_rate": 4.5452887267433635e-05, + "loss": 0.0248, + "num_input_tokens_seen": 10569152, + "step": 50090 + }, + { + "epoch": 5.511001100110011, + "grad_norm": 0.07807164639234543, + "learning_rate": 4.545150700475932e-05, + "loss": 0.0265, + "num_input_tokens_seen": 10570208, + "step": 50095 + }, + { + "epoch": 5.511551155115511, + "grad_norm": 0.010008405894041061, + "learning_rate": 4.5450126553593275e-05, + "loss": 0.1296, + "num_input_tokens_seen": 10571296, + "step": 50100 + }, + { + "epoch": 5.512101210121012, + "grad_norm": 0.6376540660858154, + "learning_rate": 4.544874591394822e-05, + "loss": 0.0262, + "num_input_tokens_seen": 10572320, + "step": 50105 + }, + { + "epoch": 5.512651265126513, + "grad_norm": 0.3459271192550659, + "learning_rate": 4.54473650858369e-05, + "loss": 0.057, + "num_input_tokens_seen": 10573376, + "step": 50110 + }, + { + "epoch": 5.513201320132013, + "grad_norm": 0.2100045382976532, + "learning_rate": 4.5445984069272016e-05, + "loss": 0.0477, + "num_input_tokens_seen": 10574432, + "step": 50115 + }, + { + "epoch": 5.513751375137514, + "grad_norm": 2.0967776775360107, + "learning_rate": 4.5444602864266314e-05, + "loss": 0.0588, + "num_input_tokens_seen": 10575520, + "step": 50120 + }, + { + "epoch": 5.514301430143014, + "grad_norm": 0.1295788586139679, + "learning_rate": 4.544322147083252e-05, + "loss": 0.0427, + "num_input_tokens_seen": 10576544, + "step": 50125 + }, + { + "epoch": 5.514851485148515, + "grad_norm": 0.7680515050888062, + "learning_rate": 4.5441839888983354e-05, + "loss": 0.1083, + "num_input_tokens_seen": 10577536, + "step": 50130 + }, + { + "epoch": 5.5154015401540155, + "grad_norm": 0.12766076624393463, + "learning_rate": 4.544045811873157e-05, + "loss": 0.0151, + "num_input_tokens_seen": 10578560, + "step": 50135 + }, + { + "epoch": 5.515951595159516, + "grad_norm": 0.08626049011945724, + "learning_rate": 4.543907616008988e-05, + "loss": 0.0342, + "num_input_tokens_seen": 10579584, + "step": 50140 + }, + { + "epoch": 5.516501650165017, + "grad_norm": 0.07363832741975784, + "learning_rate": 4.5437694013071044e-05, + "loss": 0.0204, + "num_input_tokens_seen": 10580608, + "step": 50145 + }, + { + "epoch": 5.517051705170517, + "grad_norm": 0.021047892048954964, + "learning_rate": 4.543631167768778e-05, + "loss": 0.025, + "num_input_tokens_seen": 10581632, + "step": 50150 + }, + { + "epoch": 5.517601760176017, + "grad_norm": 0.010535507462918758, + "learning_rate": 4.5434929153952846e-05, + "loss": 0.0416, + "num_input_tokens_seen": 10582688, + "step": 50155 + }, + { + "epoch": 5.518151815181518, + "grad_norm": 0.6662623882293701, + "learning_rate": 4.5433546441878966e-05, + "loss": 0.0217, + "num_input_tokens_seen": 10583744, + "step": 50160 + }, + { + "epoch": 5.5187018701870185, + "grad_norm": 0.023816736415028572, + "learning_rate": 4.54321635414789e-05, + "loss": 0.0431, + "num_input_tokens_seen": 10584800, + "step": 50165 + }, + { + "epoch": 5.51925192519252, + "grad_norm": 0.32911571860313416, + "learning_rate": 4.5430780452765386e-05, + "loss": 0.095, + "num_input_tokens_seen": 10585888, + "step": 50170 + }, + { + "epoch": 5.51980198019802, + "grad_norm": 0.5998891592025757, + "learning_rate": 4.5429397175751165e-05, + "loss": 0.0274, + "num_input_tokens_seen": 10587008, + "step": 50175 + }, + { + "epoch": 5.52035203520352, + "grad_norm": 0.12618273496627808, + "learning_rate": 4.5428013710449e-05, + "loss": 0.0114, + "num_input_tokens_seen": 10588064, + "step": 50180 + }, + { + "epoch": 5.520902090209021, + "grad_norm": 0.35300928354263306, + "learning_rate": 4.542663005687164e-05, + "loss": 0.1525, + "num_input_tokens_seen": 10589184, + "step": 50185 + }, + { + "epoch": 5.521452145214521, + "grad_norm": 0.04501356557011604, + "learning_rate": 4.542524621503182e-05, + "loss": 0.0161, + "num_input_tokens_seen": 10590240, + "step": 50190 + }, + { + "epoch": 5.522002200220022, + "grad_norm": 0.022869883105158806, + "learning_rate": 4.5423862184942326e-05, + "loss": 0.0446, + "num_input_tokens_seen": 10591296, + "step": 50195 + }, + { + "epoch": 5.522552255225523, + "grad_norm": 0.9341749548912048, + "learning_rate": 4.542247796661587e-05, + "loss": 0.0926, + "num_input_tokens_seen": 10592416, + "step": 50200 + }, + { + "epoch": 5.523102310231023, + "grad_norm": 0.04099937528371811, + "learning_rate": 4.542109356006525e-05, + "loss": 0.0763, + "num_input_tokens_seen": 10593440, + "step": 50205 + }, + { + "epoch": 5.523652365236524, + "grad_norm": 0.04040747508406639, + "learning_rate": 4.541970896530321e-05, + "loss": 0.0563, + "num_input_tokens_seen": 10594528, + "step": 50210 + }, + { + "epoch": 5.524202420242024, + "grad_norm": 0.8736318945884705, + "learning_rate": 4.541832418234251e-05, + "loss": 0.0723, + "num_input_tokens_seen": 10595584, + "step": 50215 + }, + { + "epoch": 5.524752475247524, + "grad_norm": 0.9048600792884827, + "learning_rate": 4.541693921119591e-05, + "loss": 0.0214, + "num_input_tokens_seen": 10596608, + "step": 50220 + }, + { + "epoch": 5.525302530253025, + "grad_norm": 0.6967094540596008, + "learning_rate": 4.5415554051876174e-05, + "loss": 0.0605, + "num_input_tokens_seen": 10597632, + "step": 50225 + }, + { + "epoch": 5.525852585258526, + "grad_norm": 0.011417102999985218, + "learning_rate": 4.541416870439608e-05, + "loss": 0.0058, + "num_input_tokens_seen": 10598656, + "step": 50230 + }, + { + "epoch": 5.526402640264027, + "grad_norm": 0.014607582241296768, + "learning_rate": 4.541278316876839e-05, + "loss": 0.0301, + "num_input_tokens_seen": 10599712, + "step": 50235 + }, + { + "epoch": 5.526952695269527, + "grad_norm": 0.016029685735702515, + "learning_rate": 4.5411397445005864e-05, + "loss": 0.0091, + "num_input_tokens_seen": 10600736, + "step": 50240 + }, + { + "epoch": 5.527502750275027, + "grad_norm": 0.32343339920043945, + "learning_rate": 4.541001153312129e-05, + "loss": 0.0664, + "num_input_tokens_seen": 10601824, + "step": 50245 + }, + { + "epoch": 5.528052805280528, + "grad_norm": 0.08560920506715775, + "learning_rate": 4.540862543312743e-05, + "loss": 0.0339, + "num_input_tokens_seen": 10602880, + "step": 50250 + }, + { + "epoch": 5.528602860286028, + "grad_norm": 0.7321897745132446, + "learning_rate": 4.540723914503706e-05, + "loss": 0.0688, + "num_input_tokens_seen": 10603872, + "step": 50255 + }, + { + "epoch": 5.5291529152915295, + "grad_norm": 1.0892746448516846, + "learning_rate": 4.5405852668862955e-05, + "loss": 0.1069, + "num_input_tokens_seen": 10604896, + "step": 50260 + }, + { + "epoch": 5.52970297029703, + "grad_norm": 0.010077451355755329, + "learning_rate": 4.540446600461791e-05, + "loss": 0.0285, + "num_input_tokens_seen": 10605952, + "step": 50265 + }, + { + "epoch": 5.53025302530253, + "grad_norm": 1.4946589469909668, + "learning_rate": 4.5403079152314686e-05, + "loss": 0.0258, + "num_input_tokens_seen": 10606976, + "step": 50270 + }, + { + "epoch": 5.530803080308031, + "grad_norm": 1.4396549463272095, + "learning_rate": 4.5401692111966074e-05, + "loss": 0.0678, + "num_input_tokens_seen": 10608064, + "step": 50275 + }, + { + "epoch": 5.531353135313531, + "grad_norm": 0.4428737759590149, + "learning_rate": 4.5400304883584856e-05, + "loss": 0.048, + "num_input_tokens_seen": 10609152, + "step": 50280 + }, + { + "epoch": 5.531903190319031, + "grad_norm": 0.764453649520874, + "learning_rate": 4.539891746718381e-05, + "loss": 0.0163, + "num_input_tokens_seen": 10610240, + "step": 50285 + }, + { + "epoch": 5.5324532453245325, + "grad_norm": 0.7232774496078491, + "learning_rate": 4.539752986277574e-05, + "loss": 0.0145, + "num_input_tokens_seen": 10611360, + "step": 50290 + }, + { + "epoch": 5.533003300330033, + "grad_norm": 0.17098484933376312, + "learning_rate": 4.539614207037342e-05, + "loss": 0.0405, + "num_input_tokens_seen": 10612384, + "step": 50295 + }, + { + "epoch": 5.533553355335534, + "grad_norm": 0.206923246383667, + "learning_rate": 4.539475408998964e-05, + "loss": 0.1041, + "num_input_tokens_seen": 10613440, + "step": 50300 + }, + { + "epoch": 5.534103410341034, + "grad_norm": 1.0422768592834473, + "learning_rate": 4.539336592163721e-05, + "loss": 0.0277, + "num_input_tokens_seen": 10614528, + "step": 50305 + }, + { + "epoch": 5.534653465346535, + "grad_norm": 0.16748251020908356, + "learning_rate": 4.5391977565328904e-05, + "loss": 0.036, + "num_input_tokens_seen": 10615616, + "step": 50310 + }, + { + "epoch": 5.535203520352035, + "grad_norm": 0.06128665432333946, + "learning_rate": 4.5390589021077536e-05, + "loss": 0.0098, + "num_input_tokens_seen": 10616704, + "step": 50315 + }, + { + "epoch": 5.5357535753575355, + "grad_norm": 1.8394898176193237, + "learning_rate": 4.5389200288895876e-05, + "loss": 0.1342, + "num_input_tokens_seen": 10617856, + "step": 50320 + }, + { + "epoch": 5.536303630363037, + "grad_norm": 0.10155277699232101, + "learning_rate": 4.538781136879675e-05, + "loss": 0.0071, + "num_input_tokens_seen": 10618912, + "step": 50325 + }, + { + "epoch": 5.536853685368537, + "grad_norm": 0.02287473902106285, + "learning_rate": 4.538642226079295e-05, + "loss": 0.0125, + "num_input_tokens_seen": 10619936, + "step": 50330 + }, + { + "epoch": 5.537403740374037, + "grad_norm": 0.17165234684944153, + "learning_rate": 4.538503296489728e-05, + "loss": 0.008, + "num_input_tokens_seen": 10620960, + "step": 50335 + }, + { + "epoch": 5.537953795379538, + "grad_norm": 0.37441134452819824, + "learning_rate": 4.538364348112254e-05, + "loss": 0.0463, + "num_input_tokens_seen": 10622048, + "step": 50340 + }, + { + "epoch": 5.538503850385038, + "grad_norm": 0.219062939286232, + "learning_rate": 4.538225380948154e-05, + "loss": 0.0115, + "num_input_tokens_seen": 10623072, + "step": 50345 + }, + { + "epoch": 5.539053905390539, + "grad_norm": 0.08163754642009735, + "learning_rate": 4.538086394998709e-05, + "loss": 0.1325, + "num_input_tokens_seen": 10624064, + "step": 50350 + }, + { + "epoch": 5.53960396039604, + "grad_norm": 2.1821327209472656, + "learning_rate": 4.537947390265199e-05, + "loss": 0.1247, + "num_input_tokens_seen": 10625120, + "step": 50355 + }, + { + "epoch": 5.54015401540154, + "grad_norm": 0.08317968994379044, + "learning_rate": 4.537808366748906e-05, + "loss": 0.0423, + "num_input_tokens_seen": 10626144, + "step": 50360 + }, + { + "epoch": 5.540704070407041, + "grad_norm": 0.02945161610841751, + "learning_rate": 4.537669324451111e-05, + "loss": 0.1178, + "num_input_tokens_seen": 10627168, + "step": 50365 + }, + { + "epoch": 5.541254125412541, + "grad_norm": 0.25049757957458496, + "learning_rate": 4.537530263373097e-05, + "loss": 0.0399, + "num_input_tokens_seen": 10628192, + "step": 50370 + }, + { + "epoch": 5.541804180418042, + "grad_norm": 0.719774067401886, + "learning_rate": 4.537391183516142e-05, + "loss": 0.0516, + "num_input_tokens_seen": 10629152, + "step": 50375 + }, + { + "epoch": 5.542354235423542, + "grad_norm": 0.006144884042441845, + "learning_rate": 4.5372520848815316e-05, + "loss": 0.038, + "num_input_tokens_seen": 10630304, + "step": 50380 + }, + { + "epoch": 5.542904290429043, + "grad_norm": 1.4702768325805664, + "learning_rate": 4.537112967470546e-05, + "loss": 0.1454, + "num_input_tokens_seen": 10631360, + "step": 50385 + }, + { + "epoch": 5.543454345434544, + "grad_norm": 0.012661873362958431, + "learning_rate": 4.536973831284468e-05, + "loss": 0.0048, + "num_input_tokens_seen": 10632352, + "step": 50390 + }, + { + "epoch": 5.544004400440044, + "grad_norm": 0.5508081912994385, + "learning_rate": 4.5368346763245786e-05, + "loss": 0.0241, + "num_input_tokens_seen": 10633440, + "step": 50395 + }, + { + "epoch": 5.544554455445544, + "grad_norm": 0.03640619292855263, + "learning_rate": 4.536695502592162e-05, + "loss": 0.2434, + "num_input_tokens_seen": 10634560, + "step": 50400 + }, + { + "epoch": 5.545104510451045, + "grad_norm": 0.01869778148829937, + "learning_rate": 4.5365563100885e-05, + "loss": 0.0306, + "num_input_tokens_seen": 10635616, + "step": 50405 + }, + { + "epoch": 5.5456545654565454, + "grad_norm": 0.07444692403078079, + "learning_rate": 4.536417098814877e-05, + "loss": 0.015, + "num_input_tokens_seen": 10636672, + "step": 50410 + }, + { + "epoch": 5.5462046204620465, + "grad_norm": 0.5364521145820618, + "learning_rate": 4.5362778687725733e-05, + "loss": 0.0178, + "num_input_tokens_seen": 10637792, + "step": 50415 + }, + { + "epoch": 5.546754675467547, + "grad_norm": 0.35398611426353455, + "learning_rate": 4.536138619962874e-05, + "loss": 0.0192, + "num_input_tokens_seen": 10638816, + "step": 50420 + }, + { + "epoch": 5.547304730473047, + "grad_norm": 2.67130970954895, + "learning_rate": 4.535999352387062e-05, + "loss": 0.069, + "num_input_tokens_seen": 10639904, + "step": 50425 + }, + { + "epoch": 5.547854785478548, + "grad_norm": 0.052670545876026154, + "learning_rate": 4.535860066046421e-05, + "loss": 0.097, + "num_input_tokens_seen": 10640992, + "step": 50430 + }, + { + "epoch": 5.548404840484048, + "grad_norm": 0.9750362038612366, + "learning_rate": 4.535720760942235e-05, + "loss": 0.0454, + "num_input_tokens_seen": 10642016, + "step": 50435 + }, + { + "epoch": 5.548954895489549, + "grad_norm": 0.006201181095093489, + "learning_rate": 4.535581437075787e-05, + "loss": 0.0587, + "num_input_tokens_seen": 10643072, + "step": 50440 + }, + { + "epoch": 5.5495049504950495, + "grad_norm": 0.1559630036354065, + "learning_rate": 4.535442094448362e-05, + "loss": 0.0511, + "num_input_tokens_seen": 10644096, + "step": 50445 + }, + { + "epoch": 5.55005500550055, + "grad_norm": 0.09186477959156036, + "learning_rate": 4.535302733061244e-05, + "loss": 0.0428, + "num_input_tokens_seen": 10645152, + "step": 50450 + }, + { + "epoch": 5.550605060506051, + "grad_norm": 0.6772266030311584, + "learning_rate": 4.535163352915717e-05, + "loss": 0.0431, + "num_input_tokens_seen": 10646240, + "step": 50455 + }, + { + "epoch": 5.551155115511551, + "grad_norm": 0.04070374742150307, + "learning_rate": 4.535023954013066e-05, + "loss": 0.0703, + "num_input_tokens_seen": 10647264, + "step": 50460 + }, + { + "epoch": 5.551705170517051, + "grad_norm": 0.2210502326488495, + "learning_rate": 4.534884536354575e-05, + "loss": 0.042, + "num_input_tokens_seen": 10648384, + "step": 50465 + }, + { + "epoch": 5.552255225522552, + "grad_norm": 0.1096465066075325, + "learning_rate": 4.5347450999415315e-05, + "loss": 0.0542, + "num_input_tokens_seen": 10649408, + "step": 50470 + }, + { + "epoch": 5.552805280528053, + "grad_norm": 0.40071186423301697, + "learning_rate": 4.5346056447752176e-05, + "loss": 0.0154, + "num_input_tokens_seen": 10650496, + "step": 50475 + }, + { + "epoch": 5.553355335533554, + "grad_norm": 0.2448631227016449, + "learning_rate": 4.53446617085692e-05, + "loss": 0.0307, + "num_input_tokens_seen": 10651552, + "step": 50480 + }, + { + "epoch": 5.553905390539054, + "grad_norm": 0.06873227655887604, + "learning_rate": 4.534326678187924e-05, + "loss": 0.0596, + "num_input_tokens_seen": 10652512, + "step": 50485 + }, + { + "epoch": 5.554455445544555, + "grad_norm": 0.05921262875199318, + "learning_rate": 4.534187166769515e-05, + "loss": 0.0357, + "num_input_tokens_seen": 10653600, + "step": 50490 + }, + { + "epoch": 5.555005500550055, + "grad_norm": 1.3156839609146118, + "learning_rate": 4.5340476366029796e-05, + "loss": 0.0634, + "num_input_tokens_seen": 10654592, + "step": 50495 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.14755156636238098, + "learning_rate": 4.533908087689602e-05, + "loss": 0.0243, + "num_input_tokens_seen": 10655648, + "step": 50500 + }, + { + "epoch": 5.5561056105610565, + "grad_norm": 0.8924551606178284, + "learning_rate": 4.533768520030671e-05, + "loss": 0.0341, + "num_input_tokens_seen": 10656832, + "step": 50505 + }, + { + "epoch": 5.556655665566557, + "grad_norm": 0.021786268800497055, + "learning_rate": 4.53362893362747e-05, + "loss": 0.0469, + "num_input_tokens_seen": 10657856, + "step": 50510 + }, + { + "epoch": 5.557205720572057, + "grad_norm": 1.7180066108703613, + "learning_rate": 4.5334893284812876e-05, + "loss": 0.0329, + "num_input_tokens_seen": 10658880, + "step": 50515 + }, + { + "epoch": 5.557755775577558, + "grad_norm": 0.04143122211098671, + "learning_rate": 4.5333497045934096e-05, + "loss": 0.0225, + "num_input_tokens_seen": 10659936, + "step": 50520 + }, + { + "epoch": 5.558305830583058, + "grad_norm": 0.3408620059490204, + "learning_rate": 4.533210061965123e-05, + "loss": 0.0868, + "num_input_tokens_seen": 10660960, + "step": 50525 + }, + { + "epoch": 5.558855885588558, + "grad_norm": 0.07973184436559677, + "learning_rate": 4.533070400597715e-05, + "loss": 0.0372, + "num_input_tokens_seen": 10662016, + "step": 50530 + }, + { + "epoch": 5.5594059405940595, + "grad_norm": 1.3794208765029907, + "learning_rate": 4.532930720492473e-05, + "loss": 0.0894, + "num_input_tokens_seen": 10663136, + "step": 50535 + }, + { + "epoch": 5.55995599559956, + "grad_norm": 0.005236690863966942, + "learning_rate": 4.532791021650684e-05, + "loss": 0.0109, + "num_input_tokens_seen": 10664224, + "step": 50540 + }, + { + "epoch": 5.560506050605061, + "grad_norm": 0.09783430397510529, + "learning_rate": 4.532651304073636e-05, + "loss": 0.0234, + "num_input_tokens_seen": 10665312, + "step": 50545 + }, + { + "epoch": 5.561056105610561, + "grad_norm": 0.24715468287467957, + "learning_rate": 4.532511567762615e-05, + "loss": 0.0239, + "num_input_tokens_seen": 10666368, + "step": 50550 + }, + { + "epoch": 5.561606160616062, + "grad_norm": 0.04864952340722084, + "learning_rate": 4.5323718127189116e-05, + "loss": 0.1118, + "num_input_tokens_seen": 10667456, + "step": 50555 + }, + { + "epoch": 5.562156215621562, + "grad_norm": 1.2755690813064575, + "learning_rate": 4.532232038943811e-05, + "loss": 0.0494, + "num_input_tokens_seen": 10668544, + "step": 50560 + }, + { + "epoch": 5.5627062706270625, + "grad_norm": 0.06526029855012894, + "learning_rate": 4.532092246438604e-05, + "loss": 0.0192, + "num_input_tokens_seen": 10669568, + "step": 50565 + }, + { + "epoch": 5.563256325632564, + "grad_norm": 0.3107900619506836, + "learning_rate": 4.531952435204577e-05, + "loss": 0.0359, + "num_input_tokens_seen": 10670592, + "step": 50570 + }, + { + "epoch": 5.563806380638064, + "grad_norm": 0.015471394173800945, + "learning_rate": 4.53181260524302e-05, + "loss": 0.0396, + "num_input_tokens_seen": 10671648, + "step": 50575 + }, + { + "epoch": 5.564356435643564, + "grad_norm": 1.8104572296142578, + "learning_rate": 4.531672756555221e-05, + "loss": 0.1386, + "num_input_tokens_seen": 10672672, + "step": 50580 + }, + { + "epoch": 5.564906490649065, + "grad_norm": 1.0843042135238647, + "learning_rate": 4.531532889142469e-05, + "loss": 0.2194, + "num_input_tokens_seen": 10673728, + "step": 50585 + }, + { + "epoch": 5.565456545654565, + "grad_norm": 0.009685025550425053, + "learning_rate": 4.531393003006054e-05, + "loss": 0.0364, + "num_input_tokens_seen": 10674784, + "step": 50590 + }, + { + "epoch": 5.566006600660066, + "grad_norm": 0.49247848987579346, + "learning_rate": 4.531253098147263e-05, + "loss": 0.1383, + "num_input_tokens_seen": 10675776, + "step": 50595 + }, + { + "epoch": 5.566556655665567, + "grad_norm": 0.2281976044178009, + "learning_rate": 4.531113174567389e-05, + "loss": 0.0101, + "num_input_tokens_seen": 10676896, + "step": 50600 + }, + { + "epoch": 5.567106710671067, + "grad_norm": 1.320273756980896, + "learning_rate": 4.530973232267717e-05, + "loss": 0.0302, + "num_input_tokens_seen": 10677952, + "step": 50605 + }, + { + "epoch": 5.567656765676568, + "grad_norm": 0.8165944218635559, + "learning_rate": 4.5308332712495406e-05, + "loss": 0.0266, + "num_input_tokens_seen": 10678976, + "step": 50610 + }, + { + "epoch": 5.568206820682068, + "grad_norm": 1.3963520526885986, + "learning_rate": 4.530693291514149e-05, + "loss": 0.1105, + "num_input_tokens_seen": 10680128, + "step": 50615 + }, + { + "epoch": 5.568756875687569, + "grad_norm": 0.32296496629714966, + "learning_rate": 4.5305532930628305e-05, + "loss": 0.0135, + "num_input_tokens_seen": 10681088, + "step": 50620 + }, + { + "epoch": 5.569306930693069, + "grad_norm": 1.2835698127746582, + "learning_rate": 4.530413275896877e-05, + "loss": 0.1014, + "num_input_tokens_seen": 10682144, + "step": 50625 + }, + { + "epoch": 5.56985698569857, + "grad_norm": 0.19974884390830994, + "learning_rate": 4.5302732400175786e-05, + "loss": 0.0074, + "num_input_tokens_seen": 10683136, + "step": 50630 + }, + { + "epoch": 5.570407040704071, + "grad_norm": 1.0444068908691406, + "learning_rate": 4.530133185426226e-05, + "loss": 0.103, + "num_input_tokens_seen": 10684192, + "step": 50635 + }, + { + "epoch": 5.570957095709571, + "grad_norm": 0.5068324208259583, + "learning_rate": 4.52999311212411e-05, + "loss": 0.0245, + "num_input_tokens_seen": 10685280, + "step": 50640 + }, + { + "epoch": 5.571507150715071, + "grad_norm": 0.08323196321725845, + "learning_rate": 4.5298530201125224e-05, + "loss": 0.0617, + "num_input_tokens_seen": 10686368, + "step": 50645 + }, + { + "epoch": 5.572057205720572, + "grad_norm": 0.021116968244314194, + "learning_rate": 4.529712909392753e-05, + "loss": 0.0177, + "num_input_tokens_seen": 10687360, + "step": 50650 + }, + { + "epoch": 5.572607260726072, + "grad_norm": 0.4779502749443054, + "learning_rate": 4.529572779966093e-05, + "loss": 0.0414, + "num_input_tokens_seen": 10688384, + "step": 50655 + }, + { + "epoch": 5.5731573157315735, + "grad_norm": 0.11405952274799347, + "learning_rate": 4.5294326318338355e-05, + "loss": 0.0214, + "num_input_tokens_seen": 10689408, + "step": 50660 + }, + { + "epoch": 5.573707370737074, + "grad_norm": 0.029439587146043777, + "learning_rate": 4.5292924649972704e-05, + "loss": 0.0204, + "num_input_tokens_seen": 10690400, + "step": 50665 + }, + { + "epoch": 5.574257425742574, + "grad_norm": 1.4938853979110718, + "learning_rate": 4.5291522794576914e-05, + "loss": 0.048, + "num_input_tokens_seen": 10691456, + "step": 50670 + }, + { + "epoch": 5.574807480748075, + "grad_norm": 0.10407089442014694, + "learning_rate": 4.5290120752163885e-05, + "loss": 0.0867, + "num_input_tokens_seen": 10692480, + "step": 50675 + }, + { + "epoch": 5.575357535753575, + "grad_norm": 0.02624429203569889, + "learning_rate": 4.5288718522746554e-05, + "loss": 0.0211, + "num_input_tokens_seen": 10693568, + "step": 50680 + }, + { + "epoch": 5.575907590759076, + "grad_norm": 0.4394429326057434, + "learning_rate": 4.528731610633784e-05, + "loss": 0.0371, + "num_input_tokens_seen": 10694560, + "step": 50685 + }, + { + "epoch": 5.5764576457645765, + "grad_norm": 0.03557097166776657, + "learning_rate": 4.528591350295067e-05, + "loss": 0.0041, + "num_input_tokens_seen": 10695616, + "step": 50690 + }, + { + "epoch": 5.577007700770077, + "grad_norm": 0.08914511650800705, + "learning_rate": 4.528451071259796e-05, + "loss": 0.0333, + "num_input_tokens_seen": 10696672, + "step": 50695 + }, + { + "epoch": 5.577557755775578, + "grad_norm": 0.08706855773925781, + "learning_rate": 4.5283107735292665e-05, + "loss": 0.1267, + "num_input_tokens_seen": 10697728, + "step": 50700 + }, + { + "epoch": 5.578107810781078, + "grad_norm": 0.1743350774049759, + "learning_rate": 4.5281704571047687e-05, + "loss": 0.0725, + "num_input_tokens_seen": 10698816, + "step": 50705 + }, + { + "epoch": 5.578657865786578, + "grad_norm": 0.07777372002601624, + "learning_rate": 4.528030121987597e-05, + "loss": 0.0328, + "num_input_tokens_seen": 10699904, + "step": 50710 + }, + { + "epoch": 5.579207920792079, + "grad_norm": 0.4882460832595825, + "learning_rate": 4.527889768179044e-05, + "loss": 0.0695, + "num_input_tokens_seen": 10700960, + "step": 50715 + }, + { + "epoch": 5.5797579757975795, + "grad_norm": 0.09747160971164703, + "learning_rate": 4.527749395680406e-05, + "loss": 0.1819, + "num_input_tokens_seen": 10702016, + "step": 50720 + }, + { + "epoch": 5.580308030803081, + "grad_norm": 0.8786129355430603, + "learning_rate": 4.5276090044929736e-05, + "loss": 0.1464, + "num_input_tokens_seen": 10703040, + "step": 50725 + }, + { + "epoch": 5.580858085808581, + "grad_norm": 0.7590784430503845, + "learning_rate": 4.527468594618043e-05, + "loss": 0.0562, + "num_input_tokens_seen": 10704128, + "step": 50730 + }, + { + "epoch": 5.581408140814082, + "grad_norm": 0.01689777337014675, + "learning_rate": 4.527328166056906e-05, + "loss": 0.037, + "num_input_tokens_seen": 10705248, + "step": 50735 + }, + { + "epoch": 5.581958195819582, + "grad_norm": 0.7442623972892761, + "learning_rate": 4.527187718810859e-05, + "loss": 0.1228, + "num_input_tokens_seen": 10706304, + "step": 50740 + }, + { + "epoch": 5.582508250825082, + "grad_norm": 0.029137400910258293, + "learning_rate": 4.527047252881195e-05, + "loss": 0.0084, + "num_input_tokens_seen": 10707328, + "step": 50745 + }, + { + "epoch": 5.583058305830583, + "grad_norm": 1.0916707515716553, + "learning_rate": 4.526906768269209e-05, + "loss": 0.0743, + "num_input_tokens_seen": 10708384, + "step": 50750 + }, + { + "epoch": 5.583608360836084, + "grad_norm": 0.9230132699012756, + "learning_rate": 4.5267662649761965e-05, + "loss": 0.1324, + "num_input_tokens_seen": 10709536, + "step": 50755 + }, + { + "epoch": 5.584158415841584, + "grad_norm": 0.3971574306488037, + "learning_rate": 4.5266257430034514e-05, + "loss": 0.0113, + "num_input_tokens_seen": 10710592, + "step": 50760 + }, + { + "epoch": 5.584708470847085, + "grad_norm": 0.0632753074169159, + "learning_rate": 4.52648520235227e-05, + "loss": 0.0118, + "num_input_tokens_seen": 10711616, + "step": 50765 + }, + { + "epoch": 5.585258525852585, + "grad_norm": 0.37976112961769104, + "learning_rate": 4.526344643023946e-05, + "loss": 0.0165, + "num_input_tokens_seen": 10712640, + "step": 50770 + }, + { + "epoch": 5.585808580858086, + "grad_norm": 0.8684609532356262, + "learning_rate": 4.5262040650197755e-05, + "loss": 0.1295, + "num_input_tokens_seen": 10713696, + "step": 50775 + }, + { + "epoch": 5.586358635863586, + "grad_norm": 0.10744085907936096, + "learning_rate": 4.5260634683410554e-05, + "loss": 0.0244, + "num_input_tokens_seen": 10714816, + "step": 50780 + }, + { + "epoch": 5.586908690869087, + "grad_norm": 0.38565921783447266, + "learning_rate": 4.52592285298908e-05, + "loss": 0.0201, + "num_input_tokens_seen": 10715840, + "step": 50785 + }, + { + "epoch": 5.587458745874588, + "grad_norm": 0.49048298597335815, + "learning_rate": 4.525782218965146e-05, + "loss": 0.0197, + "num_input_tokens_seen": 10716992, + "step": 50790 + }, + { + "epoch": 5.588008800880088, + "grad_norm": 0.03437020257115364, + "learning_rate": 4.52564156627055e-05, + "loss": 0.0161, + "num_input_tokens_seen": 10718048, + "step": 50795 + }, + { + "epoch": 5.588558855885589, + "grad_norm": 0.1472814381122589, + "learning_rate": 4.525500894906587e-05, + "loss": 0.0296, + "num_input_tokens_seen": 10719104, + "step": 50800 + }, + { + "epoch": 5.589108910891089, + "grad_norm": 1.6564548015594482, + "learning_rate": 4.5253602048745535e-05, + "loss": 0.0512, + "num_input_tokens_seen": 10720192, + "step": 50805 + }, + { + "epoch": 5.589658965896589, + "grad_norm": 0.014398125000298023, + "learning_rate": 4.525219496175748e-05, + "loss": 0.0131, + "num_input_tokens_seen": 10721312, + "step": 50810 + }, + { + "epoch": 5.5902090209020905, + "grad_norm": 0.012444701977074146, + "learning_rate": 4.5250787688114653e-05, + "loss": 0.0787, + "num_input_tokens_seen": 10722400, + "step": 50815 + }, + { + "epoch": 5.590759075907591, + "grad_norm": 0.7763575911521912, + "learning_rate": 4.524938022783004e-05, + "loss": 0.0507, + "num_input_tokens_seen": 10723488, + "step": 50820 + }, + { + "epoch": 5.591309130913091, + "grad_norm": 0.4650108814239502, + "learning_rate": 4.52479725809166e-05, + "loss": 0.0436, + "num_input_tokens_seen": 10724480, + "step": 50825 + }, + { + "epoch": 5.591859185918592, + "grad_norm": 0.06314881891012192, + "learning_rate": 4.5246564747387323e-05, + "loss": 0.0426, + "num_input_tokens_seen": 10725504, + "step": 50830 + }, + { + "epoch": 5.592409240924092, + "grad_norm": 0.13271360099315643, + "learning_rate": 4.524515672725516e-05, + "loss": 0.0197, + "num_input_tokens_seen": 10726560, + "step": 50835 + }, + { + "epoch": 5.592959295929593, + "grad_norm": 0.012301219627261162, + "learning_rate": 4.524374852053311e-05, + "loss": 0.0732, + "num_input_tokens_seen": 10727552, + "step": 50840 + }, + { + "epoch": 5.5935093509350935, + "grad_norm": 0.3284331262111664, + "learning_rate": 4.524234012723414e-05, + "loss": 0.0904, + "num_input_tokens_seen": 10728576, + "step": 50845 + }, + { + "epoch": 5.594059405940594, + "grad_norm": 0.9340175986289978, + "learning_rate": 4.524093154737124e-05, + "loss": 0.0952, + "num_input_tokens_seen": 10729632, + "step": 50850 + }, + { + "epoch": 5.594609460946095, + "grad_norm": 1.048765778541565, + "learning_rate": 4.5239522780957384e-05, + "loss": 0.0576, + "num_input_tokens_seen": 10730720, + "step": 50855 + }, + { + "epoch": 5.595159515951595, + "grad_norm": 0.1796456128358841, + "learning_rate": 4.523811382800556e-05, + "loss": 0.0124, + "num_input_tokens_seen": 10731776, + "step": 50860 + }, + { + "epoch": 5.595709570957096, + "grad_norm": 0.7435715198516846, + "learning_rate": 4.523670468852875e-05, + "loss": 0.0389, + "num_input_tokens_seen": 10732832, + "step": 50865 + }, + { + "epoch": 5.596259625962596, + "grad_norm": 0.0438062809407711, + "learning_rate": 4.5235295362539944e-05, + "loss": 0.0716, + "num_input_tokens_seen": 10733888, + "step": 50870 + }, + { + "epoch": 5.5968096809680965, + "grad_norm": 0.04860885441303253, + "learning_rate": 4.5233885850052125e-05, + "loss": 0.0288, + "num_input_tokens_seen": 10734976, + "step": 50875 + }, + { + "epoch": 5.597359735973598, + "grad_norm": 0.29002436995506287, + "learning_rate": 4.523247615107829e-05, + "loss": 0.0091, + "num_input_tokens_seen": 10735968, + "step": 50880 + }, + { + "epoch": 5.597909790979098, + "grad_norm": 0.01104607991874218, + "learning_rate": 4.523106626563144e-05, + "loss": 0.0592, + "num_input_tokens_seen": 10737056, + "step": 50885 + }, + { + "epoch": 5.598459845984598, + "grad_norm": 0.07273833453655243, + "learning_rate": 4.522965619372456e-05, + "loss": 0.0942, + "num_input_tokens_seen": 10738080, + "step": 50890 + }, + { + "epoch": 5.599009900990099, + "grad_norm": 0.524899959564209, + "learning_rate": 4.522824593537064e-05, + "loss": 0.0154, + "num_input_tokens_seen": 10739168, + "step": 50895 + }, + { + "epoch": 5.599559955995599, + "grad_norm": 0.09986144304275513, + "learning_rate": 4.522683549058268e-05, + "loss": 0.0182, + "num_input_tokens_seen": 10740288, + "step": 50900 + }, + { + "epoch": 5.6001100110011, + "grad_norm": 0.05532244220376015, + "learning_rate": 4.522542485937369e-05, + "loss": 0.0479, + "num_input_tokens_seen": 10741440, + "step": 50905 + }, + { + "epoch": 5.600660066006601, + "grad_norm": 0.19670192897319794, + "learning_rate": 4.5224014041756655e-05, + "loss": 0.1187, + "num_input_tokens_seen": 10742464, + "step": 50910 + }, + { + "epoch": 5.601210121012102, + "grad_norm": 0.08245156705379486, + "learning_rate": 4.52226030377446e-05, + "loss": 0.0137, + "num_input_tokens_seen": 10743488, + "step": 50915 + }, + { + "epoch": 5.601760176017602, + "grad_norm": 0.6781405210494995, + "learning_rate": 4.5221191847350505e-05, + "loss": 0.0731, + "num_input_tokens_seen": 10744512, + "step": 50920 + }, + { + "epoch": 5.602310231023102, + "grad_norm": 0.20081056654453278, + "learning_rate": 4.52197804705874e-05, + "loss": 0.0227, + "num_input_tokens_seen": 10745536, + "step": 50925 + }, + { + "epoch": 5.602860286028603, + "grad_norm": 0.029105843976140022, + "learning_rate": 4.5218368907468273e-05, + "loss": 0.0163, + "num_input_tokens_seen": 10746592, + "step": 50930 + }, + { + "epoch": 5.603410341034103, + "grad_norm": 0.02989654615521431, + "learning_rate": 4.521695715800614e-05, + "loss": 0.0642, + "num_input_tokens_seen": 10747712, + "step": 50935 + }, + { + "epoch": 5.603960396039604, + "grad_norm": 0.3266373574733734, + "learning_rate": 4.5215545222214016e-05, + "loss": 0.0558, + "num_input_tokens_seen": 10748736, + "step": 50940 + }, + { + "epoch": 5.604510451045105, + "grad_norm": 1.1483118534088135, + "learning_rate": 4.521413310010492e-05, + "loss": 0.0821, + "num_input_tokens_seen": 10749824, + "step": 50945 + }, + { + "epoch": 5.605060506050605, + "grad_norm": 0.6103024482727051, + "learning_rate": 4.521272079169184e-05, + "loss": 0.0323, + "num_input_tokens_seen": 10750944, + "step": 50950 + }, + { + "epoch": 5.605610561056105, + "grad_norm": 0.17712931334972382, + "learning_rate": 4.521130829698783e-05, + "loss": 0.0455, + "num_input_tokens_seen": 10752032, + "step": 50955 + }, + { + "epoch": 5.606160616061606, + "grad_norm": 0.4012081027030945, + "learning_rate": 4.5209895616005885e-05, + "loss": 0.0869, + "num_input_tokens_seen": 10753120, + "step": 50960 + }, + { + "epoch": 5.606710671067106, + "grad_norm": 0.15988658368587494, + "learning_rate": 4.5208482748759016e-05, + "loss": 0.1085, + "num_input_tokens_seen": 10754208, + "step": 50965 + }, + { + "epoch": 5.6072607260726075, + "grad_norm": 0.4736259877681732, + "learning_rate": 4.5207069695260274e-05, + "loss": 0.0677, + "num_input_tokens_seen": 10755296, + "step": 50970 + }, + { + "epoch": 5.607810781078108, + "grad_norm": 0.14691288769245148, + "learning_rate": 4.520565645552266e-05, + "loss": 0.0446, + "num_input_tokens_seen": 10756384, + "step": 50975 + }, + { + "epoch": 5.608360836083609, + "grad_norm": 0.46289557218551636, + "learning_rate": 4.5204243029559215e-05, + "loss": 0.0351, + "num_input_tokens_seen": 10757408, + "step": 50980 + }, + { + "epoch": 5.608910891089109, + "grad_norm": 0.010967986658215523, + "learning_rate": 4.520282941738295e-05, + "loss": 0.085, + "num_input_tokens_seen": 10758464, + "step": 50985 + }, + { + "epoch": 5.609460946094609, + "grad_norm": 0.10959960520267487, + "learning_rate": 4.5201415619006906e-05, + "loss": 0.028, + "num_input_tokens_seen": 10759584, + "step": 50990 + }, + { + "epoch": 5.61001100110011, + "grad_norm": 0.05101575329899788, + "learning_rate": 4.52000016344441e-05, + "loss": 0.0226, + "num_input_tokens_seen": 10760704, + "step": 50995 + }, + { + "epoch": 5.6105610561056105, + "grad_norm": 0.09509971737861633, + "learning_rate": 4.519858746370757e-05, + "loss": 0.1102, + "num_input_tokens_seen": 10761760, + "step": 51000 + }, + { + "epoch": 5.611111111111111, + "grad_norm": 0.020101910457015038, + "learning_rate": 4.519717310681036e-05, + "loss": 0.0042, + "num_input_tokens_seen": 10762816, + "step": 51005 + }, + { + "epoch": 5.611661166116612, + "grad_norm": 0.0298805832862854, + "learning_rate": 4.519575856376549e-05, + "loss": 0.0168, + "num_input_tokens_seen": 10763840, + "step": 51010 + }, + { + "epoch": 5.612211221122112, + "grad_norm": 0.039149876683950424, + "learning_rate": 4.519434383458601e-05, + "loss": 0.0102, + "num_input_tokens_seen": 10764896, + "step": 51015 + }, + { + "epoch": 5.612761276127613, + "grad_norm": 0.013034423813223839, + "learning_rate": 4.519292891928495e-05, + "loss": 0.0119, + "num_input_tokens_seen": 10765888, + "step": 51020 + }, + { + "epoch": 5.613311331133113, + "grad_norm": 0.9022788405418396, + "learning_rate": 4.5191513817875354e-05, + "loss": 0.0162, + "num_input_tokens_seen": 10766976, + "step": 51025 + }, + { + "epoch": 5.6138613861386135, + "grad_norm": 0.061985183507204056, + "learning_rate": 4.5190098530370263e-05, + "loss": 0.1357, + "num_input_tokens_seen": 10768000, + "step": 51030 + }, + { + "epoch": 5.614411441144115, + "grad_norm": 1.1865438222885132, + "learning_rate": 4.518868305678273e-05, + "loss": 0.1441, + "num_input_tokens_seen": 10769024, + "step": 51035 + }, + { + "epoch": 5.614961496149615, + "grad_norm": 0.015674330294132233, + "learning_rate": 4.518726739712578e-05, + "loss": 0.0441, + "num_input_tokens_seen": 10770112, + "step": 51040 + }, + { + "epoch": 5.615511551155116, + "grad_norm": 0.07096944749355316, + "learning_rate": 4.518585155141248e-05, + "loss": 0.0141, + "num_input_tokens_seen": 10771136, + "step": 51045 + }, + { + "epoch": 5.616061606160616, + "grad_norm": 0.08981800079345703, + "learning_rate": 4.5184435519655875e-05, + "loss": 0.0559, + "num_input_tokens_seen": 10772192, + "step": 51050 + }, + { + "epoch": 5.616611661166116, + "grad_norm": 0.955899715423584, + "learning_rate": 4.518301930186901e-05, + "loss": 0.0585, + "num_input_tokens_seen": 10773216, + "step": 51055 + }, + { + "epoch": 5.617161716171617, + "grad_norm": 0.03955701366066933, + "learning_rate": 4.5181602898064934e-05, + "loss": 0.0849, + "num_input_tokens_seen": 10774336, + "step": 51060 + }, + { + "epoch": 5.617711771177118, + "grad_norm": 0.20000481605529785, + "learning_rate": 4.518018630825672e-05, + "loss": 0.0644, + "num_input_tokens_seen": 10775424, + "step": 51065 + }, + { + "epoch": 5.618261826182618, + "grad_norm": 1.0672364234924316, + "learning_rate": 4.517876953245741e-05, + "loss": 0.0262, + "num_input_tokens_seen": 10776448, + "step": 51070 + }, + { + "epoch": 5.618811881188119, + "grad_norm": 0.8086877465248108, + "learning_rate": 4.517735257068005e-05, + "loss": 0.0764, + "num_input_tokens_seen": 10777504, + "step": 51075 + }, + { + "epoch": 5.619361936193619, + "grad_norm": 0.6203060746192932, + "learning_rate": 4.517593542293773e-05, + "loss": 0.0871, + "num_input_tokens_seen": 10778560, + "step": 51080 + }, + { + "epoch": 5.61991199119912, + "grad_norm": 0.09599179774522781, + "learning_rate": 4.517451808924349e-05, + "loss": 0.1258, + "num_input_tokens_seen": 10779616, + "step": 51085 + }, + { + "epoch": 5.62046204620462, + "grad_norm": 0.024864396080374718, + "learning_rate": 4.5173100569610395e-05, + "loss": 0.0136, + "num_input_tokens_seen": 10780672, + "step": 51090 + }, + { + "epoch": 5.621012101210121, + "grad_norm": 0.9491181373596191, + "learning_rate": 4.517168286405151e-05, + "loss": 0.0472, + "num_input_tokens_seen": 10781728, + "step": 51095 + }, + { + "epoch": 5.621562156215622, + "grad_norm": 0.024569440633058548, + "learning_rate": 4.517026497257991e-05, + "loss": 0.0169, + "num_input_tokens_seen": 10782784, + "step": 51100 + }, + { + "epoch": 5.622112211221122, + "grad_norm": 0.033871475607156754, + "learning_rate": 4.516884689520866e-05, + "loss": 0.0343, + "num_input_tokens_seen": 10783872, + "step": 51105 + }, + { + "epoch": 5.622662266226623, + "grad_norm": 0.012583665549755096, + "learning_rate": 4.516742863195081e-05, + "loss": 0.0736, + "num_input_tokens_seen": 10784960, + "step": 51110 + }, + { + "epoch": 5.623212321232123, + "grad_norm": 0.0657477155327797, + "learning_rate": 4.516601018281946e-05, + "loss": 0.0099, + "num_input_tokens_seen": 10786016, + "step": 51115 + }, + { + "epoch": 5.623762376237623, + "grad_norm": 0.5050876140594482, + "learning_rate": 4.516459154782767e-05, + "loss": 0.0835, + "num_input_tokens_seen": 10787136, + "step": 51120 + }, + { + "epoch": 5.6243124312431245, + "grad_norm": 0.365993857383728, + "learning_rate": 4.516317272698851e-05, + "loss": 0.0534, + "num_input_tokens_seen": 10788192, + "step": 51125 + }, + { + "epoch": 5.624862486248625, + "grad_norm": 0.20490431785583496, + "learning_rate": 4.5161753720315067e-05, + "loss": 0.1459, + "num_input_tokens_seen": 10789280, + "step": 51130 + }, + { + "epoch": 5.625412541254125, + "grad_norm": 0.08107542991638184, + "learning_rate": 4.516033452782041e-05, + "loss": 0.0712, + "num_input_tokens_seen": 10790304, + "step": 51135 + }, + { + "epoch": 5.625962596259626, + "grad_norm": 0.045956097543239594, + "learning_rate": 4.515891514951763e-05, + "loss": 0.0735, + "num_input_tokens_seen": 10791360, + "step": 51140 + }, + { + "epoch": 5.626512651265126, + "grad_norm": 0.04504062235355377, + "learning_rate": 4.51574955854198e-05, + "loss": 0.0902, + "num_input_tokens_seen": 10792480, + "step": 51145 + }, + { + "epoch": 5.627062706270627, + "grad_norm": 1.1180226802825928, + "learning_rate": 4.515607583554e-05, + "loss": 0.0329, + "num_input_tokens_seen": 10793568, + "step": 51150 + }, + { + "epoch": 5.6276127612761275, + "grad_norm": 0.035641707479953766, + "learning_rate": 4.515465589989133e-05, + "loss": 0.0489, + "num_input_tokens_seen": 10794656, + "step": 51155 + }, + { + "epoch": 5.628162816281629, + "grad_norm": 0.9378198981285095, + "learning_rate": 4.515323577848686e-05, + "loss": 0.1717, + "num_input_tokens_seen": 10795744, + "step": 51160 + }, + { + "epoch": 5.628712871287129, + "grad_norm": 0.06679866462945938, + "learning_rate": 4.5151815471339694e-05, + "loss": 0.0691, + "num_input_tokens_seen": 10796800, + "step": 51165 + }, + { + "epoch": 5.629262926292629, + "grad_norm": 0.20412443578243256, + "learning_rate": 4.515039497846291e-05, + "loss": 0.0156, + "num_input_tokens_seen": 10797824, + "step": 51170 + }, + { + "epoch": 5.62981298129813, + "grad_norm": 0.5998905301094055, + "learning_rate": 4.51489742998696e-05, + "loss": 0.0286, + "num_input_tokens_seen": 10798944, + "step": 51175 + }, + { + "epoch": 5.63036303630363, + "grad_norm": 0.9811620712280273, + "learning_rate": 4.514755343557287e-05, + "loss": 0.0239, + "num_input_tokens_seen": 10799968, + "step": 51180 + }, + { + "epoch": 5.6309130913091305, + "grad_norm": 0.17768225073814392, + "learning_rate": 4.51461323855858e-05, + "loss": 0.0196, + "num_input_tokens_seen": 10801088, + "step": 51185 + }, + { + "epoch": 5.631463146314632, + "grad_norm": 0.029030535370111465, + "learning_rate": 4.51447111499215e-05, + "loss": 0.0352, + "num_input_tokens_seen": 10802208, + "step": 51190 + }, + { + "epoch": 5.632013201320132, + "grad_norm": 0.0825519859790802, + "learning_rate": 4.5143289728593064e-05, + "loss": 0.0168, + "num_input_tokens_seen": 10803232, + "step": 51195 + }, + { + "epoch": 5.632563256325633, + "grad_norm": 0.26129424571990967, + "learning_rate": 4.51418681216136e-05, + "loss": 0.0752, + "num_input_tokens_seen": 10804288, + "step": 51200 + }, + { + "epoch": 5.633113311331133, + "grad_norm": 0.45165517926216125, + "learning_rate": 4.514044632899619e-05, + "loss": 0.0873, + "num_input_tokens_seen": 10805344, + "step": 51205 + }, + { + "epoch": 5.633663366336633, + "grad_norm": 0.6373658180236816, + "learning_rate": 4.513902435075395e-05, + "loss": 0.1013, + "num_input_tokens_seen": 10806400, + "step": 51210 + }, + { + "epoch": 5.634213421342134, + "grad_norm": 0.028876202180981636, + "learning_rate": 4.5137602186899993e-05, + "loss": 0.0321, + "num_input_tokens_seen": 10807520, + "step": 51215 + }, + { + "epoch": 5.634763476347635, + "grad_norm": 0.4760751724243164, + "learning_rate": 4.513617983744741e-05, + "loss": 0.0391, + "num_input_tokens_seen": 10808608, + "step": 51220 + }, + { + "epoch": 5.635313531353136, + "grad_norm": 0.0664588063955307, + "learning_rate": 4.513475730240934e-05, + "loss": 0.0114, + "num_input_tokens_seen": 10809664, + "step": 51225 + }, + { + "epoch": 5.635863586358636, + "grad_norm": 0.1956254541873932, + "learning_rate": 4.513333458179886e-05, + "loss": 0.0077, + "num_input_tokens_seen": 10810688, + "step": 51230 + }, + { + "epoch": 5.636413641364136, + "grad_norm": 0.029507363215088844, + "learning_rate": 4.513191167562909e-05, + "loss": 0.026, + "num_input_tokens_seen": 10811840, + "step": 51235 + }, + { + "epoch": 5.636963696369637, + "grad_norm": 1.08778715133667, + "learning_rate": 4.513048858391316e-05, + "loss": 0.14, + "num_input_tokens_seen": 10812896, + "step": 51240 + }, + { + "epoch": 5.637513751375137, + "grad_norm": 1.5011835098266602, + "learning_rate": 4.512906530666417e-05, + "loss": 0.1401, + "num_input_tokens_seen": 10814016, + "step": 51245 + }, + { + "epoch": 5.638063806380638, + "grad_norm": 0.023770127445459366, + "learning_rate": 4.512764184389525e-05, + "loss": 0.0157, + "num_input_tokens_seen": 10815072, + "step": 51250 + }, + { + "epoch": 5.638613861386139, + "grad_norm": 0.08114422857761383, + "learning_rate": 4.5126218195619506e-05, + "loss": 0.078, + "num_input_tokens_seen": 10816128, + "step": 51255 + }, + { + "epoch": 5.639163916391639, + "grad_norm": 0.17292749881744385, + "learning_rate": 4.512479436185008e-05, + "loss": 0.0552, + "num_input_tokens_seen": 10817280, + "step": 51260 + }, + { + "epoch": 5.63971397139714, + "grad_norm": 0.1595049798488617, + "learning_rate": 4.512337034260006e-05, + "loss": 0.0683, + "num_input_tokens_seen": 10818304, + "step": 51265 + }, + { + "epoch": 5.64026402640264, + "grad_norm": 0.08205852657556534, + "learning_rate": 4.512194613788261e-05, + "loss": 0.0582, + "num_input_tokens_seen": 10819328, + "step": 51270 + }, + { + "epoch": 5.6408140814081404, + "grad_norm": 0.017651710659265518, + "learning_rate": 4.512052174771083e-05, + "loss": 0.0136, + "num_input_tokens_seen": 10820384, + "step": 51275 + }, + { + "epoch": 5.6413641364136415, + "grad_norm": 0.8040279746055603, + "learning_rate": 4.5119097172097855e-05, + "loss": 0.0623, + "num_input_tokens_seen": 10821440, + "step": 51280 + }, + { + "epoch": 5.641914191419142, + "grad_norm": 0.3957270681858063, + "learning_rate": 4.511767241105682e-05, + "loss": 0.0202, + "num_input_tokens_seen": 10822464, + "step": 51285 + }, + { + "epoch": 5.642464246424643, + "grad_norm": 0.07218804955482483, + "learning_rate": 4.5116247464600847e-05, + "loss": 0.0334, + "num_input_tokens_seen": 10823520, + "step": 51290 + }, + { + "epoch": 5.643014301430143, + "grad_norm": 0.21591398119926453, + "learning_rate": 4.511482233274308e-05, + "loss": 0.0422, + "num_input_tokens_seen": 10824576, + "step": 51295 + }, + { + "epoch": 5.643564356435643, + "grad_norm": 0.02975657768547535, + "learning_rate": 4.5113397015496636e-05, + "loss": 0.0068, + "num_input_tokens_seen": 10825664, + "step": 51300 + }, + { + "epoch": 5.644114411441144, + "grad_norm": 0.014843955636024475, + "learning_rate": 4.511197151287468e-05, + "loss": 0.0606, + "num_input_tokens_seen": 10826816, + "step": 51305 + }, + { + "epoch": 5.6446644664466445, + "grad_norm": 0.37069380283355713, + "learning_rate": 4.5110545824890324e-05, + "loss": 0.014, + "num_input_tokens_seen": 10827808, + "step": 51310 + }, + { + "epoch": 5.645214521452145, + "grad_norm": 1.3760783672332764, + "learning_rate": 4.510911995155671e-05, + "loss": 0.0553, + "num_input_tokens_seen": 10828800, + "step": 51315 + }, + { + "epoch": 5.645764576457646, + "grad_norm": 0.0465271919965744, + "learning_rate": 4.5107693892887e-05, + "loss": 0.0065, + "num_input_tokens_seen": 10829920, + "step": 51320 + }, + { + "epoch": 5.646314631463146, + "grad_norm": 1.2385696172714233, + "learning_rate": 4.510626764889432e-05, + "loss": 0.0345, + "num_input_tokens_seen": 10830976, + "step": 51325 + }, + { + "epoch": 5.646864686468647, + "grad_norm": 0.06274931132793427, + "learning_rate": 4.510484121959181e-05, + "loss": 0.0055, + "num_input_tokens_seen": 10832000, + "step": 51330 + }, + { + "epoch": 5.647414741474147, + "grad_norm": 1.8245798349380493, + "learning_rate": 4.5103414604992636e-05, + "loss": 0.0919, + "num_input_tokens_seen": 10833056, + "step": 51335 + }, + { + "epoch": 5.647964796479648, + "grad_norm": 0.03801615163683891, + "learning_rate": 4.510198780510993e-05, + "loss": 0.0047, + "num_input_tokens_seen": 10834112, + "step": 51340 + }, + { + "epoch": 5.648514851485149, + "grad_norm": 0.1443866640329361, + "learning_rate": 4.510056081995685e-05, + "loss": 0.0876, + "num_input_tokens_seen": 10835136, + "step": 51345 + }, + { + "epoch": 5.649064906490649, + "grad_norm": 0.04136749356985092, + "learning_rate": 4.509913364954655e-05, + "loss": 0.0703, + "num_input_tokens_seen": 10836224, + "step": 51350 + }, + { + "epoch": 5.64961496149615, + "grad_norm": 1.755557894706726, + "learning_rate": 4.509770629389218e-05, + "loss": 0.048, + "num_input_tokens_seen": 10837344, + "step": 51355 + }, + { + "epoch": 5.65016501650165, + "grad_norm": 0.018328405916690826, + "learning_rate": 4.509627875300689e-05, + "loss": 0.1268, + "num_input_tokens_seen": 10838368, + "step": 51360 + }, + { + "epoch": 5.65071507150715, + "grad_norm": 0.27594220638275146, + "learning_rate": 4.509485102690385e-05, + "loss": 0.0505, + "num_input_tokens_seen": 10839456, + "step": 51365 + }, + { + "epoch": 5.6512651265126514, + "grad_norm": 0.009383120574057102, + "learning_rate": 4.50934231155962e-05, + "loss": 0.0625, + "num_input_tokens_seen": 10840512, + "step": 51370 + }, + { + "epoch": 5.651815181518152, + "grad_norm": 0.11969117820262909, + "learning_rate": 4.509199501909711e-05, + "loss": 0.0104, + "num_input_tokens_seen": 10841568, + "step": 51375 + }, + { + "epoch": 5.652365236523653, + "grad_norm": 0.11516392976045609, + "learning_rate": 4.509056673741975e-05, + "loss": 0.0364, + "num_input_tokens_seen": 10842592, + "step": 51380 + }, + { + "epoch": 5.652915291529153, + "grad_norm": 1.508816123008728, + "learning_rate": 4.508913827057728e-05, + "loss": 0.1274, + "num_input_tokens_seen": 10843680, + "step": 51385 + }, + { + "epoch": 5.653465346534653, + "grad_norm": 0.2538914680480957, + "learning_rate": 4.508770961858285e-05, + "loss": 0.0914, + "num_input_tokens_seen": 10844768, + "step": 51390 + }, + { + "epoch": 5.654015401540154, + "grad_norm": 0.23874621093273163, + "learning_rate": 4.5086280781449654e-05, + "loss": 0.0732, + "num_input_tokens_seen": 10845856, + "step": 51395 + }, + { + "epoch": 5.6545654565456545, + "grad_norm": 0.1307896375656128, + "learning_rate": 4.508485175919084e-05, + "loss": 0.0799, + "num_input_tokens_seen": 10846912, + "step": 51400 + }, + { + "epoch": 5.6551155115511555, + "grad_norm": 1.1064510345458984, + "learning_rate": 4.508342255181958e-05, + "loss": 0.1264, + "num_input_tokens_seen": 10847936, + "step": 51405 + }, + { + "epoch": 5.655665566556656, + "grad_norm": 0.0506085604429245, + "learning_rate": 4.5081993159349056e-05, + "loss": 0.0967, + "num_input_tokens_seen": 10849024, + "step": 51410 + }, + { + "epoch": 5.656215621562156, + "grad_norm": 0.027960948646068573, + "learning_rate": 4.5080563581792436e-05, + "loss": 0.0195, + "num_input_tokens_seen": 10850112, + "step": 51415 + }, + { + "epoch": 5.656765676567657, + "grad_norm": 0.026558062061667442, + "learning_rate": 4.50791338191629e-05, + "loss": 0.0071, + "num_input_tokens_seen": 10851168, + "step": 51420 + }, + { + "epoch": 5.657315731573157, + "grad_norm": 0.02833145298063755, + "learning_rate": 4.507770387147362e-05, + "loss": 0.0071, + "num_input_tokens_seen": 10852192, + "step": 51425 + }, + { + "epoch": 5.6578657865786575, + "grad_norm": 0.05239370837807655, + "learning_rate": 4.507627373873777e-05, + "loss": 0.0167, + "num_input_tokens_seen": 10853280, + "step": 51430 + }, + { + "epoch": 5.658415841584159, + "grad_norm": 0.1501445770263672, + "learning_rate": 4.5074843420968546e-05, + "loss": 0.0138, + "num_input_tokens_seen": 10854336, + "step": 51435 + }, + { + "epoch": 5.658965896589659, + "grad_norm": 0.048102620989084244, + "learning_rate": 4.507341291817913e-05, + "loss": 0.0483, + "num_input_tokens_seen": 10855328, + "step": 51440 + }, + { + "epoch": 5.65951595159516, + "grad_norm": 0.8363233804702759, + "learning_rate": 4.507198223038269e-05, + "loss": 0.0351, + "num_input_tokens_seen": 10856352, + "step": 51445 + }, + { + "epoch": 5.66006600660066, + "grad_norm": 0.019661348313093185, + "learning_rate": 4.507055135759243e-05, + "loss": 0.0081, + "num_input_tokens_seen": 10857408, + "step": 51450 + }, + { + "epoch": 5.66061606160616, + "grad_norm": 0.7866875529289246, + "learning_rate": 4.506912029982152e-05, + "loss": 0.0579, + "num_input_tokens_seen": 10858528, + "step": 51455 + }, + { + "epoch": 5.661166116611661, + "grad_norm": 1.4443174600601196, + "learning_rate": 4.506768905708316e-05, + "loss": 0.1358, + "num_input_tokens_seen": 10859680, + "step": 51460 + }, + { + "epoch": 5.661716171617162, + "grad_norm": 0.05304623767733574, + "learning_rate": 4.5066257629390544e-05, + "loss": 0.057, + "num_input_tokens_seen": 10860704, + "step": 51465 + }, + { + "epoch": 5.662266226622663, + "grad_norm": 1.157379150390625, + "learning_rate": 4.506482601675685e-05, + "loss": 0.0524, + "num_input_tokens_seen": 10861792, + "step": 51470 + }, + { + "epoch": 5.662816281628163, + "grad_norm": 0.04189165309071541, + "learning_rate": 4.5063394219195284e-05, + "loss": 0.1238, + "num_input_tokens_seen": 10862784, + "step": 51475 + }, + { + "epoch": 5.663366336633663, + "grad_norm": 0.03948443382978439, + "learning_rate": 4.506196223671906e-05, + "loss": 0.0179, + "num_input_tokens_seen": 10863872, + "step": 51480 + }, + { + "epoch": 5.663916391639164, + "grad_norm": 0.14713142812252045, + "learning_rate": 4.506053006934133e-05, + "loss": 0.0424, + "num_input_tokens_seen": 10864960, + "step": 51485 + }, + { + "epoch": 5.664466446644664, + "grad_norm": 1.3034907579421997, + "learning_rate": 4.505909771707534e-05, + "loss": 0.0901, + "num_input_tokens_seen": 10866016, + "step": 51490 + }, + { + "epoch": 5.665016501650165, + "grad_norm": 0.5567803978919983, + "learning_rate": 4.505766517993426e-05, + "loss": 0.0367, + "num_input_tokens_seen": 10867040, + "step": 51495 + }, + { + "epoch": 5.665566556655666, + "grad_norm": 0.057146329432725906, + "learning_rate": 4.505623245793131e-05, + "loss": 0.036, + "num_input_tokens_seen": 10868128, + "step": 51500 + }, + { + "epoch": 5.666116611661166, + "grad_norm": 0.03153587505221367, + "learning_rate": 4.5054799551079695e-05, + "loss": 0.0436, + "num_input_tokens_seen": 10869152, + "step": 51505 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.056477487087249756, + "learning_rate": 4.5053366459392607e-05, + "loss": 0.0163, + "num_input_tokens_seen": 10870240, + "step": 51510 + }, + { + "epoch": 5.667216721672167, + "grad_norm": 0.4539406895637512, + "learning_rate": 4.505193318288327e-05, + "loss": 0.0635, + "num_input_tokens_seen": 10871296, + "step": 51515 + }, + { + "epoch": 5.667766776677668, + "grad_norm": 0.3855366110801697, + "learning_rate": 4.505049972156487e-05, + "loss": 0.1289, + "num_input_tokens_seen": 10872320, + "step": 51520 + }, + { + "epoch": 5.6683168316831685, + "grad_norm": 1.7832942008972168, + "learning_rate": 4.5049066075450655e-05, + "loss": 0.0797, + "num_input_tokens_seen": 10873408, + "step": 51525 + }, + { + "epoch": 5.668866886688669, + "grad_norm": 0.10721123218536377, + "learning_rate": 4.5047632244553814e-05, + "loss": 0.0722, + "num_input_tokens_seen": 10874432, + "step": 51530 + }, + { + "epoch": 5.66941694169417, + "grad_norm": 0.5618169903755188, + "learning_rate": 4.504619822888756e-05, + "loss": 0.2496, + "num_input_tokens_seen": 10875488, + "step": 51535 + }, + { + "epoch": 5.66996699669967, + "grad_norm": 0.05176271125674248, + "learning_rate": 4.504476402846512e-05, + "loss": 0.0219, + "num_input_tokens_seen": 10876512, + "step": 51540 + }, + { + "epoch": 5.67051705170517, + "grad_norm": 0.07062312215566635, + "learning_rate": 4.5043329643299706e-05, + "loss": 0.0748, + "num_input_tokens_seen": 10877536, + "step": 51545 + }, + { + "epoch": 5.671067106710671, + "grad_norm": 0.3907184600830078, + "learning_rate": 4.5041895073404546e-05, + "loss": 0.1403, + "num_input_tokens_seen": 10878528, + "step": 51550 + }, + { + "epoch": 5.6716171617161715, + "grad_norm": 0.9450640678405762, + "learning_rate": 4.504046031879285e-05, + "loss": 0.1082, + "num_input_tokens_seen": 10879584, + "step": 51555 + }, + { + "epoch": 5.672167216721672, + "grad_norm": 0.6559596061706543, + "learning_rate": 4.503902537947785e-05, + "loss": 0.0389, + "num_input_tokens_seen": 10880576, + "step": 51560 + }, + { + "epoch": 5.672717271727173, + "grad_norm": 0.15916189551353455, + "learning_rate": 4.503759025547277e-05, + "loss": 0.0374, + "num_input_tokens_seen": 10881600, + "step": 51565 + }, + { + "epoch": 5.673267326732673, + "grad_norm": 0.14710737764835358, + "learning_rate": 4.503615494679083e-05, + "loss": 0.0274, + "num_input_tokens_seen": 10882688, + "step": 51570 + }, + { + "epoch": 5.673817381738174, + "grad_norm": 0.2179863601922989, + "learning_rate": 4.5034719453445275e-05, + "loss": 0.0726, + "num_input_tokens_seen": 10883712, + "step": 51575 + }, + { + "epoch": 5.674367436743674, + "grad_norm": 1.7394187450408936, + "learning_rate": 4.5033283775449313e-05, + "loss": 0.089, + "num_input_tokens_seen": 10884736, + "step": 51580 + }, + { + "epoch": 5.674917491749175, + "grad_norm": 0.06813795119524002, + "learning_rate": 4.5031847912816196e-05, + "loss": 0.0466, + "num_input_tokens_seen": 10885760, + "step": 51585 + }, + { + "epoch": 5.675467546754676, + "grad_norm": 0.8001670837402344, + "learning_rate": 4.503041186555914e-05, + "loss": 0.1005, + "num_input_tokens_seen": 10886816, + "step": 51590 + }, + { + "epoch": 5.676017601760176, + "grad_norm": 0.03226965665817261, + "learning_rate": 4.502897563369139e-05, + "loss": 0.0098, + "num_input_tokens_seen": 10887840, + "step": 51595 + }, + { + "epoch": 5.676567656765677, + "grad_norm": 0.09640013426542282, + "learning_rate": 4.502753921722619e-05, + "loss": 0.0344, + "num_input_tokens_seen": 10888864, + "step": 51600 + }, + { + "epoch": 5.677117711771177, + "grad_norm": 0.3451438844203949, + "learning_rate": 4.502610261617677e-05, + "loss": 0.0791, + "num_input_tokens_seen": 10889888, + "step": 51605 + }, + { + "epoch": 5.677667766776677, + "grad_norm": 0.38898521661758423, + "learning_rate": 4.502466583055637e-05, + "loss": 0.0273, + "num_input_tokens_seen": 10890944, + "step": 51610 + }, + { + "epoch": 5.678217821782178, + "grad_norm": 0.2260737270116806, + "learning_rate": 4.502322886037823e-05, + "loss": 0.0336, + "num_input_tokens_seen": 10892032, + "step": 51615 + }, + { + "epoch": 5.678767876787679, + "grad_norm": 0.1682928204536438, + "learning_rate": 4.5021791705655605e-05, + "loss": 0.0269, + "num_input_tokens_seen": 10893056, + "step": 51620 + }, + { + "epoch": 5.67931793179318, + "grad_norm": 0.5270278453826904, + "learning_rate": 4.502035436640173e-05, + "loss": 0.0211, + "num_input_tokens_seen": 10894176, + "step": 51625 + }, + { + "epoch": 5.67986798679868, + "grad_norm": 0.08994784951210022, + "learning_rate": 4.5018916842629854e-05, + "loss": 0.0169, + "num_input_tokens_seen": 10895232, + "step": 51630 + }, + { + "epoch": 5.68041804180418, + "grad_norm": 0.27344805002212524, + "learning_rate": 4.5017479134353225e-05, + "loss": 0.0163, + "num_input_tokens_seen": 10896320, + "step": 51635 + }, + { + "epoch": 5.680968096809681, + "grad_norm": 0.21602995693683624, + "learning_rate": 4.50160412415851e-05, + "loss": 0.0097, + "num_input_tokens_seen": 10897312, + "step": 51640 + }, + { + "epoch": 5.681518151815181, + "grad_norm": 0.20859794318675995, + "learning_rate": 4.501460316433873e-05, + "loss": 0.0642, + "num_input_tokens_seen": 10898336, + "step": 51645 + }, + { + "epoch": 5.6820682068206825, + "grad_norm": 0.06605445593595505, + "learning_rate": 4.501316490262736e-05, + "loss": 0.0913, + "num_input_tokens_seen": 10899360, + "step": 51650 + }, + { + "epoch": 5.682618261826183, + "grad_norm": 0.01005430519580841, + "learning_rate": 4.501172645646426e-05, + "loss": 0.0239, + "num_input_tokens_seen": 10900480, + "step": 51655 + }, + { + "epoch": 5.683168316831683, + "grad_norm": 0.04410875588655472, + "learning_rate": 4.5010287825862676e-05, + "loss": 0.0998, + "num_input_tokens_seen": 10901536, + "step": 51660 + }, + { + "epoch": 5.683718371837184, + "grad_norm": 0.012378433719277382, + "learning_rate": 4.500884901083587e-05, + "loss": 0.0169, + "num_input_tokens_seen": 10902624, + "step": 51665 + }, + { + "epoch": 5.684268426842684, + "grad_norm": 0.07948450744152069, + "learning_rate": 4.50074100113971e-05, + "loss": 0.0113, + "num_input_tokens_seen": 10903648, + "step": 51670 + }, + { + "epoch": 5.684818481848184, + "grad_norm": 0.1287427842617035, + "learning_rate": 4.500597082755964e-05, + "loss": 0.0513, + "num_input_tokens_seen": 10904672, + "step": 51675 + }, + { + "epoch": 5.6853685368536855, + "grad_norm": 0.19288544356822968, + "learning_rate": 4.500453145933674e-05, + "loss": 0.0175, + "num_input_tokens_seen": 10905792, + "step": 51680 + }, + { + "epoch": 5.685918591859186, + "grad_norm": 0.039918601512908936, + "learning_rate": 4.500309190674168e-05, + "loss": 0.0459, + "num_input_tokens_seen": 10906816, + "step": 51685 + }, + { + "epoch": 5.686468646864687, + "grad_norm": 0.19618792831897736, + "learning_rate": 4.5001652169787715e-05, + "loss": 0.0331, + "num_input_tokens_seen": 10907776, + "step": 51690 + }, + { + "epoch": 5.687018701870187, + "grad_norm": 0.016985943540930748, + "learning_rate": 4.500021224848812e-05, + "loss": 0.1041, + "num_input_tokens_seen": 10908768, + "step": 51695 + }, + { + "epoch": 5.687568756875687, + "grad_norm": 0.05917160212993622, + "learning_rate": 4.499877214285617e-05, + "loss": 0.0733, + "num_input_tokens_seen": 10909760, + "step": 51700 + }, + { + "epoch": 5.688118811881188, + "grad_norm": 1.9949917793273926, + "learning_rate": 4.499733185290513e-05, + "loss": 0.0427, + "num_input_tokens_seen": 10910752, + "step": 51705 + }, + { + "epoch": 5.6886688668866885, + "grad_norm": 0.020729729905724525, + "learning_rate": 4.499589137864828e-05, + "loss": 0.0182, + "num_input_tokens_seen": 10911776, + "step": 51710 + }, + { + "epoch": 5.68921892189219, + "grad_norm": 0.1630103439092636, + "learning_rate": 4.499445072009889e-05, + "loss": 0.0175, + "num_input_tokens_seen": 10912768, + "step": 51715 + }, + { + "epoch": 5.68976897689769, + "grad_norm": 0.004223413299769163, + "learning_rate": 4.499300987727024e-05, + "loss": 0.0306, + "num_input_tokens_seen": 10913824, + "step": 51720 + }, + { + "epoch": 5.69031903190319, + "grad_norm": 0.3441274166107178, + "learning_rate": 4.4991568850175626e-05, + "loss": 0.0389, + "num_input_tokens_seen": 10914880, + "step": 51725 + }, + { + "epoch": 5.690869086908691, + "grad_norm": 0.31588131189346313, + "learning_rate": 4.49901276388283e-05, + "loss": 0.0235, + "num_input_tokens_seen": 10916000, + "step": 51730 + }, + { + "epoch": 5.691419141914191, + "grad_norm": 0.6910699605941772, + "learning_rate": 4.4988686243241564e-05, + "loss": 0.0271, + "num_input_tokens_seen": 10917056, + "step": 51735 + }, + { + "epoch": 5.6919691969196915, + "grad_norm": 0.01459756214171648, + "learning_rate": 4.49872446634287e-05, + "loss": 0.008, + "num_input_tokens_seen": 10918112, + "step": 51740 + }, + { + "epoch": 5.692519251925193, + "grad_norm": 0.5999714732170105, + "learning_rate": 4.498580289940299e-05, + "loss": 0.0452, + "num_input_tokens_seen": 10919136, + "step": 51745 + }, + { + "epoch": 5.693069306930693, + "grad_norm": 0.08101173490285873, + "learning_rate": 4.498436095117773e-05, + "loss": 0.0083, + "num_input_tokens_seen": 10920192, + "step": 51750 + }, + { + "epoch": 5.693619361936194, + "grad_norm": 0.00948517955839634, + "learning_rate": 4.49829188187662e-05, + "loss": 0.0316, + "num_input_tokens_seen": 10921312, + "step": 51755 + }, + { + "epoch": 5.694169416941694, + "grad_norm": 0.6707798838615417, + "learning_rate": 4.4981476502181695e-05, + "loss": 0.0983, + "num_input_tokens_seen": 10922400, + "step": 51760 + }, + { + "epoch": 5.694719471947195, + "grad_norm": 0.2139500081539154, + "learning_rate": 4.498003400143751e-05, + "loss": 0.0602, + "num_input_tokens_seen": 10923520, + "step": 51765 + }, + { + "epoch": 5.695269526952695, + "grad_norm": 0.004947945475578308, + "learning_rate": 4.4978591316546945e-05, + "loss": 0.0183, + "num_input_tokens_seen": 10924608, + "step": 51770 + }, + { + "epoch": 5.695819581958196, + "grad_norm": 0.8638930916786194, + "learning_rate": 4.497714844752329e-05, + "loss": 0.0341, + "num_input_tokens_seen": 10925664, + "step": 51775 + }, + { + "epoch": 5.696369636963697, + "grad_norm": 0.3927994668483734, + "learning_rate": 4.497570539437983e-05, + "loss": 0.0245, + "num_input_tokens_seen": 10926720, + "step": 51780 + }, + { + "epoch": 5.696919691969197, + "grad_norm": 0.49907130002975464, + "learning_rate": 4.497426215712989e-05, + "loss": 0.1512, + "num_input_tokens_seen": 10927808, + "step": 51785 + }, + { + "epoch": 5.697469746974697, + "grad_norm": 0.2637271285057068, + "learning_rate": 4.497281873578676e-05, + "loss": 0.1003, + "num_input_tokens_seen": 10928800, + "step": 51790 + }, + { + "epoch": 5.698019801980198, + "grad_norm": 0.040239740163087845, + "learning_rate": 4.4971375130363744e-05, + "loss": 0.0365, + "num_input_tokens_seen": 10929856, + "step": 51795 + }, + { + "epoch": 5.698569856985698, + "grad_norm": 0.3392510414123535, + "learning_rate": 4.496993134087414e-05, + "loss": 0.0376, + "num_input_tokens_seen": 10930912, + "step": 51800 + }, + { + "epoch": 5.6991199119911995, + "grad_norm": 0.036274753510951996, + "learning_rate": 4.4968487367331266e-05, + "loss": 0.0226, + "num_input_tokens_seen": 10931936, + "step": 51805 + }, + { + "epoch": 5.6996699669967, + "grad_norm": 0.24031662940979004, + "learning_rate": 4.496704320974843e-05, + "loss": 0.1438, + "num_input_tokens_seen": 10932992, + "step": 51810 + }, + { + "epoch": 5.7002200220022, + "grad_norm": 0.02030451036989689, + "learning_rate": 4.496559886813893e-05, + "loss": 0.0465, + "num_input_tokens_seen": 10934048, + "step": 51815 + }, + { + "epoch": 5.700770077007701, + "grad_norm": 0.07228841632604599, + "learning_rate": 4.496415434251609e-05, + "loss": 0.0139, + "num_input_tokens_seen": 10935072, + "step": 51820 + }, + { + "epoch": 5.701320132013201, + "grad_norm": 0.8890305757522583, + "learning_rate": 4.496270963289322e-05, + "loss": 0.088, + "num_input_tokens_seen": 10936160, + "step": 51825 + }, + { + "epoch": 5.701870187018702, + "grad_norm": 0.6208954453468323, + "learning_rate": 4.496126473928363e-05, + "loss": 0.0325, + "num_input_tokens_seen": 10937216, + "step": 51830 + }, + { + "epoch": 5.7024202420242025, + "grad_norm": 0.07883383333683014, + "learning_rate": 4.495981966170065e-05, + "loss": 0.0069, + "num_input_tokens_seen": 10938304, + "step": 51835 + }, + { + "epoch": 5.702970297029703, + "grad_norm": 0.05955301970243454, + "learning_rate": 4.495837440015758e-05, + "loss": 0.0037, + "num_input_tokens_seen": 10939296, + "step": 51840 + }, + { + "epoch": 5.703520352035204, + "grad_norm": 1.867027997970581, + "learning_rate": 4.4956928954667755e-05, + "loss": 0.1326, + "num_input_tokens_seen": 10940384, + "step": 51845 + }, + { + "epoch": 5.704070407040704, + "grad_norm": 0.5393372774124146, + "learning_rate": 4.4955483325244484e-05, + "loss": 0.0199, + "num_input_tokens_seen": 10941472, + "step": 51850 + }, + { + "epoch": 5.704620462046204, + "grad_norm": 0.22003011405467987, + "learning_rate": 4.495403751190111e-05, + "loss": 0.1066, + "num_input_tokens_seen": 10942528, + "step": 51855 + }, + { + "epoch": 5.705170517051705, + "grad_norm": 0.042674604803323746, + "learning_rate": 4.4952591514650947e-05, + "loss": 0.0088, + "num_input_tokens_seen": 10943520, + "step": 51860 + }, + { + "epoch": 5.7057205720572055, + "grad_norm": 0.20347627997398376, + "learning_rate": 4.495114533350732e-05, + "loss": 0.0239, + "num_input_tokens_seen": 10944608, + "step": 51865 + }, + { + "epoch": 5.706270627062707, + "grad_norm": 0.0880635529756546, + "learning_rate": 4.4949698968483556e-05, + "loss": 0.0347, + "num_input_tokens_seen": 10945696, + "step": 51870 + }, + { + "epoch": 5.706820682068207, + "grad_norm": 0.03540458157658577, + "learning_rate": 4.494825241959298e-05, + "loss": 0.0948, + "num_input_tokens_seen": 10946816, + "step": 51875 + }, + { + "epoch": 5.707370737073707, + "grad_norm": 0.06754214316606522, + "learning_rate": 4.4946805686848956e-05, + "loss": 0.0249, + "num_input_tokens_seen": 10947872, + "step": 51880 + }, + { + "epoch": 5.707920792079208, + "grad_norm": 0.09613368660211563, + "learning_rate": 4.4945358770264776e-05, + "loss": 0.0266, + "num_input_tokens_seen": 10948992, + "step": 51885 + }, + { + "epoch": 5.708470847084708, + "grad_norm": 0.2893626093864441, + "learning_rate": 4.494391166985381e-05, + "loss": 0.022, + "num_input_tokens_seen": 10950048, + "step": 51890 + }, + { + "epoch": 5.709020902090209, + "grad_norm": 0.4548943340778351, + "learning_rate": 4.494246438562937e-05, + "loss": 0.0291, + "num_input_tokens_seen": 10951168, + "step": 51895 + }, + { + "epoch": 5.70957095709571, + "grad_norm": 0.12295524030923843, + "learning_rate": 4.494101691760481e-05, + "loss": 0.0317, + "num_input_tokens_seen": 10952224, + "step": 51900 + }, + { + "epoch": 5.71012101210121, + "grad_norm": 0.05023553594946861, + "learning_rate": 4.493956926579347e-05, + "loss": 0.0211, + "num_input_tokens_seen": 10953312, + "step": 51905 + }, + { + "epoch": 5.710671067106711, + "grad_norm": 0.4405849277973175, + "learning_rate": 4.493812143020868e-05, + "loss": 0.0212, + "num_input_tokens_seen": 10954368, + "step": 51910 + }, + { + "epoch": 5.711221122112211, + "grad_norm": 0.20332536101341248, + "learning_rate": 4.493667341086379e-05, + "loss": 0.007, + "num_input_tokens_seen": 10955424, + "step": 51915 + }, + { + "epoch": 5.711771177117711, + "grad_norm": 0.009197594597935677, + "learning_rate": 4.493522520777215e-05, + "loss": 0.0441, + "num_input_tokens_seen": 10956480, + "step": 51920 + }, + { + "epoch": 5.712321232123212, + "grad_norm": 1.2581703662872314, + "learning_rate": 4.493377682094711e-05, + "loss": 0.0418, + "num_input_tokens_seen": 10957536, + "step": 51925 + }, + { + "epoch": 5.712871287128713, + "grad_norm": 0.005973816849291325, + "learning_rate": 4.4932328250402014e-05, + "loss": 0.0217, + "num_input_tokens_seen": 10958592, + "step": 51930 + }, + { + "epoch": 5.713421342134214, + "grad_norm": 0.23277141153812408, + "learning_rate": 4.493087949615021e-05, + "loss": 0.0431, + "num_input_tokens_seen": 10959616, + "step": 51935 + }, + { + "epoch": 5.713971397139714, + "grad_norm": 0.02250710502266884, + "learning_rate": 4.492943055820505e-05, + "loss": 0.0027, + "num_input_tokens_seen": 10960672, + "step": 51940 + }, + { + "epoch": 5.714521452145215, + "grad_norm": 0.007425392512232065, + "learning_rate": 4.492798143657989e-05, + "loss": 0.0106, + "num_input_tokens_seen": 10961696, + "step": 51945 + }, + { + "epoch": 5.715071507150715, + "grad_norm": 0.9520920515060425, + "learning_rate": 4.49265321312881e-05, + "loss": 0.0571, + "num_input_tokens_seen": 10962784, + "step": 51950 + }, + { + "epoch": 5.715621562156215, + "grad_norm": 0.20338231325149536, + "learning_rate": 4.4925082642343016e-05, + "loss": 0.0898, + "num_input_tokens_seen": 10963872, + "step": 51955 + }, + { + "epoch": 5.7161716171617165, + "grad_norm": 0.3145036995410919, + "learning_rate": 4.492363296975801e-05, + "loss": 0.0354, + "num_input_tokens_seen": 10964928, + "step": 51960 + }, + { + "epoch": 5.716721672167217, + "grad_norm": 0.08197235316038132, + "learning_rate": 4.4922183113546435e-05, + "loss": 0.0136, + "num_input_tokens_seen": 10965952, + "step": 51965 + }, + { + "epoch": 5.717271727172717, + "grad_norm": 0.013925080187618732, + "learning_rate": 4.4920733073721664e-05, + "loss": 0.0567, + "num_input_tokens_seen": 10966976, + "step": 51970 + }, + { + "epoch": 5.717821782178218, + "grad_norm": 0.01630871370434761, + "learning_rate": 4.491928285029705e-05, + "loss": 0.0066, + "num_input_tokens_seen": 10968032, + "step": 51975 + }, + { + "epoch": 5.718371837183718, + "grad_norm": 0.4499652683734894, + "learning_rate": 4.4917832443285965e-05, + "loss": 0.0545, + "num_input_tokens_seen": 10969088, + "step": 51980 + }, + { + "epoch": 5.718921892189218, + "grad_norm": 0.059042926877737045, + "learning_rate": 4.491638185270177e-05, + "loss": 0.1169, + "num_input_tokens_seen": 10970144, + "step": 51985 + }, + { + "epoch": 5.7194719471947195, + "grad_norm": 0.045603182166814804, + "learning_rate": 4.491493107855785e-05, + "loss": 0.0721, + "num_input_tokens_seen": 10971296, + "step": 51990 + }, + { + "epoch": 5.72002200220022, + "grad_norm": 0.2508135139942169, + "learning_rate": 4.491348012086757e-05, + "loss": 0.0413, + "num_input_tokens_seen": 10972416, + "step": 51995 + }, + { + "epoch": 5.720572057205721, + "grad_norm": 0.05383619666099548, + "learning_rate": 4.4912028979644286e-05, + "loss": 0.0046, + "num_input_tokens_seen": 10973472, + "step": 52000 + }, + { + "epoch": 5.721122112211221, + "grad_norm": 0.0028769532218575478, + "learning_rate": 4.491057765490139e-05, + "loss": 0.0126, + "num_input_tokens_seen": 10974496, + "step": 52005 + }, + { + "epoch": 5.721672167216722, + "grad_norm": 0.033216748386621475, + "learning_rate": 4.490912614665226e-05, + "loss": 0.0135, + "num_input_tokens_seen": 10975616, + "step": 52010 + }, + { + "epoch": 5.722222222222222, + "grad_norm": 0.019372599199414253, + "learning_rate": 4.4907674454910265e-05, + "loss": 0.0626, + "num_input_tokens_seen": 10976672, + "step": 52015 + }, + { + "epoch": 5.7227722772277225, + "grad_norm": 1.7189520597457886, + "learning_rate": 4.4906222579688784e-05, + "loss": 0.1634, + "num_input_tokens_seen": 10977696, + "step": 52020 + }, + { + "epoch": 5.723322332233224, + "grad_norm": 0.13826973736286163, + "learning_rate": 4.49047705210012e-05, + "loss": 0.1024, + "num_input_tokens_seen": 10978752, + "step": 52025 + }, + { + "epoch": 5.723872387238724, + "grad_norm": 0.019368549808859825, + "learning_rate": 4.49033182788609e-05, + "loss": 0.1566, + "num_input_tokens_seen": 10979808, + "step": 52030 + }, + { + "epoch": 5.724422442244224, + "grad_norm": 0.4935101568698883, + "learning_rate": 4.490186585328127e-05, + "loss": 0.0484, + "num_input_tokens_seen": 10980800, + "step": 52035 + }, + { + "epoch": 5.724972497249725, + "grad_norm": 0.7196836471557617, + "learning_rate": 4.4900413244275686e-05, + "loss": 0.0225, + "num_input_tokens_seen": 10981824, + "step": 52040 + }, + { + "epoch": 5.725522552255225, + "grad_norm": 0.5077285766601562, + "learning_rate": 4.489896045185754e-05, + "loss": 0.0587, + "num_input_tokens_seen": 10982880, + "step": 52045 + }, + { + "epoch": 5.726072607260726, + "grad_norm": 0.4006985127925873, + "learning_rate": 4.4897507476040235e-05, + "loss": 0.086, + "num_input_tokens_seen": 10983840, + "step": 52050 + }, + { + "epoch": 5.726622662266227, + "grad_norm": 0.01696806214749813, + "learning_rate": 4.4896054316837146e-05, + "loss": 0.0043, + "num_input_tokens_seen": 10984864, + "step": 52055 + }, + { + "epoch": 5.727172717271727, + "grad_norm": 0.1449490189552307, + "learning_rate": 4.489460097426167e-05, + "loss": 0.0581, + "num_input_tokens_seen": 10985920, + "step": 52060 + }, + { + "epoch": 5.727722772277228, + "grad_norm": 0.143954336643219, + "learning_rate": 4.48931474483272e-05, + "loss": 0.0175, + "num_input_tokens_seen": 10987008, + "step": 52065 + }, + { + "epoch": 5.728272827282728, + "grad_norm": 0.8711826205253601, + "learning_rate": 4.489169373904713e-05, + "loss": 0.0255, + "num_input_tokens_seen": 10988064, + "step": 52070 + }, + { + "epoch": 5.728822882288229, + "grad_norm": 0.0546519011259079, + "learning_rate": 4.489023984643488e-05, + "loss": 0.035, + "num_input_tokens_seen": 10989056, + "step": 52075 + }, + { + "epoch": 5.729372937293729, + "grad_norm": 0.037436593323946, + "learning_rate": 4.488878577050383e-05, + "loss": 0.0296, + "num_input_tokens_seen": 10990144, + "step": 52080 + }, + { + "epoch": 5.72992299229923, + "grad_norm": 0.03525230288505554, + "learning_rate": 4.4887331511267384e-05, + "loss": 0.111, + "num_input_tokens_seen": 10991296, + "step": 52085 + }, + { + "epoch": 5.730473047304731, + "grad_norm": 0.16611482203006744, + "learning_rate": 4.488587706873894e-05, + "loss": 0.0505, + "num_input_tokens_seen": 10992320, + "step": 52090 + }, + { + "epoch": 5.731023102310231, + "grad_norm": 0.3097797632217407, + "learning_rate": 4.488442244293192e-05, + "loss": 0.0125, + "num_input_tokens_seen": 10993440, + "step": 52095 + }, + { + "epoch": 5.731573157315731, + "grad_norm": 0.041885849088430405, + "learning_rate": 4.4882967633859705e-05, + "loss": 0.0988, + "num_input_tokens_seen": 10994464, + "step": 52100 + }, + { + "epoch": 5.732123212321232, + "grad_norm": 0.05078038573265076, + "learning_rate": 4.488151264153573e-05, + "loss": 0.0042, + "num_input_tokens_seen": 10995488, + "step": 52105 + }, + { + "epoch": 5.732673267326732, + "grad_norm": 0.041050758212804794, + "learning_rate": 4.488005746597338e-05, + "loss": 0.0114, + "num_input_tokens_seen": 10996544, + "step": 52110 + }, + { + "epoch": 5.7332233223322335, + "grad_norm": 0.41346636414527893, + "learning_rate": 4.48786021071861e-05, + "loss": 0.0409, + "num_input_tokens_seen": 10997600, + "step": 52115 + }, + { + "epoch": 5.733773377337734, + "grad_norm": 0.9974945783615112, + "learning_rate": 4.4877146565187264e-05, + "loss": 0.0722, + "num_input_tokens_seen": 10998688, + "step": 52120 + }, + { + "epoch": 5.734323432343234, + "grad_norm": 0.039887212216854095, + "learning_rate": 4.4875690839990316e-05, + "loss": 0.0384, + "num_input_tokens_seen": 10999712, + "step": 52125 + }, + { + "epoch": 5.734873487348735, + "grad_norm": 0.5896164774894714, + "learning_rate": 4.487423493160866e-05, + "loss": 0.0563, + "num_input_tokens_seen": 11000704, + "step": 52130 + }, + { + "epoch": 5.735423542354235, + "grad_norm": 0.6557351350784302, + "learning_rate": 4.4872778840055715e-05, + "loss": 0.0538, + "num_input_tokens_seen": 11001792, + "step": 52135 + }, + { + "epoch": 5.735973597359736, + "grad_norm": 0.7573434114456177, + "learning_rate": 4.487132256534491e-05, + "loss": 0.0725, + "num_input_tokens_seen": 11002816, + "step": 52140 + }, + { + "epoch": 5.7365236523652365, + "grad_norm": 1.238623023033142, + "learning_rate": 4.4869866107489656e-05, + "loss": 0.0956, + "num_input_tokens_seen": 11003936, + "step": 52145 + }, + { + "epoch": 5.737073707370737, + "grad_norm": 0.11705595254898071, + "learning_rate": 4.486840946650338e-05, + "loss": 0.0074, + "num_input_tokens_seen": 11004992, + "step": 52150 + }, + { + "epoch": 5.737623762376238, + "grad_norm": 0.03748733550310135, + "learning_rate": 4.486695264239951e-05, + "loss": 0.0366, + "num_input_tokens_seen": 11006048, + "step": 52155 + }, + { + "epoch": 5.738173817381738, + "grad_norm": 0.4211510121822357, + "learning_rate": 4.486549563519147e-05, + "loss": 0.0714, + "num_input_tokens_seen": 11007072, + "step": 52160 + }, + { + "epoch": 5.738723872387238, + "grad_norm": 0.016107266768813133, + "learning_rate": 4.4864038444892685e-05, + "loss": 0.019, + "num_input_tokens_seen": 11008096, + "step": 52165 + }, + { + "epoch": 5.739273927392739, + "grad_norm": 0.031666941940784454, + "learning_rate": 4.48625810715166e-05, + "loss": 0.0274, + "num_input_tokens_seen": 11009120, + "step": 52170 + }, + { + "epoch": 5.7398239823982395, + "grad_norm": 0.08822813630104065, + "learning_rate": 4.486112351507663e-05, + "loss": 0.0134, + "num_input_tokens_seen": 11010144, + "step": 52175 + }, + { + "epoch": 5.740374037403741, + "grad_norm": 0.22238409519195557, + "learning_rate": 4.485966577558621e-05, + "loss": 0.0239, + "num_input_tokens_seen": 11011232, + "step": 52180 + }, + { + "epoch": 5.740924092409241, + "grad_norm": 1.3376013040542603, + "learning_rate": 4.485820785305879e-05, + "loss": 0.1724, + "num_input_tokens_seen": 11012352, + "step": 52185 + }, + { + "epoch": 5.741474147414742, + "grad_norm": 1.2022486925125122, + "learning_rate": 4.485674974750779e-05, + "loss": 0.0985, + "num_input_tokens_seen": 11013408, + "step": 52190 + }, + { + "epoch": 5.742024202420242, + "grad_norm": 1.75540030002594, + "learning_rate": 4.4855291458946655e-05, + "loss": 0.108, + "num_input_tokens_seen": 11014528, + "step": 52195 + }, + { + "epoch": 5.742574257425742, + "grad_norm": 0.9948663711547852, + "learning_rate": 4.485383298738882e-05, + "loss": 0.048, + "num_input_tokens_seen": 11015648, + "step": 52200 + }, + { + "epoch": 5.743124312431243, + "grad_norm": 0.056879062205553055, + "learning_rate": 4.485237433284775e-05, + "loss": 0.0325, + "num_input_tokens_seen": 11016640, + "step": 52205 + }, + { + "epoch": 5.743674367436744, + "grad_norm": 0.24470631778240204, + "learning_rate": 4.485091549533686e-05, + "loss": 0.0949, + "num_input_tokens_seen": 11017696, + "step": 52210 + }, + { + "epoch": 5.744224422442244, + "grad_norm": 0.5774012207984924, + "learning_rate": 4.484945647486961e-05, + "loss": 0.0124, + "num_input_tokens_seen": 11018720, + "step": 52215 + }, + { + "epoch": 5.744774477447745, + "grad_norm": 0.011107814498245716, + "learning_rate": 4.484799727145944e-05, + "loss": 0.0229, + "num_input_tokens_seen": 11019712, + "step": 52220 + }, + { + "epoch": 5.745324532453245, + "grad_norm": 0.010543696582317352, + "learning_rate": 4.484653788511981e-05, + "loss": 0.0075, + "num_input_tokens_seen": 11020768, + "step": 52225 + }, + { + "epoch": 5.745874587458746, + "grad_norm": 0.012645656242966652, + "learning_rate": 4.4845078315864165e-05, + "loss": 0.0195, + "num_input_tokens_seen": 11021792, + "step": 52230 + }, + { + "epoch": 5.7464246424642464, + "grad_norm": 0.27022385597229004, + "learning_rate": 4.484361856370595e-05, + "loss": 0.0828, + "num_input_tokens_seen": 11022880, + "step": 52235 + }, + { + "epoch": 5.746974697469747, + "grad_norm": 0.03885447606444359, + "learning_rate": 4.4842158628658625e-05, + "loss": 0.0922, + "num_input_tokens_seen": 11023968, + "step": 52240 + }, + { + "epoch": 5.747524752475248, + "grad_norm": 0.4749344289302826, + "learning_rate": 4.4840698510735646e-05, + "loss": 0.1193, + "num_input_tokens_seen": 11025024, + "step": 52245 + }, + { + "epoch": 5.748074807480748, + "grad_norm": 0.12424793839454651, + "learning_rate": 4.483923820995047e-05, + "loss": 0.0073, + "num_input_tokens_seen": 11026112, + "step": 52250 + }, + { + "epoch": 5.748624862486249, + "grad_norm": 0.009250903502106667, + "learning_rate": 4.483777772631656e-05, + "loss": 0.0093, + "num_input_tokens_seen": 11027168, + "step": 52255 + }, + { + "epoch": 5.749174917491749, + "grad_norm": 0.984552264213562, + "learning_rate": 4.4836317059847366e-05, + "loss": 0.0806, + "num_input_tokens_seen": 11028224, + "step": 52260 + }, + { + "epoch": 5.7497249724972495, + "grad_norm": 0.7790487408638, + "learning_rate": 4.483485621055636e-05, + "loss": 0.0113, + "num_input_tokens_seen": 11029248, + "step": 52265 + }, + { + "epoch": 5.7502750275027505, + "grad_norm": 0.04515717923641205, + "learning_rate": 4.483339517845699e-05, + "loss": 0.0248, + "num_input_tokens_seen": 11030272, + "step": 52270 + }, + { + "epoch": 5.750825082508251, + "grad_norm": 0.0611073300242424, + "learning_rate": 4.483193396356275e-05, + "loss": 0.1043, + "num_input_tokens_seen": 11031360, + "step": 52275 + }, + { + "epoch": 5.751375137513751, + "grad_norm": 0.12966592609882355, + "learning_rate": 4.483047256588708e-05, + "loss": 0.1034, + "num_input_tokens_seen": 11032352, + "step": 52280 + }, + { + "epoch": 5.751925192519252, + "grad_norm": 1.4439815282821655, + "learning_rate": 4.482901098544347e-05, + "loss": 0.1161, + "num_input_tokens_seen": 11033440, + "step": 52285 + }, + { + "epoch": 5.752475247524752, + "grad_norm": 0.35506343841552734, + "learning_rate": 4.482754922224538e-05, + "loss": 0.011, + "num_input_tokens_seen": 11034496, + "step": 52290 + }, + { + "epoch": 5.753025302530253, + "grad_norm": 1.2294057607650757, + "learning_rate": 4.482608727630627e-05, + "loss": 0.0573, + "num_input_tokens_seen": 11035488, + "step": 52295 + }, + { + "epoch": 5.7535753575357536, + "grad_norm": 0.11630746722221375, + "learning_rate": 4.482462514763963e-05, + "loss": 0.0434, + "num_input_tokens_seen": 11036608, + "step": 52300 + }, + { + "epoch": 5.754125412541254, + "grad_norm": 0.20057310163974762, + "learning_rate": 4.482316283625894e-05, + "loss": 0.0295, + "num_input_tokens_seen": 11037664, + "step": 52305 + }, + { + "epoch": 5.754675467546755, + "grad_norm": 0.6775537133216858, + "learning_rate": 4.4821700342177666e-05, + "loss": 0.0722, + "num_input_tokens_seen": 11038688, + "step": 52310 + }, + { + "epoch": 5.755225522552255, + "grad_norm": 0.016082003712654114, + "learning_rate": 4.48202376654093e-05, + "loss": 0.0057, + "num_input_tokens_seen": 11039712, + "step": 52315 + }, + { + "epoch": 5.755775577557756, + "grad_norm": 0.2839467227458954, + "learning_rate": 4.48187748059673e-05, + "loss": 0.0667, + "num_input_tokens_seen": 11040800, + "step": 52320 + }, + { + "epoch": 5.756325632563256, + "grad_norm": 0.13193194568157196, + "learning_rate": 4.481731176386517e-05, + "loss": 0.0318, + "num_input_tokens_seen": 11041824, + "step": 52325 + }, + { + "epoch": 5.756875687568757, + "grad_norm": 0.04015162214636803, + "learning_rate": 4.481584853911639e-05, + "loss": 0.0624, + "num_input_tokens_seen": 11042912, + "step": 52330 + }, + { + "epoch": 5.757425742574258, + "grad_norm": 0.20360946655273438, + "learning_rate": 4.4814385131734435e-05, + "loss": 0.0626, + "num_input_tokens_seen": 11043968, + "step": 52335 + }, + { + "epoch": 5.757975797579758, + "grad_norm": 0.49751946330070496, + "learning_rate": 4.481292154173281e-05, + "loss": 0.1203, + "num_input_tokens_seen": 11045056, + "step": 52340 + }, + { + "epoch": 5.758525852585258, + "grad_norm": 1.0092312097549438, + "learning_rate": 4.481145776912499e-05, + "loss": 0.0316, + "num_input_tokens_seen": 11046048, + "step": 52345 + }, + { + "epoch": 5.759075907590759, + "grad_norm": 0.1217750608921051, + "learning_rate": 4.480999381392447e-05, + "loss": 0.1223, + "num_input_tokens_seen": 11047072, + "step": 52350 + }, + { + "epoch": 5.759625962596259, + "grad_norm": 0.6183213591575623, + "learning_rate": 4.480852967614474e-05, + "loss": 0.0814, + "num_input_tokens_seen": 11048128, + "step": 52355 + }, + { + "epoch": 5.7601760176017605, + "grad_norm": 1.9297361373901367, + "learning_rate": 4.480706535579929e-05, + "loss": 0.1304, + "num_input_tokens_seen": 11049184, + "step": 52360 + }, + { + "epoch": 5.760726072607261, + "grad_norm": 0.0882202759385109, + "learning_rate": 4.4805600852901635e-05, + "loss": 0.0489, + "num_input_tokens_seen": 11050304, + "step": 52365 + }, + { + "epoch": 5.761276127612762, + "grad_norm": 0.051795411854982376, + "learning_rate": 4.480413616746526e-05, + "loss": 0.0209, + "num_input_tokens_seen": 11051392, + "step": 52370 + }, + { + "epoch": 5.761826182618262, + "grad_norm": 0.06375702470541, + "learning_rate": 4.480267129950366e-05, + "loss": 0.1715, + "num_input_tokens_seen": 11052384, + "step": 52375 + }, + { + "epoch": 5.762376237623762, + "grad_norm": 1.2302640676498413, + "learning_rate": 4.480120624903034e-05, + "loss": 0.0504, + "num_input_tokens_seen": 11053440, + "step": 52380 + }, + { + "epoch": 5.762926292629263, + "grad_norm": 0.06838198751211166, + "learning_rate": 4.4799741016058806e-05, + "loss": 0.0324, + "num_input_tokens_seen": 11054528, + "step": 52385 + }, + { + "epoch": 5.7634763476347635, + "grad_norm": 0.19379575550556183, + "learning_rate": 4.4798275600602554e-05, + "loss": 0.0188, + "num_input_tokens_seen": 11055520, + "step": 52390 + }, + { + "epoch": 5.764026402640264, + "grad_norm": 0.044201821088790894, + "learning_rate": 4.479681000267511e-05, + "loss": 0.071, + "num_input_tokens_seen": 11056576, + "step": 52395 + }, + { + "epoch": 5.764576457645765, + "grad_norm": 0.01413446944206953, + "learning_rate": 4.479534422228996e-05, + "loss": 0.004, + "num_input_tokens_seen": 11057728, + "step": 52400 + }, + { + "epoch": 5.765126512651265, + "grad_norm": 0.2864181101322174, + "learning_rate": 4.479387825946062e-05, + "loss": 0.0443, + "num_input_tokens_seen": 11058848, + "step": 52405 + }, + { + "epoch": 5.765676567656766, + "grad_norm": 0.09766894578933716, + "learning_rate": 4.47924121142006e-05, + "loss": 0.0213, + "num_input_tokens_seen": 11059936, + "step": 52410 + }, + { + "epoch": 5.766226622662266, + "grad_norm": 0.7436893582344055, + "learning_rate": 4.479094578652342e-05, + "loss": 0.0836, + "num_input_tokens_seen": 11060928, + "step": 52415 + }, + { + "epoch": 5.7667766776677665, + "grad_norm": 1.2848803997039795, + "learning_rate": 4.478947927644258e-05, + "loss": 0.1148, + "num_input_tokens_seen": 11062016, + "step": 52420 + }, + { + "epoch": 5.767326732673268, + "grad_norm": 0.60624760389328, + "learning_rate": 4.478801258397162e-05, + "loss": 0.0649, + "num_input_tokens_seen": 11063072, + "step": 52425 + }, + { + "epoch": 5.767876787678768, + "grad_norm": 0.14275184273719788, + "learning_rate": 4.478654570912404e-05, + "loss": 0.0588, + "num_input_tokens_seen": 11064096, + "step": 52430 + }, + { + "epoch": 5.768426842684269, + "grad_norm": 0.14077112078666687, + "learning_rate": 4.478507865191335e-05, + "loss": 0.0404, + "num_input_tokens_seen": 11065216, + "step": 52435 + }, + { + "epoch": 5.768976897689769, + "grad_norm": 0.8913308382034302, + "learning_rate": 4.47836114123531e-05, + "loss": 0.1032, + "num_input_tokens_seen": 11066240, + "step": 52440 + }, + { + "epoch": 5.769526952695269, + "grad_norm": 0.04013421759009361, + "learning_rate": 4.4782143990456784e-05, + "loss": 0.0975, + "num_input_tokens_seen": 11067232, + "step": 52445 + }, + { + "epoch": 5.77007700770077, + "grad_norm": 0.40165555477142334, + "learning_rate": 4.478067638623794e-05, + "loss": 0.0256, + "num_input_tokens_seen": 11068256, + "step": 52450 + }, + { + "epoch": 5.770627062706271, + "grad_norm": 0.06458720564842224, + "learning_rate": 4.4779208599710096e-05, + "loss": 0.0255, + "num_input_tokens_seen": 11069344, + "step": 52455 + }, + { + "epoch": 5.771177117711771, + "grad_norm": 0.025850936770439148, + "learning_rate": 4.477774063088679e-05, + "loss": 0.0077, + "num_input_tokens_seen": 11070400, + "step": 52460 + }, + { + "epoch": 5.771727172717272, + "grad_norm": 0.086562380194664, + "learning_rate": 4.477627247978152e-05, + "loss": 0.0376, + "num_input_tokens_seen": 11071456, + "step": 52465 + }, + { + "epoch": 5.772277227722772, + "grad_norm": 0.03143017366528511, + "learning_rate": 4.4774804146407844e-05, + "loss": 0.061, + "num_input_tokens_seen": 11072480, + "step": 52470 + }, + { + "epoch": 5.772827282728273, + "grad_norm": 0.7750535607337952, + "learning_rate": 4.477333563077929e-05, + "loss": 0.0575, + "num_input_tokens_seen": 11073472, + "step": 52475 + }, + { + "epoch": 5.773377337733773, + "grad_norm": 0.35725829005241394, + "learning_rate": 4.477186693290939e-05, + "loss": 0.0186, + "num_input_tokens_seen": 11074496, + "step": 52480 + }, + { + "epoch": 5.773927392739274, + "grad_norm": 0.019429268315434456, + "learning_rate": 4.477039805281167e-05, + "loss": 0.0271, + "num_input_tokens_seen": 11075616, + "step": 52485 + }, + { + "epoch": 5.774477447744775, + "grad_norm": 0.051404859870672226, + "learning_rate": 4.476892899049969e-05, + "loss": 0.0194, + "num_input_tokens_seen": 11076768, + "step": 52490 + }, + { + "epoch": 5.775027502750275, + "grad_norm": 0.012790274806320667, + "learning_rate": 4.4767459745986974e-05, + "loss": 0.0181, + "num_input_tokens_seen": 11077888, + "step": 52495 + }, + { + "epoch": 5.775577557755776, + "grad_norm": 0.8465166091918945, + "learning_rate": 4.476599031928707e-05, + "loss": 0.1411, + "num_input_tokens_seen": 11078912, + "step": 52500 + }, + { + "epoch": 5.776127612761276, + "grad_norm": 0.039331503212451935, + "learning_rate": 4.47645207104135e-05, + "loss": 0.028, + "num_input_tokens_seen": 11079968, + "step": 52505 + }, + { + "epoch": 5.776677667766776, + "grad_norm": 0.2612976133823395, + "learning_rate": 4.476305091937984e-05, + "loss": 0.0261, + "num_input_tokens_seen": 11080992, + "step": 52510 + }, + { + "epoch": 5.7772277227722775, + "grad_norm": 0.13645623624324799, + "learning_rate": 4.476158094619963e-05, + "loss": 0.0276, + "num_input_tokens_seen": 11081984, + "step": 52515 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.05784214660525322, + "learning_rate": 4.47601107908864e-05, + "loss": 0.0061, + "num_input_tokens_seen": 11083040, + "step": 52520 + }, + { + "epoch": 5.778327832783278, + "grad_norm": 0.09277893602848053, + "learning_rate": 4.475864045345371e-05, + "loss": 0.0058, + "num_input_tokens_seen": 11084128, + "step": 52525 + }, + { + "epoch": 5.778877887788779, + "grad_norm": 1.473941445350647, + "learning_rate": 4.475716993391512e-05, + "loss": 0.0353, + "num_input_tokens_seen": 11085184, + "step": 52530 + }, + { + "epoch": 5.779427942794279, + "grad_norm": 1.3437052965164185, + "learning_rate": 4.475569923228417e-05, + "loss": 0.0838, + "num_input_tokens_seen": 11086208, + "step": 52535 + }, + { + "epoch": 5.77997799779978, + "grad_norm": 0.09075982868671417, + "learning_rate": 4.475422834857442e-05, + "loss": 0.0692, + "num_input_tokens_seen": 11087264, + "step": 52540 + }, + { + "epoch": 5.7805280528052805, + "grad_norm": 0.012487097643315792, + "learning_rate": 4.475275728279943e-05, + "loss": 0.033, + "num_input_tokens_seen": 11088320, + "step": 52545 + }, + { + "epoch": 5.781078107810782, + "grad_norm": 0.07790393382310867, + "learning_rate": 4.475128603497275e-05, + "loss": 0.081, + "num_input_tokens_seen": 11089312, + "step": 52550 + }, + { + "epoch": 5.781628162816282, + "grad_norm": 1.4641436338424683, + "learning_rate": 4.4749814605107945e-05, + "loss": 0.0416, + "num_input_tokens_seen": 11090432, + "step": 52555 + }, + { + "epoch": 5.782178217821782, + "grad_norm": 0.10275742411613464, + "learning_rate": 4.474834299321858e-05, + "loss": 0.0256, + "num_input_tokens_seen": 11091456, + "step": 52560 + }, + { + "epoch": 5.782728272827283, + "grad_norm": 0.2026478797197342, + "learning_rate": 4.474687119931821e-05, + "loss": 0.03, + "num_input_tokens_seen": 11092480, + "step": 52565 + }, + { + "epoch": 5.783278327832783, + "grad_norm": 0.3173144459724426, + "learning_rate": 4.47453992234204e-05, + "loss": 0.0653, + "num_input_tokens_seen": 11093568, + "step": 52570 + }, + { + "epoch": 5.7838283828382835, + "grad_norm": 0.08308984339237213, + "learning_rate": 4.4743927065538725e-05, + "loss": 0.0762, + "num_input_tokens_seen": 11094624, + "step": 52575 + }, + { + "epoch": 5.784378437843785, + "grad_norm": 1.5559183359146118, + "learning_rate": 4.4742454725686744e-05, + "loss": 0.1306, + "num_input_tokens_seen": 11095680, + "step": 52580 + }, + { + "epoch": 5.784928492849285, + "grad_norm": 0.06833658367395401, + "learning_rate": 4.474098220387803e-05, + "loss": 0.0607, + "num_input_tokens_seen": 11096800, + "step": 52585 + }, + { + "epoch": 5.785478547854785, + "grad_norm": 0.21713022887706757, + "learning_rate": 4.473950950012617e-05, + "loss": 0.0072, + "num_input_tokens_seen": 11097824, + "step": 52590 + }, + { + "epoch": 5.786028602860286, + "grad_norm": 0.12210522592067719, + "learning_rate": 4.473803661444471e-05, + "loss": 0.0162, + "num_input_tokens_seen": 11098880, + "step": 52595 + }, + { + "epoch": 5.786578657865786, + "grad_norm": 1.2084542512893677, + "learning_rate": 4.473656354684724e-05, + "loss": 0.1134, + "num_input_tokens_seen": 11099936, + "step": 52600 + }, + { + "epoch": 5.787128712871287, + "grad_norm": 0.4288851022720337, + "learning_rate": 4.4735090297347336e-05, + "loss": 0.1674, + "num_input_tokens_seen": 11100992, + "step": 52605 + }, + { + "epoch": 5.787678767876788, + "grad_norm": 0.20510181784629822, + "learning_rate": 4.473361686595857e-05, + "loss": 0.0303, + "num_input_tokens_seen": 11102112, + "step": 52610 + }, + { + "epoch": 5.788228822882289, + "grad_norm": 0.04089287295937538, + "learning_rate": 4.4732143252694535e-05, + "loss": 0.0296, + "num_input_tokens_seen": 11103104, + "step": 52615 + }, + { + "epoch": 5.788778877887789, + "grad_norm": 1.2442080974578857, + "learning_rate": 4.4730669457568794e-05, + "loss": 0.0696, + "num_input_tokens_seen": 11104128, + "step": 52620 + }, + { + "epoch": 5.789328932893289, + "grad_norm": 1.5991220474243164, + "learning_rate": 4.472919548059495e-05, + "loss": 0.1065, + "num_input_tokens_seen": 11105152, + "step": 52625 + }, + { + "epoch": 5.78987898789879, + "grad_norm": 0.5423063039779663, + "learning_rate": 4.4727721321786576e-05, + "loss": 0.0623, + "num_input_tokens_seen": 11106144, + "step": 52630 + }, + { + "epoch": 5.79042904290429, + "grad_norm": 0.3130517899990082, + "learning_rate": 4.472624698115726e-05, + "loss": 0.016, + "num_input_tokens_seen": 11107264, + "step": 52635 + }, + { + "epoch": 5.790979097909791, + "grad_norm": 0.018983634188771248, + "learning_rate": 4.4724772458720585e-05, + "loss": 0.039, + "num_input_tokens_seen": 11108288, + "step": 52640 + }, + { + "epoch": 5.791529152915292, + "grad_norm": 0.032032351940870285, + "learning_rate": 4.472329775449016e-05, + "loss": 0.0884, + "num_input_tokens_seen": 11109280, + "step": 52645 + }, + { + "epoch": 5.792079207920792, + "grad_norm": 0.05818310007452965, + "learning_rate": 4.472182286847955e-05, + "loss": 0.0205, + "num_input_tokens_seen": 11110304, + "step": 52650 + }, + { + "epoch": 5.792629262926293, + "grad_norm": 0.3846580386161804, + "learning_rate": 4.472034780070237e-05, + "loss": 0.0673, + "num_input_tokens_seen": 11111392, + "step": 52655 + }, + { + "epoch": 5.793179317931793, + "grad_norm": 0.1376325786113739, + "learning_rate": 4.4718872551172206e-05, + "loss": 0.0323, + "num_input_tokens_seen": 11112448, + "step": 52660 + }, + { + "epoch": 5.793729372937293, + "grad_norm": 0.6794611811637878, + "learning_rate": 4.471739711990265e-05, + "loss": 0.0163, + "num_input_tokens_seen": 11113504, + "step": 52665 + }, + { + "epoch": 5.7942794279427945, + "grad_norm": 0.2751480042934418, + "learning_rate": 4.471592150690731e-05, + "loss": 0.061, + "num_input_tokens_seen": 11114560, + "step": 52670 + }, + { + "epoch": 5.794829482948295, + "grad_norm": 0.042753349989652634, + "learning_rate": 4.471444571219978e-05, + "loss": 0.0103, + "num_input_tokens_seen": 11115584, + "step": 52675 + }, + { + "epoch": 5.795379537953796, + "grad_norm": 0.021703241392970085, + "learning_rate": 4.471296973579367e-05, + "loss": 0.0894, + "num_input_tokens_seen": 11116544, + "step": 52680 + }, + { + "epoch": 5.795929592959296, + "grad_norm": 0.13640403747558594, + "learning_rate": 4.4711493577702573e-05, + "loss": 0.0475, + "num_input_tokens_seen": 11117568, + "step": 52685 + }, + { + "epoch": 5.796479647964796, + "grad_norm": 0.028175512328743935, + "learning_rate": 4.47100172379401e-05, + "loss": 0.0255, + "num_input_tokens_seen": 11118592, + "step": 52690 + }, + { + "epoch": 5.797029702970297, + "grad_norm": 0.039713408797979355, + "learning_rate": 4.470854071651985e-05, + "loss": 0.0091, + "num_input_tokens_seen": 11119584, + "step": 52695 + }, + { + "epoch": 5.7975797579757975, + "grad_norm": 0.0058105699717998505, + "learning_rate": 4.470706401345545e-05, + "loss": 0.0832, + "num_input_tokens_seen": 11120704, + "step": 52700 + }, + { + "epoch": 5.798129812981298, + "grad_norm": 2.167182445526123, + "learning_rate": 4.4705587128760486e-05, + "loss": 0.0493, + "num_input_tokens_seen": 11121728, + "step": 52705 + }, + { + "epoch": 5.798679867986799, + "grad_norm": 1.4668781757354736, + "learning_rate": 4.470411006244859e-05, + "loss": 0.0898, + "num_input_tokens_seen": 11122784, + "step": 52710 + }, + { + "epoch": 5.799229922992299, + "grad_norm": 0.01710456609725952, + "learning_rate": 4.470263281453336e-05, + "loss": 0.0228, + "num_input_tokens_seen": 11123872, + "step": 52715 + }, + { + "epoch": 5.7997799779978, + "grad_norm": 0.020038343966007233, + "learning_rate": 4.4701155385028416e-05, + "loss": 0.1064, + "num_input_tokens_seen": 11124960, + "step": 52720 + }, + { + "epoch": 5.8003300330033, + "grad_norm": 0.029836442321538925, + "learning_rate": 4.469967777394738e-05, + "loss": 0.0067, + "num_input_tokens_seen": 11126048, + "step": 52725 + }, + { + "epoch": 5.8008800880088005, + "grad_norm": 0.11506504565477371, + "learning_rate": 4.469819998130386e-05, + "loss": 0.0506, + "num_input_tokens_seen": 11127104, + "step": 52730 + }, + { + "epoch": 5.801430143014302, + "grad_norm": 0.08137498795986176, + "learning_rate": 4.4696722007111494e-05, + "loss": 0.0916, + "num_input_tokens_seen": 11128160, + "step": 52735 + }, + { + "epoch": 5.801980198019802, + "grad_norm": 0.05939605087041855, + "learning_rate": 4.469524385138388e-05, + "loss": 0.0174, + "num_input_tokens_seen": 11129216, + "step": 52740 + }, + { + "epoch": 5.802530253025303, + "grad_norm": 0.4468715786933899, + "learning_rate": 4.469376551413467e-05, + "loss": 0.0234, + "num_input_tokens_seen": 11130336, + "step": 52745 + }, + { + "epoch": 5.803080308030803, + "grad_norm": 0.3066455125808716, + "learning_rate": 4.469228699537747e-05, + "loss": 0.0287, + "num_input_tokens_seen": 11131424, + "step": 52750 + }, + { + "epoch": 5.803630363036303, + "grad_norm": 0.13680334389209747, + "learning_rate": 4.4690808295125894e-05, + "loss": 0.0574, + "num_input_tokens_seen": 11132448, + "step": 52755 + }, + { + "epoch": 5.804180418041804, + "grad_norm": 0.031074201688170433, + "learning_rate": 4.46893294133936e-05, + "loss": 0.0064, + "num_input_tokens_seen": 11133504, + "step": 52760 + }, + { + "epoch": 5.804730473047305, + "grad_norm": 0.3225593864917755, + "learning_rate": 4.46878503501942e-05, + "loss": 0.0381, + "num_input_tokens_seen": 11134592, + "step": 52765 + }, + { + "epoch": 5.805280528052805, + "grad_norm": 1.1458834409713745, + "learning_rate": 4.468637110554133e-05, + "loss": 0.1372, + "num_input_tokens_seen": 11135648, + "step": 52770 + }, + { + "epoch": 5.805830583058306, + "grad_norm": 0.2741573452949524, + "learning_rate": 4.468489167944863e-05, + "loss": 0.0491, + "num_input_tokens_seen": 11136736, + "step": 52775 + }, + { + "epoch": 5.806380638063806, + "grad_norm": 0.09693226218223572, + "learning_rate": 4.468341207192972e-05, + "loss": 0.0143, + "num_input_tokens_seen": 11137792, + "step": 52780 + }, + { + "epoch": 5.806930693069307, + "grad_norm": 0.015408959239721298, + "learning_rate": 4.468193228299826e-05, + "loss": 0.0639, + "num_input_tokens_seen": 11138784, + "step": 52785 + }, + { + "epoch": 5.807480748074807, + "grad_norm": 0.012969371862709522, + "learning_rate": 4.4680452312667856e-05, + "loss": 0.0512, + "num_input_tokens_seen": 11139808, + "step": 52790 + }, + { + "epoch": 5.8080308030803085, + "grad_norm": 0.005571688991039991, + "learning_rate": 4.4678972160952185e-05, + "loss": 0.0084, + "num_input_tokens_seen": 11140864, + "step": 52795 + }, + { + "epoch": 5.808580858085809, + "grad_norm": 0.43703630566596985, + "learning_rate": 4.467749182786486e-05, + "loss": 0.1061, + "num_input_tokens_seen": 11141952, + "step": 52800 + }, + { + "epoch": 5.809130913091309, + "grad_norm": 0.17893892526626587, + "learning_rate": 4.467601131341953e-05, + "loss": 0.0103, + "num_input_tokens_seen": 11143072, + "step": 52805 + }, + { + "epoch": 5.80968096809681, + "grad_norm": 0.03730607405304909, + "learning_rate": 4.467453061762986e-05, + "loss": 0.0655, + "num_input_tokens_seen": 11144128, + "step": 52810 + }, + { + "epoch": 5.81023102310231, + "grad_norm": 0.2718065679073334, + "learning_rate": 4.467304974050947e-05, + "loss": 0.0121, + "num_input_tokens_seen": 11145184, + "step": 52815 + }, + { + "epoch": 5.81078107810781, + "grad_norm": 1.0832616090774536, + "learning_rate": 4.467156868207203e-05, + "loss": 0.1158, + "num_input_tokens_seen": 11146272, + "step": 52820 + }, + { + "epoch": 5.8113311331133115, + "grad_norm": 0.030537957325577736, + "learning_rate": 4.467008744233117e-05, + "loss": 0.0055, + "num_input_tokens_seen": 11147360, + "step": 52825 + }, + { + "epoch": 5.811881188118812, + "grad_norm": 2.1641652584075928, + "learning_rate": 4.466860602130056e-05, + "loss": 0.1263, + "num_input_tokens_seen": 11148416, + "step": 52830 + }, + { + "epoch": 5.812431243124313, + "grad_norm": 1.2747923135757446, + "learning_rate": 4.466712441899384e-05, + "loss": 0.0854, + "num_input_tokens_seen": 11149536, + "step": 52835 + }, + { + "epoch": 5.812981298129813, + "grad_norm": 0.3664744794368744, + "learning_rate": 4.466564263542468e-05, + "loss": 0.064, + "num_input_tokens_seen": 11150592, + "step": 52840 + }, + { + "epoch": 5.813531353135313, + "grad_norm": 0.013398277573287487, + "learning_rate": 4.466416067060673e-05, + "loss": 0.0583, + "num_input_tokens_seen": 11151584, + "step": 52845 + }, + { + "epoch": 5.814081408140814, + "grad_norm": 0.03665843978524208, + "learning_rate": 4.4662678524553635e-05, + "loss": 0.0432, + "num_input_tokens_seen": 11152576, + "step": 52850 + }, + { + "epoch": 5.8146314631463145, + "grad_norm": 1.1386325359344482, + "learning_rate": 4.466119619727907e-05, + "loss": 0.0545, + "num_input_tokens_seen": 11153664, + "step": 52855 + }, + { + "epoch": 5.815181518151816, + "grad_norm": 0.9353992938995361, + "learning_rate": 4.46597136887967e-05, + "loss": 0.1174, + "num_input_tokens_seen": 11154688, + "step": 52860 + }, + { + "epoch": 5.815731573157316, + "grad_norm": 0.7174708843231201, + "learning_rate": 4.4658230999120174e-05, + "loss": 0.1267, + "num_input_tokens_seen": 11155744, + "step": 52865 + }, + { + "epoch": 5.816281628162816, + "grad_norm": 0.7566555738449097, + "learning_rate": 4.465674812826318e-05, + "loss": 0.0624, + "num_input_tokens_seen": 11156800, + "step": 52870 + }, + { + "epoch": 5.816831683168317, + "grad_norm": 1.376155972480774, + "learning_rate": 4.465526507623936e-05, + "loss": 0.0923, + "num_input_tokens_seen": 11157792, + "step": 52875 + }, + { + "epoch": 5.817381738173817, + "grad_norm": 0.21561968326568604, + "learning_rate": 4.465378184306239e-05, + "loss": 0.0195, + "num_input_tokens_seen": 11158848, + "step": 52880 + }, + { + "epoch": 5.8179317931793175, + "grad_norm": 0.017313480377197266, + "learning_rate": 4.4652298428745954e-05, + "loss": 0.0579, + "num_input_tokens_seen": 11159872, + "step": 52885 + }, + { + "epoch": 5.818481848184819, + "grad_norm": 0.7436137199401855, + "learning_rate": 4.465081483330371e-05, + "loss": 0.0975, + "num_input_tokens_seen": 11160928, + "step": 52890 + }, + { + "epoch": 5.819031903190319, + "grad_norm": 0.7909044027328491, + "learning_rate": 4.4649331056749335e-05, + "loss": 0.092, + "num_input_tokens_seen": 11161984, + "step": 52895 + }, + { + "epoch": 5.81958195819582, + "grad_norm": 1.0629714727401733, + "learning_rate": 4.46478470990965e-05, + "loss": 0.071, + "num_input_tokens_seen": 11163104, + "step": 52900 + }, + { + "epoch": 5.82013201320132, + "grad_norm": 0.1409846842288971, + "learning_rate": 4.464636296035889e-05, + "loss": 0.0401, + "num_input_tokens_seen": 11164224, + "step": 52905 + }, + { + "epoch": 5.82068206820682, + "grad_norm": 0.012604700401425362, + "learning_rate": 4.4644878640550184e-05, + "loss": 0.0497, + "num_input_tokens_seen": 11165280, + "step": 52910 + }, + { + "epoch": 5.821232123212321, + "grad_norm": 0.69514000415802, + "learning_rate": 4.4643394139684045e-05, + "loss": 0.0174, + "num_input_tokens_seen": 11166272, + "step": 52915 + }, + { + "epoch": 5.821782178217822, + "grad_norm": 0.2596956193447113, + "learning_rate": 4.464190945777418e-05, + "loss": 0.0288, + "num_input_tokens_seen": 11167296, + "step": 52920 + }, + { + "epoch": 5.822332233223323, + "grad_norm": 0.43786677718162537, + "learning_rate": 4.464042459483425e-05, + "loss": 0.0344, + "num_input_tokens_seen": 11168384, + "step": 52925 + }, + { + "epoch": 5.822882288228823, + "grad_norm": 0.08971676975488663, + "learning_rate": 4.4638939550877964e-05, + "loss": 0.0106, + "num_input_tokens_seen": 11169440, + "step": 52930 + }, + { + "epoch": 5.823432343234323, + "grad_norm": 0.15674157440662384, + "learning_rate": 4.463745432591899e-05, + "loss": 0.0131, + "num_input_tokens_seen": 11170464, + "step": 52935 + }, + { + "epoch": 5.823982398239824, + "grad_norm": 0.33383551239967346, + "learning_rate": 4.463596891997102e-05, + "loss": 0.0696, + "num_input_tokens_seen": 11171456, + "step": 52940 + }, + { + "epoch": 5.824532453245324, + "grad_norm": 0.06751178950071335, + "learning_rate": 4.463448333304775e-05, + "loss": 0.0076, + "num_input_tokens_seen": 11172608, + "step": 52945 + }, + { + "epoch": 5.825082508250825, + "grad_norm": 0.511747419834137, + "learning_rate": 4.463299756516286e-05, + "loss": 0.1348, + "num_input_tokens_seen": 11173696, + "step": 52950 + }, + { + "epoch": 5.825632563256326, + "grad_norm": 0.14601491391658783, + "learning_rate": 4.463151161633007e-05, + "loss": 0.0117, + "num_input_tokens_seen": 11174816, + "step": 52955 + }, + { + "epoch": 5.826182618261826, + "grad_norm": 0.20877671241760254, + "learning_rate": 4.463002548656304e-05, + "loss": 0.0283, + "num_input_tokens_seen": 11175904, + "step": 52960 + }, + { + "epoch": 5.826732673267327, + "grad_norm": 0.012650411576032639, + "learning_rate": 4.46285391758755e-05, + "loss": 0.009, + "num_input_tokens_seen": 11176864, + "step": 52965 + }, + { + "epoch": 5.827282728272827, + "grad_norm": 0.03268866240978241, + "learning_rate": 4.462705268428112e-05, + "loss": 0.0051, + "num_input_tokens_seen": 11177952, + "step": 52970 + }, + { + "epoch": 5.827832783278328, + "grad_norm": 0.04698207229375839, + "learning_rate": 4.462556601179362e-05, + "loss": 0.0553, + "num_input_tokens_seen": 11178976, + "step": 52975 + }, + { + "epoch": 5.8283828382838285, + "grad_norm": 0.0277447197586298, + "learning_rate": 4.462407915842669e-05, + "loss": 0.0726, + "num_input_tokens_seen": 11180032, + "step": 52980 + }, + { + "epoch": 5.828932893289329, + "grad_norm": 0.5230565071105957, + "learning_rate": 4.462259212419404e-05, + "loss": 0.029, + "num_input_tokens_seen": 11181088, + "step": 52985 + }, + { + "epoch": 5.82948294829483, + "grad_norm": 0.03713173791766167, + "learning_rate": 4.462110490910938e-05, + "loss": 0.0842, + "num_input_tokens_seen": 11182112, + "step": 52990 + }, + { + "epoch": 5.83003300330033, + "grad_norm": 0.7254603505134583, + "learning_rate": 4.4619617513186406e-05, + "loss": 0.0541, + "num_input_tokens_seen": 11183168, + "step": 52995 + }, + { + "epoch": 5.83058305830583, + "grad_norm": 0.5594502687454224, + "learning_rate": 4.461812993643883e-05, + "loss": 0.1264, + "num_input_tokens_seen": 11184192, + "step": 53000 + }, + { + "epoch": 5.831133113311331, + "grad_norm": 0.08399467915296555, + "learning_rate": 4.461664217888037e-05, + "loss": 0.0279, + "num_input_tokens_seen": 11185216, + "step": 53005 + }, + { + "epoch": 5.8316831683168315, + "grad_norm": 0.16482223570346832, + "learning_rate": 4.4615154240524736e-05, + "loss": 0.0115, + "num_input_tokens_seen": 11186272, + "step": 53010 + }, + { + "epoch": 5.832233223322332, + "grad_norm": 0.058380331844091415, + "learning_rate": 4.4613666121385625e-05, + "loss": 0.0972, + "num_input_tokens_seen": 11187392, + "step": 53015 + }, + { + "epoch": 5.832783278327833, + "grad_norm": 0.24903176724910736, + "learning_rate": 4.4612177821476774e-05, + "loss": 0.0242, + "num_input_tokens_seen": 11188448, + "step": 53020 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.5381922125816345, + "learning_rate": 4.461068934081189e-05, + "loss": 0.0791, + "num_input_tokens_seen": 11189472, + "step": 53025 + }, + { + "epoch": 5.833883388338834, + "grad_norm": 0.08951588720083237, + "learning_rate": 4.4609200679404686e-05, + "loss": 0.0748, + "num_input_tokens_seen": 11190560, + "step": 53030 + }, + { + "epoch": 5.834433443344334, + "grad_norm": 0.5091251730918884, + "learning_rate": 4.4607711837268893e-05, + "loss": 0.0465, + "num_input_tokens_seen": 11191616, + "step": 53035 + }, + { + "epoch": 5.834983498349835, + "grad_norm": 0.9276660680770874, + "learning_rate": 4.460622281441823e-05, + "loss": 0.0562, + "num_input_tokens_seen": 11192640, + "step": 53040 + }, + { + "epoch": 5.835533553355336, + "grad_norm": 0.009636757895350456, + "learning_rate": 4.460473361086642e-05, + "loss": 0.0628, + "num_input_tokens_seen": 11193696, + "step": 53045 + }, + { + "epoch": 5.836083608360836, + "grad_norm": 0.4935803711414337, + "learning_rate": 4.460324422662718e-05, + "loss": 0.0282, + "num_input_tokens_seen": 11194720, + "step": 53050 + }, + { + "epoch": 5.836633663366337, + "grad_norm": 0.017252610996365547, + "learning_rate": 4.460175466171426e-05, + "loss": 0.0082, + "num_input_tokens_seen": 11195808, + "step": 53055 + }, + { + "epoch": 5.837183718371837, + "grad_norm": 0.4116937518119812, + "learning_rate": 4.4600264916141365e-05, + "loss": 0.172, + "num_input_tokens_seen": 11196896, + "step": 53060 + }, + { + "epoch": 5.837733773377337, + "grad_norm": 0.016934294253587723, + "learning_rate": 4.459877498992223e-05, + "loss": 0.0137, + "num_input_tokens_seen": 11197920, + "step": 53065 + }, + { + "epoch": 5.838283828382838, + "grad_norm": 0.15535372495651245, + "learning_rate": 4.459728488307059e-05, + "loss": 0.0716, + "num_input_tokens_seen": 11198976, + "step": 53070 + }, + { + "epoch": 5.838833883388339, + "grad_norm": 0.06866259127855301, + "learning_rate": 4.459579459560018e-05, + "loss": 0.0977, + "num_input_tokens_seen": 11199968, + "step": 53075 + }, + { + "epoch": 5.83938393839384, + "grad_norm": 0.43875372409820557, + "learning_rate": 4.459430412752473e-05, + "loss": 0.0265, + "num_input_tokens_seen": 11201056, + "step": 53080 + }, + { + "epoch": 5.83993399339934, + "grad_norm": 0.9959349036216736, + "learning_rate": 4.4592813478857995e-05, + "loss": 0.0857, + "num_input_tokens_seen": 11202112, + "step": 53085 + }, + { + "epoch": 5.84048404840484, + "grad_norm": 1.1720048189163208, + "learning_rate": 4.4591322649613685e-05, + "loss": 0.0838, + "num_input_tokens_seen": 11203168, + "step": 53090 + }, + { + "epoch": 5.841034103410341, + "grad_norm": 0.520879864692688, + "learning_rate": 4.458983163980557e-05, + "loss": 0.0568, + "num_input_tokens_seen": 11204224, + "step": 53095 + }, + { + "epoch": 5.841584158415841, + "grad_norm": 0.26044875383377075, + "learning_rate": 4.458834044944736e-05, + "loss": 0.0567, + "num_input_tokens_seen": 11205344, + "step": 53100 + }, + { + "epoch": 5.8421342134213425, + "grad_norm": 0.15309400856494904, + "learning_rate": 4.458684907855283e-05, + "loss": 0.0365, + "num_input_tokens_seen": 11206336, + "step": 53105 + }, + { + "epoch": 5.842684268426843, + "grad_norm": 0.044084582477808, + "learning_rate": 4.45853575271357e-05, + "loss": 0.0481, + "num_input_tokens_seen": 11207392, + "step": 53110 + }, + { + "epoch": 5.843234323432343, + "grad_norm": 0.687347412109375, + "learning_rate": 4.458386579520973e-05, + "loss": 0.0358, + "num_input_tokens_seen": 11208416, + "step": 53115 + }, + { + "epoch": 5.843784378437844, + "grad_norm": 0.26742181181907654, + "learning_rate": 4.4582373882788676e-05, + "loss": 0.0411, + "num_input_tokens_seen": 11209472, + "step": 53120 + }, + { + "epoch": 5.844334433443344, + "grad_norm": 0.7113716006278992, + "learning_rate": 4.4580881789886264e-05, + "loss": 0.1087, + "num_input_tokens_seen": 11210496, + "step": 53125 + }, + { + "epoch": 5.8448844884488445, + "grad_norm": 0.022242622449994087, + "learning_rate": 4.4579389516516265e-05, + "loss": 0.007, + "num_input_tokens_seen": 11211552, + "step": 53130 + }, + { + "epoch": 5.8454345434543455, + "grad_norm": 1.1011821031570435, + "learning_rate": 4.457789706269243e-05, + "loss": 0.0493, + "num_input_tokens_seen": 11212608, + "step": 53135 + }, + { + "epoch": 5.845984598459846, + "grad_norm": 0.7256076335906982, + "learning_rate": 4.457640442842851e-05, + "loss": 0.0732, + "num_input_tokens_seen": 11213696, + "step": 53140 + }, + { + "epoch": 5.846534653465347, + "grad_norm": 1.5239777565002441, + "learning_rate": 4.457491161373827e-05, + "loss": 0.0401, + "num_input_tokens_seen": 11214688, + "step": 53145 + }, + { + "epoch": 5.847084708470847, + "grad_norm": 0.1723448485136032, + "learning_rate": 4.4573418618635465e-05, + "loss": 0.0307, + "num_input_tokens_seen": 11215776, + "step": 53150 + }, + { + "epoch": 5.847634763476347, + "grad_norm": 0.26720330119132996, + "learning_rate": 4.4571925443133846e-05, + "loss": 0.0422, + "num_input_tokens_seen": 11216832, + "step": 53155 + }, + { + "epoch": 5.848184818481848, + "grad_norm": 0.13836504518985748, + "learning_rate": 4.4570432087247175e-05, + "loss": 0.0089, + "num_input_tokens_seen": 11217920, + "step": 53160 + }, + { + "epoch": 5.8487348734873486, + "grad_norm": 0.06512591242790222, + "learning_rate": 4.456893855098924e-05, + "loss": 0.0207, + "num_input_tokens_seen": 11218912, + "step": 53165 + }, + { + "epoch": 5.84928492849285, + "grad_norm": 0.05222640559077263, + "learning_rate": 4.456744483437377e-05, + "loss": 0.0135, + "num_input_tokens_seen": 11219936, + "step": 53170 + }, + { + "epoch": 5.84983498349835, + "grad_norm": 0.05444364249706268, + "learning_rate": 4.456595093741456e-05, + "loss": 0.0254, + "num_input_tokens_seen": 11220960, + "step": 53175 + }, + { + "epoch": 5.85038503850385, + "grad_norm": 0.09923721104860306, + "learning_rate": 4.456445686012537e-05, + "loss": 0.0174, + "num_input_tokens_seen": 11221984, + "step": 53180 + }, + { + "epoch": 5.850935093509351, + "grad_norm": 0.03270219266414642, + "learning_rate": 4.456296260251996e-05, + "loss": 0.0163, + "num_input_tokens_seen": 11223072, + "step": 53185 + }, + { + "epoch": 5.851485148514851, + "grad_norm": 0.212362140417099, + "learning_rate": 4.456146816461212e-05, + "loss": 0.013, + "num_input_tokens_seen": 11224096, + "step": 53190 + }, + { + "epoch": 5.852035203520352, + "grad_norm": 0.004772933665663004, + "learning_rate": 4.4559973546415614e-05, + "loss": 0.0301, + "num_input_tokens_seen": 11225152, + "step": 53195 + }, + { + "epoch": 5.852585258525853, + "grad_norm": 0.01937444694340229, + "learning_rate": 4.455847874794421e-05, + "loss": 0.0463, + "num_input_tokens_seen": 11226304, + "step": 53200 + }, + { + "epoch": 5.853135313531353, + "grad_norm": 0.07065468281507492, + "learning_rate": 4.45569837692117e-05, + "loss": 0.0776, + "num_input_tokens_seen": 11227392, + "step": 53205 + }, + { + "epoch": 5.853685368536854, + "grad_norm": 0.2403186857700348, + "learning_rate": 4.4555488610231854e-05, + "loss": 0.0787, + "num_input_tokens_seen": 11228448, + "step": 53210 + }, + { + "epoch": 5.854235423542354, + "grad_norm": 0.0828007385134697, + "learning_rate": 4.455399327101845e-05, + "loss": 0.0727, + "num_input_tokens_seen": 11229536, + "step": 53215 + }, + { + "epoch": 5.854785478547855, + "grad_norm": 0.18430723249912262, + "learning_rate": 4.455249775158527e-05, + "loss": 0.0401, + "num_input_tokens_seen": 11230592, + "step": 53220 + }, + { + "epoch": 5.8553355335533555, + "grad_norm": 0.02968720719218254, + "learning_rate": 4.455100205194611e-05, + "loss": 0.0355, + "num_input_tokens_seen": 11231680, + "step": 53225 + }, + { + "epoch": 5.855885588558856, + "grad_norm": 0.35282373428344727, + "learning_rate": 4.4549506172114744e-05, + "loss": 0.0453, + "num_input_tokens_seen": 11232736, + "step": 53230 + }, + { + "epoch": 5.856435643564357, + "grad_norm": 0.04396122694015503, + "learning_rate": 4.4548010112104955e-05, + "loss": 0.0945, + "num_input_tokens_seen": 11233824, + "step": 53235 + }, + { + "epoch": 5.856985698569857, + "grad_norm": 1.237904667854309, + "learning_rate": 4.454651387193054e-05, + "loss": 0.0696, + "num_input_tokens_seen": 11234912, + "step": 53240 + }, + { + "epoch": 5.857535753575357, + "grad_norm": 0.259878933429718, + "learning_rate": 4.454501745160529e-05, + "loss": 0.0334, + "num_input_tokens_seen": 11235936, + "step": 53245 + }, + { + "epoch": 5.858085808580858, + "grad_norm": 0.5773954391479492, + "learning_rate": 4.454352085114298e-05, + "loss": 0.0849, + "num_input_tokens_seen": 11236992, + "step": 53250 + }, + { + "epoch": 5.8586358635863585, + "grad_norm": 0.5514191389083862, + "learning_rate": 4.454202407055742e-05, + "loss": 0.0126, + "num_input_tokens_seen": 11238048, + "step": 53255 + }, + { + "epoch": 5.8591859185918596, + "grad_norm": 0.022457247599959373, + "learning_rate": 4.454052710986241e-05, + "loss": 0.0112, + "num_input_tokens_seen": 11239040, + "step": 53260 + }, + { + "epoch": 5.85973597359736, + "grad_norm": 0.045692022889852524, + "learning_rate": 4.4539029969071734e-05, + "loss": 0.0103, + "num_input_tokens_seen": 11240032, + "step": 53265 + }, + { + "epoch": 5.86028602860286, + "grad_norm": 0.05870768800377846, + "learning_rate": 4.453753264819919e-05, + "loss": 0.0373, + "num_input_tokens_seen": 11241088, + "step": 53270 + }, + { + "epoch": 5.860836083608361, + "grad_norm": 0.019755318760871887, + "learning_rate": 4.453603514725859e-05, + "loss": 0.0879, + "num_input_tokens_seen": 11242144, + "step": 53275 + }, + { + "epoch": 5.861386138613861, + "grad_norm": 0.23795373737812042, + "learning_rate": 4.4534537466263726e-05, + "loss": 0.1222, + "num_input_tokens_seen": 11243200, + "step": 53280 + }, + { + "epoch": 5.861936193619362, + "grad_norm": 1.0343246459960938, + "learning_rate": 4.45330396052284e-05, + "loss": 0.1126, + "num_input_tokens_seen": 11244320, + "step": 53285 + }, + { + "epoch": 5.862486248624863, + "grad_norm": 2.233172655105591, + "learning_rate": 4.453154156416642e-05, + "loss": 0.0381, + "num_input_tokens_seen": 11245376, + "step": 53290 + }, + { + "epoch": 5.863036303630363, + "grad_norm": 0.3887903690338135, + "learning_rate": 4.4530043343091596e-05, + "loss": 0.0333, + "num_input_tokens_seen": 11246400, + "step": 53295 + }, + { + "epoch": 5.863586358635864, + "grad_norm": 1.505932331085205, + "learning_rate": 4.452854494201774e-05, + "loss": 0.1207, + "num_input_tokens_seen": 11247488, + "step": 53300 + }, + { + "epoch": 5.864136413641364, + "grad_norm": 0.980539858341217, + "learning_rate": 4.452704636095865e-05, + "loss": 0.1344, + "num_input_tokens_seen": 11248576, + "step": 53305 + }, + { + "epoch": 5.864686468646864, + "grad_norm": 0.13875488936901093, + "learning_rate": 4.452554759992814e-05, + "loss": 0.011, + "num_input_tokens_seen": 11249632, + "step": 53310 + }, + { + "epoch": 5.865236523652365, + "grad_norm": 0.1450701355934143, + "learning_rate": 4.452404865894003e-05, + "loss": 0.0725, + "num_input_tokens_seen": 11250656, + "step": 53315 + }, + { + "epoch": 5.865786578657866, + "grad_norm": 0.6105225682258606, + "learning_rate": 4.452254953800814e-05, + "loss": 0.1043, + "num_input_tokens_seen": 11251680, + "step": 53320 + }, + { + "epoch": 5.866336633663367, + "grad_norm": 0.35095423460006714, + "learning_rate": 4.4521050237146274e-05, + "loss": 0.0815, + "num_input_tokens_seen": 11252704, + "step": 53325 + }, + { + "epoch": 5.866886688668867, + "grad_norm": 0.027403879910707474, + "learning_rate": 4.451955075636825e-05, + "loss": 0.0415, + "num_input_tokens_seen": 11253792, + "step": 53330 + }, + { + "epoch": 5.867436743674367, + "grad_norm": 1.1483296155929565, + "learning_rate": 4.45180510956879e-05, + "loss": 0.1996, + "num_input_tokens_seen": 11254880, + "step": 53335 + }, + { + "epoch": 5.867986798679868, + "grad_norm": 0.15723642706871033, + "learning_rate": 4.451655125511903e-05, + "loss": 0.0324, + "num_input_tokens_seen": 11255968, + "step": 53340 + }, + { + "epoch": 5.868536853685368, + "grad_norm": 0.1037263423204422, + "learning_rate": 4.4515051234675475e-05, + "loss": 0.0148, + "num_input_tokens_seen": 11256992, + "step": 53345 + }, + { + "epoch": 5.8690869086908695, + "grad_norm": 0.025762135162949562, + "learning_rate": 4.451355103437106e-05, + "loss": 0.0296, + "num_input_tokens_seen": 11258016, + "step": 53350 + }, + { + "epoch": 5.86963696369637, + "grad_norm": 0.0641743391752243, + "learning_rate": 4.451205065421961e-05, + "loss": 0.041, + "num_input_tokens_seen": 11259072, + "step": 53355 + }, + { + "epoch": 5.87018701870187, + "grad_norm": 0.11282920837402344, + "learning_rate": 4.4510550094234947e-05, + "loss": 0.0494, + "num_input_tokens_seen": 11260096, + "step": 53360 + }, + { + "epoch": 5.870737073707371, + "grad_norm": 0.7795231342315674, + "learning_rate": 4.450904935443091e-05, + "loss": 0.0631, + "num_input_tokens_seen": 11261056, + "step": 53365 + }, + { + "epoch": 5.871287128712871, + "grad_norm": 0.17717619240283966, + "learning_rate": 4.4507548434821326e-05, + "loss": 0.058, + "num_input_tokens_seen": 11262144, + "step": 53370 + }, + { + "epoch": 5.871837183718371, + "grad_norm": 0.08110454678535461, + "learning_rate": 4.450604733542003e-05, + "loss": 0.0245, + "num_input_tokens_seen": 11263232, + "step": 53375 + }, + { + "epoch": 5.8723872387238725, + "grad_norm": 0.0791674330830574, + "learning_rate": 4.450454605624085e-05, + "loss": 0.079, + "num_input_tokens_seen": 11264256, + "step": 53380 + }, + { + "epoch": 5.872937293729373, + "grad_norm": 0.2562277317047119, + "learning_rate": 4.4503044597297634e-05, + "loss": 0.016, + "num_input_tokens_seen": 11265344, + "step": 53385 + }, + { + "epoch": 5.873487348734874, + "grad_norm": 0.01238461583852768, + "learning_rate": 4.450154295860422e-05, + "loss": 0.0131, + "num_input_tokens_seen": 11266336, + "step": 53390 + }, + { + "epoch": 5.874037403740374, + "grad_norm": 0.07081930339336395, + "learning_rate": 4.450004114017443e-05, + "loss": 0.0302, + "num_input_tokens_seen": 11267392, + "step": 53395 + }, + { + "epoch": 5.874587458745875, + "grad_norm": 0.19759853184223175, + "learning_rate": 4.449853914202212e-05, + "loss": 0.0218, + "num_input_tokens_seen": 11268448, + "step": 53400 + }, + { + "epoch": 5.875137513751375, + "grad_norm": 0.992594301700592, + "learning_rate": 4.4497036964161134e-05, + "loss": 0.0223, + "num_input_tokens_seen": 11269536, + "step": 53405 + }, + { + "epoch": 5.8756875687568755, + "grad_norm": 0.09829367697238922, + "learning_rate": 4.449553460660531e-05, + "loss": 0.0092, + "num_input_tokens_seen": 11270656, + "step": 53410 + }, + { + "epoch": 5.876237623762377, + "grad_norm": 0.005895826034247875, + "learning_rate": 4.44940320693685e-05, + "loss": 0.0445, + "num_input_tokens_seen": 11271680, + "step": 53415 + }, + { + "epoch": 5.876787678767877, + "grad_norm": 0.2900873124599457, + "learning_rate": 4.449252935246455e-05, + "loss": 0.0648, + "num_input_tokens_seen": 11272800, + "step": 53420 + }, + { + "epoch": 5.877337733773377, + "grad_norm": 0.6106969714164734, + "learning_rate": 4.4491026455907314e-05, + "loss": 0.055, + "num_input_tokens_seen": 11273856, + "step": 53425 + }, + { + "epoch": 5.877887788778878, + "grad_norm": 2.022778034210205, + "learning_rate": 4.448952337971064e-05, + "loss": 0.0904, + "num_input_tokens_seen": 11274912, + "step": 53430 + }, + { + "epoch": 5.878437843784378, + "grad_norm": 0.9354892373085022, + "learning_rate": 4.448802012388838e-05, + "loss": 0.0329, + "num_input_tokens_seen": 11275968, + "step": 53435 + }, + { + "epoch": 5.878987898789879, + "grad_norm": 0.06862517446279526, + "learning_rate": 4.4486516688454385e-05, + "loss": 0.0351, + "num_input_tokens_seen": 11277024, + "step": 53440 + }, + { + "epoch": 5.87953795379538, + "grad_norm": 0.06625167280435562, + "learning_rate": 4.4485013073422524e-05, + "loss": 0.0048, + "num_input_tokens_seen": 11278080, + "step": 53445 + }, + { + "epoch": 5.88008800880088, + "grad_norm": 0.4462771415710449, + "learning_rate": 4.448350927880665e-05, + "loss": 0.0169, + "num_input_tokens_seen": 11279072, + "step": 53450 + }, + { + "epoch": 5.880638063806381, + "grad_norm": 0.2024976909160614, + "learning_rate": 4.4482005304620606e-05, + "loss": 0.0116, + "num_input_tokens_seen": 11280192, + "step": 53455 + }, + { + "epoch": 5.881188118811881, + "grad_norm": 1.371769905090332, + "learning_rate": 4.448050115087827e-05, + "loss": 0.0872, + "num_input_tokens_seen": 11281248, + "step": 53460 + }, + { + "epoch": 5.881738173817382, + "grad_norm": 0.024646025151014328, + "learning_rate": 4.447899681759351e-05, + "loss": 0.0568, + "num_input_tokens_seen": 11282336, + "step": 53465 + }, + { + "epoch": 5.882288228822882, + "grad_norm": 0.506762683391571, + "learning_rate": 4.447749230478018e-05, + "loss": 0.0172, + "num_input_tokens_seen": 11283392, + "step": 53470 + }, + { + "epoch": 5.882838283828383, + "grad_norm": 0.09916772693395615, + "learning_rate": 4.447598761245215e-05, + "loss": 0.0194, + "num_input_tokens_seen": 11284416, + "step": 53475 + }, + { + "epoch": 5.883388338833884, + "grad_norm": 0.09626064449548721, + "learning_rate": 4.447448274062329e-05, + "loss": 0.0154, + "num_input_tokens_seen": 11285536, + "step": 53480 + }, + { + "epoch": 5.883938393839384, + "grad_norm": 0.2138669639825821, + "learning_rate": 4.4472977689307474e-05, + "loss": 0.0701, + "num_input_tokens_seen": 11286592, + "step": 53485 + }, + { + "epoch": 5.884488448844884, + "grad_norm": 4.060842514038086, + "learning_rate": 4.447147245851855e-05, + "loss": 0.1863, + "num_input_tokens_seen": 11287680, + "step": 53490 + }, + { + "epoch": 5.885038503850385, + "grad_norm": 0.505135178565979, + "learning_rate": 4.446996704827042e-05, + "loss": 0.037, + "num_input_tokens_seen": 11288736, + "step": 53495 + }, + { + "epoch": 5.885588558855885, + "grad_norm": 0.661994218826294, + "learning_rate": 4.446846145857694e-05, + "loss": 0.0148, + "num_input_tokens_seen": 11289792, + "step": 53500 + }, + { + "epoch": 5.8861386138613865, + "grad_norm": 0.41540399193763733, + "learning_rate": 4.446695568945199e-05, + "loss": 0.0203, + "num_input_tokens_seen": 11290848, + "step": 53505 + }, + { + "epoch": 5.886688668866887, + "grad_norm": 0.14577032625675201, + "learning_rate": 4.446544974090946e-05, + "loss": 0.0094, + "num_input_tokens_seen": 11291872, + "step": 53510 + }, + { + "epoch": 5.887238723872387, + "grad_norm": 0.005761112552136183, + "learning_rate": 4.4463943612963216e-05, + "loss": 0.016, + "num_input_tokens_seen": 11292864, + "step": 53515 + }, + { + "epoch": 5.887788778877888, + "grad_norm": 0.1911778748035431, + "learning_rate": 4.4462437305627144e-05, + "loss": 0.067, + "num_input_tokens_seen": 11293888, + "step": 53520 + }, + { + "epoch": 5.888338833883388, + "grad_norm": 0.18579180538654327, + "learning_rate": 4.446093081891513e-05, + "loss": 0.0483, + "num_input_tokens_seen": 11294944, + "step": 53525 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 0.024320434778928757, + "learning_rate": 4.4459424152841046e-05, + "loss": 0.0414, + "num_input_tokens_seen": 11296000, + "step": 53530 + }, + { + "epoch": 5.8894389438943895, + "grad_norm": 0.017025716602802277, + "learning_rate": 4.445791730741879e-05, + "loss": 0.0187, + "num_input_tokens_seen": 11297088, + "step": 53535 + }, + { + "epoch": 5.88998899889989, + "grad_norm": 0.0936778336763382, + "learning_rate": 4.4456410282662244e-05, + "loss": 0.0108, + "num_input_tokens_seen": 11298176, + "step": 53540 + }, + { + "epoch": 5.890539053905391, + "grad_norm": 0.123633973300457, + "learning_rate": 4.4454903078585306e-05, + "loss": 0.0511, + "num_input_tokens_seen": 11299232, + "step": 53545 + }, + { + "epoch": 5.891089108910891, + "grad_norm": 0.6906003355979919, + "learning_rate": 4.445339569520186e-05, + "loss": 0.0551, + "num_input_tokens_seen": 11300352, + "step": 53550 + }, + { + "epoch": 5.891639163916391, + "grad_norm": 0.017158279195427895, + "learning_rate": 4.445188813252581e-05, + "loss": 0.0143, + "num_input_tokens_seen": 11301344, + "step": 53555 + }, + { + "epoch": 5.892189218921892, + "grad_norm": 0.22534070909023285, + "learning_rate": 4.445038039057102e-05, + "loss": 0.0642, + "num_input_tokens_seen": 11302400, + "step": 53560 + }, + { + "epoch": 5.8927392739273925, + "grad_norm": 0.10557989031076431, + "learning_rate": 4.444887246935143e-05, + "loss": 0.0102, + "num_input_tokens_seen": 11303392, + "step": 53565 + }, + { + "epoch": 5.893289328932894, + "grad_norm": 1.6704808473587036, + "learning_rate": 4.44473643688809e-05, + "loss": 0.1245, + "num_input_tokens_seen": 11304416, + "step": 53570 + }, + { + "epoch": 5.893839383938394, + "grad_norm": 0.04218333587050438, + "learning_rate": 4.4445856089173346e-05, + "loss": 0.0065, + "num_input_tokens_seen": 11305472, + "step": 53575 + }, + { + "epoch": 5.894389438943895, + "grad_norm": 0.02275937795639038, + "learning_rate": 4.444434763024267e-05, + "loss": 0.0349, + "num_input_tokens_seen": 11306496, + "step": 53580 + }, + { + "epoch": 5.894939493949395, + "grad_norm": 0.01842437870800495, + "learning_rate": 4.4442838992102774e-05, + "loss": 0.0075, + "num_input_tokens_seen": 11307552, + "step": 53585 + }, + { + "epoch": 5.895489548954895, + "grad_norm": 0.25183793902397156, + "learning_rate": 4.444133017476756e-05, + "loss": 0.0486, + "num_input_tokens_seen": 11308640, + "step": 53590 + }, + { + "epoch": 5.896039603960396, + "grad_norm": 0.06447602808475494, + "learning_rate": 4.4439821178250926e-05, + "loss": 0.0139, + "num_input_tokens_seen": 11309664, + "step": 53595 + }, + { + "epoch": 5.896589658965897, + "grad_norm": 0.0049549005925655365, + "learning_rate": 4.44383120025668e-05, + "loss": 0.0448, + "num_input_tokens_seen": 11310688, + "step": 53600 + }, + { + "epoch": 5.897139713971397, + "grad_norm": 0.12379700690507889, + "learning_rate": 4.443680264772908e-05, + "loss": 0.0164, + "num_input_tokens_seen": 11311744, + "step": 53605 + }, + { + "epoch": 5.897689768976898, + "grad_norm": 0.08734045922756195, + "learning_rate": 4.443529311375167e-05, + "loss": 0.0151, + "num_input_tokens_seen": 11312768, + "step": 53610 + }, + { + "epoch": 5.898239823982398, + "grad_norm": 0.21830542385578156, + "learning_rate": 4.443378340064849e-05, + "loss": 0.1017, + "num_input_tokens_seen": 11313920, + "step": 53615 + }, + { + "epoch": 5.898789878987898, + "grad_norm": 0.43554776906967163, + "learning_rate": 4.4432273508433455e-05, + "loss": 0.0454, + "num_input_tokens_seen": 11314944, + "step": 53620 + }, + { + "epoch": 5.899339933993399, + "grad_norm": 0.1066846251487732, + "learning_rate": 4.443076343712048e-05, + "loss": 0.0439, + "num_input_tokens_seen": 11316064, + "step": 53625 + }, + { + "epoch": 5.8998899889989, + "grad_norm": 0.6422216296195984, + "learning_rate": 4.442925318672348e-05, + "loss": 0.0463, + "num_input_tokens_seen": 11317120, + "step": 53630 + }, + { + "epoch": 5.900440044004401, + "grad_norm": 0.41400378942489624, + "learning_rate": 4.442774275725638e-05, + "loss": 0.0411, + "num_input_tokens_seen": 11318208, + "step": 53635 + }, + { + "epoch": 5.900990099009901, + "grad_norm": 0.08144836872816086, + "learning_rate": 4.4426232148733096e-05, + "loss": 0.015, + "num_input_tokens_seen": 11319328, + "step": 53640 + }, + { + "epoch": 5.901540154015402, + "grad_norm": 0.8725878596305847, + "learning_rate": 4.4424721361167557e-05, + "loss": 0.0383, + "num_input_tokens_seen": 11320384, + "step": 53645 + }, + { + "epoch": 5.902090209020902, + "grad_norm": 0.700312077999115, + "learning_rate": 4.442321039457367e-05, + "loss": 0.0247, + "num_input_tokens_seen": 11321440, + "step": 53650 + }, + { + "epoch": 5.902640264026402, + "grad_norm": 0.028254704549908638, + "learning_rate": 4.4421699248965385e-05, + "loss": 0.0126, + "num_input_tokens_seen": 11322464, + "step": 53655 + }, + { + "epoch": 5.9031903190319035, + "grad_norm": 0.13102729618549347, + "learning_rate": 4.44201879243566e-05, + "loss": 0.0227, + "num_input_tokens_seen": 11323488, + "step": 53660 + }, + { + "epoch": 5.903740374037404, + "grad_norm": 0.5406389832496643, + "learning_rate": 4.441867642076128e-05, + "loss": 0.0446, + "num_input_tokens_seen": 11324512, + "step": 53665 + }, + { + "epoch": 5.904290429042904, + "grad_norm": 0.046591803431510925, + "learning_rate": 4.441716473819333e-05, + "loss": 0.0504, + "num_input_tokens_seen": 11325536, + "step": 53670 + }, + { + "epoch": 5.904840484048405, + "grad_norm": 0.2996572256088257, + "learning_rate": 4.441565287666669e-05, + "loss": 0.0569, + "num_input_tokens_seen": 11326592, + "step": 53675 + }, + { + "epoch": 5.905390539053905, + "grad_norm": 0.015488999895751476, + "learning_rate": 4.44141408361953e-05, + "loss": 0.0065, + "num_input_tokens_seen": 11327648, + "step": 53680 + }, + { + "epoch": 5.905940594059406, + "grad_norm": 1.358768343925476, + "learning_rate": 4.441262861679308e-05, + "loss": 0.0909, + "num_input_tokens_seen": 11328672, + "step": 53685 + }, + { + "epoch": 5.9064906490649065, + "grad_norm": 0.40087389945983887, + "learning_rate": 4.441111621847398e-05, + "loss": 0.0083, + "num_input_tokens_seen": 11329664, + "step": 53690 + }, + { + "epoch": 5.907040704070407, + "grad_norm": 0.8546962738037109, + "learning_rate": 4.440960364125194e-05, + "loss": 0.0395, + "num_input_tokens_seen": 11330688, + "step": 53695 + }, + { + "epoch": 5.907590759075908, + "grad_norm": 0.034442853182554245, + "learning_rate": 4.44080908851409e-05, + "loss": 0.0699, + "num_input_tokens_seen": 11331808, + "step": 53700 + }, + { + "epoch": 5.908140814081408, + "grad_norm": 1.8225115537643433, + "learning_rate": 4.440657795015479e-05, + "loss": 0.2498, + "num_input_tokens_seen": 11332832, + "step": 53705 + }, + { + "epoch": 5.908690869086909, + "grad_norm": 0.1772751659154892, + "learning_rate": 4.4405064836307566e-05, + "loss": 0.0488, + "num_input_tokens_seen": 11333856, + "step": 53710 + }, + { + "epoch": 5.909240924092409, + "grad_norm": 0.33282431960105896, + "learning_rate": 4.440355154361318e-05, + "loss": 0.06, + "num_input_tokens_seen": 11334912, + "step": 53715 + }, + { + "epoch": 5.9097909790979095, + "grad_norm": 0.9975900053977966, + "learning_rate": 4.4402038072085557e-05, + "loss": 0.0601, + "num_input_tokens_seen": 11336032, + "step": 53720 + }, + { + "epoch": 5.910341034103411, + "grad_norm": 0.06732148677110672, + "learning_rate": 4.440052442173866e-05, + "loss": 0.0163, + "num_input_tokens_seen": 11337120, + "step": 53725 + }, + { + "epoch": 5.910891089108911, + "grad_norm": 0.9431760907173157, + "learning_rate": 4.4399010592586446e-05, + "loss": 0.118, + "num_input_tokens_seen": 11338176, + "step": 53730 + }, + { + "epoch": 5.911441144114411, + "grad_norm": 0.039976272732019424, + "learning_rate": 4.439749658464286e-05, + "loss": 0.0621, + "num_input_tokens_seen": 11339232, + "step": 53735 + }, + { + "epoch": 5.911991199119912, + "grad_norm": 0.5959933996200562, + "learning_rate": 4.439598239792185e-05, + "loss": 0.0496, + "num_input_tokens_seen": 11340256, + "step": 53740 + }, + { + "epoch": 5.912541254125412, + "grad_norm": 0.9744930267333984, + "learning_rate": 4.439446803243738e-05, + "loss": 0.0529, + "num_input_tokens_seen": 11341312, + "step": 53745 + }, + { + "epoch": 5.913091309130913, + "grad_norm": 0.36430028080940247, + "learning_rate": 4.43929534882034e-05, + "loss": 0.0397, + "num_input_tokens_seen": 11342368, + "step": 53750 + }, + { + "epoch": 5.913641364136414, + "grad_norm": 0.07217764109373093, + "learning_rate": 4.4391438765233885e-05, + "loss": 0.0161, + "num_input_tokens_seen": 11343488, + "step": 53755 + }, + { + "epoch": 5.914191419141914, + "grad_norm": 2.049481153488159, + "learning_rate": 4.4389923863542774e-05, + "loss": 0.0654, + "num_input_tokens_seen": 11344544, + "step": 53760 + }, + { + "epoch": 5.914741474147415, + "grad_norm": 0.019167939200997353, + "learning_rate": 4.438840878314404e-05, + "loss": 0.0661, + "num_input_tokens_seen": 11345568, + "step": 53765 + }, + { + "epoch": 5.915291529152915, + "grad_norm": 0.21490195393562317, + "learning_rate": 4.4386893524051655e-05, + "loss": 0.0131, + "num_input_tokens_seen": 11346592, + "step": 53770 + }, + { + "epoch": 5.915841584158416, + "grad_norm": 0.007392429281026125, + "learning_rate": 4.438537808627956e-05, + "loss": 0.027, + "num_input_tokens_seen": 11347648, + "step": 53775 + }, + { + "epoch": 5.916391639163916, + "grad_norm": 0.47710666060447693, + "learning_rate": 4.4383862469841745e-05, + "loss": 0.0942, + "num_input_tokens_seen": 11348736, + "step": 53780 + }, + { + "epoch": 5.916941694169417, + "grad_norm": 0.016167467460036278, + "learning_rate": 4.4382346674752175e-05, + "loss": 0.0341, + "num_input_tokens_seen": 11349856, + "step": 53785 + }, + { + "epoch": 5.917491749174918, + "grad_norm": 0.04722735658288002, + "learning_rate": 4.438083070102481e-05, + "loss": 0.0317, + "num_input_tokens_seen": 11350880, + "step": 53790 + }, + { + "epoch": 5.918041804180418, + "grad_norm": 0.1520097255706787, + "learning_rate": 4.437931454867364e-05, + "loss": 0.0106, + "num_input_tokens_seen": 11352032, + "step": 53795 + }, + { + "epoch": 5.918591859185918, + "grad_norm": 0.6717469096183777, + "learning_rate": 4.437779821771261e-05, + "loss": 0.0323, + "num_input_tokens_seen": 11353088, + "step": 53800 + }, + { + "epoch": 5.919141914191419, + "grad_norm": 0.2318495362997055, + "learning_rate": 4.4376281708155714e-05, + "loss": 0.011, + "num_input_tokens_seen": 11354176, + "step": 53805 + }, + { + "epoch": 5.919691969196919, + "grad_norm": 1.5524656772613525, + "learning_rate": 4.437476502001693e-05, + "loss": 0.1136, + "num_input_tokens_seen": 11355328, + "step": 53810 + }, + { + "epoch": 5.9202420242024205, + "grad_norm": 0.6528134942054749, + "learning_rate": 4.437324815331024e-05, + "loss": 0.1548, + "num_input_tokens_seen": 11356352, + "step": 53815 + }, + { + "epoch": 5.920792079207921, + "grad_norm": 0.45243850350379944, + "learning_rate": 4.4371731108049615e-05, + "loss": 0.0817, + "num_input_tokens_seen": 11357440, + "step": 53820 + }, + { + "epoch": 5.921342134213422, + "grad_norm": 0.008574608713388443, + "learning_rate": 4.4370213884249034e-05, + "loss": 0.033, + "num_input_tokens_seen": 11358432, + "step": 53825 + }, + { + "epoch": 5.921892189218922, + "grad_norm": 0.03498778119683266, + "learning_rate": 4.4368696481922495e-05, + "loss": 0.0873, + "num_input_tokens_seen": 11359424, + "step": 53830 + }, + { + "epoch": 5.922442244224422, + "grad_norm": 0.622248113155365, + "learning_rate": 4.4367178901083974e-05, + "loss": 0.1713, + "num_input_tokens_seen": 11360480, + "step": 53835 + }, + { + "epoch": 5.922992299229923, + "grad_norm": 0.5178218483924866, + "learning_rate": 4.436566114174745e-05, + "loss": 0.0149, + "num_input_tokens_seen": 11361536, + "step": 53840 + }, + { + "epoch": 5.9235423542354235, + "grad_norm": 0.09772950410842896, + "learning_rate": 4.4364143203926925e-05, + "loss": 0.076, + "num_input_tokens_seen": 11362560, + "step": 53845 + }, + { + "epoch": 5.924092409240924, + "grad_norm": 0.05214519798755646, + "learning_rate": 4.436262508763638e-05, + "loss": 0.0216, + "num_input_tokens_seen": 11363616, + "step": 53850 + }, + { + "epoch": 5.924642464246425, + "grad_norm": 0.06860215216875076, + "learning_rate": 4.436110679288982e-05, + "loss": 0.0152, + "num_input_tokens_seen": 11364672, + "step": 53855 + }, + { + "epoch": 5.925192519251925, + "grad_norm": 0.11752599477767944, + "learning_rate": 4.4359588319701224e-05, + "loss": 0.0113, + "num_input_tokens_seen": 11365728, + "step": 53860 + }, + { + "epoch": 5.925742574257426, + "grad_norm": 0.14424319565296173, + "learning_rate": 4.435806966808459e-05, + "loss": 0.0107, + "num_input_tokens_seen": 11366784, + "step": 53865 + }, + { + "epoch": 5.926292629262926, + "grad_norm": 0.45424365997314453, + "learning_rate": 4.435655083805392e-05, + "loss": 0.0313, + "num_input_tokens_seen": 11367840, + "step": 53870 + }, + { + "epoch": 5.9268426842684265, + "grad_norm": 0.06551156938076019, + "learning_rate": 4.4355031829623204e-05, + "loss": 0.0131, + "num_input_tokens_seen": 11368896, + "step": 53875 + }, + { + "epoch": 5.927392739273928, + "grad_norm": 0.03625597059726715, + "learning_rate": 4.4353512642806454e-05, + "loss": 0.0115, + "num_input_tokens_seen": 11369952, + "step": 53880 + }, + { + "epoch": 5.927942794279428, + "grad_norm": 0.05273232236504555, + "learning_rate": 4.435199327761766e-05, + "loss": 0.0163, + "num_input_tokens_seen": 11371008, + "step": 53885 + }, + { + "epoch": 5.928492849284929, + "grad_norm": 1.2350740432739258, + "learning_rate": 4.435047373407083e-05, + "loss": 0.093, + "num_input_tokens_seen": 11372096, + "step": 53890 + }, + { + "epoch": 5.929042904290429, + "grad_norm": 0.01152576319873333, + "learning_rate": 4.4348954012179974e-05, + "loss": 0.0164, + "num_input_tokens_seen": 11373120, + "step": 53895 + }, + { + "epoch": 5.929592959295929, + "grad_norm": 0.4030436873435974, + "learning_rate": 4.434743411195909e-05, + "loss": 0.0636, + "num_input_tokens_seen": 11374208, + "step": 53900 + }, + { + "epoch": 5.93014301430143, + "grad_norm": 1.400780439376831, + "learning_rate": 4.434591403342219e-05, + "loss": 0.1266, + "num_input_tokens_seen": 11375200, + "step": 53905 + }, + { + "epoch": 5.930693069306931, + "grad_norm": 0.027011431753635406, + "learning_rate": 4.434439377658328e-05, + "loss": 0.0139, + "num_input_tokens_seen": 11376224, + "step": 53910 + }, + { + "epoch": 5.931243124312431, + "grad_norm": 0.03794429078698158, + "learning_rate": 4.4342873341456374e-05, + "loss": 0.0141, + "num_input_tokens_seen": 11377248, + "step": 53915 + }, + { + "epoch": 5.931793179317932, + "grad_norm": 0.40521520376205444, + "learning_rate": 4.43413527280555e-05, + "loss": 0.1045, + "num_input_tokens_seen": 11378272, + "step": 53920 + }, + { + "epoch": 5.932343234323432, + "grad_norm": 0.05517088249325752, + "learning_rate": 4.4339831936394635e-05, + "loss": 0.0067, + "num_input_tokens_seen": 11379296, + "step": 53925 + }, + { + "epoch": 5.932893289328933, + "grad_norm": 0.5244098901748657, + "learning_rate": 4.433831096648784e-05, + "loss": 0.0233, + "num_input_tokens_seen": 11380352, + "step": 53930 + }, + { + "epoch": 5.933443344334433, + "grad_norm": 0.3162131607532501, + "learning_rate": 4.43367898183491e-05, + "loss": 0.0439, + "num_input_tokens_seen": 11381376, + "step": 53935 + }, + { + "epoch": 5.933993399339934, + "grad_norm": 0.02967642992734909, + "learning_rate": 4.4335268491992455e-05, + "loss": 0.0188, + "num_input_tokens_seen": 11382496, + "step": 53940 + }, + { + "epoch": 5.934543454345435, + "grad_norm": 0.1439116895198822, + "learning_rate": 4.4333746987431916e-05, + "loss": 0.0682, + "num_input_tokens_seen": 11383616, + "step": 53945 + }, + { + "epoch": 5.935093509350935, + "grad_norm": 0.5394545793533325, + "learning_rate": 4.433222530468151e-05, + "loss": 0.0197, + "num_input_tokens_seen": 11384608, + "step": 53950 + }, + { + "epoch": 5.935643564356436, + "grad_norm": 0.1910572052001953, + "learning_rate": 4.433070344375525e-05, + "loss": 0.0126, + "num_input_tokens_seen": 11385632, + "step": 53955 + }, + { + "epoch": 5.936193619361936, + "grad_norm": 0.043346062302589417, + "learning_rate": 4.432918140466718e-05, + "loss": 0.0359, + "num_input_tokens_seen": 11386720, + "step": 53960 + }, + { + "epoch": 5.936743674367436, + "grad_norm": 0.07738024741411209, + "learning_rate": 4.432765918743132e-05, + "loss": 0.0122, + "num_input_tokens_seen": 11387840, + "step": 53965 + }, + { + "epoch": 5.9372937293729375, + "grad_norm": 1.1230475902557373, + "learning_rate": 4.43261367920617e-05, + "loss": 0.1203, + "num_input_tokens_seen": 11388864, + "step": 53970 + }, + { + "epoch": 5.937843784378438, + "grad_norm": 0.05512990057468414, + "learning_rate": 4.432461421857235e-05, + "loss": 0.0554, + "num_input_tokens_seen": 11389856, + "step": 53975 + }, + { + "epoch": 5.938393839383938, + "grad_norm": 0.6212026476860046, + "learning_rate": 4.43230914669773e-05, + "loss": 0.0666, + "num_input_tokens_seen": 11390976, + "step": 53980 + }, + { + "epoch": 5.938943894389439, + "grad_norm": 0.219634011387825, + "learning_rate": 4.4321568537290595e-05, + "loss": 0.0184, + "num_input_tokens_seen": 11392032, + "step": 53985 + }, + { + "epoch": 5.939493949394939, + "grad_norm": 0.15765100717544556, + "learning_rate": 4.432004542952626e-05, + "loss": 0.0163, + "num_input_tokens_seen": 11393088, + "step": 53990 + }, + { + "epoch": 5.94004400440044, + "grad_norm": 0.1426728069782257, + "learning_rate": 4.431852214369834e-05, + "loss": 0.1146, + "num_input_tokens_seen": 11394080, + "step": 53995 + }, + { + "epoch": 5.9405940594059405, + "grad_norm": 0.05174649879336357, + "learning_rate": 4.431699867982086e-05, + "loss": 0.018, + "num_input_tokens_seen": 11395168, + "step": 54000 + }, + { + "epoch": 5.941144114411442, + "grad_norm": 0.10668136179447174, + "learning_rate": 4.431547503790788e-05, + "loss": 0.0344, + "num_input_tokens_seen": 11396192, + "step": 54005 + }, + { + "epoch": 5.941694169416942, + "grad_norm": 1.0843826532363892, + "learning_rate": 4.4313951217973435e-05, + "loss": 0.0756, + "num_input_tokens_seen": 11397216, + "step": 54010 + }, + { + "epoch": 5.942244224422442, + "grad_norm": 0.2517724931240082, + "learning_rate": 4.431242722003158e-05, + "loss": 0.0073, + "num_input_tokens_seen": 11398240, + "step": 54015 + }, + { + "epoch": 5.942794279427943, + "grad_norm": 0.09437713027000427, + "learning_rate": 4.431090304409634e-05, + "loss": 0.0501, + "num_input_tokens_seen": 11399296, + "step": 54020 + }, + { + "epoch": 5.943344334433443, + "grad_norm": 0.15817049145698547, + "learning_rate": 4.430937869018177e-05, + "loss": 0.0096, + "num_input_tokens_seen": 11400352, + "step": 54025 + }, + { + "epoch": 5.9438943894389435, + "grad_norm": 0.5654287934303284, + "learning_rate": 4.4307854158301924e-05, + "loss": 0.0323, + "num_input_tokens_seen": 11401376, + "step": 54030 + }, + { + "epoch": 5.944444444444445, + "grad_norm": 0.03885548561811447, + "learning_rate": 4.430632944847085e-05, + "loss": 0.0497, + "num_input_tokens_seen": 11402400, + "step": 54035 + }, + { + "epoch": 5.944994499449945, + "grad_norm": 0.9113610982894897, + "learning_rate": 4.430480456070261e-05, + "loss": 0.0686, + "num_input_tokens_seen": 11403424, + "step": 54040 + }, + { + "epoch": 5.945544554455445, + "grad_norm": 0.014203423634171486, + "learning_rate": 4.430327949501124e-05, + "loss": 0.0158, + "num_input_tokens_seen": 11404512, + "step": 54045 + }, + { + "epoch": 5.946094609460946, + "grad_norm": 0.07739051431417465, + "learning_rate": 4.430175425141081e-05, + "loss": 0.0463, + "num_input_tokens_seen": 11405568, + "step": 54050 + }, + { + "epoch": 5.946644664466446, + "grad_norm": 0.007394266314804554, + "learning_rate": 4.430022882991538e-05, + "loss": 0.0299, + "num_input_tokens_seen": 11406656, + "step": 54055 + }, + { + "epoch": 5.947194719471947, + "grad_norm": 0.7824739217758179, + "learning_rate": 4.429870323053899e-05, + "loss": 0.0428, + "num_input_tokens_seen": 11407712, + "step": 54060 + }, + { + "epoch": 5.947744774477448, + "grad_norm": 0.043422918766736984, + "learning_rate": 4.429717745329571e-05, + "loss": 0.0628, + "num_input_tokens_seen": 11408704, + "step": 54065 + }, + { + "epoch": 5.948294829482949, + "grad_norm": 0.38380885124206543, + "learning_rate": 4.429565149819962e-05, + "loss": 0.0161, + "num_input_tokens_seen": 11409792, + "step": 54070 + }, + { + "epoch": 5.948844884488449, + "grad_norm": 0.02172851376235485, + "learning_rate": 4.429412536526476e-05, + "loss": 0.0194, + "num_input_tokens_seen": 11410880, + "step": 54075 + }, + { + "epoch": 5.949394939493949, + "grad_norm": 2.101905584335327, + "learning_rate": 4.429259905450521e-05, + "loss": 0.0763, + "num_input_tokens_seen": 11411904, + "step": 54080 + }, + { + "epoch": 5.94994499449945, + "grad_norm": 0.05756236985325813, + "learning_rate": 4.4291072565935025e-05, + "loss": 0.0246, + "num_input_tokens_seen": 11412960, + "step": 54085 + }, + { + "epoch": 5.9504950495049505, + "grad_norm": 0.0313696563243866, + "learning_rate": 4.428954589956829e-05, + "loss": 0.0643, + "num_input_tokens_seen": 11413984, + "step": 54090 + }, + { + "epoch": 5.951045104510451, + "grad_norm": 0.0241153035312891, + "learning_rate": 4.4288019055419054e-05, + "loss": 0.0233, + "num_input_tokens_seen": 11415072, + "step": 54095 + }, + { + "epoch": 5.951595159515952, + "grad_norm": 0.08524478226900101, + "learning_rate": 4.428649203350141e-05, + "loss": 0.0235, + "num_input_tokens_seen": 11416096, + "step": 54100 + }, + { + "epoch": 5.952145214521452, + "grad_norm": 1.9431012868881226, + "learning_rate": 4.428496483382942e-05, + "loss": 0.0703, + "num_input_tokens_seen": 11417184, + "step": 54105 + }, + { + "epoch": 5.952695269526953, + "grad_norm": 0.40072566270828247, + "learning_rate": 4.428343745641716e-05, + "loss": 0.0193, + "num_input_tokens_seen": 11418272, + "step": 54110 + }, + { + "epoch": 5.953245324532453, + "grad_norm": 0.2175356149673462, + "learning_rate": 4.428190990127872e-05, + "loss": 0.0811, + "num_input_tokens_seen": 11419264, + "step": 54115 + }, + { + "epoch": 5.9537953795379535, + "grad_norm": 0.3326784074306488, + "learning_rate": 4.4280382168428166e-05, + "loss": 0.0703, + "num_input_tokens_seen": 11420288, + "step": 54120 + }, + { + "epoch": 5.9543454345434546, + "grad_norm": 1.3421576023101807, + "learning_rate": 4.4278854257879575e-05, + "loss": 0.1222, + "num_input_tokens_seen": 11421344, + "step": 54125 + }, + { + "epoch": 5.954895489548955, + "grad_norm": 0.015022112056612968, + "learning_rate": 4.427732616964704e-05, + "loss": 0.018, + "num_input_tokens_seen": 11422336, + "step": 54130 + }, + { + "epoch": 5.955445544554456, + "grad_norm": 0.06879459321498871, + "learning_rate": 4.427579790374464e-05, + "loss": 0.0378, + "num_input_tokens_seen": 11423456, + "step": 54135 + }, + { + "epoch": 5.955995599559956, + "grad_norm": 1.3547629117965698, + "learning_rate": 4.427426946018646e-05, + "loss": 0.1558, + "num_input_tokens_seen": 11424512, + "step": 54140 + }, + { + "epoch": 5.956545654565456, + "grad_norm": 1.7807034254074097, + "learning_rate": 4.4272740838986585e-05, + "loss": 0.1543, + "num_input_tokens_seen": 11425632, + "step": 54145 + }, + { + "epoch": 5.957095709570957, + "grad_norm": 0.03686071187257767, + "learning_rate": 4.42712120401591e-05, + "loss": 0.0118, + "num_input_tokens_seen": 11426752, + "step": 54150 + }, + { + "epoch": 5.957645764576458, + "grad_norm": 1.2454394102096558, + "learning_rate": 4.4269683063718115e-05, + "loss": 0.0317, + "num_input_tokens_seen": 11427808, + "step": 54155 + }, + { + "epoch": 5.958195819581958, + "grad_norm": 0.10276006162166595, + "learning_rate": 4.4268153909677704e-05, + "loss": 0.0191, + "num_input_tokens_seen": 11428832, + "step": 54160 + }, + { + "epoch": 5.958745874587459, + "grad_norm": 0.3471924960613251, + "learning_rate": 4.426662457805195e-05, + "loss": 0.1083, + "num_input_tokens_seen": 11429888, + "step": 54165 + }, + { + "epoch": 5.959295929592959, + "grad_norm": 0.10642819851636887, + "learning_rate": 4.426509506885498e-05, + "loss": 0.1463, + "num_input_tokens_seen": 11430912, + "step": 54170 + }, + { + "epoch": 5.95984598459846, + "grad_norm": 1.2923147678375244, + "learning_rate": 4.426356538210087e-05, + "loss": 0.0413, + "num_input_tokens_seen": 11432032, + "step": 54175 + }, + { + "epoch": 5.96039603960396, + "grad_norm": 0.9503365159034729, + "learning_rate": 4.4262035517803716e-05, + "loss": 0.1475, + "num_input_tokens_seen": 11433088, + "step": 54180 + }, + { + "epoch": 5.960946094609461, + "grad_norm": 0.07636647671461105, + "learning_rate": 4.426050547597762e-05, + "loss": 0.035, + "num_input_tokens_seen": 11434240, + "step": 54185 + }, + { + "epoch": 5.961496149614962, + "grad_norm": 0.11627925932407379, + "learning_rate": 4.4258975256636694e-05, + "loss": 0.0821, + "num_input_tokens_seen": 11435296, + "step": 54190 + }, + { + "epoch": 5.962046204620462, + "grad_norm": 0.057897090911865234, + "learning_rate": 4.4257444859795024e-05, + "loss": 0.0126, + "num_input_tokens_seen": 11436320, + "step": 54195 + }, + { + "epoch": 5.962596259625963, + "grad_norm": 0.013826717622578144, + "learning_rate": 4.4255914285466735e-05, + "loss": 0.0251, + "num_input_tokens_seen": 11437408, + "step": 54200 + }, + { + "epoch": 5.963146314631463, + "grad_norm": 0.10109829902648926, + "learning_rate": 4.425438353366593e-05, + "loss": 0.0336, + "num_input_tokens_seen": 11438432, + "step": 54205 + }, + { + "epoch": 5.963696369636963, + "grad_norm": 0.022630639374256134, + "learning_rate": 4.42528526044067e-05, + "loss": 0.0114, + "num_input_tokens_seen": 11439456, + "step": 54210 + }, + { + "epoch": 5.9642464246424645, + "grad_norm": 0.2283531278371811, + "learning_rate": 4.425132149770317e-05, + "loss": 0.0463, + "num_input_tokens_seen": 11440544, + "step": 54215 + }, + { + "epoch": 5.964796479647965, + "grad_norm": 1.7133625745773315, + "learning_rate": 4.424979021356944e-05, + "loss": 0.1784, + "num_input_tokens_seen": 11441568, + "step": 54220 + }, + { + "epoch": 5.965346534653465, + "grad_norm": 0.007818065583705902, + "learning_rate": 4.424825875201964e-05, + "loss": 0.0056, + "num_input_tokens_seen": 11442688, + "step": 54225 + }, + { + "epoch": 5.965896589658966, + "grad_norm": 0.15312053263187408, + "learning_rate": 4.424672711306788e-05, + "loss": 0.0106, + "num_input_tokens_seen": 11443904, + "step": 54230 + }, + { + "epoch": 5.966446644664466, + "grad_norm": 0.01906793750822544, + "learning_rate": 4.424519529672826e-05, + "loss": 0.0383, + "num_input_tokens_seen": 11444992, + "step": 54235 + }, + { + "epoch": 5.966996699669967, + "grad_norm": 1.8257980346679688, + "learning_rate": 4.4243663303014914e-05, + "loss": 0.0689, + "num_input_tokens_seen": 11446176, + "step": 54240 + }, + { + "epoch": 5.9675467546754675, + "grad_norm": 0.41713935136795044, + "learning_rate": 4.4242131131941956e-05, + "loss": 0.0296, + "num_input_tokens_seen": 11447200, + "step": 54245 + }, + { + "epoch": 5.968096809680969, + "grad_norm": 1.8979023694992065, + "learning_rate": 4.4240598783523514e-05, + "loss": 0.1082, + "num_input_tokens_seen": 11448256, + "step": 54250 + }, + { + "epoch": 5.968646864686469, + "grad_norm": 0.05902313068509102, + "learning_rate": 4.4239066257773696e-05, + "loss": 0.0051, + "num_input_tokens_seen": 11449312, + "step": 54255 + }, + { + "epoch": 5.969196919691969, + "grad_norm": 0.6766303777694702, + "learning_rate": 4.4237533554706646e-05, + "loss": 0.0349, + "num_input_tokens_seen": 11450432, + "step": 54260 + }, + { + "epoch": 5.96974697469747, + "grad_norm": 0.09593695402145386, + "learning_rate": 4.4236000674336476e-05, + "loss": 0.0142, + "num_input_tokens_seen": 11451488, + "step": 54265 + }, + { + "epoch": 5.97029702970297, + "grad_norm": 0.005084708333015442, + "learning_rate": 4.4234467616677324e-05, + "loss": 0.0403, + "num_input_tokens_seen": 11452544, + "step": 54270 + }, + { + "epoch": 5.9708470847084705, + "grad_norm": 0.019659671932458878, + "learning_rate": 4.42329343817433e-05, + "loss": 0.0785, + "num_input_tokens_seen": 11453536, + "step": 54275 + }, + { + "epoch": 5.971397139713972, + "grad_norm": 0.4544428288936615, + "learning_rate": 4.4231400969548556e-05, + "loss": 0.0152, + "num_input_tokens_seen": 11454560, + "step": 54280 + }, + { + "epoch": 5.971947194719472, + "grad_norm": 0.11959110200405121, + "learning_rate": 4.422986738010722e-05, + "loss": 0.0663, + "num_input_tokens_seen": 11455648, + "step": 54285 + }, + { + "epoch": 5.972497249724973, + "grad_norm": 0.9291970133781433, + "learning_rate": 4.422833361343342e-05, + "loss": 0.1369, + "num_input_tokens_seen": 11456608, + "step": 54290 + }, + { + "epoch": 5.973047304730473, + "grad_norm": 0.11636776477098465, + "learning_rate": 4.42267996695413e-05, + "loss": 0.0382, + "num_input_tokens_seen": 11457664, + "step": 54295 + }, + { + "epoch": 5.973597359735973, + "grad_norm": 0.03636223450303078, + "learning_rate": 4.4225265548444986e-05, + "loss": 0.0686, + "num_input_tokens_seen": 11458784, + "step": 54300 + }, + { + "epoch": 5.974147414741474, + "grad_norm": 0.3789677023887634, + "learning_rate": 4.422373125015863e-05, + "loss": 0.0452, + "num_input_tokens_seen": 11459776, + "step": 54305 + }, + { + "epoch": 5.974697469746975, + "grad_norm": 0.013317340984940529, + "learning_rate": 4.422219677469637e-05, + "loss": 0.0528, + "num_input_tokens_seen": 11460768, + "step": 54310 + }, + { + "epoch": 5.975247524752476, + "grad_norm": 0.420193612575531, + "learning_rate": 4.422066212207234e-05, + "loss": 0.0508, + "num_input_tokens_seen": 11461792, + "step": 54315 + }, + { + "epoch": 5.975797579757976, + "grad_norm": 1.2346227169036865, + "learning_rate": 4.42191272923007e-05, + "loss": 0.1759, + "num_input_tokens_seen": 11462784, + "step": 54320 + }, + { + "epoch": 5.976347634763476, + "grad_norm": 0.6728438138961792, + "learning_rate": 4.421759228539557e-05, + "loss": 0.0922, + "num_input_tokens_seen": 11463840, + "step": 54325 + }, + { + "epoch": 5.976897689768977, + "grad_norm": 1.917376160621643, + "learning_rate": 4.421605710137112e-05, + "loss": 0.0425, + "num_input_tokens_seen": 11464864, + "step": 54330 + }, + { + "epoch": 5.977447744774477, + "grad_norm": 0.03341682627797127, + "learning_rate": 4.42145217402415e-05, + "loss": 0.0758, + "num_input_tokens_seen": 11465888, + "step": 54335 + }, + { + "epoch": 5.977997799779978, + "grad_norm": 1.0110679864883423, + "learning_rate": 4.421298620202085e-05, + "loss": 0.0921, + "num_input_tokens_seen": 11466912, + "step": 54340 + }, + { + "epoch": 5.978547854785479, + "grad_norm": 0.03052777610719204, + "learning_rate": 4.4211450486723324e-05, + "loss": 0.0314, + "num_input_tokens_seen": 11468000, + "step": 54345 + }, + { + "epoch": 5.979097909790979, + "grad_norm": 0.02941039390861988, + "learning_rate": 4.420991459436308e-05, + "loss": 0.0869, + "num_input_tokens_seen": 11469088, + "step": 54350 + }, + { + "epoch": 5.97964796479648, + "grad_norm": 0.01094045303761959, + "learning_rate": 4.420837852495426e-05, + "loss": 0.0742, + "num_input_tokens_seen": 11470144, + "step": 54355 + }, + { + "epoch": 5.98019801980198, + "grad_norm": 0.028741441667079926, + "learning_rate": 4.420684227851104e-05, + "loss": 0.0315, + "num_input_tokens_seen": 11471232, + "step": 54360 + }, + { + "epoch": 5.98074807480748, + "grad_norm": 0.46021735668182373, + "learning_rate": 4.4205305855047575e-05, + "loss": 0.1457, + "num_input_tokens_seen": 11472288, + "step": 54365 + }, + { + "epoch": 5.9812981298129815, + "grad_norm": 0.029558176174759865, + "learning_rate": 4.420376925457802e-05, + "loss": 0.0073, + "num_input_tokens_seen": 11473344, + "step": 54370 + }, + { + "epoch": 5.981848184818482, + "grad_norm": 0.2671298384666443, + "learning_rate": 4.420223247711653e-05, + "loss": 0.0083, + "num_input_tokens_seen": 11474400, + "step": 54375 + }, + { + "epoch": 5.982398239823983, + "grad_norm": 0.020237691700458527, + "learning_rate": 4.4200695522677284e-05, + "loss": 0.0494, + "num_input_tokens_seen": 11475456, + "step": 54380 + }, + { + "epoch": 5.982948294829483, + "grad_norm": 0.8756004571914673, + "learning_rate": 4.419915839127444e-05, + "loss": 0.0628, + "num_input_tokens_seen": 11476512, + "step": 54385 + }, + { + "epoch": 5.983498349834983, + "grad_norm": 0.006442302372306585, + "learning_rate": 4.419762108292217e-05, + "loss": 0.0111, + "num_input_tokens_seen": 11477600, + "step": 54390 + }, + { + "epoch": 5.984048404840484, + "grad_norm": 0.012749969027936459, + "learning_rate": 4.419608359763463e-05, + "loss": 0.0563, + "num_input_tokens_seen": 11478656, + "step": 54395 + }, + { + "epoch": 5.9845984598459845, + "grad_norm": 0.06232256814837456, + "learning_rate": 4.419454593542601e-05, + "loss": 0.0875, + "num_input_tokens_seen": 11479680, + "step": 54400 + }, + { + "epoch": 5.985148514851485, + "grad_norm": 0.07282990962266922, + "learning_rate": 4.419300809631046e-05, + "loss": 0.1491, + "num_input_tokens_seen": 11480704, + "step": 54405 + }, + { + "epoch": 5.985698569856986, + "grad_norm": 0.2505929172039032, + "learning_rate": 4.419147008030217e-05, + "loss": 0.0146, + "num_input_tokens_seen": 11481760, + "step": 54410 + }, + { + "epoch": 5.986248624862486, + "grad_norm": 0.043632492423057556, + "learning_rate": 4.4189931887415304e-05, + "loss": 0.011, + "num_input_tokens_seen": 11482848, + "step": 54415 + }, + { + "epoch": 5.986798679867987, + "grad_norm": 0.028728526085615158, + "learning_rate": 4.418839351766405e-05, + "loss": 0.0269, + "num_input_tokens_seen": 11483904, + "step": 54420 + }, + { + "epoch": 5.987348734873487, + "grad_norm": 0.13014467060565948, + "learning_rate": 4.4186854971062575e-05, + "loss": 0.0375, + "num_input_tokens_seen": 11484960, + "step": 54425 + }, + { + "epoch": 5.987898789878988, + "grad_norm": 0.5389717221260071, + "learning_rate": 4.418531624762507e-05, + "loss": 0.0685, + "num_input_tokens_seen": 11485984, + "step": 54430 + }, + { + "epoch": 5.988448844884489, + "grad_norm": 0.0728418305516243, + "learning_rate": 4.4183777347365704e-05, + "loss": 0.0981, + "num_input_tokens_seen": 11487072, + "step": 54435 + }, + { + "epoch": 5.988998899889989, + "grad_norm": 0.057428084313869476, + "learning_rate": 4.418223827029867e-05, + "loss": 0.0373, + "num_input_tokens_seen": 11488064, + "step": 54440 + }, + { + "epoch": 5.98954895489549, + "grad_norm": 0.1221223771572113, + "learning_rate": 4.418069901643815e-05, + "loss": 0.0236, + "num_input_tokens_seen": 11489184, + "step": 54445 + }, + { + "epoch": 5.99009900990099, + "grad_norm": 0.04154116287827492, + "learning_rate": 4.4179159585798344e-05, + "loss": 0.0061, + "num_input_tokens_seen": 11490208, + "step": 54450 + }, + { + "epoch": 5.99064906490649, + "grad_norm": 0.4986802935600281, + "learning_rate": 4.417761997839341e-05, + "loss": 0.0568, + "num_input_tokens_seen": 11491232, + "step": 54455 + }, + { + "epoch": 5.991199119911991, + "grad_norm": 0.10839992016553879, + "learning_rate": 4.417608019423756e-05, + "loss": 0.0074, + "num_input_tokens_seen": 11492320, + "step": 54460 + }, + { + "epoch": 5.991749174917492, + "grad_norm": 0.13074032962322235, + "learning_rate": 4.4174540233344985e-05, + "loss": 0.0235, + "num_input_tokens_seen": 11493376, + "step": 54465 + }, + { + "epoch": 5.992299229922993, + "grad_norm": 0.2017538994550705, + "learning_rate": 4.417300009572987e-05, + "loss": 0.0283, + "num_input_tokens_seen": 11494432, + "step": 54470 + }, + { + "epoch": 5.992849284928493, + "grad_norm": 1.1112489700317383, + "learning_rate": 4.417145978140641e-05, + "loss": 0.0875, + "num_input_tokens_seen": 11495456, + "step": 54475 + }, + { + "epoch": 5.993399339933993, + "grad_norm": 0.10646022856235504, + "learning_rate": 4.416991929038881e-05, + "loss": 0.0099, + "num_input_tokens_seen": 11496576, + "step": 54480 + }, + { + "epoch": 5.993949394939494, + "grad_norm": 0.027863293886184692, + "learning_rate": 4.4168378622691266e-05, + "loss": 0.0328, + "num_input_tokens_seen": 11497664, + "step": 54485 + }, + { + "epoch": 5.994499449944994, + "grad_norm": 0.2668932378292084, + "learning_rate": 4.4166837778327964e-05, + "loss": 0.0227, + "num_input_tokens_seen": 11498752, + "step": 54490 + }, + { + "epoch": 5.9950495049504955, + "grad_norm": 0.1172083169221878, + "learning_rate": 4.416529675731312e-05, + "loss": 0.1561, + "num_input_tokens_seen": 11499776, + "step": 54495 + }, + { + "epoch": 5.995599559955996, + "grad_norm": 1.0829733610153198, + "learning_rate": 4.4163755559660936e-05, + "loss": 0.0682, + "num_input_tokens_seen": 11500800, + "step": 54500 + }, + { + "epoch": 5.996149614961496, + "grad_norm": 0.7735000848770142, + "learning_rate": 4.41622141853856e-05, + "loss": 0.1347, + "num_input_tokens_seen": 11501856, + "step": 54505 + }, + { + "epoch": 5.996699669966997, + "grad_norm": 0.33718791604042053, + "learning_rate": 4.4160672634501345e-05, + "loss": 0.0344, + "num_input_tokens_seen": 11502880, + "step": 54510 + }, + { + "epoch": 5.997249724972497, + "grad_norm": 0.058946967124938965, + "learning_rate": 4.415913090702236e-05, + "loss": 0.0734, + "num_input_tokens_seen": 11503904, + "step": 54515 + }, + { + "epoch": 5.997799779977997, + "grad_norm": 0.32617905735969543, + "learning_rate": 4.4157589002962856e-05, + "loss": 0.1083, + "num_input_tokens_seen": 11504992, + "step": 54520 + }, + { + "epoch": 5.9983498349834985, + "grad_norm": 1.1143672466278076, + "learning_rate": 4.4156046922337044e-05, + "loss": 0.1102, + "num_input_tokens_seen": 11505984, + "step": 54525 + }, + { + "epoch": 5.998899889988999, + "grad_norm": 0.040845759212970734, + "learning_rate": 4.415450466515915e-05, + "loss": 0.0331, + "num_input_tokens_seen": 11507072, + "step": 54530 + }, + { + "epoch": 5.9994499449945, + "grad_norm": 0.06495033949613571, + "learning_rate": 4.4152962231443376e-05, + "loss": 0.0765, + "num_input_tokens_seen": 11508096, + "step": 54535 + }, + { + "epoch": 6.0, + "grad_norm": 0.029740938916802406, + "learning_rate": 4.4151419621203925e-05, + "loss": 0.1489, + "num_input_tokens_seen": 11509088, + "step": 54540 + }, + { + "epoch": 6.0, + "eval_loss": 0.062062107026576996, + "eval_runtime": 37.1446, + "eval_samples_per_second": 108.764, + "eval_steps_per_second": 27.191, + "num_input_tokens_seen": 11509088, + "step": 54540 + }, + { + "epoch": 6.0005500550055, + "grad_norm": 0.13785715401172638, + "learning_rate": 4.414987683445505e-05, + "loss": 0.0484, + "num_input_tokens_seen": 11510144, + "step": 54545 + }, + { + "epoch": 6.001100110011001, + "grad_norm": 0.07170896977186203, + "learning_rate": 4.4148333871210936e-05, + "loss": 0.0181, + "num_input_tokens_seen": 11511168, + "step": 54550 + }, + { + "epoch": 6.0016501650165015, + "grad_norm": 0.01422035600990057, + "learning_rate": 4.4146790731485823e-05, + "loss": 0.0418, + "num_input_tokens_seen": 11512224, + "step": 54555 + }, + { + "epoch": 6.002200220022003, + "grad_norm": 0.7871376872062683, + "learning_rate": 4.414524741529392e-05, + "loss": 0.0564, + "num_input_tokens_seen": 11513280, + "step": 54560 + }, + { + "epoch": 6.002750275027503, + "grad_norm": 0.4056171774864197, + "learning_rate": 4.414370392264946e-05, + "loss": 0.0293, + "num_input_tokens_seen": 11514240, + "step": 54565 + }, + { + "epoch": 6.003300330033003, + "grad_norm": 0.05520772933959961, + "learning_rate": 4.414216025356667e-05, + "loss": 0.0059, + "num_input_tokens_seen": 11515296, + "step": 54570 + }, + { + "epoch": 6.003850385038504, + "grad_norm": 0.1204751506447792, + "learning_rate": 4.4140616408059775e-05, + "loss": 0.0216, + "num_input_tokens_seen": 11516384, + "step": 54575 + }, + { + "epoch": 6.004400440044004, + "grad_norm": 0.31451526284217834, + "learning_rate": 4.4139072386143e-05, + "loss": 0.0347, + "num_input_tokens_seen": 11517504, + "step": 54580 + }, + { + "epoch": 6.0049504950495045, + "grad_norm": 0.10639440268278122, + "learning_rate": 4.413752818783058e-05, + "loss": 0.0537, + "num_input_tokens_seen": 11518464, + "step": 54585 + }, + { + "epoch": 6.005500550055006, + "grad_norm": 0.5800754427909851, + "learning_rate": 4.413598381313675e-05, + "loss": 0.0496, + "num_input_tokens_seen": 11519552, + "step": 54590 + }, + { + "epoch": 6.006050605060506, + "grad_norm": 0.014161994680762291, + "learning_rate": 4.413443926207573e-05, + "loss": 0.0564, + "num_input_tokens_seen": 11520544, + "step": 54595 + }, + { + "epoch": 6.006600660066007, + "grad_norm": 1.021996021270752, + "learning_rate": 4.4132894534661775e-05, + "loss": 0.0813, + "num_input_tokens_seen": 11521664, + "step": 54600 + }, + { + "epoch": 6.007150715071507, + "grad_norm": 0.10476619750261307, + "learning_rate": 4.4131349630909106e-05, + "loss": 0.022, + "num_input_tokens_seen": 11522752, + "step": 54605 + }, + { + "epoch": 6.007700770077007, + "grad_norm": 0.5542688965797424, + "learning_rate": 4.412980455083197e-05, + "loss": 0.0221, + "num_input_tokens_seen": 11523776, + "step": 54610 + }, + { + "epoch": 6.008250825082508, + "grad_norm": 0.782795250415802, + "learning_rate": 4.412825929444461e-05, + "loss": 0.0551, + "num_input_tokens_seen": 11524768, + "step": 54615 + }, + { + "epoch": 6.008800880088009, + "grad_norm": 0.028072696179151535, + "learning_rate": 4.4126713861761245e-05, + "loss": 0.0259, + "num_input_tokens_seen": 11525792, + "step": 54620 + }, + { + "epoch": 6.00935093509351, + "grad_norm": 0.06913650035858154, + "learning_rate": 4.412516825279616e-05, + "loss": 0.0163, + "num_input_tokens_seen": 11526848, + "step": 54625 + }, + { + "epoch": 6.00990099009901, + "grad_norm": 0.2846553921699524, + "learning_rate": 4.4123622467563555e-05, + "loss": 0.0132, + "num_input_tokens_seen": 11527840, + "step": 54630 + }, + { + "epoch": 6.01045104510451, + "grad_norm": 1.752039909362793, + "learning_rate": 4.412207650607771e-05, + "loss": 0.0962, + "num_input_tokens_seen": 11528928, + "step": 54635 + }, + { + "epoch": 6.011001100110011, + "grad_norm": 1.4085748195648193, + "learning_rate": 4.4120530368352864e-05, + "loss": 0.0275, + "num_input_tokens_seen": 11529952, + "step": 54640 + }, + { + "epoch": 6.011551155115511, + "grad_norm": 0.04488520696759224, + "learning_rate": 4.411898405440326e-05, + "loss": 0.0325, + "num_input_tokens_seen": 11530912, + "step": 54645 + }, + { + "epoch": 6.0121012101210125, + "grad_norm": 0.024351781234145164, + "learning_rate": 4.4117437564243144e-05, + "loss": 0.1133, + "num_input_tokens_seen": 11531936, + "step": 54650 + }, + { + "epoch": 6.012651265126513, + "grad_norm": 0.017436571419239044, + "learning_rate": 4.4115890897886794e-05, + "loss": 0.0737, + "num_input_tokens_seen": 11532992, + "step": 54655 + }, + { + "epoch": 6.013201320132013, + "grad_norm": 0.03780696168541908, + "learning_rate": 4.4114344055348445e-05, + "loss": 0.0406, + "num_input_tokens_seen": 11534080, + "step": 54660 + }, + { + "epoch": 6.013751375137514, + "grad_norm": 0.026766350492835045, + "learning_rate": 4.411279703664236e-05, + "loss": 0.0306, + "num_input_tokens_seen": 11535136, + "step": 54665 + }, + { + "epoch": 6.014301430143014, + "grad_norm": 0.02305961214005947, + "learning_rate": 4.41112498417828e-05, + "loss": 0.0173, + "num_input_tokens_seen": 11536192, + "step": 54670 + }, + { + "epoch": 6.014851485148514, + "grad_norm": 0.7397940158843994, + "learning_rate": 4.410970247078401e-05, + "loss": 0.1279, + "num_input_tokens_seen": 11537184, + "step": 54675 + }, + { + "epoch": 6.0154015401540155, + "grad_norm": 0.11477386951446533, + "learning_rate": 4.410815492366027e-05, + "loss": 0.0481, + "num_input_tokens_seen": 11538240, + "step": 54680 + }, + { + "epoch": 6.015951595159516, + "grad_norm": 0.0774892196059227, + "learning_rate": 4.410660720042583e-05, + "loss": 0.0328, + "num_input_tokens_seen": 11539296, + "step": 54685 + }, + { + "epoch": 6.016501650165017, + "grad_norm": 0.08335718512535095, + "learning_rate": 4.4105059301094965e-05, + "loss": 0.0258, + "num_input_tokens_seen": 11540480, + "step": 54690 + }, + { + "epoch": 6.017051705170517, + "grad_norm": 0.1334788203239441, + "learning_rate": 4.410351122568193e-05, + "loss": 0.0131, + "num_input_tokens_seen": 11541536, + "step": 54695 + }, + { + "epoch": 6.017601760176017, + "grad_norm": 0.4951491355895996, + "learning_rate": 4.410196297420101e-05, + "loss": 0.0292, + "num_input_tokens_seen": 11542528, + "step": 54700 + }, + { + "epoch": 6.018151815181518, + "grad_norm": 0.023663075640797615, + "learning_rate": 4.4100414546666445e-05, + "loss": 0.0356, + "num_input_tokens_seen": 11543584, + "step": 54705 + }, + { + "epoch": 6.0187018701870185, + "grad_norm": 0.7212491035461426, + "learning_rate": 4.409886594309254e-05, + "loss": 0.023, + "num_input_tokens_seen": 11544640, + "step": 54710 + }, + { + "epoch": 6.01925192519252, + "grad_norm": 0.14215879142284393, + "learning_rate": 4.409731716349354e-05, + "loss": 0.0104, + "num_input_tokens_seen": 11545696, + "step": 54715 + }, + { + "epoch": 6.01980198019802, + "grad_norm": 0.014218398369848728, + "learning_rate": 4.409576820788375e-05, + "loss": 0.1033, + "num_input_tokens_seen": 11546720, + "step": 54720 + }, + { + "epoch": 6.02035203520352, + "grad_norm": 0.10150349885225296, + "learning_rate": 4.409421907627741e-05, + "loss": 0.0388, + "num_input_tokens_seen": 11547776, + "step": 54725 + }, + { + "epoch": 6.020902090209021, + "grad_norm": 0.20449844002723694, + "learning_rate": 4.4092669768688824e-05, + "loss": 0.0258, + "num_input_tokens_seen": 11548832, + "step": 54730 + }, + { + "epoch": 6.021452145214521, + "grad_norm": 0.12895259261131287, + "learning_rate": 4.409112028513226e-05, + "loss": 0.0189, + "num_input_tokens_seen": 11549856, + "step": 54735 + }, + { + "epoch": 6.022002200220022, + "grad_norm": 0.2339332550764084, + "learning_rate": 4.4089570625622e-05, + "loss": 0.0499, + "num_input_tokens_seen": 11550912, + "step": 54740 + }, + { + "epoch": 6.022552255225523, + "grad_norm": 0.3342990279197693, + "learning_rate": 4.408802079017233e-05, + "loss": 0.078, + "num_input_tokens_seen": 11551904, + "step": 54745 + }, + { + "epoch": 6.023102310231023, + "grad_norm": 2.4295806884765625, + "learning_rate": 4.408647077879753e-05, + "loss": 0.0854, + "num_input_tokens_seen": 11552896, + "step": 54750 + }, + { + "epoch": 6.023652365236524, + "grad_norm": 0.05107857286930084, + "learning_rate": 4.408492059151188e-05, + "loss": 0.0354, + "num_input_tokens_seen": 11553952, + "step": 54755 + }, + { + "epoch": 6.024202420242024, + "grad_norm": 0.20608417689800262, + "learning_rate": 4.408337022832969e-05, + "loss": 0.0132, + "num_input_tokens_seen": 11555008, + "step": 54760 + }, + { + "epoch": 6.024752475247524, + "grad_norm": 0.19065698981285095, + "learning_rate": 4.408181968926522e-05, + "loss": 0.0475, + "num_input_tokens_seen": 11556000, + "step": 54765 + }, + { + "epoch": 6.025302530253025, + "grad_norm": 0.8717526197433472, + "learning_rate": 4.408026897433278e-05, + "loss": 0.0733, + "num_input_tokens_seen": 11557088, + "step": 54770 + }, + { + "epoch": 6.025852585258526, + "grad_norm": 0.13957379758358002, + "learning_rate": 4.407871808354665e-05, + "loss": 0.0091, + "num_input_tokens_seen": 11558176, + "step": 54775 + }, + { + "epoch": 6.026402640264027, + "grad_norm": 0.09904313832521439, + "learning_rate": 4.4077167016921145e-05, + "loss": 0.0143, + "num_input_tokens_seen": 11559264, + "step": 54780 + }, + { + "epoch": 6.026952695269527, + "grad_norm": 0.019524121657013893, + "learning_rate": 4.407561577447054e-05, + "loss": 0.1361, + "num_input_tokens_seen": 11560320, + "step": 54785 + }, + { + "epoch": 6.027502750275027, + "grad_norm": 1.1803771257400513, + "learning_rate": 4.4074064356209135e-05, + "loss": 0.0528, + "num_input_tokens_seen": 11561376, + "step": 54790 + }, + { + "epoch": 6.028052805280528, + "grad_norm": 1.2005629539489746, + "learning_rate": 4.407251276215123e-05, + "loss": 0.1489, + "num_input_tokens_seen": 11562464, + "step": 54795 + }, + { + "epoch": 6.028602860286028, + "grad_norm": 0.11838283389806747, + "learning_rate": 4.407096099231113e-05, + "loss": 0.0847, + "num_input_tokens_seen": 11563520, + "step": 54800 + }, + { + "epoch": 6.0291529152915295, + "grad_norm": 0.6680228114128113, + "learning_rate": 4.406940904670313e-05, + "loss": 0.0675, + "num_input_tokens_seen": 11564544, + "step": 54805 + }, + { + "epoch": 6.02970297029703, + "grad_norm": 0.34721454977989197, + "learning_rate": 4.406785692534154e-05, + "loss": 0.0234, + "num_input_tokens_seen": 11565632, + "step": 54810 + }, + { + "epoch": 6.03025302530253, + "grad_norm": 0.19852057099342346, + "learning_rate": 4.4066304628240664e-05, + "loss": 0.031, + "num_input_tokens_seen": 11566688, + "step": 54815 + }, + { + "epoch": 6.030803080308031, + "grad_norm": 0.03464897722005844, + "learning_rate": 4.40647521554148e-05, + "loss": 0.0689, + "num_input_tokens_seen": 11567840, + "step": 54820 + }, + { + "epoch": 6.031353135313531, + "grad_norm": 0.7177191972732544, + "learning_rate": 4.406319950687827e-05, + "loss": 0.0618, + "num_input_tokens_seen": 11568832, + "step": 54825 + }, + { + "epoch": 6.031903190319032, + "grad_norm": 0.02805132418870926, + "learning_rate": 4.406164668264537e-05, + "loss": 0.0188, + "num_input_tokens_seen": 11569952, + "step": 54830 + }, + { + "epoch": 6.0324532453245325, + "grad_norm": 0.1522303819656372, + "learning_rate": 4.406009368273043e-05, + "loss": 0.0168, + "num_input_tokens_seen": 11571008, + "step": 54835 + }, + { + "epoch": 6.033003300330033, + "grad_norm": 0.48609206080436707, + "learning_rate": 4.405854050714775e-05, + "loss": 0.0793, + "num_input_tokens_seen": 11572096, + "step": 54840 + }, + { + "epoch": 6.033553355335534, + "grad_norm": 0.0315701961517334, + "learning_rate": 4.4056987155911647e-05, + "loss": 0.1175, + "num_input_tokens_seen": 11573120, + "step": 54845 + }, + { + "epoch": 6.034103410341034, + "grad_norm": 1.6959739923477173, + "learning_rate": 4.405543362903644e-05, + "loss": 0.0562, + "num_input_tokens_seen": 11574208, + "step": 54850 + }, + { + "epoch": 6.034653465346534, + "grad_norm": 0.06500443816184998, + "learning_rate": 4.405387992653643e-05, + "loss": 0.0075, + "num_input_tokens_seen": 11575232, + "step": 54855 + }, + { + "epoch": 6.035203520352035, + "grad_norm": 0.08649942278862, + "learning_rate": 4.405232604842596e-05, + "loss": 0.0289, + "num_input_tokens_seen": 11576288, + "step": 54860 + }, + { + "epoch": 6.0357535753575355, + "grad_norm": 1.0283809900283813, + "learning_rate": 4.405077199471935e-05, + "loss": 0.0625, + "num_input_tokens_seen": 11577344, + "step": 54865 + }, + { + "epoch": 6.036303630363037, + "grad_norm": 0.41997960209846497, + "learning_rate": 4.404921776543091e-05, + "loss": 0.0352, + "num_input_tokens_seen": 11578368, + "step": 54870 + }, + { + "epoch": 6.036853685368537, + "grad_norm": 0.018223708495497704, + "learning_rate": 4.4047663360574974e-05, + "loss": 0.03, + "num_input_tokens_seen": 11579424, + "step": 54875 + }, + { + "epoch": 6.037403740374037, + "grad_norm": 0.4395909905433655, + "learning_rate": 4.404610878016586e-05, + "loss": 0.0889, + "num_input_tokens_seen": 11580480, + "step": 54880 + }, + { + "epoch": 6.037953795379538, + "grad_norm": 1.0773003101348877, + "learning_rate": 4.40445540242179e-05, + "loss": 0.0525, + "num_input_tokens_seen": 11581600, + "step": 54885 + }, + { + "epoch": 6.038503850385038, + "grad_norm": 0.39084944128990173, + "learning_rate": 4.404299909274543e-05, + "loss": 0.0254, + "num_input_tokens_seen": 11582624, + "step": 54890 + }, + { + "epoch": 6.039053905390539, + "grad_norm": 0.027901697903871536, + "learning_rate": 4.4041443985762774e-05, + "loss": 0.0062, + "num_input_tokens_seen": 11583648, + "step": 54895 + }, + { + "epoch": 6.03960396039604, + "grad_norm": 0.04597224295139313, + "learning_rate": 4.4039888703284265e-05, + "loss": 0.0261, + "num_input_tokens_seen": 11584672, + "step": 54900 + }, + { + "epoch": 6.04015401540154, + "grad_norm": 0.451667845249176, + "learning_rate": 4.403833324532424e-05, + "loss": 0.0135, + "num_input_tokens_seen": 11585696, + "step": 54905 + }, + { + "epoch": 6.040704070407041, + "grad_norm": 0.012033498845994473, + "learning_rate": 4.403677761189703e-05, + "loss": 0.0325, + "num_input_tokens_seen": 11586720, + "step": 54910 + }, + { + "epoch": 6.041254125412541, + "grad_norm": 0.17950168251991272, + "learning_rate": 4.4035221803016974e-05, + "loss": 0.006, + "num_input_tokens_seen": 11587872, + "step": 54915 + }, + { + "epoch": 6.041804180418042, + "grad_norm": 0.039593957364559174, + "learning_rate": 4.403366581869841e-05, + "loss": 0.0252, + "num_input_tokens_seen": 11588960, + "step": 54920 + }, + { + "epoch": 6.042354235423542, + "grad_norm": 0.21812501549720764, + "learning_rate": 4.403210965895569e-05, + "loss": 0.0958, + "num_input_tokens_seen": 11589984, + "step": 54925 + }, + { + "epoch": 6.042904290429043, + "grad_norm": 0.23540058732032776, + "learning_rate": 4.403055332380314e-05, + "loss": 0.0094, + "num_input_tokens_seen": 11590976, + "step": 54930 + }, + { + "epoch": 6.043454345434544, + "grad_norm": 0.06661534309387207, + "learning_rate": 4.402899681325512e-05, + "loss": 0.0099, + "num_input_tokens_seen": 11592064, + "step": 54935 + }, + { + "epoch": 6.044004400440044, + "grad_norm": 0.13880811631679535, + "learning_rate": 4.402744012732596e-05, + "loss": 0.0154, + "num_input_tokens_seen": 11593120, + "step": 54940 + }, + { + "epoch": 6.044554455445544, + "grad_norm": 0.16722901165485382, + "learning_rate": 4.402588326603002e-05, + "loss": 0.0058, + "num_input_tokens_seen": 11594176, + "step": 54945 + }, + { + "epoch": 6.045104510451045, + "grad_norm": 0.04472379386425018, + "learning_rate": 4.402432622938164e-05, + "loss": 0.1089, + "num_input_tokens_seen": 11595200, + "step": 54950 + }, + { + "epoch": 6.0456545654565454, + "grad_norm": 0.7575123906135559, + "learning_rate": 4.402276901739517e-05, + "loss": 0.0509, + "num_input_tokens_seen": 11596288, + "step": 54955 + }, + { + "epoch": 6.0462046204620465, + "grad_norm": 0.3851800262928009, + "learning_rate": 4.402121163008497e-05, + "loss": 0.1367, + "num_input_tokens_seen": 11597376, + "step": 54960 + }, + { + "epoch": 6.046754675467547, + "grad_norm": 1.7050155401229858, + "learning_rate": 4.401965406746539e-05, + "loss": 0.1054, + "num_input_tokens_seen": 11598432, + "step": 54965 + }, + { + "epoch": 6.047304730473047, + "grad_norm": 0.7809610962867737, + "learning_rate": 4.4018096329550786e-05, + "loss": 0.0394, + "num_input_tokens_seen": 11599520, + "step": 54970 + }, + { + "epoch": 6.047854785478548, + "grad_norm": 0.0362655371427536, + "learning_rate": 4.401653841635551e-05, + "loss": 0.0368, + "num_input_tokens_seen": 11600480, + "step": 54975 + }, + { + "epoch": 6.048404840484048, + "grad_norm": 0.0468687042593956, + "learning_rate": 4.401498032789393e-05, + "loss": 0.0834, + "num_input_tokens_seen": 11601472, + "step": 54980 + }, + { + "epoch": 6.048954895489549, + "grad_norm": 0.03581855818629265, + "learning_rate": 4.40134220641804e-05, + "loss": 0.065, + "num_input_tokens_seen": 11602528, + "step": 54985 + }, + { + "epoch": 6.0495049504950495, + "grad_norm": 1.4249838590621948, + "learning_rate": 4.4011863625229276e-05, + "loss": 0.0996, + "num_input_tokens_seen": 11603648, + "step": 54990 + }, + { + "epoch": 6.05005500550055, + "grad_norm": 0.048193808645009995, + "learning_rate": 4.4010305011054933e-05, + "loss": 0.0615, + "num_input_tokens_seen": 11604704, + "step": 54995 + }, + { + "epoch": 6.050605060506051, + "grad_norm": 0.16158048808574677, + "learning_rate": 4.400874622167173e-05, + "loss": 0.0136, + "num_input_tokens_seen": 11605792, + "step": 55000 + }, + { + "epoch": 6.051155115511551, + "grad_norm": 0.3412543535232544, + "learning_rate": 4.4007187257094034e-05, + "loss": 0.0248, + "num_input_tokens_seen": 11606816, + "step": 55005 + }, + { + "epoch": 6.051705170517051, + "grad_norm": 0.3018363118171692, + "learning_rate": 4.4005628117336216e-05, + "loss": 0.0269, + "num_input_tokens_seen": 11607872, + "step": 55010 + }, + { + "epoch": 6.052255225522552, + "grad_norm": 0.08245579898357391, + "learning_rate": 4.400406880241264e-05, + "loss": 0.0806, + "num_input_tokens_seen": 11608896, + "step": 55015 + }, + { + "epoch": 6.052805280528053, + "grad_norm": 0.5127856731414795, + "learning_rate": 4.400250931233768e-05, + "loss": 0.0513, + "num_input_tokens_seen": 11609920, + "step": 55020 + }, + { + "epoch": 6.053355335533554, + "grad_norm": 0.02108902484178543, + "learning_rate": 4.400094964712571e-05, + "loss": 0.0125, + "num_input_tokens_seen": 11610912, + "step": 55025 + }, + { + "epoch": 6.053905390539054, + "grad_norm": 0.010538248345255852, + "learning_rate": 4.399938980679111e-05, + "loss": 0.1066, + "num_input_tokens_seen": 11611936, + "step": 55030 + }, + { + "epoch": 6.054455445544554, + "grad_norm": 0.227356418967247, + "learning_rate": 4.399782979134825e-05, + "loss": 0.0176, + "num_input_tokens_seen": 11612960, + "step": 55035 + }, + { + "epoch": 6.055005500550055, + "grad_norm": 0.41060400009155273, + "learning_rate": 4.39962696008115e-05, + "loss": 0.0161, + "num_input_tokens_seen": 11613984, + "step": 55040 + }, + { + "epoch": 6.055555555555555, + "grad_norm": 0.8538285493850708, + "learning_rate": 4.3994709235195264e-05, + "loss": 0.0613, + "num_input_tokens_seen": 11615040, + "step": 55045 + }, + { + "epoch": 6.0561056105610565, + "grad_norm": 0.009971213527023792, + "learning_rate": 4.39931486945139e-05, + "loss": 0.0115, + "num_input_tokens_seen": 11616064, + "step": 55050 + }, + { + "epoch": 6.056655665566557, + "grad_norm": 0.567331075668335, + "learning_rate": 4.399158797878179e-05, + "loss": 0.0647, + "num_input_tokens_seen": 11617120, + "step": 55055 + }, + { + "epoch": 6.057205720572057, + "grad_norm": 0.21439893543720245, + "learning_rate": 4.3990027088013335e-05, + "loss": 0.0178, + "num_input_tokens_seen": 11618144, + "step": 55060 + }, + { + "epoch": 6.057755775577558, + "grad_norm": 0.03302084654569626, + "learning_rate": 4.3988466022222907e-05, + "loss": 0.0199, + "num_input_tokens_seen": 11619232, + "step": 55065 + }, + { + "epoch": 6.058305830583058, + "grad_norm": 0.9557912349700928, + "learning_rate": 4.398690478142491e-05, + "loss": 0.0485, + "num_input_tokens_seen": 11620288, + "step": 55070 + }, + { + "epoch": 6.058855885588559, + "grad_norm": 0.24788056313991547, + "learning_rate": 4.3985343365633716e-05, + "loss": 0.015, + "num_input_tokens_seen": 11621280, + "step": 55075 + }, + { + "epoch": 6.0594059405940595, + "grad_norm": 0.22279982268810272, + "learning_rate": 4.398378177486372e-05, + "loss": 0.0161, + "num_input_tokens_seen": 11622304, + "step": 55080 + }, + { + "epoch": 6.05995599559956, + "grad_norm": 0.7211328744888306, + "learning_rate": 4.3982220009129316e-05, + "loss": 0.0201, + "num_input_tokens_seen": 11623456, + "step": 55085 + }, + { + "epoch": 6.060506050605061, + "grad_norm": 0.002332931850105524, + "learning_rate": 4.39806580684449e-05, + "loss": 0.0038, + "num_input_tokens_seen": 11624544, + "step": 55090 + }, + { + "epoch": 6.061056105610561, + "grad_norm": 0.0069404300302267075, + "learning_rate": 4.397909595282487e-05, + "loss": 0.0035, + "num_input_tokens_seen": 11625664, + "step": 55095 + }, + { + "epoch": 6.061606160616061, + "grad_norm": 1.2764067649841309, + "learning_rate": 4.3977533662283624e-05, + "loss": 0.0573, + "num_input_tokens_seen": 11626688, + "step": 55100 + }, + { + "epoch": 6.062156215621562, + "grad_norm": 1.1459330320358276, + "learning_rate": 4.3975971196835544e-05, + "loss": 0.1276, + "num_input_tokens_seen": 11627744, + "step": 55105 + }, + { + "epoch": 6.0627062706270625, + "grad_norm": 0.47870588302612305, + "learning_rate": 4.397440855649505e-05, + "loss": 0.0232, + "num_input_tokens_seen": 11628832, + "step": 55110 + }, + { + "epoch": 6.063256325632564, + "grad_norm": 0.9094973802566528, + "learning_rate": 4.397284574127654e-05, + "loss": 0.0438, + "num_input_tokens_seen": 11629952, + "step": 55115 + }, + { + "epoch": 6.063806380638064, + "grad_norm": 0.03418764844536781, + "learning_rate": 4.397128275119441e-05, + "loss": 0.0722, + "num_input_tokens_seen": 11631040, + "step": 55120 + }, + { + "epoch": 6.064356435643564, + "grad_norm": 0.2060856968164444, + "learning_rate": 4.3969719586263065e-05, + "loss": 0.0177, + "num_input_tokens_seen": 11632160, + "step": 55125 + }, + { + "epoch": 6.064906490649065, + "grad_norm": 0.6020205020904541, + "learning_rate": 4.3968156246496925e-05, + "loss": 0.0477, + "num_input_tokens_seen": 11633216, + "step": 55130 + }, + { + "epoch": 6.065456545654565, + "grad_norm": 0.23153254389762878, + "learning_rate": 4.396659273191039e-05, + "loss": 0.1024, + "num_input_tokens_seen": 11634272, + "step": 55135 + }, + { + "epoch": 6.066006600660066, + "grad_norm": 0.12323711812496185, + "learning_rate": 4.396502904251787e-05, + "loss": 0.0276, + "num_input_tokens_seen": 11635360, + "step": 55140 + }, + { + "epoch": 6.066556655665567, + "grad_norm": 0.10090679675340652, + "learning_rate": 4.396346517833377e-05, + "loss": 0.0154, + "num_input_tokens_seen": 11636384, + "step": 55145 + }, + { + "epoch": 6.067106710671067, + "grad_norm": 1.0111368894577026, + "learning_rate": 4.396190113937252e-05, + "loss": 0.033, + "num_input_tokens_seen": 11637472, + "step": 55150 + }, + { + "epoch": 6.067656765676568, + "grad_norm": 0.12517936527729034, + "learning_rate": 4.3960336925648526e-05, + "loss": 0.0512, + "num_input_tokens_seen": 11638496, + "step": 55155 + }, + { + "epoch": 6.068206820682068, + "grad_norm": 0.28165265917778015, + "learning_rate": 4.3958772537176206e-05, + "loss": 0.0196, + "num_input_tokens_seen": 11639616, + "step": 55160 + }, + { + "epoch": 6.068756875687569, + "grad_norm": 0.020466018468141556, + "learning_rate": 4.3957207973969975e-05, + "loss": 0.0119, + "num_input_tokens_seen": 11640672, + "step": 55165 + }, + { + "epoch": 6.069306930693069, + "grad_norm": 0.999072790145874, + "learning_rate": 4.395564323604425e-05, + "loss": 0.0358, + "num_input_tokens_seen": 11641696, + "step": 55170 + }, + { + "epoch": 6.06985698569857, + "grad_norm": 0.06969954818487167, + "learning_rate": 4.395407832341346e-05, + "loss": 0.0227, + "num_input_tokens_seen": 11642720, + "step": 55175 + }, + { + "epoch": 6.070407040704071, + "grad_norm": 0.004622403532266617, + "learning_rate": 4.395251323609203e-05, + "loss": 0.0076, + "num_input_tokens_seen": 11643744, + "step": 55180 + }, + { + "epoch": 6.070957095709571, + "grad_norm": 0.06636492162942886, + "learning_rate": 4.3950947974094376e-05, + "loss": 0.0326, + "num_input_tokens_seen": 11644768, + "step": 55185 + }, + { + "epoch": 6.071507150715071, + "grad_norm": 0.022212835028767586, + "learning_rate": 4.3949382537434925e-05, + "loss": 0.1469, + "num_input_tokens_seen": 11645856, + "step": 55190 + }, + { + "epoch": 6.072057205720572, + "grad_norm": 0.020008191466331482, + "learning_rate": 4.394781692612811e-05, + "loss": 0.0635, + "num_input_tokens_seen": 11646912, + "step": 55195 + }, + { + "epoch": 6.072607260726072, + "grad_norm": 0.31325531005859375, + "learning_rate": 4.394625114018836e-05, + "loss": 0.0211, + "num_input_tokens_seen": 11647968, + "step": 55200 + }, + { + "epoch": 6.0731573157315735, + "grad_norm": 1.0409010648727417, + "learning_rate": 4.39446851796301e-05, + "loss": 0.0715, + "num_input_tokens_seen": 11649056, + "step": 55205 + }, + { + "epoch": 6.073707370737074, + "grad_norm": 0.0681624785065651, + "learning_rate": 4.3943119044467774e-05, + "loss": 0.0087, + "num_input_tokens_seen": 11650080, + "step": 55210 + }, + { + "epoch": 6.074257425742574, + "grad_norm": 0.6212990880012512, + "learning_rate": 4.3941552734715804e-05, + "loss": 0.0499, + "num_input_tokens_seen": 11651200, + "step": 55215 + }, + { + "epoch": 6.074807480748075, + "grad_norm": 0.042833756655454636, + "learning_rate": 4.393998625038863e-05, + "loss": 0.0165, + "num_input_tokens_seen": 11652192, + "step": 55220 + }, + { + "epoch": 6.075357535753575, + "grad_norm": 0.40450844168663025, + "learning_rate": 4.39384195915007e-05, + "loss": 0.0443, + "num_input_tokens_seen": 11653216, + "step": 55225 + }, + { + "epoch": 6.075907590759076, + "grad_norm": 0.027191294357180595, + "learning_rate": 4.393685275806644e-05, + "loss": 0.0097, + "num_input_tokens_seen": 11654272, + "step": 55230 + }, + { + "epoch": 6.0764576457645765, + "grad_norm": 0.11496079713106155, + "learning_rate": 4.39352857501003e-05, + "loss": 0.0928, + "num_input_tokens_seen": 11655360, + "step": 55235 + }, + { + "epoch": 6.077007700770077, + "grad_norm": 1.305938720703125, + "learning_rate": 4.39337185676167e-05, + "loss": 0.0689, + "num_input_tokens_seen": 11656320, + "step": 55240 + }, + { + "epoch": 6.077557755775578, + "grad_norm": 0.02611202374100685, + "learning_rate": 4.393215121063011e-05, + "loss": 0.0233, + "num_input_tokens_seen": 11657344, + "step": 55245 + }, + { + "epoch": 6.078107810781078, + "grad_norm": 0.046611107885837555, + "learning_rate": 4.393058367915498e-05, + "loss": 0.0804, + "num_input_tokens_seen": 11658400, + "step": 55250 + }, + { + "epoch": 6.078657865786579, + "grad_norm": 0.023137621581554413, + "learning_rate": 4.392901597320573e-05, + "loss": 0.1253, + "num_input_tokens_seen": 11659392, + "step": 55255 + }, + { + "epoch": 6.079207920792079, + "grad_norm": 0.26461517810821533, + "learning_rate": 4.3927448092796824e-05, + "loss": 0.0071, + "num_input_tokens_seen": 11660512, + "step": 55260 + }, + { + "epoch": 6.0797579757975795, + "grad_norm": 0.1785760372877121, + "learning_rate": 4.392588003794271e-05, + "loss": 0.0165, + "num_input_tokens_seen": 11661568, + "step": 55265 + }, + { + "epoch": 6.080308030803081, + "grad_norm": 0.03052222914993763, + "learning_rate": 4.392431180865785e-05, + "loss": 0.0519, + "num_input_tokens_seen": 11662688, + "step": 55270 + }, + { + "epoch": 6.080858085808581, + "grad_norm": 0.04280126467347145, + "learning_rate": 4.392274340495668e-05, + "loss": 0.0037, + "num_input_tokens_seen": 11663712, + "step": 55275 + }, + { + "epoch": 6.081408140814081, + "grad_norm": 0.01698499359190464, + "learning_rate": 4.392117482685367e-05, + "loss": 0.0278, + "num_input_tokens_seen": 11664768, + "step": 55280 + }, + { + "epoch": 6.081958195819582, + "grad_norm": 0.005835375282913446, + "learning_rate": 4.391960607436327e-05, + "loss": 0.0776, + "num_input_tokens_seen": 11665856, + "step": 55285 + }, + { + "epoch": 6.082508250825082, + "grad_norm": 0.04089180752635002, + "learning_rate": 4.3918037147499935e-05, + "loss": 0.0027, + "num_input_tokens_seen": 11666880, + "step": 55290 + }, + { + "epoch": 6.083058305830583, + "grad_norm": 0.05059399455785751, + "learning_rate": 4.391646804627814e-05, + "loss": 0.0064, + "num_input_tokens_seen": 11667872, + "step": 55295 + }, + { + "epoch": 6.083608360836084, + "grad_norm": 0.48291444778442383, + "learning_rate": 4.391489877071232e-05, + "loss": 0.096, + "num_input_tokens_seen": 11668928, + "step": 55300 + }, + { + "epoch": 6.084158415841584, + "grad_norm": 0.5883292555809021, + "learning_rate": 4.3913329320816966e-05, + "loss": 0.0977, + "num_input_tokens_seen": 11670016, + "step": 55305 + }, + { + "epoch": 6.084708470847085, + "grad_norm": 0.7189592719078064, + "learning_rate": 4.3911759696606526e-05, + "loss": 0.0311, + "num_input_tokens_seen": 11671040, + "step": 55310 + }, + { + "epoch": 6.085258525852585, + "grad_norm": 0.01035989634692669, + "learning_rate": 4.391018989809546e-05, + "loss": 0.0017, + "num_input_tokens_seen": 11672128, + "step": 55315 + }, + { + "epoch": 6.085808580858086, + "grad_norm": 0.9409369826316833, + "learning_rate": 4.390861992529827e-05, + "loss": 0.0447, + "num_input_tokens_seen": 11673152, + "step": 55320 + }, + { + "epoch": 6.086358635863586, + "grad_norm": 0.07748589664697647, + "learning_rate": 4.3907049778229395e-05, + "loss": 0.099, + "num_input_tokens_seen": 11674176, + "step": 55325 + }, + { + "epoch": 6.086908690869087, + "grad_norm": 0.3960878551006317, + "learning_rate": 4.390547945690331e-05, + "loss": 0.044, + "num_input_tokens_seen": 11675264, + "step": 55330 + }, + { + "epoch": 6.087458745874588, + "grad_norm": 0.8541228175163269, + "learning_rate": 4.39039089613345e-05, + "loss": 0.092, + "num_input_tokens_seen": 11676320, + "step": 55335 + }, + { + "epoch": 6.088008800880088, + "grad_norm": 0.1559230089187622, + "learning_rate": 4.390233829153742e-05, + "loss": 0.0214, + "num_input_tokens_seen": 11677312, + "step": 55340 + }, + { + "epoch": 6.088558855885589, + "grad_norm": 0.7960056066513062, + "learning_rate": 4.390076744752657e-05, + "loss": 0.04, + "num_input_tokens_seen": 11678368, + "step": 55345 + }, + { + "epoch": 6.089108910891089, + "grad_norm": 0.02275659702718258, + "learning_rate": 4.389919642931642e-05, + "loss": 0.0605, + "num_input_tokens_seen": 11679392, + "step": 55350 + }, + { + "epoch": 6.089658965896589, + "grad_norm": 1.3908491134643555, + "learning_rate": 4.389762523692144e-05, + "loss": 0.0753, + "num_input_tokens_seen": 11680448, + "step": 55355 + }, + { + "epoch": 6.0902090209020905, + "grad_norm": 0.28415602445602417, + "learning_rate": 4.3896053870356114e-05, + "loss": 0.0236, + "num_input_tokens_seen": 11681472, + "step": 55360 + }, + { + "epoch": 6.090759075907591, + "grad_norm": 0.8149349689483643, + "learning_rate": 4.389448232963492e-05, + "loss": 0.0351, + "num_input_tokens_seen": 11682528, + "step": 55365 + }, + { + "epoch": 6.091309130913091, + "grad_norm": 0.012323236092925072, + "learning_rate": 4.3892910614772354e-05, + "loss": 0.0058, + "num_input_tokens_seen": 11683584, + "step": 55370 + }, + { + "epoch": 6.091859185918592, + "grad_norm": 1.0044684410095215, + "learning_rate": 4.3891338725782907e-05, + "loss": 0.0435, + "num_input_tokens_seen": 11684576, + "step": 55375 + }, + { + "epoch": 6.092409240924092, + "grad_norm": 0.35958126187324524, + "learning_rate": 4.388976666268104e-05, + "loss": 0.0287, + "num_input_tokens_seen": 11685632, + "step": 55380 + }, + { + "epoch": 6.092959295929593, + "grad_norm": 0.006628930103033781, + "learning_rate": 4.3888194425481266e-05, + "loss": 0.0266, + "num_input_tokens_seen": 11686720, + "step": 55385 + }, + { + "epoch": 6.0935093509350935, + "grad_norm": 1.221988558769226, + "learning_rate": 4.388662201419806e-05, + "loss": 0.1545, + "num_input_tokens_seen": 11687808, + "step": 55390 + }, + { + "epoch": 6.094059405940594, + "grad_norm": 0.07812599837779999, + "learning_rate": 4.3885049428845935e-05, + "loss": 0.0115, + "num_input_tokens_seen": 11688832, + "step": 55395 + }, + { + "epoch": 6.094609460946095, + "grad_norm": 0.04444297403097153, + "learning_rate": 4.388347666943936e-05, + "loss": 0.0148, + "num_input_tokens_seen": 11689888, + "step": 55400 + }, + { + "epoch": 6.095159515951595, + "grad_norm": 1.0027332305908203, + "learning_rate": 4.388190373599285e-05, + "loss": 0.0501, + "num_input_tokens_seen": 11690976, + "step": 55405 + }, + { + "epoch": 6.095709570957096, + "grad_norm": 0.3253445327281952, + "learning_rate": 4.3880330628520886e-05, + "loss": 0.061, + "num_input_tokens_seen": 11691968, + "step": 55410 + }, + { + "epoch": 6.096259625962596, + "grad_norm": 0.6946842670440674, + "learning_rate": 4.387875734703798e-05, + "loss": 0.0627, + "num_input_tokens_seen": 11693024, + "step": 55415 + }, + { + "epoch": 6.0968096809680965, + "grad_norm": 1.029585361480713, + "learning_rate": 4.3877183891558624e-05, + "loss": 0.0914, + "num_input_tokens_seen": 11694144, + "step": 55420 + }, + { + "epoch": 6.097359735973598, + "grad_norm": 1.6369848251342773, + "learning_rate": 4.387561026209732e-05, + "loss": 0.042, + "num_input_tokens_seen": 11695232, + "step": 55425 + }, + { + "epoch": 6.097909790979098, + "grad_norm": 0.2436840683221817, + "learning_rate": 4.387403645866858e-05, + "loss": 0.0714, + "num_input_tokens_seen": 11696320, + "step": 55430 + }, + { + "epoch": 6.098459845984599, + "grad_norm": 0.5517005920410156, + "learning_rate": 4.38724624812869e-05, + "loss": 0.101, + "num_input_tokens_seen": 11697376, + "step": 55435 + }, + { + "epoch": 6.099009900990099, + "grad_norm": 0.051449473947286606, + "learning_rate": 4.387088832996679e-05, + "loss": 0.0295, + "num_input_tokens_seen": 11698432, + "step": 55440 + }, + { + "epoch": 6.099559955995599, + "grad_norm": 0.033368032425642014, + "learning_rate": 4.386931400472275e-05, + "loss": 0.0718, + "num_input_tokens_seen": 11699520, + "step": 55445 + }, + { + "epoch": 6.1001100110011, + "grad_norm": 0.4061642587184906, + "learning_rate": 4.386773950556931e-05, + "loss": 0.1517, + "num_input_tokens_seen": 11700544, + "step": 55450 + }, + { + "epoch": 6.100660066006601, + "grad_norm": 0.09419375658035278, + "learning_rate": 4.3866164832520954e-05, + "loss": 0.0157, + "num_input_tokens_seen": 11701600, + "step": 55455 + }, + { + "epoch": 6.101210121012101, + "grad_norm": 0.24571476876735687, + "learning_rate": 4.386458998559222e-05, + "loss": 0.0414, + "num_input_tokens_seen": 11702688, + "step": 55460 + }, + { + "epoch": 6.101760176017602, + "grad_norm": 0.03087591752409935, + "learning_rate": 4.3863014964797607e-05, + "loss": 0.0383, + "num_input_tokens_seen": 11703680, + "step": 55465 + }, + { + "epoch": 6.102310231023102, + "grad_norm": 0.049967218190431595, + "learning_rate": 4.3861439770151636e-05, + "loss": 0.0034, + "num_input_tokens_seen": 11704640, + "step": 55470 + }, + { + "epoch": 6.102860286028603, + "grad_norm": 0.10246103256940842, + "learning_rate": 4.385986440166883e-05, + "loss": 0.0351, + "num_input_tokens_seen": 11705696, + "step": 55475 + }, + { + "epoch": 6.103410341034103, + "grad_norm": 0.033375442028045654, + "learning_rate": 4.3858288859363694e-05, + "loss": 0.0404, + "num_input_tokens_seen": 11706784, + "step": 55480 + }, + { + "epoch": 6.103960396039604, + "grad_norm": 0.47986075282096863, + "learning_rate": 4.385671314325077e-05, + "loss": 0.0599, + "num_input_tokens_seen": 11707808, + "step": 55485 + }, + { + "epoch": 6.104510451045105, + "grad_norm": 0.43390852212905884, + "learning_rate": 4.385513725334457e-05, + "loss": 0.0126, + "num_input_tokens_seen": 11708864, + "step": 55490 + }, + { + "epoch": 6.105060506050605, + "grad_norm": 3.0571913719177246, + "learning_rate": 4.385356118965961e-05, + "loss": 0.0468, + "num_input_tokens_seen": 11709952, + "step": 55495 + }, + { + "epoch": 6.105610561056106, + "grad_norm": 0.024675406515598297, + "learning_rate": 4.385198495221042e-05, + "loss": 0.0284, + "num_input_tokens_seen": 11710976, + "step": 55500 + }, + { + "epoch": 6.106160616061606, + "grad_norm": 1.4674296379089355, + "learning_rate": 4.385040854101154e-05, + "loss": 0.1103, + "num_input_tokens_seen": 11712000, + "step": 55505 + }, + { + "epoch": 6.106710671067106, + "grad_norm": 0.09016510099172592, + "learning_rate": 4.3848831956077486e-05, + "loss": 0.0089, + "num_input_tokens_seen": 11713088, + "step": 55510 + }, + { + "epoch": 6.1072607260726075, + "grad_norm": 0.01498063188046217, + "learning_rate": 4.3847255197422793e-05, + "loss": 0.0304, + "num_input_tokens_seen": 11714144, + "step": 55515 + }, + { + "epoch": 6.107810781078108, + "grad_norm": 0.20338495075702667, + "learning_rate": 4.3845678265062e-05, + "loss": 0.0893, + "num_input_tokens_seen": 11715232, + "step": 55520 + }, + { + "epoch": 6.108360836083609, + "grad_norm": 0.1556713879108429, + "learning_rate": 4.384410115900962e-05, + "loss": 0.0286, + "num_input_tokens_seen": 11716288, + "step": 55525 + }, + { + "epoch": 6.108910891089109, + "grad_norm": 0.03170711174607277, + "learning_rate": 4.3842523879280205e-05, + "loss": 0.0559, + "num_input_tokens_seen": 11717408, + "step": 55530 + }, + { + "epoch": 6.109460946094609, + "grad_norm": 0.019480301067233086, + "learning_rate": 4.3840946425888296e-05, + "loss": 0.0739, + "num_input_tokens_seen": 11718400, + "step": 55535 + }, + { + "epoch": 6.11001100110011, + "grad_norm": 0.04600470885634422, + "learning_rate": 4.383936879884842e-05, + "loss": 0.0132, + "num_input_tokens_seen": 11719392, + "step": 55540 + }, + { + "epoch": 6.1105610561056105, + "grad_norm": 0.024457689374685287, + "learning_rate": 4.383779099817512e-05, + "loss": 0.0207, + "num_input_tokens_seen": 11720448, + "step": 55545 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.340446412563324, + "learning_rate": 4.383621302388294e-05, + "loss": 0.0328, + "num_input_tokens_seen": 11721504, + "step": 55550 + }, + { + "epoch": 6.111661166116612, + "grad_norm": 0.019347725436091423, + "learning_rate": 4.3834634875986426e-05, + "loss": 0.0053, + "num_input_tokens_seen": 11722592, + "step": 55555 + }, + { + "epoch": 6.112211221122112, + "grad_norm": 0.22437182068824768, + "learning_rate": 4.3833056554500116e-05, + "loss": 0.1782, + "num_input_tokens_seen": 11723616, + "step": 55560 + }, + { + "epoch": 6.112761276127613, + "grad_norm": 0.3768455386161804, + "learning_rate": 4.383147805943857e-05, + "loss": 0.0483, + "num_input_tokens_seen": 11724768, + "step": 55565 + }, + { + "epoch": 6.113311331133113, + "grad_norm": 0.004187098704278469, + "learning_rate": 4.382989939081632e-05, + "loss": 0.0681, + "num_input_tokens_seen": 11725888, + "step": 55570 + }, + { + "epoch": 6.1138613861386135, + "grad_norm": 1.455633282661438, + "learning_rate": 4.382832054864792e-05, + "loss": 0.0236, + "num_input_tokens_seen": 11726976, + "step": 55575 + }, + { + "epoch": 6.114411441144115, + "grad_norm": 1.0842622518539429, + "learning_rate": 4.382674153294793e-05, + "loss": 0.1202, + "num_input_tokens_seen": 11728032, + "step": 55580 + }, + { + "epoch": 6.114961496149615, + "grad_norm": 0.06975821405649185, + "learning_rate": 4.3825162343730894e-05, + "loss": 0.0791, + "num_input_tokens_seen": 11729152, + "step": 55585 + }, + { + "epoch": 6.115511551155116, + "grad_norm": 0.010676848702132702, + "learning_rate": 4.382358298101137e-05, + "loss": 0.0515, + "num_input_tokens_seen": 11730240, + "step": 55590 + }, + { + "epoch": 6.116061606160616, + "grad_norm": 0.09941934049129486, + "learning_rate": 4.382200344480392e-05, + "loss": 0.0463, + "num_input_tokens_seen": 11731232, + "step": 55595 + }, + { + "epoch": 6.116611661166116, + "grad_norm": 0.011457517743110657, + "learning_rate": 4.3820423735123084e-05, + "loss": 0.1218, + "num_input_tokens_seen": 11732256, + "step": 55600 + }, + { + "epoch": 6.117161716171617, + "grad_norm": 0.128960520029068, + "learning_rate": 4.381884385198344e-05, + "loss": 0.0576, + "num_input_tokens_seen": 11733376, + "step": 55605 + }, + { + "epoch": 6.117711771177118, + "grad_norm": 0.2845321297645569, + "learning_rate": 4.3817263795399544e-05, + "loss": 0.0141, + "num_input_tokens_seen": 11734400, + "step": 55610 + }, + { + "epoch": 6.118261826182618, + "grad_norm": 0.8531581163406372, + "learning_rate": 4.3815683565385956e-05, + "loss": 0.1207, + "num_input_tokens_seen": 11735456, + "step": 55615 + }, + { + "epoch": 6.118811881188119, + "grad_norm": 0.05631465092301369, + "learning_rate": 4.3814103161957246e-05, + "loss": 0.0164, + "num_input_tokens_seen": 11736512, + "step": 55620 + }, + { + "epoch": 6.119361936193619, + "grad_norm": 0.24285894632339478, + "learning_rate": 4.381252258512797e-05, + "loss": 0.0135, + "num_input_tokens_seen": 11737568, + "step": 55625 + }, + { + "epoch": 6.11991199119912, + "grad_norm": 0.17050915956497192, + "learning_rate": 4.3810941834912695e-05, + "loss": 0.0237, + "num_input_tokens_seen": 11738656, + "step": 55630 + }, + { + "epoch": 6.12046204620462, + "grad_norm": 1.4398510456085205, + "learning_rate": 4.3809360911326006e-05, + "loss": 0.1347, + "num_input_tokens_seen": 11739648, + "step": 55635 + }, + { + "epoch": 6.121012101210121, + "grad_norm": 0.446088969707489, + "learning_rate": 4.3807779814382456e-05, + "loss": 0.0625, + "num_input_tokens_seen": 11740704, + "step": 55640 + }, + { + "epoch": 6.121562156215622, + "grad_norm": 0.031224511563777924, + "learning_rate": 4.380619854409662e-05, + "loss": 0.0201, + "num_input_tokens_seen": 11741728, + "step": 55645 + }, + { + "epoch": 6.122112211221122, + "grad_norm": 0.014791741035878658, + "learning_rate": 4.380461710048309e-05, + "loss": 0.0269, + "num_input_tokens_seen": 11742784, + "step": 55650 + }, + { + "epoch": 6.122662266226623, + "grad_norm": 0.21551631391048431, + "learning_rate": 4.3803035483556415e-05, + "loss": 0.0723, + "num_input_tokens_seen": 11743840, + "step": 55655 + }, + { + "epoch": 6.123212321232123, + "grad_norm": 0.3057268261909485, + "learning_rate": 4.3801453693331184e-05, + "loss": 0.0446, + "num_input_tokens_seen": 11744896, + "step": 55660 + }, + { + "epoch": 6.123762376237623, + "grad_norm": 0.3989151418209076, + "learning_rate": 4.3799871729821986e-05, + "loss": 0.0139, + "num_input_tokens_seen": 11745984, + "step": 55665 + }, + { + "epoch": 6.1243124312431245, + "grad_norm": 1.4862706661224365, + "learning_rate": 4.379828959304338e-05, + "loss": 0.1025, + "num_input_tokens_seen": 11747072, + "step": 55670 + }, + { + "epoch": 6.124862486248625, + "grad_norm": 0.05336305499076843, + "learning_rate": 4.379670728300998e-05, + "loss": 0.019, + "num_input_tokens_seen": 11748096, + "step": 55675 + }, + { + "epoch": 6.125412541254126, + "grad_norm": 0.1169280856847763, + "learning_rate": 4.3795124799736324e-05, + "loss": 0.0081, + "num_input_tokens_seen": 11749184, + "step": 55680 + }, + { + "epoch": 6.125962596259626, + "grad_norm": 0.3948638439178467, + "learning_rate": 4.3793542143237034e-05, + "loss": 0.0548, + "num_input_tokens_seen": 11750272, + "step": 55685 + }, + { + "epoch": 6.126512651265126, + "grad_norm": 0.2579663097858429, + "learning_rate": 4.379195931352668e-05, + "loss": 0.0217, + "num_input_tokens_seen": 11751296, + "step": 55690 + }, + { + "epoch": 6.127062706270627, + "grad_norm": 0.09350559115409851, + "learning_rate": 4.379037631061986e-05, + "loss": 0.0563, + "num_input_tokens_seen": 11752384, + "step": 55695 + }, + { + "epoch": 6.1276127612761275, + "grad_norm": 1.6913288831710815, + "learning_rate": 4.378879313453115e-05, + "loss": 0.0767, + "num_input_tokens_seen": 11753376, + "step": 55700 + }, + { + "epoch": 6.128162816281628, + "grad_norm": 0.08591106534004211, + "learning_rate": 4.378720978527515e-05, + "loss": 0.066, + "num_input_tokens_seen": 11754400, + "step": 55705 + }, + { + "epoch": 6.128712871287129, + "grad_norm": 1.3794142007827759, + "learning_rate": 4.3785626262866464e-05, + "loss": 0.0381, + "num_input_tokens_seen": 11755488, + "step": 55710 + }, + { + "epoch": 6.129262926292629, + "grad_norm": 0.2208792120218277, + "learning_rate": 4.3784042567319664e-05, + "loss": 0.0414, + "num_input_tokens_seen": 11756544, + "step": 55715 + }, + { + "epoch": 6.12981298129813, + "grad_norm": 0.015259824693202972, + "learning_rate": 4.3782458698649365e-05, + "loss": 0.098, + "num_input_tokens_seen": 11757632, + "step": 55720 + }, + { + "epoch": 6.13036303630363, + "grad_norm": 0.7553384900093079, + "learning_rate": 4.378087465687015e-05, + "loss": 0.1344, + "num_input_tokens_seen": 11758624, + "step": 55725 + }, + { + "epoch": 6.1309130913091305, + "grad_norm": 0.07327266037464142, + "learning_rate": 4.377929044199662e-05, + "loss": 0.0112, + "num_input_tokens_seen": 11759680, + "step": 55730 + }, + { + "epoch": 6.131463146314632, + "grad_norm": 0.22277431190013885, + "learning_rate": 4.3777706054043395e-05, + "loss": 0.0227, + "num_input_tokens_seen": 11760704, + "step": 55735 + }, + { + "epoch": 6.132013201320132, + "grad_norm": 0.18327952921390533, + "learning_rate": 4.3776121493025054e-05, + "loss": 0.0419, + "num_input_tokens_seen": 11761760, + "step": 55740 + }, + { + "epoch": 6.132563256325633, + "grad_norm": 0.6847269535064697, + "learning_rate": 4.377453675895622e-05, + "loss": 0.0152, + "num_input_tokens_seen": 11762880, + "step": 55745 + }, + { + "epoch": 6.133113311331133, + "grad_norm": 0.5642182230949402, + "learning_rate": 4.3772951851851476e-05, + "loss": 0.0175, + "num_input_tokens_seen": 11763968, + "step": 55750 + }, + { + "epoch": 6.133663366336633, + "grad_norm": 1.5067634582519531, + "learning_rate": 4.377136677172545e-05, + "loss": 0.1111, + "num_input_tokens_seen": 11765056, + "step": 55755 + }, + { + "epoch": 6.134213421342134, + "grad_norm": 0.06477133184671402, + "learning_rate": 4.376978151859275e-05, + "loss": 0.0175, + "num_input_tokens_seen": 11766144, + "step": 55760 + }, + { + "epoch": 6.134763476347635, + "grad_norm": 0.10890761762857437, + "learning_rate": 4.376819609246797e-05, + "loss": 0.0721, + "num_input_tokens_seen": 11767168, + "step": 55765 + }, + { + "epoch": 6.135313531353136, + "grad_norm": 0.053740885108709335, + "learning_rate": 4.3766610493365734e-05, + "loss": 0.0885, + "num_input_tokens_seen": 11768224, + "step": 55770 + }, + { + "epoch": 6.135863586358636, + "grad_norm": 0.37858834862709045, + "learning_rate": 4.376502472130066e-05, + "loss": 0.0182, + "num_input_tokens_seen": 11769216, + "step": 55775 + }, + { + "epoch": 6.136413641364136, + "grad_norm": 0.2642182409763336, + "learning_rate": 4.376343877628736e-05, + "loss": 0.0424, + "num_input_tokens_seen": 11770272, + "step": 55780 + }, + { + "epoch": 6.136963696369637, + "grad_norm": 0.2698633670806885, + "learning_rate": 4.376185265834044e-05, + "loss": 0.0289, + "num_input_tokens_seen": 11771264, + "step": 55785 + }, + { + "epoch": 6.137513751375137, + "grad_norm": 0.9238584041595459, + "learning_rate": 4.3760266367474526e-05, + "loss": 0.1132, + "num_input_tokens_seen": 11772320, + "step": 55790 + }, + { + "epoch": 6.138063806380638, + "grad_norm": 0.07845910638570786, + "learning_rate": 4.375867990370425e-05, + "loss": 0.0785, + "num_input_tokens_seen": 11773440, + "step": 55795 + }, + { + "epoch": 6.138613861386139, + "grad_norm": 0.12292070686817169, + "learning_rate": 4.3757093267044215e-05, + "loss": 0.0369, + "num_input_tokens_seen": 11774528, + "step": 55800 + }, + { + "epoch": 6.139163916391639, + "grad_norm": 1.5681253671646118, + "learning_rate": 4.375550645750905e-05, + "loss": 0.1824, + "num_input_tokens_seen": 11775616, + "step": 55805 + }, + { + "epoch": 6.13971397139714, + "grad_norm": 0.7700968384742737, + "learning_rate": 4.3753919475113384e-05, + "loss": 0.0914, + "num_input_tokens_seen": 11776608, + "step": 55810 + }, + { + "epoch": 6.14026402640264, + "grad_norm": 0.08340399712324142, + "learning_rate": 4.375233231987184e-05, + "loss": 0.0073, + "num_input_tokens_seen": 11777664, + "step": 55815 + }, + { + "epoch": 6.1408140814081404, + "grad_norm": 0.6383241415023804, + "learning_rate": 4.3750744991799045e-05, + "loss": 0.0924, + "num_input_tokens_seen": 11778784, + "step": 55820 + }, + { + "epoch": 6.1413641364136415, + "grad_norm": 0.04682411625981331, + "learning_rate": 4.3749157490909634e-05, + "loss": 0.0202, + "num_input_tokens_seen": 11779776, + "step": 55825 + }, + { + "epoch": 6.141914191419142, + "grad_norm": 0.03512817621231079, + "learning_rate": 4.374756981721824e-05, + "loss": 0.0133, + "num_input_tokens_seen": 11780800, + "step": 55830 + }, + { + "epoch": 6.142464246424643, + "grad_norm": 0.3081265687942505, + "learning_rate": 4.3745981970739486e-05, + "loss": 0.0431, + "num_input_tokens_seen": 11781824, + "step": 55835 + }, + { + "epoch": 6.143014301430143, + "grad_norm": 1.1355175971984863, + "learning_rate": 4.3744393951488015e-05, + "loss": 0.0769, + "num_input_tokens_seen": 11782880, + "step": 55840 + }, + { + "epoch": 6.143564356435643, + "grad_norm": 0.1490972638130188, + "learning_rate": 4.3742805759478456e-05, + "loss": 0.0314, + "num_input_tokens_seen": 11784000, + "step": 55845 + }, + { + "epoch": 6.144114411441144, + "grad_norm": 0.03146294504404068, + "learning_rate": 4.3741217394725445e-05, + "loss": 0.0131, + "num_input_tokens_seen": 11785056, + "step": 55850 + }, + { + "epoch": 6.1446644664466445, + "grad_norm": 0.08686750382184982, + "learning_rate": 4.373962885724364e-05, + "loss": 0.0407, + "num_input_tokens_seen": 11786080, + "step": 55855 + }, + { + "epoch": 6.145214521452146, + "grad_norm": 0.4867020547389984, + "learning_rate": 4.3738040147047654e-05, + "loss": 0.1464, + "num_input_tokens_seen": 11787104, + "step": 55860 + }, + { + "epoch": 6.145764576457646, + "grad_norm": 0.3030441105365753, + "learning_rate": 4.373645126415215e-05, + "loss": 0.0165, + "num_input_tokens_seen": 11788096, + "step": 55865 + }, + { + "epoch": 6.146314631463146, + "grad_norm": 0.5360686779022217, + "learning_rate": 4.373486220857176e-05, + "loss": 0.0537, + "num_input_tokens_seen": 11789120, + "step": 55870 + }, + { + "epoch": 6.146864686468647, + "grad_norm": 0.12409864366054535, + "learning_rate": 4.3733272980321144e-05, + "loss": 0.0526, + "num_input_tokens_seen": 11790240, + "step": 55875 + }, + { + "epoch": 6.147414741474147, + "grad_norm": 0.1793513149023056, + "learning_rate": 4.3731683579414934e-05, + "loss": 0.0353, + "num_input_tokens_seen": 11791328, + "step": 55880 + }, + { + "epoch": 6.1479647964796476, + "grad_norm": 0.2316206842660904, + "learning_rate": 4.3730094005867784e-05, + "loss": 0.051, + "num_input_tokens_seen": 11792320, + "step": 55885 + }, + { + "epoch": 6.148514851485149, + "grad_norm": 0.41155195236206055, + "learning_rate": 4.372850425969435e-05, + "loss": 0.0336, + "num_input_tokens_seen": 11793376, + "step": 55890 + }, + { + "epoch": 6.149064906490649, + "grad_norm": 0.16467052698135376, + "learning_rate": 4.3726914340909275e-05, + "loss": 0.1868, + "num_input_tokens_seen": 11794400, + "step": 55895 + }, + { + "epoch": 6.14961496149615, + "grad_norm": 0.029858257621526718, + "learning_rate": 4.372532424952722e-05, + "loss": 0.0703, + "num_input_tokens_seen": 11795488, + "step": 55900 + }, + { + "epoch": 6.15016501650165, + "grad_norm": 0.3650312125682831, + "learning_rate": 4.3723733985562834e-05, + "loss": 0.1169, + "num_input_tokens_seen": 11796576, + "step": 55905 + }, + { + "epoch": 6.15071507150715, + "grad_norm": 0.030773354694247246, + "learning_rate": 4.3722143549030775e-05, + "loss": 0.0995, + "num_input_tokens_seen": 11797632, + "step": 55910 + }, + { + "epoch": 6.1512651265126514, + "grad_norm": 0.7816126942634583, + "learning_rate": 4.3720552939945704e-05, + "loss": 0.0268, + "num_input_tokens_seen": 11798688, + "step": 55915 + }, + { + "epoch": 6.151815181518152, + "grad_norm": 0.23172281682491302, + "learning_rate": 4.371896215832228e-05, + "loss": 0.0228, + "num_input_tokens_seen": 11799808, + "step": 55920 + }, + { + "epoch": 6.152365236523653, + "grad_norm": 0.16447623074054718, + "learning_rate": 4.371737120417516e-05, + "loss": 0.1788, + "num_input_tokens_seen": 11800896, + "step": 55925 + }, + { + "epoch": 6.152915291529153, + "grad_norm": 0.534525990486145, + "learning_rate": 4.3715780077519016e-05, + "loss": 0.0246, + "num_input_tokens_seen": 11801920, + "step": 55930 + }, + { + "epoch": 6.153465346534653, + "grad_norm": 0.12874574959278107, + "learning_rate": 4.37141887783685e-05, + "loss": 0.0351, + "num_input_tokens_seen": 11802944, + "step": 55935 + }, + { + "epoch": 6.154015401540154, + "grad_norm": 0.03978044539690018, + "learning_rate": 4.37125973067383e-05, + "loss": 0.0206, + "num_input_tokens_seen": 11804032, + "step": 55940 + }, + { + "epoch": 6.1545654565456545, + "grad_norm": 0.13960345089435577, + "learning_rate": 4.371100566264306e-05, + "loss": 0.0518, + "num_input_tokens_seen": 11805056, + "step": 55945 + }, + { + "epoch": 6.1551155115511555, + "grad_norm": 0.6204633116722107, + "learning_rate": 4.3709413846097465e-05, + "loss": 0.1262, + "num_input_tokens_seen": 11806176, + "step": 55950 + }, + { + "epoch": 6.155665566556656, + "grad_norm": 0.11226630210876465, + "learning_rate": 4.3707821857116176e-05, + "loss": 0.0295, + "num_input_tokens_seen": 11807232, + "step": 55955 + }, + { + "epoch": 6.156215621562156, + "grad_norm": 0.4876357316970825, + "learning_rate": 4.370622969571388e-05, + "loss": 0.0459, + "num_input_tokens_seen": 11808288, + "step": 55960 + }, + { + "epoch": 6.156765676567657, + "grad_norm": 0.06523590534925461, + "learning_rate": 4.370463736190522e-05, + "loss": 0.0166, + "num_input_tokens_seen": 11809312, + "step": 55965 + }, + { + "epoch": 6.157315731573157, + "grad_norm": 0.3035472333431244, + "learning_rate": 4.370304485570491e-05, + "loss": 0.0526, + "num_input_tokens_seen": 11810400, + "step": 55970 + }, + { + "epoch": 6.1578657865786575, + "grad_norm": 0.04720980301499367, + "learning_rate": 4.37014521771276e-05, + "loss": 0.0079, + "num_input_tokens_seen": 11811488, + "step": 55975 + }, + { + "epoch": 6.158415841584159, + "grad_norm": 0.2732922434806824, + "learning_rate": 4.369985932618799e-05, + "loss": 0.037, + "num_input_tokens_seen": 11812480, + "step": 55980 + }, + { + "epoch": 6.158965896589659, + "grad_norm": 0.20267623662948608, + "learning_rate": 4.369826630290074e-05, + "loss": 0.0231, + "num_input_tokens_seen": 11813536, + "step": 55985 + }, + { + "epoch": 6.15951595159516, + "grad_norm": 0.008481248281896114, + "learning_rate": 4.369667310728054e-05, + "loss": 0.0099, + "num_input_tokens_seen": 11814624, + "step": 55990 + }, + { + "epoch": 6.16006600660066, + "grad_norm": 0.31991884112358093, + "learning_rate": 4.369507973934208e-05, + "loss": 0.0091, + "num_input_tokens_seen": 11815680, + "step": 55995 + }, + { + "epoch": 6.16061606160616, + "grad_norm": 0.26389259099960327, + "learning_rate": 4.369348619910005e-05, + "loss": 0.0282, + "num_input_tokens_seen": 11816800, + "step": 56000 + }, + { + "epoch": 6.161166116611661, + "grad_norm": 0.21466849744319916, + "learning_rate": 4.369189248656912e-05, + "loss": 0.0973, + "num_input_tokens_seen": 11817856, + "step": 56005 + }, + { + "epoch": 6.161716171617162, + "grad_norm": 0.3766041398048401, + "learning_rate": 4.369029860176398e-05, + "loss": 0.1168, + "num_input_tokens_seen": 11818976, + "step": 56010 + }, + { + "epoch": 6.162266226622663, + "grad_norm": 0.039296723902225494, + "learning_rate": 4.3688704544699334e-05, + "loss": 0.0482, + "num_input_tokens_seen": 11820064, + "step": 56015 + }, + { + "epoch": 6.162816281628163, + "grad_norm": 0.3743062913417816, + "learning_rate": 4.368711031538987e-05, + "loss": 0.0128, + "num_input_tokens_seen": 11821152, + "step": 56020 + }, + { + "epoch": 6.163366336633663, + "grad_norm": 0.1554139405488968, + "learning_rate": 4.368551591385026e-05, + "loss": 0.0583, + "num_input_tokens_seen": 11822176, + "step": 56025 + }, + { + "epoch": 6.163916391639164, + "grad_norm": 1.2709242105484009, + "learning_rate": 4.368392134009523e-05, + "loss": 0.045, + "num_input_tokens_seen": 11823232, + "step": 56030 + }, + { + "epoch": 6.164466446644664, + "grad_norm": 0.094075046479702, + "learning_rate": 4.368232659413946e-05, + "loss": 0.0343, + "num_input_tokens_seen": 11824288, + "step": 56035 + }, + { + "epoch": 6.165016501650165, + "grad_norm": 0.012581588700413704, + "learning_rate": 4.368073167599765e-05, + "loss": 0.0249, + "num_input_tokens_seen": 11825344, + "step": 56040 + }, + { + "epoch": 6.165566556655666, + "grad_norm": 0.2832528054714203, + "learning_rate": 4.36791365856845e-05, + "loss": 0.0716, + "num_input_tokens_seen": 11826432, + "step": 56045 + }, + { + "epoch": 6.166116611661166, + "grad_norm": 0.1880006194114685, + "learning_rate": 4.367754132321471e-05, + "loss": 0.0171, + "num_input_tokens_seen": 11827520, + "step": 56050 + }, + { + "epoch": 6.166666666666667, + "grad_norm": 0.24231572449207306, + "learning_rate": 4.367594588860299e-05, + "loss": 0.0077, + "num_input_tokens_seen": 11828640, + "step": 56055 + }, + { + "epoch": 6.167216721672167, + "grad_norm": 0.04360562935471535, + "learning_rate": 4.367435028186403e-05, + "loss": 0.0663, + "num_input_tokens_seen": 11829728, + "step": 56060 + }, + { + "epoch": 6.167766776677667, + "grad_norm": 0.24985454976558685, + "learning_rate": 4.367275450301255e-05, + "loss": 0.0336, + "num_input_tokens_seen": 11830848, + "step": 56065 + }, + { + "epoch": 6.1683168316831685, + "grad_norm": 0.02391659840941429, + "learning_rate": 4.3671158552063245e-05, + "loss": 0.0085, + "num_input_tokens_seen": 11831904, + "step": 56070 + }, + { + "epoch": 6.168866886688669, + "grad_norm": 0.08341287821531296, + "learning_rate": 4.366956242903083e-05, + "loss": 0.0211, + "num_input_tokens_seen": 11832928, + "step": 56075 + }, + { + "epoch": 6.16941694169417, + "grad_norm": 0.1332370489835739, + "learning_rate": 4.366796613393003e-05, + "loss": 0.0062, + "num_input_tokens_seen": 11833952, + "step": 56080 + }, + { + "epoch": 6.16996699669967, + "grad_norm": 0.5027431845664978, + "learning_rate": 4.366636966677553e-05, + "loss": 0.0371, + "num_input_tokens_seen": 11834976, + "step": 56085 + }, + { + "epoch": 6.17051705170517, + "grad_norm": 0.5824262499809265, + "learning_rate": 4.3664773027582065e-05, + "loss": 0.0262, + "num_input_tokens_seen": 11836032, + "step": 56090 + }, + { + "epoch": 6.171067106710671, + "grad_norm": 0.1248260885477066, + "learning_rate": 4.366317621636434e-05, + "loss": 0.0271, + "num_input_tokens_seen": 11837024, + "step": 56095 + }, + { + "epoch": 6.1716171617161715, + "grad_norm": 0.03897058218717575, + "learning_rate": 4.366157923313708e-05, + "loss": 0.094, + "num_input_tokens_seen": 11838080, + "step": 56100 + }, + { + "epoch": 6.172167216721673, + "grad_norm": 0.5794538259506226, + "learning_rate": 4.365998207791499e-05, + "loss": 0.0248, + "num_input_tokens_seen": 11839072, + "step": 56105 + }, + { + "epoch": 6.172717271727173, + "grad_norm": 0.26735934615135193, + "learning_rate": 4.365838475071281e-05, + "loss": 0.042, + "num_input_tokens_seen": 11840128, + "step": 56110 + }, + { + "epoch": 6.173267326732673, + "grad_norm": 0.8049494624137878, + "learning_rate": 4.365678725154525e-05, + "loss": 0.0273, + "num_input_tokens_seen": 11841216, + "step": 56115 + }, + { + "epoch": 6.173817381738174, + "grad_norm": 0.11932976543903351, + "learning_rate": 4.365518958042703e-05, + "loss": 0.0246, + "num_input_tokens_seen": 11842240, + "step": 56120 + }, + { + "epoch": 6.174367436743674, + "grad_norm": 1.7603906393051147, + "learning_rate": 4.365359173737288e-05, + "loss": 0.0175, + "num_input_tokens_seen": 11843296, + "step": 56125 + }, + { + "epoch": 6.174917491749175, + "grad_norm": 0.5628683567047119, + "learning_rate": 4.365199372239752e-05, + "loss": 0.0477, + "num_input_tokens_seen": 11844352, + "step": 56130 + }, + { + "epoch": 6.175467546754676, + "grad_norm": 0.35720428824424744, + "learning_rate": 4.3650395535515694e-05, + "loss": 0.0193, + "num_input_tokens_seen": 11845472, + "step": 56135 + }, + { + "epoch": 6.176017601760176, + "grad_norm": 0.01660631038248539, + "learning_rate": 4.364879717674212e-05, + "loss": 0.0126, + "num_input_tokens_seen": 11846560, + "step": 56140 + }, + { + "epoch": 6.176567656765677, + "grad_norm": 0.0599333755671978, + "learning_rate": 4.3647198646091523e-05, + "loss": 0.0043, + "num_input_tokens_seen": 11847552, + "step": 56145 + }, + { + "epoch": 6.177117711771177, + "grad_norm": 0.06891845911741257, + "learning_rate": 4.3645599943578655e-05, + "loss": 0.0041, + "num_input_tokens_seen": 11848608, + "step": 56150 + }, + { + "epoch": 6.177667766776677, + "grad_norm": 1.8625131845474243, + "learning_rate": 4.3644001069218225e-05, + "loss": 0.0902, + "num_input_tokens_seen": 11849664, + "step": 56155 + }, + { + "epoch": 6.178217821782178, + "grad_norm": 0.027897516265511513, + "learning_rate": 4.3642402023025e-05, + "loss": 0.0485, + "num_input_tokens_seen": 11850624, + "step": 56160 + }, + { + "epoch": 6.178767876787679, + "grad_norm": 0.2669195234775543, + "learning_rate": 4.3640802805013685e-05, + "loss": 0.055, + "num_input_tokens_seen": 11851648, + "step": 56165 + }, + { + "epoch": 6.17931793179318, + "grad_norm": 1.2723137140274048, + "learning_rate": 4.3639203415199046e-05, + "loss": 0.0781, + "num_input_tokens_seen": 11852704, + "step": 56170 + }, + { + "epoch": 6.17986798679868, + "grad_norm": 0.13260994851589203, + "learning_rate": 4.363760385359581e-05, + "loss": 0.0149, + "num_input_tokens_seen": 11853728, + "step": 56175 + }, + { + "epoch": 6.18041804180418, + "grad_norm": 0.015671882778406143, + "learning_rate": 4.363600412021872e-05, + "loss": 0.0091, + "num_input_tokens_seen": 11854816, + "step": 56180 + }, + { + "epoch": 6.180968096809681, + "grad_norm": 1.7424874305725098, + "learning_rate": 4.3634404215082524e-05, + "loss": 0.0709, + "num_input_tokens_seen": 11855872, + "step": 56185 + }, + { + "epoch": 6.181518151815181, + "grad_norm": 0.1769144982099533, + "learning_rate": 4.363280413820197e-05, + "loss": 0.0309, + "num_input_tokens_seen": 11856896, + "step": 56190 + }, + { + "epoch": 6.1820682068206825, + "grad_norm": 0.07241547107696533, + "learning_rate": 4.36312038895918e-05, + "loss": 0.038, + "num_input_tokens_seen": 11857984, + "step": 56195 + }, + { + "epoch": 6.182618261826183, + "grad_norm": 0.2534622550010681, + "learning_rate": 4.362960346926676e-05, + "loss": 0.1014, + "num_input_tokens_seen": 11859008, + "step": 56200 + }, + { + "epoch": 6.183168316831683, + "grad_norm": 0.01714867725968361, + "learning_rate": 4.36280028772416e-05, + "loss": 0.1293, + "num_input_tokens_seen": 11860064, + "step": 56205 + }, + { + "epoch": 6.183718371837184, + "grad_norm": 0.03610323369503021, + "learning_rate": 4.362640211353109e-05, + "loss": 0.0867, + "num_input_tokens_seen": 11861184, + "step": 56210 + }, + { + "epoch": 6.184268426842684, + "grad_norm": 0.03289346024394035, + "learning_rate": 4.362480117814996e-05, + "loss": 0.0242, + "num_input_tokens_seen": 11862240, + "step": 56215 + }, + { + "epoch": 6.184818481848184, + "grad_norm": 0.0882079154253006, + "learning_rate": 4.362320007111298e-05, + "loss": 0.0084, + "num_input_tokens_seen": 11863328, + "step": 56220 + }, + { + "epoch": 6.1853685368536855, + "grad_norm": 1.1603224277496338, + "learning_rate": 4.3621598792434895e-05, + "loss": 0.1728, + "num_input_tokens_seen": 11864320, + "step": 56225 + }, + { + "epoch": 6.185918591859186, + "grad_norm": 0.28066787123680115, + "learning_rate": 4.361999734213047e-05, + "loss": 0.0577, + "num_input_tokens_seen": 11865344, + "step": 56230 + }, + { + "epoch": 6.186468646864687, + "grad_norm": 0.030256949365139008, + "learning_rate": 4.3618395720214465e-05, + "loss": 0.0048, + "num_input_tokens_seen": 11866464, + "step": 56235 + }, + { + "epoch": 6.187018701870187, + "grad_norm": 0.05042734742164612, + "learning_rate": 4.361679392670164e-05, + "loss": 0.0104, + "num_input_tokens_seen": 11867520, + "step": 56240 + }, + { + "epoch": 6.187568756875687, + "grad_norm": 0.007937728427350521, + "learning_rate": 4.361519196160677e-05, + "loss": 0.0126, + "num_input_tokens_seen": 11868576, + "step": 56245 + }, + { + "epoch": 6.188118811881188, + "grad_norm": 1.2476145029067993, + "learning_rate": 4.3613589824944586e-05, + "loss": 0.0565, + "num_input_tokens_seen": 11869696, + "step": 56250 + }, + { + "epoch": 6.1886688668866885, + "grad_norm": 0.05570083111524582, + "learning_rate": 4.3611987516729894e-05, + "loss": 0.0558, + "num_input_tokens_seen": 11870720, + "step": 56255 + }, + { + "epoch": 6.18921892189219, + "grad_norm": 0.12022107094526291, + "learning_rate": 4.3610385036977444e-05, + "loss": 0.0088, + "num_input_tokens_seen": 11871744, + "step": 56260 + }, + { + "epoch": 6.18976897689769, + "grad_norm": 0.6129845976829529, + "learning_rate": 4.3608782385701995e-05, + "loss": 0.0484, + "num_input_tokens_seen": 11872800, + "step": 56265 + }, + { + "epoch": 6.19031903190319, + "grad_norm": 0.011181426234543324, + "learning_rate": 4.360717956291833e-05, + "loss": 0.0349, + "num_input_tokens_seen": 11873888, + "step": 56270 + }, + { + "epoch": 6.190869086908691, + "grad_norm": 0.17934729158878326, + "learning_rate": 4.3605576568641226e-05, + "loss": 0.0435, + "num_input_tokens_seen": 11874912, + "step": 56275 + }, + { + "epoch": 6.191419141914191, + "grad_norm": 1.5596357583999634, + "learning_rate": 4.3603973402885445e-05, + "loss": 0.1169, + "num_input_tokens_seen": 11875936, + "step": 56280 + }, + { + "epoch": 6.191969196919692, + "grad_norm": 0.012726807966828346, + "learning_rate": 4.3602370065665774e-05, + "loss": 0.0756, + "num_input_tokens_seen": 11876992, + "step": 56285 + }, + { + "epoch": 6.192519251925193, + "grad_norm": 0.10065596550703049, + "learning_rate": 4.3600766556996985e-05, + "loss": 0.0089, + "num_input_tokens_seen": 11878016, + "step": 56290 + }, + { + "epoch": 6.193069306930693, + "grad_norm": 0.014183282852172852, + "learning_rate": 4.359916287689385e-05, + "loss": 0.0215, + "num_input_tokens_seen": 11879008, + "step": 56295 + }, + { + "epoch": 6.193619361936194, + "grad_norm": 0.00878976285457611, + "learning_rate": 4.3597559025371156e-05, + "loss": 0.0082, + "num_input_tokens_seen": 11880064, + "step": 56300 + }, + { + "epoch": 6.194169416941694, + "grad_norm": 0.018563706427812576, + "learning_rate": 4.359595500244369e-05, + "loss": 0.0133, + "num_input_tokens_seen": 11881152, + "step": 56305 + }, + { + "epoch": 6.194719471947194, + "grad_norm": 1.3540934324264526, + "learning_rate": 4.3594350808126224e-05, + "loss": 0.0347, + "num_input_tokens_seen": 11882208, + "step": 56310 + }, + { + "epoch": 6.195269526952695, + "grad_norm": 2.635856866836548, + "learning_rate": 4.359274644243355e-05, + "loss": 0.0695, + "num_input_tokens_seen": 11883264, + "step": 56315 + }, + { + "epoch": 6.195819581958196, + "grad_norm": 0.017102185636758804, + "learning_rate": 4.359114190538045e-05, + "loss": 0.052, + "num_input_tokens_seen": 11884320, + "step": 56320 + }, + { + "epoch": 6.196369636963697, + "grad_norm": 0.08597025275230408, + "learning_rate": 4.358953719698172e-05, + "loss": 0.0162, + "num_input_tokens_seen": 11885376, + "step": 56325 + }, + { + "epoch": 6.196919691969197, + "grad_norm": 2.5394625663757324, + "learning_rate": 4.358793231725215e-05, + "loss": 0.0877, + "num_input_tokens_seen": 11886464, + "step": 56330 + }, + { + "epoch": 6.197469746974697, + "grad_norm": 0.02092304825782776, + "learning_rate": 4.358632726620653e-05, + "loss": 0.0254, + "num_input_tokens_seen": 11887520, + "step": 56335 + }, + { + "epoch": 6.198019801980198, + "grad_norm": 1.6802269220352173, + "learning_rate": 4.358472204385964e-05, + "loss": 0.1423, + "num_input_tokens_seen": 11888608, + "step": 56340 + }, + { + "epoch": 6.198569856985698, + "grad_norm": 1.285110592842102, + "learning_rate": 4.358311665022629e-05, + "loss": 0.0756, + "num_input_tokens_seen": 11889728, + "step": 56345 + }, + { + "epoch": 6.1991199119911995, + "grad_norm": 0.1438116580247879, + "learning_rate": 4.358151108532127e-05, + "loss": 0.0094, + "num_input_tokens_seen": 11890688, + "step": 56350 + }, + { + "epoch": 6.1996699669967, + "grad_norm": 0.012191537767648697, + "learning_rate": 4.357990534915938e-05, + "loss": 0.0419, + "num_input_tokens_seen": 11891712, + "step": 56355 + }, + { + "epoch": 6.2002200220022, + "grad_norm": 1.2865663766860962, + "learning_rate": 4.357829944175541e-05, + "loss": 0.1897, + "num_input_tokens_seen": 11892704, + "step": 56360 + }, + { + "epoch": 6.200770077007701, + "grad_norm": 0.03252352774143219, + "learning_rate": 4.3576693363124176e-05, + "loss": 0.0403, + "num_input_tokens_seen": 11893728, + "step": 56365 + }, + { + "epoch": 6.201320132013201, + "grad_norm": 0.14712245762348175, + "learning_rate": 4.357508711328047e-05, + "loss": 0.0375, + "num_input_tokens_seen": 11894752, + "step": 56370 + }, + { + "epoch": 6.201870187018702, + "grad_norm": 0.013056378811597824, + "learning_rate": 4.3573480692239097e-05, + "loss": 0.0088, + "num_input_tokens_seen": 11895808, + "step": 56375 + }, + { + "epoch": 6.2024202420242025, + "grad_norm": 1.3159829378128052, + "learning_rate": 4.3571874100014876e-05, + "loss": 0.0395, + "num_input_tokens_seen": 11896864, + "step": 56380 + }, + { + "epoch": 6.202970297029703, + "grad_norm": 0.2586839199066162, + "learning_rate": 4.35702673366226e-05, + "loss": 0.0725, + "num_input_tokens_seen": 11897920, + "step": 56385 + }, + { + "epoch": 6.203520352035204, + "grad_norm": 0.04919402673840523, + "learning_rate": 4.356866040207707e-05, + "loss": 0.0406, + "num_input_tokens_seen": 11899008, + "step": 56390 + }, + { + "epoch": 6.204070407040704, + "grad_norm": 0.07950630784034729, + "learning_rate": 4.3567053296393115e-05, + "loss": 0.0226, + "num_input_tokens_seen": 11900096, + "step": 56395 + }, + { + "epoch": 6.204620462046204, + "grad_norm": 0.9247082471847534, + "learning_rate": 4.3565446019585544e-05, + "loss": 0.0354, + "num_input_tokens_seen": 11901152, + "step": 56400 + }, + { + "epoch": 6.205170517051705, + "grad_norm": 0.019130956381559372, + "learning_rate": 4.356383857166916e-05, + "loss": 0.0714, + "num_input_tokens_seen": 11902240, + "step": 56405 + }, + { + "epoch": 6.2057205720572055, + "grad_norm": 0.3906427025794983, + "learning_rate": 4.356223095265878e-05, + "loss": 0.0678, + "num_input_tokens_seen": 11903360, + "step": 56410 + }, + { + "epoch": 6.206270627062707, + "grad_norm": 0.010047596879303455, + "learning_rate": 4.3560623162569225e-05, + "loss": 0.0078, + "num_input_tokens_seen": 11904384, + "step": 56415 + }, + { + "epoch": 6.206820682068207, + "grad_norm": 0.10323233902454376, + "learning_rate": 4.3559015201415316e-05, + "loss": 0.0515, + "num_input_tokens_seen": 11905440, + "step": 56420 + }, + { + "epoch": 6.207370737073707, + "grad_norm": 0.0741005539894104, + "learning_rate": 4.355740706921187e-05, + "loss": 0.0302, + "num_input_tokens_seen": 11906592, + "step": 56425 + }, + { + "epoch": 6.207920792079208, + "grad_norm": 0.013058086857199669, + "learning_rate": 4.3555798765973705e-05, + "loss": 0.0091, + "num_input_tokens_seen": 11907712, + "step": 56430 + }, + { + "epoch": 6.208470847084708, + "grad_norm": 0.4140453040599823, + "learning_rate": 4.355419029171565e-05, + "loss": 0.0469, + "num_input_tokens_seen": 11908800, + "step": 56435 + }, + { + "epoch": 6.209020902090209, + "grad_norm": 0.8685168027877808, + "learning_rate": 4.3552581646452525e-05, + "loss": 0.0745, + "num_input_tokens_seen": 11909856, + "step": 56440 + }, + { + "epoch": 6.20957095709571, + "grad_norm": 0.08357524126768112, + "learning_rate": 4.355097283019916e-05, + "loss": 0.0283, + "num_input_tokens_seen": 11910976, + "step": 56445 + }, + { + "epoch": 6.21012101210121, + "grad_norm": 0.020677367225289345, + "learning_rate": 4.354936384297037e-05, + "loss": 0.087, + "num_input_tokens_seen": 11912096, + "step": 56450 + }, + { + "epoch": 6.210671067106711, + "grad_norm": 0.362705796957016, + "learning_rate": 4.3547754684781005e-05, + "loss": 0.062, + "num_input_tokens_seen": 11913152, + "step": 56455 + }, + { + "epoch": 6.211221122112211, + "grad_norm": 1.1297839879989624, + "learning_rate": 4.354614535564588e-05, + "loss": 0.0871, + "num_input_tokens_seen": 11914176, + "step": 56460 + }, + { + "epoch": 6.211771177117711, + "grad_norm": 0.0577472485601902, + "learning_rate": 4.354453585557984e-05, + "loss": 0.0274, + "num_input_tokens_seen": 11915232, + "step": 56465 + }, + { + "epoch": 6.212321232123212, + "grad_norm": 0.007711105979979038, + "learning_rate": 4.3542926184597696e-05, + "loss": 0.0171, + "num_input_tokens_seen": 11916256, + "step": 56470 + }, + { + "epoch": 6.212871287128713, + "grad_norm": 0.019196946173906326, + "learning_rate": 4.3541316342714314e-05, + "loss": 0.0684, + "num_input_tokens_seen": 11917280, + "step": 56475 + }, + { + "epoch": 6.213421342134214, + "grad_norm": 0.7845119833946228, + "learning_rate": 4.353970632994451e-05, + "loss": 0.0464, + "num_input_tokens_seen": 11918304, + "step": 56480 + }, + { + "epoch": 6.213971397139714, + "grad_norm": 2.130680799484253, + "learning_rate": 4.3538096146303134e-05, + "loss": 0.1896, + "num_input_tokens_seen": 11919328, + "step": 56485 + }, + { + "epoch": 6.214521452145214, + "grad_norm": 0.06851165741682053, + "learning_rate": 4.3536485791805015e-05, + "loss": 0.0845, + "num_input_tokens_seen": 11920416, + "step": 56490 + }, + { + "epoch": 6.215071507150715, + "grad_norm": 0.08212756365537643, + "learning_rate": 4.3534875266465005e-05, + "loss": 0.0685, + "num_input_tokens_seen": 11921504, + "step": 56495 + }, + { + "epoch": 6.215621562156215, + "grad_norm": 0.012534691020846367, + "learning_rate": 4.353326457029794e-05, + "loss": 0.1318, + "num_input_tokens_seen": 11922592, + "step": 56500 + }, + { + "epoch": 6.2161716171617165, + "grad_norm": 0.10223782807588577, + "learning_rate": 4.353165370331867e-05, + "loss": 0.0163, + "num_input_tokens_seen": 11923744, + "step": 56505 + }, + { + "epoch": 6.216721672167217, + "grad_norm": 0.37887144088745117, + "learning_rate": 4.353004266554204e-05, + "loss": 0.0167, + "num_input_tokens_seen": 11924704, + "step": 56510 + }, + { + "epoch": 6.217271727172717, + "grad_norm": 0.5857949256896973, + "learning_rate": 4.35284314569829e-05, + "loss": 0.0366, + "num_input_tokens_seen": 11925696, + "step": 56515 + }, + { + "epoch": 6.217821782178218, + "grad_norm": 0.11315315216779709, + "learning_rate": 4.3526820077656095e-05, + "loss": 0.0158, + "num_input_tokens_seen": 11926720, + "step": 56520 + }, + { + "epoch": 6.218371837183718, + "grad_norm": 0.2152009755373001, + "learning_rate": 4.352520852757648e-05, + "loss": 0.0418, + "num_input_tokens_seen": 11927744, + "step": 56525 + }, + { + "epoch": 6.218921892189219, + "grad_norm": 1.2939577102661133, + "learning_rate": 4.3523596806758916e-05, + "loss": 0.1518, + "num_input_tokens_seen": 11928800, + "step": 56530 + }, + { + "epoch": 6.2194719471947195, + "grad_norm": 0.4663042426109314, + "learning_rate": 4.3521984915218236e-05, + "loss": 0.135, + "num_input_tokens_seen": 11929888, + "step": 56535 + }, + { + "epoch": 6.22002200220022, + "grad_norm": 0.1390034258365631, + "learning_rate": 4.3520372852969314e-05, + "loss": 0.0531, + "num_input_tokens_seen": 11930912, + "step": 56540 + }, + { + "epoch": 6.220572057205721, + "grad_norm": 0.020067747682332993, + "learning_rate": 4.3518760620027e-05, + "loss": 0.0225, + "num_input_tokens_seen": 11932032, + "step": 56545 + }, + { + "epoch": 6.221122112211221, + "grad_norm": 0.08952867984771729, + "learning_rate": 4.351714821640616e-05, + "loss": 0.0784, + "num_input_tokens_seen": 11933184, + "step": 56550 + }, + { + "epoch": 6.221672167216722, + "grad_norm": 0.7230958342552185, + "learning_rate": 4.351553564212165e-05, + "loss": 0.1301, + "num_input_tokens_seen": 11934208, + "step": 56555 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.07689347118139267, + "learning_rate": 4.351392289718833e-05, + "loss": 0.015, + "num_input_tokens_seen": 11935264, + "step": 56560 + }, + { + "epoch": 6.2227722772277225, + "grad_norm": 0.10822968930006027, + "learning_rate": 4.3512309981621066e-05, + "loss": 0.0047, + "num_input_tokens_seen": 11936320, + "step": 56565 + }, + { + "epoch": 6.223322332233224, + "grad_norm": 0.02081383392214775, + "learning_rate": 4.3510696895434726e-05, + "loss": 0.0177, + "num_input_tokens_seen": 11937312, + "step": 56570 + }, + { + "epoch": 6.223872387238724, + "grad_norm": 0.09963971376419067, + "learning_rate": 4.3509083638644174e-05, + "loss": 0.132, + "num_input_tokens_seen": 11938368, + "step": 56575 + }, + { + "epoch": 6.224422442244224, + "grad_norm": 1.335394263267517, + "learning_rate": 4.3507470211264275e-05, + "loss": 0.0887, + "num_input_tokens_seen": 11939424, + "step": 56580 + }, + { + "epoch": 6.224972497249725, + "grad_norm": 0.3520393669605255, + "learning_rate": 4.350585661330991e-05, + "loss": 0.0298, + "num_input_tokens_seen": 11940544, + "step": 56585 + }, + { + "epoch": 6.225522552255225, + "grad_norm": 0.3804294168949127, + "learning_rate": 4.350424284479595e-05, + "loss": 0.0212, + "num_input_tokens_seen": 11941536, + "step": 56590 + }, + { + "epoch": 6.226072607260726, + "grad_norm": 0.55677330493927, + "learning_rate": 4.350262890573725e-05, + "loss": 0.0341, + "num_input_tokens_seen": 11942656, + "step": 56595 + }, + { + "epoch": 6.226622662266227, + "grad_norm": 0.6695677042007446, + "learning_rate": 4.350101479614871e-05, + "loss": 0.0383, + "num_input_tokens_seen": 11943712, + "step": 56600 + }, + { + "epoch": 6.227172717271727, + "grad_norm": 0.9432085156440735, + "learning_rate": 4.3499400516045186e-05, + "loss": 0.0292, + "num_input_tokens_seen": 11944704, + "step": 56605 + }, + { + "epoch": 6.227722772277228, + "grad_norm": 1.9654723405838013, + "learning_rate": 4.349778606544157e-05, + "loss": 0.1124, + "num_input_tokens_seen": 11945824, + "step": 56610 + }, + { + "epoch": 6.228272827282728, + "grad_norm": 0.039655134081840515, + "learning_rate": 4.349617144435273e-05, + "loss": 0.0144, + "num_input_tokens_seen": 11946880, + "step": 56615 + }, + { + "epoch": 6.228822882288229, + "grad_norm": 0.3654906749725342, + "learning_rate": 4.3494556652793556e-05, + "loss": 0.0273, + "num_input_tokens_seen": 11947936, + "step": 56620 + }, + { + "epoch": 6.229372937293729, + "grad_norm": 0.009989027865231037, + "learning_rate": 4.349294169077893e-05, + "loss": 0.0566, + "num_input_tokens_seen": 11948992, + "step": 56625 + }, + { + "epoch": 6.22992299229923, + "grad_norm": 0.061376191675662994, + "learning_rate": 4.349132655832373e-05, + "loss": 0.0042, + "num_input_tokens_seen": 11950048, + "step": 56630 + }, + { + "epoch": 6.230473047304731, + "grad_norm": 0.6311003565788269, + "learning_rate": 4.3489711255442844e-05, + "loss": 0.0542, + "num_input_tokens_seen": 11951072, + "step": 56635 + }, + { + "epoch": 6.231023102310231, + "grad_norm": 0.03685726225376129, + "learning_rate": 4.348809578215117e-05, + "loss": 0.1025, + "num_input_tokens_seen": 11952128, + "step": 56640 + }, + { + "epoch": 6.231573157315731, + "grad_norm": 0.17437878251075745, + "learning_rate": 4.348648013846358e-05, + "loss": 0.1255, + "num_input_tokens_seen": 11953216, + "step": 56645 + }, + { + "epoch": 6.232123212321232, + "grad_norm": 0.4332815110683441, + "learning_rate": 4.348486432439498e-05, + "loss": 0.0364, + "num_input_tokens_seen": 11954240, + "step": 56650 + }, + { + "epoch": 6.232673267326732, + "grad_norm": 0.08796083182096481, + "learning_rate": 4.348324833996025e-05, + "loss": 0.0647, + "num_input_tokens_seen": 11955296, + "step": 56655 + }, + { + "epoch": 6.2332233223322335, + "grad_norm": 0.06783730536699295, + "learning_rate": 4.348163218517429e-05, + "loss": 0.0413, + "num_input_tokens_seen": 11956320, + "step": 56660 + }, + { + "epoch": 6.233773377337734, + "grad_norm": 1.1366767883300781, + "learning_rate": 4.3480015860052004e-05, + "loss": 0.07, + "num_input_tokens_seen": 11957312, + "step": 56665 + }, + { + "epoch": 6.234323432343234, + "grad_norm": 0.33737024664878845, + "learning_rate": 4.347839936460827e-05, + "loss": 0.0909, + "num_input_tokens_seen": 11958368, + "step": 56670 + }, + { + "epoch": 6.234873487348735, + "grad_norm": 1.5383797883987427, + "learning_rate": 4.3476782698858e-05, + "loss": 0.2219, + "num_input_tokens_seen": 11959424, + "step": 56675 + }, + { + "epoch": 6.235423542354235, + "grad_norm": 1.0375206470489502, + "learning_rate": 4.347516586281608e-05, + "loss": 0.0984, + "num_input_tokens_seen": 11960512, + "step": 56680 + }, + { + "epoch": 6.235973597359736, + "grad_norm": 0.07585285604000092, + "learning_rate": 4.347354885649744e-05, + "loss": 0.036, + "num_input_tokens_seen": 11961600, + "step": 56685 + }, + { + "epoch": 6.2365236523652365, + "grad_norm": 0.21883361041545868, + "learning_rate": 4.347193167991694e-05, + "loss": 0.0196, + "num_input_tokens_seen": 11962624, + "step": 56690 + }, + { + "epoch": 6.237073707370737, + "grad_norm": 0.24094727635383606, + "learning_rate": 4.3470314333089534e-05, + "loss": 0.0624, + "num_input_tokens_seen": 11963616, + "step": 56695 + }, + { + "epoch": 6.237623762376238, + "grad_norm": 0.018044423311948776, + "learning_rate": 4.3468696816030086e-05, + "loss": 0.0116, + "num_input_tokens_seen": 11964672, + "step": 56700 + }, + { + "epoch": 6.238173817381738, + "grad_norm": 0.2477094978094101, + "learning_rate": 4.3467079128753526e-05, + "loss": 0.019, + "num_input_tokens_seen": 11965728, + "step": 56705 + }, + { + "epoch": 6.238723872387239, + "grad_norm": 0.05834139510989189, + "learning_rate": 4.346546127127477e-05, + "loss": 0.0231, + "num_input_tokens_seen": 11966816, + "step": 56710 + }, + { + "epoch": 6.239273927392739, + "grad_norm": 0.07693404704332352, + "learning_rate": 4.34638432436087e-05, + "loss": 0.0194, + "num_input_tokens_seen": 11967872, + "step": 56715 + }, + { + "epoch": 6.2398239823982395, + "grad_norm": 0.13015508651733398, + "learning_rate": 4.3462225045770266e-05, + "loss": 0.01, + "num_input_tokens_seen": 11968896, + "step": 56720 + }, + { + "epoch": 6.240374037403741, + "grad_norm": 0.08681260794401169, + "learning_rate": 4.346060667777435e-05, + "loss": 0.0366, + "num_input_tokens_seen": 11969984, + "step": 56725 + }, + { + "epoch": 6.240924092409241, + "grad_norm": 0.016110150143504143, + "learning_rate": 4.345898813963588e-05, + "loss": 0.0495, + "num_input_tokens_seen": 11971040, + "step": 56730 + }, + { + "epoch": 6.241474147414741, + "grad_norm": 0.030551735311746597, + "learning_rate": 4.3457369431369774e-05, + "loss": 0.0525, + "num_input_tokens_seen": 11972192, + "step": 56735 + }, + { + "epoch": 6.242024202420242, + "grad_norm": 0.04868775233626366, + "learning_rate": 4.345575055299096e-05, + "loss": 0.0272, + "num_input_tokens_seen": 11973216, + "step": 56740 + }, + { + "epoch": 6.242574257425742, + "grad_norm": 0.03008200041949749, + "learning_rate": 4.3454131504514344e-05, + "loss": 0.0144, + "num_input_tokens_seen": 11974304, + "step": 56745 + }, + { + "epoch": 6.243124312431243, + "grad_norm": 0.4126456081867218, + "learning_rate": 4.345251228595485e-05, + "loss": 0.0226, + "num_input_tokens_seen": 11975296, + "step": 56750 + }, + { + "epoch": 6.243674367436744, + "grad_norm": 0.018757106736302376, + "learning_rate": 4.345089289732741e-05, + "loss": 0.0304, + "num_input_tokens_seen": 11976352, + "step": 56755 + }, + { + "epoch": 6.244224422442244, + "grad_norm": 0.010963600128889084, + "learning_rate": 4.344927333864694e-05, + "loss": 0.0413, + "num_input_tokens_seen": 11977408, + "step": 56760 + }, + { + "epoch": 6.244774477447745, + "grad_norm": 0.43343478441238403, + "learning_rate": 4.3447653609928366e-05, + "loss": 0.0124, + "num_input_tokens_seen": 11978496, + "step": 56765 + }, + { + "epoch": 6.245324532453245, + "grad_norm": 0.9545229077339172, + "learning_rate": 4.344603371118663e-05, + "loss": 0.0536, + "num_input_tokens_seen": 11979552, + "step": 56770 + }, + { + "epoch": 6.245874587458746, + "grad_norm": 0.13173896074295044, + "learning_rate": 4.344441364243665e-05, + "loss": 0.0334, + "num_input_tokens_seen": 11980640, + "step": 56775 + }, + { + "epoch": 6.2464246424642464, + "grad_norm": 0.4977859556674957, + "learning_rate": 4.344279340369336e-05, + "loss": 0.0572, + "num_input_tokens_seen": 11981760, + "step": 56780 + }, + { + "epoch": 6.246974697469747, + "grad_norm": 0.03754860907793045, + "learning_rate": 4.3441172994971693e-05, + "loss": 0.0876, + "num_input_tokens_seen": 11982784, + "step": 56785 + }, + { + "epoch": 6.247524752475248, + "grad_norm": 0.9229552745819092, + "learning_rate": 4.343955241628658e-05, + "loss": 0.194, + "num_input_tokens_seen": 11983808, + "step": 56790 + }, + { + "epoch": 6.248074807480748, + "grad_norm": 0.012804843485355377, + "learning_rate": 4.3437931667652956e-05, + "loss": 0.0795, + "num_input_tokens_seen": 11984864, + "step": 56795 + }, + { + "epoch": 6.248624862486249, + "grad_norm": 1.9868847131729126, + "learning_rate": 4.3436310749085774e-05, + "loss": 0.0986, + "num_input_tokens_seen": 11985920, + "step": 56800 + }, + { + "epoch": 6.249174917491749, + "grad_norm": 1.0933170318603516, + "learning_rate": 4.343468966059996e-05, + "loss": 0.0829, + "num_input_tokens_seen": 11986976, + "step": 56805 + }, + { + "epoch": 6.2497249724972495, + "grad_norm": 0.032130926847457886, + "learning_rate": 4.343306840221045e-05, + "loss": 0.0618, + "num_input_tokens_seen": 11988000, + "step": 56810 + }, + { + "epoch": 6.2502750275027505, + "grad_norm": 0.873300313949585, + "learning_rate": 4.34314469739322e-05, + "loss": 0.0276, + "num_input_tokens_seen": 11989088, + "step": 56815 + }, + { + "epoch": 6.250825082508251, + "grad_norm": 0.6223634481430054, + "learning_rate": 4.342982537578014e-05, + "loss": 0.034, + "num_input_tokens_seen": 11990144, + "step": 56820 + }, + { + "epoch": 6.251375137513751, + "grad_norm": 0.7015300393104553, + "learning_rate": 4.3428203607769233e-05, + "loss": 0.0275, + "num_input_tokens_seen": 11991200, + "step": 56825 + }, + { + "epoch": 6.251925192519252, + "grad_norm": 1.7312361001968384, + "learning_rate": 4.342658166991441e-05, + "loss": 0.035, + "num_input_tokens_seen": 11992288, + "step": 56830 + }, + { + "epoch": 6.252475247524752, + "grad_norm": 0.1264597475528717, + "learning_rate": 4.342495956223063e-05, + "loss": 0.0095, + "num_input_tokens_seen": 11993312, + "step": 56835 + }, + { + "epoch": 6.253025302530253, + "grad_norm": 0.14315882325172424, + "learning_rate": 4.342333728473283e-05, + "loss": 0.0098, + "num_input_tokens_seen": 11994400, + "step": 56840 + }, + { + "epoch": 6.2535753575357536, + "grad_norm": 0.11478797346353531, + "learning_rate": 4.3421714837435986e-05, + "loss": 0.0089, + "num_input_tokens_seen": 11995456, + "step": 56845 + }, + { + "epoch": 6.254125412541254, + "grad_norm": 0.016188882291316986, + "learning_rate": 4.3420092220355024e-05, + "loss": 0.0092, + "num_input_tokens_seen": 11996544, + "step": 56850 + }, + { + "epoch": 6.254675467546755, + "grad_norm": 0.10863932222127914, + "learning_rate": 4.3418469433504916e-05, + "loss": 0.022, + "num_input_tokens_seen": 11997632, + "step": 56855 + }, + { + "epoch": 6.255225522552255, + "grad_norm": 0.17595170438289642, + "learning_rate": 4.341684647690061e-05, + "loss": 0.0364, + "num_input_tokens_seen": 11998688, + "step": 56860 + }, + { + "epoch": 6.255775577557756, + "grad_norm": 0.12000376731157303, + "learning_rate": 4.3415223350557074e-05, + "loss": 0.0322, + "num_input_tokens_seen": 11999776, + "step": 56865 + }, + { + "epoch": 6.256325632563256, + "grad_norm": 0.02508675865828991, + "learning_rate": 4.341360005448925e-05, + "loss": 0.0165, + "num_input_tokens_seen": 12000800, + "step": 56870 + }, + { + "epoch": 6.256875687568757, + "grad_norm": 0.05862418934702873, + "learning_rate": 4.3411976588712115e-05, + "loss": 0.003, + "num_input_tokens_seen": 12001920, + "step": 56875 + }, + { + "epoch": 6.257425742574258, + "grad_norm": 0.05761734023690224, + "learning_rate": 4.341035295324063e-05, + "loss": 0.0417, + "num_input_tokens_seen": 12003008, + "step": 56880 + }, + { + "epoch": 6.257975797579758, + "grad_norm": 0.16510149836540222, + "learning_rate": 4.3408729148089754e-05, + "loss": 0.0581, + "num_input_tokens_seen": 12004064, + "step": 56885 + }, + { + "epoch": 6.258525852585258, + "grad_norm": 0.013903837651014328, + "learning_rate": 4.3407105173274454e-05, + "loss": 0.0185, + "num_input_tokens_seen": 12005152, + "step": 56890 + }, + { + "epoch": 6.259075907590759, + "grad_norm": 0.5019019842147827, + "learning_rate": 4.3405481028809696e-05, + "loss": 0.0562, + "num_input_tokens_seen": 12006176, + "step": 56895 + }, + { + "epoch": 6.259625962596259, + "grad_norm": 0.36005493998527527, + "learning_rate": 4.3403856714710456e-05, + "loss": 0.0216, + "num_input_tokens_seen": 12007328, + "step": 56900 + }, + { + "epoch": 6.2601760176017605, + "grad_norm": 1.7881008386611938, + "learning_rate": 4.340223223099169e-05, + "loss": 0.038, + "num_input_tokens_seen": 12008448, + "step": 56905 + }, + { + "epoch": 6.260726072607261, + "grad_norm": 0.08931935578584671, + "learning_rate": 4.34006075776684e-05, + "loss": 0.1171, + "num_input_tokens_seen": 12009568, + "step": 56910 + }, + { + "epoch": 6.261276127612761, + "grad_norm": 0.7868647575378418, + "learning_rate": 4.339898275475552e-05, + "loss": 0.1003, + "num_input_tokens_seen": 12010624, + "step": 56915 + }, + { + "epoch": 6.261826182618262, + "grad_norm": 0.2955038547515869, + "learning_rate": 4.339735776226805e-05, + "loss": 0.0199, + "num_input_tokens_seen": 12011680, + "step": 56920 + }, + { + "epoch": 6.262376237623762, + "grad_norm": 0.07511456310749054, + "learning_rate": 4.339573260022096e-05, + "loss": 0.0174, + "num_input_tokens_seen": 12012800, + "step": 56925 + }, + { + "epoch": 6.262926292629263, + "grad_norm": 0.1465156376361847, + "learning_rate": 4.339410726862924e-05, + "loss": 0.0039, + "num_input_tokens_seen": 12013856, + "step": 56930 + }, + { + "epoch": 6.2634763476347635, + "grad_norm": 1.0464341640472412, + "learning_rate": 4.339248176750784e-05, + "loss": 0.0535, + "num_input_tokens_seen": 12014880, + "step": 56935 + }, + { + "epoch": 6.264026402640264, + "grad_norm": 0.46836328506469727, + "learning_rate": 4.339085609687178e-05, + "loss": 0.0172, + "num_input_tokens_seen": 12015936, + "step": 56940 + }, + { + "epoch": 6.264576457645765, + "grad_norm": 0.12238676100969315, + "learning_rate": 4.3389230256736015e-05, + "loss": 0.0214, + "num_input_tokens_seen": 12016928, + "step": 56945 + }, + { + "epoch": 6.265126512651265, + "grad_norm": 0.003543859114870429, + "learning_rate": 4.338760424711554e-05, + "loss": 0.0234, + "num_input_tokens_seen": 12017952, + "step": 56950 + }, + { + "epoch": 6.265676567656766, + "grad_norm": 0.5345079898834229, + "learning_rate": 4.338597806802534e-05, + "loss": 0.1169, + "num_input_tokens_seen": 12018976, + "step": 56955 + }, + { + "epoch": 6.266226622662266, + "grad_norm": 0.03416184335947037, + "learning_rate": 4.33843517194804e-05, + "loss": 0.0088, + "num_input_tokens_seen": 12020000, + "step": 56960 + }, + { + "epoch": 6.2667766776677665, + "grad_norm": 0.025587795302271843, + "learning_rate": 4.3382725201495723e-05, + "loss": 0.0027, + "num_input_tokens_seen": 12021056, + "step": 56965 + }, + { + "epoch": 6.267326732673268, + "grad_norm": 0.010263744741678238, + "learning_rate": 4.3381098514086276e-05, + "loss": 0.0418, + "num_input_tokens_seen": 12022112, + "step": 56970 + }, + { + "epoch": 6.267876787678768, + "grad_norm": 1.0196759700775146, + "learning_rate": 4.337947165726707e-05, + "loss": 0.1237, + "num_input_tokens_seen": 12023200, + "step": 56975 + }, + { + "epoch": 6.268426842684269, + "grad_norm": 0.008013831451535225, + "learning_rate": 4.337784463105309e-05, + "loss": 0.0764, + "num_input_tokens_seen": 12024288, + "step": 56980 + }, + { + "epoch": 6.268976897689769, + "grad_norm": 0.29008355736732483, + "learning_rate": 4.337621743545934e-05, + "loss": 0.0106, + "num_input_tokens_seen": 12025312, + "step": 56985 + }, + { + "epoch": 6.269526952695269, + "grad_norm": 0.23247458040714264, + "learning_rate": 4.3374590070500804e-05, + "loss": 0.0672, + "num_input_tokens_seen": 12026336, + "step": 56990 + }, + { + "epoch": 6.27007700770077, + "grad_norm": 1.3189762830734253, + "learning_rate": 4.33729625361925e-05, + "loss": 0.1712, + "num_input_tokens_seen": 12027392, + "step": 56995 + }, + { + "epoch": 6.270627062706271, + "grad_norm": 1.5459315776824951, + "learning_rate": 4.3371334832549404e-05, + "loss": 0.0231, + "num_input_tokens_seen": 12028416, + "step": 57000 + }, + { + "epoch": 6.271177117711771, + "grad_norm": 0.23858791589736938, + "learning_rate": 4.336970695958653e-05, + "loss": 0.0552, + "num_input_tokens_seen": 12029408, + "step": 57005 + }, + { + "epoch": 6.271727172717272, + "grad_norm": 0.12850061058998108, + "learning_rate": 4.336807891731889e-05, + "loss": 0.0208, + "num_input_tokens_seen": 12030464, + "step": 57010 + }, + { + "epoch": 6.272277227722772, + "grad_norm": 1.271868348121643, + "learning_rate": 4.336645070576148e-05, + "loss": 0.0483, + "num_input_tokens_seen": 12031552, + "step": 57015 + }, + { + "epoch": 6.272827282728273, + "grad_norm": 0.07698666304349899, + "learning_rate": 4.33648223249293e-05, + "loss": 0.0083, + "num_input_tokens_seen": 12032576, + "step": 57020 + }, + { + "epoch": 6.273377337733773, + "grad_norm": 0.838124692440033, + "learning_rate": 4.3363193774837374e-05, + "loss": 0.0258, + "num_input_tokens_seen": 12033536, + "step": 57025 + }, + { + "epoch": 6.273927392739274, + "grad_norm": 0.028581460937857628, + "learning_rate": 4.3361565055500695e-05, + "loss": 0.0292, + "num_input_tokens_seen": 12034528, + "step": 57030 + }, + { + "epoch": 6.274477447744775, + "grad_norm": 0.029032953083515167, + "learning_rate": 4.3359936166934286e-05, + "loss": 0.0044, + "num_input_tokens_seen": 12035584, + "step": 57035 + }, + { + "epoch": 6.275027502750275, + "grad_norm": 0.009685910306870937, + "learning_rate": 4.335830710915315e-05, + "loss": 0.0187, + "num_input_tokens_seen": 12036640, + "step": 57040 + }, + { + "epoch": 6.275577557755776, + "grad_norm": 0.06467613577842712, + "learning_rate": 4.3356677882172314e-05, + "loss": 0.015, + "num_input_tokens_seen": 12037664, + "step": 57045 + }, + { + "epoch": 6.276127612761276, + "grad_norm": 0.060405973345041275, + "learning_rate": 4.3355048486006775e-05, + "loss": 0.0215, + "num_input_tokens_seen": 12038752, + "step": 57050 + }, + { + "epoch": 6.276677667766776, + "grad_norm": 0.10743720829486847, + "learning_rate": 4.335341892067157e-05, + "loss": 0.0099, + "num_input_tokens_seen": 12039808, + "step": 57055 + }, + { + "epoch": 6.2772277227722775, + "grad_norm": 0.027667183429002762, + "learning_rate": 4.3351789186181715e-05, + "loss": 0.137, + "num_input_tokens_seen": 12040800, + "step": 57060 + }, + { + "epoch": 6.277777777777778, + "grad_norm": 0.10035751014947891, + "learning_rate": 4.335015928255222e-05, + "loss": 0.1252, + "num_input_tokens_seen": 12041952, + "step": 57065 + }, + { + "epoch": 6.278327832783278, + "grad_norm": 0.00849719438701868, + "learning_rate": 4.334852920979811e-05, + "loss": 0.0222, + "num_input_tokens_seen": 12043040, + "step": 57070 + }, + { + "epoch": 6.278877887788779, + "grad_norm": 0.057984914630651474, + "learning_rate": 4.334689896793441e-05, + "loss": 0.0079, + "num_input_tokens_seen": 12044064, + "step": 57075 + }, + { + "epoch": 6.279427942794279, + "grad_norm": 0.05874629318714142, + "learning_rate": 4.334526855697615e-05, + "loss": 0.0092, + "num_input_tokens_seen": 12045088, + "step": 57080 + }, + { + "epoch": 6.27997799779978, + "grad_norm": 1.0116524696350098, + "learning_rate": 4.334363797693834e-05, + "loss": 0.0616, + "num_input_tokens_seen": 12046176, + "step": 57085 + }, + { + "epoch": 6.2805280528052805, + "grad_norm": 0.019940370693802834, + "learning_rate": 4.3342007227836046e-05, + "loss": 0.0371, + "num_input_tokens_seen": 12047232, + "step": 57090 + }, + { + "epoch": 6.281078107810781, + "grad_norm": 0.017333323135972023, + "learning_rate": 4.334037630968425e-05, + "loss": 0.0642, + "num_input_tokens_seen": 12048320, + "step": 57095 + }, + { + "epoch": 6.281628162816282, + "grad_norm": 0.9080245494842529, + "learning_rate": 4.333874522249802e-05, + "loss": 0.0406, + "num_input_tokens_seen": 12049408, + "step": 57100 + }, + { + "epoch": 6.282178217821782, + "grad_norm": 0.047067660838365555, + "learning_rate": 4.3337113966292365e-05, + "loss": 0.0217, + "num_input_tokens_seen": 12050464, + "step": 57105 + }, + { + "epoch": 6.282728272827283, + "grad_norm": 0.21264298260211945, + "learning_rate": 4.333548254108234e-05, + "loss": 0.0406, + "num_input_tokens_seen": 12051520, + "step": 57110 + }, + { + "epoch": 6.283278327832783, + "grad_norm": 1.283643364906311, + "learning_rate": 4.333385094688296e-05, + "loss": 0.0725, + "num_input_tokens_seen": 12052576, + "step": 57115 + }, + { + "epoch": 6.2838283828382835, + "grad_norm": 0.021928533911705017, + "learning_rate": 4.333221918370928e-05, + "loss": 0.0114, + "num_input_tokens_seen": 12053600, + "step": 57120 + }, + { + "epoch": 6.284378437843785, + "grad_norm": 0.048804063349962234, + "learning_rate": 4.3330587251576327e-05, + "loss": 0.0529, + "num_input_tokens_seen": 12054656, + "step": 57125 + }, + { + "epoch": 6.284928492849285, + "grad_norm": 0.4348642826080322, + "learning_rate": 4.332895515049915e-05, + "loss": 0.0521, + "num_input_tokens_seen": 12055712, + "step": 57130 + }, + { + "epoch": 6.285478547854786, + "grad_norm": 0.05564124882221222, + "learning_rate": 4.332732288049279e-05, + "loss": 0.0504, + "num_input_tokens_seen": 12056768, + "step": 57135 + }, + { + "epoch": 6.286028602860286, + "grad_norm": 0.028310034424066544, + "learning_rate": 4.3325690441572286e-05, + "loss": 0.0361, + "num_input_tokens_seen": 12057856, + "step": 57140 + }, + { + "epoch": 6.286578657865786, + "grad_norm": 0.06667683273553848, + "learning_rate": 4.3324057833752694e-05, + "loss": 0.0769, + "num_input_tokens_seen": 12058944, + "step": 57145 + }, + { + "epoch": 6.287128712871287, + "grad_norm": 0.3316408395767212, + "learning_rate": 4.3322425057049043e-05, + "loss": 0.0234, + "num_input_tokens_seen": 12059936, + "step": 57150 + }, + { + "epoch": 6.287678767876788, + "grad_norm": 0.03564300760626793, + "learning_rate": 4.33207921114764e-05, + "loss": 0.0428, + "num_input_tokens_seen": 12060992, + "step": 57155 + }, + { + "epoch": 6.288228822882289, + "grad_norm": 1.0334783792495728, + "learning_rate": 4.33191589970498e-05, + "loss": 0.0807, + "num_input_tokens_seen": 12062048, + "step": 57160 + }, + { + "epoch": 6.288778877887789, + "grad_norm": 0.15883955359458923, + "learning_rate": 4.331752571378431e-05, + "loss": 0.0871, + "num_input_tokens_seen": 12063136, + "step": 57165 + }, + { + "epoch": 6.289328932893289, + "grad_norm": 0.14490510523319244, + "learning_rate": 4.331589226169497e-05, + "loss": 0.0054, + "num_input_tokens_seen": 12064224, + "step": 57170 + }, + { + "epoch": 6.28987898789879, + "grad_norm": 0.14088225364685059, + "learning_rate": 4.331425864079685e-05, + "loss": 0.0258, + "num_input_tokens_seen": 12065312, + "step": 57175 + }, + { + "epoch": 6.29042904290429, + "grad_norm": 0.09387973695993423, + "learning_rate": 4.331262485110498e-05, + "loss": 0.011, + "num_input_tokens_seen": 12066400, + "step": 57180 + }, + { + "epoch": 6.290979097909791, + "grad_norm": 0.024245474487543106, + "learning_rate": 4.331099089263445e-05, + "loss": 0.0342, + "num_input_tokens_seen": 12067456, + "step": 57185 + }, + { + "epoch": 6.291529152915292, + "grad_norm": 0.08997755497694016, + "learning_rate": 4.330935676540029e-05, + "loss": 0.0711, + "num_input_tokens_seen": 12068544, + "step": 57190 + }, + { + "epoch": 6.292079207920792, + "grad_norm": 0.9574497938156128, + "learning_rate": 4.330772246941758e-05, + "loss": 0.036, + "num_input_tokens_seen": 12069568, + "step": 57195 + }, + { + "epoch": 6.292629262926293, + "grad_norm": 0.3422914445400238, + "learning_rate": 4.330608800470138e-05, + "loss": 0.068, + "num_input_tokens_seen": 12070656, + "step": 57200 + }, + { + "epoch": 6.293179317931793, + "grad_norm": 0.10354882478713989, + "learning_rate": 4.330445337126675e-05, + "loss": 0.0381, + "num_input_tokens_seen": 12071680, + "step": 57205 + }, + { + "epoch": 6.293729372937293, + "grad_norm": 0.08550424128770828, + "learning_rate": 4.3302818569128746e-05, + "loss": 0.0137, + "num_input_tokens_seen": 12072768, + "step": 57210 + }, + { + "epoch": 6.2942794279427945, + "grad_norm": 0.2333553433418274, + "learning_rate": 4.330118359830246e-05, + "loss": 0.0191, + "num_input_tokens_seen": 12073792, + "step": 57215 + }, + { + "epoch": 6.294829482948295, + "grad_norm": 0.1186613067984581, + "learning_rate": 4.3299548458802944e-05, + "loss": 0.0058, + "num_input_tokens_seen": 12074848, + "step": 57220 + }, + { + "epoch": 6.295379537953796, + "grad_norm": 0.23841246962547302, + "learning_rate": 4.3297913150645264e-05, + "loss": 0.1052, + "num_input_tokens_seen": 12075872, + "step": 57225 + }, + { + "epoch": 6.295929592959296, + "grad_norm": 0.11752036213874817, + "learning_rate": 4.3296277673844496e-05, + "loss": 0.1087, + "num_input_tokens_seen": 12076928, + "step": 57230 + }, + { + "epoch": 6.296479647964796, + "grad_norm": 0.4578906297683716, + "learning_rate": 4.3294642028415725e-05, + "loss": 0.0246, + "num_input_tokens_seen": 12077920, + "step": 57235 + }, + { + "epoch": 6.297029702970297, + "grad_norm": 0.1962713897228241, + "learning_rate": 4.329300621437401e-05, + "loss": 0.0549, + "num_input_tokens_seen": 12079008, + "step": 57240 + }, + { + "epoch": 6.2975797579757975, + "grad_norm": 0.10269902646541595, + "learning_rate": 4.329137023173444e-05, + "loss": 0.0251, + "num_input_tokens_seen": 12080096, + "step": 57245 + }, + { + "epoch": 6.298129812981298, + "grad_norm": 0.03452081233263016, + "learning_rate": 4.328973408051209e-05, + "loss": 0.0108, + "num_input_tokens_seen": 12081152, + "step": 57250 + }, + { + "epoch": 6.298679867986799, + "grad_norm": 0.8680613040924072, + "learning_rate": 4.328809776072203e-05, + "loss": 0.0325, + "num_input_tokens_seen": 12082176, + "step": 57255 + }, + { + "epoch": 6.299229922992299, + "grad_norm": 0.06663722544908524, + "learning_rate": 4.3286461272379355e-05, + "loss": 0.0415, + "num_input_tokens_seen": 12083168, + "step": 57260 + }, + { + "epoch": 6.2997799779978, + "grad_norm": 0.15852515399456024, + "learning_rate": 4.328482461549914e-05, + "loss": 0.0493, + "num_input_tokens_seen": 12084160, + "step": 57265 + }, + { + "epoch": 6.3003300330033, + "grad_norm": 1.5211137533187866, + "learning_rate": 4.328318779009647e-05, + "loss": 0.0821, + "num_input_tokens_seen": 12085248, + "step": 57270 + }, + { + "epoch": 6.3008800880088005, + "grad_norm": 0.5740869641304016, + "learning_rate": 4.3281550796186426e-05, + "loss": 0.0395, + "num_input_tokens_seen": 12086304, + "step": 57275 + }, + { + "epoch": 6.301430143014302, + "grad_norm": 0.02333025634288788, + "learning_rate": 4.327991363378411e-05, + "loss": 0.0052, + "num_input_tokens_seen": 12087360, + "step": 57280 + }, + { + "epoch": 6.301980198019802, + "grad_norm": 0.05414162576198578, + "learning_rate": 4.327827630290459e-05, + "loss": 0.1377, + "num_input_tokens_seen": 12088384, + "step": 57285 + }, + { + "epoch": 6.302530253025303, + "grad_norm": 0.05954709276556969, + "learning_rate": 4.3276638803562975e-05, + "loss": 0.018, + "num_input_tokens_seen": 12089504, + "step": 57290 + }, + { + "epoch": 6.303080308030803, + "grad_norm": 0.9894658327102661, + "learning_rate": 4.327500113577435e-05, + "loss": 0.0573, + "num_input_tokens_seen": 12090528, + "step": 57295 + }, + { + "epoch": 6.303630363036303, + "grad_norm": 0.010702709667384624, + "learning_rate": 4.327336329955381e-05, + "loss": 0.0097, + "num_input_tokens_seen": 12091552, + "step": 57300 + }, + { + "epoch": 6.304180418041804, + "grad_norm": 0.15387876331806183, + "learning_rate": 4.3271725294916445e-05, + "loss": 0.0306, + "num_input_tokens_seen": 12092608, + "step": 57305 + }, + { + "epoch": 6.304730473047305, + "grad_norm": 0.029747093096375465, + "learning_rate": 4.327008712187736e-05, + "loss": 0.0318, + "num_input_tokens_seen": 12093664, + "step": 57310 + }, + { + "epoch": 6.305280528052805, + "grad_norm": 0.031996797770261765, + "learning_rate": 4.326844878045164e-05, + "loss": 0.0041, + "num_input_tokens_seen": 12094784, + "step": 57315 + }, + { + "epoch": 6.305830583058306, + "grad_norm": 0.013350732624530792, + "learning_rate": 4.32668102706544e-05, + "loss": 0.0185, + "num_input_tokens_seen": 12095776, + "step": 57320 + }, + { + "epoch": 6.306380638063806, + "grad_norm": 0.0988522320985794, + "learning_rate": 4.326517159250074e-05, + "loss": 0.0145, + "num_input_tokens_seen": 12096768, + "step": 57325 + }, + { + "epoch": 6.306930693069307, + "grad_norm": 0.548603892326355, + "learning_rate": 4.326353274600575e-05, + "loss": 0.0383, + "num_input_tokens_seen": 12097792, + "step": 57330 + }, + { + "epoch": 6.307480748074807, + "grad_norm": 0.19780735671520233, + "learning_rate": 4.326189373118454e-05, + "loss": 0.0117, + "num_input_tokens_seen": 12098880, + "step": 57335 + }, + { + "epoch": 6.3080308030803085, + "grad_norm": 0.039139412343502045, + "learning_rate": 4.326025454805223e-05, + "loss": 0.0205, + "num_input_tokens_seen": 12099936, + "step": 57340 + }, + { + "epoch": 6.308580858085809, + "grad_norm": 0.060677312314510345, + "learning_rate": 4.325861519662391e-05, + "loss": 0.0595, + "num_input_tokens_seen": 12101056, + "step": 57345 + }, + { + "epoch": 6.309130913091309, + "grad_norm": 0.24474455416202545, + "learning_rate": 4.3256975676914695e-05, + "loss": 0.0191, + "num_input_tokens_seen": 12102112, + "step": 57350 + }, + { + "epoch": 6.30968096809681, + "grad_norm": 0.7346634864807129, + "learning_rate": 4.325533598893969e-05, + "loss": 0.0248, + "num_input_tokens_seen": 12103168, + "step": 57355 + }, + { + "epoch": 6.31023102310231, + "grad_norm": 0.02633919194340706, + "learning_rate": 4.325369613271403e-05, + "loss": 0.1197, + "num_input_tokens_seen": 12104160, + "step": 57360 + }, + { + "epoch": 6.31078107810781, + "grad_norm": 0.04107463359832764, + "learning_rate": 4.3252056108252795e-05, + "loss": 0.1016, + "num_input_tokens_seen": 12105184, + "step": 57365 + }, + { + "epoch": 6.3113311331133115, + "grad_norm": 0.059970736503601074, + "learning_rate": 4.3250415915571116e-05, + "loss": 0.0305, + "num_input_tokens_seen": 12106144, + "step": 57370 + }, + { + "epoch": 6.311881188118812, + "grad_norm": 0.0961347222328186, + "learning_rate": 4.324877555468412e-05, + "loss": 0.0171, + "num_input_tokens_seen": 12107200, + "step": 57375 + }, + { + "epoch": 6.312431243124313, + "grad_norm": 0.48008033633232117, + "learning_rate": 4.3247135025606914e-05, + "loss": 0.0304, + "num_input_tokens_seen": 12108224, + "step": 57380 + }, + { + "epoch": 6.312981298129813, + "grad_norm": 0.20861734449863434, + "learning_rate": 4.324549432835463e-05, + "loss": 0.0568, + "num_input_tokens_seen": 12109248, + "step": 57385 + }, + { + "epoch": 6.313531353135313, + "grad_norm": 0.18745797872543335, + "learning_rate": 4.324385346294236e-05, + "loss": 0.0051, + "num_input_tokens_seen": 12110240, + "step": 57390 + }, + { + "epoch": 6.314081408140814, + "grad_norm": 0.2745620310306549, + "learning_rate": 4.324221242938526e-05, + "loss": 0.0305, + "num_input_tokens_seen": 12111200, + "step": 57395 + }, + { + "epoch": 6.3146314631463145, + "grad_norm": 0.039175257086753845, + "learning_rate": 4.3240571227698436e-05, + "loss": 0.1356, + "num_input_tokens_seen": 12112256, + "step": 57400 + }, + { + "epoch": 6.315181518151816, + "grad_norm": 1.3616117238998413, + "learning_rate": 4.323892985789702e-05, + "loss": 0.0846, + "num_input_tokens_seen": 12113312, + "step": 57405 + }, + { + "epoch": 6.315731573157316, + "grad_norm": 0.04238266497850418, + "learning_rate": 4.3237288319996146e-05, + "loss": 0.083, + "num_input_tokens_seen": 12114304, + "step": 57410 + }, + { + "epoch": 6.316281628162816, + "grad_norm": 0.5578572750091553, + "learning_rate": 4.3235646614010926e-05, + "loss": 0.0273, + "num_input_tokens_seen": 12115424, + "step": 57415 + }, + { + "epoch": 6.316831683168317, + "grad_norm": 1.601896047592163, + "learning_rate": 4.3234004739956515e-05, + "loss": 0.1645, + "num_input_tokens_seen": 12116416, + "step": 57420 + }, + { + "epoch": 6.317381738173817, + "grad_norm": 0.16267451643943787, + "learning_rate": 4.3232362697848015e-05, + "loss": 0.1094, + "num_input_tokens_seen": 12117472, + "step": 57425 + }, + { + "epoch": 6.3179317931793175, + "grad_norm": 0.07990699261426926, + "learning_rate": 4.323072048770059e-05, + "loss": 0.0584, + "num_input_tokens_seen": 12118592, + "step": 57430 + }, + { + "epoch": 6.318481848184819, + "grad_norm": 0.42765718698501587, + "learning_rate": 4.322907810952936e-05, + "loss": 0.0215, + "num_input_tokens_seen": 12119648, + "step": 57435 + }, + { + "epoch": 6.319031903190319, + "grad_norm": 0.21810904145240784, + "learning_rate": 4.322743556334946e-05, + "loss": 0.06, + "num_input_tokens_seen": 12120672, + "step": 57440 + }, + { + "epoch": 6.31958195819582, + "grad_norm": 0.0425146259367466, + "learning_rate": 4.3225792849176036e-05, + "loss": 0.0076, + "num_input_tokens_seen": 12121664, + "step": 57445 + }, + { + "epoch": 6.32013201320132, + "grad_norm": 0.047856491059064865, + "learning_rate": 4.3224149967024216e-05, + "loss": 0.1102, + "num_input_tokens_seen": 12122720, + "step": 57450 + }, + { + "epoch": 6.32068206820682, + "grad_norm": 0.02528955787420273, + "learning_rate": 4.3222506916909165e-05, + "loss": 0.0355, + "num_input_tokens_seen": 12123712, + "step": 57455 + }, + { + "epoch": 6.321232123212321, + "grad_norm": 0.3343714773654938, + "learning_rate": 4.3220863698846e-05, + "loss": 0.0913, + "num_input_tokens_seen": 12124768, + "step": 57460 + }, + { + "epoch": 6.321782178217822, + "grad_norm": 0.14570200443267822, + "learning_rate": 4.321922031284989e-05, + "loss": 0.0501, + "num_input_tokens_seen": 12125824, + "step": 57465 + }, + { + "epoch": 6.322332233223323, + "grad_norm": 0.022095346823334694, + "learning_rate": 4.321757675893596e-05, + "loss": 0.049, + "num_input_tokens_seen": 12126944, + "step": 57470 + }, + { + "epoch": 6.322882288228823, + "grad_norm": 0.0898425504565239, + "learning_rate": 4.321593303711937e-05, + "loss": 0.0578, + "num_input_tokens_seen": 12128000, + "step": 57475 + }, + { + "epoch": 6.323432343234323, + "grad_norm": 0.03068418800830841, + "learning_rate": 4.321428914741526e-05, + "loss": 0.0309, + "num_input_tokens_seen": 12129088, + "step": 57480 + }, + { + "epoch": 6.323982398239824, + "grad_norm": 0.1999099999666214, + "learning_rate": 4.321264508983879e-05, + "loss": 0.0181, + "num_input_tokens_seen": 12130144, + "step": 57485 + }, + { + "epoch": 6.324532453245324, + "grad_norm": 0.15474833548069, + "learning_rate": 4.3211000864405115e-05, + "loss": 0.0205, + "num_input_tokens_seen": 12131232, + "step": 57490 + }, + { + "epoch": 6.325082508250825, + "grad_norm": 0.3531326353549957, + "learning_rate": 4.320935647112938e-05, + "loss": 0.0368, + "num_input_tokens_seen": 12132256, + "step": 57495 + }, + { + "epoch": 6.325632563256326, + "grad_norm": 1.8433997631072998, + "learning_rate": 4.320771191002674e-05, + "loss": 0.0572, + "num_input_tokens_seen": 12133344, + "step": 57500 + }, + { + "epoch": 6.326182618261826, + "grad_norm": 0.09436316788196564, + "learning_rate": 4.3206067181112364e-05, + "loss": 0.0102, + "num_input_tokens_seen": 12134432, + "step": 57505 + }, + { + "epoch": 6.326732673267327, + "grad_norm": 0.017709072679281235, + "learning_rate": 4.320442228440139e-05, + "loss": 0.0191, + "num_input_tokens_seen": 12135456, + "step": 57510 + }, + { + "epoch": 6.327282728272827, + "grad_norm": 0.9920023679733276, + "learning_rate": 4.320277721990901e-05, + "loss": 0.1633, + "num_input_tokens_seen": 12136608, + "step": 57515 + }, + { + "epoch": 6.327832783278327, + "grad_norm": 0.047947414219379425, + "learning_rate": 4.320113198765035e-05, + "loss": 0.0112, + "num_input_tokens_seen": 12137664, + "step": 57520 + }, + { + "epoch": 6.3283828382838285, + "grad_norm": 0.08031447231769562, + "learning_rate": 4.31994865876406e-05, + "loss": 0.0382, + "num_input_tokens_seen": 12138688, + "step": 57525 + }, + { + "epoch": 6.328932893289329, + "grad_norm": 0.39408162236213684, + "learning_rate": 4.3197841019894914e-05, + "loss": 0.0173, + "num_input_tokens_seen": 12139712, + "step": 57530 + }, + { + "epoch": 6.32948294829483, + "grad_norm": 0.33892932534217834, + "learning_rate": 4.319619528442845e-05, + "loss": 0.0631, + "num_input_tokens_seen": 12140800, + "step": 57535 + }, + { + "epoch": 6.33003300330033, + "grad_norm": 0.03317924961447716, + "learning_rate": 4.31945493812564e-05, + "loss": 0.048, + "num_input_tokens_seen": 12141856, + "step": 57540 + }, + { + "epoch": 6.33058305830583, + "grad_norm": 0.009877334348857403, + "learning_rate": 4.319290331039391e-05, + "loss": 0.0414, + "num_input_tokens_seen": 12142944, + "step": 57545 + }, + { + "epoch": 6.331133113311331, + "grad_norm": 1.2076706886291504, + "learning_rate": 4.319125707185616e-05, + "loss": 0.0951, + "num_input_tokens_seen": 12144032, + "step": 57550 + }, + { + "epoch": 6.3316831683168315, + "grad_norm": 0.02271861396729946, + "learning_rate": 4.318961066565832e-05, + "loss": 0.0793, + "num_input_tokens_seen": 12145088, + "step": 57555 + }, + { + "epoch": 6.332233223322333, + "grad_norm": 0.09986626356840134, + "learning_rate": 4.3187964091815575e-05, + "loss": 0.0052, + "num_input_tokens_seen": 12146112, + "step": 57560 + }, + { + "epoch": 6.332783278327833, + "grad_norm": 0.29791781306266785, + "learning_rate": 4.318631735034308e-05, + "loss": 0.0129, + "num_input_tokens_seen": 12147168, + "step": 57565 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.5768572092056274, + "learning_rate": 4.3184670441256045e-05, + "loss": 0.1708, + "num_input_tokens_seen": 12148160, + "step": 57570 + }, + { + "epoch": 6.333883388338834, + "grad_norm": 0.23768484592437744, + "learning_rate": 4.3183023364569614e-05, + "loss": 0.0205, + "num_input_tokens_seen": 12149184, + "step": 57575 + }, + { + "epoch": 6.334433443344334, + "grad_norm": 0.17012520134449005, + "learning_rate": 4.318137612029898e-05, + "loss": 0.0132, + "num_input_tokens_seen": 12150240, + "step": 57580 + }, + { + "epoch": 6.334983498349835, + "grad_norm": 0.09490171074867249, + "learning_rate": 4.317972870845933e-05, + "loss": 0.0071, + "num_input_tokens_seen": 12151328, + "step": 57585 + }, + { + "epoch": 6.335533553355336, + "grad_norm": 0.008717814460396767, + "learning_rate": 4.317808112906585e-05, + "loss": 0.141, + "num_input_tokens_seen": 12152416, + "step": 57590 + }, + { + "epoch": 6.336083608360836, + "grad_norm": 0.6430479288101196, + "learning_rate": 4.317643338213371e-05, + "loss": 0.0954, + "num_input_tokens_seen": 12153472, + "step": 57595 + }, + { + "epoch": 6.336633663366337, + "grad_norm": 0.01336853951215744, + "learning_rate": 4.317478546767811e-05, + "loss": 0.042, + "num_input_tokens_seen": 12154560, + "step": 57600 + }, + { + "epoch": 6.337183718371837, + "grad_norm": 0.00829743780195713, + "learning_rate": 4.3173137385714236e-05, + "loss": 0.0126, + "num_input_tokens_seen": 12155584, + "step": 57605 + }, + { + "epoch": 6.337733773377337, + "grad_norm": 0.08374034613370895, + "learning_rate": 4.317148913625727e-05, + "loss": 0.0102, + "num_input_tokens_seen": 12156672, + "step": 57610 + }, + { + "epoch": 6.338283828382838, + "grad_norm": 0.06812597811222076, + "learning_rate": 4.3169840719322406e-05, + "loss": 0.0158, + "num_input_tokens_seen": 12157728, + "step": 57615 + }, + { + "epoch": 6.338833883388339, + "grad_norm": 0.20990559458732605, + "learning_rate": 4.316819213492484e-05, + "loss": 0.0095, + "num_input_tokens_seen": 12158752, + "step": 57620 + }, + { + "epoch": 6.33938393839384, + "grad_norm": 0.04082503542304039, + "learning_rate": 4.316654338307977e-05, + "loss": 0.0037, + "num_input_tokens_seen": 12159808, + "step": 57625 + }, + { + "epoch": 6.33993399339934, + "grad_norm": 0.09270547330379486, + "learning_rate": 4.3164894463802385e-05, + "loss": 0.0253, + "num_input_tokens_seen": 12160864, + "step": 57630 + }, + { + "epoch": 6.34048404840484, + "grad_norm": 0.4970947206020355, + "learning_rate": 4.316324537710789e-05, + "loss": 0.0249, + "num_input_tokens_seen": 12161920, + "step": 57635 + }, + { + "epoch": 6.341034103410341, + "grad_norm": 0.14753875136375427, + "learning_rate": 4.3161596123011464e-05, + "loss": 0.0728, + "num_input_tokens_seen": 12162944, + "step": 57640 + }, + { + "epoch": 6.341584158415841, + "grad_norm": 0.48725852370262146, + "learning_rate": 4.315994670152833e-05, + "loss": 0.0483, + "num_input_tokens_seen": 12163968, + "step": 57645 + }, + { + "epoch": 6.3421342134213425, + "grad_norm": 1.212252140045166, + "learning_rate": 4.3158297112673686e-05, + "loss": 0.0732, + "num_input_tokens_seen": 12165024, + "step": 57650 + }, + { + "epoch": 6.342684268426843, + "grad_norm": 0.05800141394138336, + "learning_rate": 4.315664735646272e-05, + "loss": 0.0101, + "num_input_tokens_seen": 12166080, + "step": 57655 + }, + { + "epoch": 6.343234323432343, + "grad_norm": 0.1808735877275467, + "learning_rate": 4.3154997432910646e-05, + "loss": 0.0119, + "num_input_tokens_seen": 12167168, + "step": 57660 + }, + { + "epoch": 6.343784378437844, + "grad_norm": 0.05486420542001724, + "learning_rate": 4.3153347342032684e-05, + "loss": 0.1044, + "num_input_tokens_seen": 12168256, + "step": 57665 + }, + { + "epoch": 6.344334433443344, + "grad_norm": 0.017094159498810768, + "learning_rate": 4.3151697083844025e-05, + "loss": 0.0137, + "num_input_tokens_seen": 12169248, + "step": 57670 + }, + { + "epoch": 6.3448844884488445, + "grad_norm": 1.299919843673706, + "learning_rate": 4.315004665835988e-05, + "loss": 0.0885, + "num_input_tokens_seen": 12170272, + "step": 57675 + }, + { + "epoch": 6.3454345434543455, + "grad_norm": 0.008957037702202797, + "learning_rate": 4.3148396065595476e-05, + "loss": 0.0873, + "num_input_tokens_seen": 12171296, + "step": 57680 + }, + { + "epoch": 6.345984598459846, + "grad_norm": 0.06345295161008835, + "learning_rate": 4.3146745305566005e-05, + "loss": 0.0412, + "num_input_tokens_seen": 12172352, + "step": 57685 + }, + { + "epoch": 6.346534653465347, + "grad_norm": 0.42151108384132385, + "learning_rate": 4.314509437828669e-05, + "loss": 0.0039, + "num_input_tokens_seen": 12173376, + "step": 57690 + }, + { + "epoch": 6.347084708470847, + "grad_norm": 0.7533058524131775, + "learning_rate": 4.314344328377274e-05, + "loss": 0.0257, + "num_input_tokens_seen": 12174400, + "step": 57695 + }, + { + "epoch": 6.347634763476347, + "grad_norm": 0.30691638588905334, + "learning_rate": 4.3141792022039396e-05, + "loss": 0.018, + "num_input_tokens_seen": 12175584, + "step": 57700 + }, + { + "epoch": 6.348184818481848, + "grad_norm": 0.22553454339504242, + "learning_rate": 4.3140140593101855e-05, + "loss": 0.015, + "num_input_tokens_seen": 12176608, + "step": 57705 + }, + { + "epoch": 6.3487348734873486, + "grad_norm": 0.11607085913419724, + "learning_rate": 4.313848899697534e-05, + "loss": 0.0625, + "num_input_tokens_seen": 12177664, + "step": 57710 + }, + { + "epoch": 6.34928492849285, + "grad_norm": 0.08216807246208191, + "learning_rate": 4.313683723367507e-05, + "loss": 0.224, + "num_input_tokens_seen": 12178752, + "step": 57715 + }, + { + "epoch": 6.34983498349835, + "grad_norm": 1.8390716314315796, + "learning_rate": 4.313518530321628e-05, + "loss": 0.0175, + "num_input_tokens_seen": 12179776, + "step": 57720 + }, + { + "epoch": 6.35038503850385, + "grad_norm": 0.2544153928756714, + "learning_rate": 4.3133533205614185e-05, + "loss": 0.0285, + "num_input_tokens_seen": 12180800, + "step": 57725 + }, + { + "epoch": 6.350935093509351, + "grad_norm": 0.3869023025035858, + "learning_rate": 4.313188094088402e-05, + "loss": 0.0111, + "num_input_tokens_seen": 12181856, + "step": 57730 + }, + { + "epoch": 6.351485148514851, + "grad_norm": 0.8047307133674622, + "learning_rate": 4.313022850904101e-05, + "loss": 0.0649, + "num_input_tokens_seen": 12182944, + "step": 57735 + }, + { + "epoch": 6.3520352035203524, + "grad_norm": 0.006687366869300604, + "learning_rate": 4.3128575910100375e-05, + "loss": 0.0439, + "num_input_tokens_seen": 12184000, + "step": 57740 + }, + { + "epoch": 6.352585258525853, + "grad_norm": 0.035245802253484726, + "learning_rate": 4.312692314407737e-05, + "loss": 0.0505, + "num_input_tokens_seen": 12185088, + "step": 57745 + }, + { + "epoch": 6.353135313531353, + "grad_norm": 0.29163724184036255, + "learning_rate": 4.31252702109872e-05, + "loss": 0.0181, + "num_input_tokens_seen": 12186144, + "step": 57750 + }, + { + "epoch": 6.353685368536854, + "grad_norm": 0.009264126420021057, + "learning_rate": 4.312361711084512e-05, + "loss": 0.0378, + "num_input_tokens_seen": 12187264, + "step": 57755 + }, + { + "epoch": 6.354235423542354, + "grad_norm": 1.1034903526306152, + "learning_rate": 4.312196384366635e-05, + "loss": 0.0875, + "num_input_tokens_seen": 12188320, + "step": 57760 + }, + { + "epoch": 6.354785478547855, + "grad_norm": 0.11423476040363312, + "learning_rate": 4.312031040946614e-05, + "loss": 0.0038, + "num_input_tokens_seen": 12189344, + "step": 57765 + }, + { + "epoch": 6.3553355335533555, + "grad_norm": 0.9215590357780457, + "learning_rate": 4.311865680825971e-05, + "loss": 0.0373, + "num_input_tokens_seen": 12190400, + "step": 57770 + }, + { + "epoch": 6.355885588558856, + "grad_norm": 0.8690018057823181, + "learning_rate": 4.311700304006233e-05, + "loss": 0.1152, + "num_input_tokens_seen": 12191488, + "step": 57775 + }, + { + "epoch": 6.356435643564357, + "grad_norm": 0.19575078785419464, + "learning_rate": 4.311534910488923e-05, + "loss": 0.0054, + "num_input_tokens_seen": 12192544, + "step": 57780 + }, + { + "epoch": 6.356985698569857, + "grad_norm": 0.5270618200302124, + "learning_rate": 4.3113695002755635e-05, + "loss": 0.0162, + "num_input_tokens_seen": 12193568, + "step": 57785 + }, + { + "epoch": 6.357535753575357, + "grad_norm": 0.5166444182395935, + "learning_rate": 4.311204073367682e-05, + "loss": 0.0418, + "num_input_tokens_seen": 12194592, + "step": 57790 + }, + { + "epoch": 6.358085808580858, + "grad_norm": 0.04666069149971008, + "learning_rate": 4.311038629766801e-05, + "loss": 0.047, + "num_input_tokens_seen": 12195648, + "step": 57795 + }, + { + "epoch": 6.3586358635863585, + "grad_norm": 1.426906704902649, + "learning_rate": 4.310873169474446e-05, + "loss": 0.1455, + "num_input_tokens_seen": 12196704, + "step": 57800 + }, + { + "epoch": 6.3591859185918596, + "grad_norm": 1.5566433668136597, + "learning_rate": 4.310707692492142e-05, + "loss": 0.0999, + "num_input_tokens_seen": 12197856, + "step": 57805 + }, + { + "epoch": 6.35973597359736, + "grad_norm": 0.1803807020187378, + "learning_rate": 4.3105421988214135e-05, + "loss": 0.0415, + "num_input_tokens_seen": 12198912, + "step": 57810 + }, + { + "epoch": 6.36028602860286, + "grad_norm": 2.7179572582244873, + "learning_rate": 4.310376688463787e-05, + "loss": 0.0685, + "num_input_tokens_seen": 12199936, + "step": 57815 + }, + { + "epoch": 6.360836083608361, + "grad_norm": 0.004751579370349646, + "learning_rate": 4.310211161420787e-05, + "loss": 0.0174, + "num_input_tokens_seen": 12200960, + "step": 57820 + }, + { + "epoch": 6.361386138613861, + "grad_norm": 0.3770962059497833, + "learning_rate": 4.3100456176939394e-05, + "loss": 0.0412, + "num_input_tokens_seen": 12201984, + "step": 57825 + }, + { + "epoch": 6.361936193619362, + "grad_norm": 0.06300917267799377, + "learning_rate": 4.30988005728477e-05, + "loss": 0.0673, + "num_input_tokens_seen": 12203040, + "step": 57830 + }, + { + "epoch": 6.362486248624863, + "grad_norm": 0.0806502252817154, + "learning_rate": 4.3097144801948045e-05, + "loss": 0.0056, + "num_input_tokens_seen": 12204096, + "step": 57835 + }, + { + "epoch": 6.363036303630363, + "grad_norm": 0.016532529145479202, + "learning_rate": 4.309548886425568e-05, + "loss": 0.0882, + "num_input_tokens_seen": 12205216, + "step": 57840 + }, + { + "epoch": 6.363586358635864, + "grad_norm": 0.025428086519241333, + "learning_rate": 4.309383275978589e-05, + "loss": 0.1251, + "num_input_tokens_seen": 12206240, + "step": 57845 + }, + { + "epoch": 6.364136413641364, + "grad_norm": 0.267243355512619, + "learning_rate": 4.309217648855393e-05, + "loss": 0.0394, + "num_input_tokens_seen": 12207296, + "step": 57850 + }, + { + "epoch": 6.364686468646864, + "grad_norm": 0.02853766269981861, + "learning_rate": 4.309052005057505e-05, + "loss": 0.0692, + "num_input_tokens_seen": 12208352, + "step": 57855 + }, + { + "epoch": 6.365236523652365, + "grad_norm": 0.2191251814365387, + "learning_rate": 4.308886344586452e-05, + "loss": 0.04, + "num_input_tokens_seen": 12209504, + "step": 57860 + }, + { + "epoch": 6.365786578657866, + "grad_norm": 1.0135903358459473, + "learning_rate": 4.308720667443763e-05, + "loss": 0.1148, + "num_input_tokens_seen": 12210592, + "step": 57865 + }, + { + "epoch": 6.366336633663367, + "grad_norm": 1.2948027849197388, + "learning_rate": 4.3085549736309634e-05, + "loss": 0.0988, + "num_input_tokens_seen": 12211648, + "step": 57870 + }, + { + "epoch": 6.366886688668867, + "grad_norm": 0.828545093536377, + "learning_rate": 4.3083892631495806e-05, + "loss": 0.0207, + "num_input_tokens_seen": 12212736, + "step": 57875 + }, + { + "epoch": 6.367436743674367, + "grad_norm": 0.15122519433498383, + "learning_rate": 4.3082235360011406e-05, + "loss": 0.0253, + "num_input_tokens_seen": 12213792, + "step": 57880 + }, + { + "epoch": 6.367986798679868, + "grad_norm": 0.2414163202047348, + "learning_rate": 4.308057792187173e-05, + "loss": 0.0132, + "num_input_tokens_seen": 12214880, + "step": 57885 + }, + { + "epoch": 6.368536853685368, + "grad_norm": 1.8077516555786133, + "learning_rate": 4.307892031709204e-05, + "loss": 0.0784, + "num_input_tokens_seen": 12215872, + "step": 57890 + }, + { + "epoch": 6.3690869086908695, + "grad_norm": 0.04360939562320709, + "learning_rate": 4.307726254568761e-05, + "loss": 0.0272, + "num_input_tokens_seen": 12216896, + "step": 57895 + }, + { + "epoch": 6.36963696369637, + "grad_norm": 0.06442970037460327, + "learning_rate": 4.307560460767374e-05, + "loss": 0.0967, + "num_input_tokens_seen": 12217984, + "step": 57900 + }, + { + "epoch": 6.37018701870187, + "grad_norm": 0.7238142490386963, + "learning_rate": 4.307394650306568e-05, + "loss": 0.0217, + "num_input_tokens_seen": 12219040, + "step": 57905 + }, + { + "epoch": 6.370737073707371, + "grad_norm": 0.00897399801760912, + "learning_rate": 4.307228823187874e-05, + "loss": 0.0368, + "num_input_tokens_seen": 12220096, + "step": 57910 + }, + { + "epoch": 6.371287128712871, + "grad_norm": 0.7359485626220703, + "learning_rate": 4.3070629794128184e-05, + "loss": 0.0633, + "num_input_tokens_seen": 12221120, + "step": 57915 + }, + { + "epoch": 6.371837183718371, + "grad_norm": 0.013481556437909603, + "learning_rate": 4.3068971189829304e-05, + "loss": 0.0152, + "num_input_tokens_seen": 12222176, + "step": 57920 + }, + { + "epoch": 6.3723872387238725, + "grad_norm": 0.19293488562107086, + "learning_rate": 4.306731241899739e-05, + "loss": 0.0143, + "num_input_tokens_seen": 12223200, + "step": 57925 + }, + { + "epoch": 6.372937293729373, + "grad_norm": 0.10096937417984009, + "learning_rate": 4.306565348164773e-05, + "loss": 0.0313, + "num_input_tokens_seen": 12224224, + "step": 57930 + }, + { + "epoch": 6.373487348734874, + "grad_norm": 0.06566155701875687, + "learning_rate": 4.3063994377795604e-05, + "loss": 0.1084, + "num_input_tokens_seen": 12225312, + "step": 57935 + }, + { + "epoch": 6.374037403740374, + "grad_norm": 0.19367191195487976, + "learning_rate": 4.3062335107456306e-05, + "loss": 0.0178, + "num_input_tokens_seen": 12226336, + "step": 57940 + }, + { + "epoch": 6.374587458745874, + "grad_norm": 0.9284881353378296, + "learning_rate": 4.3060675670645144e-05, + "loss": 0.2292, + "num_input_tokens_seen": 12227424, + "step": 57945 + }, + { + "epoch": 6.375137513751375, + "grad_norm": 0.23523777723312378, + "learning_rate": 4.30590160673774e-05, + "loss": 0.0157, + "num_input_tokens_seen": 12228448, + "step": 57950 + }, + { + "epoch": 6.3756875687568755, + "grad_norm": 0.011730523779988289, + "learning_rate": 4.305735629766836e-05, + "loss": 0.0429, + "num_input_tokens_seen": 12229472, + "step": 57955 + }, + { + "epoch": 6.376237623762377, + "grad_norm": 2.191423177719116, + "learning_rate": 4.305569636153334e-05, + "loss": 0.0418, + "num_input_tokens_seen": 12230528, + "step": 57960 + }, + { + "epoch": 6.376787678767877, + "grad_norm": 0.11352184414863586, + "learning_rate": 4.305403625898763e-05, + "loss": 0.0471, + "num_input_tokens_seen": 12231520, + "step": 57965 + }, + { + "epoch": 6.377337733773377, + "grad_norm": 0.09468472748994827, + "learning_rate": 4.305237599004653e-05, + "loss": 0.0073, + "num_input_tokens_seen": 12232576, + "step": 57970 + }, + { + "epoch": 6.377887788778878, + "grad_norm": 0.18086515367031097, + "learning_rate": 4.305071555472534e-05, + "loss": 0.0089, + "num_input_tokens_seen": 12233568, + "step": 57975 + }, + { + "epoch": 6.378437843784378, + "grad_norm": 0.1439117044210434, + "learning_rate": 4.3049054953039366e-05, + "loss": 0.076, + "num_input_tokens_seen": 12234656, + "step": 57980 + }, + { + "epoch": 6.378987898789879, + "grad_norm": 0.7598080039024353, + "learning_rate": 4.304739418500392e-05, + "loss": 0.0703, + "num_input_tokens_seen": 12235712, + "step": 57985 + }, + { + "epoch": 6.37953795379538, + "grad_norm": 0.10011158138513565, + "learning_rate": 4.30457332506343e-05, + "loss": 0.013, + "num_input_tokens_seen": 12236800, + "step": 57990 + }, + { + "epoch": 6.38008800880088, + "grad_norm": 1.2601244449615479, + "learning_rate": 4.304407214994581e-05, + "loss": 0.1148, + "num_input_tokens_seen": 12237920, + "step": 57995 + }, + { + "epoch": 6.380638063806381, + "grad_norm": 0.24917279183864594, + "learning_rate": 4.304241088295377e-05, + "loss": 0.0184, + "num_input_tokens_seen": 12239008, + "step": 58000 + }, + { + "epoch": 6.381188118811881, + "grad_norm": 0.10321220755577087, + "learning_rate": 4.304074944967349e-05, + "loss": 0.0783, + "num_input_tokens_seen": 12240064, + "step": 58005 + }, + { + "epoch": 6.381738173817382, + "grad_norm": 1.1659866571426392, + "learning_rate": 4.303908785012027e-05, + "loss": 0.084, + "num_input_tokens_seen": 12241056, + "step": 58010 + }, + { + "epoch": 6.382288228822882, + "grad_norm": 0.025416620075702667, + "learning_rate": 4.303742608430944e-05, + "loss": 0.0182, + "num_input_tokens_seen": 12242112, + "step": 58015 + }, + { + "epoch": 6.382838283828383, + "grad_norm": 0.41637811064720154, + "learning_rate": 4.303576415225631e-05, + "loss": 0.0473, + "num_input_tokens_seen": 12243168, + "step": 58020 + }, + { + "epoch": 6.383388338833884, + "grad_norm": 7.611878395080566, + "learning_rate": 4.3034102053976186e-05, + "loss": 0.0595, + "num_input_tokens_seen": 12244320, + "step": 58025 + }, + { + "epoch": 6.383938393839384, + "grad_norm": 0.3754522204399109, + "learning_rate": 4.303243978948441e-05, + "loss": 0.0416, + "num_input_tokens_seen": 12245344, + "step": 58030 + }, + { + "epoch": 6.384488448844884, + "grad_norm": 0.7791700959205627, + "learning_rate": 4.3030777358796284e-05, + "loss": 0.0434, + "num_input_tokens_seen": 12246432, + "step": 58035 + }, + { + "epoch": 6.385038503850385, + "grad_norm": 0.6009176969528198, + "learning_rate": 4.302911476192714e-05, + "loss": 0.0462, + "num_input_tokens_seen": 12247456, + "step": 58040 + }, + { + "epoch": 6.385588558855885, + "grad_norm": 0.3241705000400543, + "learning_rate": 4.302745199889228e-05, + "loss": 0.0207, + "num_input_tokens_seen": 12248512, + "step": 58045 + }, + { + "epoch": 6.3861386138613865, + "grad_norm": 0.056883905082941055, + "learning_rate": 4.302578906970706e-05, + "loss": 0.105, + "num_input_tokens_seen": 12249536, + "step": 58050 + }, + { + "epoch": 6.386688668866887, + "grad_norm": 0.25531694293022156, + "learning_rate": 4.3024125974386786e-05, + "loss": 0.0206, + "num_input_tokens_seen": 12250560, + "step": 58055 + }, + { + "epoch": 6.387238723872387, + "grad_norm": 0.044007737189531326, + "learning_rate": 4.302246271294679e-05, + "loss": 0.0207, + "num_input_tokens_seen": 12251616, + "step": 58060 + }, + { + "epoch": 6.387788778877888, + "grad_norm": 0.04475419968366623, + "learning_rate": 4.3020799285402416e-05, + "loss": 0.0308, + "num_input_tokens_seen": 12252640, + "step": 58065 + }, + { + "epoch": 6.388338833883388, + "grad_norm": 0.12146829813718796, + "learning_rate": 4.301913569176897e-05, + "loss": 0.1072, + "num_input_tokens_seen": 12253696, + "step": 58070 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 0.2058473527431488, + "learning_rate": 4.30174719320618e-05, + "loss": 0.0359, + "num_input_tokens_seen": 12254784, + "step": 58075 + }, + { + "epoch": 6.3894389438943895, + "grad_norm": 0.3012075126171112, + "learning_rate": 4.301580800629623e-05, + "loss": 0.2017, + "num_input_tokens_seen": 12255776, + "step": 58080 + }, + { + "epoch": 6.38998899889989, + "grad_norm": 0.20213527977466583, + "learning_rate": 4.301414391448761e-05, + "loss": 0.0307, + "num_input_tokens_seen": 12256832, + "step": 58085 + }, + { + "epoch": 6.390539053905391, + "grad_norm": 0.5501635670661926, + "learning_rate": 4.301247965665127e-05, + "loss": 0.045, + "num_input_tokens_seen": 12257920, + "step": 58090 + }, + { + "epoch": 6.391089108910891, + "grad_norm": 0.14856016635894775, + "learning_rate": 4.3010815232802546e-05, + "loss": 0.0085, + "num_input_tokens_seen": 12258944, + "step": 58095 + }, + { + "epoch": 6.391639163916391, + "grad_norm": 0.10016889125108719, + "learning_rate": 4.300915064295679e-05, + "loss": 0.0119, + "num_input_tokens_seen": 12260000, + "step": 58100 + }, + { + "epoch": 6.392189218921892, + "grad_norm": 0.10674943029880524, + "learning_rate": 4.3007485887129315e-05, + "loss": 0.0143, + "num_input_tokens_seen": 12261088, + "step": 58105 + }, + { + "epoch": 6.3927392739273925, + "grad_norm": 0.6977173089981079, + "learning_rate": 4.30058209653355e-05, + "loss": 0.0805, + "num_input_tokens_seen": 12262080, + "step": 58110 + }, + { + "epoch": 6.393289328932894, + "grad_norm": 0.09150246530771255, + "learning_rate": 4.300415587759067e-05, + "loss": 0.0393, + "num_input_tokens_seen": 12263168, + "step": 58115 + }, + { + "epoch": 6.393839383938394, + "grad_norm": 0.29658758640289307, + "learning_rate": 4.300249062391017e-05, + "loss": 0.1168, + "num_input_tokens_seen": 12264192, + "step": 58120 + }, + { + "epoch": 6.394389438943894, + "grad_norm": 1.4005666971206665, + "learning_rate": 4.300082520430935e-05, + "loss": 0.0375, + "num_input_tokens_seen": 12265280, + "step": 58125 + }, + { + "epoch": 6.394939493949395, + "grad_norm": 0.11824743449687958, + "learning_rate": 4.2999159618803566e-05, + "loss": 0.0609, + "num_input_tokens_seen": 12266368, + "step": 58130 + }, + { + "epoch": 6.395489548954895, + "grad_norm": 0.013681003823876381, + "learning_rate": 4.2997493867408166e-05, + "loss": 0.1104, + "num_input_tokens_seen": 12267424, + "step": 58135 + }, + { + "epoch": 6.396039603960396, + "grad_norm": 0.5093629956245422, + "learning_rate": 4.29958279501385e-05, + "loss": 0.0341, + "num_input_tokens_seen": 12268416, + "step": 58140 + }, + { + "epoch": 6.396589658965897, + "grad_norm": 0.24653756618499756, + "learning_rate": 4.299416186700992e-05, + "loss": 0.0278, + "num_input_tokens_seen": 12269472, + "step": 58145 + }, + { + "epoch": 6.397139713971397, + "grad_norm": 0.04953561723232269, + "learning_rate": 4.2992495618037785e-05, + "loss": 0.0218, + "num_input_tokens_seen": 12270528, + "step": 58150 + }, + { + "epoch": 6.397689768976898, + "grad_norm": 0.11416713148355484, + "learning_rate": 4.299082920323746e-05, + "loss": 0.0093, + "num_input_tokens_seen": 12271488, + "step": 58155 + }, + { + "epoch": 6.398239823982398, + "grad_norm": 0.03981863707304001, + "learning_rate": 4.298916262262429e-05, + "loss": 0.086, + "num_input_tokens_seen": 12272608, + "step": 58160 + }, + { + "epoch": 6.398789878987899, + "grad_norm": 0.011713769286870956, + "learning_rate": 4.298749587621363e-05, + "loss": 0.0205, + "num_input_tokens_seen": 12273664, + "step": 58165 + }, + { + "epoch": 6.399339933993399, + "grad_norm": 0.034608371555805206, + "learning_rate": 4.298582896402087e-05, + "loss": 0.0591, + "num_input_tokens_seen": 12274720, + "step": 58170 + }, + { + "epoch": 6.3998899889989, + "grad_norm": 0.5224702954292297, + "learning_rate": 4.298416188606133e-05, + "loss": 0.1045, + "num_input_tokens_seen": 12275776, + "step": 58175 + }, + { + "epoch": 6.400440044004401, + "grad_norm": 0.8468660712242126, + "learning_rate": 4.2982494642350425e-05, + "loss": 0.0781, + "num_input_tokens_seen": 12276800, + "step": 58180 + }, + { + "epoch": 6.400990099009901, + "grad_norm": 0.42920124530792236, + "learning_rate": 4.298082723290349e-05, + "loss": 0.0265, + "num_input_tokens_seen": 12277856, + "step": 58185 + }, + { + "epoch": 6.401540154015402, + "grad_norm": 0.8643582463264465, + "learning_rate": 4.297915965773589e-05, + "loss": 0.0472, + "num_input_tokens_seen": 12278880, + "step": 58190 + }, + { + "epoch": 6.402090209020902, + "grad_norm": 0.3300144076347351, + "learning_rate": 4.297749191686301e-05, + "loss": 0.0254, + "num_input_tokens_seen": 12279936, + "step": 58195 + }, + { + "epoch": 6.402640264026402, + "grad_norm": 0.019975321367383003, + "learning_rate": 4.29758240103002e-05, + "loss": 0.0603, + "num_input_tokens_seen": 12280960, + "step": 58200 + }, + { + "epoch": 6.4031903190319035, + "grad_norm": 0.014072495512664318, + "learning_rate": 4.297415593806286e-05, + "loss": 0.0047, + "num_input_tokens_seen": 12282016, + "step": 58205 + }, + { + "epoch": 6.403740374037404, + "grad_norm": 0.00943674985319376, + "learning_rate": 4.297248770016635e-05, + "loss": 0.0132, + "num_input_tokens_seen": 12283104, + "step": 58210 + }, + { + "epoch": 6.404290429042904, + "grad_norm": 0.08293560147285461, + "learning_rate": 4.297081929662604e-05, + "loss": 0.1623, + "num_input_tokens_seen": 12284160, + "step": 58215 + }, + { + "epoch": 6.404840484048405, + "grad_norm": 0.1945735365152359, + "learning_rate": 4.296915072745731e-05, + "loss": 0.0201, + "num_input_tokens_seen": 12285312, + "step": 58220 + }, + { + "epoch": 6.405390539053905, + "grad_norm": 0.032744456082582474, + "learning_rate": 4.296748199267554e-05, + "loss": 0.0409, + "num_input_tokens_seen": 12286336, + "step": 58225 + }, + { + "epoch": 6.405940594059406, + "grad_norm": 0.2200605422258377, + "learning_rate": 4.2965813092296115e-05, + "loss": 0.0137, + "num_input_tokens_seen": 12287328, + "step": 58230 + }, + { + "epoch": 6.4064906490649065, + "grad_norm": 0.501491904258728, + "learning_rate": 4.296414402633441e-05, + "loss": 0.0305, + "num_input_tokens_seen": 12288352, + "step": 58235 + }, + { + "epoch": 6.407040704070407, + "grad_norm": 0.04992080479860306, + "learning_rate": 4.296247479480581e-05, + "loss": 0.0169, + "num_input_tokens_seen": 12289408, + "step": 58240 + }, + { + "epoch": 6.407590759075908, + "grad_norm": 0.10398934781551361, + "learning_rate": 4.296080539772569e-05, + "loss": 0.0254, + "num_input_tokens_seen": 12290464, + "step": 58245 + }, + { + "epoch": 6.408140814081408, + "grad_norm": 0.035629406571388245, + "learning_rate": 4.295913583510946e-05, + "loss": 0.0157, + "num_input_tokens_seen": 12291520, + "step": 58250 + }, + { + "epoch": 6.408690869086909, + "grad_norm": 1.0743327140808105, + "learning_rate": 4.2957466106972486e-05, + "loss": 0.0711, + "num_input_tokens_seen": 12292544, + "step": 58255 + }, + { + "epoch": 6.409240924092409, + "grad_norm": 0.014160869643092155, + "learning_rate": 4.2955796213330156e-05, + "loss": 0.0829, + "num_input_tokens_seen": 12293600, + "step": 58260 + }, + { + "epoch": 6.4097909790979095, + "grad_norm": 0.2651965916156769, + "learning_rate": 4.295412615419788e-05, + "loss": 0.097, + "num_input_tokens_seen": 12294656, + "step": 58265 + }, + { + "epoch": 6.410341034103411, + "grad_norm": 0.03937406837940216, + "learning_rate": 4.295245592959103e-05, + "loss": 0.0945, + "num_input_tokens_seen": 12295680, + "step": 58270 + }, + { + "epoch": 6.410891089108911, + "grad_norm": 1.3286007642745972, + "learning_rate": 4.2950785539525016e-05, + "loss": 0.1495, + "num_input_tokens_seen": 12296704, + "step": 58275 + }, + { + "epoch": 6.411441144114411, + "grad_norm": 0.9617658257484436, + "learning_rate": 4.294911498401522e-05, + "loss": 0.1735, + "num_input_tokens_seen": 12297792, + "step": 58280 + }, + { + "epoch": 6.411991199119912, + "grad_norm": 0.4730682969093323, + "learning_rate": 4.2947444263077044e-05, + "loss": 0.0221, + "num_input_tokens_seen": 12298848, + "step": 58285 + }, + { + "epoch": 6.412541254125412, + "grad_norm": 0.3834678828716278, + "learning_rate": 4.294577337672589e-05, + "loss": 0.0278, + "num_input_tokens_seen": 12299968, + "step": 58290 + }, + { + "epoch": 6.413091309130913, + "grad_norm": 0.059289172291755676, + "learning_rate": 4.294410232497715e-05, + "loss": 0.0149, + "num_input_tokens_seen": 12300992, + "step": 58295 + }, + { + "epoch": 6.413641364136414, + "grad_norm": 0.05644155293703079, + "learning_rate": 4.294243110784623e-05, + "loss": 0.0699, + "num_input_tokens_seen": 12302016, + "step": 58300 + }, + { + "epoch": 6.414191419141914, + "grad_norm": 0.023502318188548088, + "learning_rate": 4.2940759725348535e-05, + "loss": 0.0093, + "num_input_tokens_seen": 12303104, + "step": 58305 + }, + { + "epoch": 6.414741474147415, + "grad_norm": 0.9889509081840515, + "learning_rate": 4.2939088177499465e-05, + "loss": 0.0654, + "num_input_tokens_seen": 12304224, + "step": 58310 + }, + { + "epoch": 6.415291529152915, + "grad_norm": 0.38767385482788086, + "learning_rate": 4.293741646431443e-05, + "loss": 0.027, + "num_input_tokens_seen": 12305312, + "step": 58315 + }, + { + "epoch": 6.415841584158416, + "grad_norm": 0.013073842041194439, + "learning_rate": 4.293574458580882e-05, + "loss": 0.0595, + "num_input_tokens_seen": 12306368, + "step": 58320 + }, + { + "epoch": 6.416391639163916, + "grad_norm": 0.018027562648057938, + "learning_rate": 4.2934072541998076e-05, + "loss": 0.0067, + "num_input_tokens_seen": 12307456, + "step": 58325 + }, + { + "epoch": 6.416941694169417, + "grad_norm": 0.020539263263344765, + "learning_rate": 4.2932400332897585e-05, + "loss": 0.0061, + "num_input_tokens_seen": 12308448, + "step": 58330 + }, + { + "epoch": 6.417491749174918, + "grad_norm": 0.3822300434112549, + "learning_rate": 4.2930727958522764e-05, + "loss": 0.0155, + "num_input_tokens_seen": 12309504, + "step": 58335 + }, + { + "epoch": 6.418041804180418, + "grad_norm": 0.7718155384063721, + "learning_rate": 4.292905541888903e-05, + "loss": 0.0391, + "num_input_tokens_seen": 12310560, + "step": 58340 + }, + { + "epoch": 6.418591859185918, + "grad_norm": 1.9174277782440186, + "learning_rate": 4.2927382714011785e-05, + "loss": 0.0419, + "num_input_tokens_seen": 12311616, + "step": 58345 + }, + { + "epoch": 6.419141914191419, + "grad_norm": 0.1179005578160286, + "learning_rate": 4.292570984390647e-05, + "loss": 0.1164, + "num_input_tokens_seen": 12312704, + "step": 58350 + }, + { + "epoch": 6.419691969196919, + "grad_norm": 0.19890207052230835, + "learning_rate": 4.292403680858848e-05, + "loss": 0.0179, + "num_input_tokens_seen": 12313760, + "step": 58355 + }, + { + "epoch": 6.4202420242024205, + "grad_norm": 0.6229956150054932, + "learning_rate": 4.292236360807325e-05, + "loss": 0.0666, + "num_input_tokens_seen": 12314848, + "step": 58360 + }, + { + "epoch": 6.420792079207921, + "grad_norm": 0.34501662850379944, + "learning_rate": 4.292069024237619e-05, + "loss": 0.0536, + "num_input_tokens_seen": 12315968, + "step": 58365 + }, + { + "epoch": 6.421342134213421, + "grad_norm": 0.03761935979127884, + "learning_rate": 4.2919016711512735e-05, + "loss": 0.0141, + "num_input_tokens_seen": 12317024, + "step": 58370 + }, + { + "epoch": 6.421892189218922, + "grad_norm": 0.14073066413402557, + "learning_rate": 4.291734301549829e-05, + "loss": 0.02, + "num_input_tokens_seen": 12318080, + "step": 58375 + }, + { + "epoch": 6.422442244224422, + "grad_norm": 1.357087254524231, + "learning_rate": 4.29156691543483e-05, + "loss": 0.1381, + "num_input_tokens_seen": 12319072, + "step": 58380 + }, + { + "epoch": 6.422992299229923, + "grad_norm": 0.013853151351213455, + "learning_rate": 4.2913995128078174e-05, + "loss": 0.0648, + "num_input_tokens_seen": 12320192, + "step": 58385 + }, + { + "epoch": 6.4235423542354235, + "grad_norm": 0.683604896068573, + "learning_rate": 4.2912320936703366e-05, + "loss": 0.044, + "num_input_tokens_seen": 12321248, + "step": 58390 + }, + { + "epoch": 6.424092409240924, + "grad_norm": 0.02020045556128025, + "learning_rate": 4.291064658023928e-05, + "loss": 0.0507, + "num_input_tokens_seen": 12322336, + "step": 58395 + }, + { + "epoch": 6.424642464246425, + "grad_norm": 0.3772224187850952, + "learning_rate": 4.2908972058701364e-05, + "loss": 0.1092, + "num_input_tokens_seen": 12323392, + "step": 58400 + }, + { + "epoch": 6.425192519251925, + "grad_norm": 0.09353187680244446, + "learning_rate": 4.290729737210504e-05, + "loss": 0.0756, + "num_input_tokens_seen": 12324512, + "step": 58405 + }, + { + "epoch": 6.425742574257426, + "grad_norm": 0.031365521252155304, + "learning_rate": 4.290562252046576e-05, + "loss": 0.0035, + "num_input_tokens_seen": 12325536, + "step": 58410 + }, + { + "epoch": 6.426292629262926, + "grad_norm": 0.0488060861825943, + "learning_rate": 4.290394750379894e-05, + "loss": 0.0052, + "num_input_tokens_seen": 12326592, + "step": 58415 + }, + { + "epoch": 6.4268426842684265, + "grad_norm": 0.003592360531911254, + "learning_rate": 4.290227232212003e-05, + "loss": 0.0331, + "num_input_tokens_seen": 12327648, + "step": 58420 + }, + { + "epoch": 6.427392739273928, + "grad_norm": 0.3487224280834198, + "learning_rate": 4.290059697544446e-05, + "loss": 0.0445, + "num_input_tokens_seen": 12328704, + "step": 58425 + }, + { + "epoch": 6.427942794279428, + "grad_norm": 0.20270198583602905, + "learning_rate": 4.289892146378769e-05, + "loss": 0.0092, + "num_input_tokens_seen": 12329728, + "step": 58430 + }, + { + "epoch": 6.428492849284929, + "grad_norm": 0.653113842010498, + "learning_rate": 4.289724578716514e-05, + "loss": 0.0763, + "num_input_tokens_seen": 12330784, + "step": 58435 + }, + { + "epoch": 6.429042904290429, + "grad_norm": 0.089716337621212, + "learning_rate": 4.289556994559226e-05, + "loss": 0.006, + "num_input_tokens_seen": 12331808, + "step": 58440 + }, + { + "epoch": 6.429592959295929, + "grad_norm": 0.10657911002635956, + "learning_rate": 4.289389393908451e-05, + "loss": 0.0202, + "num_input_tokens_seen": 12332832, + "step": 58445 + }, + { + "epoch": 6.43014301430143, + "grad_norm": 0.12109088152647018, + "learning_rate": 4.289221776765732e-05, + "loss": 0.0286, + "num_input_tokens_seen": 12333856, + "step": 58450 + }, + { + "epoch": 6.430693069306931, + "grad_norm": 0.0751965194940567, + "learning_rate": 4.289054143132615e-05, + "loss": 0.0175, + "num_input_tokens_seen": 12334944, + "step": 58455 + }, + { + "epoch": 6.431243124312431, + "grad_norm": 0.3897447884082794, + "learning_rate": 4.288886493010643e-05, + "loss": 0.1417, + "num_input_tokens_seen": 12335936, + "step": 58460 + }, + { + "epoch": 6.431793179317932, + "grad_norm": 0.04571808874607086, + "learning_rate": 4.288718826401364e-05, + "loss": 0.0083, + "num_input_tokens_seen": 12337056, + "step": 58465 + }, + { + "epoch": 6.432343234323432, + "grad_norm": 0.00546839227899909, + "learning_rate": 4.288551143306322e-05, + "loss": 0.051, + "num_input_tokens_seen": 12338112, + "step": 58470 + }, + { + "epoch": 6.432893289328933, + "grad_norm": 0.01572548970580101, + "learning_rate": 4.288383443727061e-05, + "loss": 0.026, + "num_input_tokens_seen": 12339168, + "step": 58475 + }, + { + "epoch": 6.433443344334433, + "grad_norm": 0.013996182940900326, + "learning_rate": 4.288215727665129e-05, + "loss": 0.0539, + "num_input_tokens_seen": 12340224, + "step": 58480 + }, + { + "epoch": 6.433993399339934, + "grad_norm": 0.024316664785146713, + "learning_rate": 4.28804799512207e-05, + "loss": 0.0274, + "num_input_tokens_seen": 12341216, + "step": 58485 + }, + { + "epoch": 6.434543454345435, + "grad_norm": 0.07878698408603668, + "learning_rate": 4.287880246099432e-05, + "loss": 0.0169, + "num_input_tokens_seen": 12342272, + "step": 58490 + }, + { + "epoch": 6.435093509350935, + "grad_norm": 0.020708877593278885, + "learning_rate": 4.2877124805987576e-05, + "loss": 0.0441, + "num_input_tokens_seen": 12343296, + "step": 58495 + }, + { + "epoch": 6.435643564356436, + "grad_norm": 0.022493110969662666, + "learning_rate": 4.287544698621597e-05, + "loss": 0.0028, + "num_input_tokens_seen": 12344352, + "step": 58500 + }, + { + "epoch": 6.436193619361936, + "grad_norm": 0.9902406930923462, + "learning_rate": 4.287376900169494e-05, + "loss": 0.044, + "num_input_tokens_seen": 12345344, + "step": 58505 + }, + { + "epoch": 6.436743674367436, + "grad_norm": 0.4938605725765228, + "learning_rate": 4.287209085243995e-05, + "loss": 0.0223, + "num_input_tokens_seen": 12346432, + "step": 58510 + }, + { + "epoch": 6.4372937293729375, + "grad_norm": 0.15877176821231842, + "learning_rate": 4.287041253846649e-05, + "loss": 0.0128, + "num_input_tokens_seen": 12347520, + "step": 58515 + }, + { + "epoch": 6.437843784378438, + "grad_norm": 0.1696741133928299, + "learning_rate": 4.2868734059789994e-05, + "loss": 0.0169, + "num_input_tokens_seen": 12348544, + "step": 58520 + }, + { + "epoch": 6.438393839383938, + "grad_norm": 0.021042076870799065, + "learning_rate": 4.286705541642596e-05, + "loss": 0.0791, + "num_input_tokens_seen": 12349632, + "step": 58525 + }, + { + "epoch": 6.438943894389439, + "grad_norm": 0.19213984906673431, + "learning_rate": 4.286537660838985e-05, + "loss": 0.009, + "num_input_tokens_seen": 12350720, + "step": 58530 + }, + { + "epoch": 6.439493949394939, + "grad_norm": 0.0968167632818222, + "learning_rate": 4.286369763569714e-05, + "loss": 0.0063, + "num_input_tokens_seen": 12351808, + "step": 58535 + }, + { + "epoch": 6.44004400440044, + "grad_norm": 0.08240081369876862, + "learning_rate": 4.28620184983633e-05, + "loss": 0.0156, + "num_input_tokens_seen": 12352832, + "step": 58540 + }, + { + "epoch": 6.4405940594059405, + "grad_norm": 1.0035780668258667, + "learning_rate": 4.28603391964038e-05, + "loss": 0.0404, + "num_input_tokens_seen": 12353856, + "step": 58545 + }, + { + "epoch": 6.441144114411441, + "grad_norm": 0.13317357003688812, + "learning_rate": 4.2858659729834126e-05, + "loss": 0.036, + "num_input_tokens_seen": 12354976, + "step": 58550 + }, + { + "epoch": 6.441694169416942, + "grad_norm": 1.0689157247543335, + "learning_rate": 4.2856980098669755e-05, + "loss": 0.1489, + "num_input_tokens_seen": 12356000, + "step": 58555 + }, + { + "epoch": 6.442244224422442, + "grad_norm": 0.23884902894496918, + "learning_rate": 4.285530030292617e-05, + "loss": 0.0175, + "num_input_tokens_seen": 12357088, + "step": 58560 + }, + { + "epoch": 6.442794279427943, + "grad_norm": 0.030933789908885956, + "learning_rate": 4.285362034261885e-05, + "loss": 0.0457, + "num_input_tokens_seen": 12358144, + "step": 58565 + }, + { + "epoch": 6.443344334433443, + "grad_norm": 0.005518012680113316, + "learning_rate": 4.285194021776328e-05, + "loss": 0.0114, + "num_input_tokens_seen": 12359264, + "step": 58570 + }, + { + "epoch": 6.4438943894389435, + "grad_norm": 0.31640246510505676, + "learning_rate": 4.285025992837494e-05, + "loss": 0.0859, + "num_input_tokens_seen": 12360352, + "step": 58575 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.0859316885471344, + "learning_rate": 4.284857947446931e-05, + "loss": 0.0205, + "num_input_tokens_seen": 12361376, + "step": 58580 + }, + { + "epoch": 6.444994499449945, + "grad_norm": 0.01905982568860054, + "learning_rate": 4.2846898856061904e-05, + "loss": 0.0192, + "num_input_tokens_seen": 12362400, + "step": 58585 + }, + { + "epoch": 6.445544554455446, + "grad_norm": 0.03654435649514198, + "learning_rate": 4.284521807316818e-05, + "loss": 0.0065, + "num_input_tokens_seen": 12363488, + "step": 58590 + }, + { + "epoch": 6.446094609460946, + "grad_norm": 0.1547323316335678, + "learning_rate": 4.2843537125803656e-05, + "loss": 0.0271, + "num_input_tokens_seen": 12364608, + "step": 58595 + }, + { + "epoch": 6.446644664466446, + "grad_norm": 1.3566402196884155, + "learning_rate": 4.28418560139838e-05, + "loss": 0.0551, + "num_input_tokens_seen": 12365696, + "step": 58600 + }, + { + "epoch": 6.447194719471947, + "grad_norm": 0.7319304347038269, + "learning_rate": 4.284017473772414e-05, + "loss": 0.0887, + "num_input_tokens_seen": 12366720, + "step": 58605 + }, + { + "epoch": 6.447744774477448, + "grad_norm": 0.06815357506275177, + "learning_rate": 4.283849329704013e-05, + "loss": 0.0437, + "num_input_tokens_seen": 12367808, + "step": 58610 + }, + { + "epoch": 6.448294829482949, + "grad_norm": 0.07237856090068817, + "learning_rate": 4.28368116919473e-05, + "loss": 0.0564, + "num_input_tokens_seen": 12368800, + "step": 58615 + }, + { + "epoch": 6.448844884488449, + "grad_norm": 0.18224328756332397, + "learning_rate": 4.283512992246113e-05, + "loss": 0.1306, + "num_input_tokens_seen": 12369792, + "step": 58620 + }, + { + "epoch": 6.449394939493949, + "grad_norm": 0.2481931746006012, + "learning_rate": 4.283344798859712e-05, + "loss": 0.0085, + "num_input_tokens_seen": 12370880, + "step": 58625 + }, + { + "epoch": 6.44994499449945, + "grad_norm": 0.3276483416557312, + "learning_rate": 4.283176589037078e-05, + "loss": 0.0237, + "num_input_tokens_seen": 12371936, + "step": 58630 + }, + { + "epoch": 6.4504950495049505, + "grad_norm": 0.5926854014396667, + "learning_rate": 4.2830083627797614e-05, + "loss": 0.0228, + "num_input_tokens_seen": 12372960, + "step": 58635 + }, + { + "epoch": 6.451045104510451, + "grad_norm": 0.03358454257249832, + "learning_rate": 4.282840120089312e-05, + "loss": 0.0473, + "num_input_tokens_seen": 12373984, + "step": 58640 + }, + { + "epoch": 6.451595159515952, + "grad_norm": 0.7227364182472229, + "learning_rate": 4.282671860967281e-05, + "loss": 0.016, + "num_input_tokens_seen": 12375040, + "step": 58645 + }, + { + "epoch": 6.452145214521452, + "grad_norm": 0.013706959784030914, + "learning_rate": 4.2825035854152184e-05, + "loss": 0.1076, + "num_input_tokens_seen": 12376128, + "step": 58650 + }, + { + "epoch": 6.452695269526953, + "grad_norm": 1.9233628511428833, + "learning_rate": 4.282335293434676e-05, + "loss": 0.0563, + "num_input_tokens_seen": 12377216, + "step": 58655 + }, + { + "epoch": 6.453245324532453, + "grad_norm": 0.052634939551353455, + "learning_rate": 4.282166985027205e-05, + "loss": 0.0227, + "num_input_tokens_seen": 12378336, + "step": 58660 + }, + { + "epoch": 6.4537953795379535, + "grad_norm": 0.05550381913781166, + "learning_rate": 4.2819986601943546e-05, + "loss": 0.0047, + "num_input_tokens_seen": 12379424, + "step": 58665 + }, + { + "epoch": 6.4543454345434546, + "grad_norm": 0.21779093146324158, + "learning_rate": 4.2818303189376785e-05, + "loss": 0.0221, + "num_input_tokens_seen": 12380512, + "step": 58670 + }, + { + "epoch": 6.454895489548955, + "grad_norm": 0.08883407711982727, + "learning_rate": 4.281661961258727e-05, + "loss": 0.0363, + "num_input_tokens_seen": 12381600, + "step": 58675 + }, + { + "epoch": 6.455445544554456, + "grad_norm": 0.21270111203193665, + "learning_rate": 4.281493587159053e-05, + "loss": 0.0207, + "num_input_tokens_seen": 12382592, + "step": 58680 + }, + { + "epoch": 6.455995599559956, + "grad_norm": 0.1829473078250885, + "learning_rate": 4.2813251966402066e-05, + "loss": 0.0083, + "num_input_tokens_seen": 12383648, + "step": 58685 + }, + { + "epoch": 6.456545654565456, + "grad_norm": 0.3066900372505188, + "learning_rate": 4.281156789703741e-05, + "loss": 0.0107, + "num_input_tokens_seen": 12384768, + "step": 58690 + }, + { + "epoch": 6.457095709570957, + "grad_norm": 0.7078437209129333, + "learning_rate": 4.280988366351208e-05, + "loss": 0.0395, + "num_input_tokens_seen": 12385760, + "step": 58695 + }, + { + "epoch": 6.457645764576458, + "grad_norm": 0.33803069591522217, + "learning_rate": 4.2808199265841594e-05, + "loss": 0.085, + "num_input_tokens_seen": 12386784, + "step": 58700 + }, + { + "epoch": 6.458195819581958, + "grad_norm": 0.2538500130176544, + "learning_rate": 4.280651470404148e-05, + "loss": 0.0084, + "num_input_tokens_seen": 12387872, + "step": 58705 + }, + { + "epoch": 6.458745874587459, + "grad_norm": 0.8274948596954346, + "learning_rate": 4.280482997812727e-05, + "loss": 0.0571, + "num_input_tokens_seen": 12388896, + "step": 58710 + }, + { + "epoch": 6.459295929592959, + "grad_norm": 0.47398367524147034, + "learning_rate": 4.280314508811448e-05, + "loss": 0.0935, + "num_input_tokens_seen": 12390080, + "step": 58715 + }, + { + "epoch": 6.45984598459846, + "grad_norm": 0.04166139289736748, + "learning_rate": 4.280146003401865e-05, + "loss": 0.0055, + "num_input_tokens_seen": 12391136, + "step": 58720 + }, + { + "epoch": 6.46039603960396, + "grad_norm": 0.33163344860076904, + "learning_rate": 4.27997748158553e-05, + "loss": 0.0739, + "num_input_tokens_seen": 12392160, + "step": 58725 + }, + { + "epoch": 6.460946094609461, + "grad_norm": 0.41690927743911743, + "learning_rate": 4.279808943363998e-05, + "loss": 0.0716, + "num_input_tokens_seen": 12393248, + "step": 58730 + }, + { + "epoch": 6.461496149614962, + "grad_norm": 0.05363484099507332, + "learning_rate": 4.279640388738819e-05, + "loss": 0.0309, + "num_input_tokens_seen": 12394368, + "step": 58735 + }, + { + "epoch": 6.462046204620462, + "grad_norm": 1.0417877435684204, + "learning_rate": 4.27947181771155e-05, + "loss": 0.078, + "num_input_tokens_seen": 12395328, + "step": 58740 + }, + { + "epoch": 6.462596259625963, + "grad_norm": 0.022146571427583694, + "learning_rate": 4.279303230283742e-05, + "loss": 0.0694, + "num_input_tokens_seen": 12396352, + "step": 58745 + }, + { + "epoch": 6.463146314631463, + "grad_norm": 0.004151441622525454, + "learning_rate": 4.279134626456951e-05, + "loss": 0.0925, + "num_input_tokens_seen": 12397344, + "step": 58750 + }, + { + "epoch": 6.463696369636963, + "grad_norm": 1.0597496032714844, + "learning_rate": 4.278966006232729e-05, + "loss": 0.0578, + "num_input_tokens_seen": 12398432, + "step": 58755 + }, + { + "epoch": 6.4642464246424645, + "grad_norm": 0.009993762709200382, + "learning_rate": 4.278797369612631e-05, + "loss": 0.1167, + "num_input_tokens_seen": 12399456, + "step": 58760 + }, + { + "epoch": 6.464796479647965, + "grad_norm": 0.018799902871251106, + "learning_rate": 4.278628716598212e-05, + "loss": 0.1294, + "num_input_tokens_seen": 12400480, + "step": 58765 + }, + { + "epoch": 6.465346534653466, + "grad_norm": 0.6072202324867249, + "learning_rate": 4.2784600471910255e-05, + "loss": 0.0611, + "num_input_tokens_seen": 12401536, + "step": 58770 + }, + { + "epoch": 6.465896589658966, + "grad_norm": 0.11211177706718445, + "learning_rate": 4.278291361392625e-05, + "loss": 0.0262, + "num_input_tokens_seen": 12402656, + "step": 58775 + }, + { + "epoch": 6.466446644664466, + "grad_norm": 0.02491569332778454, + "learning_rate": 4.278122659204567e-05, + "loss": 0.1053, + "num_input_tokens_seen": 12403712, + "step": 58780 + }, + { + "epoch": 6.466996699669967, + "grad_norm": 0.010634033940732479, + "learning_rate": 4.2779539406284055e-05, + "loss": 0.0437, + "num_input_tokens_seen": 12404832, + "step": 58785 + }, + { + "epoch": 6.4675467546754675, + "grad_norm": 0.9450892806053162, + "learning_rate": 4.277785205665696e-05, + "loss": 0.0564, + "num_input_tokens_seen": 12405984, + "step": 58790 + }, + { + "epoch": 6.468096809680969, + "grad_norm": 0.08624476194381714, + "learning_rate": 4.277616454317993e-05, + "loss": 0.0366, + "num_input_tokens_seen": 12406976, + "step": 58795 + }, + { + "epoch": 6.468646864686469, + "grad_norm": 0.27124616503715515, + "learning_rate": 4.277447686586853e-05, + "loss": 0.0958, + "num_input_tokens_seen": 12408032, + "step": 58800 + }, + { + "epoch": 6.469196919691969, + "grad_norm": 0.045103415846824646, + "learning_rate": 4.27727890247383e-05, + "loss": 0.0279, + "num_input_tokens_seen": 12409088, + "step": 58805 + }, + { + "epoch": 6.46974697469747, + "grad_norm": 0.044893939048051834, + "learning_rate": 4.27711010198048e-05, + "loss": 0.0302, + "num_input_tokens_seen": 12410208, + "step": 58810 + }, + { + "epoch": 6.47029702970297, + "grad_norm": 0.08676129579544067, + "learning_rate": 4.276941285108359e-05, + "loss": 0.0585, + "num_input_tokens_seen": 12411264, + "step": 58815 + }, + { + "epoch": 6.4708470847084705, + "grad_norm": 0.28590309619903564, + "learning_rate": 4.276772451859022e-05, + "loss": 0.0458, + "num_input_tokens_seen": 12412352, + "step": 58820 + }, + { + "epoch": 6.471397139713972, + "grad_norm": 0.23009468615055084, + "learning_rate": 4.276603602234027e-05, + "loss": 0.0309, + "num_input_tokens_seen": 12413440, + "step": 58825 + }, + { + "epoch": 6.471947194719472, + "grad_norm": 0.0406542606651783, + "learning_rate": 4.2764347362349285e-05, + "loss": 0.0073, + "num_input_tokens_seen": 12414464, + "step": 58830 + }, + { + "epoch": 6.472497249724973, + "grad_norm": 1.2354507446289062, + "learning_rate": 4.276265853863285e-05, + "loss": 0.0303, + "num_input_tokens_seen": 12415616, + "step": 58835 + }, + { + "epoch": 6.473047304730473, + "grad_norm": 0.01760176382958889, + "learning_rate": 4.2760969551206495e-05, + "loss": 0.0277, + "num_input_tokens_seen": 12416608, + "step": 58840 + }, + { + "epoch": 6.473597359735973, + "grad_norm": 0.023051699623465538, + "learning_rate": 4.2759280400085815e-05, + "loss": 0.0032, + "num_input_tokens_seen": 12417664, + "step": 58845 + }, + { + "epoch": 6.474147414741474, + "grad_norm": 0.05058623105287552, + "learning_rate": 4.2757591085286366e-05, + "loss": 0.018, + "num_input_tokens_seen": 12418784, + "step": 58850 + }, + { + "epoch": 6.474697469746975, + "grad_norm": 0.0706670805811882, + "learning_rate": 4.2755901606823725e-05, + "loss": 0.0084, + "num_input_tokens_seen": 12419840, + "step": 58855 + }, + { + "epoch": 6.475247524752476, + "grad_norm": 0.040403567254543304, + "learning_rate": 4.275421196471346e-05, + "loss": 0.0643, + "num_input_tokens_seen": 12420896, + "step": 58860 + }, + { + "epoch": 6.475797579757976, + "grad_norm": 1.8277173042297363, + "learning_rate": 4.275252215897114e-05, + "loss": 0.0656, + "num_input_tokens_seen": 12421888, + "step": 58865 + }, + { + "epoch": 6.476347634763476, + "grad_norm": 0.6224768161773682, + "learning_rate": 4.275083218961234e-05, + "loss": 0.0412, + "num_input_tokens_seen": 12422976, + "step": 58870 + }, + { + "epoch": 6.476897689768977, + "grad_norm": 0.13675270974636078, + "learning_rate": 4.2749142056652635e-05, + "loss": 0.0153, + "num_input_tokens_seen": 12424064, + "step": 58875 + }, + { + "epoch": 6.477447744774477, + "grad_norm": 2.156634569168091, + "learning_rate": 4.2747451760107603e-05, + "loss": 0.06, + "num_input_tokens_seen": 12425088, + "step": 58880 + }, + { + "epoch": 6.477997799779978, + "grad_norm": 0.01395806297659874, + "learning_rate": 4.274576129999284e-05, + "loss": 0.0306, + "num_input_tokens_seen": 12426208, + "step": 58885 + }, + { + "epoch": 6.478547854785479, + "grad_norm": 0.08085625618696213, + "learning_rate": 4.274407067632389e-05, + "loss": 0.0101, + "num_input_tokens_seen": 12427264, + "step": 58890 + }, + { + "epoch": 6.479097909790979, + "grad_norm": 2.0585873126983643, + "learning_rate": 4.274237988911637e-05, + "loss": 0.0952, + "num_input_tokens_seen": 12428320, + "step": 58895 + }, + { + "epoch": 6.47964796479648, + "grad_norm": 0.05052101984620094, + "learning_rate": 4.274068893838584e-05, + "loss": 0.0233, + "num_input_tokens_seen": 12429408, + "step": 58900 + }, + { + "epoch": 6.48019801980198, + "grad_norm": 0.12006466090679169, + "learning_rate": 4.2738997824147896e-05, + "loss": 0.1448, + "num_input_tokens_seen": 12430432, + "step": 58905 + }, + { + "epoch": 6.48074807480748, + "grad_norm": 0.09437013417482376, + "learning_rate": 4.273730654641812e-05, + "loss": 0.0205, + "num_input_tokens_seen": 12431488, + "step": 58910 + }, + { + "epoch": 6.4812981298129815, + "grad_norm": 0.0075883581303060055, + "learning_rate": 4.27356151052121e-05, + "loss": 0.1426, + "num_input_tokens_seen": 12432608, + "step": 58915 + }, + { + "epoch": 6.481848184818482, + "grad_norm": 0.013905012048780918, + "learning_rate": 4.273392350054544e-05, + "loss": 0.0468, + "num_input_tokens_seen": 12433664, + "step": 58920 + }, + { + "epoch": 6.482398239823983, + "grad_norm": 0.020543618127703667, + "learning_rate": 4.2732231732433695e-05, + "loss": 0.0028, + "num_input_tokens_seen": 12434752, + "step": 58925 + }, + { + "epoch": 6.482948294829483, + "grad_norm": 0.014043756760656834, + "learning_rate": 4.273053980089249e-05, + "loss": 0.0171, + "num_input_tokens_seen": 12435808, + "step": 58930 + }, + { + "epoch": 6.483498349834983, + "grad_norm": 0.050557468086481094, + "learning_rate": 4.27288477059374e-05, + "loss": 0.107, + "num_input_tokens_seen": 12436864, + "step": 58935 + }, + { + "epoch": 6.484048404840484, + "grad_norm": 0.0560024194419384, + "learning_rate": 4.272715544758404e-05, + "loss": 0.0224, + "num_input_tokens_seen": 12437920, + "step": 58940 + }, + { + "epoch": 6.4845984598459845, + "grad_norm": 0.8988162279129028, + "learning_rate": 4.2725463025847984e-05, + "loss": 0.034, + "num_input_tokens_seen": 12439008, + "step": 58945 + }, + { + "epoch": 6.485148514851485, + "grad_norm": 0.029947588220238686, + "learning_rate": 4.272377044074485e-05, + "loss": 0.0197, + "num_input_tokens_seen": 12440032, + "step": 58950 + }, + { + "epoch": 6.485698569856986, + "grad_norm": 0.01756327971816063, + "learning_rate": 4.272207769229022e-05, + "loss": 0.0963, + "num_input_tokens_seen": 12441088, + "step": 58955 + }, + { + "epoch": 6.486248624862486, + "grad_norm": 1.2407641410827637, + "learning_rate": 4.27203847804997e-05, + "loss": 0.0969, + "num_input_tokens_seen": 12442112, + "step": 58960 + }, + { + "epoch": 6.486798679867987, + "grad_norm": 1.0125794410705566, + "learning_rate": 4.2718691705388917e-05, + "loss": 0.0176, + "num_input_tokens_seen": 12443168, + "step": 58965 + }, + { + "epoch": 6.487348734873487, + "grad_norm": 0.020596148446202278, + "learning_rate": 4.271699846697344e-05, + "loss": 0.0527, + "num_input_tokens_seen": 12444288, + "step": 58970 + }, + { + "epoch": 6.4878987898789875, + "grad_norm": 0.012813694775104523, + "learning_rate": 4.2715305065268884e-05, + "loss": 0.0532, + "num_input_tokens_seen": 12445344, + "step": 58975 + }, + { + "epoch": 6.488448844884489, + "grad_norm": 0.02048022486269474, + "learning_rate": 4.2713611500290876e-05, + "loss": 0.0703, + "num_input_tokens_seen": 12446368, + "step": 58980 + }, + { + "epoch": 6.488998899889989, + "grad_norm": 0.03414667025208473, + "learning_rate": 4.2711917772055e-05, + "loss": 0.0083, + "num_input_tokens_seen": 12447424, + "step": 58985 + }, + { + "epoch": 6.48954895489549, + "grad_norm": 0.19810684025287628, + "learning_rate": 4.2710223880576874e-05, + "loss": 0.0114, + "num_input_tokens_seen": 12448416, + "step": 58990 + }, + { + "epoch": 6.49009900990099, + "grad_norm": 0.44611260294914246, + "learning_rate": 4.2708529825872125e-05, + "loss": 0.0631, + "num_input_tokens_seen": 12449504, + "step": 58995 + }, + { + "epoch": 6.49064906490649, + "grad_norm": 0.009802172891795635, + "learning_rate": 4.2706835607956344e-05, + "loss": 0.0081, + "num_input_tokens_seen": 12450560, + "step": 59000 + }, + { + "epoch": 6.491199119911991, + "grad_norm": 0.03732439875602722, + "learning_rate": 4.270514122684516e-05, + "loss": 0.027, + "num_input_tokens_seen": 12451616, + "step": 59005 + }, + { + "epoch": 6.491749174917492, + "grad_norm": 0.2550029158592224, + "learning_rate": 4.2703446682554186e-05, + "loss": 0.0595, + "num_input_tokens_seen": 12452672, + "step": 59010 + }, + { + "epoch": 6.492299229922993, + "grad_norm": 0.02020212821662426, + "learning_rate": 4.2701751975099034e-05, + "loss": 0.059, + "num_input_tokens_seen": 12453696, + "step": 59015 + }, + { + "epoch": 6.492849284928493, + "grad_norm": 0.04293600842356682, + "learning_rate": 4.270005710449534e-05, + "loss": 0.0742, + "num_input_tokens_seen": 12454688, + "step": 59020 + }, + { + "epoch": 6.493399339933993, + "grad_norm": 0.013752471655607224, + "learning_rate": 4.26983620707587e-05, + "loss": 0.005, + "num_input_tokens_seen": 12455776, + "step": 59025 + }, + { + "epoch": 6.493949394939494, + "grad_norm": 0.07875404506921768, + "learning_rate": 4.269666687390476e-05, + "loss": 0.1207, + "num_input_tokens_seen": 12456864, + "step": 59030 + }, + { + "epoch": 6.494499449944994, + "grad_norm": 0.1152418777346611, + "learning_rate": 4.2694971513949124e-05, + "loss": 0.0069, + "num_input_tokens_seen": 12457920, + "step": 59035 + }, + { + "epoch": 6.4950495049504955, + "grad_norm": 0.0403679758310318, + "learning_rate": 4.269327599090743e-05, + "loss": 0.1585, + "num_input_tokens_seen": 12459040, + "step": 59040 + }, + { + "epoch": 6.495599559955996, + "grad_norm": 0.1453893631696701, + "learning_rate": 4.26915803047953e-05, + "loss": 0.1111, + "num_input_tokens_seen": 12460128, + "step": 59045 + }, + { + "epoch": 6.496149614961496, + "grad_norm": 1.6656992435455322, + "learning_rate": 4.268988445562836e-05, + "loss": 0.1132, + "num_input_tokens_seen": 12461216, + "step": 59050 + }, + { + "epoch": 6.496699669966997, + "grad_norm": 0.19401369988918304, + "learning_rate": 4.268818844342225e-05, + "loss": 0.0179, + "num_input_tokens_seen": 12462336, + "step": 59055 + }, + { + "epoch": 6.497249724972497, + "grad_norm": 1.0975627899169922, + "learning_rate": 4.268649226819259e-05, + "loss": 0.0688, + "num_input_tokens_seen": 12463392, + "step": 59060 + }, + { + "epoch": 6.497799779977997, + "grad_norm": 0.024287130683660507, + "learning_rate": 4.268479592995502e-05, + "loss": 0.0041, + "num_input_tokens_seen": 12464384, + "step": 59065 + }, + { + "epoch": 6.4983498349834985, + "grad_norm": 0.04199643060564995, + "learning_rate": 4.268309942872518e-05, + "loss": 0.0248, + "num_input_tokens_seen": 12465472, + "step": 59070 + }, + { + "epoch": 6.498899889988999, + "grad_norm": 0.6983339190483093, + "learning_rate": 4.2681402764518684e-05, + "loss": 0.0888, + "num_input_tokens_seen": 12466528, + "step": 59075 + }, + { + "epoch": 6.4994499449945, + "grad_norm": 0.13288618624210358, + "learning_rate": 4.267970593735119e-05, + "loss": 0.0153, + "num_input_tokens_seen": 12467552, + "step": 59080 + }, + { + "epoch": 6.5, + "grad_norm": 0.07370292395353317, + "learning_rate": 4.2678008947238336e-05, + "loss": 0.1014, + "num_input_tokens_seen": 12468640, + "step": 59085 + }, + { + "epoch": 6.5005500550055, + "grad_norm": 0.2687896490097046, + "learning_rate": 4.2676311794195744e-05, + "loss": 0.0111, + "num_input_tokens_seen": 12469664, + "step": 59090 + }, + { + "epoch": 6.501100110011001, + "grad_norm": 1.040871262550354, + "learning_rate": 4.267461447823907e-05, + "loss": 0.1235, + "num_input_tokens_seen": 12470720, + "step": 59095 + }, + { + "epoch": 6.5016501650165015, + "grad_norm": 0.04357501491904259, + "learning_rate": 4.267291699938396e-05, + "loss": 0.025, + "num_input_tokens_seen": 12471808, + "step": 59100 + }, + { + "epoch": 6.502200220022003, + "grad_norm": 0.0998031347990036, + "learning_rate": 4.267121935764605e-05, + "loss": 0.0201, + "num_input_tokens_seen": 12472832, + "step": 59105 + }, + { + "epoch": 6.502750275027503, + "grad_norm": 0.17093117535114288, + "learning_rate": 4.2669521553041e-05, + "loss": 0.0148, + "num_input_tokens_seen": 12473920, + "step": 59110 + }, + { + "epoch": 6.503300330033003, + "grad_norm": 0.051198430359363556, + "learning_rate": 4.2667823585584445e-05, + "loss": 0.02, + "num_input_tokens_seen": 12475008, + "step": 59115 + }, + { + "epoch": 6.503850385038504, + "grad_norm": 0.11277107149362564, + "learning_rate": 4.266612545529203e-05, + "loss": 0.0346, + "num_input_tokens_seen": 12476064, + "step": 59120 + }, + { + "epoch": 6.504400440044004, + "grad_norm": 0.36976152658462524, + "learning_rate": 4.266442716217942e-05, + "loss": 0.0115, + "num_input_tokens_seen": 12477152, + "step": 59125 + }, + { + "epoch": 6.5049504950495045, + "grad_norm": 0.020983463153243065, + "learning_rate": 4.2662728706262255e-05, + "loss": 0.0214, + "num_input_tokens_seen": 12478240, + "step": 59130 + }, + { + "epoch": 6.505500550055006, + "grad_norm": 0.33291250467300415, + "learning_rate": 4.2661030087556196e-05, + "loss": 0.0213, + "num_input_tokens_seen": 12479296, + "step": 59135 + }, + { + "epoch": 6.506050605060506, + "grad_norm": 0.10609463602304459, + "learning_rate": 4.265933130607691e-05, + "loss": 0.0679, + "num_input_tokens_seen": 12480416, + "step": 59140 + }, + { + "epoch": 6.506600660066007, + "grad_norm": 0.04844127222895622, + "learning_rate": 4.2657632361840024e-05, + "loss": 0.0461, + "num_input_tokens_seen": 12481472, + "step": 59145 + }, + { + "epoch": 6.507150715071507, + "grad_norm": 0.12388291954994202, + "learning_rate": 4.2655933254861226e-05, + "loss": 0.0159, + "num_input_tokens_seen": 12482496, + "step": 59150 + }, + { + "epoch": 6.507700770077007, + "grad_norm": 0.5866649150848389, + "learning_rate": 4.265423398515616e-05, + "loss": 0.0519, + "num_input_tokens_seen": 12483552, + "step": 59155 + }, + { + "epoch": 6.508250825082508, + "grad_norm": 0.1744696944952011, + "learning_rate": 4.2652534552740496e-05, + "loss": 0.0111, + "num_input_tokens_seen": 12484704, + "step": 59160 + }, + { + "epoch": 6.508800880088009, + "grad_norm": 0.122429758310318, + "learning_rate": 4.2650834957629885e-05, + "loss": 0.0046, + "num_input_tokens_seen": 12485728, + "step": 59165 + }, + { + "epoch": 6.50935093509351, + "grad_norm": 0.041515301913022995, + "learning_rate": 4.2649135199839996e-05, + "loss": 0.0294, + "num_input_tokens_seen": 12486720, + "step": 59170 + }, + { + "epoch": 6.50990099009901, + "grad_norm": 0.08968774974346161, + "learning_rate": 4.2647435279386496e-05, + "loss": 0.0168, + "num_input_tokens_seen": 12487808, + "step": 59175 + }, + { + "epoch": 6.51045104510451, + "grad_norm": 0.003654149128124118, + "learning_rate": 4.264573519628505e-05, + "loss": 0.022, + "num_input_tokens_seen": 12488800, + "step": 59180 + }, + { + "epoch": 6.511001100110011, + "grad_norm": 0.020954038947820663, + "learning_rate": 4.2644034950551344e-05, + "loss": 0.0099, + "num_input_tokens_seen": 12489856, + "step": 59185 + }, + { + "epoch": 6.511551155115511, + "grad_norm": 0.22133943438529968, + "learning_rate": 4.2642334542201026e-05, + "loss": 0.011, + "num_input_tokens_seen": 12490848, + "step": 59190 + }, + { + "epoch": 6.512101210121012, + "grad_norm": 0.042253024876117706, + "learning_rate": 4.264063397124978e-05, + "loss": 0.0017, + "num_input_tokens_seen": 12491872, + "step": 59195 + }, + { + "epoch": 6.512651265126513, + "grad_norm": 0.11440309137105942, + "learning_rate": 4.2638933237713264e-05, + "loss": 0.108, + "num_input_tokens_seen": 12492960, + "step": 59200 + }, + { + "epoch": 6.513201320132013, + "grad_norm": 0.026534561067819595, + "learning_rate": 4.263723234160717e-05, + "loss": 0.0026, + "num_input_tokens_seen": 12494048, + "step": 59205 + }, + { + "epoch": 6.513751375137514, + "grad_norm": 0.0036597582511603832, + "learning_rate": 4.2635531282947175e-05, + "loss": 0.0229, + "num_input_tokens_seen": 12495136, + "step": 59210 + }, + { + "epoch": 6.514301430143014, + "grad_norm": 1.846217393875122, + "learning_rate": 4.2633830061748946e-05, + "loss": 0.1393, + "num_input_tokens_seen": 12496224, + "step": 59215 + }, + { + "epoch": 6.514851485148515, + "grad_norm": 0.7211006283760071, + "learning_rate": 4.2632128678028175e-05, + "loss": 0.0758, + "num_input_tokens_seen": 12497248, + "step": 59220 + }, + { + "epoch": 6.5154015401540155, + "grad_norm": 1.6357877254486084, + "learning_rate": 4.263042713180052e-05, + "loss": 0.1531, + "num_input_tokens_seen": 12498336, + "step": 59225 + }, + { + "epoch": 6.515951595159516, + "grad_norm": 0.26094773411750793, + "learning_rate": 4.2628725423081684e-05, + "loss": 0.0099, + "num_input_tokens_seen": 12499328, + "step": 59230 + }, + { + "epoch": 6.516501650165017, + "grad_norm": 1.939943790435791, + "learning_rate": 4.2627023551887346e-05, + "loss": 0.1243, + "num_input_tokens_seen": 12500352, + "step": 59235 + }, + { + "epoch": 6.517051705170517, + "grad_norm": 0.011483191512525082, + "learning_rate": 4.262532151823319e-05, + "loss": 0.0105, + "num_input_tokens_seen": 12501376, + "step": 59240 + }, + { + "epoch": 6.517601760176017, + "grad_norm": 0.6169926524162292, + "learning_rate": 4.26236193221349e-05, + "loss": 0.0121, + "num_input_tokens_seen": 12502432, + "step": 59245 + }, + { + "epoch": 6.518151815181518, + "grad_norm": 0.20036177337169647, + "learning_rate": 4.262191696360817e-05, + "loss": 0.0182, + "num_input_tokens_seen": 12503520, + "step": 59250 + }, + { + "epoch": 6.5187018701870185, + "grad_norm": 2.997713088989258, + "learning_rate": 4.2620214442668696e-05, + "loss": 0.0912, + "num_input_tokens_seen": 12504640, + "step": 59255 + }, + { + "epoch": 6.51925192519252, + "grad_norm": 0.016768047586083412, + "learning_rate": 4.261851175933215e-05, + "loss": 0.106, + "num_input_tokens_seen": 12505664, + "step": 59260 + }, + { + "epoch": 6.51980198019802, + "grad_norm": 0.021896187216043472, + "learning_rate": 4.261680891361423e-05, + "loss": 0.019, + "num_input_tokens_seen": 12506816, + "step": 59265 + }, + { + "epoch": 6.52035203520352, + "grad_norm": 0.01727159507572651, + "learning_rate": 4.261510590553065e-05, + "loss": 0.0528, + "num_input_tokens_seen": 12507904, + "step": 59270 + }, + { + "epoch": 6.520902090209021, + "grad_norm": 0.015048977918922901, + "learning_rate": 4.2613402735097075e-05, + "loss": 0.0219, + "num_input_tokens_seen": 12508928, + "step": 59275 + }, + { + "epoch": 6.521452145214521, + "grad_norm": 0.055408552289009094, + "learning_rate": 4.261169940232923e-05, + "loss": 0.0031, + "num_input_tokens_seen": 12509952, + "step": 59280 + }, + { + "epoch": 6.522002200220022, + "grad_norm": 0.036544427275657654, + "learning_rate": 4.260999590724281e-05, + "loss": 0.0373, + "num_input_tokens_seen": 12511040, + "step": 59285 + }, + { + "epoch": 6.522552255225523, + "grad_norm": 0.28818029165267944, + "learning_rate": 4.260829224985349e-05, + "loss": 0.0247, + "num_input_tokens_seen": 12512096, + "step": 59290 + }, + { + "epoch": 6.523102310231023, + "grad_norm": 0.1151842325925827, + "learning_rate": 4.2606588430177e-05, + "loss": 0.055, + "num_input_tokens_seen": 12513152, + "step": 59295 + }, + { + "epoch": 6.523652365236524, + "grad_norm": 0.010408779606223106, + "learning_rate": 4.260488444822903e-05, + "loss": 0.057, + "num_input_tokens_seen": 12514144, + "step": 59300 + }, + { + "epoch": 6.524202420242024, + "grad_norm": 0.12723620235919952, + "learning_rate": 4.260318030402529e-05, + "loss": 0.0064, + "num_input_tokens_seen": 12515200, + "step": 59305 + }, + { + "epoch": 6.524752475247524, + "grad_norm": 0.5167088508605957, + "learning_rate": 4.260147599758148e-05, + "loss": 0.105, + "num_input_tokens_seen": 12516256, + "step": 59310 + }, + { + "epoch": 6.525302530253025, + "grad_norm": 0.015265758149325848, + "learning_rate": 4.259977152891331e-05, + "loss": 0.0043, + "num_input_tokens_seen": 12517312, + "step": 59315 + }, + { + "epoch": 6.525852585258526, + "grad_norm": 0.03769713267683983, + "learning_rate": 4.25980668980365e-05, + "loss": 0.0469, + "num_input_tokens_seen": 12518368, + "step": 59320 + }, + { + "epoch": 6.526402640264027, + "grad_norm": 0.16493651270866394, + "learning_rate": 4.259636210496675e-05, + "loss": 0.0373, + "num_input_tokens_seen": 12519424, + "step": 59325 + }, + { + "epoch": 6.526952695269527, + "grad_norm": 0.19046911597251892, + "learning_rate": 4.259465714971977e-05, + "loss": 0.0954, + "num_input_tokens_seen": 12520576, + "step": 59330 + }, + { + "epoch": 6.527502750275027, + "grad_norm": 0.029361294582486153, + "learning_rate": 4.259295203231128e-05, + "loss": 0.0076, + "num_input_tokens_seen": 12521664, + "step": 59335 + }, + { + "epoch": 6.528052805280528, + "grad_norm": 0.011105875484645367, + "learning_rate": 4.2591246752756986e-05, + "loss": 0.0931, + "num_input_tokens_seen": 12522688, + "step": 59340 + }, + { + "epoch": 6.528602860286028, + "grad_norm": 0.14681078493595123, + "learning_rate": 4.258954131107262e-05, + "loss": 0.0557, + "num_input_tokens_seen": 12523744, + "step": 59345 + }, + { + "epoch": 6.5291529152915295, + "grad_norm": 0.018828222528100014, + "learning_rate": 4.2587835707273895e-05, + "loss": 0.0742, + "num_input_tokens_seen": 12524832, + "step": 59350 + }, + { + "epoch": 6.52970297029703, + "grad_norm": 0.02273421362042427, + "learning_rate": 4.2586129941376515e-05, + "loss": 0.0263, + "num_input_tokens_seen": 12525888, + "step": 59355 + }, + { + "epoch": 6.53025302530253, + "grad_norm": 0.5477719306945801, + "learning_rate": 4.258442401339622e-05, + "loss": 0.018, + "num_input_tokens_seen": 12527008, + "step": 59360 + }, + { + "epoch": 6.530803080308031, + "grad_norm": 0.006850811652839184, + "learning_rate": 4.258271792334873e-05, + "loss": 0.0095, + "num_input_tokens_seen": 12528064, + "step": 59365 + }, + { + "epoch": 6.531353135313531, + "grad_norm": 0.09381376206874847, + "learning_rate": 4.2581011671249766e-05, + "loss": 0.1624, + "num_input_tokens_seen": 12529120, + "step": 59370 + }, + { + "epoch": 6.531903190319031, + "grad_norm": 0.09955358505249023, + "learning_rate": 4.2579305257115057e-05, + "loss": 0.0049, + "num_input_tokens_seen": 12530176, + "step": 59375 + }, + { + "epoch": 6.5324532453245325, + "grad_norm": 0.050101976841688156, + "learning_rate": 4.257759868096032e-05, + "loss": 0.027, + "num_input_tokens_seen": 12531264, + "step": 59380 + }, + { + "epoch": 6.533003300330033, + "grad_norm": 0.13358302414417267, + "learning_rate": 4.2575891942801286e-05, + "loss": 0.0635, + "num_input_tokens_seen": 12532320, + "step": 59385 + }, + { + "epoch": 6.533553355335534, + "grad_norm": 0.39758676290512085, + "learning_rate": 4.2574185042653695e-05, + "loss": 0.1089, + "num_input_tokens_seen": 12533344, + "step": 59390 + }, + { + "epoch": 6.534103410341034, + "grad_norm": 0.02103075571358204, + "learning_rate": 4.257247798053328e-05, + "loss": 0.0077, + "num_input_tokens_seen": 12534400, + "step": 59395 + }, + { + "epoch": 6.534653465346535, + "grad_norm": 0.07135724276304245, + "learning_rate": 4.257077075645576e-05, + "loss": 0.0137, + "num_input_tokens_seen": 12535392, + "step": 59400 + }, + { + "epoch": 6.535203520352035, + "grad_norm": 0.5735506415367126, + "learning_rate": 4.256906337043688e-05, + "loss": 0.0267, + "num_input_tokens_seen": 12536512, + "step": 59405 + }, + { + "epoch": 6.5357535753575355, + "grad_norm": 0.8129149079322815, + "learning_rate": 4.256735582249237e-05, + "loss": 0.0541, + "num_input_tokens_seen": 12537504, + "step": 59410 + }, + { + "epoch": 6.536303630363037, + "grad_norm": 0.011018123477697372, + "learning_rate": 4.256564811263797e-05, + "loss": 0.0156, + "num_input_tokens_seen": 12538528, + "step": 59415 + }, + { + "epoch": 6.536853685368537, + "grad_norm": 0.1578523963689804, + "learning_rate": 4.2563940240889425e-05, + "loss": 0.0434, + "num_input_tokens_seen": 12539584, + "step": 59420 + }, + { + "epoch": 6.537403740374037, + "grad_norm": 0.440743625164032, + "learning_rate": 4.256223220726247e-05, + "loss": 0.0945, + "num_input_tokens_seen": 12540640, + "step": 59425 + }, + { + "epoch": 6.537953795379538, + "grad_norm": 1.5928711891174316, + "learning_rate": 4.256052401177285e-05, + "loss": 0.0598, + "num_input_tokens_seen": 12541728, + "step": 59430 + }, + { + "epoch": 6.538503850385038, + "grad_norm": 0.013408496044576168, + "learning_rate": 4.25588156544363e-05, + "loss": 0.0209, + "num_input_tokens_seen": 12542752, + "step": 59435 + }, + { + "epoch": 6.539053905390539, + "grad_norm": 0.11794663220643997, + "learning_rate": 4.255710713526857e-05, + "loss": 0.066, + "num_input_tokens_seen": 12543776, + "step": 59440 + }, + { + "epoch": 6.53960396039604, + "grad_norm": 0.32560786604881287, + "learning_rate": 4.2555398454285424e-05, + "loss": 0.0186, + "num_input_tokens_seen": 12544800, + "step": 59445 + }, + { + "epoch": 6.54015401540154, + "grad_norm": 0.699972927570343, + "learning_rate": 4.255368961150258e-05, + "loss": 0.0647, + "num_input_tokens_seen": 12545856, + "step": 59450 + }, + { + "epoch": 6.540704070407041, + "grad_norm": 1.2522153854370117, + "learning_rate": 4.25519806069358e-05, + "loss": 0.0661, + "num_input_tokens_seen": 12546944, + "step": 59455 + }, + { + "epoch": 6.541254125412541, + "grad_norm": 0.12598282098770142, + "learning_rate": 4.255027144060085e-05, + "loss": 0.0703, + "num_input_tokens_seen": 12548032, + "step": 59460 + }, + { + "epoch": 6.541804180418042, + "grad_norm": 0.7687423229217529, + "learning_rate": 4.254856211251346e-05, + "loss": 0.0595, + "num_input_tokens_seen": 12549120, + "step": 59465 + }, + { + "epoch": 6.542354235423542, + "grad_norm": 1.079203486442566, + "learning_rate": 4.254685262268939e-05, + "loss": 0.1438, + "num_input_tokens_seen": 12550240, + "step": 59470 + }, + { + "epoch": 6.542904290429043, + "grad_norm": 0.08188258856534958, + "learning_rate": 4.254514297114441e-05, + "loss": 0.0929, + "num_input_tokens_seen": 12551296, + "step": 59475 + }, + { + "epoch": 6.543454345434544, + "grad_norm": 0.24825836718082428, + "learning_rate": 4.254343315789427e-05, + "loss": 0.01, + "num_input_tokens_seen": 12552288, + "step": 59480 + }, + { + "epoch": 6.544004400440044, + "grad_norm": 0.7184412479400635, + "learning_rate": 4.254172318295471e-05, + "loss": 0.0404, + "num_input_tokens_seen": 12553312, + "step": 59485 + }, + { + "epoch": 6.544554455445544, + "grad_norm": 0.21763834357261658, + "learning_rate": 4.254001304634151e-05, + "loss": 0.0201, + "num_input_tokens_seen": 12554400, + "step": 59490 + }, + { + "epoch": 6.545104510451045, + "grad_norm": 0.043261002749204636, + "learning_rate": 4.2538302748070427e-05, + "loss": 0.0941, + "num_input_tokens_seen": 12555424, + "step": 59495 + }, + { + "epoch": 6.5456545654565454, + "grad_norm": 0.43083813786506653, + "learning_rate": 4.253659228815722e-05, + "loss": 0.0965, + "num_input_tokens_seen": 12556448, + "step": 59500 + }, + { + "epoch": 6.5462046204620465, + "grad_norm": 0.023693663999438286, + "learning_rate": 4.253488166661767e-05, + "loss": 0.01, + "num_input_tokens_seen": 12557568, + "step": 59505 + }, + { + "epoch": 6.546754675467547, + "grad_norm": 0.025828523561358452, + "learning_rate": 4.253317088346751e-05, + "loss": 0.0487, + "num_input_tokens_seen": 12558528, + "step": 59510 + }, + { + "epoch": 6.547304730473047, + "grad_norm": 0.04175840690732002, + "learning_rate": 4.253145993872254e-05, + "loss": 0.0644, + "num_input_tokens_seen": 12559648, + "step": 59515 + }, + { + "epoch": 6.547854785478548, + "grad_norm": 0.018512139096856117, + "learning_rate": 4.2529748832398513e-05, + "loss": 0.0043, + "num_input_tokens_seen": 12560672, + "step": 59520 + }, + { + "epoch": 6.548404840484048, + "grad_norm": 0.050964683294296265, + "learning_rate": 4.2528037564511204e-05, + "loss": 0.0306, + "num_input_tokens_seen": 12561696, + "step": 59525 + }, + { + "epoch": 6.548954895489549, + "grad_norm": 0.5629640817642212, + "learning_rate": 4.2526326135076375e-05, + "loss": 0.0539, + "num_input_tokens_seen": 12562816, + "step": 59530 + }, + { + "epoch": 6.5495049504950495, + "grad_norm": 2.074566125869751, + "learning_rate": 4.252461454410982e-05, + "loss": 0.0361, + "num_input_tokens_seen": 12563840, + "step": 59535 + }, + { + "epoch": 6.55005500550055, + "grad_norm": 0.027250496670603752, + "learning_rate": 4.2522902791627294e-05, + "loss": 0.0041, + "num_input_tokens_seen": 12564960, + "step": 59540 + }, + { + "epoch": 6.550605060506051, + "grad_norm": 0.5673506259918213, + "learning_rate": 4.252119087764459e-05, + "loss": 0.0592, + "num_input_tokens_seen": 12565984, + "step": 59545 + }, + { + "epoch": 6.551155115511551, + "grad_norm": 0.45032861828804016, + "learning_rate": 4.251947880217747e-05, + "loss": 0.0276, + "num_input_tokens_seen": 12567072, + "step": 59550 + }, + { + "epoch": 6.551705170517051, + "grad_norm": 1.2888017892837524, + "learning_rate": 4.251776656524172e-05, + "loss": 0.0323, + "num_input_tokens_seen": 12568128, + "step": 59555 + }, + { + "epoch": 6.552255225522552, + "grad_norm": 0.015043831430375576, + "learning_rate": 4.251605416685312e-05, + "loss": 0.0497, + "num_input_tokens_seen": 12569280, + "step": 59560 + }, + { + "epoch": 6.552805280528053, + "grad_norm": 0.05397864803671837, + "learning_rate": 4.251434160702746e-05, + "loss": 0.0184, + "num_input_tokens_seen": 12570336, + "step": 59565 + }, + { + "epoch": 6.553355335533554, + "grad_norm": 3.264150381088257, + "learning_rate": 4.251262888578052e-05, + "loss": 0.0503, + "num_input_tokens_seen": 12571360, + "step": 59570 + }, + { + "epoch": 6.553905390539054, + "grad_norm": 0.07179959118366241, + "learning_rate": 4.2510916003128076e-05, + "loss": 0.0125, + "num_input_tokens_seen": 12572416, + "step": 59575 + }, + { + "epoch": 6.554455445544555, + "grad_norm": 3.3802037239074707, + "learning_rate": 4.250920295908593e-05, + "loss": 0.0396, + "num_input_tokens_seen": 12573440, + "step": 59580 + }, + { + "epoch": 6.555005500550055, + "grad_norm": 0.02911342866718769, + "learning_rate": 4.250748975366985e-05, + "loss": 0.1402, + "num_input_tokens_seen": 12574528, + "step": 59585 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 1.273053526878357, + "learning_rate": 4.2505776386895644e-05, + "loss": 0.0561, + "num_input_tokens_seen": 12575584, + "step": 59590 + }, + { + "epoch": 6.5561056105610565, + "grad_norm": 0.15270736813545227, + "learning_rate": 4.250406285877909e-05, + "loss": 0.0219, + "num_input_tokens_seen": 12576608, + "step": 59595 + }, + { + "epoch": 6.556655665566557, + "grad_norm": 0.2645033895969391, + "learning_rate": 4.2502349169335996e-05, + "loss": 0.0305, + "num_input_tokens_seen": 12577600, + "step": 59600 + }, + { + "epoch": 6.557205720572057, + "grad_norm": 0.023027487099170685, + "learning_rate": 4.250063531858215e-05, + "loss": 0.0172, + "num_input_tokens_seen": 12578592, + "step": 59605 + }, + { + "epoch": 6.557755775577558, + "grad_norm": 0.10208246856927872, + "learning_rate": 4.249892130653335e-05, + "loss": 0.0126, + "num_input_tokens_seen": 12579648, + "step": 59610 + }, + { + "epoch": 6.558305830583058, + "grad_norm": 1.0565801858901978, + "learning_rate": 4.249720713320537e-05, + "loss": 0.0742, + "num_input_tokens_seen": 12580704, + "step": 59615 + }, + { + "epoch": 6.558855885588558, + "grad_norm": 0.006712839473038912, + "learning_rate": 4.249549279861404e-05, + "loss": 0.0072, + "num_input_tokens_seen": 12581664, + "step": 59620 + }, + { + "epoch": 6.5594059405940595, + "grad_norm": 0.07025282084941864, + "learning_rate": 4.2493778302775155e-05, + "loss": 0.0197, + "num_input_tokens_seen": 12582720, + "step": 59625 + }, + { + "epoch": 6.55995599559956, + "grad_norm": 0.2376662790775299, + "learning_rate": 4.2492063645704496e-05, + "loss": 0.011, + "num_input_tokens_seen": 12583776, + "step": 59630 + }, + { + "epoch": 6.560506050605061, + "grad_norm": 0.09616400301456451, + "learning_rate": 4.2490348827417894e-05, + "loss": 0.0107, + "num_input_tokens_seen": 12584768, + "step": 59635 + }, + { + "epoch": 6.561056105610561, + "grad_norm": 0.010569831356406212, + "learning_rate": 4.248863384793113e-05, + "loss": 0.0098, + "num_input_tokens_seen": 12585824, + "step": 59640 + }, + { + "epoch": 6.561606160616062, + "grad_norm": 0.11398308724164963, + "learning_rate": 4.248691870726002e-05, + "loss": 0.0123, + "num_input_tokens_seen": 12586912, + "step": 59645 + }, + { + "epoch": 6.562156215621562, + "grad_norm": 0.012027587741613388, + "learning_rate": 4.248520340542037e-05, + "loss": 0.0037, + "num_input_tokens_seen": 12587968, + "step": 59650 + }, + { + "epoch": 6.5627062706270625, + "grad_norm": 0.011512513272464275, + "learning_rate": 4.248348794242799e-05, + "loss": 0.1172, + "num_input_tokens_seen": 12589024, + "step": 59655 + }, + { + "epoch": 6.563256325632564, + "grad_norm": 1.074673056602478, + "learning_rate": 4.24817723182987e-05, + "loss": 0.0978, + "num_input_tokens_seen": 12590048, + "step": 59660 + }, + { + "epoch": 6.563806380638064, + "grad_norm": 0.07804922759532928, + "learning_rate": 4.2480056533048296e-05, + "loss": 0.0118, + "num_input_tokens_seen": 12591072, + "step": 59665 + }, + { + "epoch": 6.564356435643564, + "grad_norm": 0.0389934778213501, + "learning_rate": 4.24783405866926e-05, + "loss": 0.0047, + "num_input_tokens_seen": 12592096, + "step": 59670 + }, + { + "epoch": 6.564906490649065, + "grad_norm": 0.1737401932477951, + "learning_rate": 4.247662447924743e-05, + "loss": 0.0663, + "num_input_tokens_seen": 12593120, + "step": 59675 + }, + { + "epoch": 6.565456545654565, + "grad_norm": 0.09239202737808228, + "learning_rate": 4.2474908210728594e-05, + "loss": 0.0667, + "num_input_tokens_seen": 12594112, + "step": 59680 + }, + { + "epoch": 6.566006600660066, + "grad_norm": 0.012130037881433964, + "learning_rate": 4.247319178115191e-05, + "loss": 0.0437, + "num_input_tokens_seen": 12595136, + "step": 59685 + }, + { + "epoch": 6.566556655665567, + "grad_norm": 0.08288192003965378, + "learning_rate": 4.2471475190533204e-05, + "loss": 0.0818, + "num_input_tokens_seen": 12596320, + "step": 59690 + }, + { + "epoch": 6.567106710671067, + "grad_norm": 0.8783779740333557, + "learning_rate": 4.2469758438888295e-05, + "loss": 0.0487, + "num_input_tokens_seen": 12597376, + "step": 59695 + }, + { + "epoch": 6.567656765676568, + "grad_norm": 0.5179229974746704, + "learning_rate": 4.2468041526233014e-05, + "loss": 0.0615, + "num_input_tokens_seen": 12598432, + "step": 59700 + }, + { + "epoch": 6.568206820682068, + "grad_norm": 1.6958255767822266, + "learning_rate": 4.2466324452583164e-05, + "loss": 0.0982, + "num_input_tokens_seen": 12599456, + "step": 59705 + }, + { + "epoch": 6.568756875687569, + "grad_norm": 1.4508967399597168, + "learning_rate": 4.2464607217954594e-05, + "loss": 0.0892, + "num_input_tokens_seen": 12600512, + "step": 59710 + }, + { + "epoch": 6.569306930693069, + "grad_norm": 0.43369460105895996, + "learning_rate": 4.246288982236311e-05, + "loss": 0.0165, + "num_input_tokens_seen": 12601600, + "step": 59715 + }, + { + "epoch": 6.56985698569857, + "grad_norm": 0.2815224528312683, + "learning_rate": 4.2461172265824556e-05, + "loss": 0.0168, + "num_input_tokens_seen": 12602592, + "step": 59720 + }, + { + "epoch": 6.570407040704071, + "grad_norm": 0.235782653093338, + "learning_rate": 4.245945454835475e-05, + "loss": 0.0282, + "num_input_tokens_seen": 12603616, + "step": 59725 + }, + { + "epoch": 6.570957095709571, + "grad_norm": 0.11820787936449051, + "learning_rate": 4.2457736669969536e-05, + "loss": 0.0146, + "num_input_tokens_seen": 12604672, + "step": 59730 + }, + { + "epoch": 6.571507150715071, + "grad_norm": 0.04013577103614807, + "learning_rate": 4.245601863068474e-05, + "loss": 0.0398, + "num_input_tokens_seen": 12605696, + "step": 59735 + }, + { + "epoch": 6.572057205720572, + "grad_norm": 0.06506111472845078, + "learning_rate": 4.245430043051619e-05, + "loss": 0.0111, + "num_input_tokens_seen": 12606752, + "step": 59740 + }, + { + "epoch": 6.572607260726072, + "grad_norm": 0.8593289852142334, + "learning_rate": 4.245258206947973e-05, + "loss": 0.1071, + "num_input_tokens_seen": 12607808, + "step": 59745 + }, + { + "epoch": 6.5731573157315735, + "grad_norm": 0.01635628193616867, + "learning_rate": 4.2450863547591204e-05, + "loss": 0.0116, + "num_input_tokens_seen": 12608928, + "step": 59750 + }, + { + "epoch": 6.573707370737074, + "grad_norm": 0.036287397146224976, + "learning_rate": 4.244914486486643e-05, + "loss": 0.0437, + "num_input_tokens_seen": 12610016, + "step": 59755 + }, + { + "epoch": 6.574257425742574, + "grad_norm": 0.3253238797187805, + "learning_rate": 4.244742602132127e-05, + "loss": 0.0359, + "num_input_tokens_seen": 12611168, + "step": 59760 + }, + { + "epoch": 6.574807480748075, + "grad_norm": 0.6099974513053894, + "learning_rate": 4.244570701697155e-05, + "loss": 0.0195, + "num_input_tokens_seen": 12612160, + "step": 59765 + }, + { + "epoch": 6.575357535753575, + "grad_norm": 0.08670922368764877, + "learning_rate": 4.2443987851833124e-05, + "loss": 0.0228, + "num_input_tokens_seen": 12613216, + "step": 59770 + }, + { + "epoch": 6.575907590759076, + "grad_norm": 0.10493406653404236, + "learning_rate": 4.244226852592183e-05, + "loss": 0.0921, + "num_input_tokens_seen": 12614272, + "step": 59775 + }, + { + "epoch": 6.5764576457645765, + "grad_norm": 0.16395017504692078, + "learning_rate": 4.244054903925352e-05, + "loss": 0.0055, + "num_input_tokens_seen": 12615296, + "step": 59780 + }, + { + "epoch": 6.577007700770077, + "grad_norm": 0.26422905921936035, + "learning_rate": 4.243882939184404e-05, + "loss": 0.0291, + "num_input_tokens_seen": 12616288, + "step": 59785 + }, + { + "epoch": 6.577557755775578, + "grad_norm": 0.059524986892938614, + "learning_rate": 4.243710958370923e-05, + "loss": 0.056, + "num_input_tokens_seen": 12617344, + "step": 59790 + }, + { + "epoch": 6.578107810781078, + "grad_norm": 0.5266307592391968, + "learning_rate": 4.243538961486495e-05, + "loss": 0.0312, + "num_input_tokens_seen": 12618304, + "step": 59795 + }, + { + "epoch": 6.578657865786578, + "grad_norm": 1.1846495866775513, + "learning_rate": 4.243366948532705e-05, + "loss": 0.1134, + "num_input_tokens_seen": 12619360, + "step": 59800 + }, + { + "epoch": 6.579207920792079, + "grad_norm": 0.14760325849056244, + "learning_rate": 4.243194919511139e-05, + "loss": 0.0091, + "num_input_tokens_seen": 12620416, + "step": 59805 + }, + { + "epoch": 6.5797579757975795, + "grad_norm": 0.9139850735664368, + "learning_rate": 4.243022874423381e-05, + "loss": 0.0591, + "num_input_tokens_seen": 12621536, + "step": 59810 + }, + { + "epoch": 6.580308030803081, + "grad_norm": 0.5721153616905212, + "learning_rate": 4.242850813271018e-05, + "loss": 0.0353, + "num_input_tokens_seen": 12622560, + "step": 59815 + }, + { + "epoch": 6.580858085808581, + "grad_norm": 0.5113655924797058, + "learning_rate": 4.2426787360556355e-05, + "loss": 0.0226, + "num_input_tokens_seen": 12623648, + "step": 59820 + }, + { + "epoch": 6.581408140814082, + "grad_norm": 0.9439740777015686, + "learning_rate": 4.24250664277882e-05, + "loss": 0.0358, + "num_input_tokens_seen": 12624704, + "step": 59825 + }, + { + "epoch": 6.581958195819582, + "grad_norm": 0.037887219339609146, + "learning_rate": 4.242334533442156e-05, + "loss": 0.1307, + "num_input_tokens_seen": 12625760, + "step": 59830 + }, + { + "epoch": 6.582508250825082, + "grad_norm": 2.150146245956421, + "learning_rate": 4.24216240804723e-05, + "loss": 0.0824, + "num_input_tokens_seen": 12626816, + "step": 59835 + }, + { + "epoch": 6.583058305830583, + "grad_norm": 0.2600877285003662, + "learning_rate": 4.24199026659563e-05, + "loss": 0.0178, + "num_input_tokens_seen": 12627904, + "step": 59840 + }, + { + "epoch": 6.583608360836084, + "grad_norm": 0.1182061955332756, + "learning_rate": 4.2418181090889415e-05, + "loss": 0.0114, + "num_input_tokens_seen": 12629024, + "step": 59845 + }, + { + "epoch": 6.584158415841584, + "grad_norm": 0.2028927505016327, + "learning_rate": 4.241645935528751e-05, + "loss": 0.0697, + "num_input_tokens_seen": 12630016, + "step": 59850 + }, + { + "epoch": 6.584708470847085, + "grad_norm": 0.3232347071170807, + "learning_rate": 4.241473745916646e-05, + "loss": 0.0294, + "num_input_tokens_seen": 12631104, + "step": 59855 + }, + { + "epoch": 6.585258525852585, + "grad_norm": 1.1344295740127563, + "learning_rate": 4.2413015402542125e-05, + "loss": 0.0417, + "num_input_tokens_seen": 12632224, + "step": 59860 + }, + { + "epoch": 6.585808580858086, + "grad_norm": 0.11518535763025284, + "learning_rate": 4.241129318543039e-05, + "loss": 0.0365, + "num_input_tokens_seen": 12633280, + "step": 59865 + }, + { + "epoch": 6.586358635863586, + "grad_norm": 0.08037569373846054, + "learning_rate": 4.2409570807847115e-05, + "loss": 0.0654, + "num_input_tokens_seen": 12634336, + "step": 59870 + }, + { + "epoch": 6.586908690869087, + "grad_norm": 0.4364216923713684, + "learning_rate": 4.2407848269808185e-05, + "loss": 0.1339, + "num_input_tokens_seen": 12635456, + "step": 59875 + }, + { + "epoch": 6.587458745874588, + "grad_norm": 0.008587325923144817, + "learning_rate": 4.240612557132947e-05, + "loss": 0.06, + "num_input_tokens_seen": 12636544, + "step": 59880 + }, + { + "epoch": 6.588008800880088, + "grad_norm": 0.07184682041406631, + "learning_rate": 4.2404402712426844e-05, + "loss": 0.0697, + "num_input_tokens_seen": 12637568, + "step": 59885 + }, + { + "epoch": 6.588558855885589, + "grad_norm": 0.016404569149017334, + "learning_rate": 4.240267969311619e-05, + "loss": 0.0049, + "num_input_tokens_seen": 12638688, + "step": 59890 + }, + { + "epoch": 6.589108910891089, + "grad_norm": 0.05318818241357803, + "learning_rate": 4.240095651341339e-05, + "loss": 0.0028, + "num_input_tokens_seen": 12639712, + "step": 59895 + }, + { + "epoch": 6.589658965896589, + "grad_norm": 0.018681293353438377, + "learning_rate": 4.239923317333433e-05, + "loss": 0.0253, + "num_input_tokens_seen": 12640736, + "step": 59900 + }, + { + "epoch": 6.5902090209020905, + "grad_norm": 0.914097249507904, + "learning_rate": 4.239750967289487e-05, + "loss": 0.038, + "num_input_tokens_seen": 12641760, + "step": 59905 + }, + { + "epoch": 6.590759075907591, + "grad_norm": 0.33965274691581726, + "learning_rate": 4.2395786012110924e-05, + "loss": 0.0144, + "num_input_tokens_seen": 12642784, + "step": 59910 + }, + { + "epoch": 6.591309130913091, + "grad_norm": 0.4293169379234314, + "learning_rate": 4.2394062190998365e-05, + "loss": 0.0345, + "num_input_tokens_seen": 12643872, + "step": 59915 + }, + { + "epoch": 6.591859185918592, + "grad_norm": 0.5763260126113892, + "learning_rate": 4.2392338209573076e-05, + "loss": 0.0147, + "num_input_tokens_seen": 12644960, + "step": 59920 + }, + { + "epoch": 6.592409240924092, + "grad_norm": 0.03473005071282387, + "learning_rate": 4.2390614067850956e-05, + "loss": 0.0289, + "num_input_tokens_seen": 12646016, + "step": 59925 + }, + { + "epoch": 6.592959295929593, + "grad_norm": 0.014899324625730515, + "learning_rate": 4.2388889765847895e-05, + "loss": 0.0046, + "num_input_tokens_seen": 12647040, + "step": 59930 + }, + { + "epoch": 6.5935093509350935, + "grad_norm": 0.019025851041078568, + "learning_rate": 4.238716530357977e-05, + "loss": 0.057, + "num_input_tokens_seen": 12648096, + "step": 59935 + }, + { + "epoch": 6.594059405940594, + "grad_norm": 0.3596935570240021, + "learning_rate": 4.238544068106249e-05, + "loss": 0.029, + "num_input_tokens_seen": 12649152, + "step": 59940 + }, + { + "epoch": 6.594609460946095, + "grad_norm": 0.985899806022644, + "learning_rate": 4.238371589831195e-05, + "loss": 0.0818, + "num_input_tokens_seen": 12650176, + "step": 59945 + }, + { + "epoch": 6.595159515951595, + "grad_norm": 0.8111243844032288, + "learning_rate": 4.238199095534404e-05, + "loss": 0.0396, + "num_input_tokens_seen": 12651264, + "step": 59950 + }, + { + "epoch": 6.595709570957096, + "grad_norm": 0.07500014454126358, + "learning_rate": 4.238026585217466e-05, + "loss": 0.0463, + "num_input_tokens_seen": 12652320, + "step": 59955 + }, + { + "epoch": 6.596259625962596, + "grad_norm": 0.045995794236660004, + "learning_rate": 4.237854058881971e-05, + "loss": 0.0142, + "num_input_tokens_seen": 12653376, + "step": 59960 + }, + { + "epoch": 6.5968096809680965, + "grad_norm": 0.02096632495522499, + "learning_rate": 4.237681516529508e-05, + "loss": 0.0143, + "num_input_tokens_seen": 12654432, + "step": 59965 + }, + { + "epoch": 6.597359735973598, + "grad_norm": 0.01934894546866417, + "learning_rate": 4.23750895816167e-05, + "loss": 0.011, + "num_input_tokens_seen": 12655520, + "step": 59970 + }, + { + "epoch": 6.597909790979098, + "grad_norm": 0.13419334590435028, + "learning_rate": 4.237336383780045e-05, + "loss": 0.0059, + "num_input_tokens_seen": 12656608, + "step": 59975 + }, + { + "epoch": 6.598459845984598, + "grad_norm": 0.04598075523972511, + "learning_rate": 4.2371637933862234e-05, + "loss": 0.0789, + "num_input_tokens_seen": 12657600, + "step": 59980 + }, + { + "epoch": 6.599009900990099, + "grad_norm": 1.49076509475708, + "learning_rate": 4.236991186981798e-05, + "loss": 0.0839, + "num_input_tokens_seen": 12658720, + "step": 59985 + }, + { + "epoch": 6.599559955995599, + "grad_norm": 0.1552688479423523, + "learning_rate": 4.2368185645683564e-05, + "loss": 0.0543, + "num_input_tokens_seen": 12659776, + "step": 59990 + }, + { + "epoch": 6.6001100110011, + "grad_norm": 0.02317643165588379, + "learning_rate": 4.2366459261474933e-05, + "loss": 0.0134, + "num_input_tokens_seen": 12660800, + "step": 59995 + }, + { + "epoch": 6.600660066006601, + "grad_norm": 0.41680091619491577, + "learning_rate": 4.2364732717207976e-05, + "loss": 0.0431, + "num_input_tokens_seen": 12661824, + "step": 60000 + }, + { + "epoch": 6.601210121012102, + "grad_norm": 0.16032204031944275, + "learning_rate": 4.236300601289861e-05, + "loss": 0.0979, + "num_input_tokens_seen": 12662880, + "step": 60005 + }, + { + "epoch": 6.601760176017602, + "grad_norm": 0.8645339012145996, + "learning_rate": 4.236127914856275e-05, + "loss": 0.0145, + "num_input_tokens_seen": 12664000, + "step": 60010 + }, + { + "epoch": 6.602310231023102, + "grad_norm": 1.1742174625396729, + "learning_rate": 4.2359552124216306e-05, + "loss": 0.0771, + "num_input_tokens_seen": 12665056, + "step": 60015 + }, + { + "epoch": 6.602860286028603, + "grad_norm": 0.31470227241516113, + "learning_rate": 4.23578249398752e-05, + "loss": 0.0171, + "num_input_tokens_seen": 12666112, + "step": 60020 + }, + { + "epoch": 6.603410341034103, + "grad_norm": 0.04260065779089928, + "learning_rate": 4.2356097595555355e-05, + "loss": 0.0371, + "num_input_tokens_seen": 12667136, + "step": 60025 + }, + { + "epoch": 6.603960396039604, + "grad_norm": 0.017412517219781876, + "learning_rate": 4.2354370091272686e-05, + "loss": 0.0627, + "num_input_tokens_seen": 12668224, + "step": 60030 + }, + { + "epoch": 6.604510451045105, + "grad_norm": 0.016050193458795547, + "learning_rate": 4.235264242704311e-05, + "loss": 0.0079, + "num_input_tokens_seen": 12669312, + "step": 60035 + }, + { + "epoch": 6.605060506050605, + "grad_norm": 0.027172517031431198, + "learning_rate": 4.2350914602882564e-05, + "loss": 0.0182, + "num_input_tokens_seen": 12670336, + "step": 60040 + }, + { + "epoch": 6.605610561056105, + "grad_norm": 1.4633872509002686, + "learning_rate": 4.234918661880696e-05, + "loss": 0.11, + "num_input_tokens_seen": 12671328, + "step": 60045 + }, + { + "epoch": 6.606160616061606, + "grad_norm": 0.5786484479904175, + "learning_rate": 4.234745847483222e-05, + "loss": 0.038, + "num_input_tokens_seen": 12672384, + "step": 60050 + }, + { + "epoch": 6.606710671067106, + "grad_norm": 0.011981195770204067, + "learning_rate": 4.23457301709743e-05, + "loss": 0.1194, + "num_input_tokens_seen": 12673472, + "step": 60055 + }, + { + "epoch": 6.6072607260726075, + "grad_norm": 0.005959671456366777, + "learning_rate": 4.234400170724909e-05, + "loss": 0.007, + "num_input_tokens_seen": 12674464, + "step": 60060 + }, + { + "epoch": 6.607810781078108, + "grad_norm": 0.17318303883075714, + "learning_rate": 4.2342273083672546e-05, + "loss": 0.0117, + "num_input_tokens_seen": 12675488, + "step": 60065 + }, + { + "epoch": 6.608360836083609, + "grad_norm": 0.0775732472538948, + "learning_rate": 4.234054430026059e-05, + "loss": 0.006, + "num_input_tokens_seen": 12676640, + "step": 60070 + }, + { + "epoch": 6.608910891089109, + "grad_norm": 0.11511293053627014, + "learning_rate": 4.2338815357029154e-05, + "loss": 0.2213, + "num_input_tokens_seen": 12677696, + "step": 60075 + }, + { + "epoch": 6.609460946094609, + "grad_norm": 0.21855632960796356, + "learning_rate": 4.2337086253994186e-05, + "loss": 0.054, + "num_input_tokens_seen": 12678720, + "step": 60080 + }, + { + "epoch": 6.61001100110011, + "grad_norm": 0.03154420107603073, + "learning_rate": 4.23353569911716e-05, + "loss": 0.1365, + "num_input_tokens_seen": 12679744, + "step": 60085 + }, + { + "epoch": 6.6105610561056105, + "grad_norm": 0.12433972209692001, + "learning_rate": 4.233362756857736e-05, + "loss": 0.0592, + "num_input_tokens_seen": 12680800, + "step": 60090 + }, + { + "epoch": 6.611111111111111, + "grad_norm": 0.039133790880441666, + "learning_rate": 4.233189798622739e-05, + "loss": 0.013, + "num_input_tokens_seen": 12681920, + "step": 60095 + }, + { + "epoch": 6.611661166116612, + "grad_norm": 1.104404091835022, + "learning_rate": 4.2330168244137626e-05, + "loss": 0.1064, + "num_input_tokens_seen": 12683040, + "step": 60100 + }, + { + "epoch": 6.612211221122112, + "grad_norm": 0.8942311406135559, + "learning_rate": 4.2328438342324026e-05, + "loss": 0.0356, + "num_input_tokens_seen": 12684128, + "step": 60105 + }, + { + "epoch": 6.612761276127613, + "grad_norm": 0.27196991443634033, + "learning_rate": 4.232670828080252e-05, + "loss": 0.1106, + "num_input_tokens_seen": 12685184, + "step": 60110 + }, + { + "epoch": 6.613311331133113, + "grad_norm": 0.20894663035869598, + "learning_rate": 4.232497805958906e-05, + "loss": 0.0678, + "num_input_tokens_seen": 12686272, + "step": 60115 + }, + { + "epoch": 6.6138613861386135, + "grad_norm": 0.09711111336946487, + "learning_rate": 4.2323247678699586e-05, + "loss": 0.0575, + "num_input_tokens_seen": 12687360, + "step": 60120 + }, + { + "epoch": 6.614411441144115, + "grad_norm": 1.1790857315063477, + "learning_rate": 4.2321517138150055e-05, + "loss": 0.1072, + "num_input_tokens_seen": 12688384, + "step": 60125 + }, + { + "epoch": 6.614961496149615, + "grad_norm": 0.07774175703525543, + "learning_rate": 4.231978643795641e-05, + "loss": 0.0155, + "num_input_tokens_seen": 12689376, + "step": 60130 + }, + { + "epoch": 6.615511551155116, + "grad_norm": 0.1761220097541809, + "learning_rate": 4.23180555781346e-05, + "loss": 0.0092, + "num_input_tokens_seen": 12690336, + "step": 60135 + }, + { + "epoch": 6.616061606160616, + "grad_norm": 0.6544256210327148, + "learning_rate": 4.231632455870059e-05, + "loss": 0.0223, + "num_input_tokens_seen": 12691456, + "step": 60140 + }, + { + "epoch": 6.616611661166116, + "grad_norm": 0.31207817792892456, + "learning_rate": 4.231459337967032e-05, + "loss": 0.0153, + "num_input_tokens_seen": 12692544, + "step": 60145 + }, + { + "epoch": 6.617161716171617, + "grad_norm": 0.7183454632759094, + "learning_rate": 4.2312862041059755e-05, + "loss": 0.0372, + "num_input_tokens_seen": 12693664, + "step": 60150 + }, + { + "epoch": 6.617711771177118, + "grad_norm": 0.1413586288690567, + "learning_rate": 4.231113054288485e-05, + "loss": 0.0128, + "num_input_tokens_seen": 12694784, + "step": 60155 + }, + { + "epoch": 6.618261826182618, + "grad_norm": 0.03300405666232109, + "learning_rate": 4.230939888516155e-05, + "loss": 0.008, + "num_input_tokens_seen": 12695808, + "step": 60160 + }, + { + "epoch": 6.618811881188119, + "grad_norm": 0.018523547798395157, + "learning_rate": 4.230766706790584e-05, + "loss": 0.0269, + "num_input_tokens_seen": 12696800, + "step": 60165 + }, + { + "epoch": 6.619361936193619, + "grad_norm": 2.2087185382843018, + "learning_rate": 4.2305935091133664e-05, + "loss": 0.0838, + "num_input_tokens_seen": 12697792, + "step": 60170 + }, + { + "epoch": 6.61991199119912, + "grad_norm": 1.029441237449646, + "learning_rate": 4.2304202954860986e-05, + "loss": 0.0679, + "num_input_tokens_seen": 12698816, + "step": 60175 + }, + { + "epoch": 6.62046204620462, + "grad_norm": 0.348217248916626, + "learning_rate": 4.230247065910377e-05, + "loss": 0.0145, + "num_input_tokens_seen": 12699872, + "step": 60180 + }, + { + "epoch": 6.621012101210121, + "grad_norm": 0.22861120104789734, + "learning_rate": 4.230073820387799e-05, + "loss": 0.061, + "num_input_tokens_seen": 12700928, + "step": 60185 + }, + { + "epoch": 6.621562156215622, + "grad_norm": 0.023119423538446426, + "learning_rate": 4.2299005589199604e-05, + "loss": 0.0252, + "num_input_tokens_seen": 12701984, + "step": 60190 + }, + { + "epoch": 6.622112211221122, + "grad_norm": 0.2160542607307434, + "learning_rate": 4.229727281508459e-05, + "loss": 0.0115, + "num_input_tokens_seen": 12703072, + "step": 60195 + }, + { + "epoch": 6.622662266226623, + "grad_norm": 0.06254066526889801, + "learning_rate": 4.2295539881548905e-05, + "loss": 0.0551, + "num_input_tokens_seen": 12704160, + "step": 60200 + }, + { + "epoch": 6.623212321232123, + "grad_norm": 0.012142802588641644, + "learning_rate": 4.229380678860853e-05, + "loss": 0.0063, + "num_input_tokens_seen": 12705248, + "step": 60205 + }, + { + "epoch": 6.623762376237623, + "grad_norm": 0.05570574477314949, + "learning_rate": 4.229207353627944e-05, + "loss": 0.0227, + "num_input_tokens_seen": 12706368, + "step": 60210 + }, + { + "epoch": 6.6243124312431245, + "grad_norm": 0.12688665091991425, + "learning_rate": 4.22903401245776e-05, + "loss": 0.0476, + "num_input_tokens_seen": 12707424, + "step": 60215 + }, + { + "epoch": 6.624862486248625, + "grad_norm": 0.38453876972198486, + "learning_rate": 4.228860655351899e-05, + "loss": 0.0867, + "num_input_tokens_seen": 12708480, + "step": 60220 + }, + { + "epoch": 6.625412541254125, + "grad_norm": 1.1123666763305664, + "learning_rate": 4.22868728231196e-05, + "loss": 0.0225, + "num_input_tokens_seen": 12709472, + "step": 60225 + }, + { + "epoch": 6.625962596259626, + "grad_norm": 0.015737129375338554, + "learning_rate": 4.228513893339539e-05, + "loss": 0.004, + "num_input_tokens_seen": 12710528, + "step": 60230 + }, + { + "epoch": 6.626512651265126, + "grad_norm": 0.4939567446708679, + "learning_rate": 4.228340488436235e-05, + "loss": 0.0552, + "num_input_tokens_seen": 12711584, + "step": 60235 + }, + { + "epoch": 6.627062706270627, + "grad_norm": 0.005083780735731125, + "learning_rate": 4.228167067603647e-05, + "loss": 0.014, + "num_input_tokens_seen": 12712608, + "step": 60240 + }, + { + "epoch": 6.6276127612761275, + "grad_norm": 0.3823738396167755, + "learning_rate": 4.227993630843371e-05, + "loss": 0.0242, + "num_input_tokens_seen": 12713696, + "step": 60245 + }, + { + "epoch": 6.628162816281629, + "grad_norm": 1.6445668935775757, + "learning_rate": 4.227820178157008e-05, + "loss": 0.0845, + "num_input_tokens_seen": 12714752, + "step": 60250 + }, + { + "epoch": 6.628712871287129, + "grad_norm": 0.0288577638566494, + "learning_rate": 4.2276467095461545e-05, + "loss": 0.0102, + "num_input_tokens_seen": 12715840, + "step": 60255 + }, + { + "epoch": 6.629262926292629, + "grad_norm": 0.0192550215870142, + "learning_rate": 4.227473225012411e-05, + "loss": 0.0043, + "num_input_tokens_seen": 12716864, + "step": 60260 + }, + { + "epoch": 6.62981298129813, + "grad_norm": 1.6659116744995117, + "learning_rate": 4.2272997245573756e-05, + "loss": 0.125, + "num_input_tokens_seen": 12717920, + "step": 60265 + }, + { + "epoch": 6.63036303630363, + "grad_norm": 0.08687888085842133, + "learning_rate": 4.2271262081826465e-05, + "loss": 0.0034, + "num_input_tokens_seen": 12718912, + "step": 60270 + }, + { + "epoch": 6.6309130913091305, + "grad_norm": 0.033112816512584686, + "learning_rate": 4.226952675889825e-05, + "loss": 0.0045, + "num_input_tokens_seen": 12719936, + "step": 60275 + }, + { + "epoch": 6.631463146314632, + "grad_norm": 1.154592514038086, + "learning_rate": 4.22677912768051e-05, + "loss": 0.111, + "num_input_tokens_seen": 12721024, + "step": 60280 + }, + { + "epoch": 6.632013201320132, + "grad_norm": 0.20585839450359344, + "learning_rate": 4.2266055635562995e-05, + "loss": 0.0717, + "num_input_tokens_seen": 12722112, + "step": 60285 + }, + { + "epoch": 6.632563256325633, + "grad_norm": 0.00786418654024601, + "learning_rate": 4.226431983518794e-05, + "loss": 0.0266, + "num_input_tokens_seen": 12723104, + "step": 60290 + }, + { + "epoch": 6.633113311331133, + "grad_norm": 0.13260948657989502, + "learning_rate": 4.226258387569593e-05, + "loss": 0.0259, + "num_input_tokens_seen": 12724128, + "step": 60295 + }, + { + "epoch": 6.633663366336633, + "grad_norm": 0.06395365297794342, + "learning_rate": 4.226084775710297e-05, + "loss": 0.0782, + "num_input_tokens_seen": 12725216, + "step": 60300 + }, + { + "epoch": 6.634213421342134, + "grad_norm": 0.4491265118122101, + "learning_rate": 4.225911147942506e-05, + "loss": 0.0101, + "num_input_tokens_seen": 12726240, + "step": 60305 + }, + { + "epoch": 6.634763476347635, + "grad_norm": 0.18620720505714417, + "learning_rate": 4.225737504267821e-05, + "loss": 0.0945, + "num_input_tokens_seen": 12727296, + "step": 60310 + }, + { + "epoch": 6.635313531353136, + "grad_norm": 1.0792664289474487, + "learning_rate": 4.22556384468784e-05, + "loss": 0.0726, + "num_input_tokens_seen": 12728320, + "step": 60315 + }, + { + "epoch": 6.635863586358636, + "grad_norm": 0.0319400392472744, + "learning_rate": 4.2253901692041654e-05, + "loss": 0.0739, + "num_input_tokens_seen": 12729312, + "step": 60320 + }, + { + "epoch": 6.636413641364136, + "grad_norm": 0.0448213666677475, + "learning_rate": 4.225216477818398e-05, + "loss": 0.073, + "num_input_tokens_seen": 12730432, + "step": 60325 + }, + { + "epoch": 6.636963696369637, + "grad_norm": 0.5963416695594788, + "learning_rate": 4.225042770532138e-05, + "loss": 0.023, + "num_input_tokens_seen": 12731456, + "step": 60330 + }, + { + "epoch": 6.637513751375137, + "grad_norm": 1.2412159442901611, + "learning_rate": 4.224869047346986e-05, + "loss": 0.1255, + "num_input_tokens_seen": 12732544, + "step": 60335 + }, + { + "epoch": 6.638063806380638, + "grad_norm": 0.15641476213932037, + "learning_rate": 4.224695308264545e-05, + "loss": 0.031, + "num_input_tokens_seen": 12733664, + "step": 60340 + }, + { + "epoch": 6.638613861386139, + "grad_norm": 0.01424479205161333, + "learning_rate": 4.224521553286413e-05, + "loss": 0.0451, + "num_input_tokens_seen": 12734784, + "step": 60345 + }, + { + "epoch": 6.639163916391639, + "grad_norm": 0.042015478014945984, + "learning_rate": 4.224347782414195e-05, + "loss": 0.1255, + "num_input_tokens_seen": 12735808, + "step": 60350 + }, + { + "epoch": 6.63971397139714, + "grad_norm": 0.09010860323905945, + "learning_rate": 4.2241739956494906e-05, + "loss": 0.0122, + "num_input_tokens_seen": 12736896, + "step": 60355 + }, + { + "epoch": 6.64026402640264, + "grad_norm": 3.4365243911743164, + "learning_rate": 4.224000192993901e-05, + "loss": 0.047, + "num_input_tokens_seen": 12737888, + "step": 60360 + }, + { + "epoch": 6.6408140814081404, + "grad_norm": 0.5700371861457825, + "learning_rate": 4.223826374449029e-05, + "loss": 0.0749, + "num_input_tokens_seen": 12738944, + "step": 60365 + }, + { + "epoch": 6.6413641364136415, + "grad_norm": 0.11038850992918015, + "learning_rate": 4.2236525400164774e-05, + "loss": 0.0111, + "num_input_tokens_seen": 12740000, + "step": 60370 + }, + { + "epoch": 6.641914191419142, + "grad_norm": 0.37193188071250916, + "learning_rate": 4.2234786896978475e-05, + "loss": 0.0146, + "num_input_tokens_seen": 12741120, + "step": 60375 + }, + { + "epoch": 6.642464246424643, + "grad_norm": 0.08887450397014618, + "learning_rate": 4.22330482349474e-05, + "loss": 0.0095, + "num_input_tokens_seen": 12742176, + "step": 60380 + }, + { + "epoch": 6.643014301430143, + "grad_norm": 0.053093135356903076, + "learning_rate": 4.22313094140876e-05, + "loss": 0.0464, + "num_input_tokens_seen": 12743200, + "step": 60385 + }, + { + "epoch": 6.643564356435643, + "grad_norm": 0.9308581352233887, + "learning_rate": 4.222957043441509e-05, + "loss": 0.0493, + "num_input_tokens_seen": 12744352, + "step": 60390 + }, + { + "epoch": 6.644114411441144, + "grad_norm": 0.09871168434619904, + "learning_rate": 4.22278312959459e-05, + "loss": 0.018, + "num_input_tokens_seen": 12745408, + "step": 60395 + }, + { + "epoch": 6.6446644664466445, + "grad_norm": 0.8822101354598999, + "learning_rate": 4.222609199869605e-05, + "loss": 0.0232, + "num_input_tokens_seen": 12746400, + "step": 60400 + }, + { + "epoch": 6.645214521452145, + "grad_norm": 0.07202938944101334, + "learning_rate": 4.222435254268158e-05, + "loss": 0.0281, + "num_input_tokens_seen": 12747392, + "step": 60405 + }, + { + "epoch": 6.645764576457646, + "grad_norm": 0.1952480524778366, + "learning_rate": 4.2222612927918506e-05, + "loss": 0.129, + "num_input_tokens_seen": 12748448, + "step": 60410 + }, + { + "epoch": 6.646314631463146, + "grad_norm": 0.1121385246515274, + "learning_rate": 4.222087315442288e-05, + "loss": 0.0637, + "num_input_tokens_seen": 12749536, + "step": 60415 + }, + { + "epoch": 6.646864686468647, + "grad_norm": 0.9040135741233826, + "learning_rate": 4.221913322221074e-05, + "loss": 0.0691, + "num_input_tokens_seen": 12750592, + "step": 60420 + }, + { + "epoch": 6.647414741474147, + "grad_norm": 0.7396254539489746, + "learning_rate": 4.22173931312981e-05, + "loss": 0.0865, + "num_input_tokens_seen": 12751648, + "step": 60425 + }, + { + "epoch": 6.647964796479648, + "grad_norm": 0.7286087870597839, + "learning_rate": 4.221565288170101e-05, + "loss": 0.0603, + "num_input_tokens_seen": 12752736, + "step": 60430 + }, + { + "epoch": 6.648514851485149, + "grad_norm": 0.012743616476655006, + "learning_rate": 4.2213912473435505e-05, + "loss": 0.0473, + "num_input_tokens_seen": 12753760, + "step": 60435 + }, + { + "epoch": 6.649064906490649, + "grad_norm": 0.0781225636601448, + "learning_rate": 4.2212171906517635e-05, + "loss": 0.0397, + "num_input_tokens_seen": 12754848, + "step": 60440 + }, + { + "epoch": 6.64961496149615, + "grad_norm": 0.009370770305395126, + "learning_rate": 4.221043118096344e-05, + "loss": 0.1157, + "num_input_tokens_seen": 12755904, + "step": 60445 + }, + { + "epoch": 6.65016501650165, + "grad_norm": 0.015757057815790176, + "learning_rate": 4.220869029678894e-05, + "loss": 0.0151, + "num_input_tokens_seen": 12756928, + "step": 60450 + }, + { + "epoch": 6.65071507150715, + "grad_norm": 1.9240128993988037, + "learning_rate": 4.220694925401022e-05, + "loss": 0.0269, + "num_input_tokens_seen": 12757984, + "step": 60455 + }, + { + "epoch": 6.6512651265126514, + "grad_norm": 0.039287395775318146, + "learning_rate": 4.2205208052643296e-05, + "loss": 0.0308, + "num_input_tokens_seen": 12759040, + "step": 60460 + }, + { + "epoch": 6.651815181518152, + "grad_norm": 0.03568337857723236, + "learning_rate": 4.220346669270423e-05, + "loss": 0.0243, + "num_input_tokens_seen": 12760160, + "step": 60465 + }, + { + "epoch": 6.652365236523653, + "grad_norm": 0.5977741479873657, + "learning_rate": 4.2201725174209064e-05, + "loss": 0.0388, + "num_input_tokens_seen": 12761248, + "step": 60470 + }, + { + "epoch": 6.652915291529153, + "grad_norm": 0.49896201491355896, + "learning_rate": 4.219998349717385e-05, + "loss": 0.0521, + "num_input_tokens_seen": 12762400, + "step": 60475 + }, + { + "epoch": 6.653465346534653, + "grad_norm": 0.19934403896331787, + "learning_rate": 4.219824166161464e-05, + "loss": 0.0104, + "num_input_tokens_seen": 12763456, + "step": 60480 + }, + { + "epoch": 6.654015401540154, + "grad_norm": 0.07057710736989975, + "learning_rate": 4.21964996675475e-05, + "loss": 0.0564, + "num_input_tokens_seen": 12764512, + "step": 60485 + }, + { + "epoch": 6.6545654565456545, + "grad_norm": 0.07557593286037445, + "learning_rate": 4.2194757514988456e-05, + "loss": 0.018, + "num_input_tokens_seen": 12765600, + "step": 60490 + }, + { + "epoch": 6.6551155115511555, + "grad_norm": 1.4032939672470093, + "learning_rate": 4.2193015203953596e-05, + "loss": 0.0534, + "num_input_tokens_seen": 12766688, + "step": 60495 + }, + { + "epoch": 6.655665566556656, + "grad_norm": 0.09304744750261307, + "learning_rate": 4.2191272734458955e-05, + "loss": 0.0233, + "num_input_tokens_seen": 12767744, + "step": 60500 + }, + { + "epoch": 6.656215621562156, + "grad_norm": 0.2543956935405731, + "learning_rate": 4.2189530106520616e-05, + "loss": 0.0082, + "num_input_tokens_seen": 12768832, + "step": 60505 + }, + { + "epoch": 6.656765676567657, + "grad_norm": 0.9827739596366882, + "learning_rate": 4.2187787320154616e-05, + "loss": 0.0669, + "num_input_tokens_seen": 12769888, + "step": 60510 + }, + { + "epoch": 6.657315731573157, + "grad_norm": 0.06780719012022018, + "learning_rate": 4.218604437537703e-05, + "loss": 0.011, + "num_input_tokens_seen": 12770912, + "step": 60515 + }, + { + "epoch": 6.6578657865786575, + "grad_norm": 0.11009959131479263, + "learning_rate": 4.218430127220392e-05, + "loss": 0.0708, + "num_input_tokens_seen": 12772032, + "step": 60520 + }, + { + "epoch": 6.658415841584159, + "grad_norm": 0.007059836760163307, + "learning_rate": 4.2182558010651355e-05, + "loss": 0.033, + "num_input_tokens_seen": 12773088, + "step": 60525 + }, + { + "epoch": 6.658965896589659, + "grad_norm": 0.5597087144851685, + "learning_rate": 4.218081459073539e-05, + "loss": 0.0749, + "num_input_tokens_seen": 12774112, + "step": 60530 + }, + { + "epoch": 6.65951595159516, + "grad_norm": 0.011811872012913227, + "learning_rate": 4.2179071012472104e-05, + "loss": 0.0225, + "num_input_tokens_seen": 12775200, + "step": 60535 + }, + { + "epoch": 6.66006600660066, + "grad_norm": 0.016034923493862152, + "learning_rate": 4.217732727587757e-05, + "loss": 0.0776, + "num_input_tokens_seen": 12776192, + "step": 60540 + }, + { + "epoch": 6.66061606160616, + "grad_norm": 0.12589937448501587, + "learning_rate": 4.217558338096785e-05, + "loss": 0.0068, + "num_input_tokens_seen": 12777184, + "step": 60545 + }, + { + "epoch": 6.661166116611661, + "grad_norm": 0.015477134846150875, + "learning_rate": 4.217383932775901e-05, + "loss": 0.0487, + "num_input_tokens_seen": 12778272, + "step": 60550 + }, + { + "epoch": 6.661716171617162, + "grad_norm": 0.31165996193885803, + "learning_rate": 4.2172095116267146e-05, + "loss": 0.0152, + "num_input_tokens_seen": 12779360, + "step": 60555 + }, + { + "epoch": 6.662266226622663, + "grad_norm": 0.20059026777744293, + "learning_rate": 4.217035074650831e-05, + "loss": 0.0119, + "num_input_tokens_seen": 12780416, + "step": 60560 + }, + { + "epoch": 6.662816281628163, + "grad_norm": 0.02162995934486389, + "learning_rate": 4.2168606218498604e-05, + "loss": 0.0159, + "num_input_tokens_seen": 12781504, + "step": 60565 + }, + { + "epoch": 6.663366336633663, + "grad_norm": 0.703188955783844, + "learning_rate": 4.216686153225409e-05, + "loss": 0.0169, + "num_input_tokens_seen": 12782528, + "step": 60570 + }, + { + "epoch": 6.663916391639164, + "grad_norm": 0.013208824209868908, + "learning_rate": 4.216511668779084e-05, + "loss": 0.0228, + "num_input_tokens_seen": 12783584, + "step": 60575 + }, + { + "epoch": 6.664466446644664, + "grad_norm": 0.15524965524673462, + "learning_rate": 4.2163371685124955e-05, + "loss": 0.0541, + "num_input_tokens_seen": 12784704, + "step": 60580 + }, + { + "epoch": 6.665016501650165, + "grad_norm": 0.06934081017971039, + "learning_rate": 4.216162652427251e-05, + "loss": 0.0272, + "num_input_tokens_seen": 12785696, + "step": 60585 + }, + { + "epoch": 6.665566556655666, + "grad_norm": 0.025195281952619553, + "learning_rate": 4.2159881205249586e-05, + "loss": 0.0109, + "num_input_tokens_seen": 12786720, + "step": 60590 + }, + { + "epoch": 6.666116611661166, + "grad_norm": 0.4648047089576721, + "learning_rate": 4.215813572807227e-05, + "loss": 0.0458, + "num_input_tokens_seen": 12787840, + "step": 60595 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.0057920049875974655, + "learning_rate": 4.215639009275665e-05, + "loss": 0.0478, + "num_input_tokens_seen": 12788864, + "step": 60600 + }, + { + "epoch": 6.667216721672167, + "grad_norm": 0.07949428260326385, + "learning_rate": 4.2154644299318816e-05, + "loss": 0.0084, + "num_input_tokens_seen": 12789984, + "step": 60605 + }, + { + "epoch": 6.667766776677668, + "grad_norm": 0.02697317861020565, + "learning_rate": 4.215289834777485e-05, + "loss": 0.0058, + "num_input_tokens_seen": 12791008, + "step": 60610 + }, + { + "epoch": 6.6683168316831685, + "grad_norm": 0.0034994971938431263, + "learning_rate": 4.215115223814086e-05, + "loss": 0.0724, + "num_input_tokens_seen": 12792064, + "step": 60615 + }, + { + "epoch": 6.668866886688669, + "grad_norm": 0.4728896915912628, + "learning_rate": 4.214940597043293e-05, + "loss": 0.1367, + "num_input_tokens_seen": 12793120, + "step": 60620 + }, + { + "epoch": 6.66941694169417, + "grad_norm": 0.10720501095056534, + "learning_rate": 4.2147659544667146e-05, + "loss": 0.0508, + "num_input_tokens_seen": 12794240, + "step": 60625 + }, + { + "epoch": 6.66996699669967, + "grad_norm": 1.5462439060211182, + "learning_rate": 4.214591296085962e-05, + "loss": 0.0971, + "num_input_tokens_seen": 12795264, + "step": 60630 + }, + { + "epoch": 6.67051705170517, + "grad_norm": 0.1814267486333847, + "learning_rate": 4.214416621902644e-05, + "loss": 0.0041, + "num_input_tokens_seen": 12796384, + "step": 60635 + }, + { + "epoch": 6.671067106710671, + "grad_norm": 0.05568911135196686, + "learning_rate": 4.21424193191837e-05, + "loss": 0.0053, + "num_input_tokens_seen": 12797344, + "step": 60640 + }, + { + "epoch": 6.6716171617161715, + "grad_norm": 0.5237460732460022, + "learning_rate": 4.2140672261347514e-05, + "loss": 0.0684, + "num_input_tokens_seen": 12798368, + "step": 60645 + }, + { + "epoch": 6.672167216721672, + "grad_norm": 0.10512932389974594, + "learning_rate": 4.213892504553397e-05, + "loss": 0.0536, + "num_input_tokens_seen": 12799392, + "step": 60650 + }, + { + "epoch": 6.672717271727173, + "grad_norm": 0.7645341157913208, + "learning_rate": 4.213717767175919e-05, + "loss": 0.1115, + "num_input_tokens_seen": 12800448, + "step": 60655 + }, + { + "epoch": 6.673267326732673, + "grad_norm": 1.0669811964035034, + "learning_rate": 4.213543014003926e-05, + "loss": 0.0365, + "num_input_tokens_seen": 12801472, + "step": 60660 + }, + { + "epoch": 6.673817381738174, + "grad_norm": 0.02773924544453621, + "learning_rate": 4.213368245039029e-05, + "loss": 0.0142, + "num_input_tokens_seen": 12802528, + "step": 60665 + }, + { + "epoch": 6.674367436743674, + "grad_norm": 0.6628694534301758, + "learning_rate": 4.213193460282839e-05, + "loss": 0.0267, + "num_input_tokens_seen": 12803584, + "step": 60670 + }, + { + "epoch": 6.674917491749175, + "grad_norm": 1.1784589290618896, + "learning_rate": 4.213018659736967e-05, + "loss": 0.0136, + "num_input_tokens_seen": 12804672, + "step": 60675 + }, + { + "epoch": 6.675467546754676, + "grad_norm": 0.13301925361156464, + "learning_rate": 4.212843843403025e-05, + "loss": 0.0176, + "num_input_tokens_seen": 12805632, + "step": 60680 + }, + { + "epoch": 6.676017601760176, + "grad_norm": 0.7036293745040894, + "learning_rate": 4.212669011282622e-05, + "loss": 0.0213, + "num_input_tokens_seen": 12806656, + "step": 60685 + }, + { + "epoch": 6.676567656765677, + "grad_norm": 0.9586271643638611, + "learning_rate": 4.2124941633773706e-05, + "loss": 0.0378, + "num_input_tokens_seen": 12807712, + "step": 60690 + }, + { + "epoch": 6.677117711771177, + "grad_norm": 0.04786662384867668, + "learning_rate": 4.2123192996888825e-05, + "loss": 0.0076, + "num_input_tokens_seen": 12808736, + "step": 60695 + }, + { + "epoch": 6.677667766776677, + "grad_norm": 0.026807846501469612, + "learning_rate": 4.21214442021877e-05, + "loss": 0.0318, + "num_input_tokens_seen": 12809760, + "step": 60700 + }, + { + "epoch": 6.678217821782178, + "grad_norm": 0.17461353540420532, + "learning_rate": 4.211969524968643e-05, + "loss": 0.0378, + "num_input_tokens_seen": 12810816, + "step": 60705 + }, + { + "epoch": 6.678767876787679, + "grad_norm": 0.5683909058570862, + "learning_rate": 4.211794613940114e-05, + "loss": 0.0186, + "num_input_tokens_seen": 12811904, + "step": 60710 + }, + { + "epoch": 6.67931793179318, + "grad_norm": 0.2215563952922821, + "learning_rate": 4.211619687134796e-05, + "loss": 0.0405, + "num_input_tokens_seen": 12812864, + "step": 60715 + }, + { + "epoch": 6.67986798679868, + "grad_norm": 0.005139323882758617, + "learning_rate": 4.2114447445543e-05, + "loss": 0.0285, + "num_input_tokens_seen": 12813856, + "step": 60720 + }, + { + "epoch": 6.68041804180418, + "grad_norm": 0.2631307542324066, + "learning_rate": 4.2112697862002404e-05, + "loss": 0.0348, + "num_input_tokens_seen": 12814880, + "step": 60725 + }, + { + "epoch": 6.680968096809681, + "grad_norm": 0.27050620317459106, + "learning_rate": 4.211094812074228e-05, + "loss": 0.0131, + "num_input_tokens_seen": 12815872, + "step": 60730 + }, + { + "epoch": 6.681518151815181, + "grad_norm": 0.05410630628466606, + "learning_rate": 4.2109198221778746e-05, + "loss": 0.0022, + "num_input_tokens_seen": 12816928, + "step": 60735 + }, + { + "epoch": 6.6820682068206825, + "grad_norm": 0.04221515357494354, + "learning_rate": 4.210744816512795e-05, + "loss": 0.0212, + "num_input_tokens_seen": 12817952, + "step": 60740 + }, + { + "epoch": 6.682618261826183, + "grad_norm": 0.12847736477851868, + "learning_rate": 4.2105697950806014e-05, + "loss": 0.0186, + "num_input_tokens_seen": 12819040, + "step": 60745 + }, + { + "epoch": 6.683168316831683, + "grad_norm": 1.1374727487564087, + "learning_rate": 4.210394757882906e-05, + "loss": 0.1233, + "num_input_tokens_seen": 12820000, + "step": 60750 + }, + { + "epoch": 6.683718371837184, + "grad_norm": 0.013261363841593266, + "learning_rate": 4.210219704921324e-05, + "loss": 0.0597, + "num_input_tokens_seen": 12821056, + "step": 60755 + }, + { + "epoch": 6.684268426842684, + "grad_norm": 0.024072542786598206, + "learning_rate": 4.2100446361974665e-05, + "loss": 0.1451, + "num_input_tokens_seen": 12822144, + "step": 60760 + }, + { + "epoch": 6.684818481848184, + "grad_norm": 0.9333149790763855, + "learning_rate": 4.2098695517129495e-05, + "loss": 0.0686, + "num_input_tokens_seen": 12823200, + "step": 60765 + }, + { + "epoch": 6.6853685368536855, + "grad_norm": 0.012194184586405754, + "learning_rate": 4.209694451469384e-05, + "loss": 0.0372, + "num_input_tokens_seen": 12824288, + "step": 60770 + }, + { + "epoch": 6.685918591859186, + "grad_norm": 0.07266218215227127, + "learning_rate": 4.2095193354683856e-05, + "loss": 0.0118, + "num_input_tokens_seen": 12825344, + "step": 60775 + }, + { + "epoch": 6.686468646864687, + "grad_norm": 0.05629877373576164, + "learning_rate": 4.2093442037115683e-05, + "loss": 0.0369, + "num_input_tokens_seen": 12826368, + "step": 60780 + }, + { + "epoch": 6.687018701870187, + "grad_norm": 0.6944217085838318, + "learning_rate": 4.209169056200545e-05, + "loss": 0.0214, + "num_input_tokens_seen": 12827456, + "step": 60785 + }, + { + "epoch": 6.687568756875687, + "grad_norm": 0.15738336741924286, + "learning_rate": 4.208993892936931e-05, + "loss": 0.0134, + "num_input_tokens_seen": 12828544, + "step": 60790 + }, + { + "epoch": 6.688118811881188, + "grad_norm": 0.41986459493637085, + "learning_rate": 4.20881871392234e-05, + "loss": 0.0238, + "num_input_tokens_seen": 12829536, + "step": 60795 + }, + { + "epoch": 6.6886688668866885, + "grad_norm": 1.4656168222427368, + "learning_rate": 4.208643519158387e-05, + "loss": 0.036, + "num_input_tokens_seen": 12830688, + "step": 60800 + }, + { + "epoch": 6.68921892189219, + "grad_norm": 0.017376763746142387, + "learning_rate": 4.208468308646687e-05, + "loss": 0.0302, + "num_input_tokens_seen": 12831712, + "step": 60805 + }, + { + "epoch": 6.68976897689769, + "grad_norm": 0.009053850546479225, + "learning_rate": 4.208293082388854e-05, + "loss": 0.0852, + "num_input_tokens_seen": 12832768, + "step": 60810 + }, + { + "epoch": 6.69031903190319, + "grad_norm": 0.09938842803239822, + "learning_rate": 4.208117840386504e-05, + "loss": 0.0046, + "num_input_tokens_seen": 12833888, + "step": 60815 + }, + { + "epoch": 6.690869086908691, + "grad_norm": 0.09080947935581207, + "learning_rate": 4.2079425826412503e-05, + "loss": 0.0217, + "num_input_tokens_seen": 12835008, + "step": 60820 + }, + { + "epoch": 6.691419141914191, + "grad_norm": 0.39701253175735474, + "learning_rate": 4.20776730915471e-05, + "loss": 0.0335, + "num_input_tokens_seen": 12836064, + "step": 60825 + }, + { + "epoch": 6.6919691969196915, + "grad_norm": 0.10975494980812073, + "learning_rate": 4.207592019928498e-05, + "loss": 0.0232, + "num_input_tokens_seen": 12837088, + "step": 60830 + }, + { + "epoch": 6.692519251925193, + "grad_norm": 4.5179667472839355, + "learning_rate": 4.2074167149642296e-05, + "loss": 0.0303, + "num_input_tokens_seen": 12838144, + "step": 60835 + }, + { + "epoch": 6.693069306930693, + "grad_norm": 0.03838955610990524, + "learning_rate": 4.207241394263521e-05, + "loss": 0.0458, + "num_input_tokens_seen": 12839200, + "step": 60840 + }, + { + "epoch": 6.693619361936194, + "grad_norm": 0.05089268460869789, + "learning_rate": 4.207066057827988e-05, + "loss": 0.0099, + "num_input_tokens_seen": 12840320, + "step": 60845 + }, + { + "epoch": 6.694169416941694, + "grad_norm": 0.021555285900831223, + "learning_rate": 4.206890705659245e-05, + "loss": 0.0168, + "num_input_tokens_seen": 12841408, + "step": 60850 + }, + { + "epoch": 6.694719471947195, + "grad_norm": 1.7469795942306519, + "learning_rate": 4.20671533775891e-05, + "loss": 0.1378, + "num_input_tokens_seen": 12842496, + "step": 60855 + }, + { + "epoch": 6.695269526952695, + "grad_norm": 0.8008646368980408, + "learning_rate": 4.2065399541285986e-05, + "loss": 0.0421, + "num_input_tokens_seen": 12843648, + "step": 60860 + }, + { + "epoch": 6.695819581958196, + "grad_norm": 0.014598584733903408, + "learning_rate": 4.2063645547699274e-05, + "loss": 0.0258, + "num_input_tokens_seen": 12844704, + "step": 60865 + }, + { + "epoch": 6.696369636963697, + "grad_norm": 0.09038875997066498, + "learning_rate": 4.2061891396845134e-05, + "loss": 0.0716, + "num_input_tokens_seen": 12845792, + "step": 60870 + }, + { + "epoch": 6.696919691969197, + "grad_norm": 0.010028696618974209, + "learning_rate": 4.206013708873973e-05, + "loss": 0.0425, + "num_input_tokens_seen": 12846880, + "step": 60875 + }, + { + "epoch": 6.697469746974697, + "grad_norm": 0.2928291857242584, + "learning_rate": 4.2058382623399215e-05, + "loss": 0.0249, + "num_input_tokens_seen": 12847904, + "step": 60880 + }, + { + "epoch": 6.698019801980198, + "grad_norm": 0.019214538857340813, + "learning_rate": 4.205662800083978e-05, + "loss": 0.022, + "num_input_tokens_seen": 12848960, + "step": 60885 + }, + { + "epoch": 6.698569856985698, + "grad_norm": 0.10629842430353165, + "learning_rate": 4.205487322107759e-05, + "loss": 0.0769, + "num_input_tokens_seen": 12849984, + "step": 60890 + }, + { + "epoch": 6.6991199119911995, + "grad_norm": 0.10723483562469482, + "learning_rate": 4.2053118284128815e-05, + "loss": 0.0506, + "num_input_tokens_seen": 12850976, + "step": 60895 + }, + { + "epoch": 6.6996699669967, + "grad_norm": 0.0156418327242136, + "learning_rate": 4.205136319000964e-05, + "loss": 0.0477, + "num_input_tokens_seen": 12852032, + "step": 60900 + }, + { + "epoch": 6.7002200220022, + "grad_norm": 0.18686938285827637, + "learning_rate": 4.204960793873622e-05, + "loss": 0.0373, + "num_input_tokens_seen": 12853056, + "step": 60905 + }, + { + "epoch": 6.700770077007701, + "grad_norm": 0.004843613598495722, + "learning_rate": 4.204785253032475e-05, + "loss": 0.0139, + "num_input_tokens_seen": 12854144, + "step": 60910 + }, + { + "epoch": 6.701320132013201, + "grad_norm": 0.011093578301370144, + "learning_rate": 4.2046096964791404e-05, + "loss": 0.0236, + "num_input_tokens_seen": 12855136, + "step": 60915 + }, + { + "epoch": 6.701870187018702, + "grad_norm": 1.0600669384002686, + "learning_rate": 4.2044341242152366e-05, + "loss": 0.0509, + "num_input_tokens_seen": 12856160, + "step": 60920 + }, + { + "epoch": 6.7024202420242025, + "grad_norm": 1.444265365600586, + "learning_rate": 4.204258536242381e-05, + "loss": 0.1633, + "num_input_tokens_seen": 12857248, + "step": 60925 + }, + { + "epoch": 6.702970297029703, + "grad_norm": 0.20038658380508423, + "learning_rate": 4.204082932562192e-05, + "loss": 0.0764, + "num_input_tokens_seen": 12858272, + "step": 60930 + }, + { + "epoch": 6.703520352035204, + "grad_norm": 0.0605681911110878, + "learning_rate": 4.203907313176289e-05, + "loss": 0.0495, + "num_input_tokens_seen": 12859392, + "step": 60935 + }, + { + "epoch": 6.704070407040704, + "grad_norm": 0.8019582033157349, + "learning_rate": 4.203731678086289e-05, + "loss": 0.0832, + "num_input_tokens_seen": 12860448, + "step": 60940 + }, + { + "epoch": 6.704620462046204, + "grad_norm": 0.09440843760967255, + "learning_rate": 4.2035560272938124e-05, + "loss": 0.069, + "num_input_tokens_seen": 12861568, + "step": 60945 + }, + { + "epoch": 6.705170517051705, + "grad_norm": 0.9738882780075073, + "learning_rate": 4.203380360800476e-05, + "loss": 0.0423, + "num_input_tokens_seen": 12862560, + "step": 60950 + }, + { + "epoch": 6.7057205720572055, + "grad_norm": 0.20269760489463806, + "learning_rate": 4.203204678607902e-05, + "loss": 0.0089, + "num_input_tokens_seen": 12863712, + "step": 60955 + }, + { + "epoch": 6.706270627062707, + "grad_norm": 0.23456408083438873, + "learning_rate": 4.203028980717707e-05, + "loss": 0.0789, + "num_input_tokens_seen": 12864768, + "step": 60960 + }, + { + "epoch": 6.706820682068207, + "grad_norm": 1.5649054050445557, + "learning_rate": 4.202853267131511e-05, + "loss": 0.0434, + "num_input_tokens_seen": 12865792, + "step": 60965 + }, + { + "epoch": 6.707370737073707, + "grad_norm": 0.5339441299438477, + "learning_rate": 4.202677537850934e-05, + "loss": 0.0224, + "num_input_tokens_seen": 12866784, + "step": 60970 + }, + { + "epoch": 6.707920792079208, + "grad_norm": 0.6930166482925415, + "learning_rate": 4.202501792877595e-05, + "loss": 0.2365, + "num_input_tokens_seen": 12867840, + "step": 60975 + }, + { + "epoch": 6.708470847084708, + "grad_norm": 0.04186677932739258, + "learning_rate": 4.202326032213113e-05, + "loss": 0.016, + "num_input_tokens_seen": 12868864, + "step": 60980 + }, + { + "epoch": 6.709020902090209, + "grad_norm": 0.09027178585529327, + "learning_rate": 4.202150255859109e-05, + "loss": 0.0103, + "num_input_tokens_seen": 12869888, + "step": 60985 + }, + { + "epoch": 6.70957095709571, + "grad_norm": 0.39317741990089417, + "learning_rate": 4.201974463817204e-05, + "loss": 0.0402, + "num_input_tokens_seen": 12870976, + "step": 60990 + }, + { + "epoch": 6.71012101210121, + "grad_norm": 0.14628289639949799, + "learning_rate": 4.201798656089016e-05, + "loss": 0.0446, + "num_input_tokens_seen": 12872032, + "step": 60995 + }, + { + "epoch": 6.710671067106711, + "grad_norm": 0.2587202489376068, + "learning_rate": 4.201622832676167e-05, + "loss": 0.0133, + "num_input_tokens_seen": 12873088, + "step": 61000 + }, + { + "epoch": 6.711221122112211, + "grad_norm": 0.052565764635801315, + "learning_rate": 4.201446993580276e-05, + "loss": 0.0142, + "num_input_tokens_seen": 12874144, + "step": 61005 + }, + { + "epoch": 6.711771177117711, + "grad_norm": 0.26501983404159546, + "learning_rate": 4.201271138802965e-05, + "loss": 0.0184, + "num_input_tokens_seen": 12875200, + "step": 61010 + }, + { + "epoch": 6.712321232123212, + "grad_norm": 1.2349604368209839, + "learning_rate": 4.201095268345855e-05, + "loss": 0.1046, + "num_input_tokens_seen": 12876192, + "step": 61015 + }, + { + "epoch": 6.712871287128713, + "grad_norm": 0.49368971586227417, + "learning_rate": 4.2009193822105654e-05, + "loss": 0.0214, + "num_input_tokens_seen": 12877216, + "step": 61020 + }, + { + "epoch": 6.713421342134214, + "grad_norm": 0.30611854791641235, + "learning_rate": 4.200743480398718e-05, + "loss": 0.0044, + "num_input_tokens_seen": 12878272, + "step": 61025 + }, + { + "epoch": 6.713971397139714, + "grad_norm": 4.113346576690674, + "learning_rate": 4.2005675629119344e-05, + "loss": 0.1053, + "num_input_tokens_seen": 12879328, + "step": 61030 + }, + { + "epoch": 6.714521452145215, + "grad_norm": 0.01428779773414135, + "learning_rate": 4.2003916297518354e-05, + "loss": 0.0793, + "num_input_tokens_seen": 12880416, + "step": 61035 + }, + { + "epoch": 6.715071507150715, + "grad_norm": 0.9364966750144958, + "learning_rate": 4.200215680920043e-05, + "loss": 0.0719, + "num_input_tokens_seen": 12881504, + "step": 61040 + }, + { + "epoch": 6.715621562156215, + "grad_norm": 0.04755541682243347, + "learning_rate": 4.200039716418178e-05, + "loss": 0.0067, + "num_input_tokens_seen": 12882560, + "step": 61045 + }, + { + "epoch": 6.7161716171617165, + "grad_norm": 0.010351145640015602, + "learning_rate": 4.199863736247863e-05, + "loss": 0.0975, + "num_input_tokens_seen": 12883648, + "step": 61050 + }, + { + "epoch": 6.716721672167217, + "grad_norm": 1.1731915473937988, + "learning_rate": 4.1996877404107196e-05, + "loss": 0.1071, + "num_input_tokens_seen": 12884736, + "step": 61055 + }, + { + "epoch": 6.717271727172717, + "grad_norm": 0.08597785979509354, + "learning_rate": 4.19951172890837e-05, + "loss": 0.0179, + "num_input_tokens_seen": 12885824, + "step": 61060 + }, + { + "epoch": 6.717821782178218, + "grad_norm": 0.5179294347763062, + "learning_rate": 4.199335701742436e-05, + "loss": 0.1302, + "num_input_tokens_seen": 12886944, + "step": 61065 + }, + { + "epoch": 6.718371837183718, + "grad_norm": 0.007973159663379192, + "learning_rate": 4.1991596589145407e-05, + "loss": 0.0693, + "num_input_tokens_seen": 12888000, + "step": 61070 + }, + { + "epoch": 6.718921892189218, + "grad_norm": 0.05402100086212158, + "learning_rate": 4.198983600426305e-05, + "loss": 0.0111, + "num_input_tokens_seen": 12889088, + "step": 61075 + }, + { + "epoch": 6.7194719471947195, + "grad_norm": 0.012807239778339863, + "learning_rate": 4.1988075262793534e-05, + "loss": 0.0726, + "num_input_tokens_seen": 12890144, + "step": 61080 + }, + { + "epoch": 6.72002200220022, + "grad_norm": 0.03177046403288841, + "learning_rate": 4.198631436475309e-05, + "loss": 0.0092, + "num_input_tokens_seen": 12891264, + "step": 61085 + }, + { + "epoch": 6.720572057205721, + "grad_norm": 0.9112176299095154, + "learning_rate": 4.198455331015793e-05, + "loss": 0.0415, + "num_input_tokens_seen": 12892320, + "step": 61090 + }, + { + "epoch": 6.721122112211221, + "grad_norm": 0.051671553403139114, + "learning_rate": 4.198279209902429e-05, + "loss": 0.0091, + "num_input_tokens_seen": 12893440, + "step": 61095 + }, + { + "epoch": 6.721672167216722, + "grad_norm": 0.03333171084523201, + "learning_rate": 4.1981030731368395e-05, + "loss": 0.0182, + "num_input_tokens_seen": 12894560, + "step": 61100 + }, + { + "epoch": 6.722222222222222, + "grad_norm": 0.26888152956962585, + "learning_rate": 4.197926920720651e-05, + "loss": 0.033, + "num_input_tokens_seen": 12895616, + "step": 61105 + }, + { + "epoch": 6.7227722772277225, + "grad_norm": 0.05619155615568161, + "learning_rate": 4.1977507526554835e-05, + "loss": 0.1126, + "num_input_tokens_seen": 12896736, + "step": 61110 + }, + { + "epoch": 6.723322332233224, + "grad_norm": 0.05665603652596474, + "learning_rate": 4.197574568942962e-05, + "loss": 0.0281, + "num_input_tokens_seen": 12897728, + "step": 61115 + }, + { + "epoch": 6.723872387238724, + "grad_norm": 0.282736599445343, + "learning_rate": 4.19739836958471e-05, + "loss": 0.0644, + "num_input_tokens_seen": 12898816, + "step": 61120 + }, + { + "epoch": 6.724422442244224, + "grad_norm": 0.6238106489181519, + "learning_rate": 4.197222154582352e-05, + "loss": 0.0365, + "num_input_tokens_seen": 12899904, + "step": 61125 + }, + { + "epoch": 6.724972497249725, + "grad_norm": 1.0748302936553955, + "learning_rate": 4.197045923937512e-05, + "loss": 0.0889, + "num_input_tokens_seen": 12900960, + "step": 61130 + }, + { + "epoch": 6.725522552255225, + "grad_norm": 0.11719466745853424, + "learning_rate": 4.1968696776518145e-05, + "loss": 0.0079, + "num_input_tokens_seen": 12902016, + "step": 61135 + }, + { + "epoch": 6.726072607260726, + "grad_norm": 0.05276552960276604, + "learning_rate": 4.1966934157268823e-05, + "loss": 0.0329, + "num_input_tokens_seen": 12903104, + "step": 61140 + }, + { + "epoch": 6.726622662266227, + "grad_norm": 0.3149576485157013, + "learning_rate": 4.196517138164342e-05, + "loss": 0.0162, + "num_input_tokens_seen": 12904192, + "step": 61145 + }, + { + "epoch": 6.727172717271727, + "grad_norm": 0.35977938771247864, + "learning_rate": 4.196340844965817e-05, + "loss": 0.0111, + "num_input_tokens_seen": 12905216, + "step": 61150 + }, + { + "epoch": 6.727722772277228, + "grad_norm": 0.05977074056863785, + "learning_rate": 4.196164536132932e-05, + "loss": 0.0051, + "num_input_tokens_seen": 12906336, + "step": 61155 + }, + { + "epoch": 6.728272827282728, + "grad_norm": 0.37689825892448425, + "learning_rate": 4.1959882116673126e-05, + "loss": 0.0234, + "num_input_tokens_seen": 12907392, + "step": 61160 + }, + { + "epoch": 6.728822882288229, + "grad_norm": 0.2551504671573639, + "learning_rate": 4.195811871570584e-05, + "loss": 0.0147, + "num_input_tokens_seen": 12908544, + "step": 61165 + }, + { + "epoch": 6.729372937293729, + "grad_norm": 0.6951929926872253, + "learning_rate": 4.195635515844371e-05, + "loss": 0.018, + "num_input_tokens_seen": 12909600, + "step": 61170 + }, + { + "epoch": 6.72992299229923, + "grad_norm": 0.041683826595544815, + "learning_rate": 4.1954591444902994e-05, + "loss": 0.0429, + "num_input_tokens_seen": 12910688, + "step": 61175 + }, + { + "epoch": 6.730473047304731, + "grad_norm": 0.4033508598804474, + "learning_rate": 4.1952827575099934e-05, + "loss": 0.0994, + "num_input_tokens_seen": 12911680, + "step": 61180 + }, + { + "epoch": 6.731023102310231, + "grad_norm": 0.34888529777526855, + "learning_rate": 4.1951063549050806e-05, + "loss": 0.1149, + "num_input_tokens_seen": 12912704, + "step": 61185 + }, + { + "epoch": 6.731573157315731, + "grad_norm": 1.2622649669647217, + "learning_rate": 4.194929936677185e-05, + "loss": 0.117, + "num_input_tokens_seen": 12913792, + "step": 61190 + }, + { + "epoch": 6.732123212321232, + "grad_norm": 0.18742474913597107, + "learning_rate": 4.194753502827935e-05, + "loss": 0.0183, + "num_input_tokens_seen": 12914848, + "step": 61195 + }, + { + "epoch": 6.732673267326732, + "grad_norm": 1.0844733715057373, + "learning_rate": 4.1945770533589535e-05, + "loss": 0.1613, + "num_input_tokens_seen": 12915936, + "step": 61200 + }, + { + "epoch": 6.7332233223322335, + "grad_norm": 0.8492565751075745, + "learning_rate": 4.194400588271869e-05, + "loss": 0.0616, + "num_input_tokens_seen": 12916928, + "step": 61205 + }, + { + "epoch": 6.733773377337734, + "grad_norm": 0.06296257674694061, + "learning_rate": 4.1942241075683084e-05, + "loss": 0.0107, + "num_input_tokens_seen": 12917920, + "step": 61210 + }, + { + "epoch": 6.734323432343234, + "grad_norm": 0.016954613849520683, + "learning_rate": 4.194047611249896e-05, + "loss": 0.0847, + "num_input_tokens_seen": 12918944, + "step": 61215 + }, + { + "epoch": 6.734873487348735, + "grad_norm": 2.1697182655334473, + "learning_rate": 4.193871099318259e-05, + "loss": 0.0691, + "num_input_tokens_seen": 12920000, + "step": 61220 + }, + { + "epoch": 6.735423542354235, + "grad_norm": 1.1508827209472656, + "learning_rate": 4.1936945717750266e-05, + "loss": 0.0652, + "num_input_tokens_seen": 12921088, + "step": 61225 + }, + { + "epoch": 6.735973597359736, + "grad_norm": 0.14066459238529205, + "learning_rate": 4.193518028621824e-05, + "loss": 0.0316, + "num_input_tokens_seen": 12922144, + "step": 61230 + }, + { + "epoch": 6.7365236523652365, + "grad_norm": 0.01501485239714384, + "learning_rate": 4.193341469860277e-05, + "loss": 0.0562, + "num_input_tokens_seen": 12923232, + "step": 61235 + }, + { + "epoch": 6.737073707370737, + "grad_norm": 1.2966231107711792, + "learning_rate": 4.193164895492015e-05, + "loss": 0.033, + "num_input_tokens_seen": 12924320, + "step": 61240 + }, + { + "epoch": 6.737623762376238, + "grad_norm": 0.07236551493406296, + "learning_rate": 4.192988305518664e-05, + "loss": 0.0066, + "num_input_tokens_seen": 12925344, + "step": 61245 + }, + { + "epoch": 6.738173817381738, + "grad_norm": 1.2053223848342896, + "learning_rate": 4.1928116999418525e-05, + "loss": 0.021, + "num_input_tokens_seen": 12926368, + "step": 61250 + }, + { + "epoch": 6.738723872387238, + "grad_norm": 0.6239527463912964, + "learning_rate": 4.192635078763209e-05, + "loss": 0.0285, + "num_input_tokens_seen": 12927488, + "step": 61255 + }, + { + "epoch": 6.739273927392739, + "grad_norm": 0.1536162793636322, + "learning_rate": 4.192458441984359e-05, + "loss": 0.0462, + "num_input_tokens_seen": 12928544, + "step": 61260 + }, + { + "epoch": 6.7398239823982395, + "grad_norm": 0.0065872701816260815, + "learning_rate": 4.192281789606931e-05, + "loss": 0.0301, + "num_input_tokens_seen": 12929664, + "step": 61265 + }, + { + "epoch": 6.740374037403741, + "grad_norm": 0.11081087589263916, + "learning_rate": 4.1921051216325554e-05, + "loss": 0.026, + "num_input_tokens_seen": 12930720, + "step": 61270 + }, + { + "epoch": 6.740924092409241, + "grad_norm": 0.019330760464072227, + "learning_rate": 4.191928438062857e-05, + "loss": 0.0121, + "num_input_tokens_seen": 12931744, + "step": 61275 + }, + { + "epoch": 6.741474147414742, + "grad_norm": 0.30952027440071106, + "learning_rate": 4.191751738899468e-05, + "loss": 0.0189, + "num_input_tokens_seen": 12932800, + "step": 61280 + }, + { + "epoch": 6.742024202420242, + "grad_norm": 0.05625578388571739, + "learning_rate": 4.191575024144014e-05, + "loss": 0.0055, + "num_input_tokens_seen": 12933856, + "step": 61285 + }, + { + "epoch": 6.742574257425742, + "grad_norm": 0.022606318816542625, + "learning_rate": 4.191398293798125e-05, + "loss": 0.02, + "num_input_tokens_seen": 12934912, + "step": 61290 + }, + { + "epoch": 6.743124312431243, + "grad_norm": 0.016906600445508957, + "learning_rate": 4.1912215478634286e-05, + "loss": 0.0096, + "num_input_tokens_seen": 12936000, + "step": 61295 + }, + { + "epoch": 6.743674367436744, + "grad_norm": 1.3286895751953125, + "learning_rate": 4.191044786341556e-05, + "loss": 0.034, + "num_input_tokens_seen": 12937120, + "step": 61300 + }, + { + "epoch": 6.744224422442244, + "grad_norm": 0.052826765924692154, + "learning_rate": 4.190868009234134e-05, + "loss": 0.0159, + "num_input_tokens_seen": 12938208, + "step": 61305 + }, + { + "epoch": 6.744774477447745, + "grad_norm": 0.03458448499441147, + "learning_rate": 4.190691216542794e-05, + "loss": 0.0125, + "num_input_tokens_seen": 12939296, + "step": 61310 + }, + { + "epoch": 6.745324532453245, + "grad_norm": 0.06619586050510406, + "learning_rate": 4.190514408269164e-05, + "loss": 0.0495, + "num_input_tokens_seen": 12940352, + "step": 61315 + }, + { + "epoch": 6.745874587458746, + "grad_norm": 0.025322359055280685, + "learning_rate": 4.190337584414873e-05, + "loss": 0.084, + "num_input_tokens_seen": 12941344, + "step": 61320 + }, + { + "epoch": 6.7464246424642464, + "grad_norm": 0.01837751641869545, + "learning_rate": 4.190160744981553e-05, + "loss": 0.0028, + "num_input_tokens_seen": 12942368, + "step": 61325 + }, + { + "epoch": 6.746974697469747, + "grad_norm": 0.1106041893362999, + "learning_rate": 4.189983889970832e-05, + "loss": 0.0864, + "num_input_tokens_seen": 12943456, + "step": 61330 + }, + { + "epoch": 6.747524752475248, + "grad_norm": 0.6985586285591125, + "learning_rate": 4.18980701938434e-05, + "loss": 0.0144, + "num_input_tokens_seen": 12944512, + "step": 61335 + }, + { + "epoch": 6.748074807480748, + "grad_norm": 0.009610414505004883, + "learning_rate": 4.189630133223707e-05, + "loss": 0.0166, + "num_input_tokens_seen": 12945568, + "step": 61340 + }, + { + "epoch": 6.748624862486249, + "grad_norm": 0.09179190546274185, + "learning_rate": 4.1894532314905644e-05, + "loss": 0.1143, + "num_input_tokens_seen": 12946592, + "step": 61345 + }, + { + "epoch": 6.749174917491749, + "grad_norm": 0.20640960335731506, + "learning_rate": 4.189276314186542e-05, + "loss": 0.0507, + "num_input_tokens_seen": 12947712, + "step": 61350 + }, + { + "epoch": 6.7497249724972495, + "grad_norm": 0.6214811205863953, + "learning_rate": 4.189099381313271e-05, + "loss": 0.0346, + "num_input_tokens_seen": 12948736, + "step": 61355 + }, + { + "epoch": 6.7502750275027505, + "grad_norm": 1.213464379310608, + "learning_rate": 4.188922432872381e-05, + "loss": 0.0761, + "num_input_tokens_seen": 12949760, + "step": 61360 + }, + { + "epoch": 6.750825082508251, + "grad_norm": 0.1713825911283493, + "learning_rate": 4.1887454688655034e-05, + "loss": 0.0422, + "num_input_tokens_seen": 12950880, + "step": 61365 + }, + { + "epoch": 6.751375137513751, + "grad_norm": 0.5123904347419739, + "learning_rate": 4.188568489294269e-05, + "loss": 0.0285, + "num_input_tokens_seen": 12951904, + "step": 61370 + }, + { + "epoch": 6.751925192519252, + "grad_norm": 0.15083573758602142, + "learning_rate": 4.18839149416031e-05, + "loss": 0.0551, + "num_input_tokens_seen": 12952992, + "step": 61375 + }, + { + "epoch": 6.752475247524752, + "grad_norm": 0.5201641321182251, + "learning_rate": 4.1882144834652555e-05, + "loss": 0.0286, + "num_input_tokens_seen": 12954016, + "step": 61380 + }, + { + "epoch": 6.753025302530253, + "grad_norm": 0.34331607818603516, + "learning_rate": 4.1880374572107386e-05, + "loss": 0.0615, + "num_input_tokens_seen": 12955136, + "step": 61385 + }, + { + "epoch": 6.7535753575357536, + "grad_norm": 0.0038295325357466936, + "learning_rate": 4.18786041539839e-05, + "loss": 0.0168, + "num_input_tokens_seen": 12956128, + "step": 61390 + }, + { + "epoch": 6.754125412541254, + "grad_norm": 0.015646997839212418, + "learning_rate": 4.187683358029843e-05, + "loss": 0.0031, + "num_input_tokens_seen": 12957184, + "step": 61395 + }, + { + "epoch": 6.754675467546755, + "grad_norm": 0.04428791627287865, + "learning_rate": 4.187506285106728e-05, + "loss": 0.0223, + "num_input_tokens_seen": 12958240, + "step": 61400 + }, + { + "epoch": 6.755225522552255, + "grad_norm": 0.008771134540438652, + "learning_rate": 4.1873291966306773e-05, + "loss": 0.0296, + "num_input_tokens_seen": 12959328, + "step": 61405 + }, + { + "epoch": 6.755775577557756, + "grad_norm": 0.009443888440728188, + "learning_rate": 4.187152092603322e-05, + "loss": 0.0729, + "num_input_tokens_seen": 12960352, + "step": 61410 + }, + { + "epoch": 6.756325632563256, + "grad_norm": 0.017374489456415176, + "learning_rate": 4.186974973026296e-05, + "loss": 0.0149, + "num_input_tokens_seen": 12961376, + "step": 61415 + }, + { + "epoch": 6.756875687568757, + "grad_norm": 3.122736930847168, + "learning_rate": 4.186797837901232e-05, + "loss": 0.1836, + "num_input_tokens_seen": 12962400, + "step": 61420 + }, + { + "epoch": 6.757425742574258, + "grad_norm": 1.439496636390686, + "learning_rate": 4.186620687229761e-05, + "loss": 0.0478, + "num_input_tokens_seen": 12963520, + "step": 61425 + }, + { + "epoch": 6.757975797579758, + "grad_norm": 0.2262941300868988, + "learning_rate": 4.186443521013517e-05, + "loss": 0.0559, + "num_input_tokens_seen": 12964576, + "step": 61430 + }, + { + "epoch": 6.758525852585258, + "grad_norm": 0.051219549030065536, + "learning_rate": 4.186266339254131e-05, + "loss": 0.0245, + "num_input_tokens_seen": 12965536, + "step": 61435 + }, + { + "epoch": 6.759075907590759, + "grad_norm": 0.08700674027204514, + "learning_rate": 4.186089141953238e-05, + "loss": 0.0522, + "num_input_tokens_seen": 12966624, + "step": 61440 + }, + { + "epoch": 6.759625962596259, + "grad_norm": 0.4692244529724121, + "learning_rate": 4.1859119291124715e-05, + "loss": 0.0292, + "num_input_tokens_seen": 12967680, + "step": 61445 + }, + { + "epoch": 6.7601760176017605, + "grad_norm": 0.2308039665222168, + "learning_rate": 4.1857347007334626e-05, + "loss": 0.0554, + "num_input_tokens_seen": 12968736, + "step": 61450 + }, + { + "epoch": 6.760726072607261, + "grad_norm": 0.028755804523825645, + "learning_rate": 4.1855574568178454e-05, + "loss": 0.0296, + "num_input_tokens_seen": 12969760, + "step": 61455 + }, + { + "epoch": 6.761276127612762, + "grad_norm": 0.017524344846606255, + "learning_rate": 4.185380197367255e-05, + "loss": 0.0177, + "num_input_tokens_seen": 12970784, + "step": 61460 + }, + { + "epoch": 6.761826182618262, + "grad_norm": 0.008320414461195469, + "learning_rate": 4.185202922383324e-05, + "loss": 0.0081, + "num_input_tokens_seen": 12971808, + "step": 61465 + }, + { + "epoch": 6.762376237623762, + "grad_norm": 0.21684329211711884, + "learning_rate": 4.185025631867686e-05, + "loss": 0.0107, + "num_input_tokens_seen": 12972832, + "step": 61470 + }, + { + "epoch": 6.762926292629263, + "grad_norm": 0.030013220384716988, + "learning_rate": 4.1848483258219753e-05, + "loss": 0.0597, + "num_input_tokens_seen": 12973952, + "step": 61475 + }, + { + "epoch": 6.7634763476347635, + "grad_norm": 0.8806403875350952, + "learning_rate": 4.184671004247826e-05, + "loss": 0.0294, + "num_input_tokens_seen": 12975008, + "step": 61480 + }, + { + "epoch": 6.764026402640264, + "grad_norm": 1.2831226587295532, + "learning_rate": 4.184493667146873e-05, + "loss": 0.0458, + "num_input_tokens_seen": 12976032, + "step": 61485 + }, + { + "epoch": 6.764576457645765, + "grad_norm": 0.1062639132142067, + "learning_rate": 4.184316314520749e-05, + "loss": 0.0228, + "num_input_tokens_seen": 12977088, + "step": 61490 + }, + { + "epoch": 6.765126512651265, + "grad_norm": 0.13319668173789978, + "learning_rate": 4.18413894637109e-05, + "loss": 0.0198, + "num_input_tokens_seen": 12978176, + "step": 61495 + }, + { + "epoch": 6.765676567656766, + "grad_norm": 0.06313352286815643, + "learning_rate": 4.1839615626995315e-05, + "loss": 0.0169, + "num_input_tokens_seen": 12979264, + "step": 61500 + }, + { + "epoch": 6.766226622662266, + "grad_norm": 0.026316696777939796, + "learning_rate": 4.183784163507707e-05, + "loss": 0.0433, + "num_input_tokens_seen": 12980384, + "step": 61505 + }, + { + "epoch": 6.7667766776677665, + "grad_norm": 0.3405722975730896, + "learning_rate": 4.183606748797251e-05, + "loss": 0.0118, + "num_input_tokens_seen": 12981376, + "step": 61510 + }, + { + "epoch": 6.767326732673268, + "grad_norm": 0.03305143117904663, + "learning_rate": 4.1834293185698e-05, + "loss": 0.0691, + "num_input_tokens_seen": 12982464, + "step": 61515 + }, + { + "epoch": 6.767876787678768, + "grad_norm": 0.016738101840019226, + "learning_rate": 4.183251872826989e-05, + "loss": 0.0074, + "num_input_tokens_seen": 12983520, + "step": 61520 + }, + { + "epoch": 6.768426842684269, + "grad_norm": 0.015017497353255749, + "learning_rate": 4.1830744115704524e-05, + "loss": 0.0754, + "num_input_tokens_seen": 12984544, + "step": 61525 + }, + { + "epoch": 6.768976897689769, + "grad_norm": 0.05546507611870766, + "learning_rate": 4.1828969348018276e-05, + "loss": 0.0896, + "num_input_tokens_seen": 12985600, + "step": 61530 + }, + { + "epoch": 6.769526952695269, + "grad_norm": 0.22811096906661987, + "learning_rate": 4.182719442522748e-05, + "loss": 0.0076, + "num_input_tokens_seen": 12986656, + "step": 61535 + }, + { + "epoch": 6.77007700770077, + "grad_norm": 0.0899396538734436, + "learning_rate": 4.182541934734852e-05, + "loss": 0.0369, + "num_input_tokens_seen": 12987680, + "step": 61540 + }, + { + "epoch": 6.770627062706271, + "grad_norm": 0.134332537651062, + "learning_rate": 4.182364411439774e-05, + "loss": 0.0081, + "num_input_tokens_seen": 12988768, + "step": 61545 + }, + { + "epoch": 6.771177117711771, + "grad_norm": 0.253231018781662, + "learning_rate": 4.18218687263915e-05, + "loss": 0.1122, + "num_input_tokens_seen": 12989824, + "step": 61550 + }, + { + "epoch": 6.771727172717272, + "grad_norm": 0.03066716343164444, + "learning_rate": 4.1820093183346174e-05, + "loss": 0.0062, + "num_input_tokens_seen": 12990912, + "step": 61555 + }, + { + "epoch": 6.772277227722772, + "grad_norm": 0.010441879741847515, + "learning_rate": 4.181831748527811e-05, + "loss": 0.0401, + "num_input_tokens_seen": 12991968, + "step": 61560 + }, + { + "epoch": 6.772827282728273, + "grad_norm": 0.4663168787956238, + "learning_rate": 4.18165416322037e-05, + "loss": 0.0549, + "num_input_tokens_seen": 12992992, + "step": 61565 + }, + { + "epoch": 6.773377337733773, + "grad_norm": 0.08177437633275986, + "learning_rate": 4.1814765624139286e-05, + "loss": 0.0582, + "num_input_tokens_seen": 12994048, + "step": 61570 + }, + { + "epoch": 6.773927392739274, + "grad_norm": 0.8261528611183167, + "learning_rate": 4.181298946110125e-05, + "loss": 0.017, + "num_input_tokens_seen": 12995168, + "step": 61575 + }, + { + "epoch": 6.774477447744775, + "grad_norm": 0.01674889400601387, + "learning_rate": 4.181121314310595e-05, + "loss": 0.0868, + "num_input_tokens_seen": 12996128, + "step": 61580 + }, + { + "epoch": 6.775027502750275, + "grad_norm": 0.009055147878825665, + "learning_rate": 4.180943667016977e-05, + "loss": 0.0711, + "num_input_tokens_seen": 12997152, + "step": 61585 + }, + { + "epoch": 6.775577557755776, + "grad_norm": 0.5935642123222351, + "learning_rate": 4.180766004230908e-05, + "loss": 0.0151, + "num_input_tokens_seen": 12998208, + "step": 61590 + }, + { + "epoch": 6.776127612761276, + "grad_norm": 3.746412515640259, + "learning_rate": 4.1805883259540245e-05, + "loss": 0.102, + "num_input_tokens_seen": 12999232, + "step": 61595 + }, + { + "epoch": 6.776677667766776, + "grad_norm": 1.0676522254943848, + "learning_rate": 4.1804106321879654e-05, + "loss": 0.0529, + "num_input_tokens_seen": 13000352, + "step": 61600 + }, + { + "epoch": 6.7772277227722775, + "grad_norm": 0.2922142446041107, + "learning_rate": 4.180232922934368e-05, + "loss": 0.0188, + "num_input_tokens_seen": 13001376, + "step": 61605 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.037240128964185715, + "learning_rate": 4.18005519819487e-05, + "loss": 0.0573, + "num_input_tokens_seen": 13002368, + "step": 61610 + }, + { + "epoch": 6.778327832783278, + "grad_norm": 0.5567891597747803, + "learning_rate": 4.179877457971109e-05, + "loss": 0.0598, + "num_input_tokens_seen": 13003392, + "step": 61615 + }, + { + "epoch": 6.778877887788779, + "grad_norm": 0.3222486674785614, + "learning_rate": 4.179699702264724e-05, + "loss": 0.0396, + "num_input_tokens_seen": 13004480, + "step": 61620 + }, + { + "epoch": 6.779427942794279, + "grad_norm": 0.5252091884613037, + "learning_rate": 4.179521931077352e-05, + "loss": 0.0357, + "num_input_tokens_seen": 13005504, + "step": 61625 + }, + { + "epoch": 6.77997799779978, + "grad_norm": 1.8020832538604736, + "learning_rate": 4.179344144410633e-05, + "loss": 0.0889, + "num_input_tokens_seen": 13006624, + "step": 61630 + }, + { + "epoch": 6.7805280528052805, + "grad_norm": 0.614797830581665, + "learning_rate": 4.1791663422662055e-05, + "loss": 0.105, + "num_input_tokens_seen": 13007616, + "step": 61635 + }, + { + "epoch": 6.781078107810782, + "grad_norm": 0.01443360187113285, + "learning_rate": 4.178988524645706e-05, + "loss": 0.0483, + "num_input_tokens_seen": 13008672, + "step": 61640 + }, + { + "epoch": 6.781628162816282, + "grad_norm": 0.05442584306001663, + "learning_rate": 4.1788106915507765e-05, + "loss": 0.1251, + "num_input_tokens_seen": 13009728, + "step": 61645 + }, + { + "epoch": 6.782178217821782, + "grad_norm": 0.0596655011177063, + "learning_rate": 4.178632842983053e-05, + "loss": 0.0067, + "num_input_tokens_seen": 13010816, + "step": 61650 + }, + { + "epoch": 6.782728272827283, + "grad_norm": 0.10855880379676819, + "learning_rate": 4.178454978944177e-05, + "loss": 0.0521, + "num_input_tokens_seen": 13011872, + "step": 61655 + }, + { + "epoch": 6.783278327832783, + "grad_norm": 0.02344793826341629, + "learning_rate": 4.178277099435787e-05, + "loss": 0.1282, + "num_input_tokens_seen": 13012992, + "step": 61660 + }, + { + "epoch": 6.7838283828382835, + "grad_norm": 0.11404319852590561, + "learning_rate": 4.178099204459521e-05, + "loss": 0.0287, + "num_input_tokens_seen": 13014016, + "step": 61665 + }, + { + "epoch": 6.784378437843785, + "grad_norm": 0.2668536901473999, + "learning_rate": 4.177921294017021e-05, + "loss": 0.0735, + "num_input_tokens_seen": 13015040, + "step": 61670 + }, + { + "epoch": 6.784928492849285, + "grad_norm": 0.02323472686111927, + "learning_rate": 4.1777433681099255e-05, + "loss": 0.0044, + "num_input_tokens_seen": 13016128, + "step": 61675 + }, + { + "epoch": 6.785478547854785, + "grad_norm": 0.013348218984901905, + "learning_rate": 4.177565426739875e-05, + "loss": 0.006, + "num_input_tokens_seen": 13017120, + "step": 61680 + }, + { + "epoch": 6.786028602860286, + "grad_norm": 2.173656463623047, + "learning_rate": 4.177387469908507e-05, + "loss": 0.122, + "num_input_tokens_seen": 13018208, + "step": 61685 + }, + { + "epoch": 6.786578657865786, + "grad_norm": 0.330091267824173, + "learning_rate": 4.177209497617465e-05, + "loss": 0.0155, + "num_input_tokens_seen": 13019264, + "step": 61690 + }, + { + "epoch": 6.787128712871287, + "grad_norm": 0.08204136788845062, + "learning_rate": 4.1770315098683875e-05, + "loss": 0.0769, + "num_input_tokens_seen": 13020320, + "step": 61695 + }, + { + "epoch": 6.787678767876788, + "grad_norm": 0.015971409156918526, + "learning_rate": 4.176853506662916e-05, + "loss": 0.0266, + "num_input_tokens_seen": 13021440, + "step": 61700 + }, + { + "epoch": 6.788228822882289, + "grad_norm": 0.058333124965429306, + "learning_rate": 4.1766754880026895e-05, + "loss": 0.037, + "num_input_tokens_seen": 13022496, + "step": 61705 + }, + { + "epoch": 6.788778877887789, + "grad_norm": 2.0794661045074463, + "learning_rate": 4.17649745388935e-05, + "loss": 0.0395, + "num_input_tokens_seen": 13023552, + "step": 61710 + }, + { + "epoch": 6.789328932893289, + "grad_norm": 0.0707615464925766, + "learning_rate": 4.1763194043245366e-05, + "loss": 0.0153, + "num_input_tokens_seen": 13024576, + "step": 61715 + }, + { + "epoch": 6.78987898789879, + "grad_norm": 0.10394937545061111, + "learning_rate": 4.1761413393098936e-05, + "loss": 0.0199, + "num_input_tokens_seen": 13025568, + "step": 61720 + }, + { + "epoch": 6.79042904290429, + "grad_norm": 0.03972197696566582, + "learning_rate": 4.175963258847059e-05, + "loss": 0.0133, + "num_input_tokens_seen": 13026656, + "step": 61725 + }, + { + "epoch": 6.790979097909791, + "grad_norm": 0.15302075445652008, + "learning_rate": 4.175785162937676e-05, + "loss": 0.1185, + "num_input_tokens_seen": 13027712, + "step": 61730 + }, + { + "epoch": 6.791529152915292, + "grad_norm": 0.12014489620923996, + "learning_rate": 4.1756070515833845e-05, + "loss": 0.0974, + "num_input_tokens_seen": 13028704, + "step": 61735 + }, + { + "epoch": 6.792079207920792, + "grad_norm": 0.1118219718337059, + "learning_rate": 4.175428924785828e-05, + "loss": 0.1084, + "num_input_tokens_seen": 13029696, + "step": 61740 + }, + { + "epoch": 6.792629262926293, + "grad_norm": 0.4863645136356354, + "learning_rate": 4.175250782546646e-05, + "loss": 0.1072, + "num_input_tokens_seen": 13030752, + "step": 61745 + }, + { + "epoch": 6.793179317931793, + "grad_norm": 0.03350289911031723, + "learning_rate": 4.175072624867481e-05, + "loss": 0.0112, + "num_input_tokens_seen": 13031712, + "step": 61750 + }, + { + "epoch": 6.793729372937293, + "grad_norm": 0.0353868305683136, + "learning_rate": 4.174894451749977e-05, + "loss": 0.019, + "num_input_tokens_seen": 13032800, + "step": 61755 + }, + { + "epoch": 6.7942794279427945, + "grad_norm": 0.08043862879276276, + "learning_rate": 4.174716263195773e-05, + "loss": 0.0954, + "num_input_tokens_seen": 13033792, + "step": 61760 + }, + { + "epoch": 6.794829482948295, + "grad_norm": 2.25055193901062, + "learning_rate": 4.174538059206513e-05, + "loss": 0.0921, + "num_input_tokens_seen": 13034816, + "step": 61765 + }, + { + "epoch": 6.795379537953796, + "grad_norm": 0.22254151105880737, + "learning_rate": 4.1743598397838405e-05, + "loss": 0.0107, + "num_input_tokens_seen": 13035840, + "step": 61770 + }, + { + "epoch": 6.795929592959296, + "grad_norm": 0.7841948866844177, + "learning_rate": 4.174181604929396e-05, + "loss": 0.0336, + "num_input_tokens_seen": 13036864, + "step": 61775 + }, + { + "epoch": 6.796479647964796, + "grad_norm": 0.03609379753470421, + "learning_rate": 4.174003354644823e-05, + "loss": 0.0208, + "num_input_tokens_seen": 13037920, + "step": 61780 + }, + { + "epoch": 6.797029702970297, + "grad_norm": 0.028627708554267883, + "learning_rate": 4.1738250889317635e-05, + "loss": 0.004, + "num_input_tokens_seen": 13038912, + "step": 61785 + }, + { + "epoch": 6.7975797579757975, + "grad_norm": 0.044485244899988174, + "learning_rate": 4.1736468077918624e-05, + "loss": 0.0426, + "num_input_tokens_seen": 13040000, + "step": 61790 + }, + { + "epoch": 6.798129812981298, + "grad_norm": 0.030761489644646645, + "learning_rate": 4.1734685112267615e-05, + "loss": 0.0493, + "num_input_tokens_seen": 13041120, + "step": 61795 + }, + { + "epoch": 6.798679867986799, + "grad_norm": 0.7079233527183533, + "learning_rate": 4.173290199238105e-05, + "loss": 0.0385, + "num_input_tokens_seen": 13042112, + "step": 61800 + }, + { + "epoch": 6.799229922992299, + "grad_norm": 0.19874657690525055, + "learning_rate": 4.173111871827534e-05, + "loss": 0.0233, + "num_input_tokens_seen": 13043168, + "step": 61805 + }, + { + "epoch": 6.7997799779978, + "grad_norm": 0.009279177524149418, + "learning_rate": 4.1729335289966947e-05, + "loss": 0.0101, + "num_input_tokens_seen": 13044192, + "step": 61810 + }, + { + "epoch": 6.8003300330033, + "grad_norm": 0.38182732462882996, + "learning_rate": 4.17275517074723e-05, + "loss": 0.0758, + "num_input_tokens_seen": 13045184, + "step": 61815 + }, + { + "epoch": 6.8008800880088005, + "grad_norm": 0.07514157146215439, + "learning_rate": 4.1725767970807835e-05, + "loss": 0.0228, + "num_input_tokens_seen": 13046208, + "step": 61820 + }, + { + "epoch": 6.801430143014302, + "grad_norm": 0.008685809560120106, + "learning_rate": 4.172398407998999e-05, + "loss": 0.0659, + "num_input_tokens_seen": 13047264, + "step": 61825 + }, + { + "epoch": 6.801980198019802, + "grad_norm": 0.5332095623016357, + "learning_rate": 4.1722200035035206e-05, + "loss": 0.0993, + "num_input_tokens_seen": 13048320, + "step": 61830 + }, + { + "epoch": 6.802530253025303, + "grad_norm": 0.03453279659152031, + "learning_rate": 4.172041583595994e-05, + "loss": 0.0115, + "num_input_tokens_seen": 13049376, + "step": 61835 + }, + { + "epoch": 6.803080308030803, + "grad_norm": 0.06091571971774101, + "learning_rate": 4.171863148278061e-05, + "loss": 0.0112, + "num_input_tokens_seen": 13050432, + "step": 61840 + }, + { + "epoch": 6.803630363036303, + "grad_norm": 0.03211501985788345, + "learning_rate": 4.171684697551368e-05, + "loss": 0.0394, + "num_input_tokens_seen": 13051520, + "step": 61845 + }, + { + "epoch": 6.804180418041804, + "grad_norm": 0.06047477573156357, + "learning_rate": 4.17150623141756e-05, + "loss": 0.0325, + "num_input_tokens_seen": 13052608, + "step": 61850 + }, + { + "epoch": 6.804730473047305, + "grad_norm": 0.03440360724925995, + "learning_rate": 4.171327749878281e-05, + "loss": 0.013, + "num_input_tokens_seen": 13053696, + "step": 61855 + }, + { + "epoch": 6.805280528052805, + "grad_norm": 1.9443174600601196, + "learning_rate": 4.1711492529351757e-05, + "loss": 0.0909, + "num_input_tokens_seen": 13054784, + "step": 61860 + }, + { + "epoch": 6.805830583058306, + "grad_norm": 0.02344699576497078, + "learning_rate": 4.170970740589889e-05, + "loss": 0.0326, + "num_input_tokens_seen": 13055776, + "step": 61865 + }, + { + "epoch": 6.806380638063806, + "grad_norm": 1.0320206880569458, + "learning_rate": 4.170792212844068e-05, + "loss": 0.0447, + "num_input_tokens_seen": 13056896, + "step": 61870 + }, + { + "epoch": 6.806930693069307, + "grad_norm": 1.225146770477295, + "learning_rate": 4.1706136696993565e-05, + "loss": 0.0531, + "num_input_tokens_seen": 13057856, + "step": 61875 + }, + { + "epoch": 6.807480748074807, + "grad_norm": 0.0712420642375946, + "learning_rate": 4.1704351111574005e-05, + "loss": 0.0215, + "num_input_tokens_seen": 13058848, + "step": 61880 + }, + { + "epoch": 6.8080308030803085, + "grad_norm": 0.2705245316028595, + "learning_rate": 4.1702565372198454e-05, + "loss": 0.0102, + "num_input_tokens_seen": 13059872, + "step": 61885 + }, + { + "epoch": 6.808580858085809, + "grad_norm": 0.6860411167144775, + "learning_rate": 4.1700779478883366e-05, + "loss": 0.0153, + "num_input_tokens_seen": 13060928, + "step": 61890 + }, + { + "epoch": 6.809130913091309, + "grad_norm": 0.06462611258029938, + "learning_rate": 4.169899343164522e-05, + "loss": 0.0232, + "num_input_tokens_seen": 13061952, + "step": 61895 + }, + { + "epoch": 6.80968096809681, + "grad_norm": 0.10313300788402557, + "learning_rate": 4.169720723050046e-05, + "loss": 0.2393, + "num_input_tokens_seen": 13063040, + "step": 61900 + }, + { + "epoch": 6.81023102310231, + "grad_norm": 0.35954922437667847, + "learning_rate": 4.1695420875465555e-05, + "loss": 0.0556, + "num_input_tokens_seen": 13064128, + "step": 61905 + }, + { + "epoch": 6.81078107810781, + "grad_norm": 0.5880876779556274, + "learning_rate": 4.169363436655696e-05, + "loss": 0.0826, + "num_input_tokens_seen": 13065152, + "step": 61910 + }, + { + "epoch": 6.8113311331133115, + "grad_norm": 3.2076313495635986, + "learning_rate": 4.169184770379115e-05, + "loss": 0.0384, + "num_input_tokens_seen": 13066176, + "step": 61915 + }, + { + "epoch": 6.811881188118812, + "grad_norm": 0.029030771926045418, + "learning_rate": 4.169006088718459e-05, + "loss": 0.2032, + "num_input_tokens_seen": 13067264, + "step": 61920 + }, + { + "epoch": 6.812431243124313, + "grad_norm": 0.026035184040665627, + "learning_rate": 4.168827391675375e-05, + "loss": 0.0238, + "num_input_tokens_seen": 13068256, + "step": 61925 + }, + { + "epoch": 6.812981298129813, + "grad_norm": 0.7642486691474915, + "learning_rate": 4.16864867925151e-05, + "loss": 0.0404, + "num_input_tokens_seen": 13069344, + "step": 61930 + }, + { + "epoch": 6.813531353135313, + "grad_norm": 0.7153540849685669, + "learning_rate": 4.16846995144851e-05, + "loss": 0.043, + "num_input_tokens_seen": 13070400, + "step": 61935 + }, + { + "epoch": 6.814081408140814, + "grad_norm": 1.3272645473480225, + "learning_rate": 4.1682912082680234e-05, + "loss": 0.0694, + "num_input_tokens_seen": 13071456, + "step": 61940 + }, + { + "epoch": 6.8146314631463145, + "grad_norm": 0.00938092265278101, + "learning_rate": 4.168112449711698e-05, + "loss": 0.0257, + "num_input_tokens_seen": 13072512, + "step": 61945 + }, + { + "epoch": 6.815181518151816, + "grad_norm": 1.0674079656600952, + "learning_rate": 4.167933675781179e-05, + "loss": 0.0642, + "num_input_tokens_seen": 13073536, + "step": 61950 + }, + { + "epoch": 6.815731573157316, + "grad_norm": 0.017859041690826416, + "learning_rate": 4.167754886478117e-05, + "loss": 0.0186, + "num_input_tokens_seen": 13074624, + "step": 61955 + }, + { + "epoch": 6.816281628162816, + "grad_norm": 0.28720980882644653, + "learning_rate": 4.167576081804158e-05, + "loss": 0.0224, + "num_input_tokens_seen": 13075680, + "step": 61960 + }, + { + "epoch": 6.816831683168317, + "grad_norm": 0.4750230312347412, + "learning_rate": 4.167397261760951e-05, + "loss": 0.0191, + "num_input_tokens_seen": 13076704, + "step": 61965 + }, + { + "epoch": 6.817381738173817, + "grad_norm": 0.03440608084201813, + "learning_rate": 4.1672184263501426e-05, + "loss": 0.0357, + "num_input_tokens_seen": 13077760, + "step": 61970 + }, + { + "epoch": 6.8179317931793175, + "grad_norm": 0.008713740855455399, + "learning_rate": 4.167039575573383e-05, + "loss": 0.0056, + "num_input_tokens_seen": 13078848, + "step": 61975 + }, + { + "epoch": 6.818481848184819, + "grad_norm": 0.37627536058425903, + "learning_rate": 4.1668607094323185e-05, + "loss": 0.0144, + "num_input_tokens_seen": 13079968, + "step": 61980 + }, + { + "epoch": 6.819031903190319, + "grad_norm": 1.7991900444030762, + "learning_rate": 4.166681827928599e-05, + "loss": 0.086, + "num_input_tokens_seen": 13080960, + "step": 61985 + }, + { + "epoch": 6.81958195819582, + "grad_norm": 0.013909044675529003, + "learning_rate": 4.166502931063873e-05, + "loss": 0.0099, + "num_input_tokens_seen": 13082016, + "step": 61990 + }, + { + "epoch": 6.82013201320132, + "grad_norm": 0.043973300606012344, + "learning_rate": 4.166324018839789e-05, + "loss": 0.1253, + "num_input_tokens_seen": 13083136, + "step": 61995 + }, + { + "epoch": 6.82068206820682, + "grad_norm": 0.9794159531593323, + "learning_rate": 4.1661450912579966e-05, + "loss": 0.0147, + "num_input_tokens_seen": 13084224, + "step": 62000 + }, + { + "epoch": 6.821232123212321, + "grad_norm": 0.7834392189979553, + "learning_rate": 4.165966148320144e-05, + "loss": 0.0736, + "num_input_tokens_seen": 13085280, + "step": 62005 + }, + { + "epoch": 6.821782178217822, + "grad_norm": 0.6109846830368042, + "learning_rate": 4.165787190027881e-05, + "loss": 0.0341, + "num_input_tokens_seen": 13086336, + "step": 62010 + }, + { + "epoch": 6.822332233223323, + "grad_norm": 0.220127135515213, + "learning_rate": 4.1656082163828566e-05, + "loss": 0.0255, + "num_input_tokens_seen": 13087360, + "step": 62015 + }, + { + "epoch": 6.822882288228823, + "grad_norm": 0.04566657915711403, + "learning_rate": 4.16542922738672e-05, + "loss": 0.0263, + "num_input_tokens_seen": 13088448, + "step": 62020 + }, + { + "epoch": 6.823432343234323, + "grad_norm": 0.4695199728012085, + "learning_rate": 4.165250223041122e-05, + "loss": 0.032, + "num_input_tokens_seen": 13089440, + "step": 62025 + }, + { + "epoch": 6.823982398239824, + "grad_norm": 0.34615692496299744, + "learning_rate": 4.165071203347711e-05, + "loss": 0.0167, + "num_input_tokens_seen": 13090464, + "step": 62030 + }, + { + "epoch": 6.824532453245324, + "grad_norm": 0.0266606155782938, + "learning_rate": 4.164892168308139e-05, + "loss": 0.0117, + "num_input_tokens_seen": 13091552, + "step": 62035 + }, + { + "epoch": 6.825082508250825, + "grad_norm": 0.09858661144971848, + "learning_rate": 4.164713117924054e-05, + "loss": 0.0075, + "num_input_tokens_seen": 13092608, + "step": 62040 + }, + { + "epoch": 6.825632563256326, + "grad_norm": 0.11503313481807709, + "learning_rate": 4.164534052197108e-05, + "loss": 0.0134, + "num_input_tokens_seen": 13093696, + "step": 62045 + }, + { + "epoch": 6.826182618261826, + "grad_norm": 0.4553314745426178, + "learning_rate": 4.164354971128949e-05, + "loss": 0.0321, + "num_input_tokens_seen": 13094720, + "step": 62050 + }, + { + "epoch": 6.826732673267327, + "grad_norm": 0.7020336985588074, + "learning_rate": 4.1641758747212286e-05, + "loss": 0.142, + "num_input_tokens_seen": 13095776, + "step": 62055 + }, + { + "epoch": 6.827282728272827, + "grad_norm": 0.48968204855918884, + "learning_rate": 4.1639967629756e-05, + "loss": 0.0136, + "num_input_tokens_seen": 13096896, + "step": 62060 + }, + { + "epoch": 6.827832783278328, + "grad_norm": 0.8302322030067444, + "learning_rate": 4.1638176358937094e-05, + "loss": 0.042, + "num_input_tokens_seen": 13098048, + "step": 62065 + }, + { + "epoch": 6.8283828382838285, + "grad_norm": 0.41536790132522583, + "learning_rate": 4.1636384934772106e-05, + "loss": 0.0517, + "num_input_tokens_seen": 13099072, + "step": 62070 + }, + { + "epoch": 6.828932893289329, + "grad_norm": 0.18654312193393707, + "learning_rate": 4.163459335727754e-05, + "loss": 0.0124, + "num_input_tokens_seen": 13100128, + "step": 62075 + }, + { + "epoch": 6.82948294829483, + "grad_norm": 1.1436113119125366, + "learning_rate": 4.1632801626469906e-05, + "loss": 0.1179, + "num_input_tokens_seen": 13101216, + "step": 62080 + }, + { + "epoch": 6.83003300330033, + "grad_norm": 0.015329958871006966, + "learning_rate": 4.1631009742365736e-05, + "loss": 0.0093, + "num_input_tokens_seen": 13102336, + "step": 62085 + }, + { + "epoch": 6.83058305830583, + "grad_norm": 0.16946327686309814, + "learning_rate": 4.1629217704981515e-05, + "loss": 0.1197, + "num_input_tokens_seen": 13103360, + "step": 62090 + }, + { + "epoch": 6.831133113311331, + "grad_norm": 1.28866446018219, + "learning_rate": 4.162742551433378e-05, + "loss": 0.0655, + "num_input_tokens_seen": 13104384, + "step": 62095 + }, + { + "epoch": 6.8316831683168315, + "grad_norm": 0.06742633134126663, + "learning_rate": 4.1625633170439035e-05, + "loss": 0.0892, + "num_input_tokens_seen": 13105440, + "step": 62100 + }, + { + "epoch": 6.832233223322332, + "grad_norm": 0.0194157175719738, + "learning_rate": 4.162384067331382e-05, + "loss": 0.0034, + "num_input_tokens_seen": 13106464, + "step": 62105 + }, + { + "epoch": 6.832783278327833, + "grad_norm": 0.10408160835504532, + "learning_rate": 4.162204802297464e-05, + "loss": 0.0366, + "num_input_tokens_seen": 13107584, + "step": 62110 + }, + { + "epoch": 6.833333333333333, + "grad_norm": 0.23188218474388123, + "learning_rate": 4.162025521943801e-05, + "loss": 0.0872, + "num_input_tokens_seen": 13108672, + "step": 62115 + }, + { + "epoch": 6.833883388338834, + "grad_norm": 0.005703374743461609, + "learning_rate": 4.161846226272047e-05, + "loss": 0.0416, + "num_input_tokens_seen": 13109728, + "step": 62120 + }, + { + "epoch": 6.834433443344334, + "grad_norm": 1.0029774904251099, + "learning_rate": 4.1616669152838536e-05, + "loss": 0.0629, + "num_input_tokens_seen": 13110784, + "step": 62125 + }, + { + "epoch": 6.834983498349835, + "grad_norm": 1.1838234663009644, + "learning_rate": 4.161487588980873e-05, + "loss": 0.0308, + "num_input_tokens_seen": 13111840, + "step": 62130 + }, + { + "epoch": 6.835533553355336, + "grad_norm": 0.5318888425827026, + "learning_rate": 4.16130824736476e-05, + "loss": 0.1412, + "num_input_tokens_seen": 13112832, + "step": 62135 + }, + { + "epoch": 6.836083608360836, + "grad_norm": 1.992466926574707, + "learning_rate": 4.161128890437165e-05, + "loss": 0.0897, + "num_input_tokens_seen": 13113888, + "step": 62140 + }, + { + "epoch": 6.836633663366337, + "grad_norm": 0.030295399948954582, + "learning_rate": 4.1609495181997424e-05, + "loss": 0.0648, + "num_input_tokens_seen": 13114976, + "step": 62145 + }, + { + "epoch": 6.837183718371837, + "grad_norm": 0.07579342275857925, + "learning_rate": 4.160770130654145e-05, + "loss": 0.0294, + "num_input_tokens_seen": 13116000, + "step": 62150 + }, + { + "epoch": 6.837733773377337, + "grad_norm": 0.32115882635116577, + "learning_rate": 4.160590727802026e-05, + "loss": 0.0072, + "num_input_tokens_seen": 13117120, + "step": 62155 + }, + { + "epoch": 6.838283828382838, + "grad_norm": 0.10355987399816513, + "learning_rate": 4.1604113096450394e-05, + "loss": 0.0365, + "num_input_tokens_seen": 13118208, + "step": 62160 + }, + { + "epoch": 6.838833883388339, + "grad_norm": 0.01640700176358223, + "learning_rate": 4.160231876184838e-05, + "loss": 0.0452, + "num_input_tokens_seen": 13119264, + "step": 62165 + }, + { + "epoch": 6.83938393839384, + "grad_norm": 0.12572135031223297, + "learning_rate": 4.160052427423077e-05, + "loss": 0.0493, + "num_input_tokens_seen": 13120288, + "step": 62170 + }, + { + "epoch": 6.83993399339934, + "grad_norm": 0.4876473844051361, + "learning_rate": 4.159872963361408e-05, + "loss": 0.0526, + "num_input_tokens_seen": 13121344, + "step": 62175 + }, + { + "epoch": 6.84048404840484, + "grad_norm": 0.13093797862529755, + "learning_rate": 4.159693484001488e-05, + "loss": 0.1416, + "num_input_tokens_seen": 13122336, + "step": 62180 + }, + { + "epoch": 6.841034103410341, + "grad_norm": 0.03444826975464821, + "learning_rate": 4.1595139893449686e-05, + "loss": 0.0888, + "num_input_tokens_seen": 13123424, + "step": 62185 + }, + { + "epoch": 6.841584158415841, + "grad_norm": 0.3919450342655182, + "learning_rate": 4.159334479393505e-05, + "loss": 0.0191, + "num_input_tokens_seen": 13124448, + "step": 62190 + }, + { + "epoch": 6.8421342134213425, + "grad_norm": 0.32968559861183167, + "learning_rate": 4.159154954148752e-05, + "loss": 0.0117, + "num_input_tokens_seen": 13125536, + "step": 62195 + }, + { + "epoch": 6.842684268426843, + "grad_norm": 1.0073788166046143, + "learning_rate": 4.158975413612363e-05, + "loss": 0.1073, + "num_input_tokens_seen": 13126624, + "step": 62200 + }, + { + "epoch": 6.843234323432343, + "grad_norm": 0.054044902324676514, + "learning_rate": 4.158795857785995e-05, + "loss": 0.0448, + "num_input_tokens_seen": 13127648, + "step": 62205 + }, + { + "epoch": 6.843784378437844, + "grad_norm": 0.032216593623161316, + "learning_rate": 4.158616286671301e-05, + "loss": 0.0485, + "num_input_tokens_seen": 13128736, + "step": 62210 + }, + { + "epoch": 6.844334433443344, + "grad_norm": 0.04735838994383812, + "learning_rate": 4.158436700269936e-05, + "loss": 0.0296, + "num_input_tokens_seen": 13129696, + "step": 62215 + }, + { + "epoch": 6.8448844884488445, + "grad_norm": 0.02293168380856514, + "learning_rate": 4.158257098583557e-05, + "loss": 0.0409, + "num_input_tokens_seen": 13130720, + "step": 62220 + }, + { + "epoch": 6.8454345434543455, + "grad_norm": 0.4702245593070984, + "learning_rate": 4.158077481613817e-05, + "loss": 0.1551, + "num_input_tokens_seen": 13131776, + "step": 62225 + }, + { + "epoch": 6.845984598459846, + "grad_norm": 0.1642719805240631, + "learning_rate": 4.157897849362373e-05, + "loss": 0.0768, + "num_input_tokens_seen": 13132896, + "step": 62230 + }, + { + "epoch": 6.846534653465347, + "grad_norm": 0.024356847628951073, + "learning_rate": 4.15771820183088e-05, + "loss": 0.0146, + "num_input_tokens_seen": 13134048, + "step": 62235 + }, + { + "epoch": 6.847084708470847, + "grad_norm": 0.5812610983848572, + "learning_rate": 4.157538539020994e-05, + "loss": 0.122, + "num_input_tokens_seen": 13135104, + "step": 62240 + }, + { + "epoch": 6.847634763476347, + "grad_norm": 0.05689754709601402, + "learning_rate": 4.15735886093437e-05, + "loss": 0.0103, + "num_input_tokens_seen": 13136128, + "step": 62245 + }, + { + "epoch": 6.848184818481848, + "grad_norm": 0.015111548826098442, + "learning_rate": 4.157179167572665e-05, + "loss": 0.0445, + "num_input_tokens_seen": 13137184, + "step": 62250 + }, + { + "epoch": 6.8487348734873486, + "grad_norm": 1.1679474115371704, + "learning_rate": 4.156999458937535e-05, + "loss": 0.1447, + "num_input_tokens_seen": 13138240, + "step": 62255 + }, + { + "epoch": 6.84928492849285, + "grad_norm": 1.0103892087936401, + "learning_rate": 4.156819735030636e-05, + "loss": 0.13, + "num_input_tokens_seen": 13139328, + "step": 62260 + }, + { + "epoch": 6.84983498349835, + "grad_norm": 0.535938560962677, + "learning_rate": 4.156639995853624e-05, + "loss": 0.0306, + "num_input_tokens_seen": 13140320, + "step": 62265 + }, + { + "epoch": 6.85038503850385, + "grad_norm": 0.11934306472539902, + "learning_rate": 4.156460241408157e-05, + "loss": 0.0244, + "num_input_tokens_seen": 13141344, + "step": 62270 + }, + { + "epoch": 6.850935093509351, + "grad_norm": 1.9505336284637451, + "learning_rate": 4.15628047169589e-05, + "loss": 0.0815, + "num_input_tokens_seen": 13142432, + "step": 62275 + }, + { + "epoch": 6.851485148514851, + "grad_norm": 0.043840110301971436, + "learning_rate": 4.156100686718481e-05, + "loss": 0.0471, + "num_input_tokens_seen": 13143488, + "step": 62280 + }, + { + "epoch": 6.852035203520352, + "grad_norm": 0.011152603663504124, + "learning_rate": 4.155920886477587e-05, + "loss": 0.0057, + "num_input_tokens_seen": 13144512, + "step": 62285 + }, + { + "epoch": 6.852585258525853, + "grad_norm": 0.48838987946510315, + "learning_rate": 4.155741070974864e-05, + "loss": 0.016, + "num_input_tokens_seen": 13145568, + "step": 62290 + }, + { + "epoch": 6.853135313531353, + "grad_norm": 0.04075268283486366, + "learning_rate": 4.15556124021197e-05, + "loss": 0.0093, + "num_input_tokens_seen": 13146560, + "step": 62295 + }, + { + "epoch": 6.853685368536854, + "grad_norm": 0.25015032291412354, + "learning_rate": 4.155381394190564e-05, + "loss": 0.0107, + "num_input_tokens_seen": 13147648, + "step": 62300 + }, + { + "epoch": 6.854235423542354, + "grad_norm": 0.023325610905885696, + "learning_rate": 4.1552015329123006e-05, + "loss": 0.0222, + "num_input_tokens_seen": 13148672, + "step": 62305 + }, + { + "epoch": 6.854785478547855, + "grad_norm": 0.034544724971055984, + "learning_rate": 4.1550216563788394e-05, + "loss": 0.0788, + "num_input_tokens_seen": 13149664, + "step": 62310 + }, + { + "epoch": 6.8553355335533555, + "grad_norm": 0.9118666648864746, + "learning_rate": 4.154841764591837e-05, + "loss": 0.1086, + "num_input_tokens_seen": 13150752, + "step": 62315 + }, + { + "epoch": 6.855885588558856, + "grad_norm": 0.07009749859571457, + "learning_rate": 4.154661857552953e-05, + "loss": 0.028, + "num_input_tokens_seen": 13151808, + "step": 62320 + }, + { + "epoch": 6.856435643564357, + "grad_norm": 0.05335605517029762, + "learning_rate": 4.1544819352638445e-05, + "loss": 0.0146, + "num_input_tokens_seen": 13152864, + "step": 62325 + }, + { + "epoch": 6.856985698569857, + "grad_norm": 1.6838635206222534, + "learning_rate": 4.15430199772617e-05, + "loss": 0.1659, + "num_input_tokens_seen": 13153920, + "step": 62330 + }, + { + "epoch": 6.857535753575357, + "grad_norm": 0.06075122952461243, + "learning_rate": 4.154122044941587e-05, + "loss": 0.0446, + "num_input_tokens_seen": 13154976, + "step": 62335 + }, + { + "epoch": 6.858085808580858, + "grad_norm": 0.4635102450847626, + "learning_rate": 4.1539420769117557e-05, + "loss": 0.0584, + "num_input_tokens_seen": 13156064, + "step": 62340 + }, + { + "epoch": 6.8586358635863585, + "grad_norm": 0.03345303609967232, + "learning_rate": 4.153762093638334e-05, + "loss": 0.0805, + "num_input_tokens_seen": 13157088, + "step": 62345 + }, + { + "epoch": 6.8591859185918596, + "grad_norm": 0.6352887153625488, + "learning_rate": 4.1535820951229795e-05, + "loss": 0.0464, + "num_input_tokens_seen": 13158112, + "step": 62350 + }, + { + "epoch": 6.85973597359736, + "grad_norm": 1.201701045036316, + "learning_rate": 4.153402081367353e-05, + "loss": 0.0753, + "num_input_tokens_seen": 13159168, + "step": 62355 + }, + { + "epoch": 6.86028602860286, + "grad_norm": 0.32974424958229065, + "learning_rate": 4.153222052373113e-05, + "loss": 0.0191, + "num_input_tokens_seen": 13160256, + "step": 62360 + }, + { + "epoch": 6.860836083608361, + "grad_norm": 0.012806355953216553, + "learning_rate": 4.1530420081419185e-05, + "loss": 0.0472, + "num_input_tokens_seen": 13161344, + "step": 62365 + }, + { + "epoch": 6.861386138613861, + "grad_norm": 0.021672656759619713, + "learning_rate": 4.152861948675429e-05, + "loss": 0.0529, + "num_input_tokens_seen": 13162464, + "step": 62370 + }, + { + "epoch": 6.861936193619362, + "grad_norm": 0.5501694083213806, + "learning_rate": 4.1526818739753046e-05, + "loss": 0.0408, + "num_input_tokens_seen": 13163584, + "step": 62375 + }, + { + "epoch": 6.862486248624863, + "grad_norm": 0.2438572198152542, + "learning_rate": 4.152501784043203e-05, + "loss": 0.0192, + "num_input_tokens_seen": 13164608, + "step": 62380 + }, + { + "epoch": 6.863036303630363, + "grad_norm": 0.22000324726104736, + "learning_rate": 4.1523216788807875e-05, + "loss": 0.0385, + "num_input_tokens_seen": 13165600, + "step": 62385 + }, + { + "epoch": 6.863586358635864, + "grad_norm": 0.20443172752857208, + "learning_rate": 4.152141558489714e-05, + "loss": 0.0197, + "num_input_tokens_seen": 13166624, + "step": 62390 + }, + { + "epoch": 6.864136413641364, + "grad_norm": 0.01967739313840866, + "learning_rate": 4.151961422871646e-05, + "loss": 0.0333, + "num_input_tokens_seen": 13167648, + "step": 62395 + }, + { + "epoch": 6.864686468646864, + "grad_norm": 0.1316366046667099, + "learning_rate": 4.151781272028241e-05, + "loss": 0.0169, + "num_input_tokens_seen": 13168704, + "step": 62400 + }, + { + "epoch": 6.865236523652365, + "grad_norm": 0.18160848319530487, + "learning_rate": 4.1516011059611614e-05, + "loss": 0.0698, + "num_input_tokens_seen": 13169696, + "step": 62405 + }, + { + "epoch": 6.865786578657866, + "grad_norm": 1.2403147220611572, + "learning_rate": 4.151420924672067e-05, + "loss": 0.073, + "num_input_tokens_seen": 13170752, + "step": 62410 + }, + { + "epoch": 6.866336633663367, + "grad_norm": 0.3891199827194214, + "learning_rate": 4.1512407281626175e-05, + "loss": 0.0362, + "num_input_tokens_seen": 13171904, + "step": 62415 + }, + { + "epoch": 6.866886688668867, + "grad_norm": 0.051649656146764755, + "learning_rate": 4.151060516434475e-05, + "loss": 0.0981, + "num_input_tokens_seen": 13173024, + "step": 62420 + }, + { + "epoch": 6.867436743674367, + "grad_norm": 0.09628427773714066, + "learning_rate": 4.150880289489301e-05, + "loss": 0.0361, + "num_input_tokens_seen": 13174048, + "step": 62425 + }, + { + "epoch": 6.867986798679868, + "grad_norm": 0.043947894126176834, + "learning_rate": 4.150700047328754e-05, + "loss": 0.0027, + "num_input_tokens_seen": 13175104, + "step": 62430 + }, + { + "epoch": 6.868536853685368, + "grad_norm": 0.5530372858047485, + "learning_rate": 4.1505197899544974e-05, + "loss": 0.0588, + "num_input_tokens_seen": 13176256, + "step": 62435 + }, + { + "epoch": 6.8690869086908695, + "grad_norm": 0.024071943014860153, + "learning_rate": 4.1503395173681917e-05, + "loss": 0.069, + "num_input_tokens_seen": 13177312, + "step": 62440 + }, + { + "epoch": 6.86963696369637, + "grad_norm": 0.16531158983707428, + "learning_rate": 4.150159229571499e-05, + "loss": 0.0376, + "num_input_tokens_seen": 13178368, + "step": 62445 + }, + { + "epoch": 6.87018701870187, + "grad_norm": 1.095631718635559, + "learning_rate": 4.149978926566081e-05, + "loss": 0.0612, + "num_input_tokens_seen": 13179424, + "step": 62450 + }, + { + "epoch": 6.870737073707371, + "grad_norm": 0.019762910902500153, + "learning_rate": 4.149798608353598e-05, + "loss": 0.0242, + "num_input_tokens_seen": 13180448, + "step": 62455 + }, + { + "epoch": 6.871287128712871, + "grad_norm": 1.6665890216827393, + "learning_rate": 4.149618274935713e-05, + "loss": 0.061, + "num_input_tokens_seen": 13181536, + "step": 62460 + }, + { + "epoch": 6.871837183718371, + "grad_norm": 0.5847460031509399, + "learning_rate": 4.149437926314088e-05, + "loss": 0.0585, + "num_input_tokens_seen": 13182592, + "step": 62465 + }, + { + "epoch": 6.8723872387238725, + "grad_norm": 0.2693309485912323, + "learning_rate": 4.149257562490385e-05, + "loss": 0.0078, + "num_input_tokens_seen": 13183616, + "step": 62470 + }, + { + "epoch": 6.872937293729373, + "grad_norm": 0.017036493867635727, + "learning_rate": 4.149077183466267e-05, + "loss": 0.0255, + "num_input_tokens_seen": 13184672, + "step": 62475 + }, + { + "epoch": 6.873487348734874, + "grad_norm": 0.00838506780564785, + "learning_rate": 4.148896789243395e-05, + "loss": 0.0025, + "num_input_tokens_seen": 13185728, + "step": 62480 + }, + { + "epoch": 6.874037403740374, + "grad_norm": 0.18514809012413025, + "learning_rate": 4.148716379823433e-05, + "loss": 0.0359, + "num_input_tokens_seen": 13186784, + "step": 62485 + }, + { + "epoch": 6.874587458745875, + "grad_norm": 0.02733270823955536, + "learning_rate": 4.1485359552080424e-05, + "loss": 0.044, + "num_input_tokens_seen": 13187808, + "step": 62490 + }, + { + "epoch": 6.875137513751375, + "grad_norm": 0.21183602511882782, + "learning_rate": 4.148355515398888e-05, + "loss": 0.0232, + "num_input_tokens_seen": 13188864, + "step": 62495 + }, + { + "epoch": 6.8756875687568755, + "grad_norm": 0.19426178932189941, + "learning_rate": 4.1481750603976324e-05, + "loss": 0.0118, + "num_input_tokens_seen": 13189920, + "step": 62500 + }, + { + "epoch": 6.876237623762377, + "grad_norm": 0.29404744505882263, + "learning_rate": 4.147994590205937e-05, + "loss": 0.0242, + "num_input_tokens_seen": 13190912, + "step": 62505 + }, + { + "epoch": 6.876787678767877, + "grad_norm": 0.033968377858400345, + "learning_rate": 4.147814104825466e-05, + "loss": 0.0233, + "num_input_tokens_seen": 13191936, + "step": 62510 + }, + { + "epoch": 6.877337733773377, + "grad_norm": 0.5295193791389465, + "learning_rate": 4.147633604257884e-05, + "loss": 0.0159, + "num_input_tokens_seen": 13193024, + "step": 62515 + }, + { + "epoch": 6.877887788778878, + "grad_norm": 0.1415971964597702, + "learning_rate": 4.147453088504854e-05, + "loss": 0.0238, + "num_input_tokens_seen": 13194016, + "step": 62520 + }, + { + "epoch": 6.878437843784378, + "grad_norm": 1.1958954334259033, + "learning_rate": 4.147272557568038e-05, + "loss": 0.0217, + "num_input_tokens_seen": 13195072, + "step": 62525 + }, + { + "epoch": 6.878987898789879, + "grad_norm": 0.01061342190951109, + "learning_rate": 4.147092011449102e-05, + "loss": 0.1093, + "num_input_tokens_seen": 13196096, + "step": 62530 + }, + { + "epoch": 6.87953795379538, + "grad_norm": 0.010709188878536224, + "learning_rate": 4.1469114501497096e-05, + "loss": 0.0499, + "num_input_tokens_seen": 13197120, + "step": 62535 + }, + { + "epoch": 6.88008800880088, + "grad_norm": 0.07466117292642593, + "learning_rate": 4.1467308736715245e-05, + "loss": 0.0118, + "num_input_tokens_seen": 13198176, + "step": 62540 + }, + { + "epoch": 6.880638063806381, + "grad_norm": 0.28372320532798767, + "learning_rate": 4.1465502820162104e-05, + "loss": 0.0238, + "num_input_tokens_seen": 13199232, + "step": 62545 + }, + { + "epoch": 6.881188118811881, + "grad_norm": 1.2259067296981812, + "learning_rate": 4.146369675185433e-05, + "loss": 0.0727, + "num_input_tokens_seen": 13200320, + "step": 62550 + }, + { + "epoch": 6.881738173817382, + "grad_norm": 0.44244760274887085, + "learning_rate": 4.146189053180857e-05, + "loss": 0.0678, + "num_input_tokens_seen": 13201408, + "step": 62555 + }, + { + "epoch": 6.882288228822882, + "grad_norm": 0.02863328717648983, + "learning_rate": 4.146008416004146e-05, + "loss": 0.0845, + "num_input_tokens_seen": 13202432, + "step": 62560 + }, + { + "epoch": 6.882838283828383, + "grad_norm": 0.017634527757763863, + "learning_rate": 4.145827763656964e-05, + "loss": 0.013, + "num_input_tokens_seen": 13203584, + "step": 62565 + }, + { + "epoch": 6.883388338833884, + "grad_norm": 0.08198315650224686, + "learning_rate": 4.145647096140979e-05, + "loss": 0.1234, + "num_input_tokens_seen": 13204544, + "step": 62570 + }, + { + "epoch": 6.883938393839384, + "grad_norm": 0.018607860431075096, + "learning_rate": 4.145466413457853e-05, + "loss": 0.0179, + "num_input_tokens_seen": 13205600, + "step": 62575 + }, + { + "epoch": 6.884488448844884, + "grad_norm": 0.005129750352352858, + "learning_rate": 4.1452857156092526e-05, + "loss": 0.0803, + "num_input_tokens_seen": 13206624, + "step": 62580 + }, + { + "epoch": 6.885038503850385, + "grad_norm": 0.056361570954322815, + "learning_rate": 4.145105002596845e-05, + "loss": 0.0188, + "num_input_tokens_seen": 13207680, + "step": 62585 + }, + { + "epoch": 6.885588558855885, + "grad_norm": 0.07046183943748474, + "learning_rate": 4.1449242744222925e-05, + "loss": 0.0861, + "num_input_tokens_seen": 13208768, + "step": 62590 + }, + { + "epoch": 6.8861386138613865, + "grad_norm": 0.04105724021792412, + "learning_rate": 4.1447435310872626e-05, + "loss": 0.0126, + "num_input_tokens_seen": 13209824, + "step": 62595 + }, + { + "epoch": 6.886688668866887, + "grad_norm": 0.07707682996988297, + "learning_rate": 4.144562772593421e-05, + "loss": 0.0046, + "num_input_tokens_seen": 13210944, + "step": 62600 + }, + { + "epoch": 6.887238723872387, + "grad_norm": 0.292665034532547, + "learning_rate": 4.1443819989424325e-05, + "loss": 0.0583, + "num_input_tokens_seen": 13212000, + "step": 62605 + }, + { + "epoch": 6.887788778877888, + "grad_norm": 0.01663454808294773, + "learning_rate": 4.1442012101359656e-05, + "loss": 0.016, + "num_input_tokens_seen": 13213088, + "step": 62610 + }, + { + "epoch": 6.888338833883388, + "grad_norm": 0.6957768201828003, + "learning_rate": 4.144020406175684e-05, + "loss": 0.0177, + "num_input_tokens_seen": 13214144, + "step": 62615 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 1.240728497505188, + "learning_rate": 4.1438395870632565e-05, + "loss": 0.0338, + "num_input_tokens_seen": 13215136, + "step": 62620 + }, + { + "epoch": 6.8894389438943895, + "grad_norm": 0.18754753470420837, + "learning_rate": 4.143658752800347e-05, + "loss": 0.0389, + "num_input_tokens_seen": 13216192, + "step": 62625 + }, + { + "epoch": 6.88998899889989, + "grad_norm": 0.6783047914505005, + "learning_rate": 4.143477903388624e-05, + "loss": 0.0594, + "num_input_tokens_seen": 13217344, + "step": 62630 + }, + { + "epoch": 6.890539053905391, + "grad_norm": 0.9282431602478027, + "learning_rate": 4.143297038829754e-05, + "loss": 0.0533, + "num_input_tokens_seen": 13218336, + "step": 62635 + }, + { + "epoch": 6.891089108910891, + "grad_norm": 0.12408780306577682, + "learning_rate": 4.143116159125403e-05, + "loss": 0.0251, + "num_input_tokens_seen": 13219424, + "step": 62640 + }, + { + "epoch": 6.891639163916391, + "grad_norm": 0.07214757055044174, + "learning_rate": 4.142935264277239e-05, + "loss": 0.0238, + "num_input_tokens_seen": 13220448, + "step": 62645 + }, + { + "epoch": 6.892189218921892, + "grad_norm": 0.21261073648929596, + "learning_rate": 4.14275435428693e-05, + "loss": 0.076, + "num_input_tokens_seen": 13221472, + "step": 62650 + }, + { + "epoch": 6.8927392739273925, + "grad_norm": 0.1522454470396042, + "learning_rate": 4.142573429156141e-05, + "loss": 0.168, + "num_input_tokens_seen": 13222464, + "step": 62655 + }, + { + "epoch": 6.893289328932894, + "grad_norm": 0.5488570928573608, + "learning_rate": 4.142392488886542e-05, + "loss": 0.1182, + "num_input_tokens_seen": 13223520, + "step": 62660 + }, + { + "epoch": 6.893839383938394, + "grad_norm": 0.2940983474254608, + "learning_rate": 4.1422115334798e-05, + "loss": 0.0832, + "num_input_tokens_seen": 13224608, + "step": 62665 + }, + { + "epoch": 6.894389438943895, + "grad_norm": 0.9467594623565674, + "learning_rate": 4.1420305629375804e-05, + "loss": 0.0326, + "num_input_tokens_seen": 13225728, + "step": 62670 + }, + { + "epoch": 6.894939493949395, + "grad_norm": 0.7030537724494934, + "learning_rate": 4.141849577261554e-05, + "loss": 0.0713, + "num_input_tokens_seen": 13226816, + "step": 62675 + }, + { + "epoch": 6.895489548954895, + "grad_norm": 0.6855475902557373, + "learning_rate": 4.141668576453388e-05, + "loss": 0.0402, + "num_input_tokens_seen": 13227840, + "step": 62680 + }, + { + "epoch": 6.896039603960396, + "grad_norm": 0.1006854996085167, + "learning_rate": 4.1414875605147504e-05, + "loss": 0.0239, + "num_input_tokens_seen": 13228864, + "step": 62685 + }, + { + "epoch": 6.896589658965897, + "grad_norm": 0.03689059615135193, + "learning_rate": 4.1413065294473094e-05, + "loss": 0.0374, + "num_input_tokens_seen": 13229888, + "step": 62690 + }, + { + "epoch": 6.897139713971397, + "grad_norm": 0.020670650526881218, + "learning_rate": 4.141125483252735e-05, + "loss": 0.0282, + "num_input_tokens_seen": 13230912, + "step": 62695 + }, + { + "epoch": 6.897689768976898, + "grad_norm": 0.1261989325284958, + "learning_rate": 4.140944421932693e-05, + "loss": 0.066, + "num_input_tokens_seen": 13231936, + "step": 62700 + }, + { + "epoch": 6.898239823982398, + "grad_norm": 0.024299317970871925, + "learning_rate": 4.140763345488854e-05, + "loss": 0.015, + "num_input_tokens_seen": 13232992, + "step": 62705 + }, + { + "epoch": 6.898789878987898, + "grad_norm": 0.024711860343813896, + "learning_rate": 4.140582253922887e-05, + "loss": 0.0795, + "num_input_tokens_seen": 13234016, + "step": 62710 + }, + { + "epoch": 6.899339933993399, + "grad_norm": 0.18280799686908722, + "learning_rate": 4.1404011472364595e-05, + "loss": 0.0149, + "num_input_tokens_seen": 13235104, + "step": 62715 + }, + { + "epoch": 6.8998899889989, + "grad_norm": 1.7742211818695068, + "learning_rate": 4.140220025431243e-05, + "loss": 0.0332, + "num_input_tokens_seen": 13236160, + "step": 62720 + }, + { + "epoch": 6.900440044004401, + "grad_norm": 0.1732673943042755, + "learning_rate": 4.1400388885089045e-05, + "loss": 0.0578, + "num_input_tokens_seen": 13237184, + "step": 62725 + }, + { + "epoch": 6.900990099009901, + "grad_norm": 0.0368824228644371, + "learning_rate": 4.1398577364711154e-05, + "loss": 0.0825, + "num_input_tokens_seen": 13238208, + "step": 62730 + }, + { + "epoch": 6.901540154015402, + "grad_norm": 0.8933115005493164, + "learning_rate": 4.139676569319544e-05, + "loss": 0.0585, + "num_input_tokens_seen": 13239200, + "step": 62735 + }, + { + "epoch": 6.902090209020902, + "grad_norm": 0.13341648876667023, + "learning_rate": 4.139495387055861e-05, + "loss": 0.0227, + "num_input_tokens_seen": 13240256, + "step": 62740 + }, + { + "epoch": 6.902640264026402, + "grad_norm": 0.02653578855097294, + "learning_rate": 4.139314189681735e-05, + "loss": 0.0336, + "num_input_tokens_seen": 13241312, + "step": 62745 + }, + { + "epoch": 6.9031903190319035, + "grad_norm": 0.14267204701900482, + "learning_rate": 4.139132977198837e-05, + "loss": 0.0108, + "num_input_tokens_seen": 13242304, + "step": 62750 + }, + { + "epoch": 6.903740374037404, + "grad_norm": 0.15614064037799835, + "learning_rate": 4.138951749608837e-05, + "loss": 0.0212, + "num_input_tokens_seen": 13243392, + "step": 62755 + }, + { + "epoch": 6.904290429042904, + "grad_norm": 0.09162081778049469, + "learning_rate": 4.1387705069134054e-05, + "loss": 0.1138, + "num_input_tokens_seen": 13244384, + "step": 62760 + }, + { + "epoch": 6.904840484048405, + "grad_norm": 0.030041325837373734, + "learning_rate": 4.138589249114212e-05, + "loss": 0.098, + "num_input_tokens_seen": 13245472, + "step": 62765 + }, + { + "epoch": 6.905390539053905, + "grad_norm": 0.08421000838279724, + "learning_rate": 4.138407976212928e-05, + "loss": 0.0253, + "num_input_tokens_seen": 13246496, + "step": 62770 + }, + { + "epoch": 6.905940594059406, + "grad_norm": 1.099539875984192, + "learning_rate": 4.138226688211223e-05, + "loss": 0.0392, + "num_input_tokens_seen": 13247552, + "step": 62775 + }, + { + "epoch": 6.9064906490649065, + "grad_norm": 0.1502404808998108, + "learning_rate": 4.13804538511077e-05, + "loss": 0.0094, + "num_input_tokens_seen": 13248608, + "step": 62780 + }, + { + "epoch": 6.907040704070407, + "grad_norm": 0.009673239663243294, + "learning_rate": 4.137864066913238e-05, + "loss": 0.0622, + "num_input_tokens_seen": 13249696, + "step": 62785 + }, + { + "epoch": 6.907590759075908, + "grad_norm": 0.017218373715877533, + "learning_rate": 4.137682733620298e-05, + "loss": 0.0046, + "num_input_tokens_seen": 13250720, + "step": 62790 + }, + { + "epoch": 6.908140814081408, + "grad_norm": 0.26453572511672974, + "learning_rate": 4.137501385233624e-05, + "loss": 0.0682, + "num_input_tokens_seen": 13251776, + "step": 62795 + }, + { + "epoch": 6.908690869086909, + "grad_norm": 0.3834207057952881, + "learning_rate": 4.1373200217548844e-05, + "loss": 0.0206, + "num_input_tokens_seen": 13252864, + "step": 62800 + }, + { + "epoch": 6.909240924092409, + "grad_norm": 0.062152255326509476, + "learning_rate": 4.1371386431857515e-05, + "loss": 0.0243, + "num_input_tokens_seen": 13253920, + "step": 62805 + }, + { + "epoch": 6.9097909790979095, + "grad_norm": 0.03640420734882355, + "learning_rate": 4.136957249527898e-05, + "loss": 0.1846, + "num_input_tokens_seen": 13254880, + "step": 62810 + }, + { + "epoch": 6.910341034103411, + "grad_norm": 0.014625114388763905, + "learning_rate": 4.136775840782995e-05, + "loss": 0.0047, + "num_input_tokens_seen": 13255936, + "step": 62815 + }, + { + "epoch": 6.910891089108911, + "grad_norm": 0.2381712645292282, + "learning_rate": 4.136594416952714e-05, + "loss": 0.0126, + "num_input_tokens_seen": 13256992, + "step": 62820 + }, + { + "epoch": 6.911441144114411, + "grad_norm": 1.4513907432556152, + "learning_rate": 4.1364129780387275e-05, + "loss": 0.0798, + "num_input_tokens_seen": 13258048, + "step": 62825 + }, + { + "epoch": 6.911991199119912, + "grad_norm": 1.7690964937210083, + "learning_rate": 4.1362315240427086e-05, + "loss": 0.1433, + "num_input_tokens_seen": 13259072, + "step": 62830 + }, + { + "epoch": 6.912541254125412, + "grad_norm": 0.34336090087890625, + "learning_rate": 4.1360500549663286e-05, + "loss": 0.0325, + "num_input_tokens_seen": 13260128, + "step": 62835 + }, + { + "epoch": 6.913091309130913, + "grad_norm": 0.026065140962600708, + "learning_rate": 4.13586857081126e-05, + "loss": 0.0067, + "num_input_tokens_seen": 13261216, + "step": 62840 + }, + { + "epoch": 6.913641364136414, + "grad_norm": 1.5533033609390259, + "learning_rate": 4.135687071579176e-05, + "loss": 0.0203, + "num_input_tokens_seen": 13262336, + "step": 62845 + }, + { + "epoch": 6.914191419141914, + "grad_norm": 0.02904651127755642, + "learning_rate": 4.135505557271749e-05, + "loss": 0.0176, + "num_input_tokens_seen": 13263360, + "step": 62850 + }, + { + "epoch": 6.914741474147415, + "grad_norm": 0.1808214783668518, + "learning_rate": 4.1353240278906515e-05, + "loss": 0.0296, + "num_input_tokens_seen": 13264352, + "step": 62855 + }, + { + "epoch": 6.915291529152915, + "grad_norm": 0.4366595447063446, + "learning_rate": 4.135142483437558e-05, + "loss": 0.0902, + "num_input_tokens_seen": 13265376, + "step": 62860 + }, + { + "epoch": 6.915841584158416, + "grad_norm": 0.4928220212459564, + "learning_rate": 4.13496092391414e-05, + "loss": 0.0273, + "num_input_tokens_seen": 13266400, + "step": 62865 + }, + { + "epoch": 6.916391639163916, + "grad_norm": 0.03335251286625862, + "learning_rate": 4.1347793493220724e-05, + "loss": 0.0178, + "num_input_tokens_seen": 13267520, + "step": 62870 + }, + { + "epoch": 6.916941694169417, + "grad_norm": 0.004124731291085482, + "learning_rate": 4.134597759663028e-05, + "loss": 0.0734, + "num_input_tokens_seen": 13268608, + "step": 62875 + }, + { + "epoch": 6.917491749174918, + "grad_norm": 0.010694052092730999, + "learning_rate": 4.1344161549386806e-05, + "loss": 0.0928, + "num_input_tokens_seen": 13269696, + "step": 62880 + }, + { + "epoch": 6.918041804180418, + "grad_norm": 0.12443685531616211, + "learning_rate": 4.134234535150703e-05, + "loss": 0.0314, + "num_input_tokens_seen": 13270720, + "step": 62885 + }, + { + "epoch": 6.918591859185918, + "grad_norm": 0.01594083569943905, + "learning_rate": 4.13405290030077e-05, + "loss": 0.0358, + "num_input_tokens_seen": 13271776, + "step": 62890 + }, + { + "epoch": 6.919141914191419, + "grad_norm": 0.10415156930685043, + "learning_rate": 4.133871250390556e-05, + "loss": 0.0567, + "num_input_tokens_seen": 13272768, + "step": 62895 + }, + { + "epoch": 6.919691969196919, + "grad_norm": 0.5239382982254028, + "learning_rate": 4.1336895854217345e-05, + "loss": 0.0292, + "num_input_tokens_seen": 13273888, + "step": 62900 + }, + { + "epoch": 6.9202420242024205, + "grad_norm": 0.01748901605606079, + "learning_rate": 4.13350790539598e-05, + "loss": 0.0294, + "num_input_tokens_seen": 13274976, + "step": 62905 + }, + { + "epoch": 6.920792079207921, + "grad_norm": 0.01991129107773304, + "learning_rate": 4.1333262103149676e-05, + "loss": 0.1591, + "num_input_tokens_seen": 13276064, + "step": 62910 + }, + { + "epoch": 6.921342134213422, + "grad_norm": 0.18099772930145264, + "learning_rate": 4.13314450018037e-05, + "loss": 0.0066, + "num_input_tokens_seen": 13277152, + "step": 62915 + }, + { + "epoch": 6.921892189218922, + "grad_norm": 0.024486543610692024, + "learning_rate": 4.1329627749938637e-05, + "loss": 0.0017, + "num_input_tokens_seen": 13278240, + "step": 62920 + }, + { + "epoch": 6.922442244224422, + "grad_norm": 0.08127323538064957, + "learning_rate": 4.132781034757123e-05, + "loss": 0.0577, + "num_input_tokens_seen": 13279264, + "step": 62925 + }, + { + "epoch": 6.922992299229923, + "grad_norm": 0.11216987669467926, + "learning_rate": 4.1325992794718236e-05, + "loss": 0.0634, + "num_input_tokens_seen": 13280288, + "step": 62930 + }, + { + "epoch": 6.9235423542354235, + "grad_norm": 0.056787874549627304, + "learning_rate": 4.132417509139639e-05, + "loss": 0.1017, + "num_input_tokens_seen": 13281312, + "step": 62935 + }, + { + "epoch": 6.924092409240924, + "grad_norm": 0.1758049875497818, + "learning_rate": 4.132235723762247e-05, + "loss": 0.0983, + "num_input_tokens_seen": 13282368, + "step": 62940 + }, + { + "epoch": 6.924642464246425, + "grad_norm": 0.03043941967189312, + "learning_rate": 4.13205392334132e-05, + "loss": 0.0114, + "num_input_tokens_seen": 13283424, + "step": 62945 + }, + { + "epoch": 6.925192519251925, + "grad_norm": 0.02002961002290249, + "learning_rate": 4.131872107878536e-05, + "loss": 0.13, + "num_input_tokens_seen": 13284448, + "step": 62950 + }, + { + "epoch": 6.925742574257426, + "grad_norm": 0.09744741022586823, + "learning_rate": 4.1316902773755696e-05, + "loss": 0.0614, + "num_input_tokens_seen": 13285536, + "step": 62955 + }, + { + "epoch": 6.926292629262926, + "grad_norm": 1.5659114122390747, + "learning_rate": 4.131508431834097e-05, + "loss": 0.0916, + "num_input_tokens_seen": 13286656, + "step": 62960 + }, + { + "epoch": 6.9268426842684265, + "grad_norm": 0.010950867086648941, + "learning_rate": 4.131326571255794e-05, + "loss": 0.0228, + "num_input_tokens_seen": 13287648, + "step": 62965 + }, + { + "epoch": 6.927392739273928, + "grad_norm": 0.03211439773440361, + "learning_rate": 4.1311446956423374e-05, + "loss": 0.0079, + "num_input_tokens_seen": 13288672, + "step": 62970 + }, + { + "epoch": 6.927942794279428, + "grad_norm": 0.06260731816291809, + "learning_rate": 4.130962804995402e-05, + "loss": 0.0184, + "num_input_tokens_seen": 13289728, + "step": 62975 + }, + { + "epoch": 6.928492849284929, + "grad_norm": 0.3191014230251312, + "learning_rate": 4.1307808993166656e-05, + "loss": 0.0167, + "num_input_tokens_seen": 13290752, + "step": 62980 + }, + { + "epoch": 6.929042904290429, + "grad_norm": 2.1346168518066406, + "learning_rate": 4.130598978607803e-05, + "loss": 0.0749, + "num_input_tokens_seen": 13291776, + "step": 62985 + }, + { + "epoch": 6.929592959295929, + "grad_norm": 0.03254862502217293, + "learning_rate": 4.130417042870494e-05, + "loss": 0.0275, + "num_input_tokens_seen": 13292864, + "step": 62990 + }, + { + "epoch": 6.93014301430143, + "grad_norm": 0.18119490146636963, + "learning_rate": 4.130235092106412e-05, + "loss": 0.0214, + "num_input_tokens_seen": 13293888, + "step": 62995 + }, + { + "epoch": 6.930693069306931, + "grad_norm": 0.014043768867850304, + "learning_rate": 4.130053126317235e-05, + "loss": 0.0608, + "num_input_tokens_seen": 13294976, + "step": 63000 + }, + { + "epoch": 6.931243124312431, + "grad_norm": 0.5226516723632812, + "learning_rate": 4.129871145504641e-05, + "loss": 0.0147, + "num_input_tokens_seen": 13296000, + "step": 63005 + }, + { + "epoch": 6.931793179317932, + "grad_norm": 1.2164641618728638, + "learning_rate": 4.129689149670307e-05, + "loss": 0.0506, + "num_input_tokens_seen": 13297056, + "step": 63010 + }, + { + "epoch": 6.932343234323432, + "grad_norm": 0.12311973422765732, + "learning_rate": 4.129507138815911e-05, + "loss": 0.0638, + "num_input_tokens_seen": 13298176, + "step": 63015 + }, + { + "epoch": 6.932893289328933, + "grad_norm": 0.9924378395080566, + "learning_rate": 4.129325112943129e-05, + "loss": 0.0793, + "num_input_tokens_seen": 13299328, + "step": 63020 + }, + { + "epoch": 6.933443344334433, + "grad_norm": 0.042650699615478516, + "learning_rate": 4.129143072053638e-05, + "loss": 0.0426, + "num_input_tokens_seen": 13300416, + "step": 63025 + }, + { + "epoch": 6.933993399339934, + "grad_norm": 0.9828067421913147, + "learning_rate": 4.128961016149118e-05, + "loss": 0.0406, + "num_input_tokens_seen": 13301440, + "step": 63030 + }, + { + "epoch": 6.934543454345435, + "grad_norm": 0.019660653546452522, + "learning_rate": 4.128778945231246e-05, + "loss": 0.0127, + "num_input_tokens_seen": 13302528, + "step": 63035 + }, + { + "epoch": 6.935093509350935, + "grad_norm": 0.1545543372631073, + "learning_rate": 4.1285968593017006e-05, + "loss": 0.0085, + "num_input_tokens_seen": 13303520, + "step": 63040 + }, + { + "epoch": 6.935643564356436, + "grad_norm": 0.08791617304086685, + "learning_rate": 4.1284147583621586e-05, + "loss": 0.0105, + "num_input_tokens_seen": 13304544, + "step": 63045 + }, + { + "epoch": 6.936193619361936, + "grad_norm": 0.12177947908639908, + "learning_rate": 4.1282326424142994e-05, + "loss": 0.0189, + "num_input_tokens_seen": 13305600, + "step": 63050 + }, + { + "epoch": 6.936743674367436, + "grad_norm": 0.2588721811771393, + "learning_rate": 4.1280505114598014e-05, + "loss": 0.0633, + "num_input_tokens_seen": 13306688, + "step": 63055 + }, + { + "epoch": 6.9372937293729375, + "grad_norm": 0.1659359633922577, + "learning_rate": 4.1278683655003424e-05, + "loss": 0.0799, + "num_input_tokens_seen": 13307680, + "step": 63060 + }, + { + "epoch": 6.937843784378438, + "grad_norm": 0.18479497730731964, + "learning_rate": 4.127686204537603e-05, + "loss": 0.0083, + "num_input_tokens_seen": 13308736, + "step": 63065 + }, + { + "epoch": 6.938393839383938, + "grad_norm": 0.2382815182209015, + "learning_rate": 4.12750402857326e-05, + "loss": 0.0275, + "num_input_tokens_seen": 13309760, + "step": 63070 + }, + { + "epoch": 6.938943894389439, + "grad_norm": 1.051193356513977, + "learning_rate": 4.127321837608993e-05, + "loss": 0.0246, + "num_input_tokens_seen": 13310816, + "step": 63075 + }, + { + "epoch": 6.939493949394939, + "grad_norm": 0.05533664673566818, + "learning_rate": 4.127139631646482e-05, + "loss": 0.0041, + "num_input_tokens_seen": 13311904, + "step": 63080 + }, + { + "epoch": 6.94004400440044, + "grad_norm": 1.5750401020050049, + "learning_rate": 4.126957410687407e-05, + "loss": 0.0618, + "num_input_tokens_seen": 13312928, + "step": 63085 + }, + { + "epoch": 6.9405940594059405, + "grad_norm": 0.06081739068031311, + "learning_rate": 4.126775174733444e-05, + "loss": 0.0175, + "num_input_tokens_seen": 13313984, + "step": 63090 + }, + { + "epoch": 6.941144114411442, + "grad_norm": 0.018099889159202576, + "learning_rate": 4.126592923786276e-05, + "loss": 0.0533, + "num_input_tokens_seen": 13315104, + "step": 63095 + }, + { + "epoch": 6.941694169416942, + "grad_norm": 0.8967812061309814, + "learning_rate": 4.126410657847581e-05, + "loss": 0.0908, + "num_input_tokens_seen": 13316192, + "step": 63100 + }, + { + "epoch": 6.942244224422442, + "grad_norm": 1.3803305625915527, + "learning_rate": 4.12622837691904e-05, + "loss": 0.0229, + "num_input_tokens_seen": 13317216, + "step": 63105 + }, + { + "epoch": 6.942794279427943, + "grad_norm": 0.09111931174993515, + "learning_rate": 4.1260460810023326e-05, + "loss": 0.0217, + "num_input_tokens_seen": 13318304, + "step": 63110 + }, + { + "epoch": 6.943344334433443, + "grad_norm": 0.03665630891919136, + "learning_rate": 4.125863770099138e-05, + "loss": 0.0114, + "num_input_tokens_seen": 13319360, + "step": 63115 + }, + { + "epoch": 6.9438943894389435, + "grad_norm": 0.7561793327331543, + "learning_rate": 4.1256814442111374e-05, + "loss": 0.0271, + "num_input_tokens_seen": 13320416, + "step": 63120 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.07497143745422363, + "learning_rate": 4.1254991033400114e-05, + "loss": 0.0603, + "num_input_tokens_seen": 13321504, + "step": 63125 + }, + { + "epoch": 6.944994499449945, + "grad_norm": 0.11002719402313232, + "learning_rate": 4.12531674748744e-05, + "loss": 0.0356, + "num_input_tokens_seen": 13322528, + "step": 63130 + }, + { + "epoch": 6.945544554455445, + "grad_norm": 1.768580436706543, + "learning_rate": 4.1251343766551035e-05, + "loss": 0.0671, + "num_input_tokens_seen": 13323584, + "step": 63135 + }, + { + "epoch": 6.946094609460946, + "grad_norm": 0.34873881936073303, + "learning_rate": 4.124951990844684e-05, + "loss": 0.0818, + "num_input_tokens_seen": 13324672, + "step": 63140 + }, + { + "epoch": 6.946644664466446, + "grad_norm": 0.4208351969718933, + "learning_rate": 4.124769590057861e-05, + "loss": 0.0193, + "num_input_tokens_seen": 13325664, + "step": 63145 + }, + { + "epoch": 6.947194719471947, + "grad_norm": 0.24565668404102325, + "learning_rate": 4.124587174296317e-05, + "loss": 0.0617, + "num_input_tokens_seen": 13326752, + "step": 63150 + }, + { + "epoch": 6.947744774477448, + "grad_norm": 0.3241002559661865, + "learning_rate": 4.1244047435617324e-05, + "loss": 0.0553, + "num_input_tokens_seen": 13327808, + "step": 63155 + }, + { + "epoch": 6.948294829482949, + "grad_norm": 0.0136416582390666, + "learning_rate": 4.124222297855789e-05, + "loss": 0.043, + "num_input_tokens_seen": 13328864, + "step": 63160 + }, + { + "epoch": 6.948844884488449, + "grad_norm": 0.04426996409893036, + "learning_rate": 4.124039837180167e-05, + "loss": 0.0525, + "num_input_tokens_seen": 13329920, + "step": 63165 + }, + { + "epoch": 6.949394939493949, + "grad_norm": 0.5923147201538086, + "learning_rate": 4.1238573615365494e-05, + "loss": 0.0387, + "num_input_tokens_seen": 13331008, + "step": 63170 + }, + { + "epoch": 6.94994499449945, + "grad_norm": 0.06427627056837082, + "learning_rate": 4.1236748709266185e-05, + "loss": 0.1007, + "num_input_tokens_seen": 13332064, + "step": 63175 + }, + { + "epoch": 6.9504950495049505, + "grad_norm": 0.7318408489227295, + "learning_rate": 4.123492365352055e-05, + "loss": 0.016, + "num_input_tokens_seen": 13333152, + "step": 63180 + }, + { + "epoch": 6.951045104510451, + "grad_norm": 0.13968145847320557, + "learning_rate": 4.123309844814541e-05, + "loss": 0.1226, + "num_input_tokens_seen": 13334208, + "step": 63185 + }, + { + "epoch": 6.951595159515952, + "grad_norm": 0.03821275755763054, + "learning_rate": 4.12312730931576e-05, + "loss": 0.0036, + "num_input_tokens_seen": 13335200, + "step": 63190 + }, + { + "epoch": 6.952145214521452, + "grad_norm": 0.5617203116416931, + "learning_rate": 4.122944758857393e-05, + "loss": 0.0221, + "num_input_tokens_seen": 13336224, + "step": 63195 + }, + { + "epoch": 6.952695269526953, + "grad_norm": 0.30655354261398315, + "learning_rate": 4.122762193441123e-05, + "loss": 0.0945, + "num_input_tokens_seen": 13337248, + "step": 63200 + }, + { + "epoch": 6.953245324532453, + "grad_norm": 0.028907891362905502, + "learning_rate": 4.1225796130686325e-05, + "loss": 0.0328, + "num_input_tokens_seen": 13338272, + "step": 63205 + }, + { + "epoch": 6.9537953795379535, + "grad_norm": 0.005648595746606588, + "learning_rate": 4.122397017741604e-05, + "loss": 0.0422, + "num_input_tokens_seen": 13339360, + "step": 63210 + }, + { + "epoch": 6.9543454345434546, + "grad_norm": 0.16435013711452484, + "learning_rate": 4.1222144074617206e-05, + "loss": 0.0331, + "num_input_tokens_seen": 13340352, + "step": 63215 + }, + { + "epoch": 6.954895489548955, + "grad_norm": 0.048092249780893326, + "learning_rate": 4.122031782230666e-05, + "loss": 0.1347, + "num_input_tokens_seen": 13341408, + "step": 63220 + }, + { + "epoch": 6.955445544554456, + "grad_norm": 0.2969937324523926, + "learning_rate": 4.121849142050123e-05, + "loss": 0.0231, + "num_input_tokens_seen": 13342464, + "step": 63225 + }, + { + "epoch": 6.955995599559956, + "grad_norm": 0.020975865423679352, + "learning_rate": 4.121666486921774e-05, + "loss": 0.0963, + "num_input_tokens_seen": 13343520, + "step": 63230 + }, + { + "epoch": 6.956545654565456, + "grad_norm": 0.033496856689453125, + "learning_rate": 4.121483816847304e-05, + "loss": 0.0277, + "num_input_tokens_seen": 13344576, + "step": 63235 + }, + { + "epoch": 6.957095709570957, + "grad_norm": 0.08267563581466675, + "learning_rate": 4.121301131828394e-05, + "loss": 0.0055, + "num_input_tokens_seen": 13345600, + "step": 63240 + }, + { + "epoch": 6.957645764576458, + "grad_norm": 0.6885938048362732, + "learning_rate": 4.1211184318667306e-05, + "loss": 0.0563, + "num_input_tokens_seen": 13346624, + "step": 63245 + }, + { + "epoch": 6.958195819581958, + "grad_norm": 0.18981331586837769, + "learning_rate": 4.1209357169639965e-05, + "loss": 0.1038, + "num_input_tokens_seen": 13347616, + "step": 63250 + }, + { + "epoch": 6.958745874587459, + "grad_norm": 0.023677650839090347, + "learning_rate": 4.120752987121876e-05, + "loss": 0.0539, + "num_input_tokens_seen": 13348672, + "step": 63255 + }, + { + "epoch": 6.959295929592959, + "grad_norm": 0.6753993034362793, + "learning_rate": 4.1205702423420525e-05, + "loss": 0.1157, + "num_input_tokens_seen": 13349728, + "step": 63260 + }, + { + "epoch": 6.95984598459846, + "grad_norm": 0.638232409954071, + "learning_rate": 4.120387482626211e-05, + "loss": 0.114, + "num_input_tokens_seen": 13350752, + "step": 63265 + }, + { + "epoch": 6.96039603960396, + "grad_norm": 0.011308335699141026, + "learning_rate": 4.1202047079760354e-05, + "loss": 0.0263, + "num_input_tokens_seen": 13351872, + "step": 63270 + }, + { + "epoch": 6.960946094609461, + "grad_norm": 0.7548192739486694, + "learning_rate": 4.12002191839321e-05, + "loss": 0.0882, + "num_input_tokens_seen": 13352928, + "step": 63275 + }, + { + "epoch": 6.961496149614962, + "grad_norm": 0.709023654460907, + "learning_rate": 4.11983911387942e-05, + "loss": 0.0307, + "num_input_tokens_seen": 13354048, + "step": 63280 + }, + { + "epoch": 6.962046204620462, + "grad_norm": 0.021471112966537476, + "learning_rate": 4.119656294436351e-05, + "loss": 0.0395, + "num_input_tokens_seen": 13355136, + "step": 63285 + }, + { + "epoch": 6.962596259625963, + "grad_norm": 0.010497305542230606, + "learning_rate": 4.119473460065687e-05, + "loss": 0.0305, + "num_input_tokens_seen": 13356256, + "step": 63290 + }, + { + "epoch": 6.963146314631463, + "grad_norm": 0.01479140855371952, + "learning_rate": 4.119290610769112e-05, + "loss": 0.0574, + "num_input_tokens_seen": 13357312, + "step": 63295 + }, + { + "epoch": 6.963696369636963, + "grad_norm": 0.17272207140922546, + "learning_rate": 4.1191077465483144e-05, + "loss": 0.061, + "num_input_tokens_seen": 13358368, + "step": 63300 + }, + { + "epoch": 6.9642464246424645, + "grad_norm": 0.13601146638393402, + "learning_rate": 4.118924867404976e-05, + "loss": 0.0306, + "num_input_tokens_seen": 13359488, + "step": 63305 + }, + { + "epoch": 6.964796479647965, + "grad_norm": 0.13754473626613617, + "learning_rate": 4.1187419733407845e-05, + "loss": 0.1848, + "num_input_tokens_seen": 13360608, + "step": 63310 + }, + { + "epoch": 6.965346534653465, + "grad_norm": 0.07084060460329056, + "learning_rate": 4.1185590643574244e-05, + "loss": 0.0322, + "num_input_tokens_seen": 13361728, + "step": 63315 + }, + { + "epoch": 6.965896589658966, + "grad_norm": 0.342691570520401, + "learning_rate": 4.1183761404565836e-05, + "loss": 0.0233, + "num_input_tokens_seen": 13362720, + "step": 63320 + }, + { + "epoch": 6.966446644664466, + "grad_norm": 0.09201780706644058, + "learning_rate": 4.1181932016399446e-05, + "loss": 0.056, + "num_input_tokens_seen": 13363808, + "step": 63325 + }, + { + "epoch": 6.966996699669967, + "grad_norm": 0.0380348265171051, + "learning_rate": 4.118010247909197e-05, + "loss": 0.0357, + "num_input_tokens_seen": 13364896, + "step": 63330 + }, + { + "epoch": 6.9675467546754675, + "grad_norm": 0.151570126414299, + "learning_rate": 4.1178272792660244e-05, + "loss": 0.0068, + "num_input_tokens_seen": 13365888, + "step": 63335 + }, + { + "epoch": 6.968096809680969, + "grad_norm": 0.12998425960540771, + "learning_rate": 4.1176442957121144e-05, + "loss": 0.0102, + "num_input_tokens_seen": 13366880, + "step": 63340 + }, + { + "epoch": 6.968646864686469, + "grad_norm": 0.297620564699173, + "learning_rate": 4.117461297249153e-05, + "loss": 0.0184, + "num_input_tokens_seen": 13368000, + "step": 63345 + }, + { + "epoch": 6.969196919691969, + "grad_norm": 2.1886801719665527, + "learning_rate": 4.1172782838788265e-05, + "loss": 0.1172, + "num_input_tokens_seen": 13369056, + "step": 63350 + }, + { + "epoch": 6.96974697469747, + "grad_norm": 1.0727229118347168, + "learning_rate": 4.117095255602823e-05, + "loss": 0.1589, + "num_input_tokens_seen": 13370112, + "step": 63355 + }, + { + "epoch": 6.97029702970297, + "grad_norm": 0.026723964139819145, + "learning_rate": 4.116912212422828e-05, + "loss": 0.0118, + "num_input_tokens_seen": 13371168, + "step": 63360 + }, + { + "epoch": 6.9708470847084705, + "grad_norm": 0.09168732911348343, + "learning_rate": 4.116729154340528e-05, + "loss": 0.0318, + "num_input_tokens_seen": 13372256, + "step": 63365 + }, + { + "epoch": 6.971397139713972, + "grad_norm": 0.08267903327941895, + "learning_rate": 4.116546081357613e-05, + "loss": 0.0977, + "num_input_tokens_seen": 13373280, + "step": 63370 + }, + { + "epoch": 6.971947194719472, + "grad_norm": 0.0849209725856781, + "learning_rate": 4.1163629934757676e-05, + "loss": 0.1044, + "num_input_tokens_seen": 13374400, + "step": 63375 + }, + { + "epoch": 6.972497249724973, + "grad_norm": 2.0198986530303955, + "learning_rate": 4.11617989069668e-05, + "loss": 0.0588, + "num_input_tokens_seen": 13375488, + "step": 63380 + }, + { + "epoch": 6.973047304730473, + "grad_norm": 0.8926954865455627, + "learning_rate": 4.1159967730220384e-05, + "loss": 0.0489, + "num_input_tokens_seen": 13376544, + "step": 63385 + }, + { + "epoch": 6.973597359735973, + "grad_norm": 0.017724430188536644, + "learning_rate": 4.11581364045353e-05, + "loss": 0.1017, + "num_input_tokens_seen": 13377632, + "step": 63390 + }, + { + "epoch": 6.974147414741474, + "grad_norm": 0.8547229766845703, + "learning_rate": 4.1156304929928425e-05, + "loss": 0.1172, + "num_input_tokens_seen": 13378688, + "step": 63395 + }, + { + "epoch": 6.974697469746975, + "grad_norm": 0.9776557683944702, + "learning_rate": 4.115447330641663e-05, + "loss": 0.0423, + "num_input_tokens_seen": 13379744, + "step": 63400 + }, + { + "epoch": 6.975247524752476, + "grad_norm": 0.2993006110191345, + "learning_rate": 4.1152641534016816e-05, + "loss": 0.115, + "num_input_tokens_seen": 13380736, + "step": 63405 + }, + { + "epoch": 6.975797579757976, + "grad_norm": 0.27535316348075867, + "learning_rate": 4.1150809612745854e-05, + "loss": 0.0511, + "num_input_tokens_seen": 13381760, + "step": 63410 + }, + { + "epoch": 6.976347634763476, + "grad_norm": 0.26493003964424133, + "learning_rate": 4.114897754262063e-05, + "loss": 0.0357, + "num_input_tokens_seen": 13382784, + "step": 63415 + }, + { + "epoch": 6.976897689768977, + "grad_norm": 0.1779199242591858, + "learning_rate": 4.114714532365803e-05, + "loss": 0.0659, + "num_input_tokens_seen": 13383776, + "step": 63420 + }, + { + "epoch": 6.977447744774477, + "grad_norm": 0.32522299885749817, + "learning_rate": 4.114531295587494e-05, + "loss": 0.0872, + "num_input_tokens_seen": 13384832, + "step": 63425 + }, + { + "epoch": 6.977997799779978, + "grad_norm": 1.0702519416809082, + "learning_rate": 4.114348043928824e-05, + "loss": 0.0511, + "num_input_tokens_seen": 13385920, + "step": 63430 + }, + { + "epoch": 6.978547854785479, + "grad_norm": 0.742648184299469, + "learning_rate": 4.1141647773914835e-05, + "loss": 0.0245, + "num_input_tokens_seen": 13386944, + "step": 63435 + }, + { + "epoch": 6.979097909790979, + "grad_norm": 0.37562301754951477, + "learning_rate": 4.11398149597716e-05, + "loss": 0.0821, + "num_input_tokens_seen": 13388032, + "step": 63440 + }, + { + "epoch": 6.97964796479648, + "grad_norm": 0.1872437447309494, + "learning_rate": 4.113798199687545e-05, + "loss": 0.0495, + "num_input_tokens_seen": 13389088, + "step": 63445 + }, + { + "epoch": 6.98019801980198, + "grad_norm": 0.844780683517456, + "learning_rate": 4.113614888524325e-05, + "loss": 0.0462, + "num_input_tokens_seen": 13390080, + "step": 63450 + }, + { + "epoch": 6.98074807480748, + "grad_norm": 0.3668551445007324, + "learning_rate": 4.1134315624891914e-05, + "loss": 0.0356, + "num_input_tokens_seen": 13391104, + "step": 63455 + }, + { + "epoch": 6.9812981298129815, + "grad_norm": 0.7534089684486389, + "learning_rate": 4.113248221583833e-05, + "loss": 0.0374, + "num_input_tokens_seen": 13392128, + "step": 63460 + }, + { + "epoch": 6.981848184818482, + "grad_norm": 0.9671634435653687, + "learning_rate": 4.11306486580994e-05, + "loss": 0.0614, + "num_input_tokens_seen": 13393152, + "step": 63465 + }, + { + "epoch": 6.982398239823983, + "grad_norm": 0.01617344655096531, + "learning_rate": 4.112881495169203e-05, + "loss": 0.0031, + "num_input_tokens_seen": 13394144, + "step": 63470 + }, + { + "epoch": 6.982948294829483, + "grad_norm": 0.3395143449306488, + "learning_rate": 4.11269810966331e-05, + "loss": 0.0156, + "num_input_tokens_seen": 13395264, + "step": 63475 + }, + { + "epoch": 6.983498349834983, + "grad_norm": 0.0198766328394413, + "learning_rate": 4.1125147092939524e-05, + "loss": 0.0198, + "num_input_tokens_seen": 13396320, + "step": 63480 + }, + { + "epoch": 6.984048404840484, + "grad_norm": 1.3853646516799927, + "learning_rate": 4.112331294062821e-05, + "loss": 0.0539, + "num_input_tokens_seen": 13397376, + "step": 63485 + }, + { + "epoch": 6.9845984598459845, + "grad_norm": 0.06582441926002502, + "learning_rate": 4.112147863971606e-05, + "loss": 0.0273, + "num_input_tokens_seen": 13398464, + "step": 63490 + }, + { + "epoch": 6.985148514851485, + "grad_norm": 0.8219113945960999, + "learning_rate": 4.111964419021998e-05, + "loss": 0.0517, + "num_input_tokens_seen": 13399456, + "step": 63495 + }, + { + "epoch": 6.985698569856986, + "grad_norm": 1.507323980331421, + "learning_rate": 4.1117809592156857e-05, + "loss": 0.0554, + "num_input_tokens_seen": 13400576, + "step": 63500 + }, + { + "epoch": 6.986248624862486, + "grad_norm": 0.3196604549884796, + "learning_rate": 4.1115974845543626e-05, + "loss": 0.0317, + "num_input_tokens_seen": 13401632, + "step": 63505 + }, + { + "epoch": 6.986798679867987, + "grad_norm": 0.4940096437931061, + "learning_rate": 4.11141399503972e-05, + "loss": 0.0443, + "num_input_tokens_seen": 13402656, + "step": 63510 + }, + { + "epoch": 6.987348734873487, + "grad_norm": 0.4900582432746887, + "learning_rate": 4.1112304906734466e-05, + "loss": 0.0398, + "num_input_tokens_seen": 13403680, + "step": 63515 + }, + { + "epoch": 6.987898789878988, + "grad_norm": 0.03592162951827049, + "learning_rate": 4.111046971457235e-05, + "loss": 0.0157, + "num_input_tokens_seen": 13404704, + "step": 63520 + }, + { + "epoch": 6.988448844884489, + "grad_norm": 0.4550641179084778, + "learning_rate": 4.1108634373927764e-05, + "loss": 0.0151, + "num_input_tokens_seen": 13405728, + "step": 63525 + }, + { + "epoch": 6.988998899889989, + "grad_norm": 1.0451425313949585, + "learning_rate": 4.110679888481763e-05, + "loss": 0.1173, + "num_input_tokens_seen": 13406816, + "step": 63530 + }, + { + "epoch": 6.98954895489549, + "grad_norm": 0.4972873032093048, + "learning_rate": 4.110496324725886e-05, + "loss": 0.088, + "num_input_tokens_seen": 13407808, + "step": 63535 + }, + { + "epoch": 6.99009900990099, + "grad_norm": 0.3091857433319092, + "learning_rate": 4.110312746126836e-05, + "loss": 0.0256, + "num_input_tokens_seen": 13408864, + "step": 63540 + }, + { + "epoch": 6.99064906490649, + "grad_norm": 0.05600034445524216, + "learning_rate": 4.110129152686307e-05, + "loss": 0.0316, + "num_input_tokens_seen": 13409888, + "step": 63545 + }, + { + "epoch": 6.991199119911991, + "grad_norm": 0.022370031103491783, + "learning_rate": 4.10994554440599e-05, + "loss": 0.0397, + "num_input_tokens_seen": 13410944, + "step": 63550 + }, + { + "epoch": 6.991749174917492, + "grad_norm": 0.5231063961982727, + "learning_rate": 4.109761921287578e-05, + "loss": 0.023, + "num_input_tokens_seen": 13411968, + "step": 63555 + }, + { + "epoch": 6.992299229922993, + "grad_norm": 1.0073031187057495, + "learning_rate": 4.109578283332762e-05, + "loss": 0.0295, + "num_input_tokens_seen": 13412960, + "step": 63560 + }, + { + "epoch": 6.992849284928493, + "grad_norm": 0.5428640246391296, + "learning_rate": 4.1093946305432365e-05, + "loss": 0.0389, + "num_input_tokens_seen": 13414080, + "step": 63565 + }, + { + "epoch": 6.993399339933993, + "grad_norm": 0.10002946853637695, + "learning_rate": 4.109210962920692e-05, + "loss": 0.0148, + "num_input_tokens_seen": 13415136, + "step": 63570 + }, + { + "epoch": 6.993949394939494, + "grad_norm": 0.9946054816246033, + "learning_rate": 4.109027280466822e-05, + "loss": 0.0641, + "num_input_tokens_seen": 13416192, + "step": 63575 + }, + { + "epoch": 6.994499449944994, + "grad_norm": 0.4393671452999115, + "learning_rate": 4.1088435831833206e-05, + "loss": 0.0657, + "num_input_tokens_seen": 13417344, + "step": 63580 + }, + { + "epoch": 6.9950495049504955, + "grad_norm": 0.02315455488860607, + "learning_rate": 4.108659871071879e-05, + "loss": 0.0486, + "num_input_tokens_seen": 13418400, + "step": 63585 + }, + { + "epoch": 6.995599559955996, + "grad_norm": 1.7528598308563232, + "learning_rate": 4.108476144134192e-05, + "loss": 0.0655, + "num_input_tokens_seen": 13419424, + "step": 63590 + }, + { + "epoch": 6.996149614961496, + "grad_norm": 0.13681046664714813, + "learning_rate": 4.1082924023719516e-05, + "loss": 0.0091, + "num_input_tokens_seen": 13420512, + "step": 63595 + }, + { + "epoch": 6.996699669966997, + "grad_norm": 0.007095757871866226, + "learning_rate": 4.108108645786852e-05, + "loss": 0.0184, + "num_input_tokens_seen": 13421504, + "step": 63600 + }, + { + "epoch": 6.997249724972497, + "grad_norm": 0.024376647546887398, + "learning_rate": 4.107924874380587e-05, + "loss": 0.0171, + "num_input_tokens_seen": 13422560, + "step": 63605 + }, + { + "epoch": 6.997799779977997, + "grad_norm": 0.025824464857578278, + "learning_rate": 4.10774108815485e-05, + "loss": 0.0112, + "num_input_tokens_seen": 13423584, + "step": 63610 + }, + { + "epoch": 6.9983498349834985, + "grad_norm": 0.1385827213525772, + "learning_rate": 4.107557287111334e-05, + "loss": 0.0121, + "num_input_tokens_seen": 13424608, + "step": 63615 + }, + { + "epoch": 6.998899889988999, + "grad_norm": 0.08613397926092148, + "learning_rate": 4.1073734712517345e-05, + "loss": 0.2137, + "num_input_tokens_seen": 13425696, + "step": 63620 + }, + { + "epoch": 6.9994499449945, + "grad_norm": 0.05348677933216095, + "learning_rate": 4.1071896405777454e-05, + "loss": 0.0364, + "num_input_tokens_seen": 13426720, + "step": 63625 + }, + { + "epoch": 7.0, + "grad_norm": 0.20597292482852936, + "learning_rate": 4.10700579509106e-05, + "loss": 0.0257, + "num_input_tokens_seen": 13427712, + "step": 63630 + }, + { + "epoch": 7.0, + "eval_loss": 0.06129888445138931, + "eval_runtime": 36.9644, + "eval_samples_per_second": 109.294, + "eval_steps_per_second": 27.324, + "num_input_tokens_seen": 13427712, + "step": 63630 + }, + { + "epoch": 7.0005500550055, + "grad_norm": 0.06341218203306198, + "learning_rate": 4.106821934793373e-05, + "loss": 0.0174, + "num_input_tokens_seen": 13428736, + "step": 63635 + }, + { + "epoch": 7.001100110011001, + "grad_norm": 0.04203970730304718, + "learning_rate": 4.10663805968638e-05, + "loss": 0.0092, + "num_input_tokens_seen": 13429728, + "step": 63640 + }, + { + "epoch": 7.0016501650165015, + "grad_norm": 0.042507123202085495, + "learning_rate": 4.1064541697717754e-05, + "loss": 0.0046, + "num_input_tokens_seen": 13430816, + "step": 63645 + }, + { + "epoch": 7.002200220022003, + "grad_norm": 0.013317953795194626, + "learning_rate": 4.1062702650512516e-05, + "loss": 0.0364, + "num_input_tokens_seen": 13431904, + "step": 63650 + }, + { + "epoch": 7.002750275027503, + "grad_norm": 0.15793952345848083, + "learning_rate": 4.1060863455265075e-05, + "loss": 0.2089, + "num_input_tokens_seen": 13433024, + "step": 63655 + }, + { + "epoch": 7.003300330033003, + "grad_norm": 0.14024686813354492, + "learning_rate": 4.105902411199236e-05, + "loss": 0.0493, + "num_input_tokens_seen": 13434080, + "step": 63660 + }, + { + "epoch": 7.003850385038504, + "grad_norm": 0.05575086548924446, + "learning_rate": 4.1057184620711314e-05, + "loss": 0.054, + "num_input_tokens_seen": 13435104, + "step": 63665 + }, + { + "epoch": 7.004400440044004, + "grad_norm": 0.07684403657913208, + "learning_rate": 4.105534498143891e-05, + "loss": 0.0426, + "num_input_tokens_seen": 13436128, + "step": 63670 + }, + { + "epoch": 7.0049504950495045, + "grad_norm": 0.07276426255702972, + "learning_rate": 4.105350519419209e-05, + "loss": 0.0373, + "num_input_tokens_seen": 13437184, + "step": 63675 + }, + { + "epoch": 7.005500550055006, + "grad_norm": 0.2814671993255615, + "learning_rate": 4.105166525898783e-05, + "loss": 0.044, + "num_input_tokens_seen": 13438208, + "step": 63680 + }, + { + "epoch": 7.006050605060506, + "grad_norm": 0.6385027170181274, + "learning_rate": 4.104982517584306e-05, + "loss": 0.0265, + "num_input_tokens_seen": 13439328, + "step": 63685 + }, + { + "epoch": 7.006600660066007, + "grad_norm": 0.028835000470280647, + "learning_rate": 4.1047984944774756e-05, + "loss": 0.0281, + "num_input_tokens_seen": 13440352, + "step": 63690 + }, + { + "epoch": 7.007150715071507, + "grad_norm": 0.05447071045637131, + "learning_rate": 4.104614456579987e-05, + "loss": 0.0597, + "num_input_tokens_seen": 13441344, + "step": 63695 + }, + { + "epoch": 7.007700770077007, + "grad_norm": 0.020905327051877975, + "learning_rate": 4.104430403893538e-05, + "loss": 0.0646, + "num_input_tokens_seen": 13442400, + "step": 63700 + }, + { + "epoch": 7.008250825082508, + "grad_norm": 0.25539469718933105, + "learning_rate": 4.104246336419824e-05, + "loss": 0.0246, + "num_input_tokens_seen": 13443456, + "step": 63705 + }, + { + "epoch": 7.008800880088009, + "grad_norm": 0.01006622426211834, + "learning_rate": 4.10406225416054e-05, + "loss": 0.0055, + "num_input_tokens_seen": 13444448, + "step": 63710 + }, + { + "epoch": 7.00935093509351, + "grad_norm": 0.1526707112789154, + "learning_rate": 4.1038781571173844e-05, + "loss": 0.0756, + "num_input_tokens_seen": 13445440, + "step": 63715 + }, + { + "epoch": 7.00990099009901, + "grad_norm": 0.011956717818975449, + "learning_rate": 4.1036940452920534e-05, + "loss": 0.0701, + "num_input_tokens_seen": 13446432, + "step": 63720 + }, + { + "epoch": 7.01045104510451, + "grad_norm": 0.28862103819847107, + "learning_rate": 4.103509918686245e-05, + "loss": 0.067, + "num_input_tokens_seen": 13447456, + "step": 63725 + }, + { + "epoch": 7.011001100110011, + "grad_norm": 0.13046397268772125, + "learning_rate": 4.103325777301654e-05, + "loss": 0.0149, + "num_input_tokens_seen": 13448544, + "step": 63730 + }, + { + "epoch": 7.011551155115511, + "grad_norm": 0.18299946188926697, + "learning_rate": 4.1031416211399785e-05, + "loss": 0.0125, + "num_input_tokens_seen": 13449632, + "step": 63735 + }, + { + "epoch": 7.0121012101210125, + "grad_norm": 0.024097079411149025, + "learning_rate": 4.1029574502029164e-05, + "loss": 0.0453, + "num_input_tokens_seen": 13450688, + "step": 63740 + }, + { + "epoch": 7.012651265126513, + "grad_norm": 1.5765326023101807, + "learning_rate": 4.1027732644921644e-05, + "loss": 0.3208, + "num_input_tokens_seen": 13451744, + "step": 63745 + }, + { + "epoch": 7.013201320132013, + "grad_norm": 0.12384431809186935, + "learning_rate": 4.102589064009421e-05, + "loss": 0.0241, + "num_input_tokens_seen": 13452736, + "step": 63750 + }, + { + "epoch": 7.013751375137514, + "grad_norm": 0.29615429043769836, + "learning_rate": 4.102404848756382e-05, + "loss": 0.0441, + "num_input_tokens_seen": 13453792, + "step": 63755 + }, + { + "epoch": 7.014301430143014, + "grad_norm": 0.09760177880525589, + "learning_rate": 4.102220618734748e-05, + "loss": 0.0529, + "num_input_tokens_seen": 13454848, + "step": 63760 + }, + { + "epoch": 7.014851485148514, + "grad_norm": 0.33759114146232605, + "learning_rate": 4.1020363739462145e-05, + "loss": 0.0104, + "num_input_tokens_seen": 13455872, + "step": 63765 + }, + { + "epoch": 7.0154015401540155, + "grad_norm": 0.2556864619255066, + "learning_rate": 4.101852114392481e-05, + "loss": 0.0523, + "num_input_tokens_seen": 13456960, + "step": 63770 + }, + { + "epoch": 7.015951595159516, + "grad_norm": 0.009159663692116737, + "learning_rate": 4.101667840075244e-05, + "loss": 0.1455, + "num_input_tokens_seen": 13457952, + "step": 63775 + }, + { + "epoch": 7.016501650165017, + "grad_norm": 0.0471402145922184, + "learning_rate": 4.101483550996204e-05, + "loss": 0.0532, + "num_input_tokens_seen": 13458944, + "step": 63780 + }, + { + "epoch": 7.017051705170517, + "grad_norm": 0.12050558626651764, + "learning_rate": 4.1012992471570584e-05, + "loss": 0.0116, + "num_input_tokens_seen": 13460000, + "step": 63785 + }, + { + "epoch": 7.017601760176017, + "grad_norm": 0.06759258359670639, + "learning_rate": 4.101114928559506e-05, + "loss": 0.0039, + "num_input_tokens_seen": 13461056, + "step": 63790 + }, + { + "epoch": 7.018151815181518, + "grad_norm": 0.03147035092115402, + "learning_rate": 4.100930595205246e-05, + "loss": 0.092, + "num_input_tokens_seen": 13462048, + "step": 63795 + }, + { + "epoch": 7.0187018701870185, + "grad_norm": 0.09453348070383072, + "learning_rate": 4.100746247095976e-05, + "loss": 0.0533, + "num_input_tokens_seen": 13463040, + "step": 63800 + }, + { + "epoch": 7.01925192519252, + "grad_norm": 0.1621338278055191, + "learning_rate": 4.100561884233396e-05, + "loss": 0.0532, + "num_input_tokens_seen": 13464096, + "step": 63805 + }, + { + "epoch": 7.01980198019802, + "grad_norm": 0.04181145876646042, + "learning_rate": 4.1003775066192054e-05, + "loss": 0.0583, + "num_input_tokens_seen": 13465184, + "step": 63810 + }, + { + "epoch": 7.02035203520352, + "grad_norm": 0.010818803682923317, + "learning_rate": 4.100193114255104e-05, + "loss": 0.0101, + "num_input_tokens_seen": 13466208, + "step": 63815 + }, + { + "epoch": 7.020902090209021, + "grad_norm": 0.659329354763031, + "learning_rate": 4.1000087071427895e-05, + "loss": 0.0463, + "num_input_tokens_seen": 13467232, + "step": 63820 + }, + { + "epoch": 7.021452145214521, + "grad_norm": 0.05732037499547005, + "learning_rate": 4.099824285283963e-05, + "loss": 0.0309, + "num_input_tokens_seen": 13468320, + "step": 63825 + }, + { + "epoch": 7.022002200220022, + "grad_norm": 0.2422318458557129, + "learning_rate": 4.099639848680324e-05, + "loss": 0.055, + "num_input_tokens_seen": 13469312, + "step": 63830 + }, + { + "epoch": 7.022552255225523, + "grad_norm": 0.11136725544929504, + "learning_rate": 4.099455397333572e-05, + "loss": 0.0232, + "num_input_tokens_seen": 13470400, + "step": 63835 + }, + { + "epoch": 7.023102310231023, + "grad_norm": 0.012794401496648788, + "learning_rate": 4.0992709312454065e-05, + "loss": 0.0346, + "num_input_tokens_seen": 13471392, + "step": 63840 + }, + { + "epoch": 7.023652365236524, + "grad_norm": 0.028227489441633224, + "learning_rate": 4.099086450417528e-05, + "loss": 0.0251, + "num_input_tokens_seen": 13472416, + "step": 63845 + }, + { + "epoch": 7.024202420242024, + "grad_norm": 0.038033850491046906, + "learning_rate": 4.098901954851638e-05, + "loss": 0.047, + "num_input_tokens_seen": 13473504, + "step": 63850 + }, + { + "epoch": 7.024752475247524, + "grad_norm": 0.16128535568714142, + "learning_rate": 4.098717444549436e-05, + "loss": 0.0424, + "num_input_tokens_seen": 13474592, + "step": 63855 + }, + { + "epoch": 7.025302530253025, + "grad_norm": 0.11777710914611816, + "learning_rate": 4.098532919512622e-05, + "loss": 0.0151, + "num_input_tokens_seen": 13475616, + "step": 63860 + }, + { + "epoch": 7.025852585258526, + "grad_norm": 0.7713527083396912, + "learning_rate": 4.098348379742897e-05, + "loss": 0.0341, + "num_input_tokens_seen": 13476672, + "step": 63865 + }, + { + "epoch": 7.026402640264027, + "grad_norm": 0.5821409225463867, + "learning_rate": 4.098163825241962e-05, + "loss": 0.0149, + "num_input_tokens_seen": 13477728, + "step": 63870 + }, + { + "epoch": 7.026952695269527, + "grad_norm": 0.05238524451851845, + "learning_rate": 4.0979792560115175e-05, + "loss": 0.0346, + "num_input_tokens_seen": 13478784, + "step": 63875 + }, + { + "epoch": 7.027502750275027, + "grad_norm": 0.5608092546463013, + "learning_rate": 4.0977946720532654e-05, + "loss": 0.135, + "num_input_tokens_seen": 13479744, + "step": 63880 + }, + { + "epoch": 7.028052805280528, + "grad_norm": 0.8874763250350952, + "learning_rate": 4.0976100733689065e-05, + "loss": 0.1318, + "num_input_tokens_seen": 13480800, + "step": 63885 + }, + { + "epoch": 7.028602860286028, + "grad_norm": 0.0060861180536448956, + "learning_rate": 4.097425459960142e-05, + "loss": 0.0065, + "num_input_tokens_seen": 13481888, + "step": 63890 + }, + { + "epoch": 7.0291529152915295, + "grad_norm": 0.13675138354301453, + "learning_rate": 4.0972408318286734e-05, + "loss": 0.0632, + "num_input_tokens_seen": 13482976, + "step": 63895 + }, + { + "epoch": 7.02970297029703, + "grad_norm": 0.3920537829399109, + "learning_rate": 4.0970561889762036e-05, + "loss": 0.0493, + "num_input_tokens_seen": 13484064, + "step": 63900 + }, + { + "epoch": 7.03025302530253, + "grad_norm": 0.05868622288107872, + "learning_rate": 4.0968715314044323e-05, + "loss": 0.0236, + "num_input_tokens_seen": 13485184, + "step": 63905 + }, + { + "epoch": 7.030803080308031, + "grad_norm": 0.6692244410514832, + "learning_rate": 4.096686859115062e-05, + "loss": 0.0205, + "num_input_tokens_seen": 13486144, + "step": 63910 + }, + { + "epoch": 7.031353135313531, + "grad_norm": 0.28097811341285706, + "learning_rate": 4.096502172109796e-05, + "loss": 0.0981, + "num_input_tokens_seen": 13487168, + "step": 63915 + }, + { + "epoch": 7.031903190319032, + "grad_norm": 0.07804752141237259, + "learning_rate": 4.096317470390335e-05, + "loss": 0.0221, + "num_input_tokens_seen": 13488192, + "step": 63920 + }, + { + "epoch": 7.0324532453245325, + "grad_norm": 0.03547343239188194, + "learning_rate": 4.0961327539583824e-05, + "loss": 0.0032, + "num_input_tokens_seen": 13489184, + "step": 63925 + }, + { + "epoch": 7.033003300330033, + "grad_norm": 0.7350422739982605, + "learning_rate": 4.09594802281564e-05, + "loss": 0.0435, + "num_input_tokens_seen": 13490208, + "step": 63930 + }, + { + "epoch": 7.033553355335534, + "grad_norm": 0.01215433981269598, + "learning_rate": 4.09576327696381e-05, + "loss": 0.0024, + "num_input_tokens_seen": 13491264, + "step": 63935 + }, + { + "epoch": 7.034103410341034, + "grad_norm": 0.013911676593124866, + "learning_rate": 4.095578516404596e-05, + "loss": 0.0405, + "num_input_tokens_seen": 13492320, + "step": 63940 + }, + { + "epoch": 7.034653465346534, + "grad_norm": 0.1776733249425888, + "learning_rate": 4.0953937411397e-05, + "loss": 0.0461, + "num_input_tokens_seen": 13493472, + "step": 63945 + }, + { + "epoch": 7.035203520352035, + "grad_norm": 0.07804148644208908, + "learning_rate": 4.095208951170826e-05, + "loss": 0.0155, + "num_input_tokens_seen": 13494528, + "step": 63950 + }, + { + "epoch": 7.0357535753575355, + "grad_norm": 0.3241458833217621, + "learning_rate": 4.095024146499675e-05, + "loss": 0.0334, + "num_input_tokens_seen": 13495616, + "step": 63955 + }, + { + "epoch": 7.036303630363037, + "grad_norm": 0.06556657701730728, + "learning_rate": 4.094839327127954e-05, + "loss": 0.0741, + "num_input_tokens_seen": 13496704, + "step": 63960 + }, + { + "epoch": 7.036853685368537, + "grad_norm": 0.040107645094394684, + "learning_rate": 4.094654493057363e-05, + "loss": 0.0038, + "num_input_tokens_seen": 13497728, + "step": 63965 + }, + { + "epoch": 7.037403740374037, + "grad_norm": 1.388262391090393, + "learning_rate": 4.094469644289607e-05, + "loss": 0.0959, + "num_input_tokens_seen": 13498816, + "step": 63970 + }, + { + "epoch": 7.037953795379538, + "grad_norm": 0.6486594080924988, + "learning_rate": 4.094284780826389e-05, + "loss": 0.0284, + "num_input_tokens_seen": 13499840, + "step": 63975 + }, + { + "epoch": 7.038503850385038, + "grad_norm": 0.19078868627548218, + "learning_rate": 4.094099902669414e-05, + "loss": 0.0073, + "num_input_tokens_seen": 13500928, + "step": 63980 + }, + { + "epoch": 7.039053905390539, + "grad_norm": 0.11675333976745605, + "learning_rate": 4.093915009820385e-05, + "loss": 0.003, + "num_input_tokens_seen": 13502048, + "step": 63985 + }, + { + "epoch": 7.03960396039604, + "grad_norm": 0.04491390660405159, + "learning_rate": 4.0937301022810056e-05, + "loss": 0.0052, + "num_input_tokens_seen": 13503040, + "step": 63990 + }, + { + "epoch": 7.04015401540154, + "grad_norm": 0.00601269630715251, + "learning_rate": 4.093545180052981e-05, + "loss": 0.0049, + "num_input_tokens_seen": 13504128, + "step": 63995 + }, + { + "epoch": 7.040704070407041, + "grad_norm": 0.30040621757507324, + "learning_rate": 4.093360243138015e-05, + "loss": 0.0236, + "num_input_tokens_seen": 13505184, + "step": 64000 + }, + { + "epoch": 7.041254125412541, + "grad_norm": 0.07342889159917831, + "learning_rate": 4.093175291537812e-05, + "loss": 0.0686, + "num_input_tokens_seen": 13506272, + "step": 64005 + }, + { + "epoch": 7.041804180418042, + "grad_norm": 0.5424259901046753, + "learning_rate": 4.092990325254077e-05, + "loss": 0.0834, + "num_input_tokens_seen": 13507328, + "step": 64010 + }, + { + "epoch": 7.042354235423542, + "grad_norm": 0.3499722182750702, + "learning_rate": 4.0928053442885146e-05, + "loss": 0.0315, + "num_input_tokens_seen": 13508384, + "step": 64015 + }, + { + "epoch": 7.042904290429043, + "grad_norm": 0.06585072726011276, + "learning_rate": 4.09262034864283e-05, + "loss": 0.1215, + "num_input_tokens_seen": 13509440, + "step": 64020 + }, + { + "epoch": 7.043454345434544, + "grad_norm": 0.03355809673666954, + "learning_rate": 4.0924353383187265e-05, + "loss": 0.0649, + "num_input_tokens_seen": 13510528, + "step": 64025 + }, + { + "epoch": 7.044004400440044, + "grad_norm": 0.007212810683995485, + "learning_rate": 4.0922503133179114e-05, + "loss": 0.0281, + "num_input_tokens_seen": 13511584, + "step": 64030 + }, + { + "epoch": 7.044554455445544, + "grad_norm": 0.1678893119096756, + "learning_rate": 4.09206527364209e-05, + "loss": 0.0129, + "num_input_tokens_seen": 13512640, + "step": 64035 + }, + { + "epoch": 7.045104510451045, + "grad_norm": 0.04493722319602966, + "learning_rate": 4.091880219292966e-05, + "loss": 0.0187, + "num_input_tokens_seen": 13513632, + "step": 64040 + }, + { + "epoch": 7.0456545654565454, + "grad_norm": 0.1748335212469101, + "learning_rate": 4.0916951502722464e-05, + "loss": 0.0105, + "num_input_tokens_seen": 13514656, + "step": 64045 + }, + { + "epoch": 7.0462046204620465, + "grad_norm": 0.5085019469261169, + "learning_rate": 4.091510066581636e-05, + "loss": 0.0228, + "num_input_tokens_seen": 13515680, + "step": 64050 + }, + { + "epoch": 7.046754675467547, + "grad_norm": 0.3142580986022949, + "learning_rate": 4.091324968222841e-05, + "loss": 0.0149, + "num_input_tokens_seen": 13516736, + "step": 64055 + }, + { + "epoch": 7.047304730473047, + "grad_norm": 0.28426796197891235, + "learning_rate": 4.091139855197568e-05, + "loss": 0.0095, + "num_input_tokens_seen": 13517824, + "step": 64060 + }, + { + "epoch": 7.047854785478548, + "grad_norm": 0.023220503702759743, + "learning_rate": 4.090954727507522e-05, + "loss": 0.0819, + "num_input_tokens_seen": 13518848, + "step": 64065 + }, + { + "epoch": 7.048404840484048, + "grad_norm": 0.01369461975991726, + "learning_rate": 4.090769585154409e-05, + "loss": 0.0144, + "num_input_tokens_seen": 13519904, + "step": 64070 + }, + { + "epoch": 7.048954895489549, + "grad_norm": 1.183200478553772, + "learning_rate": 4.090584428139937e-05, + "loss": 0.0249, + "num_input_tokens_seen": 13520960, + "step": 64075 + }, + { + "epoch": 7.0495049504950495, + "grad_norm": 0.8213006258010864, + "learning_rate": 4.0903992564658114e-05, + "loss": 0.0275, + "num_input_tokens_seen": 13522016, + "step": 64080 + }, + { + "epoch": 7.05005500550055, + "grad_norm": 0.576572835445404, + "learning_rate": 4.090214070133739e-05, + "loss": 0.0092, + "num_input_tokens_seen": 13523040, + "step": 64085 + }, + { + "epoch": 7.050605060506051, + "grad_norm": 0.030346186831593513, + "learning_rate": 4.090028869145426e-05, + "loss": 0.0116, + "num_input_tokens_seen": 13524064, + "step": 64090 + }, + { + "epoch": 7.051155115511551, + "grad_norm": 1.0197157859802246, + "learning_rate": 4.0898436535025806e-05, + "loss": 0.0527, + "num_input_tokens_seen": 13525184, + "step": 64095 + }, + { + "epoch": 7.051705170517051, + "grad_norm": 0.15437765419483185, + "learning_rate": 4.0896584232069084e-05, + "loss": 0.022, + "num_input_tokens_seen": 13526208, + "step": 64100 + }, + { + "epoch": 7.052255225522552, + "grad_norm": 2.1711716651916504, + "learning_rate": 4.089473178260118e-05, + "loss": 0.1603, + "num_input_tokens_seen": 13527264, + "step": 64105 + }, + { + "epoch": 7.052805280528053, + "grad_norm": 0.04557792842388153, + "learning_rate": 4.089287918663916e-05, + "loss": 0.0319, + "num_input_tokens_seen": 13528256, + "step": 64110 + }, + { + "epoch": 7.053355335533554, + "grad_norm": 0.059230390936136246, + "learning_rate": 4.08910264442001e-05, + "loss": 0.0113, + "num_input_tokens_seen": 13529312, + "step": 64115 + }, + { + "epoch": 7.053905390539054, + "grad_norm": 0.008203419856727123, + "learning_rate": 4.0889173555301066e-05, + "loss": 0.0077, + "num_input_tokens_seen": 13530464, + "step": 64120 + }, + { + "epoch": 7.054455445544554, + "grad_norm": 0.12579748034477234, + "learning_rate": 4.088732051995915e-05, + "loss": 0.1716, + "num_input_tokens_seen": 13531552, + "step": 64125 + }, + { + "epoch": 7.055005500550055, + "grad_norm": 0.6169193983078003, + "learning_rate": 4.0885467338191426e-05, + "loss": 0.0277, + "num_input_tokens_seen": 13532640, + "step": 64130 + }, + { + "epoch": 7.055555555555555, + "grad_norm": 0.7740669250488281, + "learning_rate": 4.0883614010014966e-05, + "loss": 0.0585, + "num_input_tokens_seen": 13533664, + "step": 64135 + }, + { + "epoch": 7.0561056105610565, + "grad_norm": 0.9165868163108826, + "learning_rate": 4.088176053544686e-05, + "loss": 0.0185, + "num_input_tokens_seen": 13534752, + "step": 64140 + }, + { + "epoch": 7.056655665566557, + "grad_norm": 1.7390363216400146, + "learning_rate": 4.087990691450419e-05, + "loss": 0.0624, + "num_input_tokens_seen": 13535776, + "step": 64145 + }, + { + "epoch": 7.057205720572057, + "grad_norm": 1.5937559604644775, + "learning_rate": 4.087805314720403e-05, + "loss": 0.0444, + "num_input_tokens_seen": 13536800, + "step": 64150 + }, + { + "epoch": 7.057755775577558, + "grad_norm": 0.04774663597345352, + "learning_rate": 4.0876199233563485e-05, + "loss": 0.0249, + "num_input_tokens_seen": 13537888, + "step": 64155 + }, + { + "epoch": 7.058305830583058, + "grad_norm": 0.01282127108424902, + "learning_rate": 4.0874345173599625e-05, + "loss": 0.0423, + "num_input_tokens_seen": 13538944, + "step": 64160 + }, + { + "epoch": 7.058855885588559, + "grad_norm": 0.005155283957719803, + "learning_rate": 4.087249096732954e-05, + "loss": 0.0092, + "num_input_tokens_seen": 13539936, + "step": 64165 + }, + { + "epoch": 7.0594059405940595, + "grad_norm": 0.794340193271637, + "learning_rate": 4.0870636614770326e-05, + "loss": 0.0913, + "num_input_tokens_seen": 13541024, + "step": 64170 + }, + { + "epoch": 7.05995599559956, + "grad_norm": 0.24671593308448792, + "learning_rate": 4.0868782115939066e-05, + "loss": 0.0179, + "num_input_tokens_seen": 13542016, + "step": 64175 + }, + { + "epoch": 7.060506050605061, + "grad_norm": 0.09093166887760162, + "learning_rate": 4.0866927470852856e-05, + "loss": 0.0074, + "num_input_tokens_seen": 13543072, + "step": 64180 + }, + { + "epoch": 7.061056105610561, + "grad_norm": 0.008785414509475231, + "learning_rate": 4.086507267952879e-05, + "loss": 0.0036, + "num_input_tokens_seen": 13544160, + "step": 64185 + }, + { + "epoch": 7.061606160616061, + "grad_norm": 0.2414504587650299, + "learning_rate": 4.086321774198397e-05, + "loss": 0.0668, + "num_input_tokens_seen": 13545216, + "step": 64190 + }, + { + "epoch": 7.062156215621562, + "grad_norm": 0.11755643039941788, + "learning_rate": 4.086136265823547e-05, + "loss": 0.0068, + "num_input_tokens_seen": 13546208, + "step": 64195 + }, + { + "epoch": 7.0627062706270625, + "grad_norm": 0.6631919741630554, + "learning_rate": 4.085950742830042e-05, + "loss": 0.0674, + "num_input_tokens_seen": 13547264, + "step": 64200 + }, + { + "epoch": 7.063256325632564, + "grad_norm": 0.0469072163105011, + "learning_rate": 4.085765205219588e-05, + "loss": 0.0246, + "num_input_tokens_seen": 13548352, + "step": 64205 + }, + { + "epoch": 7.063806380638064, + "grad_norm": 0.04797784239053726, + "learning_rate": 4.0855796529938986e-05, + "loss": 0.0159, + "num_input_tokens_seen": 13549344, + "step": 64210 + }, + { + "epoch": 7.064356435643564, + "grad_norm": 0.9281363487243652, + "learning_rate": 4.085394086154681e-05, + "loss": 0.0468, + "num_input_tokens_seen": 13550400, + "step": 64215 + }, + { + "epoch": 7.064906490649065, + "grad_norm": 0.08869562298059464, + "learning_rate": 4.085208504703647e-05, + "loss": 0.1071, + "num_input_tokens_seen": 13551520, + "step": 64220 + }, + { + "epoch": 7.065456545654565, + "grad_norm": 1.9743773937225342, + "learning_rate": 4.085022908642508e-05, + "loss": 0.151, + "num_input_tokens_seen": 13552544, + "step": 64225 + }, + { + "epoch": 7.066006600660066, + "grad_norm": 0.016067517921328545, + "learning_rate": 4.084837297972973e-05, + "loss": 0.0463, + "num_input_tokens_seen": 13553536, + "step": 64230 + }, + { + "epoch": 7.066556655665567, + "grad_norm": 0.43360331654548645, + "learning_rate": 4.0846516726967524e-05, + "loss": 0.0251, + "num_input_tokens_seen": 13554592, + "step": 64235 + }, + { + "epoch": 7.067106710671067, + "grad_norm": 0.32388243079185486, + "learning_rate": 4.084466032815558e-05, + "loss": 0.0202, + "num_input_tokens_seen": 13555712, + "step": 64240 + }, + { + "epoch": 7.067656765676568, + "grad_norm": 0.13271646201610565, + "learning_rate": 4.0842803783311014e-05, + "loss": 0.0145, + "num_input_tokens_seen": 13556768, + "step": 64245 + }, + { + "epoch": 7.068206820682068, + "grad_norm": 0.05487137287855148, + "learning_rate": 4.084094709245092e-05, + "loss": 0.0107, + "num_input_tokens_seen": 13557856, + "step": 64250 + }, + { + "epoch": 7.068756875687569, + "grad_norm": 0.023034831508994102, + "learning_rate": 4.0839090255592415e-05, + "loss": 0.0179, + "num_input_tokens_seen": 13558880, + "step": 64255 + }, + { + "epoch": 7.069306930693069, + "grad_norm": 0.2760298550128937, + "learning_rate": 4.083723327275262e-05, + "loss": 0.0499, + "num_input_tokens_seen": 13560000, + "step": 64260 + }, + { + "epoch": 7.06985698569857, + "grad_norm": 0.9125409722328186, + "learning_rate": 4.083537614394865e-05, + "loss": 0.0722, + "num_input_tokens_seen": 13561120, + "step": 64265 + }, + { + "epoch": 7.070407040704071, + "grad_norm": 0.5048150420188904, + "learning_rate": 4.083351886919761e-05, + "loss": 0.0134, + "num_input_tokens_seen": 13562208, + "step": 64270 + }, + { + "epoch": 7.070957095709571, + "grad_norm": 0.043830305337905884, + "learning_rate": 4.083166144851662e-05, + "loss": 0.0116, + "num_input_tokens_seen": 13563328, + "step": 64275 + }, + { + "epoch": 7.071507150715071, + "grad_norm": 0.2762090563774109, + "learning_rate": 4.0829803881922805e-05, + "loss": 0.1586, + "num_input_tokens_seen": 13564352, + "step": 64280 + }, + { + "epoch": 7.072057205720572, + "grad_norm": 0.09804712235927582, + "learning_rate": 4.082794616943329e-05, + "loss": 0.0078, + "num_input_tokens_seen": 13565376, + "step": 64285 + }, + { + "epoch": 7.072607260726072, + "grad_norm": 0.4352259039878845, + "learning_rate": 4.0826088311065185e-05, + "loss": 0.0856, + "num_input_tokens_seen": 13566432, + "step": 64290 + }, + { + "epoch": 7.0731573157315735, + "grad_norm": 0.319375604391098, + "learning_rate": 4.082423030683562e-05, + "loss": 0.0184, + "num_input_tokens_seen": 13567488, + "step": 64295 + }, + { + "epoch": 7.073707370737074, + "grad_norm": 0.013880395330488682, + "learning_rate": 4.082237215676172e-05, + "loss": 0.007, + "num_input_tokens_seen": 13568512, + "step": 64300 + }, + { + "epoch": 7.074257425742574, + "grad_norm": 3.119432210922241, + "learning_rate": 4.0820513860860606e-05, + "loss": 0.1028, + "num_input_tokens_seen": 13569536, + "step": 64305 + }, + { + "epoch": 7.074807480748075, + "grad_norm": 0.016448087990283966, + "learning_rate": 4.08186554191494e-05, + "loss": 0.0307, + "num_input_tokens_seen": 13570560, + "step": 64310 + }, + { + "epoch": 7.075357535753575, + "grad_norm": 0.33636412024497986, + "learning_rate": 4.081679683164525e-05, + "loss": 0.1082, + "num_input_tokens_seen": 13571520, + "step": 64315 + }, + { + "epoch": 7.075907590759076, + "grad_norm": 0.030363017693161964, + "learning_rate": 4.081493809836528e-05, + "loss": 0.0253, + "num_input_tokens_seen": 13572576, + "step": 64320 + }, + { + "epoch": 7.0764576457645765, + "grad_norm": 0.010646946728229523, + "learning_rate": 4.081307921932659e-05, + "loss": 0.0236, + "num_input_tokens_seen": 13573600, + "step": 64325 + }, + { + "epoch": 7.077007700770077, + "grad_norm": 0.09615601599216461, + "learning_rate": 4.081122019454635e-05, + "loss": 0.0698, + "num_input_tokens_seen": 13574656, + "step": 64330 + }, + { + "epoch": 7.077557755775578, + "grad_norm": 0.17996066808700562, + "learning_rate": 4.0809361024041684e-05, + "loss": 0.0871, + "num_input_tokens_seen": 13575648, + "step": 64335 + }, + { + "epoch": 7.078107810781078, + "grad_norm": 0.004446197766810656, + "learning_rate": 4.080750170782971e-05, + "loss": 0.1174, + "num_input_tokens_seen": 13576800, + "step": 64340 + }, + { + "epoch": 7.078657865786579, + "grad_norm": 0.6956964135169983, + "learning_rate": 4.080564224592759e-05, + "loss": 0.1083, + "num_input_tokens_seen": 13577856, + "step": 64345 + }, + { + "epoch": 7.079207920792079, + "grad_norm": 0.1472562551498413, + "learning_rate": 4.080378263835245e-05, + "loss": 0.0446, + "num_input_tokens_seen": 13578880, + "step": 64350 + }, + { + "epoch": 7.0797579757975795, + "grad_norm": 0.06082082912325859, + "learning_rate": 4.0801922885121424e-05, + "loss": 0.0157, + "num_input_tokens_seen": 13579904, + "step": 64355 + }, + { + "epoch": 7.080308030803081, + "grad_norm": 0.08031733334064484, + "learning_rate": 4.0800062986251654e-05, + "loss": 0.0026, + "num_input_tokens_seen": 13581024, + "step": 64360 + }, + { + "epoch": 7.080858085808581, + "grad_norm": 0.04089433327317238, + "learning_rate": 4.079820294176029e-05, + "loss": 0.2415, + "num_input_tokens_seen": 13582112, + "step": 64365 + }, + { + "epoch": 7.081408140814081, + "grad_norm": 0.038957662880420685, + "learning_rate": 4.0796342751664474e-05, + "loss": 0.0095, + "num_input_tokens_seen": 13583232, + "step": 64370 + }, + { + "epoch": 7.081958195819582, + "grad_norm": 0.03187446668744087, + "learning_rate": 4.079448241598134e-05, + "loss": 0.0035, + "num_input_tokens_seen": 13584224, + "step": 64375 + }, + { + "epoch": 7.082508250825082, + "grad_norm": 1.2747790813446045, + "learning_rate": 4.079262193472804e-05, + "loss": 0.0596, + "num_input_tokens_seen": 13585312, + "step": 64380 + }, + { + "epoch": 7.083058305830583, + "grad_norm": 0.043077342212200165, + "learning_rate": 4.079076130792172e-05, + "loss": 0.0241, + "num_input_tokens_seen": 13586368, + "step": 64385 + }, + { + "epoch": 7.083608360836084, + "grad_norm": 0.4612657129764557, + "learning_rate": 4.078890053557954e-05, + "loss": 0.018, + "num_input_tokens_seen": 13587456, + "step": 64390 + }, + { + "epoch": 7.084158415841584, + "grad_norm": 0.1494143307209015, + "learning_rate": 4.078703961771864e-05, + "loss": 0.0161, + "num_input_tokens_seen": 13588512, + "step": 64395 + }, + { + "epoch": 7.084708470847085, + "grad_norm": 0.0077137574553489685, + "learning_rate": 4.0785178554356166e-05, + "loss": 0.0043, + "num_input_tokens_seen": 13589536, + "step": 64400 + }, + { + "epoch": 7.085258525852585, + "grad_norm": 0.017818184569478035, + "learning_rate": 4.078331734550927e-05, + "loss": 0.0354, + "num_input_tokens_seen": 13590624, + "step": 64405 + }, + { + "epoch": 7.085808580858086, + "grad_norm": 0.40552225708961487, + "learning_rate": 4.0781455991195115e-05, + "loss": 0.0726, + "num_input_tokens_seen": 13591744, + "step": 64410 + }, + { + "epoch": 7.086358635863586, + "grad_norm": 0.009440410882234573, + "learning_rate": 4.077959449143086e-05, + "loss": 0.0522, + "num_input_tokens_seen": 13592768, + "step": 64415 + }, + { + "epoch": 7.086908690869087, + "grad_norm": 0.21761566400527954, + "learning_rate": 4.077773284623365e-05, + "loss": 0.0632, + "num_input_tokens_seen": 13593824, + "step": 64420 + }, + { + "epoch": 7.087458745874588, + "grad_norm": 0.21073298156261444, + "learning_rate": 4.077587105562064e-05, + "loss": 0.0331, + "num_input_tokens_seen": 13594880, + "step": 64425 + }, + { + "epoch": 7.088008800880088, + "grad_norm": 0.03338317200541496, + "learning_rate": 4.0774009119609005e-05, + "loss": 0.0428, + "num_input_tokens_seen": 13595968, + "step": 64430 + }, + { + "epoch": 7.088558855885589, + "grad_norm": 0.00971637200564146, + "learning_rate": 4.077214703821589e-05, + "loss": 0.0138, + "num_input_tokens_seen": 13597024, + "step": 64435 + }, + { + "epoch": 7.089108910891089, + "grad_norm": 0.3826289474964142, + "learning_rate": 4.077028481145847e-05, + "loss": 0.0177, + "num_input_tokens_seen": 13598048, + "step": 64440 + }, + { + "epoch": 7.089658965896589, + "grad_norm": 0.01766391284763813, + "learning_rate": 4.076842243935391e-05, + "loss": 0.004, + "num_input_tokens_seen": 13599168, + "step": 64445 + }, + { + "epoch": 7.0902090209020905, + "grad_norm": 0.005322797689586878, + "learning_rate": 4.076655992191936e-05, + "loss": 0.0243, + "num_input_tokens_seen": 13600192, + "step": 64450 + }, + { + "epoch": 7.090759075907591, + "grad_norm": 0.03160294517874718, + "learning_rate": 4.076469725917198e-05, + "loss": 0.0441, + "num_input_tokens_seen": 13601216, + "step": 64455 + }, + { + "epoch": 7.091309130913091, + "grad_norm": 0.008405142463743687, + "learning_rate": 4.076283445112897e-05, + "loss": 0.0098, + "num_input_tokens_seen": 13602208, + "step": 64460 + }, + { + "epoch": 7.091859185918592, + "grad_norm": 0.013482715934515, + "learning_rate": 4.076097149780746e-05, + "loss": 0.0042, + "num_input_tokens_seen": 13603264, + "step": 64465 + }, + { + "epoch": 7.092409240924092, + "grad_norm": 0.0492287240922451, + "learning_rate": 4.0759108399224656e-05, + "loss": 0.0089, + "num_input_tokens_seen": 13604384, + "step": 64470 + }, + { + "epoch": 7.092959295929593, + "grad_norm": 0.03166879713535309, + "learning_rate": 4.075724515539771e-05, + "loss": 0.0133, + "num_input_tokens_seen": 13605376, + "step": 64475 + }, + { + "epoch": 7.0935093509350935, + "grad_norm": 0.44711360335350037, + "learning_rate": 4.075538176634378e-05, + "loss": 0.0515, + "num_input_tokens_seen": 13606464, + "step": 64480 + }, + { + "epoch": 7.094059405940594, + "grad_norm": 0.008102552965283394, + "learning_rate": 4.0753518232080066e-05, + "loss": 0.0027, + "num_input_tokens_seen": 13607488, + "step": 64485 + }, + { + "epoch": 7.094609460946095, + "grad_norm": 0.03963395580649376, + "learning_rate": 4.0751654552623735e-05, + "loss": 0.0043, + "num_input_tokens_seen": 13608512, + "step": 64490 + }, + { + "epoch": 7.095159515951595, + "grad_norm": 1.5800552368164062, + "learning_rate": 4.074979072799197e-05, + "loss": 0.0816, + "num_input_tokens_seen": 13609568, + "step": 64495 + }, + { + "epoch": 7.095709570957096, + "grad_norm": 2.730499267578125, + "learning_rate": 4.0747926758201927e-05, + "loss": 0.066, + "num_input_tokens_seen": 13610560, + "step": 64500 + }, + { + "epoch": 7.096259625962596, + "grad_norm": 0.06493518501520157, + "learning_rate": 4.0746062643270805e-05, + "loss": 0.108, + "num_input_tokens_seen": 13611648, + "step": 64505 + }, + { + "epoch": 7.0968096809680965, + "grad_norm": 0.11074808239936829, + "learning_rate": 4.074419838321578e-05, + "loss": 0.1209, + "num_input_tokens_seen": 13612704, + "step": 64510 + }, + { + "epoch": 7.097359735973598, + "grad_norm": 0.9327032566070557, + "learning_rate": 4.074233397805404e-05, + "loss": 0.0709, + "num_input_tokens_seen": 13613760, + "step": 64515 + }, + { + "epoch": 7.097909790979098, + "grad_norm": 0.07106907665729523, + "learning_rate": 4.0740469427802754e-05, + "loss": 0.042, + "num_input_tokens_seen": 13614816, + "step": 64520 + }, + { + "epoch": 7.098459845984599, + "grad_norm": 0.8543082475662231, + "learning_rate": 4.0738604732479115e-05, + "loss": 0.034, + "num_input_tokens_seen": 13615808, + "step": 64525 + }, + { + "epoch": 7.099009900990099, + "grad_norm": 0.44481298327445984, + "learning_rate": 4.07367398921003e-05, + "loss": 0.0324, + "num_input_tokens_seen": 13616896, + "step": 64530 + }, + { + "epoch": 7.099559955995599, + "grad_norm": 0.00688532181084156, + "learning_rate": 4.073487490668352e-05, + "loss": 0.0848, + "num_input_tokens_seen": 13617920, + "step": 64535 + }, + { + "epoch": 7.1001100110011, + "grad_norm": 1.3696894645690918, + "learning_rate": 4.073300977624594e-05, + "loss": 0.0268, + "num_input_tokens_seen": 13618944, + "step": 64540 + }, + { + "epoch": 7.100660066006601, + "grad_norm": 0.6510699987411499, + "learning_rate": 4.0731144500804755e-05, + "loss": 0.022, + "num_input_tokens_seen": 13619968, + "step": 64545 + }, + { + "epoch": 7.101210121012101, + "grad_norm": 1.2250466346740723, + "learning_rate": 4.072927908037717e-05, + "loss": 0.0599, + "num_input_tokens_seen": 13620960, + "step": 64550 + }, + { + "epoch": 7.101760176017602, + "grad_norm": 1.368594765663147, + "learning_rate": 4.072741351498036e-05, + "loss": 0.0202, + "num_input_tokens_seen": 13622048, + "step": 64555 + }, + { + "epoch": 7.102310231023102, + "grad_norm": 0.7683954238891602, + "learning_rate": 4.072554780463153e-05, + "loss": 0.1009, + "num_input_tokens_seen": 13623040, + "step": 64560 + }, + { + "epoch": 7.102860286028603, + "grad_norm": 0.40754902362823486, + "learning_rate": 4.072368194934787e-05, + "loss": 0.1071, + "num_input_tokens_seen": 13624064, + "step": 64565 + }, + { + "epoch": 7.103410341034103, + "grad_norm": 0.05304509773850441, + "learning_rate": 4.072181594914658e-05, + "loss": 0.0264, + "num_input_tokens_seen": 13625152, + "step": 64570 + }, + { + "epoch": 7.103960396039604, + "grad_norm": 0.4488976299762726, + "learning_rate": 4.0719949804044857e-05, + "loss": 0.0163, + "num_input_tokens_seen": 13626240, + "step": 64575 + }, + { + "epoch": 7.104510451045105, + "grad_norm": 0.01962287910282612, + "learning_rate": 4.07180835140599e-05, + "loss": 0.0554, + "num_input_tokens_seen": 13627264, + "step": 64580 + }, + { + "epoch": 7.105060506050605, + "grad_norm": 0.020389437675476074, + "learning_rate": 4.07162170792089e-05, + "loss": 0.0245, + "num_input_tokens_seen": 13628288, + "step": 64585 + }, + { + "epoch": 7.105610561056106, + "grad_norm": 0.31089234352111816, + "learning_rate": 4.071435049950908e-05, + "loss": 0.0271, + "num_input_tokens_seen": 13629280, + "step": 64590 + }, + { + "epoch": 7.106160616061606, + "grad_norm": 0.16992805898189545, + "learning_rate": 4.0712483774977623e-05, + "loss": 0.1145, + "num_input_tokens_seen": 13630368, + "step": 64595 + }, + { + "epoch": 7.106710671067106, + "grad_norm": 0.011042671278119087, + "learning_rate": 4.071061690563175e-05, + "loss": 0.1081, + "num_input_tokens_seen": 13631424, + "step": 64600 + }, + { + "epoch": 7.1072607260726075, + "grad_norm": 0.5490486025810242, + "learning_rate": 4.070874989148866e-05, + "loss": 0.023, + "num_input_tokens_seen": 13632480, + "step": 64605 + }, + { + "epoch": 7.107810781078108, + "grad_norm": 1.187293529510498, + "learning_rate": 4.070688273256556e-05, + "loss": 0.0723, + "num_input_tokens_seen": 13633536, + "step": 64610 + }, + { + "epoch": 7.108360836083609, + "grad_norm": 2.828065872192383, + "learning_rate": 4.070501542887965e-05, + "loss": 0.0397, + "num_input_tokens_seen": 13634496, + "step": 64615 + }, + { + "epoch": 7.108910891089109, + "grad_norm": 0.9472057819366455, + "learning_rate": 4.070314798044815e-05, + "loss": 0.0998, + "num_input_tokens_seen": 13635584, + "step": 64620 + }, + { + "epoch": 7.109460946094609, + "grad_norm": 0.5401446223258972, + "learning_rate": 4.070128038728828e-05, + "loss": 0.0117, + "num_input_tokens_seen": 13636544, + "step": 64625 + }, + { + "epoch": 7.11001100110011, + "grad_norm": 0.8595551252365112, + "learning_rate": 4.069941264941723e-05, + "loss": 0.0766, + "num_input_tokens_seen": 13637600, + "step": 64630 + }, + { + "epoch": 7.1105610561056105, + "grad_norm": 2.157412528991699, + "learning_rate": 4.0697544766852234e-05, + "loss": 0.0532, + "num_input_tokens_seen": 13638656, + "step": 64635 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.1116265207529068, + "learning_rate": 4.0695676739610506e-05, + "loss": 0.0096, + "num_input_tokens_seen": 13639680, + "step": 64640 + }, + { + "epoch": 7.111661166116612, + "grad_norm": 0.8819917440414429, + "learning_rate": 4.069380856770925e-05, + "loss": 0.0273, + "num_input_tokens_seen": 13640736, + "step": 64645 + }, + { + "epoch": 7.112211221122112, + "grad_norm": 0.10529205203056335, + "learning_rate": 4.069194025116569e-05, + "loss": 0.0332, + "num_input_tokens_seen": 13641824, + "step": 64650 + }, + { + "epoch": 7.112761276127613, + "grad_norm": 0.23692157864570618, + "learning_rate": 4.0690071789997055e-05, + "loss": 0.0098, + "num_input_tokens_seen": 13642880, + "step": 64655 + }, + { + "epoch": 7.113311331133113, + "grad_norm": 0.01830802857875824, + "learning_rate": 4.0688203184220555e-05, + "loss": 0.0139, + "num_input_tokens_seen": 13643904, + "step": 64660 + }, + { + "epoch": 7.1138613861386135, + "grad_norm": 0.07983198016881943, + "learning_rate": 4.068633443385341e-05, + "loss": 0.0251, + "num_input_tokens_seen": 13644896, + "step": 64665 + }, + { + "epoch": 7.114411441144115, + "grad_norm": 0.014847246930003166, + "learning_rate": 4.0684465538912845e-05, + "loss": 0.005, + "num_input_tokens_seen": 13645888, + "step": 64670 + }, + { + "epoch": 7.114961496149615, + "grad_norm": 0.03278225660324097, + "learning_rate": 4.068259649941609e-05, + "loss": 0.117, + "num_input_tokens_seen": 13646976, + "step": 64675 + }, + { + "epoch": 7.115511551155116, + "grad_norm": 0.027082078158855438, + "learning_rate": 4.068072731538037e-05, + "loss": 0.0083, + "num_input_tokens_seen": 13648064, + "step": 64680 + }, + { + "epoch": 7.116061606160616, + "grad_norm": 0.04632548242807388, + "learning_rate": 4.067885798682291e-05, + "loss": 0.0064, + "num_input_tokens_seen": 13649152, + "step": 64685 + }, + { + "epoch": 7.116611661166116, + "grad_norm": 1.2399548292160034, + "learning_rate": 4.067698851376094e-05, + "loss": 0.0416, + "num_input_tokens_seen": 13650208, + "step": 64690 + }, + { + "epoch": 7.117161716171617, + "grad_norm": 0.05903025344014168, + "learning_rate": 4.067511889621168e-05, + "loss": 0.0296, + "num_input_tokens_seen": 13651264, + "step": 64695 + }, + { + "epoch": 7.117711771177118, + "grad_norm": 0.2477681189775467, + "learning_rate": 4.067324913419239e-05, + "loss": 0.0421, + "num_input_tokens_seen": 13652320, + "step": 64700 + }, + { + "epoch": 7.118261826182618, + "grad_norm": 3.1074323654174805, + "learning_rate": 4.067137922772027e-05, + "loss": 0.0539, + "num_input_tokens_seen": 13653376, + "step": 64705 + }, + { + "epoch": 7.118811881188119, + "grad_norm": 0.007135886233299971, + "learning_rate": 4.066950917681257e-05, + "loss": 0.0274, + "num_input_tokens_seen": 13654432, + "step": 64710 + }, + { + "epoch": 7.119361936193619, + "grad_norm": 0.06537330150604248, + "learning_rate": 4.066763898148652e-05, + "loss": 0.0358, + "num_input_tokens_seen": 13655488, + "step": 64715 + }, + { + "epoch": 7.11991199119912, + "grad_norm": 0.031726617366075516, + "learning_rate": 4.066576864175936e-05, + "loss": 0.0178, + "num_input_tokens_seen": 13656576, + "step": 64720 + }, + { + "epoch": 7.12046204620462, + "grad_norm": 0.10843367129564285, + "learning_rate": 4.066389815764834e-05, + "loss": 0.0265, + "num_input_tokens_seen": 13657728, + "step": 64725 + }, + { + "epoch": 7.121012101210121, + "grad_norm": 0.4444946050643921, + "learning_rate": 4.066202752917067e-05, + "loss": 0.0506, + "num_input_tokens_seen": 13658752, + "step": 64730 + }, + { + "epoch": 7.121562156215622, + "grad_norm": 1.0744903087615967, + "learning_rate": 4.066015675634362e-05, + "loss": 0.0914, + "num_input_tokens_seen": 13659872, + "step": 64735 + }, + { + "epoch": 7.122112211221122, + "grad_norm": 2.8578040599823, + "learning_rate": 4.065828583918441e-05, + "loss": 0.0953, + "num_input_tokens_seen": 13660960, + "step": 64740 + }, + { + "epoch": 7.122662266226623, + "grad_norm": 0.9573668241500854, + "learning_rate": 4.065641477771029e-05, + "loss": 0.0834, + "num_input_tokens_seen": 13662048, + "step": 64745 + }, + { + "epoch": 7.123212321232123, + "grad_norm": 0.13078096508979797, + "learning_rate": 4.0654543571938526e-05, + "loss": 0.1165, + "num_input_tokens_seen": 13663136, + "step": 64750 + }, + { + "epoch": 7.123762376237623, + "grad_norm": 0.6021547913551331, + "learning_rate": 4.065267222188633e-05, + "loss": 0.0141, + "num_input_tokens_seen": 13664128, + "step": 64755 + }, + { + "epoch": 7.1243124312431245, + "grad_norm": 0.015813881531357765, + "learning_rate": 4.065080072757097e-05, + "loss": 0.0075, + "num_input_tokens_seen": 13665152, + "step": 64760 + }, + { + "epoch": 7.124862486248625, + "grad_norm": 0.7613192200660706, + "learning_rate": 4.0648929089009694e-05, + "loss": 0.0842, + "num_input_tokens_seen": 13666176, + "step": 64765 + }, + { + "epoch": 7.125412541254126, + "grad_norm": 0.036606743931770325, + "learning_rate": 4.0647057306219746e-05, + "loss": 0.0491, + "num_input_tokens_seen": 13667200, + "step": 64770 + }, + { + "epoch": 7.125962596259626, + "grad_norm": 0.8600496649742126, + "learning_rate": 4.064518537921837e-05, + "loss": 0.0323, + "num_input_tokens_seen": 13668256, + "step": 64775 + }, + { + "epoch": 7.126512651265126, + "grad_norm": 0.37243497371673584, + "learning_rate": 4.064331330802283e-05, + "loss": 0.0397, + "num_input_tokens_seen": 13669280, + "step": 64780 + }, + { + "epoch": 7.127062706270627, + "grad_norm": 0.20576268434524536, + "learning_rate": 4.064144109265039e-05, + "loss": 0.0829, + "num_input_tokens_seen": 13670304, + "step": 64785 + }, + { + "epoch": 7.1276127612761275, + "grad_norm": 0.5846989750862122, + "learning_rate": 4.063956873311828e-05, + "loss": 0.0189, + "num_input_tokens_seen": 13671296, + "step": 64790 + }, + { + "epoch": 7.128162816281628, + "grad_norm": 0.03543012961745262, + "learning_rate": 4.063769622944378e-05, + "loss": 0.0428, + "num_input_tokens_seen": 13672352, + "step": 64795 + }, + { + "epoch": 7.128712871287129, + "grad_norm": 0.1735706329345703, + "learning_rate": 4.0635823581644126e-05, + "loss": 0.0324, + "num_input_tokens_seen": 13673376, + "step": 64800 + }, + { + "epoch": 7.129262926292629, + "grad_norm": 0.012601642869412899, + "learning_rate": 4.06339507897366e-05, + "loss": 0.056, + "num_input_tokens_seen": 13674400, + "step": 64805 + }, + { + "epoch": 7.12981298129813, + "grad_norm": 0.4744659662246704, + "learning_rate": 4.063207785373844e-05, + "loss": 0.0127, + "num_input_tokens_seen": 13675488, + "step": 64810 + }, + { + "epoch": 7.13036303630363, + "grad_norm": 0.13069935142993927, + "learning_rate": 4.0630204773666925e-05, + "loss": 0.092, + "num_input_tokens_seen": 13676512, + "step": 64815 + }, + { + "epoch": 7.1309130913091305, + "grad_norm": 0.8723949193954468, + "learning_rate": 4.062833154953931e-05, + "loss": 0.0213, + "num_input_tokens_seen": 13677536, + "step": 64820 + }, + { + "epoch": 7.131463146314632, + "grad_norm": 0.0038673505187034607, + "learning_rate": 4.062645818137287e-05, + "loss": 0.0171, + "num_input_tokens_seen": 13678592, + "step": 64825 + }, + { + "epoch": 7.132013201320132, + "grad_norm": 0.12202700227499008, + "learning_rate": 4.0624584669184855e-05, + "loss": 0.0098, + "num_input_tokens_seen": 13679584, + "step": 64830 + }, + { + "epoch": 7.132563256325633, + "grad_norm": 0.5668461322784424, + "learning_rate": 4.062271101299254e-05, + "loss": 0.0369, + "num_input_tokens_seen": 13680672, + "step": 64835 + }, + { + "epoch": 7.133113311331133, + "grad_norm": 0.34226614236831665, + "learning_rate": 4.06208372128132e-05, + "loss": 0.1128, + "num_input_tokens_seen": 13681792, + "step": 64840 + }, + { + "epoch": 7.133663366336633, + "grad_norm": 0.39524099230766296, + "learning_rate": 4.06189632686641e-05, + "loss": 0.053, + "num_input_tokens_seen": 13682880, + "step": 64845 + }, + { + "epoch": 7.134213421342134, + "grad_norm": 0.03070572018623352, + "learning_rate": 4.0617089180562506e-05, + "loss": 0.0056, + "num_input_tokens_seen": 13683904, + "step": 64850 + }, + { + "epoch": 7.134763476347635, + "grad_norm": 0.08045511692762375, + "learning_rate": 4.061521494852569e-05, + "loss": 0.0161, + "num_input_tokens_seen": 13684992, + "step": 64855 + }, + { + "epoch": 7.135313531353136, + "grad_norm": 0.11328045278787613, + "learning_rate": 4.0613340572570944e-05, + "loss": 0.0921, + "num_input_tokens_seen": 13686016, + "step": 64860 + }, + { + "epoch": 7.135863586358636, + "grad_norm": 0.21978558599948883, + "learning_rate": 4.0611466052715525e-05, + "loss": 0.0155, + "num_input_tokens_seen": 13687104, + "step": 64865 + }, + { + "epoch": 7.136413641364136, + "grad_norm": 0.13567723333835602, + "learning_rate": 4.0609591388976707e-05, + "loss": 0.0217, + "num_input_tokens_seen": 13688256, + "step": 64870 + }, + { + "epoch": 7.136963696369637, + "grad_norm": 0.9590712785720825, + "learning_rate": 4.060771658137178e-05, + "loss": 0.1088, + "num_input_tokens_seen": 13689312, + "step": 64875 + }, + { + "epoch": 7.137513751375137, + "grad_norm": 0.059968944638967514, + "learning_rate": 4.060584162991802e-05, + "loss": 0.0163, + "num_input_tokens_seen": 13690304, + "step": 64880 + }, + { + "epoch": 7.138063806380638, + "grad_norm": 0.06764023005962372, + "learning_rate": 4.060396653463271e-05, + "loss": 0.0176, + "num_input_tokens_seen": 13691360, + "step": 64885 + }, + { + "epoch": 7.138613861386139, + "grad_norm": 0.006448822561651468, + "learning_rate": 4.060209129553313e-05, + "loss": 0.0106, + "num_input_tokens_seen": 13692416, + "step": 64890 + }, + { + "epoch": 7.139163916391639, + "grad_norm": 0.1466824859380722, + "learning_rate": 4.060021591263655e-05, + "loss": 0.0075, + "num_input_tokens_seen": 13693504, + "step": 64895 + }, + { + "epoch": 7.13971397139714, + "grad_norm": 0.5056488513946533, + "learning_rate": 4.059834038596026e-05, + "loss": 0.0104, + "num_input_tokens_seen": 13694592, + "step": 64900 + }, + { + "epoch": 7.14026402640264, + "grad_norm": 1.3484421968460083, + "learning_rate": 4.0596464715521566e-05, + "loss": 0.2444, + "num_input_tokens_seen": 13695680, + "step": 64905 + }, + { + "epoch": 7.1408140814081404, + "grad_norm": 0.013040470890700817, + "learning_rate": 4.059458890133774e-05, + "loss": 0.0078, + "num_input_tokens_seen": 13696800, + "step": 64910 + }, + { + "epoch": 7.1413641364136415, + "grad_norm": 0.030387917533516884, + "learning_rate": 4.059271294342607e-05, + "loss": 0.0753, + "num_input_tokens_seen": 13697824, + "step": 64915 + }, + { + "epoch": 7.141914191419142, + "grad_norm": 0.07569528371095657, + "learning_rate": 4.059083684180384e-05, + "loss": 0.0288, + "num_input_tokens_seen": 13698944, + "step": 64920 + }, + { + "epoch": 7.142464246424643, + "grad_norm": 0.007013200782239437, + "learning_rate": 4.058896059648836e-05, + "loss": 0.0091, + "num_input_tokens_seen": 13700000, + "step": 64925 + }, + { + "epoch": 7.143014301430143, + "grad_norm": 0.009767874144017696, + "learning_rate": 4.0587084207496895e-05, + "loss": 0.0456, + "num_input_tokens_seen": 13701120, + "step": 64930 + }, + { + "epoch": 7.143564356435643, + "grad_norm": 0.1087370216846466, + "learning_rate": 4.058520767484676e-05, + "loss": 0.0284, + "num_input_tokens_seen": 13702144, + "step": 64935 + }, + { + "epoch": 7.144114411441144, + "grad_norm": 0.009140247479081154, + "learning_rate": 4.0583330998555246e-05, + "loss": 0.0085, + "num_input_tokens_seen": 13703200, + "step": 64940 + }, + { + "epoch": 7.1446644664466445, + "grad_norm": 0.015441668219864368, + "learning_rate": 4.0581454178639654e-05, + "loss": 0.0052, + "num_input_tokens_seen": 13704320, + "step": 64945 + }, + { + "epoch": 7.145214521452146, + "grad_norm": 0.3751157522201538, + "learning_rate": 4.057957721511727e-05, + "loss": 0.0437, + "num_input_tokens_seen": 13705408, + "step": 64950 + }, + { + "epoch": 7.145764576457646, + "grad_norm": 1.2624021768569946, + "learning_rate": 4.05777001080054e-05, + "loss": 0.0743, + "num_input_tokens_seen": 13706496, + "step": 64955 + }, + { + "epoch": 7.146314631463146, + "grad_norm": 0.07343179732561111, + "learning_rate": 4.057582285732135e-05, + "loss": 0.0182, + "num_input_tokens_seen": 13707584, + "step": 64960 + }, + { + "epoch": 7.146864686468647, + "grad_norm": 0.7719160318374634, + "learning_rate": 4.05739454630824e-05, + "loss": 0.0439, + "num_input_tokens_seen": 13708672, + "step": 64965 + }, + { + "epoch": 7.147414741474147, + "grad_norm": 0.07492172718048096, + "learning_rate": 4.0572067925305866e-05, + "loss": 0.1109, + "num_input_tokens_seen": 13709760, + "step": 64970 + }, + { + "epoch": 7.1479647964796476, + "grad_norm": 0.7729910016059875, + "learning_rate": 4.057019024400906e-05, + "loss": 0.0823, + "num_input_tokens_seen": 13710816, + "step": 64975 + }, + { + "epoch": 7.148514851485149, + "grad_norm": 0.021886030212044716, + "learning_rate": 4.0568312419209286e-05, + "loss": 0.027, + "num_input_tokens_seen": 13711904, + "step": 64980 + }, + { + "epoch": 7.149064906490649, + "grad_norm": 0.048504091799259186, + "learning_rate": 4.056643445092385e-05, + "loss": 0.0119, + "num_input_tokens_seen": 13712960, + "step": 64985 + }, + { + "epoch": 7.14961496149615, + "grad_norm": 0.04917990416288376, + "learning_rate": 4.056455633917005e-05, + "loss": 0.0035, + "num_input_tokens_seen": 13713984, + "step": 64990 + }, + { + "epoch": 7.15016501650165, + "grad_norm": 0.10908743739128113, + "learning_rate": 4.05626780839652e-05, + "loss": 0.0062, + "num_input_tokens_seen": 13715104, + "step": 64995 + }, + { + "epoch": 7.15071507150715, + "grad_norm": 0.11545638740062714, + "learning_rate": 4.056079968532662e-05, + "loss": 0.053, + "num_input_tokens_seen": 13716096, + "step": 65000 + }, + { + "epoch": 7.1512651265126514, + "grad_norm": 0.027721915394067764, + "learning_rate": 4.055892114327162e-05, + "loss": 0.1243, + "num_input_tokens_seen": 13717088, + "step": 65005 + }, + { + "epoch": 7.151815181518152, + "grad_norm": 0.048355426639318466, + "learning_rate": 4.05570424578175e-05, + "loss": 0.0385, + "num_input_tokens_seen": 13718144, + "step": 65010 + }, + { + "epoch": 7.152365236523653, + "grad_norm": 0.901618242263794, + "learning_rate": 4.055516362898159e-05, + "loss": 0.0247, + "num_input_tokens_seen": 13719232, + "step": 65015 + }, + { + "epoch": 7.152915291529153, + "grad_norm": 0.4371436834335327, + "learning_rate": 4.05532846567812e-05, + "loss": 0.0728, + "num_input_tokens_seen": 13720288, + "step": 65020 + }, + { + "epoch": 7.153465346534653, + "grad_norm": 0.030460750684142113, + "learning_rate": 4.055140554123364e-05, + "loss": 0.0673, + "num_input_tokens_seen": 13721344, + "step": 65025 + }, + { + "epoch": 7.154015401540154, + "grad_norm": 1.0635145902633667, + "learning_rate": 4.054952628235625e-05, + "loss": 0.1211, + "num_input_tokens_seen": 13722400, + "step": 65030 + }, + { + "epoch": 7.1545654565456545, + "grad_norm": 0.05950377136468887, + "learning_rate": 4.054764688016633e-05, + "loss": 0.037, + "num_input_tokens_seen": 13723456, + "step": 65035 + }, + { + "epoch": 7.1551155115511555, + "grad_norm": 0.0462549589574337, + "learning_rate": 4.054576733468122e-05, + "loss": 0.0487, + "num_input_tokens_seen": 13724544, + "step": 65040 + }, + { + "epoch": 7.155665566556656, + "grad_norm": 0.4519183933734894, + "learning_rate": 4.054388764591822e-05, + "loss": 0.0541, + "num_input_tokens_seen": 13725600, + "step": 65045 + }, + { + "epoch": 7.156215621562156, + "grad_norm": 0.19047610461711884, + "learning_rate": 4.054200781389468e-05, + "loss": 0.0263, + "num_input_tokens_seen": 13726624, + "step": 65050 + }, + { + "epoch": 7.156765676567657, + "grad_norm": 0.06449022889137268, + "learning_rate": 4.05401278386279e-05, + "loss": 0.0216, + "num_input_tokens_seen": 13727680, + "step": 65055 + }, + { + "epoch": 7.157315731573157, + "grad_norm": 0.5030164122581482, + "learning_rate": 4.053824772013522e-05, + "loss": 0.0223, + "num_input_tokens_seen": 13728672, + "step": 65060 + }, + { + "epoch": 7.1578657865786575, + "grad_norm": 0.6673767566680908, + "learning_rate": 4.053636745843397e-05, + "loss": 0.0156, + "num_input_tokens_seen": 13729696, + "step": 65065 + }, + { + "epoch": 7.158415841584159, + "grad_norm": 0.03761296346783638, + "learning_rate": 4.053448705354147e-05, + "loss": 0.0088, + "num_input_tokens_seen": 13730720, + "step": 65070 + }, + { + "epoch": 7.158965896589659, + "grad_norm": 0.0688709244132042, + "learning_rate": 4.0532606505475065e-05, + "loss": 0.0251, + "num_input_tokens_seen": 13731840, + "step": 65075 + }, + { + "epoch": 7.15951595159516, + "grad_norm": 0.3655194342136383, + "learning_rate": 4.053072581425208e-05, + "loss": 0.0071, + "num_input_tokens_seen": 13732992, + "step": 65080 + }, + { + "epoch": 7.16006600660066, + "grad_norm": 0.0079403230920434, + "learning_rate": 4.052884497988983e-05, + "loss": 0.0082, + "num_input_tokens_seen": 13734048, + "step": 65085 + }, + { + "epoch": 7.16061606160616, + "grad_norm": 1.3423540592193604, + "learning_rate": 4.052696400240569e-05, + "loss": 0.0265, + "num_input_tokens_seen": 13735168, + "step": 65090 + }, + { + "epoch": 7.161166116611661, + "grad_norm": 0.07679152488708496, + "learning_rate": 4.052508288181696e-05, + "loss": 0.0681, + "num_input_tokens_seen": 13736224, + "step": 65095 + }, + { + "epoch": 7.161716171617162, + "grad_norm": 0.05765419825911522, + "learning_rate": 4.052320161814099e-05, + "loss": 0.083, + "num_input_tokens_seen": 13737248, + "step": 65100 + }, + { + "epoch": 7.162266226622663, + "grad_norm": 0.03794679418206215, + "learning_rate": 4.052132021139513e-05, + "loss": 0.0061, + "num_input_tokens_seen": 13738272, + "step": 65105 + }, + { + "epoch": 7.162816281628163, + "grad_norm": 1.30423903465271, + "learning_rate": 4.05194386615967e-05, + "loss": 0.0824, + "num_input_tokens_seen": 13739296, + "step": 65110 + }, + { + "epoch": 7.163366336633663, + "grad_norm": 0.2681594491004944, + "learning_rate": 4.051755696876306e-05, + "loss": 0.0909, + "num_input_tokens_seen": 13740352, + "step": 65115 + }, + { + "epoch": 7.163916391639164, + "grad_norm": 0.9376403093338013, + "learning_rate": 4.051567513291153e-05, + "loss": 0.0315, + "num_input_tokens_seen": 13741408, + "step": 65120 + }, + { + "epoch": 7.164466446644664, + "grad_norm": 0.22452744841575623, + "learning_rate": 4.0513793154059476e-05, + "loss": 0.0238, + "num_input_tokens_seen": 13742464, + "step": 65125 + }, + { + "epoch": 7.165016501650165, + "grad_norm": 1.0564852952957153, + "learning_rate": 4.0511911032224234e-05, + "loss": 0.1014, + "num_input_tokens_seen": 13743520, + "step": 65130 + }, + { + "epoch": 7.165566556655666, + "grad_norm": 0.018735678866505623, + "learning_rate": 4.051002876742316e-05, + "loss": 0.0612, + "num_input_tokens_seen": 13744512, + "step": 65135 + }, + { + "epoch": 7.166116611661166, + "grad_norm": 0.08576920628547668, + "learning_rate": 4.050814635967358e-05, + "loss": 0.038, + "num_input_tokens_seen": 13745568, + "step": 65140 + }, + { + "epoch": 7.166666666666667, + "grad_norm": 0.1558006852865219, + "learning_rate": 4.0506263808992854e-05, + "loss": 0.0653, + "num_input_tokens_seen": 13746592, + "step": 65145 + }, + { + "epoch": 7.167216721672167, + "grad_norm": 0.7807793021202087, + "learning_rate": 4.050438111539835e-05, + "loss": 0.0945, + "num_input_tokens_seen": 13747616, + "step": 65150 + }, + { + "epoch": 7.167766776677667, + "grad_norm": 0.1140761598944664, + "learning_rate": 4.05024982789074e-05, + "loss": 0.0662, + "num_input_tokens_seen": 13748704, + "step": 65155 + }, + { + "epoch": 7.1683168316831685, + "grad_norm": 0.36320963501930237, + "learning_rate": 4.0500615299537356e-05, + "loss": 0.0284, + "num_input_tokens_seen": 13749760, + "step": 65160 + }, + { + "epoch": 7.168866886688669, + "grad_norm": 0.07105319947004318, + "learning_rate": 4.049873217730558e-05, + "loss": 0.0053, + "num_input_tokens_seen": 13750880, + "step": 65165 + }, + { + "epoch": 7.16941694169417, + "grad_norm": 0.009518072940409184, + "learning_rate": 4.0496848912229425e-05, + "loss": 0.0584, + "num_input_tokens_seen": 13751936, + "step": 65170 + }, + { + "epoch": 7.16996699669967, + "grad_norm": 0.015032962895929813, + "learning_rate": 4.049496550432625e-05, + "loss": 0.0405, + "num_input_tokens_seen": 13753056, + "step": 65175 + }, + { + "epoch": 7.17051705170517, + "grad_norm": 0.03883422538638115, + "learning_rate": 4.0493081953613424e-05, + "loss": 0.0092, + "num_input_tokens_seen": 13754112, + "step": 65180 + }, + { + "epoch": 7.171067106710671, + "grad_norm": 0.015110557898879051, + "learning_rate": 4.0491198260108284e-05, + "loss": 0.0375, + "num_input_tokens_seen": 13755168, + "step": 65185 + }, + { + "epoch": 7.1716171617161715, + "grad_norm": 0.028936387971043587, + "learning_rate": 4.0489314423828206e-05, + "loss": 0.0458, + "num_input_tokens_seen": 13756192, + "step": 65190 + }, + { + "epoch": 7.172167216721673, + "grad_norm": 0.125836580991745, + "learning_rate": 4.048743044479055e-05, + "loss": 0.009, + "num_input_tokens_seen": 13757216, + "step": 65195 + }, + { + "epoch": 7.172717271727173, + "grad_norm": 0.037546511739492416, + "learning_rate": 4.0485546323012684e-05, + "loss": 0.0347, + "num_input_tokens_seen": 13758272, + "step": 65200 + }, + { + "epoch": 7.173267326732673, + "grad_norm": 2.5635249614715576, + "learning_rate": 4.048366205851196e-05, + "loss": 0.0972, + "num_input_tokens_seen": 13759328, + "step": 65205 + }, + { + "epoch": 7.173817381738174, + "grad_norm": 0.10274296998977661, + "learning_rate": 4.048177765130576e-05, + "loss": 0.0847, + "num_input_tokens_seen": 13760448, + "step": 65210 + }, + { + "epoch": 7.174367436743674, + "grad_norm": 0.5822851657867432, + "learning_rate": 4.047989310141144e-05, + "loss": 0.1131, + "num_input_tokens_seen": 13761504, + "step": 65215 + }, + { + "epoch": 7.174917491749175, + "grad_norm": 0.03789836913347244, + "learning_rate": 4.047800840884637e-05, + "loss": 0.0837, + "num_input_tokens_seen": 13762560, + "step": 65220 + }, + { + "epoch": 7.175467546754676, + "grad_norm": 0.015688784420490265, + "learning_rate": 4.047612357362793e-05, + "loss": 0.0499, + "num_input_tokens_seen": 13763648, + "step": 65225 + }, + { + "epoch": 7.176017601760176, + "grad_norm": 0.3899560272693634, + "learning_rate": 4.047423859577348e-05, + "loss": 0.015, + "num_input_tokens_seen": 13764736, + "step": 65230 + }, + { + "epoch": 7.176567656765677, + "grad_norm": 0.023116810247302055, + "learning_rate": 4.047235347530041e-05, + "loss": 0.0298, + "num_input_tokens_seen": 13765792, + "step": 65235 + }, + { + "epoch": 7.177117711771177, + "grad_norm": 0.3055586516857147, + "learning_rate": 4.047046821222607e-05, + "loss": 0.0233, + "num_input_tokens_seen": 13766784, + "step": 65240 + }, + { + "epoch": 7.177667766776677, + "grad_norm": 0.06528788805007935, + "learning_rate": 4.046858280656785e-05, + "loss": 0.0087, + "num_input_tokens_seen": 13767776, + "step": 65245 + }, + { + "epoch": 7.178217821782178, + "grad_norm": 0.31277045607566833, + "learning_rate": 4.0466697258343126e-05, + "loss": 0.0143, + "num_input_tokens_seen": 13768736, + "step": 65250 + }, + { + "epoch": 7.178767876787679, + "grad_norm": 0.015719356015324593, + "learning_rate": 4.0464811567569276e-05, + "loss": 0.0128, + "num_input_tokens_seen": 13769792, + "step": 65255 + }, + { + "epoch": 7.17931793179318, + "grad_norm": 0.08820172399282455, + "learning_rate": 4.046292573426368e-05, + "loss": 0.0234, + "num_input_tokens_seen": 13770880, + "step": 65260 + }, + { + "epoch": 7.17986798679868, + "grad_norm": 0.03181495890021324, + "learning_rate": 4.046103975844371e-05, + "loss": 0.0305, + "num_input_tokens_seen": 13772000, + "step": 65265 + }, + { + "epoch": 7.18041804180418, + "grad_norm": 0.0529186986386776, + "learning_rate": 4.045915364012677e-05, + "loss": 0.007, + "num_input_tokens_seen": 13773024, + "step": 65270 + }, + { + "epoch": 7.180968096809681, + "grad_norm": 0.017478400841355324, + "learning_rate": 4.045726737933021e-05, + "loss": 0.1247, + "num_input_tokens_seen": 13774112, + "step": 65275 + }, + { + "epoch": 7.181518151815181, + "grad_norm": 0.18211305141448975, + "learning_rate": 4.045538097607144e-05, + "loss": 0.0539, + "num_input_tokens_seen": 13775168, + "step": 65280 + }, + { + "epoch": 7.1820682068206825, + "grad_norm": 0.005376230459660292, + "learning_rate": 4.0453494430367844e-05, + "loss": 0.0108, + "num_input_tokens_seen": 13776224, + "step": 65285 + }, + { + "epoch": 7.182618261826183, + "grad_norm": 1.768554449081421, + "learning_rate": 4.045160774223681e-05, + "loss": 0.04, + "num_input_tokens_seen": 13777312, + "step": 65290 + }, + { + "epoch": 7.183168316831683, + "grad_norm": 0.5526648163795471, + "learning_rate": 4.044972091169572e-05, + "loss": 0.1552, + "num_input_tokens_seen": 13778368, + "step": 65295 + }, + { + "epoch": 7.183718371837184, + "grad_norm": 0.028191322460770607, + "learning_rate": 4.044783393876196e-05, + "loss": 0.0396, + "num_input_tokens_seen": 13779424, + "step": 65300 + }, + { + "epoch": 7.184268426842684, + "grad_norm": 0.025082001462578773, + "learning_rate": 4.044594682345292e-05, + "loss": 0.009, + "num_input_tokens_seen": 13780512, + "step": 65305 + }, + { + "epoch": 7.184818481848184, + "grad_norm": 0.7428873181343079, + "learning_rate": 4.0444059565786014e-05, + "loss": 0.0757, + "num_input_tokens_seen": 13781600, + "step": 65310 + }, + { + "epoch": 7.1853685368536855, + "grad_norm": 0.1569097936153412, + "learning_rate": 4.044217216577861e-05, + "loss": 0.0286, + "num_input_tokens_seen": 13782656, + "step": 65315 + }, + { + "epoch": 7.185918591859186, + "grad_norm": 0.020762892439961433, + "learning_rate": 4.0440284623448134e-05, + "loss": 0.0253, + "num_input_tokens_seen": 13783744, + "step": 65320 + }, + { + "epoch": 7.186468646864687, + "grad_norm": 0.3586273789405823, + "learning_rate": 4.0438396938811953e-05, + "loss": 0.023, + "num_input_tokens_seen": 13784800, + "step": 65325 + }, + { + "epoch": 7.187018701870187, + "grad_norm": 0.17100505530834198, + "learning_rate": 4.0436509111887465e-05, + "loss": 0.0682, + "num_input_tokens_seen": 13785824, + "step": 65330 + }, + { + "epoch": 7.187568756875687, + "grad_norm": 0.12354904413223267, + "learning_rate": 4.043462114269208e-05, + "loss": 0.0178, + "num_input_tokens_seen": 13786912, + "step": 65335 + }, + { + "epoch": 7.188118811881188, + "grad_norm": 0.04668431356549263, + "learning_rate": 4.043273303124322e-05, + "loss": 0.0118, + "num_input_tokens_seen": 13787904, + "step": 65340 + }, + { + "epoch": 7.1886688668866885, + "grad_norm": 0.5543931722640991, + "learning_rate": 4.0430844777558254e-05, + "loss": 0.0668, + "num_input_tokens_seen": 13788928, + "step": 65345 + }, + { + "epoch": 7.18921892189219, + "grad_norm": 0.08689849078655243, + "learning_rate": 4.042895638165459e-05, + "loss": 0.0552, + "num_input_tokens_seen": 13789952, + "step": 65350 + }, + { + "epoch": 7.18976897689769, + "grad_norm": 0.007147145923227072, + "learning_rate": 4.042706784354964e-05, + "loss": 0.0206, + "num_input_tokens_seen": 13791040, + "step": 65355 + }, + { + "epoch": 7.19031903190319, + "grad_norm": 0.11688871681690216, + "learning_rate": 4.0425179163260806e-05, + "loss": 0.0215, + "num_input_tokens_seen": 13792064, + "step": 65360 + }, + { + "epoch": 7.190869086908691, + "grad_norm": 1.1604551076889038, + "learning_rate": 4.042329034080551e-05, + "loss": 0.0373, + "num_input_tokens_seen": 13793024, + "step": 65365 + }, + { + "epoch": 7.191419141914191, + "grad_norm": 0.24118097126483917, + "learning_rate": 4.0421401376201134e-05, + "loss": 0.167, + "num_input_tokens_seen": 13794048, + "step": 65370 + }, + { + "epoch": 7.191969196919692, + "grad_norm": 0.022489117458462715, + "learning_rate": 4.041951226946511e-05, + "loss": 0.0186, + "num_input_tokens_seen": 13795072, + "step": 65375 + }, + { + "epoch": 7.192519251925193, + "grad_norm": 0.03572782129049301, + "learning_rate": 4.041762302061484e-05, + "loss": 0.014, + "num_input_tokens_seen": 13796128, + "step": 65380 + }, + { + "epoch": 7.193069306930693, + "grad_norm": 1.692406177520752, + "learning_rate": 4.041573362966773e-05, + "loss": 0.0644, + "num_input_tokens_seen": 13797216, + "step": 65385 + }, + { + "epoch": 7.193619361936194, + "grad_norm": 0.20327268540859222, + "learning_rate": 4.0413844096641204e-05, + "loss": 0.008, + "num_input_tokens_seen": 13798208, + "step": 65390 + }, + { + "epoch": 7.194169416941694, + "grad_norm": 1.2750283479690552, + "learning_rate": 4.041195442155268e-05, + "loss": 0.1184, + "num_input_tokens_seen": 13799200, + "step": 65395 + }, + { + "epoch": 7.194719471947194, + "grad_norm": 0.02028276026248932, + "learning_rate": 4.041006460441955e-05, + "loss": 0.0132, + "num_input_tokens_seen": 13800256, + "step": 65400 + }, + { + "epoch": 7.195269526952695, + "grad_norm": 0.021836524829268456, + "learning_rate": 4.040817464525927e-05, + "loss": 0.0275, + "num_input_tokens_seen": 13801312, + "step": 65405 + }, + { + "epoch": 7.195819581958196, + "grad_norm": 0.015230289660394192, + "learning_rate": 4.0406284544089224e-05, + "loss": 0.0121, + "num_input_tokens_seen": 13802368, + "step": 65410 + }, + { + "epoch": 7.196369636963697, + "grad_norm": 0.8374636769294739, + "learning_rate": 4.040439430092685e-05, + "loss": 0.0323, + "num_input_tokens_seen": 13803392, + "step": 65415 + }, + { + "epoch": 7.196919691969197, + "grad_norm": 0.7043392062187195, + "learning_rate": 4.0402503915789566e-05, + "loss": 0.133, + "num_input_tokens_seen": 13804448, + "step": 65420 + }, + { + "epoch": 7.197469746974697, + "grad_norm": 0.10819356143474579, + "learning_rate": 4.040061338869479e-05, + "loss": 0.0139, + "num_input_tokens_seen": 13805440, + "step": 65425 + }, + { + "epoch": 7.198019801980198, + "grad_norm": 0.05565492436289787, + "learning_rate": 4.039872271965995e-05, + "loss": 0.0252, + "num_input_tokens_seen": 13806560, + "step": 65430 + }, + { + "epoch": 7.198569856985698, + "grad_norm": 0.018804846331477165, + "learning_rate": 4.039683190870248e-05, + "loss": 0.0395, + "num_input_tokens_seen": 13807648, + "step": 65435 + }, + { + "epoch": 7.1991199119911995, + "grad_norm": 0.023373862728476524, + "learning_rate": 4.03949409558398e-05, + "loss": 0.0272, + "num_input_tokens_seen": 13808704, + "step": 65440 + }, + { + "epoch": 7.1996699669967, + "grad_norm": 0.061835743486881256, + "learning_rate": 4.039304986108933e-05, + "loss": 0.066, + "num_input_tokens_seen": 13809824, + "step": 65445 + }, + { + "epoch": 7.2002200220022, + "grad_norm": 0.16548366844654083, + "learning_rate": 4.03911586244685e-05, + "loss": 0.1218, + "num_input_tokens_seen": 13810880, + "step": 65450 + }, + { + "epoch": 7.200770077007701, + "grad_norm": 0.09373576194047928, + "learning_rate": 4.038926724599475e-05, + "loss": 0.194, + "num_input_tokens_seen": 13811936, + "step": 65455 + }, + { + "epoch": 7.201320132013201, + "grad_norm": 0.03702471777796745, + "learning_rate": 4.0387375725685514e-05, + "loss": 0.0648, + "num_input_tokens_seen": 13813024, + "step": 65460 + }, + { + "epoch": 7.201870187018702, + "grad_norm": 0.6737339496612549, + "learning_rate": 4.038548406355822e-05, + "loss": 0.0429, + "num_input_tokens_seen": 13814080, + "step": 65465 + }, + { + "epoch": 7.2024202420242025, + "grad_norm": 0.2725382447242737, + "learning_rate": 4.03835922596303e-05, + "loss": 0.0395, + "num_input_tokens_seen": 13815136, + "step": 65470 + }, + { + "epoch": 7.202970297029703, + "grad_norm": 0.498966246843338, + "learning_rate": 4.038170031391919e-05, + "loss": 0.096, + "num_input_tokens_seen": 13816224, + "step": 65475 + }, + { + "epoch": 7.203520352035204, + "grad_norm": 0.07053044438362122, + "learning_rate": 4.037980822644233e-05, + "loss": 0.0189, + "num_input_tokens_seen": 13817184, + "step": 65480 + }, + { + "epoch": 7.204070407040704, + "grad_norm": 0.01670747809112072, + "learning_rate": 4.0377915997217153e-05, + "loss": 0.0044, + "num_input_tokens_seen": 13818240, + "step": 65485 + }, + { + "epoch": 7.204620462046204, + "grad_norm": 0.22249969840049744, + "learning_rate": 4.037602362626111e-05, + "loss": 0.043, + "num_input_tokens_seen": 13819264, + "step": 65490 + }, + { + "epoch": 7.205170517051705, + "grad_norm": 0.3009086549282074, + "learning_rate": 4.0374131113591626e-05, + "loss": 0.044, + "num_input_tokens_seen": 13820320, + "step": 65495 + }, + { + "epoch": 7.2057205720572055, + "grad_norm": 0.4170851707458496, + "learning_rate": 4.0372238459226166e-05, + "loss": 0.0268, + "num_input_tokens_seen": 13821408, + "step": 65500 + }, + { + "epoch": 7.206270627062707, + "grad_norm": 0.014141865074634552, + "learning_rate": 4.0370345663182154e-05, + "loss": 0.0167, + "num_input_tokens_seen": 13822496, + "step": 65505 + }, + { + "epoch": 7.206820682068207, + "grad_norm": 0.03512227535247803, + "learning_rate": 4.0368452725477045e-05, + "loss": 0.0527, + "num_input_tokens_seen": 13823584, + "step": 65510 + }, + { + "epoch": 7.207370737073707, + "grad_norm": 0.0972728282213211, + "learning_rate": 4.0366559646128274e-05, + "loss": 0.0169, + "num_input_tokens_seen": 13824640, + "step": 65515 + }, + { + "epoch": 7.207920792079208, + "grad_norm": 1.0792760848999023, + "learning_rate": 4.03646664251533e-05, + "loss": 0.0721, + "num_input_tokens_seen": 13825728, + "step": 65520 + }, + { + "epoch": 7.208470847084708, + "grad_norm": 0.08540687710046768, + "learning_rate": 4.036277306256957e-05, + "loss": 0.0343, + "num_input_tokens_seen": 13826784, + "step": 65525 + }, + { + "epoch": 7.209020902090209, + "grad_norm": 0.050952620804309845, + "learning_rate": 4.0360879558394535e-05, + "loss": 0.0066, + "num_input_tokens_seen": 13827840, + "step": 65530 + }, + { + "epoch": 7.20957095709571, + "grad_norm": 0.11673122644424438, + "learning_rate": 4.0358985912645634e-05, + "loss": 0.1298, + "num_input_tokens_seen": 13828896, + "step": 65535 + }, + { + "epoch": 7.21012101210121, + "grad_norm": 0.08391152322292328, + "learning_rate": 4.0357092125340336e-05, + "loss": 0.0111, + "num_input_tokens_seen": 13829952, + "step": 65540 + }, + { + "epoch": 7.210671067106711, + "grad_norm": 0.06872396916151047, + "learning_rate": 4.035519819649609e-05, + "loss": 0.023, + "num_input_tokens_seen": 13831072, + "step": 65545 + }, + { + "epoch": 7.211221122112211, + "grad_norm": 0.17241570353507996, + "learning_rate": 4.035330412613035e-05, + "loss": 0.0095, + "num_input_tokens_seen": 13832160, + "step": 65550 + }, + { + "epoch": 7.211771177117711, + "grad_norm": 0.2988886833190918, + "learning_rate": 4.0351409914260576e-05, + "loss": 0.0432, + "num_input_tokens_seen": 13833248, + "step": 65555 + }, + { + "epoch": 7.212321232123212, + "grad_norm": 0.033634163439273834, + "learning_rate": 4.034951556090422e-05, + "loss": 0.0656, + "num_input_tokens_seen": 13834304, + "step": 65560 + }, + { + "epoch": 7.212871287128713, + "grad_norm": 0.07580036669969559, + "learning_rate": 4.034762106607874e-05, + "loss": 0.0207, + "num_input_tokens_seen": 13835392, + "step": 65565 + }, + { + "epoch": 7.213421342134214, + "grad_norm": 0.07675604522228241, + "learning_rate": 4.03457264298016e-05, + "loss": 0.0234, + "num_input_tokens_seen": 13836480, + "step": 65570 + }, + { + "epoch": 7.213971397139714, + "grad_norm": 0.6941676139831543, + "learning_rate": 4.034383165209027e-05, + "loss": 0.037, + "num_input_tokens_seen": 13837504, + "step": 65575 + }, + { + "epoch": 7.214521452145214, + "grad_norm": 0.013514776714146137, + "learning_rate": 4.0341936732962206e-05, + "loss": 0.026, + "num_input_tokens_seen": 13838560, + "step": 65580 + }, + { + "epoch": 7.215071507150715, + "grad_norm": 0.07136429846286774, + "learning_rate": 4.034004167243487e-05, + "loss": 0.0351, + "num_input_tokens_seen": 13839616, + "step": 65585 + }, + { + "epoch": 7.215621562156215, + "grad_norm": 0.0719350129365921, + "learning_rate": 4.033814647052574e-05, + "loss": 0.0045, + "num_input_tokens_seen": 13840640, + "step": 65590 + }, + { + "epoch": 7.2161716171617165, + "grad_norm": 0.18475443124771118, + "learning_rate": 4.033625112725226e-05, + "loss": 0.0051, + "num_input_tokens_seen": 13841696, + "step": 65595 + }, + { + "epoch": 7.216721672167217, + "grad_norm": 0.07903914898633957, + "learning_rate": 4.033435564263192e-05, + "loss": 0.0424, + "num_input_tokens_seen": 13842720, + "step": 65600 + }, + { + "epoch": 7.217271727172717, + "grad_norm": 0.07131826877593994, + "learning_rate": 4.033246001668218e-05, + "loss": 0.1002, + "num_input_tokens_seen": 13843808, + "step": 65605 + }, + { + "epoch": 7.217821782178218, + "grad_norm": 0.5821592211723328, + "learning_rate": 4.033056424942051e-05, + "loss": 0.0165, + "num_input_tokens_seen": 13844864, + "step": 65610 + }, + { + "epoch": 7.218371837183718, + "grad_norm": 0.39225584268569946, + "learning_rate": 4.032866834086439e-05, + "loss": 0.095, + "num_input_tokens_seen": 13845952, + "step": 65615 + }, + { + "epoch": 7.218921892189219, + "grad_norm": 0.6992904543876648, + "learning_rate": 4.032677229103129e-05, + "loss": 0.0228, + "num_input_tokens_seen": 13847008, + "step": 65620 + }, + { + "epoch": 7.2194719471947195, + "grad_norm": 0.016971949487924576, + "learning_rate": 4.032487609993869e-05, + "loss": 0.0789, + "num_input_tokens_seen": 13848064, + "step": 65625 + }, + { + "epoch": 7.22002200220022, + "grad_norm": 0.04714718833565712, + "learning_rate": 4.0322979767604055e-05, + "loss": 0.1536, + "num_input_tokens_seen": 13849056, + "step": 65630 + }, + { + "epoch": 7.220572057205721, + "grad_norm": 0.019881166517734528, + "learning_rate": 4.032108329404486e-05, + "loss": 0.0627, + "num_input_tokens_seen": 13850112, + "step": 65635 + }, + { + "epoch": 7.221122112211221, + "grad_norm": 0.3282223641872406, + "learning_rate": 4.03191866792786e-05, + "loss": 0.0308, + "num_input_tokens_seen": 13851168, + "step": 65640 + }, + { + "epoch": 7.221672167216722, + "grad_norm": 0.01009361632168293, + "learning_rate": 4.031728992332274e-05, + "loss": 0.0062, + "num_input_tokens_seen": 13852160, + "step": 65645 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 0.41584932804107666, + "learning_rate": 4.031539302619478e-05, + "loss": 0.1484, + "num_input_tokens_seen": 13853184, + "step": 65650 + }, + { + "epoch": 7.2227722772277225, + "grad_norm": 1.2178397178649902, + "learning_rate": 4.0313495987912184e-05, + "loss": 0.0743, + "num_input_tokens_seen": 13854304, + "step": 65655 + }, + { + "epoch": 7.223322332233224, + "grad_norm": 1.1228365898132324, + "learning_rate": 4.031159880849244e-05, + "loss": 0.0686, + "num_input_tokens_seen": 13855264, + "step": 65660 + }, + { + "epoch": 7.223872387238724, + "grad_norm": 1.35969078540802, + "learning_rate": 4.0309701487953056e-05, + "loss": 0.0761, + "num_input_tokens_seen": 13856256, + "step": 65665 + }, + { + "epoch": 7.224422442244224, + "grad_norm": 0.037517618387937546, + "learning_rate": 4.030780402631148e-05, + "loss": 0.0293, + "num_input_tokens_seen": 13857280, + "step": 65670 + }, + { + "epoch": 7.224972497249725, + "grad_norm": 0.052818357944488525, + "learning_rate": 4.030590642358523e-05, + "loss": 0.0098, + "num_input_tokens_seen": 13858336, + "step": 65675 + }, + { + "epoch": 7.225522552255225, + "grad_norm": 0.07805755734443665, + "learning_rate": 4.030400867979178e-05, + "loss": 0.0177, + "num_input_tokens_seen": 13859424, + "step": 65680 + }, + { + "epoch": 7.226072607260726, + "grad_norm": 0.3233828842639923, + "learning_rate": 4.030211079494863e-05, + "loss": 0.038, + "num_input_tokens_seen": 13860512, + "step": 65685 + }, + { + "epoch": 7.226622662266227, + "grad_norm": 0.7477223873138428, + "learning_rate": 4.0300212769073265e-05, + "loss": 0.0288, + "num_input_tokens_seen": 13861536, + "step": 65690 + }, + { + "epoch": 7.227172717271727, + "grad_norm": 0.7200679183006287, + "learning_rate": 4.029831460218317e-05, + "loss": 0.072, + "num_input_tokens_seen": 13862592, + "step": 65695 + }, + { + "epoch": 7.227722772277228, + "grad_norm": 0.02416899800300598, + "learning_rate": 4.029641629429587e-05, + "loss": 0.0059, + "num_input_tokens_seen": 13863616, + "step": 65700 + }, + { + "epoch": 7.228272827282728, + "grad_norm": 0.47537341713905334, + "learning_rate": 4.029451784542883e-05, + "loss": 0.0317, + "num_input_tokens_seen": 13864736, + "step": 65705 + }, + { + "epoch": 7.228822882288229, + "grad_norm": 0.17980782687664032, + "learning_rate": 4.0292619255599564e-05, + "loss": 0.0253, + "num_input_tokens_seen": 13865792, + "step": 65710 + }, + { + "epoch": 7.229372937293729, + "grad_norm": 0.023893065750598907, + "learning_rate": 4.0290720524825556e-05, + "loss": 0.0062, + "num_input_tokens_seen": 13866848, + "step": 65715 + }, + { + "epoch": 7.22992299229923, + "grad_norm": 0.19549928605556488, + "learning_rate": 4.028882165312432e-05, + "loss": 0.0171, + "num_input_tokens_seen": 13867872, + "step": 65720 + }, + { + "epoch": 7.230473047304731, + "grad_norm": 0.7977517247200012, + "learning_rate": 4.028692264051335e-05, + "loss": 0.0593, + "num_input_tokens_seen": 13868992, + "step": 65725 + }, + { + "epoch": 7.231023102310231, + "grad_norm": 0.005280773155391216, + "learning_rate": 4.0285023487010146e-05, + "loss": 0.0205, + "num_input_tokens_seen": 13870080, + "step": 65730 + }, + { + "epoch": 7.231573157315731, + "grad_norm": 0.015717074275016785, + "learning_rate": 4.028312419263222e-05, + "loss": 0.0079, + "num_input_tokens_seen": 13871200, + "step": 65735 + }, + { + "epoch": 7.232123212321232, + "grad_norm": 1.1414756774902344, + "learning_rate": 4.028122475739706e-05, + "loss": 0.0956, + "num_input_tokens_seen": 13872288, + "step": 65740 + }, + { + "epoch": 7.232673267326732, + "grad_norm": 0.6504176259040833, + "learning_rate": 4.02793251813222e-05, + "loss": 0.1421, + "num_input_tokens_seen": 13873344, + "step": 65745 + }, + { + "epoch": 7.2332233223322335, + "grad_norm": 0.011386221274733543, + "learning_rate": 4.0277425464425126e-05, + "loss": 0.0016, + "num_input_tokens_seen": 13874400, + "step": 65750 + }, + { + "epoch": 7.233773377337734, + "grad_norm": 0.7288344502449036, + "learning_rate": 4.027552560672335e-05, + "loss": 0.0194, + "num_input_tokens_seen": 13875488, + "step": 65755 + }, + { + "epoch": 7.234323432343234, + "grad_norm": 0.006942219566553831, + "learning_rate": 4.0273625608234385e-05, + "loss": 0.111, + "num_input_tokens_seen": 13876544, + "step": 65760 + }, + { + "epoch": 7.234873487348735, + "grad_norm": 0.5747019648551941, + "learning_rate": 4.027172546897574e-05, + "loss": 0.064, + "num_input_tokens_seen": 13877568, + "step": 65765 + }, + { + "epoch": 7.235423542354235, + "grad_norm": 0.08483809977769852, + "learning_rate": 4.026982518896494e-05, + "loss": 0.006, + "num_input_tokens_seen": 13878656, + "step": 65770 + }, + { + "epoch": 7.235973597359736, + "grad_norm": 0.12047575414180756, + "learning_rate": 4.026792476821948e-05, + "loss": 0.0856, + "num_input_tokens_seen": 13879776, + "step": 65775 + }, + { + "epoch": 7.2365236523652365, + "grad_norm": 0.021343132480978966, + "learning_rate": 4.026602420675688e-05, + "loss": 0.0097, + "num_input_tokens_seen": 13880832, + "step": 65780 + }, + { + "epoch": 7.237073707370737, + "grad_norm": 0.049737025052309036, + "learning_rate": 4.026412350459466e-05, + "loss": 0.0879, + "num_input_tokens_seen": 13881952, + "step": 65785 + }, + { + "epoch": 7.237623762376238, + "grad_norm": 0.04561996832489967, + "learning_rate": 4.026222266175034e-05, + "loss": 0.0087, + "num_input_tokens_seen": 13883040, + "step": 65790 + }, + { + "epoch": 7.238173817381738, + "grad_norm": 0.036980148404836655, + "learning_rate": 4.026032167824144e-05, + "loss": 0.087, + "num_input_tokens_seen": 13884032, + "step": 65795 + }, + { + "epoch": 7.238723872387239, + "grad_norm": 0.013326961547136307, + "learning_rate": 4.025842055408548e-05, + "loss": 0.0393, + "num_input_tokens_seen": 13885056, + "step": 65800 + }, + { + "epoch": 7.239273927392739, + "grad_norm": 0.2437317818403244, + "learning_rate": 4.025651928929998e-05, + "loss": 0.0249, + "num_input_tokens_seen": 13886208, + "step": 65805 + }, + { + "epoch": 7.2398239823982395, + "grad_norm": 0.35022521018981934, + "learning_rate": 4.0254617883902454e-05, + "loss": 0.0272, + "num_input_tokens_seen": 13887200, + "step": 65810 + }, + { + "epoch": 7.240374037403741, + "grad_norm": 0.38884642720222473, + "learning_rate": 4.0252716337910444e-05, + "loss": 0.052, + "num_input_tokens_seen": 13888224, + "step": 65815 + }, + { + "epoch": 7.240924092409241, + "grad_norm": 0.4014183282852173, + "learning_rate": 4.0250814651341464e-05, + "loss": 0.0272, + "num_input_tokens_seen": 13889312, + "step": 65820 + }, + { + "epoch": 7.241474147414741, + "grad_norm": 0.15861119329929352, + "learning_rate": 4.024891282421304e-05, + "loss": 0.0305, + "num_input_tokens_seen": 13890272, + "step": 65825 + }, + { + "epoch": 7.242024202420242, + "grad_norm": 0.23348230123519897, + "learning_rate": 4.02470108565427e-05, + "loss": 0.0239, + "num_input_tokens_seen": 13891424, + "step": 65830 + }, + { + "epoch": 7.242574257425742, + "grad_norm": 0.03191269189119339, + "learning_rate": 4.0245108748347985e-05, + "loss": 0.0623, + "num_input_tokens_seen": 13892512, + "step": 65835 + }, + { + "epoch": 7.243124312431243, + "grad_norm": 0.013362924568355083, + "learning_rate": 4.024320649964642e-05, + "loss": 0.0019, + "num_input_tokens_seen": 13893600, + "step": 65840 + }, + { + "epoch": 7.243674367436744, + "grad_norm": 0.010015751235187054, + "learning_rate": 4.0241304110455524e-05, + "loss": 0.0166, + "num_input_tokens_seen": 13894720, + "step": 65845 + }, + { + "epoch": 7.244224422442244, + "grad_norm": 0.10684258490800858, + "learning_rate": 4.023940158079285e-05, + "loss": 0.0457, + "num_input_tokens_seen": 13895840, + "step": 65850 + }, + { + "epoch": 7.244774477447745, + "grad_norm": 0.0345790721476078, + "learning_rate": 4.023749891067592e-05, + "loss": 0.0299, + "num_input_tokens_seen": 13896896, + "step": 65855 + }, + { + "epoch": 7.245324532453245, + "grad_norm": 0.4020603597164154, + "learning_rate": 4.023559610012228e-05, + "loss": 0.0795, + "num_input_tokens_seen": 13897952, + "step": 65860 + }, + { + "epoch": 7.245874587458746, + "grad_norm": 0.07877447456121445, + "learning_rate": 4.023369314914946e-05, + "loss": 0.0111, + "num_input_tokens_seen": 13899008, + "step": 65865 + }, + { + "epoch": 7.2464246424642464, + "grad_norm": 0.3141859471797943, + "learning_rate": 4.0231790057774995e-05, + "loss": 0.0134, + "num_input_tokens_seen": 13900032, + "step": 65870 + }, + { + "epoch": 7.246974697469747, + "grad_norm": 1.2174979448318481, + "learning_rate": 4.022988682601644e-05, + "loss": 0.2226, + "num_input_tokens_seen": 13901088, + "step": 65875 + }, + { + "epoch": 7.247524752475248, + "grad_norm": 0.020409071817994118, + "learning_rate": 4.0227983453891314e-05, + "loss": 0.0488, + "num_input_tokens_seen": 13902176, + "step": 65880 + }, + { + "epoch": 7.248074807480748, + "grad_norm": 0.7290687561035156, + "learning_rate": 4.022607994141718e-05, + "loss": 0.067, + "num_input_tokens_seen": 13903232, + "step": 65885 + }, + { + "epoch": 7.248624862486249, + "grad_norm": 0.04643254727125168, + "learning_rate": 4.022417628861157e-05, + "loss": 0.0295, + "num_input_tokens_seen": 13904256, + "step": 65890 + }, + { + "epoch": 7.249174917491749, + "grad_norm": 0.06430301070213318, + "learning_rate": 4.022227249549203e-05, + "loss": 0.0518, + "num_input_tokens_seen": 13905312, + "step": 65895 + }, + { + "epoch": 7.2497249724972495, + "grad_norm": 0.8565894961357117, + "learning_rate": 4.022036856207611e-05, + "loss": 0.0281, + "num_input_tokens_seen": 13906400, + "step": 65900 + }, + { + "epoch": 7.2502750275027505, + "grad_norm": 0.025072462856769562, + "learning_rate": 4.021846448838136e-05, + "loss": 0.0653, + "num_input_tokens_seen": 13907392, + "step": 65905 + }, + { + "epoch": 7.250825082508251, + "grad_norm": 0.23747090995311737, + "learning_rate": 4.021656027442533e-05, + "loss": 0.0643, + "num_input_tokens_seen": 13908416, + "step": 65910 + }, + { + "epoch": 7.251375137513751, + "grad_norm": 0.03425115346908569, + "learning_rate": 4.0214655920225555e-05, + "loss": 0.0313, + "num_input_tokens_seen": 13909536, + "step": 65915 + }, + { + "epoch": 7.251925192519252, + "grad_norm": 0.25032371282577515, + "learning_rate": 4.02127514257996e-05, + "loss": 0.0427, + "num_input_tokens_seen": 13910560, + "step": 65920 + }, + { + "epoch": 7.252475247524752, + "grad_norm": 0.008035617880523205, + "learning_rate": 4.0210846791165016e-05, + "loss": 0.0673, + "num_input_tokens_seen": 13911616, + "step": 65925 + }, + { + "epoch": 7.253025302530253, + "grad_norm": 0.16431665420532227, + "learning_rate": 4.020894201633935e-05, + "loss": 0.0384, + "num_input_tokens_seen": 13912672, + "step": 65930 + }, + { + "epoch": 7.2535753575357536, + "grad_norm": 0.5769645571708679, + "learning_rate": 4.020703710134017e-05, + "loss": 0.0391, + "num_input_tokens_seen": 13913728, + "step": 65935 + }, + { + "epoch": 7.254125412541254, + "grad_norm": 0.019536452367901802, + "learning_rate": 4.020513204618503e-05, + "loss": 0.0264, + "num_input_tokens_seen": 13914752, + "step": 65940 + }, + { + "epoch": 7.254675467546755, + "grad_norm": 0.029858481138944626, + "learning_rate": 4.020322685089147e-05, + "loss": 0.0265, + "num_input_tokens_seen": 13915808, + "step": 65945 + }, + { + "epoch": 7.255225522552255, + "grad_norm": 0.08298149704933167, + "learning_rate": 4.0201321515477074e-05, + "loss": 0.0167, + "num_input_tokens_seen": 13916832, + "step": 65950 + }, + { + "epoch": 7.255775577557756, + "grad_norm": 0.1544632613658905, + "learning_rate": 4.019941603995939e-05, + "loss": 0.0668, + "num_input_tokens_seen": 13917952, + "step": 65955 + }, + { + "epoch": 7.256325632563256, + "grad_norm": 0.09539206326007843, + "learning_rate": 4.019751042435599e-05, + "loss": 0.0347, + "num_input_tokens_seen": 13919008, + "step": 65960 + }, + { + "epoch": 7.256875687568757, + "grad_norm": 0.06457312405109406, + "learning_rate": 4.019560466868442e-05, + "loss": 0.008, + "num_input_tokens_seen": 13920064, + "step": 65965 + }, + { + "epoch": 7.257425742574258, + "grad_norm": 0.028266385197639465, + "learning_rate": 4.019369877296225e-05, + "loss": 0.0612, + "num_input_tokens_seen": 13921152, + "step": 65970 + }, + { + "epoch": 7.257975797579758, + "grad_norm": 0.10637502372264862, + "learning_rate": 4.019179273720706e-05, + "loss": 0.0126, + "num_input_tokens_seen": 13922176, + "step": 65975 + }, + { + "epoch": 7.258525852585258, + "grad_norm": 0.6667693257331848, + "learning_rate": 4.01898865614364e-05, + "loss": 0.278, + "num_input_tokens_seen": 13923168, + "step": 65980 + }, + { + "epoch": 7.259075907590759, + "grad_norm": 1.038470983505249, + "learning_rate": 4.0187980245667846e-05, + "loss": 0.05, + "num_input_tokens_seen": 13924128, + "step": 65985 + }, + { + "epoch": 7.259625962596259, + "grad_norm": 0.045979950577020645, + "learning_rate": 4.018607378991896e-05, + "loss": 0.0349, + "num_input_tokens_seen": 13925216, + "step": 65990 + }, + { + "epoch": 7.2601760176017605, + "grad_norm": 0.0856911912560463, + "learning_rate": 4.0184167194207325e-05, + "loss": 0.0185, + "num_input_tokens_seen": 13926208, + "step": 65995 + }, + { + "epoch": 7.260726072607261, + "grad_norm": 0.010712169110774994, + "learning_rate": 4.01822604585505e-05, + "loss": 0.024, + "num_input_tokens_seen": 13927232, + "step": 66000 + }, + { + "epoch": 7.261276127612761, + "grad_norm": 0.24053357541561127, + "learning_rate": 4.018035358296607e-05, + "loss": 0.0082, + "num_input_tokens_seen": 13928320, + "step": 66005 + }, + { + "epoch": 7.261826182618262, + "grad_norm": 0.18727752566337585, + "learning_rate": 4.017844656747161e-05, + "loss": 0.0176, + "num_input_tokens_seen": 13929376, + "step": 66010 + }, + { + "epoch": 7.262376237623762, + "grad_norm": 0.03892593830823898, + "learning_rate": 4.0176539412084684e-05, + "loss": 0.0641, + "num_input_tokens_seen": 13930368, + "step": 66015 + }, + { + "epoch": 7.262926292629263, + "grad_norm": 0.04390240088105202, + "learning_rate": 4.017463211682288e-05, + "loss": 0.0197, + "num_input_tokens_seen": 13931392, + "step": 66020 + }, + { + "epoch": 7.2634763476347635, + "grad_norm": 0.09579335898160934, + "learning_rate": 4.017272468170377e-05, + "loss": 0.0091, + "num_input_tokens_seen": 13932448, + "step": 66025 + }, + { + "epoch": 7.264026402640264, + "grad_norm": 0.027923479676246643, + "learning_rate": 4.017081710674494e-05, + "loss": 0.0045, + "num_input_tokens_seen": 13933536, + "step": 66030 + }, + { + "epoch": 7.264576457645765, + "grad_norm": 0.8056132197380066, + "learning_rate": 4.0168909391963973e-05, + "loss": 0.0133, + "num_input_tokens_seen": 13934592, + "step": 66035 + }, + { + "epoch": 7.265126512651265, + "grad_norm": 0.45427095890045166, + "learning_rate": 4.0167001537378443e-05, + "loss": 0.026, + "num_input_tokens_seen": 13935680, + "step": 66040 + }, + { + "epoch": 7.265676567656766, + "grad_norm": 1.180701732635498, + "learning_rate": 4.016509354300594e-05, + "loss": 0.2335, + "num_input_tokens_seen": 13936768, + "step": 66045 + }, + { + "epoch": 7.266226622662266, + "grad_norm": 0.037970323115587234, + "learning_rate": 4.016318540886404e-05, + "loss": 0.0032, + "num_input_tokens_seen": 13937824, + "step": 66050 + }, + { + "epoch": 7.2667766776677665, + "grad_norm": 0.808879017829895, + "learning_rate": 4.0161277134970345e-05, + "loss": 0.0551, + "num_input_tokens_seen": 13938848, + "step": 66055 + }, + { + "epoch": 7.267326732673268, + "grad_norm": 0.016940219327807426, + "learning_rate": 4.015936872134242e-05, + "loss": 0.0028, + "num_input_tokens_seen": 13939936, + "step": 66060 + }, + { + "epoch": 7.267876787678768, + "grad_norm": 0.5585882067680359, + "learning_rate": 4.015746016799788e-05, + "loss": 0.0573, + "num_input_tokens_seen": 13940992, + "step": 66065 + }, + { + "epoch": 7.268426842684269, + "grad_norm": 0.06172242760658264, + "learning_rate": 4.0155551474954296e-05, + "loss": 0.1309, + "num_input_tokens_seen": 13941984, + "step": 66070 + }, + { + "epoch": 7.268976897689769, + "grad_norm": 0.8520759344100952, + "learning_rate": 4.0153642642229274e-05, + "loss": 0.0443, + "num_input_tokens_seen": 13943072, + "step": 66075 + }, + { + "epoch": 7.269526952695269, + "grad_norm": 0.61328125, + "learning_rate": 4.01517336698404e-05, + "loss": 0.0291, + "num_input_tokens_seen": 13944160, + "step": 66080 + }, + { + "epoch": 7.27007700770077, + "grad_norm": 0.03781716153025627, + "learning_rate": 4.014982455780525e-05, + "loss": 0.0987, + "num_input_tokens_seen": 13945216, + "step": 66085 + }, + { + "epoch": 7.270627062706271, + "grad_norm": 0.047960035502910614, + "learning_rate": 4.014791530614145e-05, + "loss": 0.0438, + "num_input_tokens_seen": 13946272, + "step": 66090 + }, + { + "epoch": 7.271177117711771, + "grad_norm": 0.02923273667693138, + "learning_rate": 4.014600591486658e-05, + "loss": 0.0124, + "num_input_tokens_seen": 13947296, + "step": 66095 + }, + { + "epoch": 7.271727172717272, + "grad_norm": 0.05220974609255791, + "learning_rate": 4.0144096383998256e-05, + "loss": 0.0689, + "num_input_tokens_seen": 13948352, + "step": 66100 + }, + { + "epoch": 7.272277227722772, + "grad_norm": 0.03283588960766792, + "learning_rate": 4.0142186713554045e-05, + "loss": 0.0098, + "num_input_tokens_seen": 13949440, + "step": 66105 + }, + { + "epoch": 7.272827282728273, + "grad_norm": 0.04427117854356766, + "learning_rate": 4.0140276903551566e-05, + "loss": 0.0724, + "num_input_tokens_seen": 13950528, + "step": 66110 + }, + { + "epoch": 7.273377337733773, + "grad_norm": 0.5496826767921448, + "learning_rate": 4.013836695400842e-05, + "loss": 0.0688, + "num_input_tokens_seen": 13951520, + "step": 66115 + }, + { + "epoch": 7.273927392739274, + "grad_norm": 1.0693273544311523, + "learning_rate": 4.0136456864942216e-05, + "loss": 0.0392, + "num_input_tokens_seen": 13952608, + "step": 66120 + }, + { + "epoch": 7.274477447744775, + "grad_norm": 0.03792034834623337, + "learning_rate": 4.0134546636370553e-05, + "loss": 0.0209, + "num_input_tokens_seen": 13953696, + "step": 66125 + }, + { + "epoch": 7.275027502750275, + "grad_norm": 0.1115712821483612, + "learning_rate": 4.013263626831103e-05, + "loss": 0.0234, + "num_input_tokens_seen": 13954688, + "step": 66130 + }, + { + "epoch": 7.275577557755776, + "grad_norm": 0.07184954732656479, + "learning_rate": 4.013072576078126e-05, + "loss": 0.0369, + "num_input_tokens_seen": 13955776, + "step": 66135 + }, + { + "epoch": 7.276127612761276, + "grad_norm": 0.030454576015472412, + "learning_rate": 4.012881511379886e-05, + "loss": 0.0527, + "num_input_tokens_seen": 13956832, + "step": 66140 + }, + { + "epoch": 7.276677667766776, + "grad_norm": 0.06133222579956055, + "learning_rate": 4.012690432738142e-05, + "loss": 0.0426, + "num_input_tokens_seen": 13957888, + "step": 66145 + }, + { + "epoch": 7.2772277227722775, + "grad_norm": 0.1501973420381546, + "learning_rate": 4.012499340154656e-05, + "loss": 0.0989, + "num_input_tokens_seen": 13958912, + "step": 66150 + }, + { + "epoch": 7.277777777777778, + "grad_norm": 0.06258763372898102, + "learning_rate": 4.0123082336311904e-05, + "loss": 0.0127, + "num_input_tokens_seen": 13959936, + "step": 66155 + }, + { + "epoch": 7.278327832783278, + "grad_norm": 0.06832689046859741, + "learning_rate": 4.012117113169505e-05, + "loss": 0.0252, + "num_input_tokens_seen": 13960960, + "step": 66160 + }, + { + "epoch": 7.278877887788779, + "grad_norm": 0.027911748737096786, + "learning_rate": 4.011925978771361e-05, + "loss": 0.0399, + "num_input_tokens_seen": 13962080, + "step": 66165 + }, + { + "epoch": 7.279427942794279, + "grad_norm": 0.20204436779022217, + "learning_rate": 4.011734830438522e-05, + "loss": 0.0188, + "num_input_tokens_seen": 13963136, + "step": 66170 + }, + { + "epoch": 7.27997799779978, + "grad_norm": 0.43355467915534973, + "learning_rate": 4.011543668172748e-05, + "loss": 0.0331, + "num_input_tokens_seen": 13964192, + "step": 66175 + }, + { + "epoch": 7.2805280528052805, + "grad_norm": 1.0267549753189087, + "learning_rate": 4.011352491975802e-05, + "loss": 0.1425, + "num_input_tokens_seen": 13965248, + "step": 66180 + }, + { + "epoch": 7.281078107810781, + "grad_norm": 0.36758679151535034, + "learning_rate": 4.0111613018494435e-05, + "loss": 0.0159, + "num_input_tokens_seen": 13966368, + "step": 66185 + }, + { + "epoch": 7.281628162816282, + "grad_norm": 1.1521275043487549, + "learning_rate": 4.010970097795438e-05, + "loss": 0.0163, + "num_input_tokens_seen": 13967424, + "step": 66190 + }, + { + "epoch": 7.282178217821782, + "grad_norm": 0.6148930191993713, + "learning_rate": 4.0107788798155456e-05, + "loss": 0.0864, + "num_input_tokens_seen": 13968512, + "step": 66195 + }, + { + "epoch": 7.282728272827283, + "grad_norm": 0.475879967212677, + "learning_rate": 4.010587647911529e-05, + "loss": 0.013, + "num_input_tokens_seen": 13969600, + "step": 66200 + }, + { + "epoch": 7.283278327832783, + "grad_norm": 0.10450369864702225, + "learning_rate": 4.0103964020851514e-05, + "loss": 0.0145, + "num_input_tokens_seen": 13970720, + "step": 66205 + }, + { + "epoch": 7.2838283828382835, + "grad_norm": 0.1835480034351349, + "learning_rate": 4.010205142338175e-05, + "loss": 0.0417, + "num_input_tokens_seen": 13971712, + "step": 66210 + }, + { + "epoch": 7.284378437843785, + "grad_norm": 0.0587519146502018, + "learning_rate": 4.010013868672362e-05, + "loss": 0.0227, + "num_input_tokens_seen": 13972768, + "step": 66215 + }, + { + "epoch": 7.284928492849285, + "grad_norm": 0.05286993086338043, + "learning_rate": 4.009822581089475e-05, + "loss": 0.0735, + "num_input_tokens_seen": 13973792, + "step": 66220 + }, + { + "epoch": 7.285478547854786, + "grad_norm": 0.0057862079702317715, + "learning_rate": 4.009631279591279e-05, + "loss": 0.0707, + "num_input_tokens_seen": 13974912, + "step": 66225 + }, + { + "epoch": 7.286028602860286, + "grad_norm": 0.7940106987953186, + "learning_rate": 4.009439964179536e-05, + "loss": 0.059, + "num_input_tokens_seen": 13976032, + "step": 66230 + }, + { + "epoch": 7.286578657865786, + "grad_norm": 0.030842065811157227, + "learning_rate": 4.009248634856008e-05, + "loss": 0.0226, + "num_input_tokens_seen": 13977024, + "step": 66235 + }, + { + "epoch": 7.287128712871287, + "grad_norm": 0.01070003304630518, + "learning_rate": 4.00905729162246e-05, + "loss": 0.0536, + "num_input_tokens_seen": 13978112, + "step": 66240 + }, + { + "epoch": 7.287678767876788, + "grad_norm": 0.11585547029972076, + "learning_rate": 4.0088659344806556e-05, + "loss": 0.0061, + "num_input_tokens_seen": 13979168, + "step": 66245 + }, + { + "epoch": 7.288228822882289, + "grad_norm": 0.046304360032081604, + "learning_rate": 4.0086745634323574e-05, + "loss": 0.0117, + "num_input_tokens_seen": 13980192, + "step": 66250 + }, + { + "epoch": 7.288778877887789, + "grad_norm": 0.03267737478017807, + "learning_rate": 4.00848317847933e-05, + "loss": 0.0407, + "num_input_tokens_seen": 13981280, + "step": 66255 + }, + { + "epoch": 7.289328932893289, + "grad_norm": 0.03980856388807297, + "learning_rate": 4.008291779623337e-05, + "loss": 0.0052, + "num_input_tokens_seen": 13982336, + "step": 66260 + }, + { + "epoch": 7.28987898789879, + "grad_norm": 0.08116069436073303, + "learning_rate": 4.008100366866142e-05, + "loss": 0.0808, + "num_input_tokens_seen": 13983392, + "step": 66265 + }, + { + "epoch": 7.29042904290429, + "grad_norm": 0.07456057518720627, + "learning_rate": 4.00790894020951e-05, + "loss": 0.0378, + "num_input_tokens_seen": 13984480, + "step": 66270 + }, + { + "epoch": 7.290979097909791, + "grad_norm": 0.39844244718551636, + "learning_rate": 4.007717499655205e-05, + "loss": 0.0369, + "num_input_tokens_seen": 13985568, + "step": 66275 + }, + { + "epoch": 7.291529152915292, + "grad_norm": 0.3125433027744293, + "learning_rate": 4.0075260452049906e-05, + "loss": 0.0208, + "num_input_tokens_seen": 13986720, + "step": 66280 + }, + { + "epoch": 7.292079207920792, + "grad_norm": 0.06525541841983795, + "learning_rate": 4.007334576860633e-05, + "loss": 0.0177, + "num_input_tokens_seen": 13987872, + "step": 66285 + }, + { + "epoch": 7.292629262926293, + "grad_norm": 0.05076486989855766, + "learning_rate": 4.007143094623894e-05, + "loss": 0.0254, + "num_input_tokens_seen": 13988928, + "step": 66290 + }, + { + "epoch": 7.293179317931793, + "grad_norm": 0.5019972920417786, + "learning_rate": 4.006951598496542e-05, + "loss": 0.0326, + "num_input_tokens_seen": 13989984, + "step": 66295 + }, + { + "epoch": 7.293729372937293, + "grad_norm": 0.2205052673816681, + "learning_rate": 4.00676008848034e-05, + "loss": 0.0585, + "num_input_tokens_seen": 13991104, + "step": 66300 + }, + { + "epoch": 7.2942794279427945, + "grad_norm": 0.030410433188080788, + "learning_rate": 4.006568564577054e-05, + "loss": 0.1065, + "num_input_tokens_seen": 13992160, + "step": 66305 + }, + { + "epoch": 7.294829482948295, + "grad_norm": 0.12389658391475677, + "learning_rate": 4.006377026788447e-05, + "loss": 0.0242, + "num_input_tokens_seen": 13993248, + "step": 66310 + }, + { + "epoch": 7.295379537953796, + "grad_norm": 0.07849203795194626, + "learning_rate": 4.006185475116286e-05, + "loss": 0.0187, + "num_input_tokens_seen": 13994304, + "step": 66315 + }, + { + "epoch": 7.295929592959296, + "grad_norm": 0.4345003664493561, + "learning_rate": 4.0059939095623365e-05, + "loss": 0.0182, + "num_input_tokens_seen": 13995360, + "step": 66320 + }, + { + "epoch": 7.296479647964796, + "grad_norm": 0.1820424497127533, + "learning_rate": 4.0058023301283634e-05, + "loss": 0.027, + "num_input_tokens_seen": 13996416, + "step": 66325 + }, + { + "epoch": 7.297029702970297, + "grad_norm": 0.172308087348938, + "learning_rate": 4.0056107368161336e-05, + "loss": 0.013, + "num_input_tokens_seen": 13997472, + "step": 66330 + }, + { + "epoch": 7.2975797579757975, + "grad_norm": 0.08661990612745285, + "learning_rate": 4.0054191296274116e-05, + "loss": 0.0468, + "num_input_tokens_seen": 13998560, + "step": 66335 + }, + { + "epoch": 7.298129812981298, + "grad_norm": 0.030576417222619057, + "learning_rate": 4.005227508563964e-05, + "loss": 0.0032, + "num_input_tokens_seen": 13999648, + "step": 66340 + }, + { + "epoch": 7.298679867986799, + "grad_norm": 0.1686612218618393, + "learning_rate": 4.0050358736275567e-05, + "loss": 0.0064, + "num_input_tokens_seen": 14000608, + "step": 66345 + }, + { + "epoch": 7.299229922992299, + "grad_norm": 0.03340477868914604, + "learning_rate": 4.0048442248199555e-05, + "loss": 0.0153, + "num_input_tokens_seen": 14001696, + "step": 66350 + }, + { + "epoch": 7.2997799779978, + "grad_norm": 0.030117465183138847, + "learning_rate": 4.004652562142928e-05, + "loss": 0.0619, + "num_input_tokens_seen": 14002816, + "step": 66355 + }, + { + "epoch": 7.3003300330033, + "grad_norm": 0.22851800918579102, + "learning_rate": 4.0044608855982394e-05, + "loss": 0.1149, + "num_input_tokens_seen": 14003872, + "step": 66360 + }, + { + "epoch": 7.3008800880088005, + "grad_norm": 0.05186235159635544, + "learning_rate": 4.004269195187657e-05, + "loss": 0.1518, + "num_input_tokens_seen": 14004896, + "step": 66365 + }, + { + "epoch": 7.301430143014302, + "grad_norm": 0.09293117374181747, + "learning_rate": 4.004077490912946e-05, + "loss": 0.0515, + "num_input_tokens_seen": 14005984, + "step": 66370 + }, + { + "epoch": 7.301980198019802, + "grad_norm": 0.9677518606185913, + "learning_rate": 4.003885772775876e-05, + "loss": 0.0364, + "num_input_tokens_seen": 14006976, + "step": 66375 + }, + { + "epoch": 7.302530253025303, + "grad_norm": 1.1575703620910645, + "learning_rate": 4.003694040778212e-05, + "loss": 0.2154, + "num_input_tokens_seen": 14008032, + "step": 66380 + }, + { + "epoch": 7.303080308030803, + "grad_norm": 0.11392103880643845, + "learning_rate": 4.003502294921722e-05, + "loss": 0.0835, + "num_input_tokens_seen": 14009120, + "step": 66385 + }, + { + "epoch": 7.303630363036303, + "grad_norm": 1.0617409944534302, + "learning_rate": 4.003310535208172e-05, + "loss": 0.0391, + "num_input_tokens_seen": 14010240, + "step": 66390 + }, + { + "epoch": 7.304180418041804, + "grad_norm": 0.02290262281894684, + "learning_rate": 4.0031187616393304e-05, + "loss": 0.0107, + "num_input_tokens_seen": 14011200, + "step": 66395 + }, + { + "epoch": 7.304730473047305, + "grad_norm": 0.05122220888733864, + "learning_rate": 4.002926974216965e-05, + "loss": 0.0264, + "num_input_tokens_seen": 14012256, + "step": 66400 + }, + { + "epoch": 7.305280528052805, + "grad_norm": 2.5612082481384277, + "learning_rate": 4.002735172942842e-05, + "loss": 0.0799, + "num_input_tokens_seen": 14013344, + "step": 66405 + }, + { + "epoch": 7.305830583058306, + "grad_norm": 0.07682778686285019, + "learning_rate": 4.002543357818731e-05, + "loss": 0.0086, + "num_input_tokens_seen": 14014400, + "step": 66410 + }, + { + "epoch": 7.306380638063806, + "grad_norm": 0.07869286835193634, + "learning_rate": 4.002351528846398e-05, + "loss": 0.0229, + "num_input_tokens_seen": 14015456, + "step": 66415 + }, + { + "epoch": 7.306930693069307, + "grad_norm": 0.18103636801242828, + "learning_rate": 4.002159686027611e-05, + "loss": 0.0376, + "num_input_tokens_seen": 14016512, + "step": 66420 + }, + { + "epoch": 7.307480748074807, + "grad_norm": 0.049975648522377014, + "learning_rate": 4.0019678293641405e-05, + "loss": 0.0743, + "num_input_tokens_seen": 14017536, + "step": 66425 + }, + { + "epoch": 7.3080308030803085, + "grad_norm": 0.12957622110843658, + "learning_rate": 4.001775958857753e-05, + "loss": 0.0261, + "num_input_tokens_seen": 14018624, + "step": 66430 + }, + { + "epoch": 7.308580858085809, + "grad_norm": 0.03164099156856537, + "learning_rate": 4.0015840745102175e-05, + "loss": 0.1543, + "num_input_tokens_seen": 14019648, + "step": 66435 + }, + { + "epoch": 7.309130913091309, + "grad_norm": 0.6542877554893494, + "learning_rate": 4.001392176323301e-05, + "loss": 0.1207, + "num_input_tokens_seen": 14020672, + "step": 66440 + }, + { + "epoch": 7.30968096809681, + "grad_norm": 0.006419263780117035, + "learning_rate": 4.001200264298773e-05, + "loss": 0.0093, + "num_input_tokens_seen": 14021696, + "step": 66445 + }, + { + "epoch": 7.31023102310231, + "grad_norm": 0.9854424595832825, + "learning_rate": 4.001008338438403e-05, + "loss": 0.0344, + "num_input_tokens_seen": 14022784, + "step": 66450 + }, + { + "epoch": 7.31078107810781, + "grad_norm": 0.04202871769666672, + "learning_rate": 4.000816398743961e-05, + "loss": 0.0352, + "num_input_tokens_seen": 14023840, + "step": 66455 + }, + { + "epoch": 7.3113311331133115, + "grad_norm": 0.35200339555740356, + "learning_rate": 4.0006244452172124e-05, + "loss": 0.0315, + "num_input_tokens_seen": 14024992, + "step": 66460 + }, + { + "epoch": 7.311881188118812, + "grad_norm": 0.10289742797613144, + "learning_rate": 4.000432477859929e-05, + "loss": 0.0069, + "num_input_tokens_seen": 14026048, + "step": 66465 + }, + { + "epoch": 7.312431243124313, + "grad_norm": 0.042376551777124405, + "learning_rate": 4.0002404966738794e-05, + "loss": 0.0798, + "num_input_tokens_seen": 14027040, + "step": 66470 + }, + { + "epoch": 7.312981298129813, + "grad_norm": 0.6464705467224121, + "learning_rate": 4.0000485016608334e-05, + "loss": 0.1126, + "num_input_tokens_seen": 14028160, + "step": 66475 + }, + { + "epoch": 7.313531353135313, + "grad_norm": 0.02725931443274021, + "learning_rate": 3.99985649282256e-05, + "loss": 0.0041, + "num_input_tokens_seen": 14029184, + "step": 66480 + }, + { + "epoch": 7.314081408140814, + "grad_norm": 0.08308073878288269, + "learning_rate": 3.999664470160829e-05, + "loss": 0.0343, + "num_input_tokens_seen": 14030304, + "step": 66485 + }, + { + "epoch": 7.3146314631463145, + "grad_norm": 1.151052474975586, + "learning_rate": 3.99947243367741e-05, + "loss": 0.0195, + "num_input_tokens_seen": 14031392, + "step": 66490 + }, + { + "epoch": 7.315181518151816, + "grad_norm": 0.05441443622112274, + "learning_rate": 3.999280383374073e-05, + "loss": 0.0479, + "num_input_tokens_seen": 14032480, + "step": 66495 + }, + { + "epoch": 7.315731573157316, + "grad_norm": 0.09841941297054291, + "learning_rate": 3.9990883192525876e-05, + "loss": 0.0185, + "num_input_tokens_seen": 14033504, + "step": 66500 + }, + { + "epoch": 7.316281628162816, + "grad_norm": 1.2631828784942627, + "learning_rate": 3.9988962413147254e-05, + "loss": 0.0985, + "num_input_tokens_seen": 14034528, + "step": 66505 + }, + { + "epoch": 7.316831683168317, + "grad_norm": 0.2861199676990509, + "learning_rate": 3.998704149562256e-05, + "loss": 0.0474, + "num_input_tokens_seen": 14035584, + "step": 66510 + }, + { + "epoch": 7.317381738173817, + "grad_norm": 0.7382895946502686, + "learning_rate": 3.998512043996949e-05, + "loss": 0.0291, + "num_input_tokens_seen": 14036640, + "step": 66515 + }, + { + "epoch": 7.3179317931793175, + "grad_norm": 0.11140184849500656, + "learning_rate": 3.998319924620576e-05, + "loss": 0.0045, + "num_input_tokens_seen": 14037664, + "step": 66520 + }, + { + "epoch": 7.318481848184819, + "grad_norm": 0.34906333684921265, + "learning_rate": 3.998127791434907e-05, + "loss": 0.0806, + "num_input_tokens_seen": 14038688, + "step": 66525 + }, + { + "epoch": 7.319031903190319, + "grad_norm": 0.10605323314666748, + "learning_rate": 3.9979356444417125e-05, + "loss": 0.0518, + "num_input_tokens_seen": 14039776, + "step": 66530 + }, + { + "epoch": 7.31958195819582, + "grad_norm": 0.41937127709388733, + "learning_rate": 3.997743483642765e-05, + "loss": 0.0567, + "num_input_tokens_seen": 14040864, + "step": 66535 + }, + { + "epoch": 7.32013201320132, + "grad_norm": 0.01265305932611227, + "learning_rate": 3.997551309039833e-05, + "loss": 0.0294, + "num_input_tokens_seen": 14041952, + "step": 66540 + }, + { + "epoch": 7.32068206820682, + "grad_norm": 0.7562652230262756, + "learning_rate": 3.9973591206346904e-05, + "loss": 0.071, + "num_input_tokens_seen": 14043008, + "step": 66545 + }, + { + "epoch": 7.321232123212321, + "grad_norm": 0.011032805778086185, + "learning_rate": 3.997166918429107e-05, + "loss": 0.0229, + "num_input_tokens_seen": 14044032, + "step": 66550 + }, + { + "epoch": 7.321782178217822, + "grad_norm": 0.022388027980923653, + "learning_rate": 3.996974702424855e-05, + "loss": 0.0209, + "num_input_tokens_seen": 14045056, + "step": 66555 + }, + { + "epoch": 7.322332233223323, + "grad_norm": 0.5965508222579956, + "learning_rate": 3.996782472623705e-05, + "loss": 0.0327, + "num_input_tokens_seen": 14046208, + "step": 66560 + }, + { + "epoch": 7.322882288228823, + "grad_norm": 0.01744917221367359, + "learning_rate": 3.996590229027428e-05, + "loss": 0.0387, + "num_input_tokens_seen": 14047296, + "step": 66565 + }, + { + "epoch": 7.323432343234323, + "grad_norm": 0.05614543333649635, + "learning_rate": 3.996397971637799e-05, + "loss": 0.077, + "num_input_tokens_seen": 14048320, + "step": 66570 + }, + { + "epoch": 7.323982398239824, + "grad_norm": 0.8066559433937073, + "learning_rate": 3.9962057004565864e-05, + "loss": 0.0447, + "num_input_tokens_seen": 14049408, + "step": 66575 + }, + { + "epoch": 7.324532453245324, + "grad_norm": 2.8746583461761475, + "learning_rate": 3.996013415485565e-05, + "loss": 0.0362, + "num_input_tokens_seen": 14050528, + "step": 66580 + }, + { + "epoch": 7.325082508250825, + "grad_norm": 0.5819210410118103, + "learning_rate": 3.995821116726505e-05, + "loss": 0.087, + "num_input_tokens_seen": 14051552, + "step": 66585 + }, + { + "epoch": 7.325632563256326, + "grad_norm": 0.023027675226330757, + "learning_rate": 3.99562880418118e-05, + "loss": 0.0192, + "num_input_tokens_seen": 14052608, + "step": 66590 + }, + { + "epoch": 7.326182618261826, + "grad_norm": 0.4871039390563965, + "learning_rate": 3.995436477851361e-05, + "loss": 0.0441, + "num_input_tokens_seen": 14053600, + "step": 66595 + }, + { + "epoch": 7.326732673267327, + "grad_norm": 0.01248351763933897, + "learning_rate": 3.995244137738823e-05, + "loss": 0.0194, + "num_input_tokens_seen": 14054656, + "step": 66600 + }, + { + "epoch": 7.327282728272827, + "grad_norm": 0.016385147348046303, + "learning_rate": 3.995051783845337e-05, + "loss": 0.021, + "num_input_tokens_seen": 14055712, + "step": 66605 + }, + { + "epoch": 7.327832783278327, + "grad_norm": 0.029706893488764763, + "learning_rate": 3.9948594161726754e-05, + "loss": 0.0142, + "num_input_tokens_seen": 14056800, + "step": 66610 + }, + { + "epoch": 7.3283828382838285, + "grad_norm": 0.11745353788137436, + "learning_rate": 3.994667034722612e-05, + "loss": 0.0461, + "num_input_tokens_seen": 14057888, + "step": 66615 + }, + { + "epoch": 7.328932893289329, + "grad_norm": 0.587128758430481, + "learning_rate": 3.99447463949692e-05, + "loss": 0.0299, + "num_input_tokens_seen": 14058944, + "step": 66620 + }, + { + "epoch": 7.32948294829483, + "grad_norm": 0.09012147039175034, + "learning_rate": 3.9942822304973725e-05, + "loss": 0.0207, + "num_input_tokens_seen": 14060032, + "step": 66625 + }, + { + "epoch": 7.33003300330033, + "grad_norm": 0.315617173910141, + "learning_rate": 3.9940898077257425e-05, + "loss": 0.0441, + "num_input_tokens_seen": 14061056, + "step": 66630 + }, + { + "epoch": 7.33058305830583, + "grad_norm": 0.021794244647026062, + "learning_rate": 3.9938973711838037e-05, + "loss": 0.002, + "num_input_tokens_seen": 14062112, + "step": 66635 + }, + { + "epoch": 7.331133113311331, + "grad_norm": 0.08975569158792496, + "learning_rate": 3.9937049208733294e-05, + "loss": 0.0604, + "num_input_tokens_seen": 14063072, + "step": 66640 + }, + { + "epoch": 7.3316831683168315, + "grad_norm": 1.6883100271224976, + "learning_rate": 3.9935124567960937e-05, + "loss": 0.1468, + "num_input_tokens_seen": 14064064, + "step": 66645 + }, + { + "epoch": 7.332233223322333, + "grad_norm": 0.8301972150802612, + "learning_rate": 3.993319978953871e-05, + "loss": 0.0399, + "num_input_tokens_seen": 14065120, + "step": 66650 + }, + { + "epoch": 7.332783278327833, + "grad_norm": 0.026407375931739807, + "learning_rate": 3.993127487348434e-05, + "loss": 0.0076, + "num_input_tokens_seen": 14066208, + "step": 66655 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.040699709206819534, + "learning_rate": 3.9929349819815574e-05, + "loss": 0.0906, + "num_input_tokens_seen": 14067296, + "step": 66660 + }, + { + "epoch": 7.333883388338834, + "grad_norm": 0.5013672113418579, + "learning_rate": 3.9927424628550155e-05, + "loss": 0.0322, + "num_input_tokens_seen": 14068384, + "step": 66665 + }, + { + "epoch": 7.334433443344334, + "grad_norm": 0.01954762265086174, + "learning_rate": 3.992549929970582e-05, + "loss": 0.0175, + "num_input_tokens_seen": 14069472, + "step": 66670 + }, + { + "epoch": 7.334983498349835, + "grad_norm": 0.2581571936607361, + "learning_rate": 3.992357383330034e-05, + "loss": 0.0155, + "num_input_tokens_seen": 14070528, + "step": 66675 + }, + { + "epoch": 7.335533553355336, + "grad_norm": 0.6729561686515808, + "learning_rate": 3.992164822935143e-05, + "loss": 0.0234, + "num_input_tokens_seen": 14071520, + "step": 66680 + }, + { + "epoch": 7.336083608360836, + "grad_norm": 0.4413395822048187, + "learning_rate": 3.9919722487876846e-05, + "loss": 0.018, + "num_input_tokens_seen": 14072544, + "step": 66685 + }, + { + "epoch": 7.336633663366337, + "grad_norm": 0.903256356716156, + "learning_rate": 3.991779660889433e-05, + "loss": 0.0233, + "num_input_tokens_seen": 14073568, + "step": 66690 + }, + { + "epoch": 7.337183718371837, + "grad_norm": 0.3992891311645508, + "learning_rate": 3.991587059242165e-05, + "loss": 0.0633, + "num_input_tokens_seen": 14074560, + "step": 66695 + }, + { + "epoch": 7.337733773377337, + "grad_norm": 0.3205169141292572, + "learning_rate": 3.991394443847655e-05, + "loss": 0.1696, + "num_input_tokens_seen": 14075584, + "step": 66700 + }, + { + "epoch": 7.338283828382838, + "grad_norm": 0.02464531548321247, + "learning_rate": 3.991201814707678e-05, + "loss": 0.0436, + "num_input_tokens_seen": 14076672, + "step": 66705 + }, + { + "epoch": 7.338833883388339, + "grad_norm": 0.05014713481068611, + "learning_rate": 3.9910091718240095e-05, + "loss": 0.0142, + "num_input_tokens_seen": 14077696, + "step": 66710 + }, + { + "epoch": 7.33938393839384, + "grad_norm": 0.05202624201774597, + "learning_rate": 3.990816515198424e-05, + "loss": 0.006, + "num_input_tokens_seen": 14078720, + "step": 66715 + }, + { + "epoch": 7.33993399339934, + "grad_norm": 0.05274531990289688, + "learning_rate": 3.9906238448326995e-05, + "loss": 0.0167, + "num_input_tokens_seen": 14079744, + "step": 66720 + }, + { + "epoch": 7.34048404840484, + "grad_norm": 0.011942792683839798, + "learning_rate": 3.990431160728609e-05, + "loss": 0.0033, + "num_input_tokens_seen": 14080896, + "step": 66725 + }, + { + "epoch": 7.341034103410341, + "grad_norm": 1.0254509449005127, + "learning_rate": 3.990238462887931e-05, + "loss": 0.0362, + "num_input_tokens_seen": 14081920, + "step": 66730 + }, + { + "epoch": 7.341584158415841, + "grad_norm": 0.17019379138946533, + "learning_rate": 3.990045751312439e-05, + "loss": 0.0129, + "num_input_tokens_seen": 14083072, + "step": 66735 + }, + { + "epoch": 7.3421342134213425, + "grad_norm": 0.008717337623238564, + "learning_rate": 3.989853026003911e-05, + "loss": 0.0351, + "num_input_tokens_seen": 14084128, + "step": 66740 + }, + { + "epoch": 7.342684268426843, + "grad_norm": 0.08912304043769836, + "learning_rate": 3.989660286964122e-05, + "loss": 0.0172, + "num_input_tokens_seen": 14085184, + "step": 66745 + }, + { + "epoch": 7.343234323432343, + "grad_norm": 0.17667056620121002, + "learning_rate": 3.9894675341948496e-05, + "loss": 0.0072, + "num_input_tokens_seen": 14086208, + "step": 66750 + }, + { + "epoch": 7.343784378437844, + "grad_norm": 0.08412844687700272, + "learning_rate": 3.989274767697869e-05, + "loss": 0.0296, + "num_input_tokens_seen": 14087360, + "step": 66755 + }, + { + "epoch": 7.344334433443344, + "grad_norm": 0.3993680477142334, + "learning_rate": 3.989081987474957e-05, + "loss": 0.0355, + "num_input_tokens_seen": 14088416, + "step": 66760 + }, + { + "epoch": 7.3448844884488445, + "grad_norm": 0.06709051877260208, + "learning_rate": 3.9888891935278926e-05, + "loss": 0.0899, + "num_input_tokens_seen": 14089504, + "step": 66765 + }, + { + "epoch": 7.3454345434543455, + "grad_norm": 0.1402687430381775, + "learning_rate": 3.98869638585845e-05, + "loss": 0.0241, + "num_input_tokens_seen": 14090592, + "step": 66770 + }, + { + "epoch": 7.345984598459846, + "grad_norm": 0.08606252074241638, + "learning_rate": 3.9885035644684065e-05, + "loss": 0.0067, + "num_input_tokens_seen": 14091712, + "step": 66775 + }, + { + "epoch": 7.346534653465347, + "grad_norm": 0.10634765028953552, + "learning_rate": 3.9883107293595406e-05, + "loss": 0.0186, + "num_input_tokens_seen": 14092768, + "step": 66780 + }, + { + "epoch": 7.347084708470847, + "grad_norm": 0.02245038002729416, + "learning_rate": 3.988117880533629e-05, + "loss": 0.0673, + "num_input_tokens_seen": 14093792, + "step": 66785 + }, + { + "epoch": 7.347634763476347, + "grad_norm": 0.027768436819314957, + "learning_rate": 3.987925017992449e-05, + "loss": 0.0529, + "num_input_tokens_seen": 14094848, + "step": 66790 + }, + { + "epoch": 7.348184818481848, + "grad_norm": 0.1387055218219757, + "learning_rate": 3.987732141737778e-05, + "loss": 0.0476, + "num_input_tokens_seen": 14095872, + "step": 66795 + }, + { + "epoch": 7.3487348734873486, + "grad_norm": 0.020801858976483345, + "learning_rate": 3.9875392517713936e-05, + "loss": 0.0071, + "num_input_tokens_seen": 14096864, + "step": 66800 + }, + { + "epoch": 7.34928492849285, + "grad_norm": 0.00880564097315073, + "learning_rate": 3.987346348095074e-05, + "loss": 0.0044, + "num_input_tokens_seen": 14097952, + "step": 66805 + }, + { + "epoch": 7.34983498349835, + "grad_norm": 0.3417624831199646, + "learning_rate": 3.987153430710595e-05, + "loss": 0.0656, + "num_input_tokens_seen": 14098976, + "step": 66810 + }, + { + "epoch": 7.35038503850385, + "grad_norm": 0.030485685914754868, + "learning_rate": 3.986960499619739e-05, + "loss": 0.0652, + "num_input_tokens_seen": 14100096, + "step": 66815 + }, + { + "epoch": 7.350935093509351, + "grad_norm": 0.024739183485507965, + "learning_rate": 3.98676755482428e-05, + "loss": 0.063, + "num_input_tokens_seen": 14101152, + "step": 66820 + }, + { + "epoch": 7.351485148514851, + "grad_norm": 0.23523473739624023, + "learning_rate": 3.986574596325998e-05, + "loss": 0.0637, + "num_input_tokens_seen": 14102176, + "step": 66825 + }, + { + "epoch": 7.3520352035203524, + "grad_norm": 0.166555255651474, + "learning_rate": 3.986381624126672e-05, + "loss": 0.0398, + "num_input_tokens_seen": 14103200, + "step": 66830 + }, + { + "epoch": 7.352585258525853, + "grad_norm": 0.5281898975372314, + "learning_rate": 3.9861886382280796e-05, + "loss": 0.0053, + "num_input_tokens_seen": 14104320, + "step": 66835 + }, + { + "epoch": 7.353135313531353, + "grad_norm": 0.15479815006256104, + "learning_rate": 3.985995638632e-05, + "loss": 0.1392, + "num_input_tokens_seen": 14105344, + "step": 66840 + }, + { + "epoch": 7.353685368536854, + "grad_norm": 0.004599140956997871, + "learning_rate": 3.9858026253402105e-05, + "loss": 0.0049, + "num_input_tokens_seen": 14106368, + "step": 66845 + }, + { + "epoch": 7.354235423542354, + "grad_norm": 0.013250380754470825, + "learning_rate": 3.9856095983544916e-05, + "loss": 0.0365, + "num_input_tokens_seen": 14107392, + "step": 66850 + }, + { + "epoch": 7.354785478547855, + "grad_norm": 0.018329253420233727, + "learning_rate": 3.985416557676622e-05, + "loss": 0.0771, + "num_input_tokens_seen": 14108448, + "step": 66855 + }, + { + "epoch": 7.3553355335533555, + "grad_norm": 0.017567669972777367, + "learning_rate": 3.985223503308381e-05, + "loss": 0.1292, + "num_input_tokens_seen": 14109472, + "step": 66860 + }, + { + "epoch": 7.355885588558856, + "grad_norm": 0.12829051911830902, + "learning_rate": 3.985030435251548e-05, + "loss": 0.0272, + "num_input_tokens_seen": 14110496, + "step": 66865 + }, + { + "epoch": 7.356435643564357, + "grad_norm": 0.06469959020614624, + "learning_rate": 3.9848373535079016e-05, + "loss": 0.0222, + "num_input_tokens_seen": 14111488, + "step": 66870 + }, + { + "epoch": 7.356985698569857, + "grad_norm": 0.011444519273936749, + "learning_rate": 3.9846442580792216e-05, + "loss": 0.0082, + "num_input_tokens_seen": 14112576, + "step": 66875 + }, + { + "epoch": 7.357535753575357, + "grad_norm": 0.09388014674186707, + "learning_rate": 3.984451148967288e-05, + "loss": 0.1253, + "num_input_tokens_seen": 14113632, + "step": 66880 + }, + { + "epoch": 7.358085808580858, + "grad_norm": 0.013518090359866619, + "learning_rate": 3.984258026173881e-05, + "loss": 0.0377, + "num_input_tokens_seen": 14114816, + "step": 66885 + }, + { + "epoch": 7.3586358635863585, + "grad_norm": 0.01076652854681015, + "learning_rate": 3.9840648897007794e-05, + "loss": 0.0039, + "num_input_tokens_seen": 14115808, + "step": 66890 + }, + { + "epoch": 7.3591859185918596, + "grad_norm": 1.550441026687622, + "learning_rate": 3.9838717395497635e-05, + "loss": 0.0867, + "num_input_tokens_seen": 14116896, + "step": 66895 + }, + { + "epoch": 7.35973597359736, + "grad_norm": 1.5130245685577393, + "learning_rate": 3.983678575722615e-05, + "loss": 0.0635, + "num_input_tokens_seen": 14117952, + "step": 66900 + }, + { + "epoch": 7.36028602860286, + "grad_norm": 0.23129843175411224, + "learning_rate": 3.983485398221112e-05, + "loss": 0.0195, + "num_input_tokens_seen": 14118976, + "step": 66905 + }, + { + "epoch": 7.360836083608361, + "grad_norm": 0.0020420809742063284, + "learning_rate": 3.983292207047036e-05, + "loss": 0.0208, + "num_input_tokens_seen": 14120064, + "step": 66910 + }, + { + "epoch": 7.361386138613861, + "grad_norm": 0.017815228551626205, + "learning_rate": 3.9830990022021675e-05, + "loss": 0.0023, + "num_input_tokens_seen": 14121088, + "step": 66915 + }, + { + "epoch": 7.361936193619362, + "grad_norm": 0.7242720127105713, + "learning_rate": 3.9829057836882874e-05, + "loss": 0.0231, + "num_input_tokens_seen": 14122240, + "step": 66920 + }, + { + "epoch": 7.362486248624863, + "grad_norm": 0.40613263845443726, + "learning_rate": 3.982712551507176e-05, + "loss": 0.0383, + "num_input_tokens_seen": 14123264, + "step": 66925 + }, + { + "epoch": 7.363036303630363, + "grad_norm": 0.08280638605356216, + "learning_rate": 3.982519305660615e-05, + "loss": 0.0298, + "num_input_tokens_seen": 14124288, + "step": 66930 + }, + { + "epoch": 7.363586358635864, + "grad_norm": 0.11641141027212143, + "learning_rate": 3.982326046150385e-05, + "loss": 0.0675, + "num_input_tokens_seen": 14125376, + "step": 66935 + }, + { + "epoch": 7.364136413641364, + "grad_norm": 0.5565266609191895, + "learning_rate": 3.982132772978267e-05, + "loss": 0.1007, + "num_input_tokens_seen": 14126464, + "step": 66940 + }, + { + "epoch": 7.364686468646864, + "grad_norm": 1.5668712854385376, + "learning_rate": 3.9819394861460416e-05, + "loss": 0.0379, + "num_input_tokens_seen": 14127456, + "step": 66945 + }, + { + "epoch": 7.365236523652365, + "grad_norm": 0.028594981878995895, + "learning_rate": 3.9817461856554915e-05, + "loss": 0.0118, + "num_input_tokens_seen": 14128480, + "step": 66950 + }, + { + "epoch": 7.365786578657866, + "grad_norm": 0.06330449879169464, + "learning_rate": 3.9815528715083986e-05, + "loss": 0.0547, + "num_input_tokens_seen": 14129536, + "step": 66955 + }, + { + "epoch": 7.366336633663367, + "grad_norm": 0.012526540085673332, + "learning_rate": 3.981359543706543e-05, + "loss": 0.0378, + "num_input_tokens_seen": 14130592, + "step": 66960 + }, + { + "epoch": 7.366886688668867, + "grad_norm": 0.3702634274959564, + "learning_rate": 3.981166202251707e-05, + "loss": 0.0184, + "num_input_tokens_seen": 14131552, + "step": 66965 + }, + { + "epoch": 7.367436743674367, + "grad_norm": 0.3909294605255127, + "learning_rate": 3.980972847145673e-05, + "loss": 0.0126, + "num_input_tokens_seen": 14132640, + "step": 66970 + }, + { + "epoch": 7.367986798679868, + "grad_norm": 1.514557123184204, + "learning_rate": 3.980779478390224e-05, + "loss": 0.13, + "num_input_tokens_seen": 14133760, + "step": 66975 + }, + { + "epoch": 7.368536853685368, + "grad_norm": 0.01733700931072235, + "learning_rate": 3.98058609598714e-05, + "loss": 0.0018, + "num_input_tokens_seen": 14134848, + "step": 66980 + }, + { + "epoch": 7.3690869086908695, + "grad_norm": 0.2534865736961365, + "learning_rate": 3.9803926999382046e-05, + "loss": 0.0122, + "num_input_tokens_seen": 14135904, + "step": 66985 + }, + { + "epoch": 7.36963696369637, + "grad_norm": 0.8785184025764465, + "learning_rate": 3.9801992902452e-05, + "loss": 0.0198, + "num_input_tokens_seen": 14136960, + "step": 66990 + }, + { + "epoch": 7.37018701870187, + "grad_norm": 0.8566928505897522, + "learning_rate": 3.980005866909909e-05, + "loss": 0.0473, + "num_input_tokens_seen": 14138048, + "step": 66995 + }, + { + "epoch": 7.370737073707371, + "grad_norm": 0.3373243510723114, + "learning_rate": 3.9798124299341145e-05, + "loss": 0.0092, + "num_input_tokens_seen": 14139104, + "step": 67000 + }, + { + "epoch": 7.371287128712871, + "grad_norm": 0.5166856050491333, + "learning_rate": 3.979618979319598e-05, + "loss": 0.0113, + "num_input_tokens_seen": 14140160, + "step": 67005 + }, + { + "epoch": 7.371837183718371, + "grad_norm": 0.6332351565361023, + "learning_rate": 3.9794255150681445e-05, + "loss": 0.0264, + "num_input_tokens_seen": 14141184, + "step": 67010 + }, + { + "epoch": 7.3723872387238725, + "grad_norm": 1.2009023427963257, + "learning_rate": 3.979232037181535e-05, + "loss": 0.143, + "num_input_tokens_seen": 14142208, + "step": 67015 + }, + { + "epoch": 7.372937293729373, + "grad_norm": 0.03187442570924759, + "learning_rate": 3.9790385456615536e-05, + "loss": 0.0472, + "num_input_tokens_seen": 14143328, + "step": 67020 + }, + { + "epoch": 7.373487348734874, + "grad_norm": 0.041091062128543854, + "learning_rate": 3.978845040509984e-05, + "loss": 0.0318, + "num_input_tokens_seen": 14144384, + "step": 67025 + }, + { + "epoch": 7.374037403740374, + "grad_norm": 0.01362503133714199, + "learning_rate": 3.97865152172861e-05, + "loss": 0.1109, + "num_input_tokens_seen": 14145376, + "step": 67030 + }, + { + "epoch": 7.374587458745874, + "grad_norm": 0.13995526731014252, + "learning_rate": 3.978457989319213e-05, + "loss": 0.0197, + "num_input_tokens_seen": 14146496, + "step": 67035 + }, + { + "epoch": 7.375137513751375, + "grad_norm": 0.8166986703872681, + "learning_rate": 3.978264443283579e-05, + "loss": 0.07, + "num_input_tokens_seen": 14147552, + "step": 67040 + }, + { + "epoch": 7.3756875687568755, + "grad_norm": 0.8121374249458313, + "learning_rate": 3.97807088362349e-05, + "loss": 0.031, + "num_input_tokens_seen": 14148576, + "step": 67045 + }, + { + "epoch": 7.376237623762377, + "grad_norm": 0.15303651988506317, + "learning_rate": 3.977877310340732e-05, + "loss": 0.0047, + "num_input_tokens_seen": 14149632, + "step": 67050 + }, + { + "epoch": 7.376787678767877, + "grad_norm": 0.093876414000988, + "learning_rate": 3.9776837234370875e-05, + "loss": 0.0055, + "num_input_tokens_seen": 14150656, + "step": 67055 + }, + { + "epoch": 7.377337733773377, + "grad_norm": 0.023782553151249886, + "learning_rate": 3.977490122914342e-05, + "loss": 0.0312, + "num_input_tokens_seen": 14151680, + "step": 67060 + }, + { + "epoch": 7.377887788778878, + "grad_norm": 1.8769644498825073, + "learning_rate": 3.977296508774278e-05, + "loss": 0.1946, + "num_input_tokens_seen": 14152768, + "step": 67065 + }, + { + "epoch": 7.378437843784378, + "grad_norm": 0.04349517449736595, + "learning_rate": 3.977102881018682e-05, + "loss": 0.0746, + "num_input_tokens_seen": 14153824, + "step": 67070 + }, + { + "epoch": 7.378987898789879, + "grad_norm": 0.8462116122245789, + "learning_rate": 3.976909239649337e-05, + "loss": 0.0513, + "num_input_tokens_seen": 14154912, + "step": 67075 + }, + { + "epoch": 7.37953795379538, + "grad_norm": 0.05925355106592178, + "learning_rate": 3.9767155846680274e-05, + "loss": 0.1241, + "num_input_tokens_seen": 14156064, + "step": 67080 + }, + { + "epoch": 7.38008800880088, + "grad_norm": 0.033410076051950455, + "learning_rate": 3.97652191607654e-05, + "loss": 0.0154, + "num_input_tokens_seen": 14157120, + "step": 67085 + }, + { + "epoch": 7.380638063806381, + "grad_norm": 0.21202696859836578, + "learning_rate": 3.9763282338766575e-05, + "loss": 0.0438, + "num_input_tokens_seen": 14158272, + "step": 67090 + }, + { + "epoch": 7.381188118811881, + "grad_norm": 0.23704448342323303, + "learning_rate": 3.976134538070167e-05, + "loss": 0.049, + "num_input_tokens_seen": 14159360, + "step": 67095 + }, + { + "epoch": 7.381738173817382, + "grad_norm": 0.042417991906404495, + "learning_rate": 3.975940828658853e-05, + "loss": 0.0493, + "num_input_tokens_seen": 14160384, + "step": 67100 + }, + { + "epoch": 7.382288228822882, + "grad_norm": 0.031076522544026375, + "learning_rate": 3.9757471056444996e-05, + "loss": 0.1262, + "num_input_tokens_seen": 14161440, + "step": 67105 + }, + { + "epoch": 7.382838283828383, + "grad_norm": 1.4385911226272583, + "learning_rate": 3.975553369028894e-05, + "loss": 0.0494, + "num_input_tokens_seen": 14162592, + "step": 67110 + }, + { + "epoch": 7.383388338833884, + "grad_norm": 0.6413736343383789, + "learning_rate": 3.97535961881382e-05, + "loss": 0.0834, + "num_input_tokens_seen": 14163616, + "step": 67115 + }, + { + "epoch": 7.383938393839384, + "grad_norm": 0.006134268827736378, + "learning_rate": 3.975165855001066e-05, + "loss": 0.0309, + "num_input_tokens_seen": 14164704, + "step": 67120 + }, + { + "epoch": 7.384488448844884, + "grad_norm": 0.07529140263795853, + "learning_rate": 3.974972077592415e-05, + "loss": 0.0821, + "num_input_tokens_seen": 14165728, + "step": 67125 + }, + { + "epoch": 7.385038503850385, + "grad_norm": 0.4046725928783417, + "learning_rate": 3.9747782865896544e-05, + "loss": 0.0478, + "num_input_tokens_seen": 14166752, + "step": 67130 + }, + { + "epoch": 7.385588558855885, + "grad_norm": 0.37417373061180115, + "learning_rate": 3.9745844819945696e-05, + "loss": 0.0177, + "num_input_tokens_seen": 14167840, + "step": 67135 + }, + { + "epoch": 7.3861386138613865, + "grad_norm": 0.666889488697052, + "learning_rate": 3.9743906638089476e-05, + "loss": 0.0695, + "num_input_tokens_seen": 14168928, + "step": 67140 + }, + { + "epoch": 7.386688668866887, + "grad_norm": 0.18463081121444702, + "learning_rate": 3.9741968320345746e-05, + "loss": 0.0139, + "num_input_tokens_seen": 14169984, + "step": 67145 + }, + { + "epoch": 7.387238723872387, + "grad_norm": 0.4032478928565979, + "learning_rate": 3.974002986673237e-05, + "loss": 0.0186, + "num_input_tokens_seen": 14171008, + "step": 67150 + }, + { + "epoch": 7.387788778877888, + "grad_norm": 0.01517655048519373, + "learning_rate": 3.973809127726721e-05, + "loss": 0.0376, + "num_input_tokens_seen": 14172096, + "step": 67155 + }, + { + "epoch": 7.388338833883388, + "grad_norm": 1.435023307800293, + "learning_rate": 3.973615255196813e-05, + "loss": 0.0901, + "num_input_tokens_seen": 14173152, + "step": 67160 + }, + { + "epoch": 7.388888888888889, + "grad_norm": 0.10065122693777084, + "learning_rate": 3.9734213690853006e-05, + "loss": 0.0838, + "num_input_tokens_seen": 14174304, + "step": 67165 + }, + { + "epoch": 7.3894389438943895, + "grad_norm": 0.04409809038043022, + "learning_rate": 3.9732274693939705e-05, + "loss": 0.1616, + "num_input_tokens_seen": 14175328, + "step": 67170 + }, + { + "epoch": 7.38998899889989, + "grad_norm": 0.8051329851150513, + "learning_rate": 3.973033556124609e-05, + "loss": 0.0183, + "num_input_tokens_seen": 14176416, + "step": 67175 + }, + { + "epoch": 7.390539053905391, + "grad_norm": 0.01739991270005703, + "learning_rate": 3.972839629279005e-05, + "loss": 0.0275, + "num_input_tokens_seen": 14177440, + "step": 67180 + }, + { + "epoch": 7.391089108910891, + "grad_norm": 0.08060631155967712, + "learning_rate": 3.972645688858944e-05, + "loss": 0.0067, + "num_input_tokens_seen": 14178496, + "step": 67185 + }, + { + "epoch": 7.391639163916391, + "grad_norm": 1.0185142755508423, + "learning_rate": 3.972451734866215e-05, + "loss": 0.0243, + "num_input_tokens_seen": 14179552, + "step": 67190 + }, + { + "epoch": 7.392189218921892, + "grad_norm": 0.45650598406791687, + "learning_rate": 3.972257767302605e-05, + "loss": 0.0225, + "num_input_tokens_seen": 14180576, + "step": 67195 + }, + { + "epoch": 7.3927392739273925, + "grad_norm": 0.13991035521030426, + "learning_rate": 3.972063786169902e-05, + "loss": 0.037, + "num_input_tokens_seen": 14181632, + "step": 67200 + }, + { + "epoch": 7.393289328932894, + "grad_norm": 0.026955541223287582, + "learning_rate": 3.971869791469892e-05, + "loss": 0.0257, + "num_input_tokens_seen": 14182720, + "step": 67205 + }, + { + "epoch": 7.393839383938394, + "grad_norm": 0.016142740845680237, + "learning_rate": 3.971675783204365e-05, + "loss": 0.0555, + "num_input_tokens_seen": 14183808, + "step": 67210 + }, + { + "epoch": 7.394389438943894, + "grad_norm": 0.2782140374183655, + "learning_rate": 3.971481761375109e-05, + "loss": 0.0278, + "num_input_tokens_seen": 14184832, + "step": 67215 + }, + { + "epoch": 7.394939493949395, + "grad_norm": 0.025661250576376915, + "learning_rate": 3.9712877259839114e-05, + "loss": 0.035, + "num_input_tokens_seen": 14185920, + "step": 67220 + }, + { + "epoch": 7.395489548954895, + "grad_norm": 0.027291782200336456, + "learning_rate": 3.971093677032561e-05, + "loss": 0.0061, + "num_input_tokens_seen": 14186976, + "step": 67225 + }, + { + "epoch": 7.396039603960396, + "grad_norm": 0.019511045888066292, + "learning_rate": 3.970899614522846e-05, + "loss": 0.0016, + "num_input_tokens_seen": 14188064, + "step": 67230 + }, + { + "epoch": 7.396589658965897, + "grad_norm": 0.10126545280218124, + "learning_rate": 3.970705538456554e-05, + "loss": 0.0177, + "num_input_tokens_seen": 14189152, + "step": 67235 + }, + { + "epoch": 7.397139713971397, + "grad_norm": 0.07678389549255371, + "learning_rate": 3.9705114488354764e-05, + "loss": 0.0056, + "num_input_tokens_seen": 14190176, + "step": 67240 + }, + { + "epoch": 7.397689768976898, + "grad_norm": 1.3739159107208252, + "learning_rate": 3.9703173456614e-05, + "loss": 0.0814, + "num_input_tokens_seen": 14191264, + "step": 67245 + }, + { + "epoch": 7.398239823982398, + "grad_norm": 0.13822805881500244, + "learning_rate": 3.970123228936113e-05, + "loss": 0.0904, + "num_input_tokens_seen": 14192320, + "step": 67250 + }, + { + "epoch": 7.398789878987899, + "grad_norm": 0.029507704079151154, + "learning_rate": 3.969929098661407e-05, + "loss": 0.0464, + "num_input_tokens_seen": 14193344, + "step": 67255 + }, + { + "epoch": 7.399339933993399, + "grad_norm": 0.31375473737716675, + "learning_rate": 3.9697349548390696e-05, + "loss": 0.0128, + "num_input_tokens_seen": 14194400, + "step": 67260 + }, + { + "epoch": 7.3998899889989, + "grad_norm": 0.9448689222335815, + "learning_rate": 3.96954079747089e-05, + "loss": 0.0282, + "num_input_tokens_seen": 14195456, + "step": 67265 + }, + { + "epoch": 7.400440044004401, + "grad_norm": 0.9908038377761841, + "learning_rate": 3.969346626558657e-05, + "loss": 0.0737, + "num_input_tokens_seen": 14196608, + "step": 67270 + }, + { + "epoch": 7.400990099009901, + "grad_norm": 0.013531893491744995, + "learning_rate": 3.969152442104163e-05, + "loss": 0.0032, + "num_input_tokens_seen": 14197696, + "step": 67275 + }, + { + "epoch": 7.401540154015402, + "grad_norm": 0.07339884340763092, + "learning_rate": 3.968958244109196e-05, + "loss": 0.0495, + "num_input_tokens_seen": 14198752, + "step": 67280 + }, + { + "epoch": 7.402090209020902, + "grad_norm": 0.009359879419207573, + "learning_rate": 3.968764032575544e-05, + "loss": 0.0142, + "num_input_tokens_seen": 14199776, + "step": 67285 + }, + { + "epoch": 7.402640264026402, + "grad_norm": 0.01465163379907608, + "learning_rate": 3.9685698075050006e-05, + "loss": 0.0643, + "num_input_tokens_seen": 14200800, + "step": 67290 + }, + { + "epoch": 7.4031903190319035, + "grad_norm": 0.12807601690292358, + "learning_rate": 3.968375568899353e-05, + "loss": 0.0115, + "num_input_tokens_seen": 14201888, + "step": 67295 + }, + { + "epoch": 7.403740374037404, + "grad_norm": 0.03650163486599922, + "learning_rate": 3.968181316760392e-05, + "loss": 0.0519, + "num_input_tokens_seen": 14202976, + "step": 67300 + }, + { + "epoch": 7.404290429042904, + "grad_norm": 0.02909858338534832, + "learning_rate": 3.967987051089909e-05, + "loss": 0.0158, + "num_input_tokens_seen": 14204000, + "step": 67305 + }, + { + "epoch": 7.404840484048405, + "grad_norm": 0.09086998552083969, + "learning_rate": 3.9677927718896936e-05, + "loss": 0.0549, + "num_input_tokens_seen": 14205056, + "step": 67310 + }, + { + "epoch": 7.405390539053905, + "grad_norm": 1.2537357807159424, + "learning_rate": 3.967598479161536e-05, + "loss": 0.0424, + "num_input_tokens_seen": 14206112, + "step": 67315 + }, + { + "epoch": 7.405940594059406, + "grad_norm": 0.28289860486984253, + "learning_rate": 3.967404172907228e-05, + "loss": 0.0373, + "num_input_tokens_seen": 14207168, + "step": 67320 + }, + { + "epoch": 7.4064906490649065, + "grad_norm": 0.01566609926521778, + "learning_rate": 3.96720985312856e-05, + "loss": 0.0125, + "num_input_tokens_seen": 14208192, + "step": 67325 + }, + { + "epoch": 7.407040704070407, + "grad_norm": 0.03636155650019646, + "learning_rate": 3.967015519827322e-05, + "loss": 0.0018, + "num_input_tokens_seen": 14209248, + "step": 67330 + }, + { + "epoch": 7.407590759075908, + "grad_norm": 0.15998949110507965, + "learning_rate": 3.966821173005306e-05, + "loss": 0.0073, + "num_input_tokens_seen": 14210368, + "step": 67335 + }, + { + "epoch": 7.408140814081408, + "grad_norm": 0.40591442584991455, + "learning_rate": 3.966626812664304e-05, + "loss": 0.0091, + "num_input_tokens_seen": 14211360, + "step": 67340 + }, + { + "epoch": 7.408690869086909, + "grad_norm": 0.04021672531962395, + "learning_rate": 3.966432438806106e-05, + "loss": 0.0427, + "num_input_tokens_seen": 14212416, + "step": 67345 + }, + { + "epoch": 7.409240924092409, + "grad_norm": 0.058037009090185165, + "learning_rate": 3.966238051432504e-05, + "loss": 0.0284, + "num_input_tokens_seen": 14213440, + "step": 67350 + }, + { + "epoch": 7.4097909790979095, + "grad_norm": 0.019133925437927246, + "learning_rate": 3.9660436505452894e-05, + "loss": 0.0031, + "num_input_tokens_seen": 14214528, + "step": 67355 + }, + { + "epoch": 7.410341034103411, + "grad_norm": 0.5046377778053284, + "learning_rate": 3.965849236146254e-05, + "loss": 0.0321, + "num_input_tokens_seen": 14215552, + "step": 67360 + }, + { + "epoch": 7.410891089108911, + "grad_norm": 1.70872962474823, + "learning_rate": 3.9656548082371894e-05, + "loss": 0.0894, + "num_input_tokens_seen": 14216704, + "step": 67365 + }, + { + "epoch": 7.411441144114411, + "grad_norm": 0.1279860883951187, + "learning_rate": 3.9654603668198874e-05, + "loss": 0.0268, + "num_input_tokens_seen": 14217760, + "step": 67370 + }, + { + "epoch": 7.411991199119912, + "grad_norm": 0.6234844923019409, + "learning_rate": 3.965265911896141e-05, + "loss": 0.1153, + "num_input_tokens_seen": 14218816, + "step": 67375 + }, + { + "epoch": 7.412541254125412, + "grad_norm": 0.026552747935056686, + "learning_rate": 3.965071443467741e-05, + "loss": 0.0296, + "num_input_tokens_seen": 14219904, + "step": 67380 + }, + { + "epoch": 7.413091309130913, + "grad_norm": 0.008292731828987598, + "learning_rate": 3.964876961536481e-05, + "loss": 0.0249, + "num_input_tokens_seen": 14220992, + "step": 67385 + }, + { + "epoch": 7.413641364136414, + "grad_norm": 0.012623880989849567, + "learning_rate": 3.9646824661041534e-05, + "loss": 0.0059, + "num_input_tokens_seen": 14221984, + "step": 67390 + }, + { + "epoch": 7.414191419141914, + "grad_norm": 0.015630846843123436, + "learning_rate": 3.96448795717255e-05, + "loss": 0.0041, + "num_input_tokens_seen": 14223008, + "step": 67395 + }, + { + "epoch": 7.414741474147415, + "grad_norm": 0.22054080665111542, + "learning_rate": 3.964293434743463e-05, + "loss": 0.0067, + "num_input_tokens_seen": 14224064, + "step": 67400 + }, + { + "epoch": 7.415291529152915, + "grad_norm": 0.031788021326065063, + "learning_rate": 3.964098898818688e-05, + "loss": 0.0057, + "num_input_tokens_seen": 14225088, + "step": 67405 + }, + { + "epoch": 7.415841584158416, + "grad_norm": 0.18577003479003906, + "learning_rate": 3.963904349400015e-05, + "loss": 0.0134, + "num_input_tokens_seen": 14226112, + "step": 67410 + }, + { + "epoch": 7.416391639163916, + "grad_norm": 0.9840144515037537, + "learning_rate": 3.963709786489237e-05, + "loss": 0.0442, + "num_input_tokens_seen": 14227136, + "step": 67415 + }, + { + "epoch": 7.416941694169417, + "grad_norm": 1.260834813117981, + "learning_rate": 3.963515210088149e-05, + "loss": 0.0443, + "num_input_tokens_seen": 14228160, + "step": 67420 + }, + { + "epoch": 7.417491749174918, + "grad_norm": 0.16753973066806793, + "learning_rate": 3.963320620198544e-05, + "loss": 0.0633, + "num_input_tokens_seen": 14229152, + "step": 67425 + }, + { + "epoch": 7.418041804180418, + "grad_norm": 0.3183155655860901, + "learning_rate": 3.9631260168222144e-05, + "loss": 0.0327, + "num_input_tokens_seen": 14230240, + "step": 67430 + }, + { + "epoch": 7.418591859185918, + "grad_norm": 0.048261858522892, + "learning_rate": 3.9629313999609554e-05, + "loss": 0.0625, + "num_input_tokens_seen": 14231296, + "step": 67435 + }, + { + "epoch": 7.419141914191419, + "grad_norm": 0.4263778030872345, + "learning_rate": 3.962736769616559e-05, + "loss": 0.0078, + "num_input_tokens_seen": 14232384, + "step": 67440 + }, + { + "epoch": 7.419691969196919, + "grad_norm": 0.7730094790458679, + "learning_rate": 3.962542125790819e-05, + "loss": 0.0289, + "num_input_tokens_seen": 14233472, + "step": 67445 + }, + { + "epoch": 7.4202420242024205, + "grad_norm": 0.059608250856399536, + "learning_rate": 3.96234746848553e-05, + "loss": 0.0397, + "num_input_tokens_seen": 14234496, + "step": 67450 + }, + { + "epoch": 7.420792079207921, + "grad_norm": 0.08190082758665085, + "learning_rate": 3.962152797702487e-05, + "loss": 0.0938, + "num_input_tokens_seen": 14235680, + "step": 67455 + }, + { + "epoch": 7.421342134213421, + "grad_norm": 1.28766667842865, + "learning_rate": 3.961958113443483e-05, + "loss": 0.0121, + "num_input_tokens_seen": 14236672, + "step": 67460 + }, + { + "epoch": 7.421892189218922, + "grad_norm": 0.03436720743775368, + "learning_rate": 3.9617634157103136e-05, + "loss": 0.0059, + "num_input_tokens_seen": 14237664, + "step": 67465 + }, + { + "epoch": 7.422442244224422, + "grad_norm": 0.212777242064476, + "learning_rate": 3.961568704504771e-05, + "loss": 0.0161, + "num_input_tokens_seen": 14238656, + "step": 67470 + }, + { + "epoch": 7.422992299229923, + "grad_norm": 0.013988989405333996, + "learning_rate": 3.961373979828651e-05, + "loss": 0.0629, + "num_input_tokens_seen": 14239712, + "step": 67475 + }, + { + "epoch": 7.4235423542354235, + "grad_norm": 0.08040327578783035, + "learning_rate": 3.961179241683749e-05, + "loss": 0.0056, + "num_input_tokens_seen": 14240832, + "step": 67480 + }, + { + "epoch": 7.424092409240924, + "grad_norm": 0.010681888088583946, + "learning_rate": 3.960984490071858e-05, + "loss": 0.0138, + "num_input_tokens_seen": 14241952, + "step": 67485 + }, + { + "epoch": 7.424642464246425, + "grad_norm": 0.16633160412311554, + "learning_rate": 3.960789724994774e-05, + "loss": 0.0511, + "num_input_tokens_seen": 14243072, + "step": 67490 + }, + { + "epoch": 7.425192519251925, + "grad_norm": 0.013433173298835754, + "learning_rate": 3.960594946454293e-05, + "loss": 0.0978, + "num_input_tokens_seen": 14244192, + "step": 67495 + }, + { + "epoch": 7.425742574257426, + "grad_norm": 0.024927768856287003, + "learning_rate": 3.960400154452209e-05, + "loss": 0.0134, + "num_input_tokens_seen": 14245216, + "step": 67500 + }, + { + "epoch": 7.426292629262926, + "grad_norm": 0.20231324434280396, + "learning_rate": 3.960205348990317e-05, + "loss": 0.0073, + "num_input_tokens_seen": 14246272, + "step": 67505 + }, + { + "epoch": 7.4268426842684265, + "grad_norm": 1.1176966428756714, + "learning_rate": 3.960010530070414e-05, + "loss": 0.0354, + "num_input_tokens_seen": 14247360, + "step": 67510 + }, + { + "epoch": 7.427392739273928, + "grad_norm": 0.009607170708477497, + "learning_rate": 3.9598156976942935e-05, + "loss": 0.0055, + "num_input_tokens_seen": 14248384, + "step": 67515 + }, + { + "epoch": 7.427942794279428, + "grad_norm": 0.9299288988113403, + "learning_rate": 3.959620851863752e-05, + "loss": 0.0409, + "num_input_tokens_seen": 14249440, + "step": 67520 + }, + { + "epoch": 7.428492849284929, + "grad_norm": 0.007265150081366301, + "learning_rate": 3.959425992580587e-05, + "loss": 0.0492, + "num_input_tokens_seen": 14250528, + "step": 67525 + }, + { + "epoch": 7.429042904290429, + "grad_norm": 2.4314193725585938, + "learning_rate": 3.959231119846591e-05, + "loss": 0.1167, + "num_input_tokens_seen": 14251584, + "step": 67530 + }, + { + "epoch": 7.429592959295929, + "grad_norm": 1.5268718004226685, + "learning_rate": 3.9590362336635636e-05, + "loss": 0.0478, + "num_input_tokens_seen": 14252640, + "step": 67535 + }, + { + "epoch": 7.43014301430143, + "grad_norm": 0.03363143652677536, + "learning_rate": 3.9588413340332984e-05, + "loss": 0.0609, + "num_input_tokens_seen": 14253696, + "step": 67540 + }, + { + "epoch": 7.430693069306931, + "grad_norm": 0.29611074924468994, + "learning_rate": 3.9586464209575934e-05, + "loss": 0.0953, + "num_input_tokens_seen": 14254784, + "step": 67545 + }, + { + "epoch": 7.431243124312431, + "grad_norm": 0.04300183430314064, + "learning_rate": 3.958451494438244e-05, + "loss": 0.1862, + "num_input_tokens_seen": 14255840, + "step": 67550 + }, + { + "epoch": 7.431793179317932, + "grad_norm": 0.12009445577859879, + "learning_rate": 3.958256554477047e-05, + "loss": 0.0756, + "num_input_tokens_seen": 14256896, + "step": 67555 + }, + { + "epoch": 7.432343234323432, + "grad_norm": 1.7941502332687378, + "learning_rate": 3.9580616010757985e-05, + "loss": 0.0298, + "num_input_tokens_seen": 14258016, + "step": 67560 + }, + { + "epoch": 7.432893289328933, + "grad_norm": 0.06921741366386414, + "learning_rate": 3.957866634236297e-05, + "loss": 0.0837, + "num_input_tokens_seen": 14259072, + "step": 67565 + }, + { + "epoch": 7.433443344334433, + "grad_norm": 0.008369728922843933, + "learning_rate": 3.957671653960337e-05, + "loss": 0.0212, + "num_input_tokens_seen": 14260096, + "step": 67570 + }, + { + "epoch": 7.433993399339934, + "grad_norm": 0.0664948895573616, + "learning_rate": 3.957476660249718e-05, + "loss": 0.0112, + "num_input_tokens_seen": 14261216, + "step": 67575 + }, + { + "epoch": 7.434543454345435, + "grad_norm": 0.04870346188545227, + "learning_rate": 3.957281653106235e-05, + "loss": 0.0061, + "num_input_tokens_seen": 14262272, + "step": 67580 + }, + { + "epoch": 7.435093509350935, + "grad_norm": 1.4736002683639526, + "learning_rate": 3.9570866325316865e-05, + "loss": 0.0501, + "num_input_tokens_seen": 14263328, + "step": 67585 + }, + { + "epoch": 7.435643564356436, + "grad_norm": 0.05192948877811432, + "learning_rate": 3.95689159852787e-05, + "loss": 0.0376, + "num_input_tokens_seen": 14264352, + "step": 67590 + }, + { + "epoch": 7.436193619361936, + "grad_norm": 0.9027845859527588, + "learning_rate": 3.956696551096582e-05, + "loss": 0.0286, + "num_input_tokens_seen": 14265440, + "step": 67595 + }, + { + "epoch": 7.436743674367436, + "grad_norm": 0.7318345904350281, + "learning_rate": 3.9565014902396216e-05, + "loss": 0.109, + "num_input_tokens_seen": 14266464, + "step": 67600 + }, + { + "epoch": 7.4372937293729375, + "grad_norm": 0.01778084598481655, + "learning_rate": 3.956306415958786e-05, + "loss": 0.0092, + "num_input_tokens_seen": 14267520, + "step": 67605 + }, + { + "epoch": 7.437843784378438, + "grad_norm": 0.028631802648305893, + "learning_rate": 3.9561113282558724e-05, + "loss": 0.0138, + "num_input_tokens_seen": 14268608, + "step": 67610 + }, + { + "epoch": 7.438393839383938, + "grad_norm": 0.01752961054444313, + "learning_rate": 3.95591622713268e-05, + "loss": 0.0057, + "num_input_tokens_seen": 14269632, + "step": 67615 + }, + { + "epoch": 7.438943894389439, + "grad_norm": 0.021891698241233826, + "learning_rate": 3.9557211125910057e-05, + "loss": 0.0327, + "num_input_tokens_seen": 14270656, + "step": 67620 + }, + { + "epoch": 7.439493949394939, + "grad_norm": 0.025344068184494972, + "learning_rate": 3.955525984632649e-05, + "loss": 0.0072, + "num_input_tokens_seen": 14271776, + "step": 67625 + }, + { + "epoch": 7.44004400440044, + "grad_norm": 0.033210135996341705, + "learning_rate": 3.955330843259407e-05, + "loss": 0.0193, + "num_input_tokens_seen": 14272864, + "step": 67630 + }, + { + "epoch": 7.4405940594059405, + "grad_norm": 0.07106093317270279, + "learning_rate": 3.95513568847308e-05, + "loss": 0.007, + "num_input_tokens_seen": 14273952, + "step": 67635 + }, + { + "epoch": 7.441144114411441, + "grad_norm": 0.7817365527153015, + "learning_rate": 3.9549405202754654e-05, + "loss": 0.0534, + "num_input_tokens_seen": 14275040, + "step": 67640 + }, + { + "epoch": 7.441694169416942, + "grad_norm": 1.4419870376586914, + "learning_rate": 3.954745338668362e-05, + "loss": 0.0945, + "num_input_tokens_seen": 14276096, + "step": 67645 + }, + { + "epoch": 7.442244224422442, + "grad_norm": 0.07701214402914047, + "learning_rate": 3.954550143653568e-05, + "loss": 0.0221, + "num_input_tokens_seen": 14277280, + "step": 67650 + }, + { + "epoch": 7.442794279427943, + "grad_norm": 0.005563228856772184, + "learning_rate": 3.954354935232885e-05, + "loss": 0.0743, + "num_input_tokens_seen": 14278336, + "step": 67655 + }, + { + "epoch": 7.443344334433443, + "grad_norm": 0.04266329109668732, + "learning_rate": 3.954159713408109e-05, + "loss": 0.0747, + "num_input_tokens_seen": 14279456, + "step": 67660 + }, + { + "epoch": 7.4438943894389435, + "grad_norm": 0.007795314770191908, + "learning_rate": 3.9539644781810404e-05, + "loss": 0.0605, + "num_input_tokens_seen": 14280512, + "step": 67665 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 0.00557738309726119, + "learning_rate": 3.95376922955348e-05, + "loss": 0.0142, + "num_input_tokens_seen": 14281568, + "step": 67670 + }, + { + "epoch": 7.444994499449945, + "grad_norm": 0.010654594749212265, + "learning_rate": 3.953573967527225e-05, + "loss": 0.0082, + "num_input_tokens_seen": 14282688, + "step": 67675 + }, + { + "epoch": 7.445544554455446, + "grad_norm": 0.019750135019421577, + "learning_rate": 3.9533786921040776e-05, + "loss": 0.0172, + "num_input_tokens_seen": 14283680, + "step": 67680 + }, + { + "epoch": 7.446094609460946, + "grad_norm": 0.07720331847667694, + "learning_rate": 3.953183403285836e-05, + "loss": 0.1551, + "num_input_tokens_seen": 14284704, + "step": 67685 + }, + { + "epoch": 7.446644664466446, + "grad_norm": 0.47950950264930725, + "learning_rate": 3.9529881010743e-05, + "loss": 0.177, + "num_input_tokens_seen": 14285728, + "step": 67690 + }, + { + "epoch": 7.447194719471947, + "grad_norm": 0.13186298310756683, + "learning_rate": 3.9527927854712695e-05, + "loss": 0.1113, + "num_input_tokens_seen": 14286720, + "step": 67695 + }, + { + "epoch": 7.447744774477448, + "grad_norm": 0.1216205358505249, + "learning_rate": 3.952597456478545e-05, + "loss": 0.0182, + "num_input_tokens_seen": 14287712, + "step": 67700 + }, + { + "epoch": 7.448294829482949, + "grad_norm": 0.03859042003750801, + "learning_rate": 3.952402114097927e-05, + "loss": 0.0478, + "num_input_tokens_seen": 14288832, + "step": 67705 + }, + { + "epoch": 7.448844884488449, + "grad_norm": 0.0724145695567131, + "learning_rate": 3.952206758331216e-05, + "loss": 0.0987, + "num_input_tokens_seen": 14289856, + "step": 67710 + }, + { + "epoch": 7.449394939493949, + "grad_norm": 0.019160255789756775, + "learning_rate": 3.952011389180211e-05, + "loss": 0.0045, + "num_input_tokens_seen": 14290912, + "step": 67715 + }, + { + "epoch": 7.44994499449945, + "grad_norm": 0.030386755242943764, + "learning_rate": 3.951816006646716e-05, + "loss": 0.0072, + "num_input_tokens_seen": 14291968, + "step": 67720 + }, + { + "epoch": 7.4504950495049505, + "grad_norm": 0.25863122940063477, + "learning_rate": 3.951620610732527e-05, + "loss": 0.0072, + "num_input_tokens_seen": 14293056, + "step": 67725 + }, + { + "epoch": 7.451045104510451, + "grad_norm": 0.1306181699037552, + "learning_rate": 3.9514252014394496e-05, + "loss": 0.0154, + "num_input_tokens_seen": 14294080, + "step": 67730 + }, + { + "epoch": 7.451595159515952, + "grad_norm": 0.789461612701416, + "learning_rate": 3.951229778769281e-05, + "loss": 0.119, + "num_input_tokens_seen": 14295168, + "step": 67735 + }, + { + "epoch": 7.452145214521452, + "grad_norm": 0.58260178565979, + "learning_rate": 3.9510343427238246e-05, + "loss": 0.0443, + "num_input_tokens_seen": 14296224, + "step": 67740 + }, + { + "epoch": 7.452695269526953, + "grad_norm": 0.911709189414978, + "learning_rate": 3.950838893304881e-05, + "loss": 0.0269, + "num_input_tokens_seen": 14297248, + "step": 67745 + }, + { + "epoch": 7.453245324532453, + "grad_norm": 0.11052534729242325, + "learning_rate": 3.9506434305142506e-05, + "loss": 0.0644, + "num_input_tokens_seen": 14298368, + "step": 67750 + }, + { + "epoch": 7.4537953795379535, + "grad_norm": 0.10301186889410019, + "learning_rate": 3.9504479543537366e-05, + "loss": 0.0338, + "num_input_tokens_seen": 14299424, + "step": 67755 + }, + { + "epoch": 7.4543454345434546, + "grad_norm": 0.03289072960615158, + "learning_rate": 3.950252464825139e-05, + "loss": 0.0632, + "num_input_tokens_seen": 14300480, + "step": 67760 + }, + { + "epoch": 7.454895489548955, + "grad_norm": 0.016341781243681908, + "learning_rate": 3.950056961930262e-05, + "loss": 0.058, + "num_input_tokens_seen": 14301472, + "step": 67765 + }, + { + "epoch": 7.455445544554456, + "grad_norm": 0.25807350873947144, + "learning_rate": 3.9498614456709045e-05, + "loss": 0.0242, + "num_input_tokens_seen": 14302496, + "step": 67770 + }, + { + "epoch": 7.455995599559956, + "grad_norm": 0.6808936595916748, + "learning_rate": 3.949665916048869e-05, + "loss": 0.0175, + "num_input_tokens_seen": 14303520, + "step": 67775 + }, + { + "epoch": 7.456545654565456, + "grad_norm": 0.055129602551460266, + "learning_rate": 3.94947037306596e-05, + "loss": 0.0118, + "num_input_tokens_seen": 14304608, + "step": 67780 + }, + { + "epoch": 7.457095709570957, + "grad_norm": 0.04508653283119202, + "learning_rate": 3.949274816723977e-05, + "loss": 0.0495, + "num_input_tokens_seen": 14305664, + "step": 67785 + }, + { + "epoch": 7.457645764576458, + "grad_norm": 0.14185816049575806, + "learning_rate": 3.9490792470247244e-05, + "loss": 0.0998, + "num_input_tokens_seen": 14306720, + "step": 67790 + }, + { + "epoch": 7.458195819581958, + "grad_norm": 0.116663858294487, + "learning_rate": 3.9488836639700036e-05, + "loss": 0.054, + "num_input_tokens_seen": 14307712, + "step": 67795 + }, + { + "epoch": 7.458745874587459, + "grad_norm": 0.004532953724265099, + "learning_rate": 3.948688067561617e-05, + "loss": 0.0172, + "num_input_tokens_seen": 14308768, + "step": 67800 + }, + { + "epoch": 7.459295929592959, + "grad_norm": 0.023544898256659508, + "learning_rate": 3.948492457801367e-05, + "loss": 0.0063, + "num_input_tokens_seen": 14309760, + "step": 67805 + }, + { + "epoch": 7.45984598459846, + "grad_norm": 1.0957379341125488, + "learning_rate": 3.948296834691058e-05, + "loss": 0.063, + "num_input_tokens_seen": 14310752, + "step": 67810 + }, + { + "epoch": 7.46039603960396, + "grad_norm": 0.57252037525177, + "learning_rate": 3.948101198232491e-05, + "loss": 0.053, + "num_input_tokens_seen": 14311872, + "step": 67815 + }, + { + "epoch": 7.460946094609461, + "grad_norm": 0.07395058125257492, + "learning_rate": 3.947905548427471e-05, + "loss": 0.0174, + "num_input_tokens_seen": 14312928, + "step": 67820 + }, + { + "epoch": 7.461496149614962, + "grad_norm": 0.14716286957263947, + "learning_rate": 3.9477098852778e-05, + "loss": 0.04, + "num_input_tokens_seen": 14314048, + "step": 67825 + }, + { + "epoch": 7.462046204620462, + "grad_norm": 0.10975323617458344, + "learning_rate": 3.9475142087852827e-05, + "loss": 0.0126, + "num_input_tokens_seen": 14315136, + "step": 67830 + }, + { + "epoch": 7.462596259625963, + "grad_norm": 0.007485984358936548, + "learning_rate": 3.94731851895172e-05, + "loss": 0.0393, + "num_input_tokens_seen": 14316128, + "step": 67835 + }, + { + "epoch": 7.463146314631463, + "grad_norm": 1.3513468503952026, + "learning_rate": 3.947122815778917e-05, + "loss": 0.0278, + "num_input_tokens_seen": 14317184, + "step": 67840 + }, + { + "epoch": 7.463696369636963, + "grad_norm": 0.017525916919112206, + "learning_rate": 3.946927099268679e-05, + "loss": 0.085, + "num_input_tokens_seen": 14318208, + "step": 67845 + }, + { + "epoch": 7.4642464246424645, + "grad_norm": 0.1252916306257248, + "learning_rate": 3.946731369422807e-05, + "loss": 0.0225, + "num_input_tokens_seen": 14319296, + "step": 67850 + }, + { + "epoch": 7.464796479647965, + "grad_norm": 0.0037749132607132196, + "learning_rate": 3.9465356262431064e-05, + "loss": 0.0031, + "num_input_tokens_seen": 14320352, + "step": 67855 + }, + { + "epoch": 7.465346534653466, + "grad_norm": 0.01425150316208601, + "learning_rate": 3.9463398697313816e-05, + "loss": 0.0142, + "num_input_tokens_seen": 14321440, + "step": 67860 + }, + { + "epoch": 7.465896589658966, + "grad_norm": 0.0478321835398674, + "learning_rate": 3.946144099889435e-05, + "loss": 0.0102, + "num_input_tokens_seen": 14322464, + "step": 67865 + }, + { + "epoch": 7.466446644664466, + "grad_norm": 0.005737926810979843, + "learning_rate": 3.945948316719073e-05, + "loss": 0.0188, + "num_input_tokens_seen": 14323520, + "step": 67870 + }, + { + "epoch": 7.466996699669967, + "grad_norm": 0.005469325929880142, + "learning_rate": 3.945752520222099e-05, + "loss": 0.0144, + "num_input_tokens_seen": 14324544, + "step": 67875 + }, + { + "epoch": 7.4675467546754675, + "grad_norm": 0.009051625616848469, + "learning_rate": 3.9455567104003176e-05, + "loss": 0.0809, + "num_input_tokens_seen": 14325600, + "step": 67880 + }, + { + "epoch": 7.468096809680969, + "grad_norm": 0.0398891307413578, + "learning_rate": 3.9453608872555344e-05, + "loss": 0.0188, + "num_input_tokens_seen": 14326720, + "step": 67885 + }, + { + "epoch": 7.468646864686469, + "grad_norm": 0.11675383895635605, + "learning_rate": 3.9451650507895535e-05, + "loss": 0.0098, + "num_input_tokens_seen": 14327808, + "step": 67890 + }, + { + "epoch": 7.469196919691969, + "grad_norm": 0.21570314466953278, + "learning_rate": 3.9449692010041787e-05, + "loss": 0.011, + "num_input_tokens_seen": 14328832, + "step": 67895 + }, + { + "epoch": 7.46974697469747, + "grad_norm": 0.07459145039319992, + "learning_rate": 3.9447733379012165e-05, + "loss": 0.1805, + "num_input_tokens_seen": 14329952, + "step": 67900 + }, + { + "epoch": 7.47029702970297, + "grad_norm": 0.0314883217215538, + "learning_rate": 3.9445774614824716e-05, + "loss": 0.0041, + "num_input_tokens_seen": 14331040, + "step": 67905 + }, + { + "epoch": 7.4708470847084705, + "grad_norm": 0.0035148903261870146, + "learning_rate": 3.9443815717497504e-05, + "loss": 0.0838, + "num_input_tokens_seen": 14332064, + "step": 67910 + }, + { + "epoch": 7.471397139713972, + "grad_norm": 0.04939455911517143, + "learning_rate": 3.944185668704856e-05, + "loss": 0.1052, + "num_input_tokens_seen": 14333120, + "step": 67915 + }, + { + "epoch": 7.471947194719472, + "grad_norm": 0.6094806790351868, + "learning_rate": 3.943989752349596e-05, + "loss": 0.0092, + "num_input_tokens_seen": 14334144, + "step": 67920 + }, + { + "epoch": 7.472497249724973, + "grad_norm": 0.1488451063632965, + "learning_rate": 3.943793822685775e-05, + "loss": 0.1073, + "num_input_tokens_seen": 14335232, + "step": 67925 + }, + { + "epoch": 7.473047304730473, + "grad_norm": 1.782655954360962, + "learning_rate": 3.9435978797151997e-05, + "loss": 0.018, + "num_input_tokens_seen": 14336288, + "step": 67930 + }, + { + "epoch": 7.473597359735973, + "grad_norm": 0.5658003687858582, + "learning_rate": 3.9434019234396745e-05, + "loss": 0.067, + "num_input_tokens_seen": 14337344, + "step": 67935 + }, + { + "epoch": 7.474147414741474, + "grad_norm": 0.012839659117162228, + "learning_rate": 3.9432059538610077e-05, + "loss": 0.0914, + "num_input_tokens_seen": 14338336, + "step": 67940 + }, + { + "epoch": 7.474697469746975, + "grad_norm": 0.7160047888755798, + "learning_rate": 3.943009970981003e-05, + "loss": 0.0153, + "num_input_tokens_seen": 14339360, + "step": 67945 + }, + { + "epoch": 7.475247524752476, + "grad_norm": 0.4466274380683899, + "learning_rate": 3.9428139748014683e-05, + "loss": 0.0377, + "num_input_tokens_seen": 14340384, + "step": 67950 + }, + { + "epoch": 7.475797579757976, + "grad_norm": 0.9192910194396973, + "learning_rate": 3.94261796532421e-05, + "loss": 0.0909, + "num_input_tokens_seen": 14341408, + "step": 67955 + }, + { + "epoch": 7.476347634763476, + "grad_norm": 1.1968705654144287, + "learning_rate": 3.9424219425510335e-05, + "loss": 0.0177, + "num_input_tokens_seen": 14342464, + "step": 67960 + }, + { + "epoch": 7.476897689768977, + "grad_norm": 0.01480959914624691, + "learning_rate": 3.942225906483746e-05, + "loss": 0.0594, + "num_input_tokens_seen": 14343520, + "step": 67965 + }, + { + "epoch": 7.477447744774477, + "grad_norm": 0.9264664649963379, + "learning_rate": 3.942029857124154e-05, + "loss": 0.018, + "num_input_tokens_seen": 14344640, + "step": 67970 + }, + { + "epoch": 7.477997799779978, + "grad_norm": 0.0873965471982956, + "learning_rate": 3.941833794474066e-05, + "loss": 0.0069, + "num_input_tokens_seen": 14345664, + "step": 67975 + }, + { + "epoch": 7.478547854785479, + "grad_norm": 1.085967779159546, + "learning_rate": 3.9416377185352874e-05, + "loss": 0.1178, + "num_input_tokens_seen": 14346720, + "step": 67980 + }, + { + "epoch": 7.479097909790979, + "grad_norm": 0.2420349419116974, + "learning_rate": 3.941441629309626e-05, + "loss": 0.0103, + "num_input_tokens_seen": 14347808, + "step": 67985 + }, + { + "epoch": 7.47964796479648, + "grad_norm": 0.1136210635304451, + "learning_rate": 3.941245526798887e-05, + "loss": 0.0319, + "num_input_tokens_seen": 14348896, + "step": 67990 + }, + { + "epoch": 7.48019801980198, + "grad_norm": 0.2340853363275528, + "learning_rate": 3.941049411004881e-05, + "loss": 0.0161, + "num_input_tokens_seen": 14350016, + "step": 67995 + }, + { + "epoch": 7.48074807480748, + "grad_norm": 0.25683555006980896, + "learning_rate": 3.9408532819294144e-05, + "loss": 0.0099, + "num_input_tokens_seen": 14351008, + "step": 68000 + }, + { + "epoch": 7.4812981298129815, + "grad_norm": 1.2992266416549683, + "learning_rate": 3.9406571395742945e-05, + "loss": 0.087, + "num_input_tokens_seen": 14352096, + "step": 68005 + }, + { + "epoch": 7.481848184818482, + "grad_norm": 0.04391444846987724, + "learning_rate": 3.940460983941327e-05, + "loss": 0.0392, + "num_input_tokens_seen": 14353152, + "step": 68010 + }, + { + "epoch": 7.482398239823983, + "grad_norm": 0.09826311469078064, + "learning_rate": 3.940264815032324e-05, + "loss": 0.0039, + "num_input_tokens_seen": 14354240, + "step": 68015 + }, + { + "epoch": 7.482948294829483, + "grad_norm": 0.01082484982907772, + "learning_rate": 3.94006863284909e-05, + "loss": 0.0198, + "num_input_tokens_seen": 14355328, + "step": 68020 + }, + { + "epoch": 7.483498349834983, + "grad_norm": 0.021777953952550888, + "learning_rate": 3.9398724373934355e-05, + "loss": 0.0899, + "num_input_tokens_seen": 14356416, + "step": 68025 + }, + { + "epoch": 7.484048404840484, + "grad_norm": 1.200352430343628, + "learning_rate": 3.939676228667167e-05, + "loss": 0.0205, + "num_input_tokens_seen": 14357472, + "step": 68030 + }, + { + "epoch": 7.4845984598459845, + "grad_norm": 0.009166255593299866, + "learning_rate": 3.939480006672094e-05, + "loss": 0.0823, + "num_input_tokens_seen": 14358528, + "step": 68035 + }, + { + "epoch": 7.485148514851485, + "grad_norm": 0.3649188280105591, + "learning_rate": 3.939283771410024e-05, + "loss": 0.008, + "num_input_tokens_seen": 14359552, + "step": 68040 + }, + { + "epoch": 7.485698569856986, + "grad_norm": 0.11760548502206802, + "learning_rate": 3.939087522882766e-05, + "loss": 0.0046, + "num_input_tokens_seen": 14360672, + "step": 68045 + }, + { + "epoch": 7.486248624862486, + "grad_norm": 0.09943555295467377, + "learning_rate": 3.93889126109213e-05, + "loss": 0.0152, + "num_input_tokens_seen": 14361696, + "step": 68050 + }, + { + "epoch": 7.486798679867987, + "grad_norm": 0.01279898826032877, + "learning_rate": 3.938694986039922e-05, + "loss": 0.0958, + "num_input_tokens_seen": 14362752, + "step": 68055 + }, + { + "epoch": 7.487348734873487, + "grad_norm": 0.009893753565847874, + "learning_rate": 3.9384986977279535e-05, + "loss": 0.0136, + "num_input_tokens_seen": 14363840, + "step": 68060 + }, + { + "epoch": 7.4878987898789875, + "grad_norm": 0.00865256693214178, + "learning_rate": 3.938302396158033e-05, + "loss": 0.0073, + "num_input_tokens_seen": 14364864, + "step": 68065 + }, + { + "epoch": 7.488448844884489, + "grad_norm": 0.4037909507751465, + "learning_rate": 3.938106081331969e-05, + "loss": 0.0124, + "num_input_tokens_seen": 14365920, + "step": 68070 + }, + { + "epoch": 7.488998899889989, + "grad_norm": 0.014378834515810013, + "learning_rate": 3.9379097532515725e-05, + "loss": 0.0655, + "num_input_tokens_seen": 14366944, + "step": 68075 + }, + { + "epoch": 7.48954895489549, + "grad_norm": 0.06907283514738083, + "learning_rate": 3.9377134119186507e-05, + "loss": 0.0187, + "num_input_tokens_seen": 14367968, + "step": 68080 + }, + { + "epoch": 7.49009900990099, + "grad_norm": 0.1297653764486313, + "learning_rate": 3.937517057335014e-05, + "loss": 0.0091, + "num_input_tokens_seen": 14369056, + "step": 68085 + }, + { + "epoch": 7.49064906490649, + "grad_norm": 0.7041162848472595, + "learning_rate": 3.937320689502473e-05, + "loss": 0.0436, + "num_input_tokens_seen": 14370080, + "step": 68090 + }, + { + "epoch": 7.491199119911991, + "grad_norm": 0.0747084766626358, + "learning_rate": 3.937124308422837e-05, + "loss": 0.0022, + "num_input_tokens_seen": 14371136, + "step": 68095 + }, + { + "epoch": 7.491749174917492, + "grad_norm": 0.14532466232776642, + "learning_rate": 3.936927914097915e-05, + "loss": 0.0052, + "num_input_tokens_seen": 14372224, + "step": 68100 + }, + { + "epoch": 7.492299229922993, + "grad_norm": 0.05009741708636284, + "learning_rate": 3.936731506529519e-05, + "loss": 0.0639, + "num_input_tokens_seen": 14373312, + "step": 68105 + }, + { + "epoch": 7.492849284928493, + "grad_norm": 0.04860786721110344, + "learning_rate": 3.936535085719457e-05, + "loss": 0.0335, + "num_input_tokens_seen": 14374368, + "step": 68110 + }, + { + "epoch": 7.493399339933993, + "grad_norm": 0.12450204789638519, + "learning_rate": 3.936338651669541e-05, + "loss": 0.0187, + "num_input_tokens_seen": 14375360, + "step": 68115 + }, + { + "epoch": 7.493949394939494, + "grad_norm": 0.074772410094738, + "learning_rate": 3.936142204381581e-05, + "loss": 0.0342, + "num_input_tokens_seen": 14376448, + "step": 68120 + }, + { + "epoch": 7.494499449944994, + "grad_norm": 1.3152635097503662, + "learning_rate": 3.935945743857388e-05, + "loss": 0.0362, + "num_input_tokens_seen": 14377440, + "step": 68125 + }, + { + "epoch": 7.4950495049504955, + "grad_norm": 0.7678439021110535, + "learning_rate": 3.935749270098771e-05, + "loss": 0.0315, + "num_input_tokens_seen": 14378464, + "step": 68130 + }, + { + "epoch": 7.495599559955996, + "grad_norm": 0.02596352808177471, + "learning_rate": 3.935552783107542e-05, + "loss": 0.0952, + "num_input_tokens_seen": 14379488, + "step": 68135 + }, + { + "epoch": 7.496149614961496, + "grad_norm": 0.4551008939743042, + "learning_rate": 3.9353562828855124e-05, + "loss": 0.0781, + "num_input_tokens_seen": 14380576, + "step": 68140 + }, + { + "epoch": 7.496699669966997, + "grad_norm": 1.3512088060379028, + "learning_rate": 3.935159769434493e-05, + "loss": 0.1264, + "num_input_tokens_seen": 14381696, + "step": 68145 + }, + { + "epoch": 7.497249724972497, + "grad_norm": 0.11197994649410248, + "learning_rate": 3.934963242756294e-05, + "loss": 0.0858, + "num_input_tokens_seen": 14382784, + "step": 68150 + }, + { + "epoch": 7.497799779977997, + "grad_norm": 1.214945912361145, + "learning_rate": 3.9347667028527276e-05, + "loss": 0.0608, + "num_input_tokens_seen": 14383840, + "step": 68155 + }, + { + "epoch": 7.4983498349834985, + "grad_norm": 0.928898811340332, + "learning_rate": 3.9345701497256046e-05, + "loss": 0.028, + "num_input_tokens_seen": 14384864, + "step": 68160 + }, + { + "epoch": 7.498899889988999, + "grad_norm": 1.369187593460083, + "learning_rate": 3.934373583376737e-05, + "loss": 0.0714, + "num_input_tokens_seen": 14385984, + "step": 68165 + }, + { + "epoch": 7.4994499449945, + "grad_norm": 0.08130550384521484, + "learning_rate": 3.934177003807937e-05, + "loss": 0.0048, + "num_input_tokens_seen": 14387104, + "step": 68170 + }, + { + "epoch": 7.5, + "grad_norm": 0.08321811258792877, + "learning_rate": 3.933980411021015e-05, + "loss": 0.0138, + "num_input_tokens_seen": 14388192, + "step": 68175 + }, + { + "epoch": 7.5005500550055, + "grad_norm": 0.7297899127006531, + "learning_rate": 3.933783805017784e-05, + "loss": 0.0435, + "num_input_tokens_seen": 14389280, + "step": 68180 + }, + { + "epoch": 7.501100110011001, + "grad_norm": 0.017066489905118942, + "learning_rate": 3.933587185800055e-05, + "loss": 0.0464, + "num_input_tokens_seen": 14390368, + "step": 68185 + }, + { + "epoch": 7.5016501650165015, + "grad_norm": 0.48389101028442383, + "learning_rate": 3.933390553369642e-05, + "loss": 0.0382, + "num_input_tokens_seen": 14391392, + "step": 68190 + }, + { + "epoch": 7.502200220022003, + "grad_norm": 0.24065057933330536, + "learning_rate": 3.933193907728355e-05, + "loss": 0.038, + "num_input_tokens_seen": 14392480, + "step": 68195 + }, + { + "epoch": 7.502750275027503, + "grad_norm": 0.052133265882730484, + "learning_rate": 3.9329972488780086e-05, + "loss": 0.0719, + "num_input_tokens_seen": 14393568, + "step": 68200 + }, + { + "epoch": 7.503300330033003, + "grad_norm": 0.1268729418516159, + "learning_rate": 3.932800576820413e-05, + "loss": 0.006, + "num_input_tokens_seen": 14394560, + "step": 68205 + }, + { + "epoch": 7.503850385038504, + "grad_norm": 0.6194582581520081, + "learning_rate": 3.932603891557382e-05, + "loss": 0.0625, + "num_input_tokens_seen": 14395552, + "step": 68210 + }, + { + "epoch": 7.504400440044004, + "grad_norm": 1.0169404745101929, + "learning_rate": 3.9324071930907294e-05, + "loss": 0.0286, + "num_input_tokens_seen": 14396640, + "step": 68215 + }, + { + "epoch": 7.5049504950495045, + "grad_norm": 0.7964855432510376, + "learning_rate": 3.932210481422267e-05, + "loss": 0.0277, + "num_input_tokens_seen": 14397696, + "step": 68220 + }, + { + "epoch": 7.505500550055006, + "grad_norm": 0.42316871881484985, + "learning_rate": 3.932013756553807e-05, + "loss": 0.1133, + "num_input_tokens_seen": 14398752, + "step": 68225 + }, + { + "epoch": 7.506050605060506, + "grad_norm": 0.00534916901960969, + "learning_rate": 3.931817018487164e-05, + "loss": 0.0784, + "num_input_tokens_seen": 14399840, + "step": 68230 + }, + { + "epoch": 7.506600660066007, + "grad_norm": 0.12763093411922455, + "learning_rate": 3.93162026722415e-05, + "loss": 0.0171, + "num_input_tokens_seen": 14400864, + "step": 68235 + }, + { + "epoch": 7.507150715071507, + "grad_norm": 1.1658600568771362, + "learning_rate": 3.931423502766579e-05, + "loss": 0.0864, + "num_input_tokens_seen": 14401856, + "step": 68240 + }, + { + "epoch": 7.507700770077007, + "grad_norm": 0.037205565720796585, + "learning_rate": 3.9312267251162646e-05, + "loss": 0.0866, + "num_input_tokens_seen": 14402848, + "step": 68245 + }, + { + "epoch": 7.508250825082508, + "grad_norm": 0.145355686545372, + "learning_rate": 3.931029934275021e-05, + "loss": 0.0245, + "num_input_tokens_seen": 14403936, + "step": 68250 + }, + { + "epoch": 7.508800880088009, + "grad_norm": 0.2669374644756317, + "learning_rate": 3.93083313024466e-05, + "loss": 0.0297, + "num_input_tokens_seen": 14404928, + "step": 68255 + }, + { + "epoch": 7.50935093509351, + "grad_norm": 0.022265620529651642, + "learning_rate": 3.930636313026997e-05, + "loss": 0.0513, + "num_input_tokens_seen": 14405984, + "step": 68260 + }, + { + "epoch": 7.50990099009901, + "grad_norm": 0.4512791335582733, + "learning_rate": 3.9304394826238464e-05, + "loss": 0.0149, + "num_input_tokens_seen": 14407072, + "step": 68265 + }, + { + "epoch": 7.51045104510451, + "grad_norm": 0.07785272598266602, + "learning_rate": 3.930242639037021e-05, + "loss": 0.0193, + "num_input_tokens_seen": 14408128, + "step": 68270 + }, + { + "epoch": 7.511001100110011, + "grad_norm": 0.14470624923706055, + "learning_rate": 3.930045782268335e-05, + "loss": 0.0274, + "num_input_tokens_seen": 14409216, + "step": 68275 + }, + { + "epoch": 7.511551155115511, + "grad_norm": 2.5507819652557373, + "learning_rate": 3.929848912319603e-05, + "loss": 0.0756, + "num_input_tokens_seen": 14410240, + "step": 68280 + }, + { + "epoch": 7.512101210121012, + "grad_norm": 0.011554280295968056, + "learning_rate": 3.92965202919264e-05, + "loss": 0.0251, + "num_input_tokens_seen": 14411264, + "step": 68285 + }, + { + "epoch": 7.512651265126513, + "grad_norm": 0.861657440662384, + "learning_rate": 3.929455132889261e-05, + "loss": 0.027, + "num_input_tokens_seen": 14412352, + "step": 68290 + }, + { + "epoch": 7.513201320132013, + "grad_norm": 0.008390051312744617, + "learning_rate": 3.929258223411279e-05, + "loss": 0.0174, + "num_input_tokens_seen": 14413376, + "step": 68295 + }, + { + "epoch": 7.513751375137514, + "grad_norm": 0.05844087153673172, + "learning_rate": 3.9290613007605104e-05, + "loss": 0.0412, + "num_input_tokens_seen": 14414432, + "step": 68300 + }, + { + "epoch": 7.514301430143014, + "grad_norm": 0.013186008669435978, + "learning_rate": 3.928864364938769e-05, + "loss": 0.1087, + "num_input_tokens_seen": 14415488, + "step": 68305 + }, + { + "epoch": 7.514851485148515, + "grad_norm": 0.1747847944498062, + "learning_rate": 3.9286674159478706e-05, + "loss": 0.0091, + "num_input_tokens_seen": 14416544, + "step": 68310 + }, + { + "epoch": 7.5154015401540155, + "grad_norm": 0.05234529450535774, + "learning_rate": 3.928470453789631e-05, + "loss": 0.005, + "num_input_tokens_seen": 14417600, + "step": 68315 + }, + { + "epoch": 7.515951595159516, + "grad_norm": 0.062290631234645844, + "learning_rate": 3.9282734784658636e-05, + "loss": 0.0166, + "num_input_tokens_seen": 14418688, + "step": 68320 + }, + { + "epoch": 7.516501650165017, + "grad_norm": 0.5004236698150635, + "learning_rate": 3.928076489978385e-05, + "loss": 0.024, + "num_input_tokens_seen": 14419712, + "step": 68325 + }, + { + "epoch": 7.517051705170517, + "grad_norm": 0.021934792399406433, + "learning_rate": 3.9278794883290116e-05, + "loss": 0.0533, + "num_input_tokens_seen": 14420736, + "step": 68330 + }, + { + "epoch": 7.517601760176017, + "grad_norm": 0.03167514130473137, + "learning_rate": 3.927682473519557e-05, + "loss": 0.0258, + "num_input_tokens_seen": 14421760, + "step": 68335 + }, + { + "epoch": 7.518151815181518, + "grad_norm": 0.09881763905286789, + "learning_rate": 3.927485445551839e-05, + "loss": 0.0023, + "num_input_tokens_seen": 14422816, + "step": 68340 + }, + { + "epoch": 7.5187018701870185, + "grad_norm": 0.020998481661081314, + "learning_rate": 3.9272884044276714e-05, + "loss": 0.0107, + "num_input_tokens_seen": 14423840, + "step": 68345 + }, + { + "epoch": 7.51925192519252, + "grad_norm": 0.07612960785627365, + "learning_rate": 3.927091350148873e-05, + "loss": 0.0709, + "num_input_tokens_seen": 14424864, + "step": 68350 + }, + { + "epoch": 7.51980198019802, + "grad_norm": 0.05156746506690979, + "learning_rate": 3.9268942827172573e-05, + "loss": 0.0433, + "num_input_tokens_seen": 14425888, + "step": 68355 + }, + { + "epoch": 7.52035203520352, + "grad_norm": 0.07541946321725845, + "learning_rate": 3.926697202134642e-05, + "loss": 0.0215, + "num_input_tokens_seen": 14426880, + "step": 68360 + }, + { + "epoch": 7.520902090209021, + "grad_norm": 0.014038575813174248, + "learning_rate": 3.926500108402843e-05, + "loss": 0.0188, + "num_input_tokens_seen": 14427872, + "step": 68365 + }, + { + "epoch": 7.521452145214521, + "grad_norm": 0.05583032965660095, + "learning_rate": 3.9263030015236776e-05, + "loss": 0.0245, + "num_input_tokens_seen": 14428896, + "step": 68370 + }, + { + "epoch": 7.522002200220022, + "grad_norm": 0.8552578091621399, + "learning_rate": 3.926105881498961e-05, + "loss": 0.0247, + "num_input_tokens_seen": 14429952, + "step": 68375 + }, + { + "epoch": 7.522552255225523, + "grad_norm": 0.12172187864780426, + "learning_rate": 3.925908748330511e-05, + "loss": 0.0258, + "num_input_tokens_seen": 14431008, + "step": 68380 + }, + { + "epoch": 7.523102310231023, + "grad_norm": 0.016643794253468513, + "learning_rate": 3.925711602020144e-05, + "loss": 0.0454, + "num_input_tokens_seen": 14432096, + "step": 68385 + }, + { + "epoch": 7.523652365236524, + "grad_norm": 0.18026188015937805, + "learning_rate": 3.925514442569679e-05, + "loss": 0.2015, + "num_input_tokens_seen": 14433152, + "step": 68390 + }, + { + "epoch": 7.524202420242024, + "grad_norm": 0.016484253108501434, + "learning_rate": 3.92531726998093e-05, + "loss": 0.0358, + "num_input_tokens_seen": 14434240, + "step": 68395 + }, + { + "epoch": 7.524752475247524, + "grad_norm": 0.007833878509700298, + "learning_rate": 3.925120084255716e-05, + "loss": 0.0232, + "num_input_tokens_seen": 14435232, + "step": 68400 + }, + { + "epoch": 7.525302530253025, + "grad_norm": 0.06264835596084595, + "learning_rate": 3.924922885395854e-05, + "loss": 0.0118, + "num_input_tokens_seen": 14436256, + "step": 68405 + }, + { + "epoch": 7.525852585258526, + "grad_norm": 0.008263256400823593, + "learning_rate": 3.924725673403161e-05, + "loss": 0.0175, + "num_input_tokens_seen": 14437280, + "step": 68410 + }, + { + "epoch": 7.526402640264027, + "grad_norm": 0.8029187321662903, + "learning_rate": 3.9245284482794555e-05, + "loss": 0.1558, + "num_input_tokens_seen": 14438304, + "step": 68415 + }, + { + "epoch": 7.526952695269527, + "grad_norm": 0.8737295866012573, + "learning_rate": 3.924331210026555e-05, + "loss": 0.0277, + "num_input_tokens_seen": 14439392, + "step": 68420 + }, + { + "epoch": 7.527502750275027, + "grad_norm": 0.4096229374408722, + "learning_rate": 3.924133958646277e-05, + "loss": 0.0575, + "num_input_tokens_seen": 14440448, + "step": 68425 + }, + { + "epoch": 7.528052805280528, + "grad_norm": 0.011407235637307167, + "learning_rate": 3.92393669414044e-05, + "loss": 0.0146, + "num_input_tokens_seen": 14441472, + "step": 68430 + }, + { + "epoch": 7.528602860286028, + "grad_norm": 0.10006318986415863, + "learning_rate": 3.923739416510862e-05, + "loss": 0.0027, + "num_input_tokens_seen": 14442464, + "step": 68435 + }, + { + "epoch": 7.5291529152915295, + "grad_norm": 1.0223348140716553, + "learning_rate": 3.92354212575936e-05, + "loss": 0.0159, + "num_input_tokens_seen": 14443488, + "step": 68440 + }, + { + "epoch": 7.52970297029703, + "grad_norm": 0.047548092901706696, + "learning_rate": 3.9233448218877536e-05, + "loss": 0.0018, + "num_input_tokens_seen": 14444576, + "step": 68445 + }, + { + "epoch": 7.53025302530253, + "grad_norm": 0.010524065233767033, + "learning_rate": 3.923147504897861e-05, + "loss": 0.0046, + "num_input_tokens_seen": 14445568, + "step": 68450 + }, + { + "epoch": 7.530803080308031, + "grad_norm": 2.167343854904175, + "learning_rate": 3.922950174791501e-05, + "loss": 0.0897, + "num_input_tokens_seen": 14446656, + "step": 68455 + }, + { + "epoch": 7.531353135313531, + "grad_norm": 1.8622878789901733, + "learning_rate": 3.922752831570492e-05, + "loss": 0.148, + "num_input_tokens_seen": 14447712, + "step": 68460 + }, + { + "epoch": 7.531903190319031, + "grad_norm": 0.04795878008008003, + "learning_rate": 3.9225554752366526e-05, + "loss": 0.0413, + "num_input_tokens_seen": 14448768, + "step": 68465 + }, + { + "epoch": 7.5324532453245325, + "grad_norm": 0.6117247343063354, + "learning_rate": 3.922358105791802e-05, + "loss": 0.0319, + "num_input_tokens_seen": 14449792, + "step": 68470 + }, + { + "epoch": 7.533003300330033, + "grad_norm": 0.041049834340810776, + "learning_rate": 3.92216072323776e-05, + "loss": 0.0032, + "num_input_tokens_seen": 14450880, + "step": 68475 + }, + { + "epoch": 7.533553355335534, + "grad_norm": 0.3520382344722748, + "learning_rate": 3.9219633275763445e-05, + "loss": 0.0214, + "num_input_tokens_seen": 14452000, + "step": 68480 + }, + { + "epoch": 7.534103410341034, + "grad_norm": 0.005614571738988161, + "learning_rate": 3.9217659188093746e-05, + "loss": 0.0185, + "num_input_tokens_seen": 14453056, + "step": 68485 + }, + { + "epoch": 7.534653465346535, + "grad_norm": 0.03666834533214569, + "learning_rate": 3.9215684969386715e-05, + "loss": 0.033, + "num_input_tokens_seen": 14454080, + "step": 68490 + }, + { + "epoch": 7.535203520352035, + "grad_norm": 0.3652130365371704, + "learning_rate": 3.921371061966053e-05, + "loss": 0.0798, + "num_input_tokens_seen": 14455072, + "step": 68495 + }, + { + "epoch": 7.5357535753575355, + "grad_norm": 0.9692928791046143, + "learning_rate": 3.9211736138933394e-05, + "loss": 0.0352, + "num_input_tokens_seen": 14456064, + "step": 68500 + }, + { + "epoch": 7.536303630363037, + "grad_norm": 1.6635167598724365, + "learning_rate": 3.920976152722352e-05, + "loss": 0.0996, + "num_input_tokens_seen": 14457120, + "step": 68505 + }, + { + "epoch": 7.536853685368537, + "grad_norm": 0.01710321381688118, + "learning_rate": 3.920778678454908e-05, + "loss": 0.0081, + "num_input_tokens_seen": 14458208, + "step": 68510 + }, + { + "epoch": 7.537403740374037, + "grad_norm": 0.20638230443000793, + "learning_rate": 3.920581191092829e-05, + "loss": 0.0177, + "num_input_tokens_seen": 14459296, + "step": 68515 + }, + { + "epoch": 7.537953795379538, + "grad_norm": 0.010181468911468983, + "learning_rate": 3.9203836906379346e-05, + "loss": 0.0651, + "num_input_tokens_seen": 14460384, + "step": 68520 + }, + { + "epoch": 7.538503850385038, + "grad_norm": 0.02726322039961815, + "learning_rate": 3.920186177092046e-05, + "loss": 0.0045, + "num_input_tokens_seen": 14461440, + "step": 68525 + }, + { + "epoch": 7.539053905390539, + "grad_norm": 0.5248520374298096, + "learning_rate": 3.9199886504569814e-05, + "loss": 0.0233, + "num_input_tokens_seen": 14462560, + "step": 68530 + }, + { + "epoch": 7.53960396039604, + "grad_norm": 0.014444523490965366, + "learning_rate": 3.919791110734564e-05, + "loss": 0.0288, + "num_input_tokens_seen": 14463616, + "step": 68535 + }, + { + "epoch": 7.54015401540154, + "grad_norm": 0.9993568658828735, + "learning_rate": 3.9195935579266125e-05, + "loss": 0.0768, + "num_input_tokens_seen": 14464640, + "step": 68540 + }, + { + "epoch": 7.540704070407041, + "grad_norm": 0.03712693229317665, + "learning_rate": 3.919395992034949e-05, + "loss": 0.068, + "num_input_tokens_seen": 14465728, + "step": 68545 + }, + { + "epoch": 7.541254125412541, + "grad_norm": 0.004476083442568779, + "learning_rate": 3.9191984130613934e-05, + "loss": 0.0126, + "num_input_tokens_seen": 14466784, + "step": 68550 + }, + { + "epoch": 7.541804180418042, + "grad_norm": 0.030130784958600998, + "learning_rate": 3.919000821007767e-05, + "loss": 0.0794, + "num_input_tokens_seen": 14467840, + "step": 68555 + }, + { + "epoch": 7.542354235423542, + "grad_norm": 1.5947256088256836, + "learning_rate": 3.918803215875892e-05, + "loss": 0.0672, + "num_input_tokens_seen": 14468992, + "step": 68560 + }, + { + "epoch": 7.542904290429043, + "grad_norm": 2.7066268920898438, + "learning_rate": 3.9186055976675864e-05, + "loss": 0.0807, + "num_input_tokens_seen": 14469984, + "step": 68565 + }, + { + "epoch": 7.543454345434544, + "grad_norm": 0.016821200028061867, + "learning_rate": 3.918407966384675e-05, + "loss": 0.1542, + "num_input_tokens_seen": 14471072, + "step": 68570 + }, + { + "epoch": 7.544004400440044, + "grad_norm": 0.09221707284450531, + "learning_rate": 3.918210322028978e-05, + "loss": 0.0097, + "num_input_tokens_seen": 14472032, + "step": 68575 + }, + { + "epoch": 7.544554455445544, + "grad_norm": 0.011471346020698547, + "learning_rate": 3.918012664602317e-05, + "loss": 0.0189, + "num_input_tokens_seen": 14473056, + "step": 68580 + }, + { + "epoch": 7.545104510451045, + "grad_norm": 2.6207563877105713, + "learning_rate": 3.9178149941065135e-05, + "loss": 0.0456, + "num_input_tokens_seen": 14474144, + "step": 68585 + }, + { + "epoch": 7.5456545654565454, + "grad_norm": 0.46846532821655273, + "learning_rate": 3.9176173105433896e-05, + "loss": 0.0538, + "num_input_tokens_seen": 14475200, + "step": 68590 + }, + { + "epoch": 7.5462046204620465, + "grad_norm": 0.15493734180927277, + "learning_rate": 3.917419613914767e-05, + "loss": 0.0584, + "num_input_tokens_seen": 14476224, + "step": 68595 + }, + { + "epoch": 7.546754675467547, + "grad_norm": 1.021471381187439, + "learning_rate": 3.917221904222469e-05, + "loss": 0.1496, + "num_input_tokens_seen": 14477344, + "step": 68600 + }, + { + "epoch": 7.547304730473047, + "grad_norm": 0.07218153029680252, + "learning_rate": 3.917024181468315e-05, + "loss": 0.0105, + "num_input_tokens_seen": 14478400, + "step": 68605 + }, + { + "epoch": 7.547854785478548, + "grad_norm": 0.3748684823513031, + "learning_rate": 3.91682644565413e-05, + "loss": 0.0499, + "num_input_tokens_seen": 14479456, + "step": 68610 + }, + { + "epoch": 7.548404840484048, + "grad_norm": 0.06410866975784302, + "learning_rate": 3.916628696781736e-05, + "loss": 0.0098, + "num_input_tokens_seen": 14480448, + "step": 68615 + }, + { + "epoch": 7.548954895489549, + "grad_norm": 0.06254497915506363, + "learning_rate": 3.9164309348529546e-05, + "loss": 0.0392, + "num_input_tokens_seen": 14481472, + "step": 68620 + }, + { + "epoch": 7.5495049504950495, + "grad_norm": 0.20360414683818817, + "learning_rate": 3.916233159869609e-05, + "loss": 0.0251, + "num_input_tokens_seen": 14482496, + "step": 68625 + }, + { + "epoch": 7.55005500550055, + "grad_norm": 0.0033205312211066484, + "learning_rate": 3.916035371833522e-05, + "loss": 0.0621, + "num_input_tokens_seen": 14483648, + "step": 68630 + }, + { + "epoch": 7.550605060506051, + "grad_norm": 0.020744958892464638, + "learning_rate": 3.915837570746515e-05, + "loss": 0.0289, + "num_input_tokens_seen": 14484736, + "step": 68635 + }, + { + "epoch": 7.551155115511551, + "grad_norm": 0.04897540807723999, + "learning_rate": 3.9156397566104145e-05, + "loss": 0.04, + "num_input_tokens_seen": 14485792, + "step": 68640 + }, + { + "epoch": 7.551705170517051, + "grad_norm": 0.016848478466272354, + "learning_rate": 3.915441929427042e-05, + "loss": 0.0917, + "num_input_tokens_seen": 14486880, + "step": 68645 + }, + { + "epoch": 7.552255225522552, + "grad_norm": 0.08374100923538208, + "learning_rate": 3.915244089198219e-05, + "loss": 0.0062, + "num_input_tokens_seen": 14487936, + "step": 68650 + }, + { + "epoch": 7.552805280528053, + "grad_norm": 0.0783439502120018, + "learning_rate": 3.91504623592577e-05, + "loss": 0.0332, + "num_input_tokens_seen": 14489024, + "step": 68655 + }, + { + "epoch": 7.553355335533554, + "grad_norm": 0.07142144441604614, + "learning_rate": 3.91484836961152e-05, + "loss": 0.0107, + "num_input_tokens_seen": 14490016, + "step": 68660 + }, + { + "epoch": 7.553905390539054, + "grad_norm": 0.013624187558889389, + "learning_rate": 3.914650490257291e-05, + "loss": 0.0744, + "num_input_tokens_seen": 14491104, + "step": 68665 + }, + { + "epoch": 7.554455445544555, + "grad_norm": 0.09037581086158752, + "learning_rate": 3.914452597864907e-05, + "loss": 0.0105, + "num_input_tokens_seen": 14492224, + "step": 68670 + }, + { + "epoch": 7.555005500550055, + "grad_norm": 0.19587047398090363, + "learning_rate": 3.914254692436193e-05, + "loss": 0.0185, + "num_input_tokens_seen": 14493248, + "step": 68675 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 0.029145941138267517, + "learning_rate": 3.9140567739729715e-05, + "loss": 0.0287, + "num_input_tokens_seen": 14494336, + "step": 68680 + }, + { + "epoch": 7.5561056105610565, + "grad_norm": 0.051671937108039856, + "learning_rate": 3.913858842477067e-05, + "loss": 0.0073, + "num_input_tokens_seen": 14495360, + "step": 68685 + }, + { + "epoch": 7.556655665566557, + "grad_norm": 0.011857329867780209, + "learning_rate": 3.913660897950304e-05, + "loss": 0.0036, + "num_input_tokens_seen": 14496384, + "step": 68690 + }, + { + "epoch": 7.557205720572057, + "grad_norm": 0.40271875262260437, + "learning_rate": 3.9134629403945075e-05, + "loss": 0.0793, + "num_input_tokens_seen": 14497472, + "step": 68695 + }, + { + "epoch": 7.557755775577558, + "grad_norm": 0.7212868332862854, + "learning_rate": 3.913264969811501e-05, + "loss": 0.1276, + "num_input_tokens_seen": 14498528, + "step": 68700 + }, + { + "epoch": 7.558305830583058, + "grad_norm": 0.017050176858901978, + "learning_rate": 3.913066986203109e-05, + "loss": 0.0021, + "num_input_tokens_seen": 14499552, + "step": 68705 + }, + { + "epoch": 7.558855885588558, + "grad_norm": 0.006925249472260475, + "learning_rate": 3.912868989571157e-05, + "loss": 0.0282, + "num_input_tokens_seen": 14500576, + "step": 68710 + }, + { + "epoch": 7.5594059405940595, + "grad_norm": 0.06142882630228996, + "learning_rate": 3.91267097991747e-05, + "loss": 0.061, + "num_input_tokens_seen": 14501600, + "step": 68715 + }, + { + "epoch": 7.55995599559956, + "grad_norm": 0.5341870188713074, + "learning_rate": 3.9124729572438724e-05, + "loss": 0.0191, + "num_input_tokens_seen": 14502688, + "step": 68720 + }, + { + "epoch": 7.560506050605061, + "grad_norm": 0.036104824393987656, + "learning_rate": 3.912274921552189e-05, + "loss": 0.0083, + "num_input_tokens_seen": 14503712, + "step": 68725 + }, + { + "epoch": 7.561056105610561, + "grad_norm": 0.006462882272899151, + "learning_rate": 3.9120768728442445e-05, + "loss": 0.0151, + "num_input_tokens_seen": 14504704, + "step": 68730 + }, + { + "epoch": 7.561606160616062, + "grad_norm": 0.7865393161773682, + "learning_rate": 3.911878811121866e-05, + "loss": 0.0539, + "num_input_tokens_seen": 14505728, + "step": 68735 + }, + { + "epoch": 7.562156215621562, + "grad_norm": 0.015402628108859062, + "learning_rate": 3.9116807363868776e-05, + "loss": 0.0096, + "num_input_tokens_seen": 14506848, + "step": 68740 + }, + { + "epoch": 7.5627062706270625, + "grad_norm": 0.017291391268372536, + "learning_rate": 3.911482648641105e-05, + "loss": 0.0021, + "num_input_tokens_seen": 14507872, + "step": 68745 + }, + { + "epoch": 7.563256325632564, + "grad_norm": 1.262714147567749, + "learning_rate": 3.911284547886375e-05, + "loss": 0.0892, + "num_input_tokens_seen": 14508864, + "step": 68750 + }, + { + "epoch": 7.563806380638064, + "grad_norm": 0.025511324405670166, + "learning_rate": 3.911086434124512e-05, + "loss": 0.0206, + "num_input_tokens_seen": 14509920, + "step": 68755 + }, + { + "epoch": 7.564356435643564, + "grad_norm": 0.415323406457901, + "learning_rate": 3.910888307357342e-05, + "loss": 0.0197, + "num_input_tokens_seen": 14510880, + "step": 68760 + }, + { + "epoch": 7.564906490649065, + "grad_norm": 0.15529340505599976, + "learning_rate": 3.910690167586693e-05, + "loss": 0.005, + "num_input_tokens_seen": 14511904, + "step": 68765 + }, + { + "epoch": 7.565456545654565, + "grad_norm": 0.20843176543712616, + "learning_rate": 3.910492014814388e-05, + "loss": 0.1338, + "num_input_tokens_seen": 14512992, + "step": 68770 + }, + { + "epoch": 7.566006600660066, + "grad_norm": 0.352372407913208, + "learning_rate": 3.910293849042256e-05, + "loss": 0.0683, + "num_input_tokens_seen": 14514112, + "step": 68775 + }, + { + "epoch": 7.566556655665567, + "grad_norm": 0.11186732351779938, + "learning_rate": 3.910095670272122e-05, + "loss": 0.0813, + "num_input_tokens_seen": 14515136, + "step": 68780 + }, + { + "epoch": 7.567106710671067, + "grad_norm": 0.019785910844802856, + "learning_rate": 3.9098974785058116e-05, + "loss": 0.0632, + "num_input_tokens_seen": 14516256, + "step": 68785 + }, + { + "epoch": 7.567656765676568, + "grad_norm": 0.01060990896075964, + "learning_rate": 3.909699273745154e-05, + "loss": 0.0648, + "num_input_tokens_seen": 14517280, + "step": 68790 + }, + { + "epoch": 7.568206820682068, + "grad_norm": 0.0407165102660656, + "learning_rate": 3.9095010559919744e-05, + "loss": 0.1067, + "num_input_tokens_seen": 14518272, + "step": 68795 + }, + { + "epoch": 7.568756875687569, + "grad_norm": 0.0949241891503334, + "learning_rate": 3.9093028252481e-05, + "loss": 0.1239, + "num_input_tokens_seen": 14519328, + "step": 68800 + }, + { + "epoch": 7.569306930693069, + "grad_norm": 0.04403635859489441, + "learning_rate": 3.9091045815153573e-05, + "loss": 0.0074, + "num_input_tokens_seen": 14520352, + "step": 68805 + }, + { + "epoch": 7.56985698569857, + "grad_norm": 0.017260786145925522, + "learning_rate": 3.9089063247955746e-05, + "loss": 0.027, + "num_input_tokens_seen": 14521408, + "step": 68810 + }, + { + "epoch": 7.570407040704071, + "grad_norm": 1.4516565799713135, + "learning_rate": 3.9087080550905776e-05, + "loss": 0.0501, + "num_input_tokens_seen": 14522496, + "step": 68815 + }, + { + "epoch": 7.570957095709571, + "grad_norm": 0.04443337023258209, + "learning_rate": 3.9085097724021945e-05, + "loss": 0.0465, + "num_input_tokens_seen": 14523520, + "step": 68820 + }, + { + "epoch": 7.571507150715071, + "grad_norm": 0.03140205889940262, + "learning_rate": 3.908311476732253e-05, + "loss": 0.095, + "num_input_tokens_seen": 14524576, + "step": 68825 + }, + { + "epoch": 7.572057205720572, + "grad_norm": 0.8256922364234924, + "learning_rate": 3.90811316808258e-05, + "loss": 0.0164, + "num_input_tokens_seen": 14525568, + "step": 68830 + }, + { + "epoch": 7.572607260726072, + "grad_norm": 0.587880551815033, + "learning_rate": 3.9079148464550044e-05, + "loss": 0.1267, + "num_input_tokens_seen": 14526560, + "step": 68835 + }, + { + "epoch": 7.5731573157315735, + "grad_norm": 1.1647173166275024, + "learning_rate": 3.9077165118513526e-05, + "loss": 0.0633, + "num_input_tokens_seen": 14527648, + "step": 68840 + }, + { + "epoch": 7.573707370737074, + "grad_norm": 0.22527240216732025, + "learning_rate": 3.907518164273453e-05, + "loss": 0.1028, + "num_input_tokens_seen": 14528704, + "step": 68845 + }, + { + "epoch": 7.574257425742574, + "grad_norm": 0.007615762762725353, + "learning_rate": 3.9073198037231345e-05, + "loss": 0.0919, + "num_input_tokens_seen": 14529792, + "step": 68850 + }, + { + "epoch": 7.574807480748075, + "grad_norm": 5.1677727699279785, + "learning_rate": 3.907121430202224e-05, + "loss": 0.0721, + "num_input_tokens_seen": 14530784, + "step": 68855 + }, + { + "epoch": 7.575357535753575, + "grad_norm": 0.5894714593887329, + "learning_rate": 3.906923043712551e-05, + "loss": 0.0063, + "num_input_tokens_seen": 14531808, + "step": 68860 + }, + { + "epoch": 7.575907590759076, + "grad_norm": 0.10567370802164078, + "learning_rate": 3.906724644255944e-05, + "loss": 0.0211, + "num_input_tokens_seen": 14532864, + "step": 68865 + }, + { + "epoch": 7.5764576457645765, + "grad_norm": 0.5163949728012085, + "learning_rate": 3.90652623183423e-05, + "loss": 0.027, + "num_input_tokens_seen": 14534016, + "step": 68870 + }, + { + "epoch": 7.577007700770077, + "grad_norm": 1.3671224117279053, + "learning_rate": 3.906327806449238e-05, + "loss": 0.0758, + "num_input_tokens_seen": 14534976, + "step": 68875 + }, + { + "epoch": 7.577557755775578, + "grad_norm": 0.036894384771585464, + "learning_rate": 3.906129368102799e-05, + "loss": 0.01, + "num_input_tokens_seen": 14536032, + "step": 68880 + }, + { + "epoch": 7.578107810781078, + "grad_norm": 0.14180102944374084, + "learning_rate": 3.90593091679674e-05, + "loss": 0.0493, + "num_input_tokens_seen": 14537120, + "step": 68885 + }, + { + "epoch": 7.578657865786578, + "grad_norm": 0.06784670054912567, + "learning_rate": 3.90573245253289e-05, + "loss": 0.0049, + "num_input_tokens_seen": 14538176, + "step": 68890 + }, + { + "epoch": 7.579207920792079, + "grad_norm": 0.25763019919395447, + "learning_rate": 3.9055339753130785e-05, + "loss": 0.0116, + "num_input_tokens_seen": 14539232, + "step": 68895 + }, + { + "epoch": 7.5797579757975795, + "grad_norm": 1.0401331186294556, + "learning_rate": 3.905335485139135e-05, + "loss": 0.0675, + "num_input_tokens_seen": 14540288, + "step": 68900 + }, + { + "epoch": 7.580308030803081, + "grad_norm": 0.20226241648197174, + "learning_rate": 3.905136982012888e-05, + "loss": 0.0723, + "num_input_tokens_seen": 14541408, + "step": 68905 + }, + { + "epoch": 7.580858085808581, + "grad_norm": 0.06792054325342178, + "learning_rate": 3.904938465936168e-05, + "loss": 0.0255, + "num_input_tokens_seen": 14542464, + "step": 68910 + }, + { + "epoch": 7.581408140814082, + "grad_norm": 0.19168835878372192, + "learning_rate": 3.904739936910805e-05, + "loss": 0.0181, + "num_input_tokens_seen": 14543552, + "step": 68915 + }, + { + "epoch": 7.581958195819582, + "grad_norm": 1.4515403509140015, + "learning_rate": 3.9045413949386274e-05, + "loss": 0.0453, + "num_input_tokens_seen": 14544544, + "step": 68920 + }, + { + "epoch": 7.582508250825082, + "grad_norm": 0.006510552950203419, + "learning_rate": 3.9043428400214654e-05, + "loss": 0.052, + "num_input_tokens_seen": 14545600, + "step": 68925 + }, + { + "epoch": 7.583058305830583, + "grad_norm": 0.21424752473831177, + "learning_rate": 3.90414427216115e-05, + "loss": 0.0099, + "num_input_tokens_seen": 14546592, + "step": 68930 + }, + { + "epoch": 7.583608360836084, + "grad_norm": 0.037516091018915176, + "learning_rate": 3.9039456913595104e-05, + "loss": 0.0662, + "num_input_tokens_seen": 14547712, + "step": 68935 + }, + { + "epoch": 7.584158415841584, + "grad_norm": 0.02629544585943222, + "learning_rate": 3.903747097618377e-05, + "loss": 0.0567, + "num_input_tokens_seen": 14548768, + "step": 68940 + }, + { + "epoch": 7.584708470847085, + "grad_norm": 0.24885369837284088, + "learning_rate": 3.90354849093958e-05, + "loss": 0.0605, + "num_input_tokens_seen": 14549856, + "step": 68945 + }, + { + "epoch": 7.585258525852585, + "grad_norm": 0.6709048748016357, + "learning_rate": 3.903349871324951e-05, + "loss": 0.0659, + "num_input_tokens_seen": 14550976, + "step": 68950 + }, + { + "epoch": 7.585808580858086, + "grad_norm": 0.30532917380332947, + "learning_rate": 3.903151238776319e-05, + "loss": 0.0089, + "num_input_tokens_seen": 14552096, + "step": 68955 + }, + { + "epoch": 7.586358635863586, + "grad_norm": 0.11932579427957535, + "learning_rate": 3.902952593295515e-05, + "loss": 0.0133, + "num_input_tokens_seen": 14553152, + "step": 68960 + }, + { + "epoch": 7.586908690869087, + "grad_norm": 0.030871735885739326, + "learning_rate": 3.90275393488437e-05, + "loss": 0.0311, + "num_input_tokens_seen": 14554208, + "step": 68965 + }, + { + "epoch": 7.587458745874588, + "grad_norm": 0.042905211448669434, + "learning_rate": 3.902555263544716e-05, + "loss": 0.0449, + "num_input_tokens_seen": 14555264, + "step": 68970 + }, + { + "epoch": 7.588008800880088, + "grad_norm": 0.006335576996207237, + "learning_rate": 3.902356579278382e-05, + "loss": 0.0043, + "num_input_tokens_seen": 14556320, + "step": 68975 + }, + { + "epoch": 7.588558855885589, + "grad_norm": 0.0167789738625288, + "learning_rate": 3.9021578820872015e-05, + "loss": 0.0252, + "num_input_tokens_seen": 14557344, + "step": 68980 + }, + { + "epoch": 7.589108910891089, + "grad_norm": 1.5145854949951172, + "learning_rate": 3.9019591719730045e-05, + "loss": 0.07, + "num_input_tokens_seen": 14558400, + "step": 68985 + }, + { + "epoch": 7.589658965896589, + "grad_norm": 0.058692567050457, + "learning_rate": 3.9017604489376214e-05, + "loss": 0.009, + "num_input_tokens_seen": 14559488, + "step": 68990 + }, + { + "epoch": 7.5902090209020905, + "grad_norm": 0.03968594968318939, + "learning_rate": 3.901561712982886e-05, + "loss": 0.1238, + "num_input_tokens_seen": 14560512, + "step": 68995 + }, + { + "epoch": 7.590759075907591, + "grad_norm": 0.12836381793022156, + "learning_rate": 3.901362964110628e-05, + "loss": 0.0787, + "num_input_tokens_seen": 14561632, + "step": 69000 + }, + { + "epoch": 7.591309130913091, + "grad_norm": 0.26842716336250305, + "learning_rate": 3.901164202322681e-05, + "loss": 0.058, + "num_input_tokens_seen": 14562592, + "step": 69005 + }, + { + "epoch": 7.591859185918592, + "grad_norm": 0.028569763526320457, + "learning_rate": 3.900965427620875e-05, + "loss": 0.0214, + "num_input_tokens_seen": 14563616, + "step": 69010 + }, + { + "epoch": 7.592409240924092, + "grad_norm": 0.18726608157157898, + "learning_rate": 3.900766640007043e-05, + "loss": 0.0326, + "num_input_tokens_seen": 14564704, + "step": 69015 + }, + { + "epoch": 7.592959295929593, + "grad_norm": 0.006859893910586834, + "learning_rate": 3.900567839483017e-05, + "loss": 0.0316, + "num_input_tokens_seen": 14565792, + "step": 69020 + }, + { + "epoch": 7.5935093509350935, + "grad_norm": 0.5135946869850159, + "learning_rate": 3.90036902605063e-05, + "loss": 0.05, + "num_input_tokens_seen": 14566816, + "step": 69025 + }, + { + "epoch": 7.594059405940594, + "grad_norm": 0.17640520632266998, + "learning_rate": 3.9001701997117135e-05, + "loss": 0.0095, + "num_input_tokens_seen": 14567904, + "step": 69030 + }, + { + "epoch": 7.594609460946095, + "grad_norm": 0.06765494495630264, + "learning_rate": 3.8999713604681e-05, + "loss": 0.0039, + "num_input_tokens_seen": 14568960, + "step": 69035 + }, + { + "epoch": 7.595159515951595, + "grad_norm": 0.019230835139751434, + "learning_rate": 3.899772508321622e-05, + "loss": 0.0154, + "num_input_tokens_seen": 14570080, + "step": 69040 + }, + { + "epoch": 7.595709570957096, + "grad_norm": 0.010430295951664448, + "learning_rate": 3.899573643274112e-05, + "loss": 0.0197, + "num_input_tokens_seen": 14571136, + "step": 69045 + }, + { + "epoch": 7.596259625962596, + "grad_norm": 0.0628187507390976, + "learning_rate": 3.899374765327404e-05, + "loss": 0.0079, + "num_input_tokens_seen": 14572160, + "step": 69050 + }, + { + "epoch": 7.5968096809680965, + "grad_norm": 0.39786404371261597, + "learning_rate": 3.89917587448333e-05, + "loss": 0.0195, + "num_input_tokens_seen": 14573216, + "step": 69055 + }, + { + "epoch": 7.597359735973598, + "grad_norm": 2.103201389312744, + "learning_rate": 3.8989769707437234e-05, + "loss": 0.1369, + "num_input_tokens_seen": 14574272, + "step": 69060 + }, + { + "epoch": 7.597909790979098, + "grad_norm": 0.037279970943927765, + "learning_rate": 3.8987780541104174e-05, + "loss": 0.011, + "num_input_tokens_seen": 14575296, + "step": 69065 + }, + { + "epoch": 7.598459845984598, + "grad_norm": 0.006982363294810057, + "learning_rate": 3.8985791245852455e-05, + "loss": 0.0238, + "num_input_tokens_seen": 14576288, + "step": 69070 + }, + { + "epoch": 7.599009900990099, + "grad_norm": 0.03966573253273964, + "learning_rate": 3.898380182170041e-05, + "loss": 0.0127, + "num_input_tokens_seen": 14577344, + "step": 69075 + }, + { + "epoch": 7.599559955995599, + "grad_norm": 0.27445903420448303, + "learning_rate": 3.898181226866637e-05, + "loss": 0.0663, + "num_input_tokens_seen": 14578368, + "step": 69080 + }, + { + "epoch": 7.6001100110011, + "grad_norm": 0.19234825670719147, + "learning_rate": 3.897982258676867e-05, + "loss": 0.0126, + "num_input_tokens_seen": 14579392, + "step": 69085 + }, + { + "epoch": 7.600660066006601, + "grad_norm": 0.39728185534477234, + "learning_rate": 3.8977832776025666e-05, + "loss": 0.0128, + "num_input_tokens_seen": 14580480, + "step": 69090 + }, + { + "epoch": 7.601210121012102, + "grad_norm": 0.8639705777168274, + "learning_rate": 3.8975842836455676e-05, + "loss": 0.0347, + "num_input_tokens_seen": 14581568, + "step": 69095 + }, + { + "epoch": 7.601760176017602, + "grad_norm": 0.30811619758605957, + "learning_rate": 3.8973852768077054e-05, + "loss": 0.0066, + "num_input_tokens_seen": 14582560, + "step": 69100 + }, + { + "epoch": 7.602310231023102, + "grad_norm": 0.44406363368034363, + "learning_rate": 3.897186257090813e-05, + "loss": 0.048, + "num_input_tokens_seen": 14583616, + "step": 69105 + }, + { + "epoch": 7.602860286028603, + "grad_norm": 0.024241773411631584, + "learning_rate": 3.896987224496725e-05, + "loss": 0.0331, + "num_input_tokens_seen": 14584672, + "step": 69110 + }, + { + "epoch": 7.603410341034103, + "grad_norm": 0.12196166813373566, + "learning_rate": 3.896788179027277e-05, + "loss": 0.009, + "num_input_tokens_seen": 14585664, + "step": 69115 + }, + { + "epoch": 7.603960396039604, + "grad_norm": 0.041240885853767395, + "learning_rate": 3.896589120684303e-05, + "loss": 0.0335, + "num_input_tokens_seen": 14586784, + "step": 69120 + }, + { + "epoch": 7.604510451045105, + "grad_norm": 0.6169804930686951, + "learning_rate": 3.896390049469636e-05, + "loss": 0.106, + "num_input_tokens_seen": 14587808, + "step": 69125 + }, + { + "epoch": 7.605060506050605, + "grad_norm": 0.16508495807647705, + "learning_rate": 3.896190965385113e-05, + "loss": 0.0108, + "num_input_tokens_seen": 14588864, + "step": 69130 + }, + { + "epoch": 7.605610561056105, + "grad_norm": 0.018942439928650856, + "learning_rate": 3.895991868432567e-05, + "loss": 0.0225, + "num_input_tokens_seen": 14589984, + "step": 69135 + }, + { + "epoch": 7.606160616061606, + "grad_norm": 0.7908747792243958, + "learning_rate": 3.8957927586138345e-05, + "loss": 0.0849, + "num_input_tokens_seen": 14590976, + "step": 69140 + }, + { + "epoch": 7.606710671067106, + "grad_norm": 0.009018205106258392, + "learning_rate": 3.8955936359307496e-05, + "loss": 0.0056, + "num_input_tokens_seen": 14592032, + "step": 69145 + }, + { + "epoch": 7.6072607260726075, + "grad_norm": 0.011923224665224552, + "learning_rate": 3.8953945003851475e-05, + "loss": 0.0809, + "num_input_tokens_seen": 14593088, + "step": 69150 + }, + { + "epoch": 7.607810781078108, + "grad_norm": 0.24208758771419525, + "learning_rate": 3.895195351978864e-05, + "loss": 0.0146, + "num_input_tokens_seen": 14594144, + "step": 69155 + }, + { + "epoch": 7.608360836083609, + "grad_norm": 0.020902568474411964, + "learning_rate": 3.8949961907137346e-05, + "loss": 0.0066, + "num_input_tokens_seen": 14595232, + "step": 69160 + }, + { + "epoch": 7.608910891089109, + "grad_norm": 0.0068914094008505344, + "learning_rate": 3.8947970165915945e-05, + "loss": 0.0172, + "num_input_tokens_seen": 14596256, + "step": 69165 + }, + { + "epoch": 7.609460946094609, + "grad_norm": 0.020586108788847923, + "learning_rate": 3.89459782961428e-05, + "loss": 0.1164, + "num_input_tokens_seen": 14597280, + "step": 69170 + }, + { + "epoch": 7.61001100110011, + "grad_norm": 0.6181662678718567, + "learning_rate": 3.8943986297836266e-05, + "loss": 0.0265, + "num_input_tokens_seen": 14598304, + "step": 69175 + }, + { + "epoch": 7.6105610561056105, + "grad_norm": 0.35362911224365234, + "learning_rate": 3.8941994171014696e-05, + "loss": 0.0288, + "num_input_tokens_seen": 14599424, + "step": 69180 + }, + { + "epoch": 7.611111111111111, + "grad_norm": 0.014309262856841087, + "learning_rate": 3.894000191569645e-05, + "loss": 0.0437, + "num_input_tokens_seen": 14600448, + "step": 69185 + }, + { + "epoch": 7.611661166116612, + "grad_norm": 0.05923669785261154, + "learning_rate": 3.893800953189991e-05, + "loss": 0.0081, + "num_input_tokens_seen": 14601536, + "step": 69190 + }, + { + "epoch": 7.612211221122112, + "grad_norm": 0.0078017832711339, + "learning_rate": 3.893601701964342e-05, + "loss": 0.012, + "num_input_tokens_seen": 14602656, + "step": 69195 + }, + { + "epoch": 7.612761276127613, + "grad_norm": 0.4846392869949341, + "learning_rate": 3.8934024378945345e-05, + "loss": 0.0152, + "num_input_tokens_seen": 14603648, + "step": 69200 + }, + { + "epoch": 7.613311331133113, + "grad_norm": 2.673290491104126, + "learning_rate": 3.893203160982405e-05, + "loss": 0.0478, + "num_input_tokens_seen": 14604736, + "step": 69205 + }, + { + "epoch": 7.6138613861386135, + "grad_norm": 1.3036434650421143, + "learning_rate": 3.893003871229791e-05, + "loss": 0.0801, + "num_input_tokens_seen": 14605856, + "step": 69210 + }, + { + "epoch": 7.614411441144115, + "grad_norm": 0.23503807187080383, + "learning_rate": 3.892804568638528e-05, + "loss": 0.0347, + "num_input_tokens_seen": 14606880, + "step": 69215 + }, + { + "epoch": 7.614961496149615, + "grad_norm": 0.00569176534190774, + "learning_rate": 3.8926052532104554e-05, + "loss": 0.0973, + "num_input_tokens_seen": 14607936, + "step": 69220 + }, + { + "epoch": 7.615511551155116, + "grad_norm": 0.03007165715098381, + "learning_rate": 3.892405924947406e-05, + "loss": 0.0239, + "num_input_tokens_seen": 14608960, + "step": 69225 + }, + { + "epoch": 7.616061606160616, + "grad_norm": 0.705504834651947, + "learning_rate": 3.892206583851221e-05, + "loss": 0.0402, + "num_input_tokens_seen": 14609952, + "step": 69230 + }, + { + "epoch": 7.616611661166116, + "grad_norm": 0.11895889788866043, + "learning_rate": 3.892007229923736e-05, + "loss": 0.0141, + "num_input_tokens_seen": 14611008, + "step": 69235 + }, + { + "epoch": 7.617161716171617, + "grad_norm": 0.014820191077888012, + "learning_rate": 3.891807863166788e-05, + "loss": 0.0367, + "num_input_tokens_seen": 14612032, + "step": 69240 + }, + { + "epoch": 7.617711771177118, + "grad_norm": 0.8201719522476196, + "learning_rate": 3.891608483582214e-05, + "loss": 0.0351, + "num_input_tokens_seen": 14613056, + "step": 69245 + }, + { + "epoch": 7.618261826182618, + "grad_norm": 0.09395970404148102, + "learning_rate": 3.8914090911718536e-05, + "loss": 0.0082, + "num_input_tokens_seen": 14614144, + "step": 69250 + }, + { + "epoch": 7.618811881188119, + "grad_norm": 0.06449402868747711, + "learning_rate": 3.891209685937542e-05, + "loss": 0.1115, + "num_input_tokens_seen": 14615136, + "step": 69255 + }, + { + "epoch": 7.619361936193619, + "grad_norm": 0.01523101981729269, + "learning_rate": 3.891010267881119e-05, + "loss": 0.0048, + "num_input_tokens_seen": 14616128, + "step": 69260 + }, + { + "epoch": 7.61991199119912, + "grad_norm": 0.021384751424193382, + "learning_rate": 3.890810837004423e-05, + "loss": 0.0263, + "num_input_tokens_seen": 14617120, + "step": 69265 + }, + { + "epoch": 7.62046204620462, + "grad_norm": 0.005272640846669674, + "learning_rate": 3.890611393309289e-05, + "loss": 0.0108, + "num_input_tokens_seen": 14618208, + "step": 69270 + }, + { + "epoch": 7.621012101210121, + "grad_norm": 0.4438736140727997, + "learning_rate": 3.890411936797558e-05, + "loss": 0.1826, + "num_input_tokens_seen": 14619168, + "step": 69275 + }, + { + "epoch": 7.621562156215622, + "grad_norm": 0.05326494574546814, + "learning_rate": 3.8902124674710674e-05, + "loss": 0.0107, + "num_input_tokens_seen": 14620256, + "step": 69280 + }, + { + "epoch": 7.622112211221122, + "grad_norm": 0.06086471676826477, + "learning_rate": 3.890012985331655e-05, + "loss": 0.0063, + "num_input_tokens_seen": 14621312, + "step": 69285 + }, + { + "epoch": 7.622662266226623, + "grad_norm": 0.05938445031642914, + "learning_rate": 3.88981349038116e-05, + "loss": 0.0053, + "num_input_tokens_seen": 14622368, + "step": 69290 + }, + { + "epoch": 7.623212321232123, + "grad_norm": 1.2473598718643188, + "learning_rate": 3.8896139826214214e-05, + "loss": 0.0504, + "num_input_tokens_seen": 14623424, + "step": 69295 + }, + { + "epoch": 7.623762376237623, + "grad_norm": 0.022761601954698563, + "learning_rate": 3.889414462054277e-05, + "loss": 0.0058, + "num_input_tokens_seen": 14624416, + "step": 69300 + }, + { + "epoch": 7.6243124312431245, + "grad_norm": 0.8057231903076172, + "learning_rate": 3.889214928681566e-05, + "loss": 0.026, + "num_input_tokens_seen": 14625504, + "step": 69305 + }, + { + "epoch": 7.624862486248625, + "grad_norm": 0.6996598243713379, + "learning_rate": 3.889015382505129e-05, + "loss": 0.0504, + "num_input_tokens_seen": 14626624, + "step": 69310 + }, + { + "epoch": 7.625412541254125, + "grad_norm": 0.01236118096858263, + "learning_rate": 3.888815823526803e-05, + "loss": 0.0033, + "num_input_tokens_seen": 14627680, + "step": 69315 + }, + { + "epoch": 7.625962596259626, + "grad_norm": 1.1235320568084717, + "learning_rate": 3.888616251748428e-05, + "loss": 0.0351, + "num_input_tokens_seen": 14628672, + "step": 69320 + }, + { + "epoch": 7.626512651265126, + "grad_norm": 1.7621829509735107, + "learning_rate": 3.888416667171843e-05, + "loss": 0.0337, + "num_input_tokens_seen": 14629696, + "step": 69325 + }, + { + "epoch": 7.627062706270627, + "grad_norm": 0.005601926706731319, + "learning_rate": 3.888217069798887e-05, + "loss": 0.0189, + "num_input_tokens_seen": 14630688, + "step": 69330 + }, + { + "epoch": 7.6276127612761275, + "grad_norm": 0.01552016194909811, + "learning_rate": 3.8880174596314024e-05, + "loss": 0.062, + "num_input_tokens_seen": 14631680, + "step": 69335 + }, + { + "epoch": 7.628162816281629, + "grad_norm": 1.8889200687408447, + "learning_rate": 3.887817836671226e-05, + "loss": 0.2388, + "num_input_tokens_seen": 14632768, + "step": 69340 + }, + { + "epoch": 7.628712871287129, + "grad_norm": 1.9321439266204834, + "learning_rate": 3.887618200920198e-05, + "loss": 0.1484, + "num_input_tokens_seen": 14633856, + "step": 69345 + }, + { + "epoch": 7.629262926292629, + "grad_norm": 0.2581078112125397, + "learning_rate": 3.887418552380159e-05, + "loss": 0.0337, + "num_input_tokens_seen": 14634880, + "step": 69350 + }, + { + "epoch": 7.62981298129813, + "grad_norm": 1.083528995513916, + "learning_rate": 3.8872188910529496e-05, + "loss": 0.0666, + "num_input_tokens_seen": 14635968, + "step": 69355 + }, + { + "epoch": 7.63036303630363, + "grad_norm": 0.07523279637098312, + "learning_rate": 3.887019216940409e-05, + "loss": 0.0149, + "num_input_tokens_seen": 14636992, + "step": 69360 + }, + { + "epoch": 7.6309130913091305, + "grad_norm": 0.19882379472255707, + "learning_rate": 3.886819530044378e-05, + "loss": 0.0111, + "num_input_tokens_seen": 14638080, + "step": 69365 + }, + { + "epoch": 7.631463146314632, + "grad_norm": 0.12819714844226837, + "learning_rate": 3.886619830366697e-05, + "loss": 0.0314, + "num_input_tokens_seen": 14639104, + "step": 69370 + }, + { + "epoch": 7.632013201320132, + "grad_norm": 0.026004722341895103, + "learning_rate": 3.8864201179092055e-05, + "loss": 0.0135, + "num_input_tokens_seen": 14640128, + "step": 69375 + }, + { + "epoch": 7.632563256325633, + "grad_norm": 0.2738077938556671, + "learning_rate": 3.8862203926737464e-05, + "loss": 0.0471, + "num_input_tokens_seen": 14641152, + "step": 69380 + }, + { + "epoch": 7.633113311331133, + "grad_norm": 0.08186094462871552, + "learning_rate": 3.886020654662158e-05, + "loss": 0.0307, + "num_input_tokens_seen": 14642240, + "step": 69385 + }, + { + "epoch": 7.633663366336633, + "grad_norm": 0.051511313766241074, + "learning_rate": 3.885820903876282e-05, + "loss": 0.0063, + "num_input_tokens_seen": 14643264, + "step": 69390 + }, + { + "epoch": 7.634213421342134, + "grad_norm": 0.032803408801555634, + "learning_rate": 3.885621140317961e-05, + "loss": 0.0403, + "num_input_tokens_seen": 14644288, + "step": 69395 + }, + { + "epoch": 7.634763476347635, + "grad_norm": 0.03002605214715004, + "learning_rate": 3.8854213639890336e-05, + "loss": 0.0243, + "num_input_tokens_seen": 14645344, + "step": 69400 + }, + { + "epoch": 7.635313531353136, + "grad_norm": 0.013159723952412605, + "learning_rate": 3.885221574891343e-05, + "loss": 0.0082, + "num_input_tokens_seen": 14646432, + "step": 69405 + }, + { + "epoch": 7.635863586358636, + "grad_norm": 0.00517292832955718, + "learning_rate": 3.8850217730267297e-05, + "loss": 0.0028, + "num_input_tokens_seen": 14647520, + "step": 69410 + }, + { + "epoch": 7.636413641364136, + "grad_norm": 0.6674588322639465, + "learning_rate": 3.8848219583970356e-05, + "loss": 0.0915, + "num_input_tokens_seen": 14648512, + "step": 69415 + }, + { + "epoch": 7.636963696369637, + "grad_norm": 0.021202988922595978, + "learning_rate": 3.8846221310041006e-05, + "loss": 0.0189, + "num_input_tokens_seen": 14649536, + "step": 69420 + }, + { + "epoch": 7.637513751375137, + "grad_norm": 0.038659170269966125, + "learning_rate": 3.884422290849769e-05, + "loss": 0.0049, + "num_input_tokens_seen": 14650592, + "step": 69425 + }, + { + "epoch": 7.638063806380638, + "grad_norm": 0.019215188920497894, + "learning_rate": 3.884222437935883e-05, + "loss": 0.0854, + "num_input_tokens_seen": 14651680, + "step": 69430 + }, + { + "epoch": 7.638613861386139, + "grad_norm": 0.021755559369921684, + "learning_rate": 3.884022572264281e-05, + "loss": 0.0092, + "num_input_tokens_seen": 14652768, + "step": 69435 + }, + { + "epoch": 7.639163916391639, + "grad_norm": 0.21822333335876465, + "learning_rate": 3.883822693836807e-05, + "loss": 0.0255, + "num_input_tokens_seen": 14653760, + "step": 69440 + }, + { + "epoch": 7.63971397139714, + "grad_norm": 0.06180325895547867, + "learning_rate": 3.8836228026553035e-05, + "loss": 0.054, + "num_input_tokens_seen": 14654848, + "step": 69445 + }, + { + "epoch": 7.64026402640264, + "grad_norm": 0.03915868327021599, + "learning_rate": 3.8834228987216126e-05, + "loss": 0.0267, + "num_input_tokens_seen": 14655904, + "step": 69450 + }, + { + "epoch": 7.6408140814081404, + "grad_norm": 0.11853349953889847, + "learning_rate": 3.8832229820375773e-05, + "loss": 0.0155, + "num_input_tokens_seen": 14656928, + "step": 69455 + }, + { + "epoch": 7.6413641364136415, + "grad_norm": 1.406506896018982, + "learning_rate": 3.883023052605039e-05, + "loss": 0.1128, + "num_input_tokens_seen": 14657952, + "step": 69460 + }, + { + "epoch": 7.641914191419142, + "grad_norm": 0.24601434171199799, + "learning_rate": 3.8828231104258406e-05, + "loss": 0.1362, + "num_input_tokens_seen": 14659072, + "step": 69465 + }, + { + "epoch": 7.642464246424643, + "grad_norm": 0.021003197878599167, + "learning_rate": 3.882623155501825e-05, + "loss": 0.1024, + "num_input_tokens_seen": 14660160, + "step": 69470 + }, + { + "epoch": 7.643014301430143, + "grad_norm": 0.04203540086746216, + "learning_rate": 3.882423187834836e-05, + "loss": 0.0149, + "num_input_tokens_seen": 14661184, + "step": 69475 + }, + { + "epoch": 7.643564356435643, + "grad_norm": 0.42701855301856995, + "learning_rate": 3.882223207426716e-05, + "loss": 0.0168, + "num_input_tokens_seen": 14662208, + "step": 69480 + }, + { + "epoch": 7.644114411441144, + "grad_norm": 0.3469402492046356, + "learning_rate": 3.882023214279308e-05, + "loss": 0.0149, + "num_input_tokens_seen": 14663264, + "step": 69485 + }, + { + "epoch": 7.6446644664466445, + "grad_norm": 0.03876160457730293, + "learning_rate": 3.881823208394454e-05, + "loss": 0.0442, + "num_input_tokens_seen": 14664384, + "step": 69490 + }, + { + "epoch": 7.645214521452145, + "grad_norm": 0.01681518740952015, + "learning_rate": 3.881623189774e-05, + "loss": 0.0139, + "num_input_tokens_seen": 14665536, + "step": 69495 + }, + { + "epoch": 7.645764576457646, + "grad_norm": 0.06631527096033096, + "learning_rate": 3.881423158419787e-05, + "loss": 0.0165, + "num_input_tokens_seen": 14666560, + "step": 69500 + }, + { + "epoch": 7.646314631463146, + "grad_norm": 0.7457787394523621, + "learning_rate": 3.8812231143336595e-05, + "loss": 0.0514, + "num_input_tokens_seen": 14667680, + "step": 69505 + }, + { + "epoch": 7.646864686468647, + "grad_norm": 0.052553534507751465, + "learning_rate": 3.881023057517462e-05, + "loss": 0.0618, + "num_input_tokens_seen": 14668704, + "step": 69510 + }, + { + "epoch": 7.647414741474147, + "grad_norm": 0.9689643979072571, + "learning_rate": 3.880822987973037e-05, + "loss": 0.1277, + "num_input_tokens_seen": 14669728, + "step": 69515 + }, + { + "epoch": 7.647964796479648, + "grad_norm": 0.054910916835069656, + "learning_rate": 3.880622905702229e-05, + "loss": 0.0228, + "num_input_tokens_seen": 14670784, + "step": 69520 + }, + { + "epoch": 7.648514851485149, + "grad_norm": 0.0376092754304409, + "learning_rate": 3.880422810706883e-05, + "loss": 0.0052, + "num_input_tokens_seen": 14671840, + "step": 69525 + }, + { + "epoch": 7.649064906490649, + "grad_norm": 0.3762623369693756, + "learning_rate": 3.8802227029888425e-05, + "loss": 0.0081, + "num_input_tokens_seen": 14672864, + "step": 69530 + }, + { + "epoch": 7.64961496149615, + "grad_norm": 0.02425309084355831, + "learning_rate": 3.88002258254995e-05, + "loss": 0.0996, + "num_input_tokens_seen": 14673952, + "step": 69535 + }, + { + "epoch": 7.65016501650165, + "grad_norm": 0.4797307252883911, + "learning_rate": 3.8798224493920534e-05, + "loss": 0.02, + "num_input_tokens_seen": 14675008, + "step": 69540 + }, + { + "epoch": 7.65071507150715, + "grad_norm": 0.03812698274850845, + "learning_rate": 3.879622303516994e-05, + "loss": 0.0041, + "num_input_tokens_seen": 14676160, + "step": 69545 + }, + { + "epoch": 7.6512651265126514, + "grad_norm": 1.5524197816848755, + "learning_rate": 3.8794221449266187e-05, + "loss": 0.0593, + "num_input_tokens_seen": 14677248, + "step": 69550 + }, + { + "epoch": 7.651815181518152, + "grad_norm": 0.08737751096487045, + "learning_rate": 3.879221973622772e-05, + "loss": 0.0142, + "num_input_tokens_seen": 14678336, + "step": 69555 + }, + { + "epoch": 7.652365236523653, + "grad_norm": 0.6601261496543884, + "learning_rate": 3.879021789607297e-05, + "loss": 0.0217, + "num_input_tokens_seen": 14679456, + "step": 69560 + }, + { + "epoch": 7.652915291529153, + "grad_norm": 0.017550360411405563, + "learning_rate": 3.87882159288204e-05, + "loss": 0.008, + "num_input_tokens_seen": 14680512, + "step": 69565 + }, + { + "epoch": 7.653465346534653, + "grad_norm": 0.038128577172756195, + "learning_rate": 3.878621383448846e-05, + "loss": 0.0093, + "num_input_tokens_seen": 14681472, + "step": 69570 + }, + { + "epoch": 7.654015401540154, + "grad_norm": 1.4489020109176636, + "learning_rate": 3.878421161309561e-05, + "loss": 0.0532, + "num_input_tokens_seen": 14682528, + "step": 69575 + }, + { + "epoch": 7.6545654565456545, + "grad_norm": 0.012403514236211777, + "learning_rate": 3.878220926466028e-05, + "loss": 0.0176, + "num_input_tokens_seen": 14683584, + "step": 69580 + }, + { + "epoch": 7.6551155115511555, + "grad_norm": 0.048147719353437424, + "learning_rate": 3.8780206789200954e-05, + "loss": 0.0214, + "num_input_tokens_seen": 14684640, + "step": 69585 + }, + { + "epoch": 7.655665566556656, + "grad_norm": 0.1825437992811203, + "learning_rate": 3.8778204186736076e-05, + "loss": 0.0043, + "num_input_tokens_seen": 14685664, + "step": 69590 + }, + { + "epoch": 7.656215621562156, + "grad_norm": 0.010063719004392624, + "learning_rate": 3.87762014572841e-05, + "loss": 0.0331, + "num_input_tokens_seen": 14686688, + "step": 69595 + }, + { + "epoch": 7.656765676567657, + "grad_norm": 0.4126126170158386, + "learning_rate": 3.877419860086348e-05, + "loss": 0.0391, + "num_input_tokens_seen": 14687776, + "step": 69600 + }, + { + "epoch": 7.657315731573157, + "grad_norm": 0.7013813257217407, + "learning_rate": 3.877219561749269e-05, + "loss": 0.0578, + "num_input_tokens_seen": 14688864, + "step": 69605 + }, + { + "epoch": 7.6578657865786575, + "grad_norm": 0.03898196667432785, + "learning_rate": 3.877019250719017e-05, + "loss": 0.0556, + "num_input_tokens_seen": 14689920, + "step": 69610 + }, + { + "epoch": 7.658415841584159, + "grad_norm": 1.114802598953247, + "learning_rate": 3.87681892699744e-05, + "loss": 0.0455, + "num_input_tokens_seen": 14691040, + "step": 69615 + }, + { + "epoch": 7.658965896589659, + "grad_norm": 1.7291589975357056, + "learning_rate": 3.876618590586383e-05, + "loss": 0.0399, + "num_input_tokens_seen": 14692192, + "step": 69620 + }, + { + "epoch": 7.65951595159516, + "grad_norm": 1.6851451396942139, + "learning_rate": 3.8764182414876935e-05, + "loss": 0.1289, + "num_input_tokens_seen": 14693216, + "step": 69625 + }, + { + "epoch": 7.66006600660066, + "grad_norm": 0.006900584790855646, + "learning_rate": 3.876217879703218e-05, + "loss": 0.0662, + "num_input_tokens_seen": 14694336, + "step": 69630 + }, + { + "epoch": 7.66061606160616, + "grad_norm": 1.105123519897461, + "learning_rate": 3.8760175052348026e-05, + "loss": 0.0754, + "num_input_tokens_seen": 14695328, + "step": 69635 + }, + { + "epoch": 7.661166116611661, + "grad_norm": 0.2698659300804138, + "learning_rate": 3.875817118084293e-05, + "loss": 0.0114, + "num_input_tokens_seen": 14696352, + "step": 69640 + }, + { + "epoch": 7.661716171617162, + "grad_norm": 0.01597292535007, + "learning_rate": 3.8756167182535386e-05, + "loss": 0.0174, + "num_input_tokens_seen": 14697408, + "step": 69645 + }, + { + "epoch": 7.662266226622663, + "grad_norm": 0.1134985089302063, + "learning_rate": 3.875416305744384e-05, + "loss": 0.0644, + "num_input_tokens_seen": 14698400, + "step": 69650 + }, + { + "epoch": 7.662816281628163, + "grad_norm": 0.4310421049594879, + "learning_rate": 3.8752158805586775e-05, + "loss": 0.014, + "num_input_tokens_seen": 14699424, + "step": 69655 + }, + { + "epoch": 7.663366336633663, + "grad_norm": 1.9861292839050293, + "learning_rate": 3.875015442698266e-05, + "loss": 0.0591, + "num_input_tokens_seen": 14700448, + "step": 69660 + }, + { + "epoch": 7.663916391639164, + "grad_norm": 0.029218556359410286, + "learning_rate": 3.8748149921649973e-05, + "loss": 0.0237, + "num_input_tokens_seen": 14701568, + "step": 69665 + }, + { + "epoch": 7.664466446644664, + "grad_norm": 0.16086609661579132, + "learning_rate": 3.8746145289607186e-05, + "loss": 0.005, + "num_input_tokens_seen": 14702560, + "step": 69670 + }, + { + "epoch": 7.665016501650165, + "grad_norm": 0.0667361319065094, + "learning_rate": 3.8744140530872773e-05, + "loss": 0.0046, + "num_input_tokens_seen": 14703616, + "step": 69675 + }, + { + "epoch": 7.665566556655666, + "grad_norm": 0.3853817284107208, + "learning_rate": 3.874213564546521e-05, + "loss": 0.0176, + "num_input_tokens_seen": 14704608, + "step": 69680 + }, + { + "epoch": 7.666116611661166, + "grad_norm": 0.22731520235538483, + "learning_rate": 3.874013063340298e-05, + "loss": 0.0161, + "num_input_tokens_seen": 14705728, + "step": 69685 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 0.2821115553379059, + "learning_rate": 3.873812549470456e-05, + "loss": 0.0098, + "num_input_tokens_seen": 14706720, + "step": 69690 + }, + { + "epoch": 7.667216721672167, + "grad_norm": 0.022355081513524055, + "learning_rate": 3.8736120229388426e-05, + "loss": 0.044, + "num_input_tokens_seen": 14707744, + "step": 69695 + }, + { + "epoch": 7.667766776677668, + "grad_norm": 0.10862065851688385, + "learning_rate": 3.873411483747306e-05, + "loss": 0.0047, + "num_input_tokens_seen": 14708864, + "step": 69700 + }, + { + "epoch": 7.6683168316831685, + "grad_norm": 0.02164941467344761, + "learning_rate": 3.873210931897695e-05, + "loss": 0.0715, + "num_input_tokens_seen": 14709920, + "step": 69705 + }, + { + "epoch": 7.668866886688669, + "grad_norm": 1.0080634355545044, + "learning_rate": 3.8730103673918575e-05, + "loss": 0.0838, + "num_input_tokens_seen": 14710944, + "step": 69710 + }, + { + "epoch": 7.66941694169417, + "grad_norm": 0.01336168684065342, + "learning_rate": 3.872809790231643e-05, + "loss": 0.0022, + "num_input_tokens_seen": 14712096, + "step": 69715 + }, + { + "epoch": 7.66996699669967, + "grad_norm": 0.3192310631275177, + "learning_rate": 3.8726092004189e-05, + "loss": 0.0253, + "num_input_tokens_seen": 14713120, + "step": 69720 + }, + { + "epoch": 7.67051705170517, + "grad_norm": 0.2015175074338913, + "learning_rate": 3.8724085979554745e-05, + "loss": 0.059, + "num_input_tokens_seen": 14714144, + "step": 69725 + }, + { + "epoch": 7.671067106710671, + "grad_norm": 0.034911148250103, + "learning_rate": 3.872207982843219e-05, + "loss": 0.0414, + "num_input_tokens_seen": 14715200, + "step": 69730 + }, + { + "epoch": 7.6716171617161715, + "grad_norm": 0.6360921859741211, + "learning_rate": 3.872007355083981e-05, + "loss": 0.0168, + "num_input_tokens_seen": 14716224, + "step": 69735 + }, + { + "epoch": 7.672167216721672, + "grad_norm": 0.6638467311859131, + "learning_rate": 3.871806714679609e-05, + "loss": 0.1419, + "num_input_tokens_seen": 14717248, + "step": 69740 + }, + { + "epoch": 7.672717271727173, + "grad_norm": 1.4752123355865479, + "learning_rate": 3.871606061631953e-05, + "loss": 0.1012, + "num_input_tokens_seen": 14718336, + "step": 69745 + }, + { + "epoch": 7.673267326732673, + "grad_norm": 0.03679259121417999, + "learning_rate": 3.871405395942862e-05, + "loss": 0.0554, + "num_input_tokens_seen": 14719456, + "step": 69750 + }, + { + "epoch": 7.673817381738174, + "grad_norm": 0.010120363906025887, + "learning_rate": 3.8712047176141857e-05, + "loss": 0.003, + "num_input_tokens_seen": 14720544, + "step": 69755 + }, + { + "epoch": 7.674367436743674, + "grad_norm": 1.1750819683074951, + "learning_rate": 3.8710040266477734e-05, + "loss": 0.1276, + "num_input_tokens_seen": 14721600, + "step": 69760 + }, + { + "epoch": 7.674917491749175, + "grad_norm": 0.05135910212993622, + "learning_rate": 3.8708033230454754e-05, + "loss": 0.0362, + "num_input_tokens_seen": 14722560, + "step": 69765 + }, + { + "epoch": 7.675467546754676, + "grad_norm": 0.014489427208900452, + "learning_rate": 3.87060260680914e-05, + "loss": 0.0458, + "num_input_tokens_seen": 14723584, + "step": 69770 + }, + { + "epoch": 7.676017601760176, + "grad_norm": 0.16771458089351654, + "learning_rate": 3.870401877940618e-05, + "loss": 0.0113, + "num_input_tokens_seen": 14724640, + "step": 69775 + }, + { + "epoch": 7.676567656765677, + "grad_norm": 0.47365519404411316, + "learning_rate": 3.870201136441761e-05, + "loss": 0.1013, + "num_input_tokens_seen": 14725664, + "step": 69780 + }, + { + "epoch": 7.677117711771177, + "grad_norm": 1.3693970441818237, + "learning_rate": 3.870000382314416e-05, + "loss": 0.0363, + "num_input_tokens_seen": 14726720, + "step": 69785 + }, + { + "epoch": 7.677667766776677, + "grad_norm": 0.13783220946788788, + "learning_rate": 3.8697996155604354e-05, + "loss": 0.0076, + "num_input_tokens_seen": 14727712, + "step": 69790 + }, + { + "epoch": 7.678217821782178, + "grad_norm": 0.016159212216734886, + "learning_rate": 3.8695988361816694e-05, + "loss": 0.0347, + "num_input_tokens_seen": 14728736, + "step": 69795 + }, + { + "epoch": 7.678767876787679, + "grad_norm": 1.6590286493301392, + "learning_rate": 3.869398044179968e-05, + "loss": 0.0645, + "num_input_tokens_seen": 14729760, + "step": 69800 + }, + { + "epoch": 7.67931793179318, + "grad_norm": 0.0630490705370903, + "learning_rate": 3.869197239557181e-05, + "loss": 0.0031, + "num_input_tokens_seen": 14730816, + "step": 69805 + }, + { + "epoch": 7.67986798679868, + "grad_norm": 0.20101983845233917, + "learning_rate": 3.868996422315161e-05, + "loss": 0.0083, + "num_input_tokens_seen": 14731808, + "step": 69810 + }, + { + "epoch": 7.68041804180418, + "grad_norm": 0.04988852143287659, + "learning_rate": 3.868795592455759e-05, + "loss": 0.0043, + "num_input_tokens_seen": 14732928, + "step": 69815 + }, + { + "epoch": 7.680968096809681, + "grad_norm": 0.0806700736284256, + "learning_rate": 3.868594749980822e-05, + "loss": 0.0672, + "num_input_tokens_seen": 14734048, + "step": 69820 + }, + { + "epoch": 7.681518151815181, + "grad_norm": 0.1529533714056015, + "learning_rate": 3.868393894892206e-05, + "loss": 0.0327, + "num_input_tokens_seen": 14735168, + "step": 69825 + }, + { + "epoch": 7.6820682068206825, + "grad_norm": 0.4733859896659851, + "learning_rate": 3.868193027191759e-05, + "loss": 0.1042, + "num_input_tokens_seen": 14736256, + "step": 69830 + }, + { + "epoch": 7.682618261826183, + "grad_norm": 0.20809631049633026, + "learning_rate": 3.867992146881334e-05, + "loss": 0.0168, + "num_input_tokens_seen": 14737248, + "step": 69835 + }, + { + "epoch": 7.683168316831683, + "grad_norm": 0.04373854771256447, + "learning_rate": 3.8677912539627815e-05, + "loss": 0.053, + "num_input_tokens_seen": 14738272, + "step": 69840 + }, + { + "epoch": 7.683718371837184, + "grad_norm": 0.0804927870631218, + "learning_rate": 3.867590348437954e-05, + "loss": 0.0114, + "num_input_tokens_seen": 14739328, + "step": 69845 + }, + { + "epoch": 7.684268426842684, + "grad_norm": 0.020371010527014732, + "learning_rate": 3.867389430308702e-05, + "loss": 0.0781, + "num_input_tokens_seen": 14740416, + "step": 69850 + }, + { + "epoch": 7.684818481848184, + "grad_norm": 0.1189800426363945, + "learning_rate": 3.8671884995768776e-05, + "loss": 0.018, + "num_input_tokens_seen": 14741408, + "step": 69855 + }, + { + "epoch": 7.6853685368536855, + "grad_norm": 0.06940155476331711, + "learning_rate": 3.8669875562443333e-05, + "loss": 0.0641, + "num_input_tokens_seen": 14742496, + "step": 69860 + }, + { + "epoch": 7.685918591859186, + "grad_norm": 0.8440304398536682, + "learning_rate": 3.866786600312921e-05, + "loss": 0.0559, + "num_input_tokens_seen": 14743552, + "step": 69865 + }, + { + "epoch": 7.686468646864687, + "grad_norm": 0.43952521681785583, + "learning_rate": 3.8665856317844904e-05, + "loss": 0.0751, + "num_input_tokens_seen": 14744576, + "step": 69870 + }, + { + "epoch": 7.687018701870187, + "grad_norm": 0.31241485476493835, + "learning_rate": 3.866384650660898e-05, + "loss": 0.057, + "num_input_tokens_seen": 14745664, + "step": 69875 + }, + { + "epoch": 7.687568756875687, + "grad_norm": 0.10186536610126495, + "learning_rate": 3.866183656943994e-05, + "loss": 0.0051, + "num_input_tokens_seen": 14746784, + "step": 69880 + }, + { + "epoch": 7.688118811881188, + "grad_norm": 0.023166315630078316, + "learning_rate": 3.865982650635629e-05, + "loss": 0.0177, + "num_input_tokens_seen": 14747840, + "step": 69885 + }, + { + "epoch": 7.6886688668866885, + "grad_norm": 0.022495051845908165, + "learning_rate": 3.865781631737658e-05, + "loss": 0.0203, + "num_input_tokens_seen": 14748928, + "step": 69890 + }, + { + "epoch": 7.68921892189219, + "grad_norm": 0.039454441517591476, + "learning_rate": 3.865580600251933e-05, + "loss": 0.013, + "num_input_tokens_seen": 14749984, + "step": 69895 + }, + { + "epoch": 7.68976897689769, + "grad_norm": 0.3155892789363861, + "learning_rate": 3.865379556180307e-05, + "loss": 0.0452, + "num_input_tokens_seen": 14751040, + "step": 69900 + }, + { + "epoch": 7.69031903190319, + "grad_norm": 0.17392970621585846, + "learning_rate": 3.8651784995246336e-05, + "loss": 0.0658, + "num_input_tokens_seen": 14752064, + "step": 69905 + }, + { + "epoch": 7.690869086908691, + "grad_norm": 0.03106583096086979, + "learning_rate": 3.864977430286765e-05, + "loss": 0.0075, + "num_input_tokens_seen": 14753152, + "step": 69910 + }, + { + "epoch": 7.691419141914191, + "grad_norm": 0.016072673723101616, + "learning_rate": 3.864776348468553e-05, + "loss": 0.0059, + "num_input_tokens_seen": 14754208, + "step": 69915 + }, + { + "epoch": 7.6919691969196915, + "grad_norm": 0.0703948438167572, + "learning_rate": 3.8645752540718536e-05, + "loss": 0.0721, + "num_input_tokens_seen": 14755264, + "step": 69920 + }, + { + "epoch": 7.692519251925193, + "grad_norm": 0.2901637554168701, + "learning_rate": 3.8643741470985185e-05, + "loss": 0.0635, + "num_input_tokens_seen": 14756384, + "step": 69925 + }, + { + "epoch": 7.693069306930693, + "grad_norm": 0.015761135146021843, + "learning_rate": 3.864173027550402e-05, + "loss": 0.1897, + "num_input_tokens_seen": 14757376, + "step": 69930 + }, + { + "epoch": 7.693619361936194, + "grad_norm": 0.012778066098690033, + "learning_rate": 3.863971895429356e-05, + "loss": 0.0193, + "num_input_tokens_seen": 14758368, + "step": 69935 + }, + { + "epoch": 7.694169416941694, + "grad_norm": 0.06716597080230713, + "learning_rate": 3.8637707507372366e-05, + "loss": 0.0063, + "num_input_tokens_seen": 14759520, + "step": 69940 + }, + { + "epoch": 7.694719471947195, + "grad_norm": 0.7718433141708374, + "learning_rate": 3.8635695934758963e-05, + "loss": 0.0736, + "num_input_tokens_seen": 14760640, + "step": 69945 + }, + { + "epoch": 7.695269526952695, + "grad_norm": 0.12330296635627747, + "learning_rate": 3.863368423647189e-05, + "loss": 0.0245, + "num_input_tokens_seen": 14761696, + "step": 69950 + }, + { + "epoch": 7.695819581958196, + "grad_norm": 1.4148051738739014, + "learning_rate": 3.86316724125297e-05, + "loss": 0.0998, + "num_input_tokens_seen": 14762752, + "step": 69955 + }, + { + "epoch": 7.696369636963697, + "grad_norm": 0.08717036992311478, + "learning_rate": 3.8629660462950924e-05, + "loss": 0.0452, + "num_input_tokens_seen": 14763744, + "step": 69960 + }, + { + "epoch": 7.696919691969197, + "grad_norm": 0.552761971950531, + "learning_rate": 3.86276483877541e-05, + "loss": 0.0319, + "num_input_tokens_seen": 14764800, + "step": 69965 + }, + { + "epoch": 7.697469746974697, + "grad_norm": 1.840308666229248, + "learning_rate": 3.862563618695779e-05, + "loss": 0.0396, + "num_input_tokens_seen": 14765952, + "step": 69970 + }, + { + "epoch": 7.698019801980198, + "grad_norm": 0.05162489414215088, + "learning_rate": 3.8623623860580525e-05, + "loss": 0.0054, + "num_input_tokens_seen": 14767008, + "step": 69975 + }, + { + "epoch": 7.698569856985698, + "grad_norm": 1.3258765935897827, + "learning_rate": 3.8621611408640854e-05, + "loss": 0.0415, + "num_input_tokens_seen": 14768064, + "step": 69980 + }, + { + "epoch": 7.6991199119911995, + "grad_norm": 0.07513231784105301, + "learning_rate": 3.8619598831157334e-05, + "loss": 0.0241, + "num_input_tokens_seen": 14769120, + "step": 69985 + }, + { + "epoch": 7.6996699669967, + "grad_norm": 0.059686530381441116, + "learning_rate": 3.86175861281485e-05, + "loss": 0.0893, + "num_input_tokens_seen": 14770208, + "step": 69990 + }, + { + "epoch": 7.7002200220022, + "grad_norm": 0.3686002790927887, + "learning_rate": 3.8615573299632914e-05, + "loss": 0.0179, + "num_input_tokens_seen": 14771296, + "step": 69995 + }, + { + "epoch": 7.700770077007701, + "grad_norm": 1.1351263523101807, + "learning_rate": 3.861356034562912e-05, + "loss": 0.05, + "num_input_tokens_seen": 14772384, + "step": 70000 + }, + { + "epoch": 7.701320132013201, + "grad_norm": 0.016883086413145065, + "learning_rate": 3.8611547266155665e-05, + "loss": 0.014, + "num_input_tokens_seen": 14773408, + "step": 70005 + }, + { + "epoch": 7.701870187018702, + "grad_norm": 0.026499448344111443, + "learning_rate": 3.860953406123112e-05, + "loss": 0.017, + "num_input_tokens_seen": 14774432, + "step": 70010 + }, + { + "epoch": 7.7024202420242025, + "grad_norm": 0.35253503918647766, + "learning_rate": 3.860752073087403e-05, + "loss": 0.132, + "num_input_tokens_seen": 14775488, + "step": 70015 + }, + { + "epoch": 7.702970297029703, + "grad_norm": 1.9361145496368408, + "learning_rate": 3.8605507275102946e-05, + "loss": 0.0665, + "num_input_tokens_seen": 14776544, + "step": 70020 + }, + { + "epoch": 7.703520352035204, + "grad_norm": 0.04490487650036812, + "learning_rate": 3.860349369393643e-05, + "loss": 0.0401, + "num_input_tokens_seen": 14777568, + "step": 70025 + }, + { + "epoch": 7.704070407040704, + "grad_norm": 0.28283125162124634, + "learning_rate": 3.8601479987393044e-05, + "loss": 0.0582, + "num_input_tokens_seen": 14778720, + "step": 70030 + }, + { + "epoch": 7.704620462046204, + "grad_norm": 0.8852986693382263, + "learning_rate": 3.859946615549135e-05, + "loss": 0.1643, + "num_input_tokens_seen": 14779744, + "step": 70035 + }, + { + "epoch": 7.705170517051705, + "grad_norm": 0.11939035356044769, + "learning_rate": 3.859745219824989e-05, + "loss": 0.0262, + "num_input_tokens_seen": 14780800, + "step": 70040 + }, + { + "epoch": 7.7057205720572055, + "grad_norm": 0.020259562879800797, + "learning_rate": 3.8595438115687235e-05, + "loss": 0.0433, + "num_input_tokens_seen": 14781888, + "step": 70045 + }, + { + "epoch": 7.706270627062707, + "grad_norm": 0.012125339359045029, + "learning_rate": 3.8593423907821966e-05, + "loss": 0.035, + "num_input_tokens_seen": 14782880, + "step": 70050 + }, + { + "epoch": 7.706820682068207, + "grad_norm": 0.04336989298462868, + "learning_rate": 3.859140957467262e-05, + "loss": 0.0336, + "num_input_tokens_seen": 14783936, + "step": 70055 + }, + { + "epoch": 7.707370737073707, + "grad_norm": 0.07213756442070007, + "learning_rate": 3.858939511625778e-05, + "loss": 0.0233, + "num_input_tokens_seen": 14784960, + "step": 70060 + }, + { + "epoch": 7.707920792079208, + "grad_norm": 0.13216733932495117, + "learning_rate": 3.8587380532596e-05, + "loss": 0.0356, + "num_input_tokens_seen": 14785984, + "step": 70065 + }, + { + "epoch": 7.708470847084708, + "grad_norm": 0.09276522696018219, + "learning_rate": 3.858536582370586e-05, + "loss": 0.0079, + "num_input_tokens_seen": 14787008, + "step": 70070 + }, + { + "epoch": 7.709020902090209, + "grad_norm": 0.12574604153633118, + "learning_rate": 3.8583350989605916e-05, + "loss": 0.0076, + "num_input_tokens_seen": 14788032, + "step": 70075 + }, + { + "epoch": 7.70957095709571, + "grad_norm": 0.05051020532846451, + "learning_rate": 3.858133603031475e-05, + "loss": 0.046, + "num_input_tokens_seen": 14789152, + "step": 70080 + }, + { + "epoch": 7.71012101210121, + "grad_norm": 1.7842001914978027, + "learning_rate": 3.857932094585092e-05, + "loss": 0.1085, + "num_input_tokens_seen": 14790208, + "step": 70085 + }, + { + "epoch": 7.710671067106711, + "grad_norm": 0.09696069359779358, + "learning_rate": 3.8577305736233e-05, + "loss": 0.0145, + "num_input_tokens_seen": 14791264, + "step": 70090 + }, + { + "epoch": 7.711221122112211, + "grad_norm": 1.1427477598190308, + "learning_rate": 3.8575290401479586e-05, + "loss": 0.0735, + "num_input_tokens_seen": 14792352, + "step": 70095 + }, + { + "epoch": 7.711771177117711, + "grad_norm": 0.13038524985313416, + "learning_rate": 3.8573274941609225e-05, + "loss": 0.0108, + "num_input_tokens_seen": 14793472, + "step": 70100 + }, + { + "epoch": 7.712321232123212, + "grad_norm": 0.016577476635575294, + "learning_rate": 3.85712593566405e-05, + "loss": 0.0126, + "num_input_tokens_seen": 14794496, + "step": 70105 + }, + { + "epoch": 7.712871287128713, + "grad_norm": 0.19945642352104187, + "learning_rate": 3.856924364659199e-05, + "loss": 0.0075, + "num_input_tokens_seen": 14795520, + "step": 70110 + }, + { + "epoch": 7.713421342134214, + "grad_norm": 0.05114151909947395, + "learning_rate": 3.8567227811482276e-05, + "loss": 0.0223, + "num_input_tokens_seen": 14796608, + "step": 70115 + }, + { + "epoch": 7.713971397139714, + "grad_norm": 0.02198196016252041, + "learning_rate": 3.856521185132993e-05, + "loss": 0.0311, + "num_input_tokens_seen": 14797600, + "step": 70120 + }, + { + "epoch": 7.714521452145215, + "grad_norm": 0.421170711517334, + "learning_rate": 3.8563195766153535e-05, + "loss": 0.0114, + "num_input_tokens_seen": 14798592, + "step": 70125 + }, + { + "epoch": 7.715071507150715, + "grad_norm": 0.4311959743499756, + "learning_rate": 3.856117955597168e-05, + "loss": 0.0293, + "num_input_tokens_seen": 14799648, + "step": 70130 + }, + { + "epoch": 7.715621562156215, + "grad_norm": 0.07327806949615479, + "learning_rate": 3.855916322080293e-05, + "loss": 0.0417, + "num_input_tokens_seen": 14800736, + "step": 70135 + }, + { + "epoch": 7.7161716171617165, + "grad_norm": 0.5616374015808105, + "learning_rate": 3.8557146760665886e-05, + "loss": 0.0109, + "num_input_tokens_seen": 14801856, + "step": 70140 + }, + { + "epoch": 7.716721672167217, + "grad_norm": 0.01437312737107277, + "learning_rate": 3.8555130175579123e-05, + "loss": 0.0284, + "num_input_tokens_seen": 14802944, + "step": 70145 + }, + { + "epoch": 7.717271727172717, + "grad_norm": 0.027849730104207993, + "learning_rate": 3.855311346556123e-05, + "loss": 0.0033, + "num_input_tokens_seen": 14804032, + "step": 70150 + }, + { + "epoch": 7.717821782178218, + "grad_norm": 0.5790640711784363, + "learning_rate": 3.855109663063079e-05, + "loss": 0.0226, + "num_input_tokens_seen": 14805120, + "step": 70155 + }, + { + "epoch": 7.718371837183718, + "grad_norm": 0.12848706543445587, + "learning_rate": 3.85490796708064e-05, + "loss": 0.0084, + "num_input_tokens_seen": 14806208, + "step": 70160 + }, + { + "epoch": 7.718921892189218, + "grad_norm": 0.0038910778239369392, + "learning_rate": 3.854706258610664e-05, + "loss": 0.0239, + "num_input_tokens_seen": 14807264, + "step": 70165 + }, + { + "epoch": 7.7194719471947195, + "grad_norm": 0.13384799659252167, + "learning_rate": 3.854504537655012e-05, + "loss": 0.0054, + "num_input_tokens_seen": 14808320, + "step": 70170 + }, + { + "epoch": 7.72002200220022, + "grad_norm": 0.22412537038326263, + "learning_rate": 3.85430280421554e-05, + "loss": 0.102, + "num_input_tokens_seen": 14809440, + "step": 70175 + }, + { + "epoch": 7.720572057205721, + "grad_norm": 0.015134909190237522, + "learning_rate": 3.854101058294108e-05, + "loss": 0.0731, + "num_input_tokens_seen": 14810496, + "step": 70180 + }, + { + "epoch": 7.721122112211221, + "grad_norm": 1.2530337572097778, + "learning_rate": 3.8538992998925785e-05, + "loss": 0.0929, + "num_input_tokens_seen": 14811584, + "step": 70185 + }, + { + "epoch": 7.721672167216722, + "grad_norm": 1.1093193292617798, + "learning_rate": 3.853697529012808e-05, + "loss": 0.0599, + "num_input_tokens_seen": 14812608, + "step": 70190 + }, + { + "epoch": 7.722222222222222, + "grad_norm": 0.6189627051353455, + "learning_rate": 3.8534957456566566e-05, + "loss": 0.026, + "num_input_tokens_seen": 14813664, + "step": 70195 + }, + { + "epoch": 7.7227722772277225, + "grad_norm": 0.03882245719432831, + "learning_rate": 3.8532939498259845e-05, + "loss": 0.0369, + "num_input_tokens_seen": 14814784, + "step": 70200 + }, + { + "epoch": 7.723322332233224, + "grad_norm": 0.0033598202280700207, + "learning_rate": 3.853092141522652e-05, + "loss": 0.0266, + "num_input_tokens_seen": 14815904, + "step": 70205 + }, + { + "epoch": 7.723872387238724, + "grad_norm": 0.4394903779029846, + "learning_rate": 3.852890320748518e-05, + "loss": 0.0846, + "num_input_tokens_seen": 14816960, + "step": 70210 + }, + { + "epoch": 7.724422442244224, + "grad_norm": 0.3240152597427368, + "learning_rate": 3.852688487505443e-05, + "loss": 0.0356, + "num_input_tokens_seen": 14818048, + "step": 70215 + }, + { + "epoch": 7.724972497249725, + "grad_norm": 0.05653784051537514, + "learning_rate": 3.852486641795288e-05, + "loss": 0.0336, + "num_input_tokens_seen": 14819168, + "step": 70220 + }, + { + "epoch": 7.725522552255225, + "grad_norm": 0.012615546584129333, + "learning_rate": 3.8522847836199116e-05, + "loss": 0.0158, + "num_input_tokens_seen": 14820224, + "step": 70225 + }, + { + "epoch": 7.726072607260726, + "grad_norm": 0.014482027851045132, + "learning_rate": 3.8520829129811755e-05, + "loss": 0.03, + "num_input_tokens_seen": 14821344, + "step": 70230 + }, + { + "epoch": 7.726622662266227, + "grad_norm": 0.08157660067081451, + "learning_rate": 3.85188102988094e-05, + "loss": 0.0171, + "num_input_tokens_seen": 14822464, + "step": 70235 + }, + { + "epoch": 7.727172717271727, + "grad_norm": 0.04736536368727684, + "learning_rate": 3.851679134321066e-05, + "loss": 0.0165, + "num_input_tokens_seen": 14823616, + "step": 70240 + }, + { + "epoch": 7.727722772277228, + "grad_norm": 0.013304024934768677, + "learning_rate": 3.851477226303414e-05, + "loss": 0.0023, + "num_input_tokens_seen": 14824640, + "step": 70245 + }, + { + "epoch": 7.728272827282728, + "grad_norm": 0.07087397575378418, + "learning_rate": 3.8512753058298445e-05, + "loss": 0.0737, + "num_input_tokens_seen": 14825728, + "step": 70250 + }, + { + "epoch": 7.728822882288229, + "grad_norm": 0.006525163538753986, + "learning_rate": 3.85107337290222e-05, + "loss": 0.037, + "num_input_tokens_seen": 14826784, + "step": 70255 + }, + { + "epoch": 7.729372937293729, + "grad_norm": 0.04621646925806999, + "learning_rate": 3.8508714275223994e-05, + "loss": 0.0302, + "num_input_tokens_seen": 14827840, + "step": 70260 + }, + { + "epoch": 7.72992299229923, + "grad_norm": 0.024794623255729675, + "learning_rate": 3.850669469692246e-05, + "loss": 0.0044, + "num_input_tokens_seen": 14828896, + "step": 70265 + }, + { + "epoch": 7.730473047304731, + "grad_norm": 0.09810446947813034, + "learning_rate": 3.850467499413619e-05, + "loss": 0.0196, + "num_input_tokens_seen": 14829920, + "step": 70270 + }, + { + "epoch": 7.731023102310231, + "grad_norm": 0.04886331409215927, + "learning_rate": 3.850265516688382e-05, + "loss": 0.0076, + "num_input_tokens_seen": 14831008, + "step": 70275 + }, + { + "epoch": 7.731573157315731, + "grad_norm": 0.09117792546749115, + "learning_rate": 3.8500635215183954e-05, + "loss": 0.0272, + "num_input_tokens_seen": 14832064, + "step": 70280 + }, + { + "epoch": 7.732123212321232, + "grad_norm": 0.25519320368766785, + "learning_rate": 3.849861513905521e-05, + "loss": 0.121, + "num_input_tokens_seen": 14833120, + "step": 70285 + }, + { + "epoch": 7.732673267326732, + "grad_norm": 0.06603186577558517, + "learning_rate": 3.849659493851621e-05, + "loss": 0.132, + "num_input_tokens_seen": 14834080, + "step": 70290 + }, + { + "epoch": 7.7332233223322335, + "grad_norm": 0.06540407240390778, + "learning_rate": 3.849457461358556e-05, + "loss": 0.0029, + "num_input_tokens_seen": 14835072, + "step": 70295 + }, + { + "epoch": 7.733773377337734, + "grad_norm": 1.216983437538147, + "learning_rate": 3.849255416428191e-05, + "loss": 0.087, + "num_input_tokens_seen": 14836160, + "step": 70300 + }, + { + "epoch": 7.734323432343234, + "grad_norm": 0.0059507787227630615, + "learning_rate": 3.8490533590623854e-05, + "loss": 0.0673, + "num_input_tokens_seen": 14837216, + "step": 70305 + }, + { + "epoch": 7.734873487348735, + "grad_norm": 1.4618409872055054, + "learning_rate": 3.848851289263003e-05, + "loss": 0.183, + "num_input_tokens_seen": 14838272, + "step": 70310 + }, + { + "epoch": 7.735423542354235, + "grad_norm": 0.10925088077783585, + "learning_rate": 3.848649207031904e-05, + "loss": 0.0119, + "num_input_tokens_seen": 14839264, + "step": 70315 + }, + { + "epoch": 7.735973597359736, + "grad_norm": 0.9467993974685669, + "learning_rate": 3.848447112370953e-05, + "loss": 0.0614, + "num_input_tokens_seen": 14840352, + "step": 70320 + }, + { + "epoch": 7.7365236523652365, + "grad_norm": 0.045073967427015305, + "learning_rate": 3.848245005282013e-05, + "loss": 0.0413, + "num_input_tokens_seen": 14841408, + "step": 70325 + }, + { + "epoch": 7.737073707370737, + "grad_norm": 0.2434035837650299, + "learning_rate": 3.848042885766944e-05, + "loss": 0.069, + "num_input_tokens_seen": 14842496, + "step": 70330 + }, + { + "epoch": 7.737623762376238, + "grad_norm": 0.018393583595752716, + "learning_rate": 3.8478407538276116e-05, + "loss": 0.0091, + "num_input_tokens_seen": 14843520, + "step": 70335 + }, + { + "epoch": 7.738173817381738, + "grad_norm": 0.8536286354064941, + "learning_rate": 3.8476386094658776e-05, + "loss": 0.1389, + "num_input_tokens_seen": 14844544, + "step": 70340 + }, + { + "epoch": 7.738723872387238, + "grad_norm": 0.02560214139521122, + "learning_rate": 3.847436452683605e-05, + "loss": 0.0152, + "num_input_tokens_seen": 14845600, + "step": 70345 + }, + { + "epoch": 7.739273927392739, + "grad_norm": 0.7041303515434265, + "learning_rate": 3.847234283482658e-05, + "loss": 0.021, + "num_input_tokens_seen": 14846624, + "step": 70350 + }, + { + "epoch": 7.7398239823982395, + "grad_norm": 0.05835716798901558, + "learning_rate": 3.847032101864898e-05, + "loss": 0.0204, + "num_input_tokens_seen": 14847712, + "step": 70355 + }, + { + "epoch": 7.740374037403741, + "grad_norm": 0.1433085948228836, + "learning_rate": 3.84682990783219e-05, + "loss": 0.051, + "num_input_tokens_seen": 14848736, + "step": 70360 + }, + { + "epoch": 7.740924092409241, + "grad_norm": 0.1169290691614151, + "learning_rate": 3.846627701386397e-05, + "loss": 0.0125, + "num_input_tokens_seen": 14849792, + "step": 70365 + }, + { + "epoch": 7.741474147414742, + "grad_norm": 1.4346503019332886, + "learning_rate": 3.846425482529383e-05, + "loss": 0.0654, + "num_input_tokens_seen": 14850848, + "step": 70370 + }, + { + "epoch": 7.742024202420242, + "grad_norm": 0.010365558788180351, + "learning_rate": 3.8462232512630106e-05, + "loss": 0.0078, + "num_input_tokens_seen": 14851904, + "step": 70375 + }, + { + "epoch": 7.742574257425742, + "grad_norm": 0.12760727107524872, + "learning_rate": 3.846021007589146e-05, + "loss": 0.0609, + "num_input_tokens_seen": 14852928, + "step": 70380 + }, + { + "epoch": 7.743124312431243, + "grad_norm": 0.20780326426029205, + "learning_rate": 3.84581875150965e-05, + "loss": 0.0778, + "num_input_tokens_seen": 14853984, + "step": 70385 + }, + { + "epoch": 7.743674367436744, + "grad_norm": 0.4375588297843933, + "learning_rate": 3.8456164830263885e-05, + "loss": 0.04, + "num_input_tokens_seen": 14854944, + "step": 70390 + }, + { + "epoch": 7.744224422442244, + "grad_norm": 0.08641955256462097, + "learning_rate": 3.8454142021412264e-05, + "loss": 0.094, + "num_input_tokens_seen": 14856000, + "step": 70395 + }, + { + "epoch": 7.744774477447745, + "grad_norm": 0.03347213938832283, + "learning_rate": 3.8452119088560265e-05, + "loss": 0.0096, + "num_input_tokens_seen": 14857056, + "step": 70400 + }, + { + "epoch": 7.745324532453245, + "grad_norm": 0.06933426856994629, + "learning_rate": 3.845009603172654e-05, + "loss": 0.0371, + "num_input_tokens_seen": 14858176, + "step": 70405 + }, + { + "epoch": 7.745874587458746, + "grad_norm": 0.07481477409601212, + "learning_rate": 3.844807285092974e-05, + "loss": 0.0101, + "num_input_tokens_seen": 14859200, + "step": 70410 + }, + { + "epoch": 7.7464246424642464, + "grad_norm": 0.10402249544858932, + "learning_rate": 3.84460495461885e-05, + "loss": 0.0208, + "num_input_tokens_seen": 14860288, + "step": 70415 + }, + { + "epoch": 7.746974697469747, + "grad_norm": 0.05677751451730728, + "learning_rate": 3.844402611752147e-05, + "loss": 0.0125, + "num_input_tokens_seen": 14861344, + "step": 70420 + }, + { + "epoch": 7.747524752475248, + "grad_norm": 0.02193984016776085, + "learning_rate": 3.844200256494731e-05, + "loss": 0.0109, + "num_input_tokens_seen": 14862304, + "step": 70425 + }, + { + "epoch": 7.748074807480748, + "grad_norm": 0.011446989141404629, + "learning_rate": 3.8439978888484664e-05, + "loss": 0.0042, + "num_input_tokens_seen": 14863392, + "step": 70430 + }, + { + "epoch": 7.748624862486249, + "grad_norm": 1.252547025680542, + "learning_rate": 3.843795508815218e-05, + "loss": 0.0653, + "num_input_tokens_seen": 14864384, + "step": 70435 + }, + { + "epoch": 7.749174917491749, + "grad_norm": 0.2912446856498718, + "learning_rate": 3.843593116396851e-05, + "loss": 0.0668, + "num_input_tokens_seen": 14865472, + "step": 70440 + }, + { + "epoch": 7.7497249724972495, + "grad_norm": 0.4968110918998718, + "learning_rate": 3.84339071159523e-05, + "loss": 0.0736, + "num_input_tokens_seen": 14866496, + "step": 70445 + }, + { + "epoch": 7.7502750275027505, + "grad_norm": 0.326486736536026, + "learning_rate": 3.843188294412223e-05, + "loss": 0.0077, + "num_input_tokens_seen": 14867456, + "step": 70450 + }, + { + "epoch": 7.750825082508251, + "grad_norm": 0.03159954398870468, + "learning_rate": 3.842985864849693e-05, + "loss": 0.0028, + "num_input_tokens_seen": 14868480, + "step": 70455 + }, + { + "epoch": 7.751375137513751, + "grad_norm": 0.0120593486353755, + "learning_rate": 3.8427834229095075e-05, + "loss": 0.1155, + "num_input_tokens_seen": 14869536, + "step": 70460 + }, + { + "epoch": 7.751925192519252, + "grad_norm": 0.012980123050510883, + "learning_rate": 3.842580968593531e-05, + "loss": 0.0503, + "num_input_tokens_seen": 14870624, + "step": 70465 + }, + { + "epoch": 7.752475247524752, + "grad_norm": 0.490947961807251, + "learning_rate": 3.8423785019036295e-05, + "loss": 0.0129, + "num_input_tokens_seen": 14871680, + "step": 70470 + }, + { + "epoch": 7.753025302530253, + "grad_norm": 0.3729015290737152, + "learning_rate": 3.84217602284167e-05, + "loss": 0.0472, + "num_input_tokens_seen": 14872672, + "step": 70475 + }, + { + "epoch": 7.7535753575357536, + "grad_norm": 0.20598511397838593, + "learning_rate": 3.841973531409518e-05, + "loss": 0.0275, + "num_input_tokens_seen": 14873728, + "step": 70480 + }, + { + "epoch": 7.754125412541254, + "grad_norm": 0.011010982096195221, + "learning_rate": 3.8417710276090407e-05, + "loss": 0.0216, + "num_input_tokens_seen": 14874816, + "step": 70485 + }, + { + "epoch": 7.754675467546755, + "grad_norm": 0.01863504946231842, + "learning_rate": 3.841568511442103e-05, + "loss": 0.0034, + "num_input_tokens_seen": 14875872, + "step": 70490 + }, + { + "epoch": 7.755225522552255, + "grad_norm": 0.007859901525080204, + "learning_rate": 3.841365982910572e-05, + "loss": 0.0187, + "num_input_tokens_seen": 14876896, + "step": 70495 + }, + { + "epoch": 7.755775577557756, + "grad_norm": 1.2959280014038086, + "learning_rate": 3.841163442016315e-05, + "loss": 0.0874, + "num_input_tokens_seen": 14877888, + "step": 70500 + }, + { + "epoch": 7.756325632563256, + "grad_norm": 0.03253708779811859, + "learning_rate": 3.840960888761197e-05, + "loss": 0.0782, + "num_input_tokens_seen": 14878912, + "step": 70505 + }, + { + "epoch": 7.756875687568757, + "grad_norm": 0.10301491618156433, + "learning_rate": 3.840758323147087e-05, + "loss": 0.0502, + "num_input_tokens_seen": 14879968, + "step": 70510 + }, + { + "epoch": 7.757425742574258, + "grad_norm": 0.03559134155511856, + "learning_rate": 3.84055574517585e-05, + "loss": 0.0081, + "num_input_tokens_seen": 14880960, + "step": 70515 + }, + { + "epoch": 7.757975797579758, + "grad_norm": 0.236698180437088, + "learning_rate": 3.8403531548493546e-05, + "loss": 0.0377, + "num_input_tokens_seen": 14882048, + "step": 70520 + }, + { + "epoch": 7.758525852585258, + "grad_norm": 0.5981996059417725, + "learning_rate": 3.8401505521694675e-05, + "loss": 0.0729, + "num_input_tokens_seen": 14883104, + "step": 70525 + }, + { + "epoch": 7.759075907590759, + "grad_norm": 0.10139118880033493, + "learning_rate": 3.839947937138055e-05, + "loss": 0.004, + "num_input_tokens_seen": 14884128, + "step": 70530 + }, + { + "epoch": 7.759625962596259, + "grad_norm": 0.1457580029964447, + "learning_rate": 3.839745309756986e-05, + "loss": 0.0084, + "num_input_tokens_seen": 14885184, + "step": 70535 + }, + { + "epoch": 7.7601760176017605, + "grad_norm": 1.0260125398635864, + "learning_rate": 3.839542670028127e-05, + "loss": 0.0708, + "num_input_tokens_seen": 14886240, + "step": 70540 + }, + { + "epoch": 7.760726072607261, + "grad_norm": 0.25352370738983154, + "learning_rate": 3.839340017953347e-05, + "loss": 0.0442, + "num_input_tokens_seen": 14887296, + "step": 70545 + }, + { + "epoch": 7.761276127612762, + "grad_norm": 0.18509721755981445, + "learning_rate": 3.839137353534511e-05, + "loss": 0.1019, + "num_input_tokens_seen": 14888352, + "step": 70550 + }, + { + "epoch": 7.761826182618262, + "grad_norm": 0.06291862577199936, + "learning_rate": 3.83893467677349e-05, + "loss": 0.0044, + "num_input_tokens_seen": 14889376, + "step": 70555 + }, + { + "epoch": 7.762376237623762, + "grad_norm": 0.06841875612735748, + "learning_rate": 3.8387319876721496e-05, + "loss": 0.144, + "num_input_tokens_seen": 14890400, + "step": 70560 + }, + { + "epoch": 7.762926292629263, + "grad_norm": 0.2929542660713196, + "learning_rate": 3.838529286232359e-05, + "loss": 0.0412, + "num_input_tokens_seen": 14891456, + "step": 70565 + }, + { + "epoch": 7.7634763476347635, + "grad_norm": 0.014319286681711674, + "learning_rate": 3.838326572455987e-05, + "loss": 0.0107, + "num_input_tokens_seen": 14892512, + "step": 70570 + }, + { + "epoch": 7.764026402640264, + "grad_norm": 0.04798700287938118, + "learning_rate": 3.838123846344901e-05, + "loss": 0.0567, + "num_input_tokens_seen": 14893568, + "step": 70575 + }, + { + "epoch": 7.764576457645765, + "grad_norm": 0.023289402946829796, + "learning_rate": 3.837921107900969e-05, + "loss": 0.0305, + "num_input_tokens_seen": 14894624, + "step": 70580 + }, + { + "epoch": 7.765126512651265, + "grad_norm": 1.341059684753418, + "learning_rate": 3.8377183571260605e-05, + "loss": 0.0601, + "num_input_tokens_seen": 14895648, + "step": 70585 + }, + { + "epoch": 7.765676567656766, + "grad_norm": 0.13703574240207672, + "learning_rate": 3.8375155940220444e-05, + "loss": 0.0061, + "num_input_tokens_seen": 14896736, + "step": 70590 + }, + { + "epoch": 7.766226622662266, + "grad_norm": 0.22690363228321075, + "learning_rate": 3.837312818590788e-05, + "loss": 0.0646, + "num_input_tokens_seen": 14897792, + "step": 70595 + }, + { + "epoch": 7.7667766776677665, + "grad_norm": 1.0069297552108765, + "learning_rate": 3.837110030834161e-05, + "loss": 0.0389, + "num_input_tokens_seen": 14898816, + "step": 70600 + }, + { + "epoch": 7.767326732673268, + "grad_norm": 0.9074820280075073, + "learning_rate": 3.836907230754033e-05, + "loss": 0.1157, + "num_input_tokens_seen": 14899872, + "step": 70605 + }, + { + "epoch": 7.767876787678768, + "grad_norm": 0.4535698890686035, + "learning_rate": 3.836704418352271e-05, + "loss": 0.0691, + "num_input_tokens_seen": 14900864, + "step": 70610 + }, + { + "epoch": 7.768426842684269, + "grad_norm": 1.1934500932693481, + "learning_rate": 3.836501593630748e-05, + "loss": 0.015, + "num_input_tokens_seen": 14901920, + "step": 70615 + }, + { + "epoch": 7.768976897689769, + "grad_norm": 0.08426688611507416, + "learning_rate": 3.83629875659133e-05, + "loss": 0.0372, + "num_input_tokens_seen": 14902944, + "step": 70620 + }, + { + "epoch": 7.769526952695269, + "grad_norm": 0.26435285806655884, + "learning_rate": 3.836095907235888e-05, + "loss": 0.0444, + "num_input_tokens_seen": 14904000, + "step": 70625 + }, + { + "epoch": 7.77007700770077, + "grad_norm": 0.07624831050634384, + "learning_rate": 3.835893045566291e-05, + "loss": 0.0036, + "num_input_tokens_seen": 14905088, + "step": 70630 + }, + { + "epoch": 7.770627062706271, + "grad_norm": 0.01492159254848957, + "learning_rate": 3.835690171584409e-05, + "loss": 0.0065, + "num_input_tokens_seen": 14906176, + "step": 70635 + }, + { + "epoch": 7.771177117711771, + "grad_norm": 0.025568736717104912, + "learning_rate": 3.835487285292112e-05, + "loss": 0.004, + "num_input_tokens_seen": 14907168, + "step": 70640 + }, + { + "epoch": 7.771727172717272, + "grad_norm": 0.38104891777038574, + "learning_rate": 3.835284386691268e-05, + "loss": 0.1176, + "num_input_tokens_seen": 14908288, + "step": 70645 + }, + { + "epoch": 7.772277227722772, + "grad_norm": 0.04602549970149994, + "learning_rate": 3.8350814757837495e-05, + "loss": 0.042, + "num_input_tokens_seen": 14909408, + "step": 70650 + }, + { + "epoch": 7.772827282728273, + "grad_norm": 0.37309902906417847, + "learning_rate": 3.834878552571426e-05, + "loss": 0.0225, + "num_input_tokens_seen": 14910400, + "step": 70655 + }, + { + "epoch": 7.773377337733773, + "grad_norm": 1.6619271039962769, + "learning_rate": 3.834675617056167e-05, + "loss": 0.0685, + "num_input_tokens_seen": 14911456, + "step": 70660 + }, + { + "epoch": 7.773927392739274, + "grad_norm": 0.06557886302471161, + "learning_rate": 3.834472669239843e-05, + "loss": 0.0102, + "num_input_tokens_seen": 14912480, + "step": 70665 + }, + { + "epoch": 7.774477447744775, + "grad_norm": 0.008658448234200478, + "learning_rate": 3.834269709124325e-05, + "loss": 0.0496, + "num_input_tokens_seen": 14913536, + "step": 70670 + }, + { + "epoch": 7.775027502750275, + "grad_norm": 1.111063838005066, + "learning_rate": 3.834066736711484e-05, + "loss": 0.0735, + "num_input_tokens_seen": 14914592, + "step": 70675 + }, + { + "epoch": 7.775577557755776, + "grad_norm": 0.7675957679748535, + "learning_rate": 3.833863752003189e-05, + "loss": 0.0537, + "num_input_tokens_seen": 14915616, + "step": 70680 + }, + { + "epoch": 7.776127612761276, + "grad_norm": 0.4157085418701172, + "learning_rate": 3.833660755001313e-05, + "loss": 0.0381, + "num_input_tokens_seen": 14916640, + "step": 70685 + }, + { + "epoch": 7.776677667766776, + "grad_norm": 0.021751513704657555, + "learning_rate": 3.833457745707725e-05, + "loss": 0.0064, + "num_input_tokens_seen": 14917664, + "step": 70690 + }, + { + "epoch": 7.7772277227722775, + "grad_norm": 0.2972389757633209, + "learning_rate": 3.833254724124297e-05, + "loss": 0.0065, + "num_input_tokens_seen": 14918752, + "step": 70695 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.19087031483650208, + "learning_rate": 3.833051690252899e-05, + "loss": 0.0425, + "num_input_tokens_seen": 14919744, + "step": 70700 + }, + { + "epoch": 7.778327832783278, + "grad_norm": 0.03528348356485367, + "learning_rate": 3.832848644095405e-05, + "loss": 0.0871, + "num_input_tokens_seen": 14920800, + "step": 70705 + }, + { + "epoch": 7.778877887788779, + "grad_norm": 0.15454906225204468, + "learning_rate": 3.832645585653682e-05, + "loss": 0.0118, + "num_input_tokens_seen": 14921856, + "step": 70710 + }, + { + "epoch": 7.779427942794279, + "grad_norm": 0.316581666469574, + "learning_rate": 3.832442514929607e-05, + "loss": 0.0299, + "num_input_tokens_seen": 14922880, + "step": 70715 + }, + { + "epoch": 7.77997799779978, + "grad_norm": 1.0333789587020874, + "learning_rate": 3.832239431925047e-05, + "loss": 0.0451, + "num_input_tokens_seen": 14923840, + "step": 70720 + }, + { + "epoch": 7.7805280528052805, + "grad_norm": 0.034413475543260574, + "learning_rate": 3.832036336641877e-05, + "loss": 0.0069, + "num_input_tokens_seen": 14924896, + "step": 70725 + }, + { + "epoch": 7.781078107810782, + "grad_norm": 0.06099377200007439, + "learning_rate": 3.831833229081966e-05, + "loss": 0.0221, + "num_input_tokens_seen": 14925984, + "step": 70730 + }, + { + "epoch": 7.781628162816282, + "grad_norm": 0.0050437962636351585, + "learning_rate": 3.8316301092471875e-05, + "loss": 0.0091, + "num_input_tokens_seen": 14927040, + "step": 70735 + }, + { + "epoch": 7.782178217821782, + "grad_norm": 0.042637888342142105, + "learning_rate": 3.831426977139413e-05, + "loss": 0.0203, + "num_input_tokens_seen": 14928064, + "step": 70740 + }, + { + "epoch": 7.782728272827283, + "grad_norm": 1.0007843971252441, + "learning_rate": 3.831223832760515e-05, + "loss": 0.0857, + "num_input_tokens_seen": 14929120, + "step": 70745 + }, + { + "epoch": 7.783278327832783, + "grad_norm": 0.3451541066169739, + "learning_rate": 3.8310206761123655e-05, + "loss": 0.0366, + "num_input_tokens_seen": 14930176, + "step": 70750 + }, + { + "epoch": 7.7838283828382835, + "grad_norm": 0.33092010021209717, + "learning_rate": 3.830817507196837e-05, + "loss": 0.0111, + "num_input_tokens_seen": 14931232, + "step": 70755 + }, + { + "epoch": 7.784378437843785, + "grad_norm": 4.565337657928467, + "learning_rate": 3.830614326015803e-05, + "loss": 0.0778, + "num_input_tokens_seen": 14932256, + "step": 70760 + }, + { + "epoch": 7.784928492849285, + "grad_norm": 0.0372001938521862, + "learning_rate": 3.830411132571135e-05, + "loss": 0.0064, + "num_input_tokens_seen": 14933280, + "step": 70765 + }, + { + "epoch": 7.785478547854785, + "grad_norm": 0.24475891888141632, + "learning_rate": 3.830207926864705e-05, + "loss": 0.0113, + "num_input_tokens_seen": 14934368, + "step": 70770 + }, + { + "epoch": 7.786028602860286, + "grad_norm": 0.017552722245454788, + "learning_rate": 3.830004708898388e-05, + "loss": 0.01, + "num_input_tokens_seen": 14935488, + "step": 70775 + }, + { + "epoch": 7.786578657865786, + "grad_norm": 0.1833542138338089, + "learning_rate": 3.8298014786740556e-05, + "loss": 0.0073, + "num_input_tokens_seen": 14936512, + "step": 70780 + }, + { + "epoch": 7.787128712871287, + "grad_norm": 0.04370281100273132, + "learning_rate": 3.8295982361935814e-05, + "loss": 0.0056, + "num_input_tokens_seen": 14937504, + "step": 70785 + }, + { + "epoch": 7.787678767876788, + "grad_norm": 0.655175507068634, + "learning_rate": 3.829394981458837e-05, + "loss": 0.0227, + "num_input_tokens_seen": 14938624, + "step": 70790 + }, + { + "epoch": 7.788228822882289, + "grad_norm": 0.20412577688694, + "learning_rate": 3.829191714471698e-05, + "loss": 0.0096, + "num_input_tokens_seen": 14939712, + "step": 70795 + }, + { + "epoch": 7.788778877887789, + "grad_norm": 0.1753966510295868, + "learning_rate": 3.828988435234036e-05, + "loss": 0.0132, + "num_input_tokens_seen": 14940800, + "step": 70800 + }, + { + "epoch": 7.789328932893289, + "grad_norm": 0.07370790839195251, + "learning_rate": 3.828785143747726e-05, + "loss": 0.01, + "num_input_tokens_seen": 14941888, + "step": 70805 + }, + { + "epoch": 7.78987898789879, + "grad_norm": 0.7286133170127869, + "learning_rate": 3.828581840014641e-05, + "loss": 0.0271, + "num_input_tokens_seen": 14942912, + "step": 70810 + }, + { + "epoch": 7.79042904290429, + "grad_norm": 0.14824329316616058, + "learning_rate": 3.8283785240366544e-05, + "loss": 0.008, + "num_input_tokens_seen": 14943936, + "step": 70815 + }, + { + "epoch": 7.790979097909791, + "grad_norm": 0.02155277691781521, + "learning_rate": 3.82817519581564e-05, + "loss": 0.1141, + "num_input_tokens_seen": 14944992, + "step": 70820 + }, + { + "epoch": 7.791529152915292, + "grad_norm": 0.03126048296689987, + "learning_rate": 3.8279718553534725e-05, + "loss": 0.0488, + "num_input_tokens_seen": 14946080, + "step": 70825 + }, + { + "epoch": 7.792079207920792, + "grad_norm": 0.04373779147863388, + "learning_rate": 3.827768502652026e-05, + "loss": 0.1162, + "num_input_tokens_seen": 14947168, + "step": 70830 + }, + { + "epoch": 7.792629262926293, + "grad_norm": 0.01884778030216694, + "learning_rate": 3.827565137713174e-05, + "loss": 0.0386, + "num_input_tokens_seen": 14948224, + "step": 70835 + }, + { + "epoch": 7.793179317931793, + "grad_norm": 0.04112083837389946, + "learning_rate": 3.827361760538791e-05, + "loss": 0.0719, + "num_input_tokens_seen": 14949248, + "step": 70840 + }, + { + "epoch": 7.793729372937293, + "grad_norm": 0.012053684331476688, + "learning_rate": 3.827158371130752e-05, + "loss": 0.0427, + "num_input_tokens_seen": 14950336, + "step": 70845 + }, + { + "epoch": 7.7942794279427945, + "grad_norm": 0.007549323607236147, + "learning_rate": 3.8269549694909304e-05, + "loss": 0.0073, + "num_input_tokens_seen": 14951360, + "step": 70850 + }, + { + "epoch": 7.794829482948295, + "grad_norm": 0.16124852001667023, + "learning_rate": 3.826751555621202e-05, + "loss": 0.0303, + "num_input_tokens_seen": 14952384, + "step": 70855 + }, + { + "epoch": 7.795379537953796, + "grad_norm": 0.16794733703136444, + "learning_rate": 3.826548129523442e-05, + "loss": 0.019, + "num_input_tokens_seen": 14953440, + "step": 70860 + }, + { + "epoch": 7.795929592959296, + "grad_norm": 0.8771539926528931, + "learning_rate": 3.8263446911995234e-05, + "loss": 0.1221, + "num_input_tokens_seen": 14954528, + "step": 70865 + }, + { + "epoch": 7.796479647964796, + "grad_norm": 0.9709321856498718, + "learning_rate": 3.826141240651322e-05, + "loss": 0.0145, + "num_input_tokens_seen": 14955584, + "step": 70870 + }, + { + "epoch": 7.797029702970297, + "grad_norm": 1.2754744291305542, + "learning_rate": 3.825937777880714e-05, + "loss": 0.0302, + "num_input_tokens_seen": 14956736, + "step": 70875 + }, + { + "epoch": 7.7975797579757975, + "grad_norm": 1.6290174722671509, + "learning_rate": 3.825734302889573e-05, + "loss": 0.0492, + "num_input_tokens_seen": 14957760, + "step": 70880 + }, + { + "epoch": 7.798129812981298, + "grad_norm": 1.0617448091506958, + "learning_rate": 3.825530815679775e-05, + "loss": 0.0228, + "num_input_tokens_seen": 14958848, + "step": 70885 + }, + { + "epoch": 7.798679867986799, + "grad_norm": 0.049792807549238205, + "learning_rate": 3.825327316253196e-05, + "loss": 0.0098, + "num_input_tokens_seen": 14959840, + "step": 70890 + }, + { + "epoch": 7.799229922992299, + "grad_norm": 0.013825511559844017, + "learning_rate": 3.825123804611711e-05, + "loss": 0.0899, + "num_input_tokens_seen": 14960928, + "step": 70895 + }, + { + "epoch": 7.7997799779978, + "grad_norm": 0.003813371993601322, + "learning_rate": 3.824920280757196e-05, + "loss": 0.0089, + "num_input_tokens_seen": 14961952, + "step": 70900 + }, + { + "epoch": 7.8003300330033, + "grad_norm": 0.015335461124777794, + "learning_rate": 3.824716744691527e-05, + "loss": 0.071, + "num_input_tokens_seen": 14963104, + "step": 70905 + }, + { + "epoch": 7.8008800880088005, + "grad_norm": 0.048669204115867615, + "learning_rate": 3.824513196416578e-05, + "loss": 0.1976, + "num_input_tokens_seen": 14964096, + "step": 70910 + }, + { + "epoch": 7.801430143014302, + "grad_norm": 0.03769839182496071, + "learning_rate": 3.8243096359342265e-05, + "loss": 0.0595, + "num_input_tokens_seen": 14965152, + "step": 70915 + }, + { + "epoch": 7.801980198019802, + "grad_norm": 0.027155425399541855, + "learning_rate": 3.82410606324635e-05, + "loss": 0.0889, + "num_input_tokens_seen": 14966176, + "step": 70920 + }, + { + "epoch": 7.802530253025303, + "grad_norm": 0.04492851346731186, + "learning_rate": 3.8239024783548224e-05, + "loss": 0.0481, + "num_input_tokens_seen": 14967264, + "step": 70925 + }, + { + "epoch": 7.803080308030803, + "grad_norm": 1.6017683744430542, + "learning_rate": 3.823698881261521e-05, + "loss": 0.0218, + "num_input_tokens_seen": 14968352, + "step": 70930 + }, + { + "epoch": 7.803630363036303, + "grad_norm": 0.029241757467389107, + "learning_rate": 3.823495271968322e-05, + "loss": 0.0617, + "num_input_tokens_seen": 14969408, + "step": 70935 + }, + { + "epoch": 7.804180418041804, + "grad_norm": 0.8635713458061218, + "learning_rate": 3.8232916504771024e-05, + "loss": 0.025, + "num_input_tokens_seen": 14970432, + "step": 70940 + }, + { + "epoch": 7.804730473047305, + "grad_norm": 1.2942302227020264, + "learning_rate": 3.823088016789738e-05, + "loss": 0.0485, + "num_input_tokens_seen": 14971520, + "step": 70945 + }, + { + "epoch": 7.805280528052805, + "grad_norm": 1.0324451923370361, + "learning_rate": 3.8228843709081075e-05, + "loss": 0.0489, + "num_input_tokens_seen": 14972576, + "step": 70950 + }, + { + "epoch": 7.805830583058306, + "grad_norm": 0.016886482015252113, + "learning_rate": 3.822680712834086e-05, + "loss": 0.0921, + "num_input_tokens_seen": 14973568, + "step": 70955 + }, + { + "epoch": 7.806380638063806, + "grad_norm": 0.027648139744997025, + "learning_rate": 3.822477042569551e-05, + "loss": 0.0068, + "num_input_tokens_seen": 14974656, + "step": 70960 + }, + { + "epoch": 7.806930693069307, + "grad_norm": 0.27607911825180054, + "learning_rate": 3.822273360116379e-05, + "loss": 0.0669, + "num_input_tokens_seen": 14975680, + "step": 70965 + }, + { + "epoch": 7.807480748074807, + "grad_norm": 0.007081802934408188, + "learning_rate": 3.8220696654764485e-05, + "loss": 0.0499, + "num_input_tokens_seen": 14976768, + "step": 70970 + }, + { + "epoch": 7.8080308030803085, + "grad_norm": 0.3049758970737457, + "learning_rate": 3.8218659586516365e-05, + "loss": 0.0135, + "num_input_tokens_seen": 14977856, + "step": 70975 + }, + { + "epoch": 7.808580858085809, + "grad_norm": 0.6058976650238037, + "learning_rate": 3.82166223964382e-05, + "loss": 0.0216, + "num_input_tokens_seen": 14978944, + "step": 70980 + }, + { + "epoch": 7.809130913091309, + "grad_norm": 0.009777569212019444, + "learning_rate": 3.8214585084548766e-05, + "loss": 0.0858, + "num_input_tokens_seen": 14979936, + "step": 70985 + }, + { + "epoch": 7.80968096809681, + "grad_norm": 1.8210046291351318, + "learning_rate": 3.821254765086685e-05, + "loss": 0.0199, + "num_input_tokens_seen": 14980992, + "step": 70990 + }, + { + "epoch": 7.81023102310231, + "grad_norm": 1.3256876468658447, + "learning_rate": 3.821051009541121e-05, + "loss": 0.0538, + "num_input_tokens_seen": 14982080, + "step": 70995 + }, + { + "epoch": 7.81078107810781, + "grad_norm": 0.03174324706196785, + "learning_rate": 3.8208472418200646e-05, + "loss": 0.0076, + "num_input_tokens_seen": 14983168, + "step": 71000 + }, + { + "epoch": 7.8113311331133115, + "grad_norm": 1.539366364479065, + "learning_rate": 3.820643461925393e-05, + "loss": 0.0213, + "num_input_tokens_seen": 14984256, + "step": 71005 + }, + { + "epoch": 7.811881188118812, + "grad_norm": 0.10905199497938156, + "learning_rate": 3.820439669858984e-05, + "loss": 0.1015, + "num_input_tokens_seen": 14985312, + "step": 71010 + }, + { + "epoch": 7.812431243124313, + "grad_norm": 0.015515944920480251, + "learning_rate": 3.8202358656227165e-05, + "loss": 0.0459, + "num_input_tokens_seen": 14986336, + "step": 71015 + }, + { + "epoch": 7.812981298129813, + "grad_norm": 1.5100769996643066, + "learning_rate": 3.8200320492184685e-05, + "loss": 0.0778, + "num_input_tokens_seen": 14987456, + "step": 71020 + }, + { + "epoch": 7.813531353135313, + "grad_norm": 0.031791720539331436, + "learning_rate": 3.819828220648118e-05, + "loss": 0.0047, + "num_input_tokens_seen": 14988608, + "step": 71025 + }, + { + "epoch": 7.814081408140814, + "grad_norm": 0.016931077465415, + "learning_rate": 3.8196243799135446e-05, + "loss": 0.015, + "num_input_tokens_seen": 14989664, + "step": 71030 + }, + { + "epoch": 7.8146314631463145, + "grad_norm": 0.006433896254748106, + "learning_rate": 3.819420527016626e-05, + "loss": 0.0037, + "num_input_tokens_seen": 14990688, + "step": 71035 + }, + { + "epoch": 7.815181518151816, + "grad_norm": 0.015322246588766575, + "learning_rate": 3.819216661959243e-05, + "loss": 0.0305, + "num_input_tokens_seen": 14991712, + "step": 71040 + }, + { + "epoch": 7.815731573157316, + "grad_norm": 1.4264756441116333, + "learning_rate": 3.819012784743272e-05, + "loss": 0.1136, + "num_input_tokens_seen": 14992800, + "step": 71045 + }, + { + "epoch": 7.816281628162816, + "grad_norm": 0.016708822920918465, + "learning_rate": 3.818808895370593e-05, + "loss": 0.0167, + "num_input_tokens_seen": 14993824, + "step": 71050 + }, + { + "epoch": 7.816831683168317, + "grad_norm": 0.15073998272418976, + "learning_rate": 3.818604993843085e-05, + "loss": 0.0106, + "num_input_tokens_seen": 14994912, + "step": 71055 + }, + { + "epoch": 7.817381738173817, + "grad_norm": 0.28211572766304016, + "learning_rate": 3.8184010801626275e-05, + "loss": 0.0775, + "num_input_tokens_seen": 14995968, + "step": 71060 + }, + { + "epoch": 7.8179317931793175, + "grad_norm": 0.9416921138763428, + "learning_rate": 3.8181971543311004e-05, + "loss": 0.115, + "num_input_tokens_seen": 14997024, + "step": 71065 + }, + { + "epoch": 7.818481848184819, + "grad_norm": 0.06439191848039627, + "learning_rate": 3.8179932163503825e-05, + "loss": 0.005, + "num_input_tokens_seen": 14998048, + "step": 71070 + }, + { + "epoch": 7.819031903190319, + "grad_norm": 0.012968828901648521, + "learning_rate": 3.8177892662223535e-05, + "loss": 0.1229, + "num_input_tokens_seen": 14999136, + "step": 71075 + }, + { + "epoch": 7.81958195819582, + "grad_norm": 0.08067130297422409, + "learning_rate": 3.8175853039488926e-05, + "loss": 0.1494, + "num_input_tokens_seen": 15000160, + "step": 71080 + }, + { + "epoch": 7.82013201320132, + "grad_norm": 0.012109728530049324, + "learning_rate": 3.817381329531881e-05, + "loss": 0.002, + "num_input_tokens_seen": 15001216, + "step": 71085 + }, + { + "epoch": 7.82068206820682, + "grad_norm": 0.05883360281586647, + "learning_rate": 3.817177342973197e-05, + "loss": 0.0139, + "num_input_tokens_seen": 15002304, + "step": 71090 + }, + { + "epoch": 7.821232123212321, + "grad_norm": 0.00821854081004858, + "learning_rate": 3.816973344274723e-05, + "loss": 0.051, + "num_input_tokens_seen": 15003360, + "step": 71095 + }, + { + "epoch": 7.821782178217822, + "grad_norm": 0.028944598510861397, + "learning_rate": 3.816769333438336e-05, + "loss": 0.1163, + "num_input_tokens_seen": 15004416, + "step": 71100 + }, + { + "epoch": 7.822332233223323, + "grad_norm": 0.2784945070743561, + "learning_rate": 3.8165653104659185e-05, + "loss": 0.0133, + "num_input_tokens_seen": 15005472, + "step": 71105 + }, + { + "epoch": 7.822882288228823, + "grad_norm": 0.054216548800468445, + "learning_rate": 3.816361275359351e-05, + "loss": 0.0599, + "num_input_tokens_seen": 15006560, + "step": 71110 + }, + { + "epoch": 7.823432343234323, + "grad_norm": 0.012357983738183975, + "learning_rate": 3.816157228120513e-05, + "loss": 0.008, + "num_input_tokens_seen": 15007648, + "step": 71115 + }, + { + "epoch": 7.823982398239824, + "grad_norm": 0.31918948888778687, + "learning_rate": 3.815953168751284e-05, + "loss": 0.0552, + "num_input_tokens_seen": 15008672, + "step": 71120 + }, + { + "epoch": 7.824532453245324, + "grad_norm": 1.1416558027267456, + "learning_rate": 3.815749097253548e-05, + "loss": 0.1067, + "num_input_tokens_seen": 15009760, + "step": 71125 + }, + { + "epoch": 7.825082508250825, + "grad_norm": 0.11225467920303345, + "learning_rate": 3.8155450136291824e-05, + "loss": 0.0067, + "num_input_tokens_seen": 15010816, + "step": 71130 + }, + { + "epoch": 7.825632563256326, + "grad_norm": 0.32607606053352356, + "learning_rate": 3.8153409178800704e-05, + "loss": 0.0231, + "num_input_tokens_seen": 15011872, + "step": 71135 + }, + { + "epoch": 7.826182618261826, + "grad_norm": 0.3945577144622803, + "learning_rate": 3.815136810008093e-05, + "loss": 0.0112, + "num_input_tokens_seen": 15012928, + "step": 71140 + }, + { + "epoch": 7.826732673267327, + "grad_norm": 0.008046845905482769, + "learning_rate": 3.814932690015129e-05, + "loss": 0.0792, + "num_input_tokens_seen": 15013952, + "step": 71145 + }, + { + "epoch": 7.827282728272827, + "grad_norm": 0.9554591178894043, + "learning_rate": 3.8147285579030624e-05, + "loss": 0.0227, + "num_input_tokens_seen": 15015008, + "step": 71150 + }, + { + "epoch": 7.827832783278328, + "grad_norm": 0.044394269585609436, + "learning_rate": 3.814524413673774e-05, + "loss": 0.02, + "num_input_tokens_seen": 15016032, + "step": 71155 + }, + { + "epoch": 7.8283828382838285, + "grad_norm": 0.3424404263496399, + "learning_rate": 3.814320257329144e-05, + "loss": 0.0184, + "num_input_tokens_seen": 15017120, + "step": 71160 + }, + { + "epoch": 7.828932893289329, + "grad_norm": 0.08752431720495224, + "learning_rate": 3.814116088871055e-05, + "loss": 0.049, + "num_input_tokens_seen": 15018176, + "step": 71165 + }, + { + "epoch": 7.82948294829483, + "grad_norm": 1.4358978271484375, + "learning_rate": 3.813911908301389e-05, + "loss": 0.0338, + "num_input_tokens_seen": 15019232, + "step": 71170 + }, + { + "epoch": 7.83003300330033, + "grad_norm": 2.4999442100524902, + "learning_rate": 3.813707715622027e-05, + "loss": 0.08, + "num_input_tokens_seen": 15020288, + "step": 71175 + }, + { + "epoch": 7.83058305830583, + "grad_norm": 0.04717414081096649, + "learning_rate": 3.813503510834851e-05, + "loss": 0.0274, + "num_input_tokens_seen": 15021344, + "step": 71180 + }, + { + "epoch": 7.831133113311331, + "grad_norm": 0.21484138071537018, + "learning_rate": 3.813299293941744e-05, + "loss": 0.0218, + "num_input_tokens_seen": 15022368, + "step": 71185 + }, + { + "epoch": 7.8316831683168315, + "grad_norm": 0.015179503709077835, + "learning_rate": 3.813095064944588e-05, + "loss": 0.0021, + "num_input_tokens_seen": 15023456, + "step": 71190 + }, + { + "epoch": 7.832233223322332, + "grad_norm": 1.3721319437026978, + "learning_rate": 3.8128908238452645e-05, + "loss": 0.0506, + "num_input_tokens_seen": 15024512, + "step": 71195 + }, + { + "epoch": 7.832783278327833, + "grad_norm": 0.897953987121582, + "learning_rate": 3.812686570645656e-05, + "loss": 0.0904, + "num_input_tokens_seen": 15025600, + "step": 71200 + }, + { + "epoch": 7.833333333333333, + "grad_norm": 0.23422934114933014, + "learning_rate": 3.812482305347646e-05, + "loss": 0.1071, + "num_input_tokens_seen": 15026688, + "step": 71205 + }, + { + "epoch": 7.833883388338834, + "grad_norm": 0.045437026768922806, + "learning_rate": 3.812278027953115e-05, + "loss": 0.0161, + "num_input_tokens_seen": 15027680, + "step": 71210 + }, + { + "epoch": 7.834433443344334, + "grad_norm": 0.0556517094373703, + "learning_rate": 3.812073738463948e-05, + "loss": 0.0042, + "num_input_tokens_seen": 15028832, + "step": 71215 + }, + { + "epoch": 7.834983498349835, + "grad_norm": 0.12647898495197296, + "learning_rate": 3.8118694368820265e-05, + "loss": 0.0363, + "num_input_tokens_seen": 15029856, + "step": 71220 + }, + { + "epoch": 7.835533553355336, + "grad_norm": 0.060467902570962906, + "learning_rate": 3.811665123209235e-05, + "loss": 0.0475, + "num_input_tokens_seen": 15030880, + "step": 71225 + }, + { + "epoch": 7.836083608360836, + "grad_norm": 1.2404894828796387, + "learning_rate": 3.8114607974474545e-05, + "loss": 0.0526, + "num_input_tokens_seen": 15031936, + "step": 71230 + }, + { + "epoch": 7.836633663366337, + "grad_norm": 0.05504191294312477, + "learning_rate": 3.81125645959857e-05, + "loss": 0.017, + "num_input_tokens_seen": 15032960, + "step": 71235 + }, + { + "epoch": 7.837183718371837, + "grad_norm": 0.1548256278038025, + "learning_rate": 3.811052109664463e-05, + "loss": 0.0523, + "num_input_tokens_seen": 15034016, + "step": 71240 + }, + { + "epoch": 7.837733773377337, + "grad_norm": 0.045965902507305145, + "learning_rate": 3.8108477476470175e-05, + "loss": 0.0829, + "num_input_tokens_seen": 15035072, + "step": 71245 + }, + { + "epoch": 7.838283828382838, + "grad_norm": 0.04834768921136856, + "learning_rate": 3.8106433735481177e-05, + "loss": 0.0476, + "num_input_tokens_seen": 15036160, + "step": 71250 + }, + { + "epoch": 7.838833883388339, + "grad_norm": 0.21070072054862976, + "learning_rate": 3.810438987369647e-05, + "loss": 0.0443, + "num_input_tokens_seen": 15037216, + "step": 71255 + }, + { + "epoch": 7.83938393839384, + "grad_norm": 0.46375927329063416, + "learning_rate": 3.8102345891134896e-05, + "loss": 0.0421, + "num_input_tokens_seen": 15038176, + "step": 71260 + }, + { + "epoch": 7.83993399339934, + "grad_norm": 0.022518664598464966, + "learning_rate": 3.8100301787815276e-05, + "loss": 0.0386, + "num_input_tokens_seen": 15039264, + "step": 71265 + }, + { + "epoch": 7.84048404840484, + "grad_norm": 0.019861426204442978, + "learning_rate": 3.809825756375647e-05, + "loss": 0.0619, + "num_input_tokens_seen": 15040320, + "step": 71270 + }, + { + "epoch": 7.841034103410341, + "grad_norm": 0.014016385190188885, + "learning_rate": 3.809621321897731e-05, + "loss": 0.0268, + "num_input_tokens_seen": 15041344, + "step": 71275 + }, + { + "epoch": 7.841584158415841, + "grad_norm": 0.2369888573884964, + "learning_rate": 3.8094168753496614e-05, + "loss": 0.0501, + "num_input_tokens_seen": 15042464, + "step": 71280 + }, + { + "epoch": 7.8421342134213425, + "grad_norm": 1.2496981620788574, + "learning_rate": 3.8092124167333276e-05, + "loss": 0.0403, + "num_input_tokens_seen": 15043552, + "step": 71285 + }, + { + "epoch": 7.842684268426843, + "grad_norm": 0.9232203960418701, + "learning_rate": 3.80900794605061e-05, + "loss": 0.0135, + "num_input_tokens_seen": 15044544, + "step": 71290 + }, + { + "epoch": 7.843234323432343, + "grad_norm": 0.11806037276983261, + "learning_rate": 3.808803463303394e-05, + "loss": 0.1218, + "num_input_tokens_seen": 15045664, + "step": 71295 + }, + { + "epoch": 7.843784378437844, + "grad_norm": 1.3291208744049072, + "learning_rate": 3.808598968493565e-05, + "loss": 0.0626, + "num_input_tokens_seen": 15046848, + "step": 71300 + }, + { + "epoch": 7.844334433443344, + "grad_norm": 0.18722984194755554, + "learning_rate": 3.808394461623007e-05, + "loss": 0.0086, + "num_input_tokens_seen": 15047968, + "step": 71305 + }, + { + "epoch": 7.8448844884488445, + "grad_norm": 0.13121627271175385, + "learning_rate": 3.808189942693606e-05, + "loss": 0.0136, + "num_input_tokens_seen": 15049024, + "step": 71310 + }, + { + "epoch": 7.8454345434543455, + "grad_norm": 0.7318653464317322, + "learning_rate": 3.807985411707245e-05, + "loss": 0.0188, + "num_input_tokens_seen": 15050080, + "step": 71315 + }, + { + "epoch": 7.845984598459846, + "grad_norm": 0.010967390611767769, + "learning_rate": 3.80778086866581e-05, + "loss": 0.0116, + "num_input_tokens_seen": 15051072, + "step": 71320 + }, + { + "epoch": 7.846534653465347, + "grad_norm": 0.17402340471744537, + "learning_rate": 3.807576313571187e-05, + "loss": 0.0261, + "num_input_tokens_seen": 15052064, + "step": 71325 + }, + { + "epoch": 7.847084708470847, + "grad_norm": 0.3426421284675598, + "learning_rate": 3.80737174642526e-05, + "loss": 0.0413, + "num_input_tokens_seen": 15053120, + "step": 71330 + }, + { + "epoch": 7.847634763476347, + "grad_norm": 0.6570471525192261, + "learning_rate": 3.807167167229915e-05, + "loss": 0.1791, + "num_input_tokens_seen": 15054208, + "step": 71335 + }, + { + "epoch": 7.848184818481848, + "grad_norm": 0.012528273276984692, + "learning_rate": 3.806962575987038e-05, + "loss": 0.0195, + "num_input_tokens_seen": 15055200, + "step": 71340 + }, + { + "epoch": 7.8487348734873486, + "grad_norm": 0.11027919501066208, + "learning_rate": 3.806757972698513e-05, + "loss": 0.0079, + "num_input_tokens_seen": 15056256, + "step": 71345 + }, + { + "epoch": 7.84928492849285, + "grad_norm": 0.728669285774231, + "learning_rate": 3.806553357366227e-05, + "loss": 0.0341, + "num_input_tokens_seen": 15057280, + "step": 71350 + }, + { + "epoch": 7.84983498349835, + "grad_norm": 1.031156301498413, + "learning_rate": 3.806348729992067e-05, + "loss": 0.0396, + "num_input_tokens_seen": 15058336, + "step": 71355 + }, + { + "epoch": 7.85038503850385, + "grad_norm": 1.4212559461593628, + "learning_rate": 3.8061440905779165e-05, + "loss": 0.0497, + "num_input_tokens_seen": 15059424, + "step": 71360 + }, + { + "epoch": 7.850935093509351, + "grad_norm": 0.0590829961001873, + "learning_rate": 3.805939439125663e-05, + "loss": 0.0149, + "num_input_tokens_seen": 15060512, + "step": 71365 + }, + { + "epoch": 7.851485148514851, + "grad_norm": 0.16790105402469635, + "learning_rate": 3.805734775637192e-05, + "loss": 0.09, + "num_input_tokens_seen": 15061632, + "step": 71370 + }, + { + "epoch": 7.852035203520352, + "grad_norm": 0.14601920545101166, + "learning_rate": 3.8055301001143905e-05, + "loss": 0.0562, + "num_input_tokens_seen": 15062720, + "step": 71375 + }, + { + "epoch": 7.852585258525853, + "grad_norm": 0.04640106111764908, + "learning_rate": 3.805325412559144e-05, + "loss": 0.0543, + "num_input_tokens_seen": 15063776, + "step": 71380 + }, + { + "epoch": 7.853135313531353, + "grad_norm": 0.02401881478726864, + "learning_rate": 3.80512071297334e-05, + "loss": 0.0288, + "num_input_tokens_seen": 15064864, + "step": 71385 + }, + { + "epoch": 7.853685368536854, + "grad_norm": 0.02014927752315998, + "learning_rate": 3.804916001358865e-05, + "loss": 0.0031, + "num_input_tokens_seen": 15065920, + "step": 71390 + }, + { + "epoch": 7.854235423542354, + "grad_norm": 0.7007530331611633, + "learning_rate": 3.804711277717604e-05, + "loss": 0.0184, + "num_input_tokens_seen": 15067008, + "step": 71395 + }, + { + "epoch": 7.854785478547855, + "grad_norm": 0.051877930760383606, + "learning_rate": 3.8045065420514464e-05, + "loss": 0.0771, + "num_input_tokens_seen": 15068032, + "step": 71400 + }, + { + "epoch": 7.8553355335533555, + "grad_norm": 0.04403753578662872, + "learning_rate": 3.804301794362278e-05, + "loss": 0.0469, + "num_input_tokens_seen": 15069120, + "step": 71405 + }, + { + "epoch": 7.855885588558856, + "grad_norm": 0.03435436636209488, + "learning_rate": 3.804097034651985e-05, + "loss": 0.021, + "num_input_tokens_seen": 15070176, + "step": 71410 + }, + { + "epoch": 7.856435643564357, + "grad_norm": 0.10435876250267029, + "learning_rate": 3.8038922629224564e-05, + "loss": 0.0523, + "num_input_tokens_seen": 15071200, + "step": 71415 + }, + { + "epoch": 7.856985698569857, + "grad_norm": 0.3537994623184204, + "learning_rate": 3.8036874791755784e-05, + "loss": 0.026, + "num_input_tokens_seen": 15072256, + "step": 71420 + }, + { + "epoch": 7.857535753575357, + "grad_norm": 0.034451279789209366, + "learning_rate": 3.803482683413237e-05, + "loss": 0.0235, + "num_input_tokens_seen": 15073280, + "step": 71425 + }, + { + "epoch": 7.858085808580858, + "grad_norm": 0.6100879907608032, + "learning_rate": 3.8032778756373235e-05, + "loss": 0.0361, + "num_input_tokens_seen": 15074368, + "step": 71430 + }, + { + "epoch": 7.8586358635863585, + "grad_norm": 0.12352485954761505, + "learning_rate": 3.803073055849721e-05, + "loss": 0.0596, + "num_input_tokens_seen": 15075424, + "step": 71435 + }, + { + "epoch": 7.8591859185918596, + "grad_norm": 0.03050939552485943, + "learning_rate": 3.8028682240523206e-05, + "loss": 0.0424, + "num_input_tokens_seen": 15076480, + "step": 71440 + }, + { + "epoch": 7.85973597359736, + "grad_norm": 0.34485936164855957, + "learning_rate": 3.802663380247009e-05, + "loss": 0.0096, + "num_input_tokens_seen": 15077536, + "step": 71445 + }, + { + "epoch": 7.86028602860286, + "grad_norm": 0.40736016631126404, + "learning_rate": 3.8024585244356734e-05, + "loss": 0.0179, + "num_input_tokens_seen": 15078592, + "step": 71450 + }, + { + "epoch": 7.860836083608361, + "grad_norm": 0.04038340970873833, + "learning_rate": 3.8022536566202036e-05, + "loss": 0.0037, + "num_input_tokens_seen": 15079616, + "step": 71455 + }, + { + "epoch": 7.861386138613861, + "grad_norm": 0.02610231749713421, + "learning_rate": 3.802048776802486e-05, + "loss": 0.0046, + "num_input_tokens_seen": 15080672, + "step": 71460 + }, + { + "epoch": 7.861936193619362, + "grad_norm": 0.0411122627556324, + "learning_rate": 3.801843884984409e-05, + "loss": 0.0693, + "num_input_tokens_seen": 15081664, + "step": 71465 + }, + { + "epoch": 7.862486248624863, + "grad_norm": 0.25353795289993286, + "learning_rate": 3.801638981167862e-05, + "loss": 0.0091, + "num_input_tokens_seen": 15082720, + "step": 71470 + }, + { + "epoch": 7.863036303630363, + "grad_norm": 0.10670213401317596, + "learning_rate": 3.801434065354734e-05, + "loss": 0.0065, + "num_input_tokens_seen": 15083744, + "step": 71475 + }, + { + "epoch": 7.863586358635864, + "grad_norm": 0.025237243622541428, + "learning_rate": 3.801229137546912e-05, + "loss": 0.0575, + "num_input_tokens_seen": 15084768, + "step": 71480 + }, + { + "epoch": 7.864136413641364, + "grad_norm": 0.21518836915493011, + "learning_rate": 3.801024197746286e-05, + "loss": 0.0084, + "num_input_tokens_seen": 15085792, + "step": 71485 + }, + { + "epoch": 7.864686468646864, + "grad_norm": 0.03098653256893158, + "learning_rate": 3.800819245954744e-05, + "loss": 0.0527, + "num_input_tokens_seen": 15086848, + "step": 71490 + }, + { + "epoch": 7.865236523652365, + "grad_norm": 0.07755091041326523, + "learning_rate": 3.800614282174174e-05, + "loss": 0.01, + "num_input_tokens_seen": 15087936, + "step": 71495 + }, + { + "epoch": 7.865786578657866, + "grad_norm": 0.1836400032043457, + "learning_rate": 3.8004093064064685e-05, + "loss": 0.1243, + "num_input_tokens_seen": 15088992, + "step": 71500 + }, + { + "epoch": 7.866336633663367, + "grad_norm": 0.010896824300289154, + "learning_rate": 3.800204318653513e-05, + "loss": 0.0071, + "num_input_tokens_seen": 15090048, + "step": 71505 + }, + { + "epoch": 7.866886688668867, + "grad_norm": 0.04425816610455513, + "learning_rate": 3.799999318917198e-05, + "loss": 0.0173, + "num_input_tokens_seen": 15091104, + "step": 71510 + }, + { + "epoch": 7.867436743674367, + "grad_norm": 0.20595043897628784, + "learning_rate": 3.7997943071994136e-05, + "loss": 0.0295, + "num_input_tokens_seen": 15092288, + "step": 71515 + }, + { + "epoch": 7.867986798679868, + "grad_norm": 0.2759336531162262, + "learning_rate": 3.7995892835020487e-05, + "loss": 0.0079, + "num_input_tokens_seen": 15093312, + "step": 71520 + }, + { + "epoch": 7.868536853685368, + "grad_norm": 0.18566223978996277, + "learning_rate": 3.799384247826993e-05, + "loss": 0.0183, + "num_input_tokens_seen": 15094400, + "step": 71525 + }, + { + "epoch": 7.8690869086908695, + "grad_norm": 1.138353705406189, + "learning_rate": 3.7991792001761363e-05, + "loss": 0.1029, + "num_input_tokens_seen": 15095456, + "step": 71530 + }, + { + "epoch": 7.86963696369637, + "grad_norm": 0.42509257793426514, + "learning_rate": 3.798974140551368e-05, + "loss": 0.0479, + "num_input_tokens_seen": 15096544, + "step": 71535 + }, + { + "epoch": 7.87018701870187, + "grad_norm": 0.12716931104660034, + "learning_rate": 3.798769068954577e-05, + "loss": 0.0918, + "num_input_tokens_seen": 15097536, + "step": 71540 + }, + { + "epoch": 7.870737073707371, + "grad_norm": 0.00959047395735979, + "learning_rate": 3.798563985387658e-05, + "loss": 0.0054, + "num_input_tokens_seen": 15098624, + "step": 71545 + }, + { + "epoch": 7.871287128712871, + "grad_norm": 1.2307499647140503, + "learning_rate": 3.7983588898524955e-05, + "loss": 0.0963, + "num_input_tokens_seen": 15099744, + "step": 71550 + }, + { + "epoch": 7.871837183718371, + "grad_norm": 0.007585732266306877, + "learning_rate": 3.798153782350982e-05, + "loss": 0.0797, + "num_input_tokens_seen": 15100768, + "step": 71555 + }, + { + "epoch": 7.8723872387238725, + "grad_norm": 1.70393967628479, + "learning_rate": 3.7979486628850094e-05, + "loss": 0.0589, + "num_input_tokens_seen": 15101888, + "step": 71560 + }, + { + "epoch": 7.872937293729373, + "grad_norm": 0.19901975989341736, + "learning_rate": 3.797743531456466e-05, + "loss": 0.1165, + "num_input_tokens_seen": 15102912, + "step": 71565 + }, + { + "epoch": 7.873487348734874, + "grad_norm": 0.013825738802552223, + "learning_rate": 3.797538388067243e-05, + "loss": 0.0036, + "num_input_tokens_seen": 15103904, + "step": 71570 + }, + { + "epoch": 7.874037403740374, + "grad_norm": 0.4571213126182556, + "learning_rate": 3.797333232719231e-05, + "loss": 0.0722, + "num_input_tokens_seen": 15104928, + "step": 71575 + }, + { + "epoch": 7.874587458745875, + "grad_norm": 0.016885707154870033, + "learning_rate": 3.797128065414323e-05, + "loss": 0.0075, + "num_input_tokens_seen": 15106048, + "step": 71580 + }, + { + "epoch": 7.875137513751375, + "grad_norm": 0.011048450134694576, + "learning_rate": 3.796922886154406e-05, + "loss": 0.0041, + "num_input_tokens_seen": 15107072, + "step": 71585 + }, + { + "epoch": 7.8756875687568755, + "grad_norm": 0.07967876642942429, + "learning_rate": 3.796717694941374e-05, + "loss": 0.037, + "num_input_tokens_seen": 15108096, + "step": 71590 + }, + { + "epoch": 7.876237623762377, + "grad_norm": 0.10853055864572525, + "learning_rate": 3.796512491777117e-05, + "loss": 0.0104, + "num_input_tokens_seen": 15109152, + "step": 71595 + }, + { + "epoch": 7.876787678767877, + "grad_norm": 0.018559293821454048, + "learning_rate": 3.796307276663526e-05, + "loss": 0.0785, + "num_input_tokens_seen": 15110272, + "step": 71600 + }, + { + "epoch": 7.877337733773377, + "grad_norm": 0.043890681117773056, + "learning_rate": 3.796102049602493e-05, + "loss": 0.0533, + "num_input_tokens_seen": 15111264, + "step": 71605 + }, + { + "epoch": 7.877887788778878, + "grad_norm": 0.02594323828816414, + "learning_rate": 3.79589681059591e-05, + "loss": 0.0056, + "num_input_tokens_seen": 15112352, + "step": 71610 + }, + { + "epoch": 7.878437843784378, + "grad_norm": 0.053494710475206375, + "learning_rate": 3.795691559645668e-05, + "loss": 0.0232, + "num_input_tokens_seen": 15113408, + "step": 71615 + }, + { + "epoch": 7.878987898789879, + "grad_norm": 0.036328669637441635, + "learning_rate": 3.7954862967536585e-05, + "loss": 0.0712, + "num_input_tokens_seen": 15114400, + "step": 71620 + }, + { + "epoch": 7.87953795379538, + "grad_norm": 0.09356291592121124, + "learning_rate": 3.7952810219217725e-05, + "loss": 0.0065, + "num_input_tokens_seen": 15115488, + "step": 71625 + }, + { + "epoch": 7.88008800880088, + "grad_norm": 2.0701043605804443, + "learning_rate": 3.795075735151904e-05, + "loss": 0.105, + "num_input_tokens_seen": 15116544, + "step": 71630 + }, + { + "epoch": 7.880638063806381, + "grad_norm": 0.2939963936805725, + "learning_rate": 3.794870436445943e-05, + "loss": 0.1259, + "num_input_tokens_seen": 15117600, + "step": 71635 + }, + { + "epoch": 7.881188118811881, + "grad_norm": 0.5885838866233826, + "learning_rate": 3.794665125805783e-05, + "loss": 0.042, + "num_input_tokens_seen": 15118720, + "step": 71640 + }, + { + "epoch": 7.881738173817382, + "grad_norm": 0.006729915272444487, + "learning_rate": 3.794459803233316e-05, + "loss": 0.0598, + "num_input_tokens_seen": 15119808, + "step": 71645 + }, + { + "epoch": 7.882288228822882, + "grad_norm": 0.021660659462213516, + "learning_rate": 3.794254468730434e-05, + "loss": 0.1572, + "num_input_tokens_seen": 15120864, + "step": 71650 + }, + { + "epoch": 7.882838283828383, + "grad_norm": 0.022203490138053894, + "learning_rate": 3.7940491222990296e-05, + "loss": 0.0136, + "num_input_tokens_seen": 15121920, + "step": 71655 + }, + { + "epoch": 7.883388338833884, + "grad_norm": 0.024616459384560585, + "learning_rate": 3.793843763940995e-05, + "loss": 0.0614, + "num_input_tokens_seen": 15122944, + "step": 71660 + }, + { + "epoch": 7.883938393839384, + "grad_norm": 0.7276953458786011, + "learning_rate": 3.7936383936582245e-05, + "loss": 0.1263, + "num_input_tokens_seen": 15124032, + "step": 71665 + }, + { + "epoch": 7.884488448844884, + "grad_norm": 0.1883247196674347, + "learning_rate": 3.7934330114526074e-05, + "loss": 0.0306, + "num_input_tokens_seen": 15125056, + "step": 71670 + }, + { + "epoch": 7.885038503850385, + "grad_norm": 1.4346259832382202, + "learning_rate": 3.79322761732604e-05, + "loss": 0.1684, + "num_input_tokens_seen": 15126144, + "step": 71675 + }, + { + "epoch": 7.885588558855885, + "grad_norm": 0.2935756742954254, + "learning_rate": 3.7930222112804146e-05, + "loss": 0.0087, + "num_input_tokens_seen": 15127168, + "step": 71680 + }, + { + "epoch": 7.8861386138613865, + "grad_norm": 1.3793123960494995, + "learning_rate": 3.792816793317624e-05, + "loss": 0.1751, + "num_input_tokens_seen": 15128192, + "step": 71685 + }, + { + "epoch": 7.886688668866887, + "grad_norm": 0.041846923530101776, + "learning_rate": 3.792611363439561e-05, + "loss": 0.0873, + "num_input_tokens_seen": 15129280, + "step": 71690 + }, + { + "epoch": 7.887238723872387, + "grad_norm": 0.5970556139945984, + "learning_rate": 3.79240592164812e-05, + "loss": 0.0244, + "num_input_tokens_seen": 15130336, + "step": 71695 + }, + { + "epoch": 7.887788778877888, + "grad_norm": 1.6336381435394287, + "learning_rate": 3.792200467945193e-05, + "loss": 0.0432, + "num_input_tokens_seen": 15131360, + "step": 71700 + }, + { + "epoch": 7.888338833883388, + "grad_norm": 0.025921080261468887, + "learning_rate": 3.7919950023326745e-05, + "loss": 0.0172, + "num_input_tokens_seen": 15132416, + "step": 71705 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 0.07739849388599396, + "learning_rate": 3.791789524812458e-05, + "loss": 0.0579, + "num_input_tokens_seen": 15133440, + "step": 71710 + }, + { + "epoch": 7.8894389438943895, + "grad_norm": 0.14868852496147156, + "learning_rate": 3.7915840353864375e-05, + "loss": 0.0225, + "num_input_tokens_seen": 15134496, + "step": 71715 + }, + { + "epoch": 7.88998899889989, + "grad_norm": 2.3084635734558105, + "learning_rate": 3.791378534056507e-05, + "loss": 0.1447, + "num_input_tokens_seen": 15135520, + "step": 71720 + }, + { + "epoch": 7.890539053905391, + "grad_norm": 0.03517339006066322, + "learning_rate": 3.791173020824559e-05, + "loss": 0.0294, + "num_input_tokens_seen": 15136576, + "step": 71725 + }, + { + "epoch": 7.891089108910891, + "grad_norm": 0.27916303277015686, + "learning_rate": 3.79096749569249e-05, + "loss": 0.0264, + "num_input_tokens_seen": 15137632, + "step": 71730 + }, + { + "epoch": 7.891639163916391, + "grad_norm": 1.0526326894760132, + "learning_rate": 3.7907619586621925e-05, + "loss": 0.093, + "num_input_tokens_seen": 15138688, + "step": 71735 + }, + { + "epoch": 7.892189218921892, + "grad_norm": 0.145394966006279, + "learning_rate": 3.7905564097355616e-05, + "loss": 0.0142, + "num_input_tokens_seen": 15139776, + "step": 71740 + }, + { + "epoch": 7.8927392739273925, + "grad_norm": 0.5220282673835754, + "learning_rate": 3.7903508489144914e-05, + "loss": 0.0686, + "num_input_tokens_seen": 15140832, + "step": 71745 + }, + { + "epoch": 7.893289328932894, + "grad_norm": 0.5628007650375366, + "learning_rate": 3.7901452762008774e-05, + "loss": 0.024, + "num_input_tokens_seen": 15141920, + "step": 71750 + }, + { + "epoch": 7.893839383938394, + "grad_norm": 0.676482081413269, + "learning_rate": 3.789939691596612e-05, + "loss": 0.1049, + "num_input_tokens_seen": 15142944, + "step": 71755 + }, + { + "epoch": 7.894389438943895, + "grad_norm": 0.047859761863946915, + "learning_rate": 3.7897340951035925e-05, + "loss": 0.0068, + "num_input_tokens_seen": 15144032, + "step": 71760 + }, + { + "epoch": 7.894939493949395, + "grad_norm": 0.5766016840934753, + "learning_rate": 3.789528486723712e-05, + "loss": 0.0262, + "num_input_tokens_seen": 15145184, + "step": 71765 + }, + { + "epoch": 7.895489548954895, + "grad_norm": 0.24336522817611694, + "learning_rate": 3.7893228664588666e-05, + "loss": 0.0337, + "num_input_tokens_seen": 15146240, + "step": 71770 + }, + { + "epoch": 7.896039603960396, + "grad_norm": 0.008761507458984852, + "learning_rate": 3.789117234310952e-05, + "loss": 0.0164, + "num_input_tokens_seen": 15147296, + "step": 71775 + }, + { + "epoch": 7.896589658965897, + "grad_norm": 0.20977821946144104, + "learning_rate": 3.788911590281861e-05, + "loss": 0.0151, + "num_input_tokens_seen": 15148352, + "step": 71780 + }, + { + "epoch": 7.897139713971397, + "grad_norm": 0.9253822565078735, + "learning_rate": 3.788705934373491e-05, + "loss": 0.0357, + "num_input_tokens_seen": 15149376, + "step": 71785 + }, + { + "epoch": 7.897689768976898, + "grad_norm": 0.7568450570106506, + "learning_rate": 3.7885002665877367e-05, + "loss": 0.0305, + "num_input_tokens_seen": 15150432, + "step": 71790 + }, + { + "epoch": 7.898239823982398, + "grad_norm": 0.013470690697431564, + "learning_rate": 3.788294586926493e-05, + "loss": 0.0124, + "num_input_tokens_seen": 15151456, + "step": 71795 + }, + { + "epoch": 7.898789878987898, + "grad_norm": 0.12661105394363403, + "learning_rate": 3.788088895391657e-05, + "loss": 0.0079, + "num_input_tokens_seen": 15152512, + "step": 71800 + }, + { + "epoch": 7.899339933993399, + "grad_norm": 0.06028682738542557, + "learning_rate": 3.7878831919851244e-05, + "loss": 0.0075, + "num_input_tokens_seen": 15153632, + "step": 71805 + }, + { + "epoch": 7.8998899889989, + "grad_norm": 0.05286572873592377, + "learning_rate": 3.7876774767087896e-05, + "loss": 0.0184, + "num_input_tokens_seen": 15154720, + "step": 71810 + }, + { + "epoch": 7.900440044004401, + "grad_norm": 0.004427033942192793, + "learning_rate": 3.787471749564549e-05, + "loss": 0.0081, + "num_input_tokens_seen": 15155776, + "step": 71815 + }, + { + "epoch": 7.900990099009901, + "grad_norm": 0.05078107491135597, + "learning_rate": 3.7872660105543e-05, + "loss": 0.0771, + "num_input_tokens_seen": 15156832, + "step": 71820 + }, + { + "epoch": 7.901540154015402, + "grad_norm": 0.0416099987924099, + "learning_rate": 3.787060259679937e-05, + "loss": 0.0594, + "num_input_tokens_seen": 15157888, + "step": 71825 + }, + { + "epoch": 7.902090209020902, + "grad_norm": 0.10781244933605194, + "learning_rate": 3.786854496943358e-05, + "loss": 0.0201, + "num_input_tokens_seen": 15159008, + "step": 71830 + }, + { + "epoch": 7.902640264026402, + "grad_norm": 0.019115405157208443, + "learning_rate": 3.786648722346458e-05, + "loss": 0.0176, + "num_input_tokens_seen": 15160000, + "step": 71835 + }, + { + "epoch": 7.9031903190319035, + "grad_norm": 0.09371064603328705, + "learning_rate": 3.786442935891135e-05, + "loss": 0.0045, + "num_input_tokens_seen": 15161088, + "step": 71840 + }, + { + "epoch": 7.903740374037404, + "grad_norm": 0.03432586044073105, + "learning_rate": 3.7862371375792834e-05, + "loss": 0.0416, + "num_input_tokens_seen": 15162176, + "step": 71845 + }, + { + "epoch": 7.904290429042904, + "grad_norm": 0.11435778439044952, + "learning_rate": 3.786031327412802e-05, + "loss": 0.0505, + "num_input_tokens_seen": 15163232, + "step": 71850 + }, + { + "epoch": 7.904840484048405, + "grad_norm": 0.040542807430028915, + "learning_rate": 3.7858255053935866e-05, + "loss": 0.0599, + "num_input_tokens_seen": 15164256, + "step": 71855 + }, + { + "epoch": 7.905390539053905, + "grad_norm": 0.8343350887298584, + "learning_rate": 3.785619671523535e-05, + "loss": 0.009, + "num_input_tokens_seen": 15165280, + "step": 71860 + }, + { + "epoch": 7.905940594059406, + "grad_norm": 0.20655308663845062, + "learning_rate": 3.7854138258045434e-05, + "loss": 0.0412, + "num_input_tokens_seen": 15166400, + "step": 71865 + }, + { + "epoch": 7.9064906490649065, + "grad_norm": 0.030184542760252953, + "learning_rate": 3.78520796823851e-05, + "loss": 0.0487, + "num_input_tokens_seen": 15167392, + "step": 71870 + }, + { + "epoch": 7.907040704070407, + "grad_norm": 0.08660377562046051, + "learning_rate": 3.7850020988273306e-05, + "loss": 0.0561, + "num_input_tokens_seen": 15168448, + "step": 71875 + }, + { + "epoch": 7.907590759075908, + "grad_norm": 0.05008459836244583, + "learning_rate": 3.7847962175729043e-05, + "loss": 0.0233, + "num_input_tokens_seen": 15169600, + "step": 71880 + }, + { + "epoch": 7.908140814081408, + "grad_norm": 1.5180432796478271, + "learning_rate": 3.784590324477127e-05, + "loss": 0.0858, + "num_input_tokens_seen": 15170656, + "step": 71885 + }, + { + "epoch": 7.908690869086909, + "grad_norm": 0.028089961037039757, + "learning_rate": 3.784384419541898e-05, + "loss": 0.0706, + "num_input_tokens_seen": 15171712, + "step": 71890 + }, + { + "epoch": 7.909240924092409, + "grad_norm": 0.9433084726333618, + "learning_rate": 3.784178502769113e-05, + "loss": 0.1637, + "num_input_tokens_seen": 15172736, + "step": 71895 + }, + { + "epoch": 7.9097909790979095, + "grad_norm": 0.8336569666862488, + "learning_rate": 3.783972574160672e-05, + "loss": 0.0342, + "num_input_tokens_seen": 15173824, + "step": 71900 + }, + { + "epoch": 7.910341034103411, + "grad_norm": 0.006935103330761194, + "learning_rate": 3.783766633718471e-05, + "loss": 0.0063, + "num_input_tokens_seen": 15174880, + "step": 71905 + }, + { + "epoch": 7.910891089108911, + "grad_norm": 0.0817457064986229, + "learning_rate": 3.78356068144441e-05, + "loss": 0.013, + "num_input_tokens_seen": 15175904, + "step": 71910 + }, + { + "epoch": 7.911441144114411, + "grad_norm": 0.01840793527662754, + "learning_rate": 3.783354717340385e-05, + "loss": 0.0687, + "num_input_tokens_seen": 15176960, + "step": 71915 + }, + { + "epoch": 7.911991199119912, + "grad_norm": 0.06986041367053986, + "learning_rate": 3.7831487414082956e-05, + "loss": 0.0663, + "num_input_tokens_seen": 15178080, + "step": 71920 + }, + { + "epoch": 7.912541254125412, + "grad_norm": 0.01804972253739834, + "learning_rate": 3.7829427536500404e-05, + "loss": 0.0478, + "num_input_tokens_seen": 15179200, + "step": 71925 + }, + { + "epoch": 7.913091309130913, + "grad_norm": 0.03319338336586952, + "learning_rate": 3.782736754067518e-05, + "loss": 0.0178, + "num_input_tokens_seen": 15180256, + "step": 71930 + }, + { + "epoch": 7.913641364136414, + "grad_norm": 0.02242254465818405, + "learning_rate": 3.782530742662625e-05, + "loss": 0.0373, + "num_input_tokens_seen": 15181280, + "step": 71935 + }, + { + "epoch": 7.914191419141914, + "grad_norm": 0.01514736469835043, + "learning_rate": 3.7823247194372624e-05, + "loss": 0.0032, + "num_input_tokens_seen": 15182368, + "step": 71940 + }, + { + "epoch": 7.914741474147415, + "grad_norm": 1.7280762195587158, + "learning_rate": 3.782118684393328e-05, + "loss": 0.0649, + "num_input_tokens_seen": 15183552, + "step": 71945 + }, + { + "epoch": 7.915291529152915, + "grad_norm": 1.401274561882019, + "learning_rate": 3.7819126375327214e-05, + "loss": 0.0968, + "num_input_tokens_seen": 15184640, + "step": 71950 + }, + { + "epoch": 7.915841584158416, + "grad_norm": 1.545488715171814, + "learning_rate": 3.78170657885734e-05, + "loss": 0.0693, + "num_input_tokens_seen": 15185664, + "step": 71955 + }, + { + "epoch": 7.916391639163916, + "grad_norm": 0.08829446882009506, + "learning_rate": 3.781500508369085e-05, + "loss": 0.0423, + "num_input_tokens_seen": 15186688, + "step": 71960 + }, + { + "epoch": 7.916941694169417, + "grad_norm": 0.13664694130420685, + "learning_rate": 3.7812944260698546e-05, + "loss": 0.0234, + "num_input_tokens_seen": 15187808, + "step": 71965 + }, + { + "epoch": 7.917491749174918, + "grad_norm": 0.10219961404800415, + "learning_rate": 3.781088331961549e-05, + "loss": 0.0821, + "num_input_tokens_seen": 15188864, + "step": 71970 + }, + { + "epoch": 7.918041804180418, + "grad_norm": 0.6153510808944702, + "learning_rate": 3.780882226046065e-05, + "loss": 0.0078, + "num_input_tokens_seen": 15189920, + "step": 71975 + }, + { + "epoch": 7.918591859185918, + "grad_norm": 0.16027511656284332, + "learning_rate": 3.7806761083253064e-05, + "loss": 0.0657, + "num_input_tokens_seen": 15191040, + "step": 71980 + }, + { + "epoch": 7.919141914191419, + "grad_norm": 2.192758798599243, + "learning_rate": 3.780469978801169e-05, + "loss": 0.0301, + "num_input_tokens_seen": 15192064, + "step": 71985 + }, + { + "epoch": 7.919691969196919, + "grad_norm": 0.1857338547706604, + "learning_rate": 3.780263837475555e-05, + "loss": 0.201, + "num_input_tokens_seen": 15193120, + "step": 71990 + }, + { + "epoch": 7.9202420242024205, + "grad_norm": 0.5682050585746765, + "learning_rate": 3.780057684350363e-05, + "loss": 0.0357, + "num_input_tokens_seen": 15194176, + "step": 71995 + }, + { + "epoch": 7.920792079207921, + "grad_norm": 0.09881851077079773, + "learning_rate": 3.779851519427494e-05, + "loss": 0.0192, + "num_input_tokens_seen": 15195200, + "step": 72000 + }, + { + "epoch": 7.921342134213422, + "grad_norm": 0.7646623253822327, + "learning_rate": 3.7796453427088474e-05, + "loss": 0.0264, + "num_input_tokens_seen": 15196192, + "step": 72005 + }, + { + "epoch": 7.921892189218922, + "grad_norm": 0.11944406479597092, + "learning_rate": 3.7794391541963244e-05, + "loss": 0.0286, + "num_input_tokens_seen": 15197248, + "step": 72010 + }, + { + "epoch": 7.922442244224422, + "grad_norm": 0.5543918609619141, + "learning_rate": 3.779232953891824e-05, + "loss": 0.0816, + "num_input_tokens_seen": 15198304, + "step": 72015 + }, + { + "epoch": 7.922992299229923, + "grad_norm": 0.3659186363220215, + "learning_rate": 3.779026741797248e-05, + "loss": 0.024, + "num_input_tokens_seen": 15199328, + "step": 72020 + }, + { + "epoch": 7.9235423542354235, + "grad_norm": 0.23040549457073212, + "learning_rate": 3.778820517914495e-05, + "loss": 0.0609, + "num_input_tokens_seen": 15200416, + "step": 72025 + }, + { + "epoch": 7.924092409240924, + "grad_norm": 0.021578023210167885, + "learning_rate": 3.778614282245468e-05, + "loss": 0.1072, + "num_input_tokens_seen": 15201472, + "step": 72030 + }, + { + "epoch": 7.924642464246425, + "grad_norm": 0.061560265719890594, + "learning_rate": 3.778408034792066e-05, + "loss": 0.0392, + "num_input_tokens_seen": 15202496, + "step": 72035 + }, + { + "epoch": 7.925192519251925, + "grad_norm": 0.049962062388658524, + "learning_rate": 3.778201775556191e-05, + "loss": 0.0094, + "num_input_tokens_seen": 15203552, + "step": 72040 + }, + { + "epoch": 7.925742574257426, + "grad_norm": 0.08900444209575653, + "learning_rate": 3.777995504539743e-05, + "loss": 0.0107, + "num_input_tokens_seen": 15204576, + "step": 72045 + }, + { + "epoch": 7.926292629262926, + "grad_norm": 0.07419309765100479, + "learning_rate": 3.7777892217446254e-05, + "loss": 0.0229, + "num_input_tokens_seen": 15205728, + "step": 72050 + }, + { + "epoch": 7.9268426842684265, + "grad_norm": 0.04638588801026344, + "learning_rate": 3.777582927172736e-05, + "loss": 0.0819, + "num_input_tokens_seen": 15206752, + "step": 72055 + }, + { + "epoch": 7.927392739273928, + "grad_norm": 0.11028897017240524, + "learning_rate": 3.777376620825979e-05, + "loss": 0.0539, + "num_input_tokens_seen": 15207776, + "step": 72060 + }, + { + "epoch": 7.927942794279428, + "grad_norm": 0.04940043017268181, + "learning_rate": 3.777170302706253e-05, + "loss": 0.0091, + "num_input_tokens_seen": 15208768, + "step": 72065 + }, + { + "epoch": 7.928492849284929, + "grad_norm": 0.1143132895231247, + "learning_rate": 3.776963972815463e-05, + "loss": 0.0672, + "num_input_tokens_seen": 15209824, + "step": 72070 + }, + { + "epoch": 7.929042904290429, + "grad_norm": 0.46595507860183716, + "learning_rate": 3.776757631155507e-05, + "loss": 0.0138, + "num_input_tokens_seen": 15210880, + "step": 72075 + }, + { + "epoch": 7.929592959295929, + "grad_norm": 0.03392099216580391, + "learning_rate": 3.77655127772829e-05, + "loss": 0.0085, + "num_input_tokens_seen": 15211872, + "step": 72080 + }, + { + "epoch": 7.93014301430143, + "grad_norm": 0.600369930267334, + "learning_rate": 3.7763449125357114e-05, + "loss": 0.0316, + "num_input_tokens_seen": 15212896, + "step": 72085 + }, + { + "epoch": 7.930693069306931, + "grad_norm": 0.7820836901664734, + "learning_rate": 3.7761385355796745e-05, + "loss": 0.0191, + "num_input_tokens_seen": 15213952, + "step": 72090 + }, + { + "epoch": 7.931243124312431, + "grad_norm": 0.7052406072616577, + "learning_rate": 3.775932146862081e-05, + "loss": 0.1901, + "num_input_tokens_seen": 15215072, + "step": 72095 + }, + { + "epoch": 7.931793179317932, + "grad_norm": 0.30773818492889404, + "learning_rate": 3.775725746384834e-05, + "loss": 0.059, + "num_input_tokens_seen": 15216096, + "step": 72100 + }, + { + "epoch": 7.932343234323432, + "grad_norm": 0.010215208865702152, + "learning_rate": 3.775519334149834e-05, + "loss": 0.007, + "num_input_tokens_seen": 15217120, + "step": 72105 + }, + { + "epoch": 7.932893289328933, + "grad_norm": 0.1602104753255844, + "learning_rate": 3.775312910158985e-05, + "loss": 0.0134, + "num_input_tokens_seen": 15218176, + "step": 72110 + }, + { + "epoch": 7.933443344334433, + "grad_norm": 0.05662011355161667, + "learning_rate": 3.775106474414188e-05, + "loss": 0.0871, + "num_input_tokens_seen": 15219168, + "step": 72115 + }, + { + "epoch": 7.933993399339934, + "grad_norm": 0.05435285344719887, + "learning_rate": 3.774900026917347e-05, + "loss": 0.0053, + "num_input_tokens_seen": 15220256, + "step": 72120 + }, + { + "epoch": 7.934543454345435, + "grad_norm": 2.4629526138305664, + "learning_rate": 3.7746935676703646e-05, + "loss": 0.0997, + "num_input_tokens_seen": 15221280, + "step": 72125 + }, + { + "epoch": 7.935093509350935, + "grad_norm": 0.07749076932668686, + "learning_rate": 3.7744870966751435e-05, + "loss": 0.0477, + "num_input_tokens_seen": 15222368, + "step": 72130 + }, + { + "epoch": 7.935643564356436, + "grad_norm": 0.04182364419102669, + "learning_rate": 3.774280613933585e-05, + "loss": 0.01, + "num_input_tokens_seen": 15223424, + "step": 72135 + }, + { + "epoch": 7.936193619361936, + "grad_norm": 2.215761184692383, + "learning_rate": 3.774074119447595e-05, + "loss": 0.043, + "num_input_tokens_seen": 15224480, + "step": 72140 + }, + { + "epoch": 7.936743674367436, + "grad_norm": 0.012753626331686974, + "learning_rate": 3.773867613219074e-05, + "loss": 0.1263, + "num_input_tokens_seen": 15225504, + "step": 72145 + }, + { + "epoch": 7.9372937293729375, + "grad_norm": 0.03611821308732033, + "learning_rate": 3.773661095249927e-05, + "loss": 0.0239, + "num_input_tokens_seen": 15226496, + "step": 72150 + }, + { + "epoch": 7.937843784378438, + "grad_norm": 0.03283749520778656, + "learning_rate": 3.773454565542057e-05, + "loss": 0.0994, + "num_input_tokens_seen": 15227520, + "step": 72155 + }, + { + "epoch": 7.938393839383938, + "grad_norm": 0.9863253235816956, + "learning_rate": 3.773248024097367e-05, + "loss": 0.0366, + "num_input_tokens_seen": 15228608, + "step": 72160 + }, + { + "epoch": 7.938943894389439, + "grad_norm": 0.020735502243041992, + "learning_rate": 3.773041470917761e-05, + "loss": 0.014, + "num_input_tokens_seen": 15229728, + "step": 72165 + }, + { + "epoch": 7.939493949394939, + "grad_norm": 0.032595425844192505, + "learning_rate": 3.772834906005143e-05, + "loss": 0.0653, + "num_input_tokens_seen": 15230816, + "step": 72170 + }, + { + "epoch": 7.94004400440044, + "grad_norm": 0.34359872341156006, + "learning_rate": 3.7726283293614164e-05, + "loss": 0.0744, + "num_input_tokens_seen": 15231936, + "step": 72175 + }, + { + "epoch": 7.9405940594059405, + "grad_norm": 0.07587792724370956, + "learning_rate": 3.772421740988484e-05, + "loss": 0.0067, + "num_input_tokens_seen": 15232992, + "step": 72180 + }, + { + "epoch": 7.941144114411442, + "grad_norm": 0.14877472817897797, + "learning_rate": 3.7722151408882525e-05, + "loss": 0.0093, + "num_input_tokens_seen": 15234016, + "step": 72185 + }, + { + "epoch": 7.941694169416942, + "grad_norm": 0.020862197503447533, + "learning_rate": 3.772008529062623e-05, + "loss": 0.0193, + "num_input_tokens_seen": 15235104, + "step": 72190 + }, + { + "epoch": 7.942244224422442, + "grad_norm": 0.1395999789237976, + "learning_rate": 3.771801905513502e-05, + "loss": 0.0099, + "num_input_tokens_seen": 15236192, + "step": 72195 + }, + { + "epoch": 7.942794279427943, + "grad_norm": 1.3559764623641968, + "learning_rate": 3.7715952702427935e-05, + "loss": 0.04, + "num_input_tokens_seen": 15237248, + "step": 72200 + }, + { + "epoch": 7.943344334433443, + "grad_norm": 2.140169382095337, + "learning_rate": 3.771388623252402e-05, + "loss": 0.0633, + "num_input_tokens_seen": 15238304, + "step": 72205 + }, + { + "epoch": 7.9438943894389435, + "grad_norm": 0.05634018033742905, + "learning_rate": 3.771181964544231e-05, + "loss": 0.0396, + "num_input_tokens_seen": 15239328, + "step": 72210 + }, + { + "epoch": 7.944444444444445, + "grad_norm": 1.9294687509536743, + "learning_rate": 3.770975294120186e-05, + "loss": 0.068, + "num_input_tokens_seen": 15240384, + "step": 72215 + }, + { + "epoch": 7.944994499449945, + "grad_norm": 0.5359114408493042, + "learning_rate": 3.770768611982171e-05, + "loss": 0.0265, + "num_input_tokens_seen": 15241440, + "step": 72220 + }, + { + "epoch": 7.945544554455445, + "grad_norm": 0.8444489240646362, + "learning_rate": 3.770561918132092e-05, + "loss": 0.0536, + "num_input_tokens_seen": 15242464, + "step": 72225 + }, + { + "epoch": 7.946094609460946, + "grad_norm": 0.017674842849373817, + "learning_rate": 3.7703552125718525e-05, + "loss": 0.0383, + "num_input_tokens_seen": 15243488, + "step": 72230 + }, + { + "epoch": 7.946644664466446, + "grad_norm": 0.013924404047429562, + "learning_rate": 3.770148495303359e-05, + "loss": 0.0093, + "num_input_tokens_seen": 15244512, + "step": 72235 + }, + { + "epoch": 7.947194719471947, + "grad_norm": 0.4989818334579468, + "learning_rate": 3.7699417663285175e-05, + "loss": 0.085, + "num_input_tokens_seen": 15245568, + "step": 72240 + }, + { + "epoch": 7.947744774477448, + "grad_norm": 0.03755790367722511, + "learning_rate": 3.7697350256492316e-05, + "loss": 0.0834, + "num_input_tokens_seen": 15246656, + "step": 72245 + }, + { + "epoch": 7.948294829482949, + "grad_norm": 1.5489492416381836, + "learning_rate": 3.769528273267406e-05, + "loss": 0.0842, + "num_input_tokens_seen": 15247744, + "step": 72250 + }, + { + "epoch": 7.948844884488449, + "grad_norm": 0.4028184115886688, + "learning_rate": 3.769321509184949e-05, + "loss": 0.0114, + "num_input_tokens_seen": 15248864, + "step": 72255 + }, + { + "epoch": 7.949394939493949, + "grad_norm": 0.037166863679885864, + "learning_rate": 3.7691147334037645e-05, + "loss": 0.0386, + "num_input_tokens_seen": 15249856, + "step": 72260 + }, + { + "epoch": 7.94994499449945, + "grad_norm": 1.8979434967041016, + "learning_rate": 3.768907945925758e-05, + "loss": 0.0528, + "num_input_tokens_seen": 15250880, + "step": 72265 + }, + { + "epoch": 7.9504950495049505, + "grad_norm": 0.01417498104274273, + "learning_rate": 3.768701146752836e-05, + "loss": 0.0468, + "num_input_tokens_seen": 15251904, + "step": 72270 + }, + { + "epoch": 7.951045104510451, + "grad_norm": 1.6268115043640137, + "learning_rate": 3.768494335886904e-05, + "loss": 0.0848, + "num_input_tokens_seen": 15252960, + "step": 72275 + }, + { + "epoch": 7.951595159515952, + "grad_norm": 0.2399183213710785, + "learning_rate": 3.768287513329869e-05, + "loss": 0.0562, + "num_input_tokens_seen": 15254048, + "step": 72280 + }, + { + "epoch": 7.952145214521452, + "grad_norm": 0.7337198853492737, + "learning_rate": 3.768080679083637e-05, + "loss": 0.0857, + "num_input_tokens_seen": 15255136, + "step": 72285 + }, + { + "epoch": 7.952695269526953, + "grad_norm": 0.08049492537975311, + "learning_rate": 3.767873833150112e-05, + "loss": 0.0607, + "num_input_tokens_seen": 15256256, + "step": 72290 + }, + { + "epoch": 7.953245324532453, + "grad_norm": 0.041422877460718155, + "learning_rate": 3.767666975531203e-05, + "loss": 0.0106, + "num_input_tokens_seen": 15257344, + "step": 72295 + }, + { + "epoch": 7.9537953795379535, + "grad_norm": 1.1295384168624878, + "learning_rate": 3.767460106228816e-05, + "loss": 0.0511, + "num_input_tokens_seen": 15258368, + "step": 72300 + }, + { + "epoch": 7.9543454345434546, + "grad_norm": 0.8854425549507141, + "learning_rate": 3.767253225244859e-05, + "loss": 0.0502, + "num_input_tokens_seen": 15259424, + "step": 72305 + }, + { + "epoch": 7.954895489548955, + "grad_norm": 0.13703076541423798, + "learning_rate": 3.767046332581235e-05, + "loss": 0.0708, + "num_input_tokens_seen": 15260544, + "step": 72310 + }, + { + "epoch": 7.955445544554456, + "grad_norm": 1.4377409219741821, + "learning_rate": 3.766839428239853e-05, + "loss": 0.1674, + "num_input_tokens_seen": 15261632, + "step": 72315 + }, + { + "epoch": 7.955995599559956, + "grad_norm": 0.09457887709140778, + "learning_rate": 3.766632512222621e-05, + "loss": 0.0052, + "num_input_tokens_seen": 15262656, + "step": 72320 + }, + { + "epoch": 7.956545654565456, + "grad_norm": 0.5096108317375183, + "learning_rate": 3.766425584531445e-05, + "loss": 0.0819, + "num_input_tokens_seen": 15263680, + "step": 72325 + }, + { + "epoch": 7.957095709570957, + "grad_norm": 0.6492971777915955, + "learning_rate": 3.766218645168231e-05, + "loss": 0.0369, + "num_input_tokens_seen": 15264736, + "step": 72330 + }, + { + "epoch": 7.957645764576458, + "grad_norm": 0.3394947052001953, + "learning_rate": 3.766011694134888e-05, + "loss": 0.0762, + "num_input_tokens_seen": 15265856, + "step": 72335 + }, + { + "epoch": 7.958195819581958, + "grad_norm": 0.11120793223381042, + "learning_rate": 3.765804731433322e-05, + "loss": 0.0467, + "num_input_tokens_seen": 15266912, + "step": 72340 + }, + { + "epoch": 7.958745874587459, + "grad_norm": 0.28868407011032104, + "learning_rate": 3.765597757065442e-05, + "loss": 0.0185, + "num_input_tokens_seen": 15267936, + "step": 72345 + }, + { + "epoch": 7.959295929592959, + "grad_norm": 0.44234758615493774, + "learning_rate": 3.765390771033154e-05, + "loss": 0.0191, + "num_input_tokens_seen": 15268896, + "step": 72350 + }, + { + "epoch": 7.95984598459846, + "grad_norm": 0.0709415152668953, + "learning_rate": 3.7651837733383676e-05, + "loss": 0.0197, + "num_input_tokens_seen": 15269984, + "step": 72355 + }, + { + "epoch": 7.96039603960396, + "grad_norm": 0.05052074044942856, + "learning_rate": 3.764976763982989e-05, + "loss": 0.0282, + "num_input_tokens_seen": 15271040, + "step": 72360 + }, + { + "epoch": 7.960946094609461, + "grad_norm": 0.12384543567895889, + "learning_rate": 3.764769742968926e-05, + "loss": 0.028, + "num_input_tokens_seen": 15272128, + "step": 72365 + }, + { + "epoch": 7.961496149614962, + "grad_norm": 0.10355377942323685, + "learning_rate": 3.7645627102980875e-05, + "loss": 0.008, + "num_input_tokens_seen": 15273184, + "step": 72370 + }, + { + "epoch": 7.962046204620462, + "grad_norm": 0.2664492130279541, + "learning_rate": 3.764355665972381e-05, + "loss": 0.0775, + "num_input_tokens_seen": 15274272, + "step": 72375 + }, + { + "epoch": 7.962596259625963, + "grad_norm": 0.2125023603439331, + "learning_rate": 3.764148609993715e-05, + "loss": 0.1149, + "num_input_tokens_seen": 15275360, + "step": 72380 + }, + { + "epoch": 7.963146314631463, + "grad_norm": 0.06656219065189362, + "learning_rate": 3.763941542363999e-05, + "loss": 0.02, + "num_input_tokens_seen": 15276384, + "step": 72385 + }, + { + "epoch": 7.963696369636963, + "grad_norm": 1.2993974685668945, + "learning_rate": 3.763734463085139e-05, + "loss": 0.0562, + "num_input_tokens_seen": 15277440, + "step": 72390 + }, + { + "epoch": 7.9642464246424645, + "grad_norm": 0.6910961866378784, + "learning_rate": 3.763527372159046e-05, + "loss": 0.0292, + "num_input_tokens_seen": 15278560, + "step": 72395 + }, + { + "epoch": 7.964796479647965, + "grad_norm": 0.1843768209218979, + "learning_rate": 3.763320269587627e-05, + "loss": 0.0095, + "num_input_tokens_seen": 15279552, + "step": 72400 + }, + { + "epoch": 7.965346534653465, + "grad_norm": 0.09579446166753769, + "learning_rate": 3.7631131553727905e-05, + "loss": 0.0269, + "num_input_tokens_seen": 15280608, + "step": 72405 + }, + { + "epoch": 7.965896589658966, + "grad_norm": 1.7273848056793213, + "learning_rate": 3.7629060295164466e-05, + "loss": 0.0409, + "num_input_tokens_seen": 15281600, + "step": 72410 + }, + { + "epoch": 7.966446644664466, + "grad_norm": 2.0194952487945557, + "learning_rate": 3.7626988920205045e-05, + "loss": 0.1126, + "num_input_tokens_seen": 15282688, + "step": 72415 + }, + { + "epoch": 7.966996699669967, + "grad_norm": 0.02418696880340576, + "learning_rate": 3.762491742886872e-05, + "loss": 0.1923, + "num_input_tokens_seen": 15283776, + "step": 72420 + }, + { + "epoch": 7.9675467546754675, + "grad_norm": 0.027885375544428825, + "learning_rate": 3.762284582117459e-05, + "loss": 0.0911, + "num_input_tokens_seen": 15284864, + "step": 72425 + }, + { + "epoch": 7.968096809680969, + "grad_norm": 0.04817179962992668, + "learning_rate": 3.762077409714174e-05, + "loss": 0.045, + "num_input_tokens_seen": 15285920, + "step": 72430 + }, + { + "epoch": 7.968646864686469, + "grad_norm": 0.0353679396212101, + "learning_rate": 3.7618702256789285e-05, + "loss": 0.0353, + "num_input_tokens_seen": 15287008, + "step": 72435 + }, + { + "epoch": 7.969196919691969, + "grad_norm": 1.0764193534851074, + "learning_rate": 3.7616630300136296e-05, + "loss": 0.0233, + "num_input_tokens_seen": 15288064, + "step": 72440 + }, + { + "epoch": 7.96974697469747, + "grad_norm": 0.7742969989776611, + "learning_rate": 3.7614558227201886e-05, + "loss": 0.0267, + "num_input_tokens_seen": 15289088, + "step": 72445 + }, + { + "epoch": 7.97029702970297, + "grad_norm": 0.037272918969392776, + "learning_rate": 3.761248603800514e-05, + "loss": 0.0281, + "num_input_tokens_seen": 15290176, + "step": 72450 + }, + { + "epoch": 7.9708470847084705, + "grad_norm": 0.02437691204249859, + "learning_rate": 3.761041373256517e-05, + "loss": 0.0111, + "num_input_tokens_seen": 15291200, + "step": 72455 + }, + { + "epoch": 7.971397139713972, + "grad_norm": 0.05270180478692055, + "learning_rate": 3.760834131090106e-05, + "loss": 0.1294, + "num_input_tokens_seen": 15292256, + "step": 72460 + }, + { + "epoch": 7.971947194719472, + "grad_norm": 1.5987333059310913, + "learning_rate": 3.7606268773031926e-05, + "loss": 0.0725, + "num_input_tokens_seen": 15293376, + "step": 72465 + }, + { + "epoch": 7.972497249724973, + "grad_norm": 0.7384251952171326, + "learning_rate": 3.7604196118976854e-05, + "loss": 0.0238, + "num_input_tokens_seen": 15294400, + "step": 72470 + }, + { + "epoch": 7.973047304730473, + "grad_norm": 0.2338637113571167, + "learning_rate": 3.760212334875496e-05, + "loss": 0.0155, + "num_input_tokens_seen": 15295392, + "step": 72475 + }, + { + "epoch": 7.973597359735973, + "grad_norm": 0.011628111824393272, + "learning_rate": 3.760005046238534e-05, + "loss": 0.0276, + "num_input_tokens_seen": 15296480, + "step": 72480 + }, + { + "epoch": 7.974147414741474, + "grad_norm": 0.018927987664937973, + "learning_rate": 3.7597977459887105e-05, + "loss": 0.0304, + "num_input_tokens_seen": 15297536, + "step": 72485 + }, + { + "epoch": 7.974697469746975, + "grad_norm": 0.0524427704513073, + "learning_rate": 3.759590434127935e-05, + "loss": 0.0393, + "num_input_tokens_seen": 15298592, + "step": 72490 + }, + { + "epoch": 7.975247524752476, + "grad_norm": 0.022006666287779808, + "learning_rate": 3.759383110658119e-05, + "loss": 0.0957, + "num_input_tokens_seen": 15299680, + "step": 72495 + }, + { + "epoch": 7.975797579757976, + "grad_norm": 0.15710635483264923, + "learning_rate": 3.759175775581173e-05, + "loss": 0.0081, + "num_input_tokens_seen": 15300672, + "step": 72500 + }, + { + "epoch": 7.976347634763476, + "grad_norm": 0.07249738276004791, + "learning_rate": 3.758968428899009e-05, + "loss": 0.0609, + "num_input_tokens_seen": 15301760, + "step": 72505 + }, + { + "epoch": 7.976897689768977, + "grad_norm": 0.20729319751262665, + "learning_rate": 3.7587610706135366e-05, + "loss": 0.0182, + "num_input_tokens_seen": 15302720, + "step": 72510 + }, + { + "epoch": 7.977447744774477, + "grad_norm": 0.013435354456305504, + "learning_rate": 3.758553700726667e-05, + "loss": 0.0081, + "num_input_tokens_seen": 15303808, + "step": 72515 + }, + { + "epoch": 7.977997799779978, + "grad_norm": 0.6542243957519531, + "learning_rate": 3.758346319240312e-05, + "loss": 0.0617, + "num_input_tokens_seen": 15304800, + "step": 72520 + }, + { + "epoch": 7.978547854785479, + "grad_norm": 0.264211505651474, + "learning_rate": 3.7581389261563824e-05, + "loss": 0.0301, + "num_input_tokens_seen": 15305856, + "step": 72525 + }, + { + "epoch": 7.979097909790979, + "grad_norm": 0.11672024428844452, + "learning_rate": 3.757931521476791e-05, + "loss": 0.0072, + "num_input_tokens_seen": 15306880, + "step": 72530 + }, + { + "epoch": 7.97964796479648, + "grad_norm": 0.5700242519378662, + "learning_rate": 3.757724105203448e-05, + "loss": 0.0212, + "num_input_tokens_seen": 15307904, + "step": 72535 + }, + { + "epoch": 7.98019801980198, + "grad_norm": 0.020152540877461433, + "learning_rate": 3.757516677338266e-05, + "loss": 0.0091, + "num_input_tokens_seen": 15308960, + "step": 72540 + }, + { + "epoch": 7.98074807480748, + "grad_norm": 0.33588969707489014, + "learning_rate": 3.7573092378831555e-05, + "loss": 0.0318, + "num_input_tokens_seen": 15310016, + "step": 72545 + }, + { + "epoch": 7.9812981298129815, + "grad_norm": 0.47817251086235046, + "learning_rate": 3.757101786840029e-05, + "loss": 0.0323, + "num_input_tokens_seen": 15311040, + "step": 72550 + }, + { + "epoch": 7.981848184818482, + "grad_norm": 0.026357902213931084, + "learning_rate": 3.756894324210799e-05, + "loss": 0.0072, + "num_input_tokens_seen": 15312064, + "step": 72555 + }, + { + "epoch": 7.982398239823983, + "grad_norm": 0.6174798607826233, + "learning_rate": 3.756686849997377e-05, + "loss": 0.0825, + "num_input_tokens_seen": 15313152, + "step": 72560 + }, + { + "epoch": 7.982948294829483, + "grad_norm": 0.024833159521222115, + "learning_rate": 3.756479364201675e-05, + "loss": 0.0718, + "num_input_tokens_seen": 15314240, + "step": 72565 + }, + { + "epoch": 7.983498349834983, + "grad_norm": 0.6517944931983948, + "learning_rate": 3.756271866825606e-05, + "loss": 0.0318, + "num_input_tokens_seen": 15315296, + "step": 72570 + }, + { + "epoch": 7.984048404840484, + "grad_norm": 0.007756468839943409, + "learning_rate": 3.756064357871082e-05, + "loss": 0.0124, + "num_input_tokens_seen": 15316320, + "step": 72575 + }, + { + "epoch": 7.9845984598459845, + "grad_norm": 0.1637234389781952, + "learning_rate": 3.755856837340015e-05, + "loss": 0.0108, + "num_input_tokens_seen": 15317440, + "step": 72580 + }, + { + "epoch": 7.985148514851485, + "grad_norm": 0.035510241985321045, + "learning_rate": 3.7556493052343187e-05, + "loss": 0.0137, + "num_input_tokens_seen": 15318496, + "step": 72585 + }, + { + "epoch": 7.985698569856986, + "grad_norm": 0.01359882764518261, + "learning_rate": 3.755441761555906e-05, + "loss": 0.0131, + "num_input_tokens_seen": 15319456, + "step": 72590 + }, + { + "epoch": 7.986248624862486, + "grad_norm": 0.201277494430542, + "learning_rate": 3.755234206306687e-05, + "loss": 0.0206, + "num_input_tokens_seen": 15320512, + "step": 72595 + }, + { + "epoch": 7.986798679867987, + "grad_norm": 1.3999849557876587, + "learning_rate": 3.755026639488578e-05, + "loss": 0.1867, + "num_input_tokens_seen": 15321536, + "step": 72600 + }, + { + "epoch": 7.987348734873487, + "grad_norm": 0.44063252210617065, + "learning_rate": 3.7548190611034904e-05, + "loss": 0.0194, + "num_input_tokens_seen": 15322560, + "step": 72605 + }, + { + "epoch": 7.987898789878988, + "grad_norm": 0.49327751994132996, + "learning_rate": 3.754611471153337e-05, + "loss": 0.0377, + "num_input_tokens_seen": 15323616, + "step": 72610 + }, + { + "epoch": 7.988448844884489, + "grad_norm": 0.5031164884567261, + "learning_rate": 3.754403869640033e-05, + "loss": 0.0583, + "num_input_tokens_seen": 15324640, + "step": 72615 + }, + { + "epoch": 7.988998899889989, + "grad_norm": 0.13934119045734406, + "learning_rate": 3.75419625656549e-05, + "loss": 0.0098, + "num_input_tokens_seen": 15325728, + "step": 72620 + }, + { + "epoch": 7.98954895489549, + "grad_norm": 0.23599129915237427, + "learning_rate": 3.753988631931622e-05, + "loss": 0.0099, + "num_input_tokens_seen": 15326784, + "step": 72625 + }, + { + "epoch": 7.99009900990099, + "grad_norm": 0.0371781624853611, + "learning_rate": 3.753780995740342e-05, + "loss": 0.0177, + "num_input_tokens_seen": 15327808, + "step": 72630 + }, + { + "epoch": 7.99064906490649, + "grad_norm": 0.4233807325363159, + "learning_rate": 3.753573347993564e-05, + "loss": 0.0086, + "num_input_tokens_seen": 15328928, + "step": 72635 + }, + { + "epoch": 7.991199119911991, + "grad_norm": 0.6966692209243774, + "learning_rate": 3.753365688693202e-05, + "loss": 0.0461, + "num_input_tokens_seen": 15329952, + "step": 72640 + }, + { + "epoch": 7.991749174917492, + "grad_norm": 0.0956725925207138, + "learning_rate": 3.7531580178411705e-05, + "loss": 0.0099, + "num_input_tokens_seen": 15331008, + "step": 72645 + }, + { + "epoch": 7.992299229922993, + "grad_norm": 0.07163161039352417, + "learning_rate": 3.752950335439383e-05, + "loss": 0.0418, + "num_input_tokens_seen": 15332000, + "step": 72650 + }, + { + "epoch": 7.992849284928493, + "grad_norm": 1.6243312358856201, + "learning_rate": 3.7527426414897534e-05, + "loss": 0.0349, + "num_input_tokens_seen": 15333024, + "step": 72655 + }, + { + "epoch": 7.993399339933993, + "grad_norm": 0.07867331802845001, + "learning_rate": 3.7525349359941954e-05, + "loss": 0.0221, + "num_input_tokens_seen": 15334016, + "step": 72660 + }, + { + "epoch": 7.993949394939494, + "grad_norm": 0.014451646246016026, + "learning_rate": 3.7523272189546245e-05, + "loss": 0.0339, + "num_input_tokens_seen": 15335072, + "step": 72665 + }, + { + "epoch": 7.994499449944994, + "grad_norm": 0.13018324971199036, + "learning_rate": 3.752119490372954e-05, + "loss": 0.0054, + "num_input_tokens_seen": 15336160, + "step": 72670 + }, + { + "epoch": 7.9950495049504955, + "grad_norm": 0.017200510948896408, + "learning_rate": 3.7519117502510994e-05, + "loss": 0.0034, + "num_input_tokens_seen": 15337184, + "step": 72675 + }, + { + "epoch": 7.995599559955996, + "grad_norm": 0.06718774139881134, + "learning_rate": 3.7517039985909745e-05, + "loss": 0.0382, + "num_input_tokens_seen": 15338272, + "step": 72680 + }, + { + "epoch": 7.996149614961496, + "grad_norm": 1.8071818351745605, + "learning_rate": 3.751496235394495e-05, + "loss": 0.018, + "num_input_tokens_seen": 15339360, + "step": 72685 + }, + { + "epoch": 7.996699669966997, + "grad_norm": 0.010637864470481873, + "learning_rate": 3.751288460663575e-05, + "loss": 0.0699, + "num_input_tokens_seen": 15340384, + "step": 72690 + }, + { + "epoch": 7.997249724972497, + "grad_norm": 0.5904926061630249, + "learning_rate": 3.75108067440013e-05, + "loss": 0.013, + "num_input_tokens_seen": 15341440, + "step": 72695 + }, + { + "epoch": 7.997799779977997, + "grad_norm": 0.23764553666114807, + "learning_rate": 3.750872876606074e-05, + "loss": 0.0869, + "num_input_tokens_seen": 15342496, + "step": 72700 + }, + { + "epoch": 7.9983498349834985, + "grad_norm": 0.1422734409570694, + "learning_rate": 3.750665067283323e-05, + "loss": 0.0985, + "num_input_tokens_seen": 15343552, + "step": 72705 + }, + { + "epoch": 7.998899889988999, + "grad_norm": 0.07450118660926819, + "learning_rate": 3.7504572464337936e-05, + "loss": 0.0138, + "num_input_tokens_seen": 15344608, + "step": 72710 + }, + { + "epoch": 7.9994499449945, + "grad_norm": 0.008523261174559593, + "learning_rate": 3.750249414059398e-05, + "loss": 0.0246, + "num_input_tokens_seen": 15345696, + "step": 72715 + }, + { + "epoch": 8.0, + "grad_norm": 0.15833641588687897, + "learning_rate": 3.7500415701620537e-05, + "loss": 0.008, + "num_input_tokens_seen": 15346672, + "step": 72720 + }, + { + "epoch": 8.0, + "eval_loss": 0.06365779042243958, + "eval_runtime": 36.9791, + "eval_samples_per_second": 109.251, + "eval_steps_per_second": 27.313, + "num_input_tokens_seen": 15346672, + "step": 72720 + }, + { + "epoch": 8.000550055005501, + "grad_norm": 1.725860595703125, + "learning_rate": 3.749833714743677e-05, + "loss": 0.0528, + "num_input_tokens_seen": 15347728, + "step": 72725 + }, + { + "epoch": 8.001100110011, + "grad_norm": 1.704052448272705, + "learning_rate": 3.749625847806182e-05, + "loss": 0.0957, + "num_input_tokens_seen": 15348784, + "step": 72730 + }, + { + "epoch": 8.001650165016502, + "grad_norm": 0.04921539127826691, + "learning_rate": 3.7494179693514866e-05, + "loss": 0.0028, + "num_input_tokens_seen": 15349840, + "step": 72735 + }, + { + "epoch": 8.002200220022003, + "grad_norm": 0.09192293882369995, + "learning_rate": 3.749210079381504e-05, + "loss": 0.0306, + "num_input_tokens_seen": 15350928, + "step": 72740 + }, + { + "epoch": 8.002750275027502, + "grad_norm": 0.03628360852599144, + "learning_rate": 3.749002177898151e-05, + "loss": 0.0229, + "num_input_tokens_seen": 15352016, + "step": 72745 + }, + { + "epoch": 8.003300330033003, + "grad_norm": 1.1268892288208008, + "learning_rate": 3.7487942649033445e-05, + "loss": 0.1151, + "num_input_tokens_seen": 15353040, + "step": 72750 + }, + { + "epoch": 8.003850385038504, + "grad_norm": 1.5386838912963867, + "learning_rate": 3.748586340399002e-05, + "loss": 0.0486, + "num_input_tokens_seen": 15354032, + "step": 72755 + }, + { + "epoch": 8.004400440044005, + "grad_norm": 2.6603891849517822, + "learning_rate": 3.748378404387037e-05, + "loss": 0.0595, + "num_input_tokens_seen": 15355088, + "step": 72760 + }, + { + "epoch": 8.004950495049505, + "grad_norm": 1.2082823514938354, + "learning_rate": 3.7481704568693674e-05, + "loss": 0.1207, + "num_input_tokens_seen": 15356144, + "step": 72765 + }, + { + "epoch": 8.005500550055006, + "grad_norm": 0.07810434699058533, + "learning_rate": 3.74796249784791e-05, + "loss": 0.0567, + "num_input_tokens_seen": 15357168, + "step": 72770 + }, + { + "epoch": 8.006050605060507, + "grad_norm": 0.31199243664741516, + "learning_rate": 3.7477545273245807e-05, + "loss": 0.0171, + "num_input_tokens_seen": 15358192, + "step": 72775 + }, + { + "epoch": 8.006600660066006, + "grad_norm": 0.04208574816584587, + "learning_rate": 3.747546545301297e-05, + "loss": 0.0096, + "num_input_tokens_seen": 15359248, + "step": 72780 + }, + { + "epoch": 8.007150715071507, + "grad_norm": 0.030876444652676582, + "learning_rate": 3.747338551779975e-05, + "loss": 0.009, + "num_input_tokens_seen": 15360304, + "step": 72785 + }, + { + "epoch": 8.007700770077008, + "grad_norm": 0.22971297800540924, + "learning_rate": 3.747130546762532e-05, + "loss": 0.052, + "num_input_tokens_seen": 15361392, + "step": 72790 + }, + { + "epoch": 8.008250825082508, + "grad_norm": 1.580327033996582, + "learning_rate": 3.746922530250886e-05, + "loss": 0.0773, + "num_input_tokens_seen": 15362448, + "step": 72795 + }, + { + "epoch": 8.008800880088009, + "grad_norm": 1.8596906661987305, + "learning_rate": 3.746714502246953e-05, + "loss": 0.0651, + "num_input_tokens_seen": 15363504, + "step": 72800 + }, + { + "epoch": 8.00935093509351, + "grad_norm": 0.15957976877689362, + "learning_rate": 3.74650646275265e-05, + "loss": 0.0331, + "num_input_tokens_seen": 15364528, + "step": 72805 + }, + { + "epoch": 8.009900990099009, + "grad_norm": 0.5709072947502136, + "learning_rate": 3.746298411769896e-05, + "loss": 0.0428, + "num_input_tokens_seen": 15365584, + "step": 72810 + }, + { + "epoch": 8.01045104510451, + "grad_norm": 0.005410961806774139, + "learning_rate": 3.7460903493006075e-05, + "loss": 0.1353, + "num_input_tokens_seen": 15366576, + "step": 72815 + }, + { + "epoch": 8.011001100110011, + "grad_norm": 0.057801611721515656, + "learning_rate": 3.745882275346701e-05, + "loss": 0.0478, + "num_input_tokens_seen": 15367536, + "step": 72820 + }, + { + "epoch": 8.011551155115512, + "grad_norm": 0.12519316375255585, + "learning_rate": 3.7456741899100966e-05, + "loss": 0.0281, + "num_input_tokens_seen": 15368624, + "step": 72825 + }, + { + "epoch": 8.012101210121012, + "grad_norm": 0.08875782787799835, + "learning_rate": 3.745466092992711e-05, + "loss": 0.0067, + "num_input_tokens_seen": 15369584, + "step": 72830 + }, + { + "epoch": 8.012651265126513, + "grad_norm": 0.2597365081310272, + "learning_rate": 3.745257984596461e-05, + "loss": 0.0057, + "num_input_tokens_seen": 15370672, + "step": 72835 + }, + { + "epoch": 8.013201320132014, + "grad_norm": 0.04531557112932205, + "learning_rate": 3.7450498647232665e-05, + "loss": 0.0027, + "num_input_tokens_seen": 15371728, + "step": 72840 + }, + { + "epoch": 8.013751375137513, + "grad_norm": 0.20831547677516937, + "learning_rate": 3.7448417333750455e-05, + "loss": 0.0289, + "num_input_tokens_seen": 15372752, + "step": 72845 + }, + { + "epoch": 8.014301430143014, + "grad_norm": 0.5051479339599609, + "learning_rate": 3.744633590553714e-05, + "loss": 0.052, + "num_input_tokens_seen": 15373776, + "step": 72850 + }, + { + "epoch": 8.014851485148515, + "grad_norm": 0.06190621107816696, + "learning_rate": 3.744425436261193e-05, + "loss": 0.0235, + "num_input_tokens_seen": 15374800, + "step": 72855 + }, + { + "epoch": 8.015401540154015, + "grad_norm": 1.2341176271438599, + "learning_rate": 3.7442172704993995e-05, + "loss": 0.1141, + "num_input_tokens_seen": 15375888, + "step": 72860 + }, + { + "epoch": 8.015951595159516, + "grad_norm": 0.9087260365486145, + "learning_rate": 3.7440090932702524e-05, + "loss": 0.0687, + "num_input_tokens_seen": 15376944, + "step": 72865 + }, + { + "epoch": 8.016501650165017, + "grad_norm": 0.02606320194900036, + "learning_rate": 3.743800904575671e-05, + "loss": 0.0753, + "num_input_tokens_seen": 15377936, + "step": 72870 + }, + { + "epoch": 8.017051705170518, + "grad_norm": 0.029027074575424194, + "learning_rate": 3.743592704417573e-05, + "loss": 0.0052, + "num_input_tokens_seen": 15378992, + "step": 72875 + }, + { + "epoch": 8.017601760176017, + "grad_norm": 0.015583738684654236, + "learning_rate": 3.743384492797878e-05, + "loss": 0.0731, + "num_input_tokens_seen": 15380048, + "step": 72880 + }, + { + "epoch": 8.018151815181518, + "grad_norm": 0.020317286252975464, + "learning_rate": 3.743176269718504e-05, + "loss": 0.0278, + "num_input_tokens_seen": 15381072, + "step": 72885 + }, + { + "epoch": 8.01870187018702, + "grad_norm": 0.053269077092409134, + "learning_rate": 3.742968035181371e-05, + "loss": 0.0116, + "num_input_tokens_seen": 15382160, + "step": 72890 + }, + { + "epoch": 8.019251925192519, + "grad_norm": 0.1821296364068985, + "learning_rate": 3.7427597891883985e-05, + "loss": 0.0108, + "num_input_tokens_seen": 15383184, + "step": 72895 + }, + { + "epoch": 8.01980198019802, + "grad_norm": 1.135807752609253, + "learning_rate": 3.7425515317415064e-05, + "loss": 0.0317, + "num_input_tokens_seen": 15384272, + "step": 72900 + }, + { + "epoch": 8.020352035203521, + "grad_norm": 0.08668234199285507, + "learning_rate": 3.7423432628426116e-05, + "loss": 0.0465, + "num_input_tokens_seen": 15385328, + "step": 72905 + }, + { + "epoch": 8.02090209020902, + "grad_norm": 0.04363778233528137, + "learning_rate": 3.742134982493635e-05, + "loss": 0.0066, + "num_input_tokens_seen": 15386352, + "step": 72910 + }, + { + "epoch": 8.021452145214521, + "grad_norm": 0.0066305045038461685, + "learning_rate": 3.741926690696497e-05, + "loss": 0.0071, + "num_input_tokens_seen": 15387440, + "step": 72915 + }, + { + "epoch": 8.022002200220022, + "grad_norm": 0.04323312267661095, + "learning_rate": 3.7417183874531165e-05, + "loss": 0.0271, + "num_input_tokens_seen": 15388496, + "step": 72920 + }, + { + "epoch": 8.022552255225522, + "grad_norm": 0.01761561632156372, + "learning_rate": 3.7415100727654137e-05, + "loss": 0.0078, + "num_input_tokens_seen": 15389584, + "step": 72925 + }, + { + "epoch": 8.023102310231023, + "grad_norm": 0.04096471518278122, + "learning_rate": 3.741301746635308e-05, + "loss": 0.0156, + "num_input_tokens_seen": 15390640, + "step": 72930 + }, + { + "epoch": 8.023652365236524, + "grad_norm": 0.0577227957546711, + "learning_rate": 3.7410934090647193e-05, + "loss": 0.1108, + "num_input_tokens_seen": 15391696, + "step": 72935 + }, + { + "epoch": 8.024202420242025, + "grad_norm": 0.10397037118673325, + "learning_rate": 3.740885060055569e-05, + "loss": 0.0199, + "num_input_tokens_seen": 15392816, + "step": 72940 + }, + { + "epoch": 8.024752475247524, + "grad_norm": 0.061401791870594025, + "learning_rate": 3.740676699609776e-05, + "loss": 0.0489, + "num_input_tokens_seen": 15393840, + "step": 72945 + }, + { + "epoch": 8.025302530253025, + "grad_norm": 0.15274138748645782, + "learning_rate": 3.740468327729261e-05, + "loss": 0.0047, + "num_input_tokens_seen": 15394928, + "step": 72950 + }, + { + "epoch": 8.025852585258527, + "grad_norm": 0.19802844524383545, + "learning_rate": 3.740259944415945e-05, + "loss": 0.0186, + "num_input_tokens_seen": 15396016, + "step": 72955 + }, + { + "epoch": 8.026402640264026, + "grad_norm": 0.04258505627512932, + "learning_rate": 3.7400515496717484e-05, + "loss": 0.0036, + "num_input_tokens_seen": 15397040, + "step": 72960 + }, + { + "epoch": 8.026952695269527, + "grad_norm": 0.008534799329936504, + "learning_rate": 3.739843143498591e-05, + "loss": 0.0565, + "num_input_tokens_seen": 15398096, + "step": 72965 + }, + { + "epoch": 8.027502750275028, + "grad_norm": 0.6861963272094727, + "learning_rate": 3.739634725898396e-05, + "loss": 0.1005, + "num_input_tokens_seen": 15399184, + "step": 72970 + }, + { + "epoch": 8.028052805280527, + "grad_norm": 1.2018133401870728, + "learning_rate": 3.7394262968730804e-05, + "loss": 0.066, + "num_input_tokens_seen": 15400272, + "step": 72975 + }, + { + "epoch": 8.028602860286028, + "grad_norm": 0.30586305260658264, + "learning_rate": 3.739217856424568e-05, + "loss": 0.0089, + "num_input_tokens_seen": 15401328, + "step": 72980 + }, + { + "epoch": 8.02915291529153, + "grad_norm": 0.42151153087615967, + "learning_rate": 3.7390094045547804e-05, + "loss": 0.0103, + "num_input_tokens_seen": 15402384, + "step": 72985 + }, + { + "epoch": 8.029702970297029, + "grad_norm": 0.012745494022965431, + "learning_rate": 3.7388009412656366e-05, + "loss": 0.0283, + "num_input_tokens_seen": 15403408, + "step": 72990 + }, + { + "epoch": 8.03025302530253, + "grad_norm": 0.03551141917705536, + "learning_rate": 3.738592466559059e-05, + "loss": 0.008, + "num_input_tokens_seen": 15404528, + "step": 72995 + }, + { + "epoch": 8.030803080308031, + "grad_norm": 0.07029155641794205, + "learning_rate": 3.738383980436969e-05, + "loss": 0.005, + "num_input_tokens_seen": 15405584, + "step": 73000 + }, + { + "epoch": 8.031353135313532, + "grad_norm": 0.010036284103989601, + "learning_rate": 3.738175482901288e-05, + "loss": 0.1234, + "num_input_tokens_seen": 15406672, + "step": 73005 + }, + { + "epoch": 8.031903190319031, + "grad_norm": 0.01468099094927311, + "learning_rate": 3.737966973953938e-05, + "loss": 0.0237, + "num_input_tokens_seen": 15407728, + "step": 73010 + }, + { + "epoch": 8.032453245324533, + "grad_norm": 0.47035565972328186, + "learning_rate": 3.737758453596841e-05, + "loss": 0.0196, + "num_input_tokens_seen": 15408784, + "step": 73015 + }, + { + "epoch": 8.033003300330034, + "grad_norm": 0.07062403857707977, + "learning_rate": 3.7375499218319177e-05, + "loss": 0.0503, + "num_input_tokens_seen": 15409840, + "step": 73020 + }, + { + "epoch": 8.033553355335533, + "grad_norm": 2.208692789077759, + "learning_rate": 3.73734137866109e-05, + "loss": 0.1021, + "num_input_tokens_seen": 15410896, + "step": 73025 + }, + { + "epoch": 8.034103410341034, + "grad_norm": 0.9028143882751465, + "learning_rate": 3.737132824086281e-05, + "loss": 0.0256, + "num_input_tokens_seen": 15411920, + "step": 73030 + }, + { + "epoch": 8.034653465346535, + "grad_norm": 0.005284197628498077, + "learning_rate": 3.736924258109412e-05, + "loss": 0.0134, + "num_input_tokens_seen": 15413008, + "step": 73035 + }, + { + "epoch": 8.035203520352034, + "grad_norm": 0.04406827688217163, + "learning_rate": 3.7367156807324066e-05, + "loss": 0.043, + "num_input_tokens_seen": 15414032, + "step": 73040 + }, + { + "epoch": 8.035753575357536, + "grad_norm": 0.028246307745575905, + "learning_rate": 3.736507091957186e-05, + "loss": 0.0036, + "num_input_tokens_seen": 15415024, + "step": 73045 + }, + { + "epoch": 8.036303630363037, + "grad_norm": 1.3352750539779663, + "learning_rate": 3.7362984917856724e-05, + "loss": 0.1193, + "num_input_tokens_seen": 15416112, + "step": 73050 + }, + { + "epoch": 8.036853685368538, + "grad_norm": 0.19750218093395233, + "learning_rate": 3.736089880219789e-05, + "loss": 0.0766, + "num_input_tokens_seen": 15417168, + "step": 73055 + }, + { + "epoch": 8.037403740374037, + "grad_norm": 0.21250246465206146, + "learning_rate": 3.7358812572614586e-05, + "loss": 0.0203, + "num_input_tokens_seen": 15418288, + "step": 73060 + }, + { + "epoch": 8.037953795379538, + "grad_norm": 1.0883339643478394, + "learning_rate": 3.7356726229126036e-05, + "loss": 0.0218, + "num_input_tokens_seen": 15419280, + "step": 73065 + }, + { + "epoch": 8.03850385038504, + "grad_norm": 0.11075592786073685, + "learning_rate": 3.735463977175147e-05, + "loss": 0.0092, + "num_input_tokens_seen": 15420336, + "step": 73070 + }, + { + "epoch": 8.039053905390539, + "grad_norm": 0.020378828048706055, + "learning_rate": 3.735255320051012e-05, + "loss": 0.0257, + "num_input_tokens_seen": 15421424, + "step": 73075 + }, + { + "epoch": 8.03960396039604, + "grad_norm": 0.10825783014297485, + "learning_rate": 3.73504665154212e-05, + "loss": 0.0147, + "num_input_tokens_seen": 15422480, + "step": 73080 + }, + { + "epoch": 8.04015401540154, + "grad_norm": 1.2651909589767456, + "learning_rate": 3.734837971650397e-05, + "loss": 0.0716, + "num_input_tokens_seen": 15423536, + "step": 73085 + }, + { + "epoch": 8.04070407040704, + "grad_norm": 0.7995374798774719, + "learning_rate": 3.734629280377765e-05, + "loss": 0.081, + "num_input_tokens_seen": 15424624, + "step": 73090 + }, + { + "epoch": 8.041254125412541, + "grad_norm": 0.31500527262687683, + "learning_rate": 3.734420577726146e-05, + "loss": 0.0781, + "num_input_tokens_seen": 15425616, + "step": 73095 + }, + { + "epoch": 8.041804180418042, + "grad_norm": 0.043498288840055466, + "learning_rate": 3.7342118636974664e-05, + "loss": 0.008, + "num_input_tokens_seen": 15426704, + "step": 73100 + }, + { + "epoch": 8.042354235423542, + "grad_norm": 0.25970056653022766, + "learning_rate": 3.734003138293648e-05, + "loss": 0.0707, + "num_input_tokens_seen": 15427792, + "step": 73105 + }, + { + "epoch": 8.042904290429043, + "grad_norm": 0.1709185689687729, + "learning_rate": 3.733794401516614e-05, + "loss": 0.0502, + "num_input_tokens_seen": 15428880, + "step": 73110 + }, + { + "epoch": 8.043454345434544, + "grad_norm": 0.5371401906013489, + "learning_rate": 3.73358565336829e-05, + "loss": 0.0549, + "num_input_tokens_seen": 15429872, + "step": 73115 + }, + { + "epoch": 8.044004400440045, + "grad_norm": 0.2117753028869629, + "learning_rate": 3.7333768938505984e-05, + "loss": 0.0333, + "num_input_tokens_seen": 15430960, + "step": 73120 + }, + { + "epoch": 8.044554455445544, + "grad_norm": 0.11909570544958115, + "learning_rate": 3.7331681229654635e-05, + "loss": 0.023, + "num_input_tokens_seen": 15432016, + "step": 73125 + }, + { + "epoch": 8.045104510451045, + "grad_norm": 0.06146356463432312, + "learning_rate": 3.73295934071481e-05, + "loss": 0.0566, + "num_input_tokens_seen": 15433040, + "step": 73130 + }, + { + "epoch": 8.045654565456546, + "grad_norm": 0.04431966692209244, + "learning_rate": 3.7327505471005625e-05, + "loss": 0.011, + "num_input_tokens_seen": 15434096, + "step": 73135 + }, + { + "epoch": 8.046204620462046, + "grad_norm": 0.009232057258486748, + "learning_rate": 3.7325417421246446e-05, + "loss": 0.0613, + "num_input_tokens_seen": 15435088, + "step": 73140 + }, + { + "epoch": 8.046754675467547, + "grad_norm": 0.635549008846283, + "learning_rate": 3.732332925788981e-05, + "loss": 0.0216, + "num_input_tokens_seen": 15436144, + "step": 73145 + }, + { + "epoch": 8.047304730473048, + "grad_norm": 0.3040620684623718, + "learning_rate": 3.7321240980954955e-05, + "loss": 0.0657, + "num_input_tokens_seen": 15437200, + "step": 73150 + }, + { + "epoch": 8.047854785478547, + "grad_norm": 0.011115879751741886, + "learning_rate": 3.731915259046113e-05, + "loss": 0.0161, + "num_input_tokens_seen": 15438192, + "step": 73155 + }, + { + "epoch": 8.048404840484048, + "grad_norm": 0.014096884988248348, + "learning_rate": 3.731706408642759e-05, + "loss": 0.001, + "num_input_tokens_seen": 15439216, + "step": 73160 + }, + { + "epoch": 8.04895489548955, + "grad_norm": 0.046121321618556976, + "learning_rate": 3.7314975468873586e-05, + "loss": 0.0205, + "num_input_tokens_seen": 15440208, + "step": 73165 + }, + { + "epoch": 8.049504950495049, + "grad_norm": 0.05266450345516205, + "learning_rate": 3.7312886737818364e-05, + "loss": 0.0054, + "num_input_tokens_seen": 15441296, + "step": 73170 + }, + { + "epoch": 8.05005500550055, + "grad_norm": 0.03794208914041519, + "learning_rate": 3.731079789328117e-05, + "loss": 0.0032, + "num_input_tokens_seen": 15442320, + "step": 73175 + }, + { + "epoch": 8.05060506050605, + "grad_norm": 2.0411128997802734, + "learning_rate": 3.730870893528126e-05, + "loss": 0.0177, + "num_input_tokens_seen": 15443344, + "step": 73180 + }, + { + "epoch": 8.051155115511552, + "grad_norm": 0.16163180768489838, + "learning_rate": 3.730661986383788e-05, + "loss": 0.0792, + "num_input_tokens_seen": 15444400, + "step": 73185 + }, + { + "epoch": 8.051705170517051, + "grad_norm": 1.1045573949813843, + "learning_rate": 3.73045306789703e-05, + "loss": 0.04, + "num_input_tokens_seen": 15445488, + "step": 73190 + }, + { + "epoch": 8.052255225522552, + "grad_norm": 0.019560489803552628, + "learning_rate": 3.730244138069775e-05, + "loss": 0.0822, + "num_input_tokens_seen": 15446576, + "step": 73195 + }, + { + "epoch": 8.052805280528053, + "grad_norm": 0.2930563688278198, + "learning_rate": 3.730035196903952e-05, + "loss": 0.0292, + "num_input_tokens_seen": 15447632, + "step": 73200 + }, + { + "epoch": 8.053355335533553, + "grad_norm": 0.04435209929943085, + "learning_rate": 3.729826244401484e-05, + "loss": 0.0712, + "num_input_tokens_seen": 15448688, + "step": 73205 + }, + { + "epoch": 8.053905390539054, + "grad_norm": 0.4103235602378845, + "learning_rate": 3.729617280564297e-05, + "loss": 0.0137, + "num_input_tokens_seen": 15449808, + "step": 73210 + }, + { + "epoch": 8.054455445544555, + "grad_norm": 0.8641834259033203, + "learning_rate": 3.7294083053943184e-05, + "loss": 0.0607, + "num_input_tokens_seen": 15450864, + "step": 73215 + }, + { + "epoch": 8.055005500550054, + "grad_norm": 0.03921687230467796, + "learning_rate": 3.7291993188934726e-05, + "loss": 0.0294, + "num_input_tokens_seen": 15451888, + "step": 73220 + }, + { + "epoch": 8.055555555555555, + "grad_norm": 0.042747050523757935, + "learning_rate": 3.7289903210636865e-05, + "loss": 0.1446, + "num_input_tokens_seen": 15452944, + "step": 73225 + }, + { + "epoch": 8.056105610561056, + "grad_norm": 0.0421992652118206, + "learning_rate": 3.728781311906887e-05, + "loss": 0.0154, + "num_input_tokens_seen": 15453968, + "step": 73230 + }, + { + "epoch": 8.056655665566556, + "grad_norm": 0.3848402798175812, + "learning_rate": 3.728572291425001e-05, + "loss": 0.0571, + "num_input_tokens_seen": 15455024, + "step": 73235 + }, + { + "epoch": 8.057205720572057, + "grad_norm": 1.1095486879348755, + "learning_rate": 3.728363259619951e-05, + "loss": 0.0804, + "num_input_tokens_seen": 15456112, + "step": 73240 + }, + { + "epoch": 8.057755775577558, + "grad_norm": 0.10650566220283508, + "learning_rate": 3.7281542164936675e-05, + "loss": 0.0746, + "num_input_tokens_seen": 15457168, + "step": 73245 + }, + { + "epoch": 8.058305830583059, + "grad_norm": 0.09172373265028, + "learning_rate": 3.7279451620480765e-05, + "loss": 0.0188, + "num_input_tokens_seen": 15458224, + "step": 73250 + }, + { + "epoch": 8.058855885588558, + "grad_norm": 1.0090301036834717, + "learning_rate": 3.7277360962851036e-05, + "loss": 0.0284, + "num_input_tokens_seen": 15459280, + "step": 73255 + }, + { + "epoch": 8.05940594059406, + "grad_norm": 0.17373719811439514, + "learning_rate": 3.7275270192066764e-05, + "loss": 0.0564, + "num_input_tokens_seen": 15460432, + "step": 73260 + }, + { + "epoch": 8.05995599559956, + "grad_norm": 0.13635092973709106, + "learning_rate": 3.7273179308147215e-05, + "loss": 0.0172, + "num_input_tokens_seen": 15461456, + "step": 73265 + }, + { + "epoch": 8.06050605060506, + "grad_norm": 0.045334793627262115, + "learning_rate": 3.7271088311111654e-05, + "loss": 0.039, + "num_input_tokens_seen": 15462544, + "step": 73270 + }, + { + "epoch": 8.061056105610561, + "grad_norm": 0.013713780790567398, + "learning_rate": 3.7268997200979373e-05, + "loss": 0.0175, + "num_input_tokens_seen": 15463664, + "step": 73275 + }, + { + "epoch": 8.061606160616062, + "grad_norm": 0.03692486882209778, + "learning_rate": 3.726690597776963e-05, + "loss": 0.0152, + "num_input_tokens_seen": 15464656, + "step": 73280 + }, + { + "epoch": 8.062156215621561, + "grad_norm": 1.6181633472442627, + "learning_rate": 3.72648146415017e-05, + "loss": 0.058, + "num_input_tokens_seen": 15465712, + "step": 73285 + }, + { + "epoch": 8.062706270627062, + "grad_norm": 0.03914858400821686, + "learning_rate": 3.726272319219485e-05, + "loss": 0.005, + "num_input_tokens_seen": 15466704, + "step": 73290 + }, + { + "epoch": 8.063256325632564, + "grad_norm": 0.1573394536972046, + "learning_rate": 3.726063162986838e-05, + "loss": 0.0243, + "num_input_tokens_seen": 15467728, + "step": 73295 + }, + { + "epoch": 8.063806380638065, + "grad_norm": 0.1971118003129959, + "learning_rate": 3.7258539954541536e-05, + "loss": 0.1082, + "num_input_tokens_seen": 15468752, + "step": 73300 + }, + { + "epoch": 8.064356435643564, + "grad_norm": 0.024337513372302055, + "learning_rate": 3.725644816623363e-05, + "loss": 0.0049, + "num_input_tokens_seen": 15469808, + "step": 73305 + }, + { + "epoch": 8.064906490649065, + "grad_norm": 0.09164217114448547, + "learning_rate": 3.72543562649639e-05, + "loss": 0.0854, + "num_input_tokens_seen": 15470864, + "step": 73310 + }, + { + "epoch": 8.065456545654566, + "grad_norm": 0.07447145134210587, + "learning_rate": 3.725226425075166e-05, + "loss": 0.0086, + "num_input_tokens_seen": 15471952, + "step": 73315 + }, + { + "epoch": 8.066006600660065, + "grad_norm": 1.965440273284912, + "learning_rate": 3.725017212361618e-05, + "loss": 0.0257, + "num_input_tokens_seen": 15473008, + "step": 73320 + }, + { + "epoch": 8.066556655665567, + "grad_norm": 2.019761562347412, + "learning_rate": 3.724807988357674e-05, + "loss": 0.0293, + "num_input_tokens_seen": 15474000, + "step": 73325 + }, + { + "epoch": 8.067106710671068, + "grad_norm": 0.03924521803855896, + "learning_rate": 3.724598753065263e-05, + "loss": 0.0066, + "num_input_tokens_seen": 15475088, + "step": 73330 + }, + { + "epoch": 8.067656765676567, + "grad_norm": 0.7831127643585205, + "learning_rate": 3.7243895064863124e-05, + "loss": 0.0172, + "num_input_tokens_seen": 15476144, + "step": 73335 + }, + { + "epoch": 8.068206820682068, + "grad_norm": 0.6711649298667908, + "learning_rate": 3.7241802486227505e-05, + "loss": 0.0524, + "num_input_tokens_seen": 15477168, + "step": 73340 + }, + { + "epoch": 8.06875687568757, + "grad_norm": 0.43459272384643555, + "learning_rate": 3.7239709794765085e-05, + "loss": 0.0123, + "num_input_tokens_seen": 15478192, + "step": 73345 + }, + { + "epoch": 8.069306930693068, + "grad_norm": 0.02006617560982704, + "learning_rate": 3.723761699049512e-05, + "loss": 0.056, + "num_input_tokens_seen": 15479280, + "step": 73350 + }, + { + "epoch": 8.06985698569857, + "grad_norm": 0.023450512439012527, + "learning_rate": 3.7235524073436915e-05, + "loss": 0.0083, + "num_input_tokens_seen": 15480272, + "step": 73355 + }, + { + "epoch": 8.07040704070407, + "grad_norm": 1.0465831756591797, + "learning_rate": 3.7233431043609755e-05, + "loss": 0.0447, + "num_input_tokens_seen": 15481296, + "step": 73360 + }, + { + "epoch": 8.070957095709572, + "grad_norm": 0.15447789430618286, + "learning_rate": 3.723133790103293e-05, + "loss": 0.0071, + "num_input_tokens_seen": 15482320, + "step": 73365 + }, + { + "epoch": 8.071507150715071, + "grad_norm": 0.014029556885361671, + "learning_rate": 3.722924464572573e-05, + "loss": 0.0028, + "num_input_tokens_seen": 15483376, + "step": 73370 + }, + { + "epoch": 8.072057205720572, + "grad_norm": 1.10645592212677, + "learning_rate": 3.7227151277707464e-05, + "loss": 0.0374, + "num_input_tokens_seen": 15484496, + "step": 73375 + }, + { + "epoch": 8.072607260726073, + "grad_norm": 0.4058409631252289, + "learning_rate": 3.7225057796997396e-05, + "loss": 0.0106, + "num_input_tokens_seen": 15485552, + "step": 73380 + }, + { + "epoch": 8.073157315731573, + "grad_norm": 0.7137987017631531, + "learning_rate": 3.722296420361484e-05, + "loss": 0.0899, + "num_input_tokens_seen": 15486608, + "step": 73385 + }, + { + "epoch": 8.073707370737074, + "grad_norm": 0.919750452041626, + "learning_rate": 3.7220870497579096e-05, + "loss": 0.0212, + "num_input_tokens_seen": 15487696, + "step": 73390 + }, + { + "epoch": 8.074257425742575, + "grad_norm": 0.027163363993167877, + "learning_rate": 3.721877667890945e-05, + "loss": 0.0207, + "num_input_tokens_seen": 15488784, + "step": 73395 + }, + { + "epoch": 8.074807480748074, + "grad_norm": 0.09662459790706635, + "learning_rate": 3.72166827476252e-05, + "loss": 0.0088, + "num_input_tokens_seen": 15489840, + "step": 73400 + }, + { + "epoch": 8.075357535753575, + "grad_norm": 0.06703051179647446, + "learning_rate": 3.7214588703745644e-05, + "loss": 0.0247, + "num_input_tokens_seen": 15490928, + "step": 73405 + }, + { + "epoch": 8.075907590759076, + "grad_norm": 0.1571188122034073, + "learning_rate": 3.721249454729009e-05, + "loss": 0.003, + "num_input_tokens_seen": 15492016, + "step": 73410 + }, + { + "epoch": 8.076457645764576, + "grad_norm": 0.04520951956510544, + "learning_rate": 3.721040027827783e-05, + "loss": 0.0022, + "num_input_tokens_seen": 15493136, + "step": 73415 + }, + { + "epoch": 8.077007700770077, + "grad_norm": 0.03510866314172745, + "learning_rate": 3.720830589672817e-05, + "loss": 0.013, + "num_input_tokens_seen": 15494192, + "step": 73420 + }, + { + "epoch": 8.077557755775578, + "grad_norm": 0.7242265343666077, + "learning_rate": 3.7206211402660415e-05, + "loss": 0.0175, + "num_input_tokens_seen": 15495248, + "step": 73425 + }, + { + "epoch": 8.078107810781079, + "grad_norm": 0.25307393074035645, + "learning_rate": 3.7204116796093865e-05, + "loss": 0.0111, + "num_input_tokens_seen": 15496272, + "step": 73430 + }, + { + "epoch": 8.078657865786578, + "grad_norm": 0.11597650498151779, + "learning_rate": 3.7202022077047825e-05, + "loss": 0.0634, + "num_input_tokens_seen": 15497328, + "step": 73435 + }, + { + "epoch": 8.07920792079208, + "grad_norm": 0.042075350880622864, + "learning_rate": 3.719992724554161e-05, + "loss": 0.0467, + "num_input_tokens_seen": 15498416, + "step": 73440 + }, + { + "epoch": 8.07975797579758, + "grad_norm": 0.3715938925743103, + "learning_rate": 3.7197832301594516e-05, + "loss": 0.0491, + "num_input_tokens_seen": 15499536, + "step": 73445 + }, + { + "epoch": 8.08030803080308, + "grad_norm": 0.33658647537231445, + "learning_rate": 3.719573724522585e-05, + "loss": 0.029, + "num_input_tokens_seen": 15500592, + "step": 73450 + }, + { + "epoch": 8.08085808580858, + "grad_norm": 0.08277798444032669, + "learning_rate": 3.719364207645493e-05, + "loss": 0.1114, + "num_input_tokens_seen": 15501616, + "step": 73455 + }, + { + "epoch": 8.081408140814082, + "grad_norm": 0.39440393447875977, + "learning_rate": 3.7191546795301064e-05, + "loss": 0.0636, + "num_input_tokens_seen": 15502608, + "step": 73460 + }, + { + "epoch": 8.081958195819581, + "grad_norm": 0.8076744079589844, + "learning_rate": 3.718945140178356e-05, + "loss": 0.0285, + "num_input_tokens_seen": 15503728, + "step": 73465 + }, + { + "epoch": 8.082508250825082, + "grad_norm": 0.007783152163028717, + "learning_rate": 3.718735589592173e-05, + "loss": 0.0017, + "num_input_tokens_seen": 15504848, + "step": 73470 + }, + { + "epoch": 8.083058305830583, + "grad_norm": 0.15212595462799072, + "learning_rate": 3.718526027773489e-05, + "loss": 0.0385, + "num_input_tokens_seen": 15505872, + "step": 73475 + }, + { + "epoch": 8.083608360836084, + "grad_norm": 1.2336339950561523, + "learning_rate": 3.7183164547242354e-05, + "loss": 0.0996, + "num_input_tokens_seen": 15506928, + "step": 73480 + }, + { + "epoch": 8.084158415841584, + "grad_norm": 0.010569491423666477, + "learning_rate": 3.7181068704463426e-05, + "loss": 0.0779, + "num_input_tokens_seen": 15507920, + "step": 73485 + }, + { + "epoch": 8.084708470847085, + "grad_norm": 0.16263467073440552, + "learning_rate": 3.717897274941744e-05, + "loss": 0.0309, + "num_input_tokens_seen": 15509104, + "step": 73490 + }, + { + "epoch": 8.085258525852586, + "grad_norm": 0.029335783794522285, + "learning_rate": 3.717687668212372e-05, + "loss": 0.0384, + "num_input_tokens_seen": 15510096, + "step": 73495 + }, + { + "epoch": 8.085808580858085, + "grad_norm": 0.3516712188720703, + "learning_rate": 3.7174780502601555e-05, + "loss": 0.0091, + "num_input_tokens_seen": 15511152, + "step": 73500 + }, + { + "epoch": 8.086358635863586, + "grad_norm": 0.32569801807403564, + "learning_rate": 3.7172684210870276e-05, + "loss": 0.0677, + "num_input_tokens_seen": 15512144, + "step": 73505 + }, + { + "epoch": 8.086908690869087, + "grad_norm": 0.07100433856248856, + "learning_rate": 3.717058780694922e-05, + "loss": 0.0132, + "num_input_tokens_seen": 15513200, + "step": 73510 + }, + { + "epoch": 8.087458745874587, + "grad_norm": 0.336647093296051, + "learning_rate": 3.716849129085769e-05, + "loss": 0.083, + "num_input_tokens_seen": 15514288, + "step": 73515 + }, + { + "epoch": 8.088008800880088, + "grad_norm": 1.8144593238830566, + "learning_rate": 3.7166394662615024e-05, + "loss": 0.1209, + "num_input_tokens_seen": 15515312, + "step": 73520 + }, + { + "epoch": 8.088558855885589, + "grad_norm": 0.501102864742279, + "learning_rate": 3.7164297922240526e-05, + "loss": 0.0912, + "num_input_tokens_seen": 15516432, + "step": 73525 + }, + { + "epoch": 8.089108910891088, + "grad_norm": 0.21417942643165588, + "learning_rate": 3.7162201069753527e-05, + "loss": 0.056, + "num_input_tokens_seen": 15517488, + "step": 73530 + }, + { + "epoch": 8.08965896589659, + "grad_norm": 0.06756890565156937, + "learning_rate": 3.7160104105173365e-05, + "loss": 0.0698, + "num_input_tokens_seen": 15518544, + "step": 73535 + }, + { + "epoch": 8.09020902090209, + "grad_norm": 0.1262391060590744, + "learning_rate": 3.715800702851937e-05, + "loss": 0.0073, + "num_input_tokens_seen": 15519632, + "step": 73540 + }, + { + "epoch": 8.090759075907592, + "grad_norm": 0.05142151191830635, + "learning_rate": 3.715590983981083e-05, + "loss": 0.0807, + "num_input_tokens_seen": 15520624, + "step": 73545 + }, + { + "epoch": 8.091309130913091, + "grad_norm": 0.3222294747829437, + "learning_rate": 3.7153812539067125e-05, + "loss": 0.0652, + "num_input_tokens_seen": 15521680, + "step": 73550 + }, + { + "epoch": 8.091859185918592, + "grad_norm": 0.3421270549297333, + "learning_rate": 3.7151715126307554e-05, + "loss": 0.0114, + "num_input_tokens_seen": 15522704, + "step": 73555 + }, + { + "epoch": 8.092409240924093, + "grad_norm": 0.00733561534434557, + "learning_rate": 3.714961760155144e-05, + "loss": 0.036, + "num_input_tokens_seen": 15523760, + "step": 73560 + }, + { + "epoch": 8.092959295929592, + "grad_norm": 0.008883650414645672, + "learning_rate": 3.714751996481816e-05, + "loss": 0.0378, + "num_input_tokens_seen": 15524816, + "step": 73565 + }, + { + "epoch": 8.093509350935093, + "grad_norm": 0.6309683322906494, + "learning_rate": 3.7145422216127e-05, + "loss": 0.0288, + "num_input_tokens_seen": 15525776, + "step": 73570 + }, + { + "epoch": 8.094059405940595, + "grad_norm": 0.01836395263671875, + "learning_rate": 3.714332435549731e-05, + "loss": 0.0025, + "num_input_tokens_seen": 15526832, + "step": 73575 + }, + { + "epoch": 8.094609460946094, + "grad_norm": 0.1972537338733673, + "learning_rate": 3.7141226382948434e-05, + "loss": 0.0045, + "num_input_tokens_seen": 15527920, + "step": 73580 + }, + { + "epoch": 8.095159515951595, + "grad_norm": 0.05160210281610489, + "learning_rate": 3.7139128298499704e-05, + "loss": 0.0161, + "num_input_tokens_seen": 15528976, + "step": 73585 + }, + { + "epoch": 8.095709570957096, + "grad_norm": 0.04406984522938728, + "learning_rate": 3.7137030102170445e-05, + "loss": 0.0023, + "num_input_tokens_seen": 15529968, + "step": 73590 + }, + { + "epoch": 8.096259625962595, + "grad_norm": 1.8741974830627441, + "learning_rate": 3.713493179398e-05, + "loss": 0.0215, + "num_input_tokens_seen": 15531056, + "step": 73595 + }, + { + "epoch": 8.096809680968097, + "grad_norm": 0.018603339791297913, + "learning_rate": 3.713283337394772e-05, + "loss": 0.029, + "num_input_tokens_seen": 15532176, + "step": 73600 + }, + { + "epoch": 8.097359735973598, + "grad_norm": 0.12546710669994354, + "learning_rate": 3.713073484209293e-05, + "loss": 0.0076, + "num_input_tokens_seen": 15533200, + "step": 73605 + }, + { + "epoch": 8.097909790979099, + "grad_norm": 0.48759737610816956, + "learning_rate": 3.7128636198434985e-05, + "loss": 0.019, + "num_input_tokens_seen": 15534256, + "step": 73610 + }, + { + "epoch": 8.098459845984598, + "grad_norm": 0.06949391961097717, + "learning_rate": 3.7126537442993216e-05, + "loss": 0.012, + "num_input_tokens_seen": 15535344, + "step": 73615 + }, + { + "epoch": 8.099009900990099, + "grad_norm": 0.7772673964500427, + "learning_rate": 3.712443857578697e-05, + "loss": 0.0849, + "num_input_tokens_seen": 15536400, + "step": 73620 + }, + { + "epoch": 8.0995599559956, + "grad_norm": 0.005760237108916044, + "learning_rate": 3.712233959683559e-05, + "loss": 0.0079, + "num_input_tokens_seen": 15537488, + "step": 73625 + }, + { + "epoch": 8.1001100110011, + "grad_norm": 0.004542400129139423, + "learning_rate": 3.712024050615843e-05, + "loss": 0.0248, + "num_input_tokens_seen": 15538544, + "step": 73630 + }, + { + "epoch": 8.1006600660066, + "grad_norm": 0.08375348150730133, + "learning_rate": 3.7118141303774826e-05, + "loss": 0.0161, + "num_input_tokens_seen": 15539600, + "step": 73635 + }, + { + "epoch": 8.101210121012102, + "grad_norm": 0.004698372446000576, + "learning_rate": 3.711604198970413e-05, + "loss": 0.0072, + "num_input_tokens_seen": 15540592, + "step": 73640 + }, + { + "epoch": 8.101760176017601, + "grad_norm": 0.04066178575158119, + "learning_rate": 3.7113942563965675e-05, + "loss": 0.0038, + "num_input_tokens_seen": 15541712, + "step": 73645 + }, + { + "epoch": 8.102310231023102, + "grad_norm": 0.008717506192624569, + "learning_rate": 3.7111843026578834e-05, + "loss": 0.05, + "num_input_tokens_seen": 15542736, + "step": 73650 + }, + { + "epoch": 8.102860286028603, + "grad_norm": 0.24749970436096191, + "learning_rate": 3.710974337756295e-05, + "loss": 0.0215, + "num_input_tokens_seen": 15543824, + "step": 73655 + }, + { + "epoch": 8.103410341034103, + "grad_norm": 0.02669423259794712, + "learning_rate": 3.710764361693737e-05, + "loss": 0.0895, + "num_input_tokens_seen": 15544848, + "step": 73660 + }, + { + "epoch": 8.103960396039604, + "grad_norm": 0.013776392675936222, + "learning_rate": 3.710554374472145e-05, + "loss": 0.0332, + "num_input_tokens_seen": 15545904, + "step": 73665 + }, + { + "epoch": 8.104510451045105, + "grad_norm": 0.23190191388130188, + "learning_rate": 3.710344376093454e-05, + "loss": 0.0451, + "num_input_tokens_seen": 15546960, + "step": 73670 + }, + { + "epoch": 8.105060506050606, + "grad_norm": 0.1109444871544838, + "learning_rate": 3.7101343665596e-05, + "loss": 0.1335, + "num_input_tokens_seen": 15548016, + "step": 73675 + }, + { + "epoch": 8.105610561056105, + "grad_norm": 0.030492326244711876, + "learning_rate": 3.7099243458725183e-05, + "loss": 0.0047, + "num_input_tokens_seen": 15549104, + "step": 73680 + }, + { + "epoch": 8.106160616061606, + "grad_norm": 0.8062623739242554, + "learning_rate": 3.7097143140341445e-05, + "loss": 0.0872, + "num_input_tokens_seen": 15550192, + "step": 73685 + }, + { + "epoch": 8.106710671067107, + "grad_norm": 0.011008432134985924, + "learning_rate": 3.7095042710464136e-05, + "loss": 0.0081, + "num_input_tokens_seen": 15551248, + "step": 73690 + }, + { + "epoch": 8.107260726072607, + "grad_norm": 0.2797590494155884, + "learning_rate": 3.709294216911263e-05, + "loss": 0.0605, + "num_input_tokens_seen": 15552336, + "step": 73695 + }, + { + "epoch": 8.107810781078108, + "grad_norm": 0.9684202671051025, + "learning_rate": 3.7090841516306276e-05, + "loss": 0.0187, + "num_input_tokens_seen": 15553296, + "step": 73700 + }, + { + "epoch": 8.108360836083609, + "grad_norm": 1.9254134893417358, + "learning_rate": 3.7088740752064425e-05, + "loss": 0.0606, + "num_input_tokens_seen": 15554320, + "step": 73705 + }, + { + "epoch": 8.108910891089108, + "grad_norm": 0.0181562602519989, + "learning_rate": 3.708663987640647e-05, + "loss": 0.0203, + "num_input_tokens_seen": 15555376, + "step": 73710 + }, + { + "epoch": 8.10946094609461, + "grad_norm": 0.02082211710512638, + "learning_rate": 3.708453888935175e-05, + "loss": 0.0129, + "num_input_tokens_seen": 15556432, + "step": 73715 + }, + { + "epoch": 8.11001100110011, + "grad_norm": 0.2459225356578827, + "learning_rate": 3.708243779091963e-05, + "loss": 0.0103, + "num_input_tokens_seen": 15557456, + "step": 73720 + }, + { + "epoch": 8.110561056105611, + "grad_norm": 0.0038887697737663984, + "learning_rate": 3.7080336581129474e-05, + "loss": 0.0063, + "num_input_tokens_seen": 15558608, + "step": 73725 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 0.056628722697496414, + "learning_rate": 3.707823526000066e-05, + "loss": 0.0557, + "num_input_tokens_seen": 15559728, + "step": 73730 + }, + { + "epoch": 8.111661166116612, + "grad_norm": 0.20586253702640533, + "learning_rate": 3.707613382755255e-05, + "loss": 0.0078, + "num_input_tokens_seen": 15560848, + "step": 73735 + }, + { + "epoch": 8.112211221122113, + "grad_norm": 0.5007489919662476, + "learning_rate": 3.7074032283804507e-05, + "loss": 0.0555, + "num_input_tokens_seen": 15561904, + "step": 73740 + }, + { + "epoch": 8.112761276127612, + "grad_norm": 0.00992837455123663, + "learning_rate": 3.7071930628775896e-05, + "loss": 0.0241, + "num_input_tokens_seen": 15562928, + "step": 73745 + }, + { + "epoch": 8.113311331133113, + "grad_norm": 0.024664759635925293, + "learning_rate": 3.706982886248609e-05, + "loss": 0.0132, + "num_input_tokens_seen": 15564016, + "step": 73750 + }, + { + "epoch": 8.113861386138614, + "grad_norm": 0.012716573663055897, + "learning_rate": 3.7067726984954484e-05, + "loss": 0.0118, + "num_input_tokens_seen": 15565072, + "step": 73755 + }, + { + "epoch": 8.114411441144114, + "grad_norm": 2.0940139293670654, + "learning_rate": 3.706562499620041e-05, + "loss": 0.1196, + "num_input_tokens_seen": 15566128, + "step": 73760 + }, + { + "epoch": 8.114961496149615, + "grad_norm": 0.10417941957712173, + "learning_rate": 3.7063522896243266e-05, + "loss": 0.1848, + "num_input_tokens_seen": 15567184, + "step": 73765 + }, + { + "epoch": 8.115511551155116, + "grad_norm": 0.006549136247485876, + "learning_rate": 3.706142068510242e-05, + "loss": 0.0182, + "num_input_tokens_seen": 15568240, + "step": 73770 + }, + { + "epoch": 8.116061606160615, + "grad_norm": 0.07372420281171799, + "learning_rate": 3.7059318362797244e-05, + "loss": 0.0062, + "num_input_tokens_seen": 15569264, + "step": 73775 + }, + { + "epoch": 8.116611661166116, + "grad_norm": 0.05767470598220825, + "learning_rate": 3.7057215929347124e-05, + "loss": 0.0154, + "num_input_tokens_seen": 15570384, + "step": 73780 + }, + { + "epoch": 8.117161716171617, + "grad_norm": 1.9668443202972412, + "learning_rate": 3.705511338477143e-05, + "loss": 0.05, + "num_input_tokens_seen": 15571472, + "step": 73785 + }, + { + "epoch": 8.117711771177119, + "grad_norm": 0.02758360654115677, + "learning_rate": 3.705301072908953e-05, + "loss": 0.0165, + "num_input_tokens_seen": 15572528, + "step": 73790 + }, + { + "epoch": 8.118261826182618, + "grad_norm": 0.019050680100917816, + "learning_rate": 3.705090796232083e-05, + "loss": 0.0326, + "num_input_tokens_seen": 15573648, + "step": 73795 + }, + { + "epoch": 8.118811881188119, + "grad_norm": 0.01694655604660511, + "learning_rate": 3.7048805084484676e-05, + "loss": 0.1962, + "num_input_tokens_seen": 15574768, + "step": 73800 + }, + { + "epoch": 8.11936193619362, + "grad_norm": 0.030385997146368027, + "learning_rate": 3.704670209560048e-05, + "loss": 0.032, + "num_input_tokens_seen": 15575792, + "step": 73805 + }, + { + "epoch": 8.11991199119912, + "grad_norm": 0.012913047336041927, + "learning_rate": 3.7044598995687604e-05, + "loss": 0.0112, + "num_input_tokens_seen": 15576816, + "step": 73810 + }, + { + "epoch": 8.12046204620462, + "grad_norm": 0.23278400301933289, + "learning_rate": 3.7042495784765444e-05, + "loss": 0.0364, + "num_input_tokens_seen": 15577840, + "step": 73815 + }, + { + "epoch": 8.121012101210122, + "grad_norm": 0.11059972643852234, + "learning_rate": 3.704039246285337e-05, + "loss": 0.0211, + "num_input_tokens_seen": 15578832, + "step": 73820 + }, + { + "epoch": 8.12156215621562, + "grad_norm": 0.03922733664512634, + "learning_rate": 3.703828902997079e-05, + "loss": 0.0067, + "num_input_tokens_seen": 15579856, + "step": 73825 + }, + { + "epoch": 8.122112211221122, + "grad_norm": 0.6222148537635803, + "learning_rate": 3.703618548613707e-05, + "loss": 0.0221, + "num_input_tokens_seen": 15580976, + "step": 73830 + }, + { + "epoch": 8.122662266226623, + "grad_norm": 0.011105261743068695, + "learning_rate": 3.70340818313716e-05, + "loss": 0.0752, + "num_input_tokens_seen": 15582000, + "step": 73835 + }, + { + "epoch": 8.123212321232122, + "grad_norm": 0.23493355512619019, + "learning_rate": 3.7031978065693775e-05, + "loss": 0.1273, + "num_input_tokens_seen": 15583088, + "step": 73840 + }, + { + "epoch": 8.123762376237623, + "grad_norm": 0.06344293057918549, + "learning_rate": 3.702987418912298e-05, + "loss": 0.1109, + "num_input_tokens_seen": 15584144, + "step": 73845 + }, + { + "epoch": 8.124312431243125, + "grad_norm": 0.007761238608509302, + "learning_rate": 3.7027770201678615e-05, + "loss": 0.0536, + "num_input_tokens_seen": 15585200, + "step": 73850 + }, + { + "epoch": 8.124862486248626, + "grad_norm": 1.5311609506607056, + "learning_rate": 3.702566610338006e-05, + "loss": 0.0337, + "num_input_tokens_seen": 15586192, + "step": 73855 + }, + { + "epoch": 8.125412541254125, + "grad_norm": 0.0658550038933754, + "learning_rate": 3.702356189424671e-05, + "loss": 0.0683, + "num_input_tokens_seen": 15587280, + "step": 73860 + }, + { + "epoch": 8.125962596259626, + "grad_norm": 0.04485737159848213, + "learning_rate": 3.7021457574297954e-05, + "loss": 0.0047, + "num_input_tokens_seen": 15588368, + "step": 73865 + }, + { + "epoch": 8.126512651265127, + "grad_norm": 0.01698409952223301, + "learning_rate": 3.701935314355319e-05, + "loss": 0.0201, + "num_input_tokens_seen": 15589392, + "step": 73870 + }, + { + "epoch": 8.127062706270626, + "grad_norm": 0.019069818779826164, + "learning_rate": 3.701724860203183e-05, + "loss": 0.0085, + "num_input_tokens_seen": 15590384, + "step": 73875 + }, + { + "epoch": 8.127612761276128, + "grad_norm": 0.489707350730896, + "learning_rate": 3.7015143949753244e-05, + "loss": 0.0123, + "num_input_tokens_seen": 15591408, + "step": 73880 + }, + { + "epoch": 8.128162816281629, + "grad_norm": 0.05230378359556198, + "learning_rate": 3.701303918673685e-05, + "loss": 0.0091, + "num_input_tokens_seen": 15592496, + "step": 73885 + }, + { + "epoch": 8.128712871287128, + "grad_norm": 0.4868986904621124, + "learning_rate": 3.701093431300203e-05, + "loss": 0.0951, + "num_input_tokens_seen": 15593648, + "step": 73890 + }, + { + "epoch": 8.129262926292629, + "grad_norm": 0.043772656470537186, + "learning_rate": 3.7008829328568197e-05, + "loss": 0.0103, + "num_input_tokens_seen": 15594704, + "step": 73895 + }, + { + "epoch": 8.12981298129813, + "grad_norm": 0.02205156534910202, + "learning_rate": 3.700672423345475e-05, + "loss": 0.028, + "num_input_tokens_seen": 15595696, + "step": 73900 + }, + { + "epoch": 8.130363036303631, + "grad_norm": 1.5460489988327026, + "learning_rate": 3.7004619027681084e-05, + "loss": 0.124, + "num_input_tokens_seen": 15596784, + "step": 73905 + }, + { + "epoch": 8.13091309130913, + "grad_norm": 0.706322431564331, + "learning_rate": 3.7002513711266596e-05, + "loss": 0.0177, + "num_input_tokens_seen": 15597936, + "step": 73910 + }, + { + "epoch": 8.131463146314632, + "grad_norm": 0.015483880415558815, + "learning_rate": 3.70004082842307e-05, + "loss": 0.0073, + "num_input_tokens_seen": 15598992, + "step": 73915 + }, + { + "epoch": 8.132013201320133, + "grad_norm": 0.4265196621417999, + "learning_rate": 3.699830274659281e-05, + "loss": 0.0145, + "num_input_tokens_seen": 15600176, + "step": 73920 + }, + { + "epoch": 8.132563256325632, + "grad_norm": 0.2355484664440155, + "learning_rate": 3.6996197098372314e-05, + "loss": 0.022, + "num_input_tokens_seen": 15601232, + "step": 73925 + }, + { + "epoch": 8.133113311331133, + "grad_norm": 0.08969924598932266, + "learning_rate": 3.6994091339588635e-05, + "loss": 0.0465, + "num_input_tokens_seen": 15602320, + "step": 73930 + }, + { + "epoch": 8.133663366336634, + "grad_norm": 0.009351033717393875, + "learning_rate": 3.699198547026116e-05, + "loss": 0.0159, + "num_input_tokens_seen": 15603344, + "step": 73935 + }, + { + "epoch": 8.134213421342134, + "grad_norm": 0.07483780384063721, + "learning_rate": 3.698987949040931e-05, + "loss": 0.0515, + "num_input_tokens_seen": 15604368, + "step": 73940 + }, + { + "epoch": 8.134763476347635, + "grad_norm": 0.38334891200065613, + "learning_rate": 3.69877734000525e-05, + "loss": 0.093, + "num_input_tokens_seen": 15605392, + "step": 73945 + }, + { + "epoch": 8.135313531353136, + "grad_norm": 0.7558475136756897, + "learning_rate": 3.6985667199210125e-05, + "loss": 0.015, + "num_input_tokens_seen": 15606384, + "step": 73950 + }, + { + "epoch": 8.135863586358635, + "grad_norm": 0.043623678386211395, + "learning_rate": 3.6983560887901616e-05, + "loss": 0.0063, + "num_input_tokens_seen": 15607408, + "step": 73955 + }, + { + "epoch": 8.136413641364136, + "grad_norm": 0.06601264327764511, + "learning_rate": 3.6981454466146374e-05, + "loss": 0.0494, + "num_input_tokens_seen": 15608432, + "step": 73960 + }, + { + "epoch": 8.136963696369637, + "grad_norm": 0.230444997549057, + "learning_rate": 3.697934793396382e-05, + "loss": 0.0114, + "num_input_tokens_seen": 15609520, + "step": 73965 + }, + { + "epoch": 8.137513751375138, + "grad_norm": 0.05369904264807701, + "learning_rate": 3.697724129137336e-05, + "loss": 0.0173, + "num_input_tokens_seen": 15610640, + "step": 73970 + }, + { + "epoch": 8.138063806380638, + "grad_norm": 0.016922663897275925, + "learning_rate": 3.697513453839442e-05, + "loss": 0.0492, + "num_input_tokens_seen": 15611664, + "step": 73975 + }, + { + "epoch": 8.138613861386139, + "grad_norm": 0.06522594392299652, + "learning_rate": 3.69730276750464e-05, + "loss": 0.0045, + "num_input_tokens_seen": 15612688, + "step": 73980 + }, + { + "epoch": 8.13916391639164, + "grad_norm": 0.058893248438835144, + "learning_rate": 3.697092070134873e-05, + "loss": 0.0031, + "num_input_tokens_seen": 15613680, + "step": 73985 + }, + { + "epoch": 8.13971397139714, + "grad_norm": 0.11198914796113968, + "learning_rate": 3.696881361732084e-05, + "loss": 0.0103, + "num_input_tokens_seen": 15614800, + "step": 73990 + }, + { + "epoch": 8.14026402640264, + "grad_norm": 0.007100628688931465, + "learning_rate": 3.696670642298213e-05, + "loss": 0.004, + "num_input_tokens_seen": 15615856, + "step": 73995 + }, + { + "epoch": 8.140814081408141, + "grad_norm": 1.0504746437072754, + "learning_rate": 3.696459911835203e-05, + "loss": 0.0842, + "num_input_tokens_seen": 15616912, + "step": 74000 + }, + { + "epoch": 8.14136413641364, + "grad_norm": 0.08883269876241684, + "learning_rate": 3.696249170344996e-05, + "loss": 0.0606, + "num_input_tokens_seen": 15617936, + "step": 74005 + }, + { + "epoch": 8.141914191419142, + "grad_norm": 0.016058778390288353, + "learning_rate": 3.6960384178295335e-05, + "loss": 0.0568, + "num_input_tokens_seen": 15618928, + "step": 74010 + }, + { + "epoch": 8.142464246424643, + "grad_norm": 0.1596541851758957, + "learning_rate": 3.695827654290761e-05, + "loss": 0.0509, + "num_input_tokens_seen": 15619920, + "step": 74015 + }, + { + "epoch": 8.143014301430142, + "grad_norm": 0.007620318792760372, + "learning_rate": 3.695616879730617e-05, + "loss": 0.0022, + "num_input_tokens_seen": 15620944, + "step": 74020 + }, + { + "epoch": 8.143564356435643, + "grad_norm": 0.13465118408203125, + "learning_rate": 3.695406094151046e-05, + "loss": 0.0189, + "num_input_tokens_seen": 15622032, + "step": 74025 + }, + { + "epoch": 8.144114411441144, + "grad_norm": 0.0163656547665596, + "learning_rate": 3.695195297553992e-05, + "loss": 0.0389, + "num_input_tokens_seen": 15623024, + "step": 74030 + }, + { + "epoch": 8.144664466446645, + "grad_norm": 0.14812670648097992, + "learning_rate": 3.6949844899413956e-05, + "loss": 0.0089, + "num_input_tokens_seen": 15624112, + "step": 74035 + }, + { + "epoch": 8.145214521452145, + "grad_norm": 0.1399683654308319, + "learning_rate": 3.694773671315201e-05, + "loss": 0.0933, + "num_input_tokens_seen": 15625168, + "step": 74040 + }, + { + "epoch": 8.145764576457646, + "grad_norm": 0.0327458456158638, + "learning_rate": 3.6945628416773506e-05, + "loss": 0.075, + "num_input_tokens_seen": 15626192, + "step": 74045 + }, + { + "epoch": 8.146314631463147, + "grad_norm": 0.016458960250020027, + "learning_rate": 3.694352001029787e-05, + "loss": 0.0673, + "num_input_tokens_seen": 15627312, + "step": 74050 + }, + { + "epoch": 8.146864686468646, + "grad_norm": 0.3921847641468048, + "learning_rate": 3.6941411493744546e-05, + "loss": 0.0187, + "num_input_tokens_seen": 15628368, + "step": 74055 + }, + { + "epoch": 8.147414741474147, + "grad_norm": 0.6758432388305664, + "learning_rate": 3.6939302867132964e-05, + "loss": 0.0891, + "num_input_tokens_seen": 15629392, + "step": 74060 + }, + { + "epoch": 8.147964796479648, + "grad_norm": 0.005973714403808117, + "learning_rate": 3.693719413048255e-05, + "loss": 0.0045, + "num_input_tokens_seen": 15630448, + "step": 74065 + }, + { + "epoch": 8.148514851485148, + "grad_norm": 0.05991736799478531, + "learning_rate": 3.693508528381275e-05, + "loss": 0.0592, + "num_input_tokens_seen": 15631472, + "step": 74070 + }, + { + "epoch": 8.149064906490649, + "grad_norm": 0.03338190168142319, + "learning_rate": 3.6932976327143004e-05, + "loss": 0.0715, + "num_input_tokens_seen": 15632560, + "step": 74075 + }, + { + "epoch": 8.14961496149615, + "grad_norm": 0.12439294159412384, + "learning_rate": 3.693086726049274e-05, + "loss": 0.009, + "num_input_tokens_seen": 15633648, + "step": 74080 + }, + { + "epoch": 8.150165016501651, + "grad_norm": 0.06642382591962814, + "learning_rate": 3.6928758083881384e-05, + "loss": 0.0082, + "num_input_tokens_seen": 15634640, + "step": 74085 + }, + { + "epoch": 8.15071507150715, + "grad_norm": 1.1038521528244019, + "learning_rate": 3.69266487973284e-05, + "loss": 0.078, + "num_input_tokens_seen": 15635664, + "step": 74090 + }, + { + "epoch": 8.151265126512651, + "grad_norm": 0.011810550466179848, + "learning_rate": 3.692453940085321e-05, + "loss": 0.0271, + "num_input_tokens_seen": 15636784, + "step": 74095 + }, + { + "epoch": 8.151815181518153, + "grad_norm": 0.5767261385917664, + "learning_rate": 3.6922429894475266e-05, + "loss": 0.0348, + "num_input_tokens_seen": 15637808, + "step": 74100 + }, + { + "epoch": 8.152365236523652, + "grad_norm": 0.022600330412387848, + "learning_rate": 3.6920320278214e-05, + "loss": 0.0406, + "num_input_tokens_seen": 15638896, + "step": 74105 + }, + { + "epoch": 8.152915291529153, + "grad_norm": 0.026202430948615074, + "learning_rate": 3.6918210552088874e-05, + "loss": 0.0725, + "num_input_tokens_seen": 15639920, + "step": 74110 + }, + { + "epoch": 8.153465346534654, + "grad_norm": 0.019662722945213318, + "learning_rate": 3.691610071611931e-05, + "loss": 0.1095, + "num_input_tokens_seen": 15641040, + "step": 74115 + }, + { + "epoch": 8.154015401540153, + "grad_norm": 0.20147117972373962, + "learning_rate": 3.691399077032477e-05, + "loss": 0.0386, + "num_input_tokens_seen": 15642096, + "step": 74120 + }, + { + "epoch": 8.154565456545654, + "grad_norm": 1.0254515409469604, + "learning_rate": 3.6911880714724687e-05, + "loss": 0.0337, + "num_input_tokens_seen": 15643152, + "step": 74125 + }, + { + "epoch": 8.155115511551156, + "grad_norm": 0.10414142161607742, + "learning_rate": 3.6909770549338524e-05, + "loss": 0.0706, + "num_input_tokens_seen": 15644208, + "step": 74130 + }, + { + "epoch": 8.155665566556655, + "grad_norm": 0.011307768523693085, + "learning_rate": 3.690766027418573e-05, + "loss": 0.009, + "num_input_tokens_seen": 15645232, + "step": 74135 + }, + { + "epoch": 8.156215621562156, + "grad_norm": 0.05454113706946373, + "learning_rate": 3.690554988928572e-05, + "loss": 0.0084, + "num_input_tokens_seen": 15646288, + "step": 74140 + }, + { + "epoch": 8.156765676567657, + "grad_norm": 0.14452104270458221, + "learning_rate": 3.6903439394657976e-05, + "loss": 0.0118, + "num_input_tokens_seen": 15647280, + "step": 74145 + }, + { + "epoch": 8.157315731573158, + "grad_norm": 0.4011310040950775, + "learning_rate": 3.690132879032195e-05, + "loss": 0.0294, + "num_input_tokens_seen": 15648336, + "step": 74150 + }, + { + "epoch": 8.157865786578657, + "grad_norm": 0.20864525437355042, + "learning_rate": 3.6899218076297086e-05, + "loss": 0.0166, + "num_input_tokens_seen": 15649392, + "step": 74155 + }, + { + "epoch": 8.158415841584159, + "grad_norm": 0.01838645339012146, + "learning_rate": 3.6897107252602845e-05, + "loss": 0.0183, + "num_input_tokens_seen": 15650416, + "step": 74160 + }, + { + "epoch": 8.15896589658966, + "grad_norm": 0.8995175957679749, + "learning_rate": 3.6894996319258665e-05, + "loss": 0.0476, + "num_input_tokens_seen": 15651408, + "step": 74165 + }, + { + "epoch": 8.159515951595159, + "grad_norm": 0.05061948671936989, + "learning_rate": 3.689288527628402e-05, + "loss": 0.0057, + "num_input_tokens_seen": 15652464, + "step": 74170 + }, + { + "epoch": 8.16006600660066, + "grad_norm": 0.16389478743076324, + "learning_rate": 3.6890774123698345e-05, + "loss": 0.0606, + "num_input_tokens_seen": 15653488, + "step": 74175 + }, + { + "epoch": 8.160616061606161, + "grad_norm": 0.015467904508113861, + "learning_rate": 3.688866286152112e-05, + "loss": 0.0207, + "num_input_tokens_seen": 15654608, + "step": 74180 + }, + { + "epoch": 8.16116611661166, + "grad_norm": 0.01703859679400921, + "learning_rate": 3.68865514897718e-05, + "loss": 0.0032, + "num_input_tokens_seen": 15655696, + "step": 74185 + }, + { + "epoch": 8.161716171617162, + "grad_norm": 1.9134610891342163, + "learning_rate": 3.688444000846983e-05, + "loss": 0.2349, + "num_input_tokens_seen": 15656784, + "step": 74190 + }, + { + "epoch": 8.162266226622663, + "grad_norm": 0.022920571267604828, + "learning_rate": 3.6882328417634684e-05, + "loss": 0.0146, + "num_input_tokens_seen": 15657776, + "step": 74195 + }, + { + "epoch": 8.162816281628162, + "grad_norm": 0.45731231570243835, + "learning_rate": 3.6880216717285806e-05, + "loss": 0.0212, + "num_input_tokens_seen": 15658832, + "step": 74200 + }, + { + "epoch": 8.163366336633663, + "grad_norm": 0.01257329061627388, + "learning_rate": 3.687810490744269e-05, + "loss": 0.0023, + "num_input_tokens_seen": 15659888, + "step": 74205 + }, + { + "epoch": 8.163916391639164, + "grad_norm": 0.16635160148143768, + "learning_rate": 3.687599298812477e-05, + "loss": 0.0442, + "num_input_tokens_seen": 15660976, + "step": 74210 + }, + { + "epoch": 8.164466446644665, + "grad_norm": 0.05795406922698021, + "learning_rate": 3.687388095935152e-05, + "loss": 0.0135, + "num_input_tokens_seen": 15662064, + "step": 74215 + }, + { + "epoch": 8.165016501650165, + "grad_norm": 0.7910827994346619, + "learning_rate": 3.687176882114241e-05, + "loss": 0.1277, + "num_input_tokens_seen": 15663152, + "step": 74220 + }, + { + "epoch": 8.165566556655666, + "grad_norm": 0.49916282296180725, + "learning_rate": 3.686965657351691e-05, + "loss": 0.0064, + "num_input_tokens_seen": 15664240, + "step": 74225 + }, + { + "epoch": 8.166116611661167, + "grad_norm": 0.01555624045431614, + "learning_rate": 3.686754421649447e-05, + "loss": 0.0106, + "num_input_tokens_seen": 15665264, + "step": 74230 + }, + { + "epoch": 8.166666666666666, + "grad_norm": 0.015366431325674057, + "learning_rate": 3.686543175009457e-05, + "loss": 0.0388, + "num_input_tokens_seen": 15666288, + "step": 74235 + }, + { + "epoch": 8.167216721672167, + "grad_norm": 0.027710532769560814, + "learning_rate": 3.686331917433669e-05, + "loss": 0.0656, + "num_input_tokens_seen": 15667376, + "step": 74240 + }, + { + "epoch": 8.167766776677668, + "grad_norm": 0.02336232177913189, + "learning_rate": 3.686120648924028e-05, + "loss": 0.0093, + "num_input_tokens_seen": 15668464, + "step": 74245 + }, + { + "epoch": 8.168316831683168, + "grad_norm": 0.07718764245510101, + "learning_rate": 3.685909369482482e-05, + "loss": 0.006, + "num_input_tokens_seen": 15669552, + "step": 74250 + }, + { + "epoch": 8.168866886688669, + "grad_norm": 0.036538753658533096, + "learning_rate": 3.6856980791109794e-05, + "loss": 0.0051, + "num_input_tokens_seen": 15670608, + "step": 74255 + }, + { + "epoch": 8.16941694169417, + "grad_norm": 0.0335022434592247, + "learning_rate": 3.6854867778114655e-05, + "loss": 0.086, + "num_input_tokens_seen": 15671696, + "step": 74260 + }, + { + "epoch": 8.16996699669967, + "grad_norm": 0.8772925734519958, + "learning_rate": 3.685275465585889e-05, + "loss": 0.0735, + "num_input_tokens_seen": 15672816, + "step": 74265 + }, + { + "epoch": 8.17051705170517, + "grad_norm": 0.09038332849740982, + "learning_rate": 3.6850641424361977e-05, + "loss": 0.0129, + "num_input_tokens_seen": 15673904, + "step": 74270 + }, + { + "epoch": 8.171067106710671, + "grad_norm": 0.0281633622944355, + "learning_rate": 3.684852808364339e-05, + "loss": 0.0252, + "num_input_tokens_seen": 15674992, + "step": 74275 + }, + { + "epoch": 8.171617161716172, + "grad_norm": 0.011091175489127636, + "learning_rate": 3.6846414633722606e-05, + "loss": 0.0235, + "num_input_tokens_seen": 15676048, + "step": 74280 + }, + { + "epoch": 8.172167216721672, + "grad_norm": 1.387433409690857, + "learning_rate": 3.6844301074619095e-05, + "loss": 0.1029, + "num_input_tokens_seen": 15677072, + "step": 74285 + }, + { + "epoch": 8.172717271727173, + "grad_norm": 0.0340321809053421, + "learning_rate": 3.684218740635235e-05, + "loss": 0.0032, + "num_input_tokens_seen": 15678160, + "step": 74290 + }, + { + "epoch": 8.173267326732674, + "grad_norm": 0.9248824715614319, + "learning_rate": 3.6840073628941844e-05, + "loss": 0.0452, + "num_input_tokens_seen": 15679216, + "step": 74295 + }, + { + "epoch": 8.173817381738173, + "grad_norm": 0.01776123233139515, + "learning_rate": 3.683795974240706e-05, + "loss": 0.0166, + "num_input_tokens_seen": 15680240, + "step": 74300 + }, + { + "epoch": 8.174367436743674, + "grad_norm": 1.6625003814697266, + "learning_rate": 3.683584574676748e-05, + "loss": 0.1178, + "num_input_tokens_seen": 15681264, + "step": 74305 + }, + { + "epoch": 8.174917491749175, + "grad_norm": 0.07240082323551178, + "learning_rate": 3.68337316420426e-05, + "loss": 0.0519, + "num_input_tokens_seen": 15682320, + "step": 74310 + }, + { + "epoch": 8.175467546754675, + "grad_norm": 0.5715442299842834, + "learning_rate": 3.683161742825188e-05, + "loss": 0.0338, + "num_input_tokens_seen": 15683344, + "step": 74315 + }, + { + "epoch": 8.176017601760176, + "grad_norm": 0.03464967757463455, + "learning_rate": 3.682950310541483e-05, + "loss": 0.0565, + "num_input_tokens_seen": 15684336, + "step": 74320 + }, + { + "epoch": 8.176567656765677, + "grad_norm": 0.02425735630095005, + "learning_rate": 3.682738867355092e-05, + "loss": 0.002, + "num_input_tokens_seen": 15685360, + "step": 74325 + }, + { + "epoch": 8.177117711771178, + "grad_norm": 0.04562709480524063, + "learning_rate": 3.6825274132679644e-05, + "loss": 0.0068, + "num_input_tokens_seen": 15686416, + "step": 74330 + }, + { + "epoch": 8.177667766776677, + "grad_norm": 0.01780284196138382, + "learning_rate": 3.6823159482820494e-05, + "loss": 0.0234, + "num_input_tokens_seen": 15687440, + "step": 74335 + }, + { + "epoch": 8.178217821782178, + "grad_norm": 0.39539027214050293, + "learning_rate": 3.6821044723992954e-05, + "loss": 0.0098, + "num_input_tokens_seen": 15688464, + "step": 74340 + }, + { + "epoch": 8.17876787678768, + "grad_norm": 0.014271111227571964, + "learning_rate": 3.681892985621652e-05, + "loss": 0.005, + "num_input_tokens_seen": 15689488, + "step": 74345 + }, + { + "epoch": 8.179317931793179, + "grad_norm": 0.3609551787376404, + "learning_rate": 3.681681487951068e-05, + "loss": 0.0194, + "num_input_tokens_seen": 15690512, + "step": 74350 + }, + { + "epoch": 8.17986798679868, + "grad_norm": 1.641831636428833, + "learning_rate": 3.681469979389493e-05, + "loss": 0.0967, + "num_input_tokens_seen": 15691536, + "step": 74355 + }, + { + "epoch": 8.180418041804181, + "grad_norm": 1.0046775341033936, + "learning_rate": 3.681258459938875e-05, + "loss": 0.0461, + "num_input_tokens_seen": 15692592, + "step": 74360 + }, + { + "epoch": 8.18096809680968, + "grad_norm": 0.12706559896469116, + "learning_rate": 3.681046929601165e-05, + "loss": 0.0191, + "num_input_tokens_seen": 15693616, + "step": 74365 + }, + { + "epoch": 8.181518151815181, + "grad_norm": 1.6852954626083374, + "learning_rate": 3.680835388378313e-05, + "loss": 0.0984, + "num_input_tokens_seen": 15694640, + "step": 74370 + }, + { + "epoch": 8.182068206820682, + "grad_norm": 1.587276816368103, + "learning_rate": 3.680623836272267e-05, + "loss": 0.0695, + "num_input_tokens_seen": 15695632, + "step": 74375 + }, + { + "epoch": 8.182618261826182, + "grad_norm": 0.031720519065856934, + "learning_rate": 3.6804122732849785e-05, + "loss": 0.0175, + "num_input_tokens_seen": 15696688, + "step": 74380 + }, + { + "epoch": 8.183168316831683, + "grad_norm": 0.029582416638731956, + "learning_rate": 3.680200699418396e-05, + "loss": 0.0473, + "num_input_tokens_seen": 15697712, + "step": 74385 + }, + { + "epoch": 8.183718371837184, + "grad_norm": 2.4341888427734375, + "learning_rate": 3.679989114674469e-05, + "loss": 0.0371, + "num_input_tokens_seen": 15698832, + "step": 74390 + }, + { + "epoch": 8.184268426842685, + "grad_norm": 0.05396770313382149, + "learning_rate": 3.67977751905515e-05, + "loss": 0.0679, + "num_input_tokens_seen": 15699856, + "step": 74395 + }, + { + "epoch": 8.184818481848184, + "grad_norm": 0.04463053494691849, + "learning_rate": 3.679565912562387e-05, + "loss": 0.0298, + "num_input_tokens_seen": 15700912, + "step": 74400 + }, + { + "epoch": 8.185368536853685, + "grad_norm": 0.021208742633461952, + "learning_rate": 3.6793542951981306e-05, + "loss": 0.0032, + "num_input_tokens_seen": 15701936, + "step": 74405 + }, + { + "epoch": 8.185918591859187, + "grad_norm": 0.39673107862472534, + "learning_rate": 3.679142666964332e-05, + "loss": 0.0271, + "num_input_tokens_seen": 15702992, + "step": 74410 + }, + { + "epoch": 8.186468646864686, + "grad_norm": 0.057817406952381134, + "learning_rate": 3.678931027862942e-05, + "loss": 0.0041, + "num_input_tokens_seen": 15703984, + "step": 74415 + }, + { + "epoch": 8.187018701870187, + "grad_norm": 0.026632843539118767, + "learning_rate": 3.67871937789591e-05, + "loss": 0.0433, + "num_input_tokens_seen": 15705040, + "step": 74420 + }, + { + "epoch": 8.187568756875688, + "grad_norm": 0.35096895694732666, + "learning_rate": 3.678507717065187e-05, + "loss": 0.0199, + "num_input_tokens_seen": 15706160, + "step": 74425 + }, + { + "epoch": 8.188118811881187, + "grad_norm": 0.7669306397438049, + "learning_rate": 3.678296045372723e-05, + "loss": 0.1374, + "num_input_tokens_seen": 15707248, + "step": 74430 + }, + { + "epoch": 8.188668866886688, + "grad_norm": 0.8859485983848572, + "learning_rate": 3.6780843628204706e-05, + "loss": 0.0852, + "num_input_tokens_seen": 15708304, + "step": 74435 + }, + { + "epoch": 8.18921892189219, + "grad_norm": 1.1074856519699097, + "learning_rate": 3.6778726694103803e-05, + "loss": 0.0322, + "num_input_tokens_seen": 15709424, + "step": 74440 + }, + { + "epoch": 8.189768976897689, + "grad_norm": 0.7897575497627258, + "learning_rate": 3.677660965144403e-05, + "loss": 0.0124, + "num_input_tokens_seen": 15710448, + "step": 74445 + }, + { + "epoch": 8.19031903190319, + "grad_norm": 0.09184560179710388, + "learning_rate": 3.677449250024488e-05, + "loss": 0.1238, + "num_input_tokens_seen": 15711504, + "step": 74450 + }, + { + "epoch": 8.190869086908691, + "grad_norm": 0.03968168422579765, + "learning_rate": 3.67723752405259e-05, + "loss": 0.0069, + "num_input_tokens_seen": 15712528, + "step": 74455 + }, + { + "epoch": 8.191419141914192, + "grad_norm": 0.15243497490882874, + "learning_rate": 3.677025787230657e-05, + "loss": 0.0989, + "num_input_tokens_seen": 15713584, + "step": 74460 + }, + { + "epoch": 8.191969196919691, + "grad_norm": 0.046043235808610916, + "learning_rate": 3.6768140395606434e-05, + "loss": 0.0049, + "num_input_tokens_seen": 15714640, + "step": 74465 + }, + { + "epoch": 8.192519251925193, + "grad_norm": 0.08374249190092087, + "learning_rate": 3.6766022810445e-05, + "loss": 0.0179, + "num_input_tokens_seen": 15715696, + "step": 74470 + }, + { + "epoch": 8.193069306930694, + "grad_norm": 0.3853212893009186, + "learning_rate": 3.676390511684177e-05, + "loss": 0.0288, + "num_input_tokens_seen": 15716688, + "step": 74475 + }, + { + "epoch": 8.193619361936193, + "grad_norm": 0.11909275501966476, + "learning_rate": 3.676178731481626e-05, + "loss": 0.013, + "num_input_tokens_seen": 15717840, + "step": 74480 + }, + { + "epoch": 8.194169416941694, + "grad_norm": 0.08356624841690063, + "learning_rate": 3.675966940438802e-05, + "loss": 0.0572, + "num_input_tokens_seen": 15718928, + "step": 74485 + }, + { + "epoch": 8.194719471947195, + "grad_norm": 0.2872917354106903, + "learning_rate": 3.675755138557654e-05, + "loss": 0.0258, + "num_input_tokens_seen": 15719952, + "step": 74490 + }, + { + "epoch": 8.195269526952695, + "grad_norm": 0.11688216775655746, + "learning_rate": 3.6755433258401354e-05, + "loss": 0.0263, + "num_input_tokens_seen": 15721040, + "step": 74495 + }, + { + "epoch": 8.195819581958196, + "grad_norm": 0.07122573256492615, + "learning_rate": 3.6753315022881986e-05, + "loss": 0.0392, + "num_input_tokens_seen": 15722096, + "step": 74500 + }, + { + "epoch": 8.196369636963697, + "grad_norm": 0.04375074803829193, + "learning_rate": 3.675119667903794e-05, + "loss": 0.0558, + "num_input_tokens_seen": 15723088, + "step": 74505 + }, + { + "epoch": 8.196919691969198, + "grad_norm": 0.02070552110671997, + "learning_rate": 3.674907822688876e-05, + "loss": 0.0321, + "num_input_tokens_seen": 15724080, + "step": 74510 + }, + { + "epoch": 8.197469746974697, + "grad_norm": 3.1735081672668457, + "learning_rate": 3.6746959666453966e-05, + "loss": 0.0359, + "num_input_tokens_seen": 15725136, + "step": 74515 + }, + { + "epoch": 8.198019801980198, + "grad_norm": 0.39961573481559753, + "learning_rate": 3.6744840997753086e-05, + "loss": 0.0655, + "num_input_tokens_seen": 15726192, + "step": 74520 + }, + { + "epoch": 8.1985698569857, + "grad_norm": 0.3776834309101105, + "learning_rate": 3.674272222080563e-05, + "loss": 0.0423, + "num_input_tokens_seen": 15727280, + "step": 74525 + }, + { + "epoch": 8.199119911991199, + "grad_norm": 0.07113740593194962, + "learning_rate": 3.674060333563114e-05, + "loss": 0.0945, + "num_input_tokens_seen": 15728336, + "step": 74530 + }, + { + "epoch": 8.1996699669967, + "grad_norm": 0.9311324954032898, + "learning_rate": 3.673848434224915e-05, + "loss": 0.0457, + "num_input_tokens_seen": 15729392, + "step": 74535 + }, + { + "epoch": 8.2002200220022, + "grad_norm": 0.01758316345512867, + "learning_rate": 3.673636524067918e-05, + "loss": 0.0457, + "num_input_tokens_seen": 15730384, + "step": 74540 + }, + { + "epoch": 8.2007700770077, + "grad_norm": 0.15802153944969177, + "learning_rate": 3.6734246030940756e-05, + "loss": 0.0498, + "num_input_tokens_seen": 15731472, + "step": 74545 + }, + { + "epoch": 8.201320132013201, + "grad_norm": 0.01766812615096569, + "learning_rate": 3.673212671305343e-05, + "loss": 0.016, + "num_input_tokens_seen": 15732432, + "step": 74550 + }, + { + "epoch": 8.201870187018702, + "grad_norm": 0.029646342620253563, + "learning_rate": 3.673000728703671e-05, + "loss": 0.006, + "num_input_tokens_seen": 15733520, + "step": 74555 + }, + { + "epoch": 8.202420242024202, + "grad_norm": 0.015423267148435116, + "learning_rate": 3.672788775291015e-05, + "loss": 0.1229, + "num_input_tokens_seen": 15734608, + "step": 74560 + }, + { + "epoch": 8.202970297029703, + "grad_norm": 0.026614027097821236, + "learning_rate": 3.672576811069327e-05, + "loss": 0.0525, + "num_input_tokens_seen": 15735760, + "step": 74565 + }, + { + "epoch": 8.203520352035204, + "grad_norm": 0.007844561710953712, + "learning_rate": 3.6723648360405614e-05, + "loss": 0.0032, + "num_input_tokens_seen": 15736816, + "step": 74570 + }, + { + "epoch": 8.204070407040705, + "grad_norm": 2.131572723388672, + "learning_rate": 3.672152850206672e-05, + "loss": 0.0922, + "num_input_tokens_seen": 15737840, + "step": 74575 + }, + { + "epoch": 8.204620462046204, + "grad_norm": 0.5340148210525513, + "learning_rate": 3.671940853569611e-05, + "loss": 0.0174, + "num_input_tokens_seen": 15738864, + "step": 74580 + }, + { + "epoch": 8.205170517051705, + "grad_norm": 0.015072300098836422, + "learning_rate": 3.671728846131335e-05, + "loss": 0.043, + "num_input_tokens_seen": 15739984, + "step": 74585 + }, + { + "epoch": 8.205720572057206, + "grad_norm": 0.029057404026389122, + "learning_rate": 3.6715168278937954e-05, + "loss": 0.0719, + "num_input_tokens_seen": 15741008, + "step": 74590 + }, + { + "epoch": 8.206270627062706, + "grad_norm": 0.07381368428468704, + "learning_rate": 3.671304798858948e-05, + "loss": 0.0187, + "num_input_tokens_seen": 15742064, + "step": 74595 + }, + { + "epoch": 8.206820682068207, + "grad_norm": 0.03508549928665161, + "learning_rate": 3.6710927590287464e-05, + "loss": 0.0045, + "num_input_tokens_seen": 15743120, + "step": 74600 + }, + { + "epoch": 8.207370737073708, + "grad_norm": 0.1080874428153038, + "learning_rate": 3.670880708405144e-05, + "loss": 0.0327, + "num_input_tokens_seen": 15744080, + "step": 74605 + }, + { + "epoch": 8.207920792079207, + "grad_norm": 0.21202802658081055, + "learning_rate": 3.670668646990097e-05, + "loss": 0.0181, + "num_input_tokens_seen": 15745104, + "step": 74610 + }, + { + "epoch": 8.208470847084708, + "grad_norm": 0.07893117517232895, + "learning_rate": 3.6704565747855586e-05, + "loss": 0.0053, + "num_input_tokens_seen": 15746128, + "step": 74615 + }, + { + "epoch": 8.20902090209021, + "grad_norm": 0.025057289749383926, + "learning_rate": 3.6702444917934826e-05, + "loss": 0.01, + "num_input_tokens_seen": 15747216, + "step": 74620 + }, + { + "epoch": 8.209570957095709, + "grad_norm": 0.07424211502075195, + "learning_rate": 3.670032398015826e-05, + "loss": 0.0171, + "num_input_tokens_seen": 15748208, + "step": 74625 + }, + { + "epoch": 8.21012101210121, + "grad_norm": 0.024504413828253746, + "learning_rate": 3.669820293454541e-05, + "loss": 0.0455, + "num_input_tokens_seen": 15749296, + "step": 74630 + }, + { + "epoch": 8.210671067106711, + "grad_norm": 0.37565892934799194, + "learning_rate": 3.6696081781115844e-05, + "loss": 0.0071, + "num_input_tokens_seen": 15750320, + "step": 74635 + }, + { + "epoch": 8.211221122112212, + "grad_norm": 0.023396819829940796, + "learning_rate": 3.6693960519889106e-05, + "loss": 0.0239, + "num_input_tokens_seen": 15751344, + "step": 74640 + }, + { + "epoch": 8.211771177117711, + "grad_norm": 0.6461865305900574, + "learning_rate": 3.6691839150884744e-05, + "loss": 0.0146, + "num_input_tokens_seen": 15752400, + "step": 74645 + }, + { + "epoch": 8.212321232123212, + "grad_norm": 1.0991588830947876, + "learning_rate": 3.6689717674122306e-05, + "loss": 0.022, + "num_input_tokens_seen": 15753520, + "step": 74650 + }, + { + "epoch": 8.212871287128714, + "grad_norm": 0.05134877189993858, + "learning_rate": 3.668759608962135e-05, + "loss": 0.0117, + "num_input_tokens_seen": 15754608, + "step": 74655 + }, + { + "epoch": 8.213421342134213, + "grad_norm": 0.3382561206817627, + "learning_rate": 3.668547439740144e-05, + "loss": 0.0422, + "num_input_tokens_seen": 15755696, + "step": 74660 + }, + { + "epoch": 8.213971397139714, + "grad_norm": 0.13734254240989685, + "learning_rate": 3.66833525974821e-05, + "loss": 0.0443, + "num_input_tokens_seen": 15756720, + "step": 74665 + }, + { + "epoch": 8.214521452145215, + "grad_norm": 0.14326182007789612, + "learning_rate": 3.6681230689882924e-05, + "loss": 0.0199, + "num_input_tokens_seen": 15757712, + "step": 74670 + }, + { + "epoch": 8.215071507150714, + "grad_norm": 0.40313979983329773, + "learning_rate": 3.6679108674623444e-05, + "loss": 0.0255, + "num_input_tokens_seen": 15758768, + "step": 74675 + }, + { + "epoch": 8.215621562156215, + "grad_norm": 0.08920111507177353, + "learning_rate": 3.6676986551723214e-05, + "loss": 0.0232, + "num_input_tokens_seen": 15759792, + "step": 74680 + }, + { + "epoch": 8.216171617161717, + "grad_norm": 0.4943363666534424, + "learning_rate": 3.667486432120182e-05, + "loss": 0.0228, + "num_input_tokens_seen": 15760816, + "step": 74685 + }, + { + "epoch": 8.216721672167218, + "grad_norm": 0.05793622508645058, + "learning_rate": 3.667274198307879e-05, + "loss": 0.0403, + "num_input_tokens_seen": 15761904, + "step": 74690 + }, + { + "epoch": 8.217271727172717, + "grad_norm": 0.18557432293891907, + "learning_rate": 3.667061953737369e-05, + "loss": 0.0821, + "num_input_tokens_seen": 15762864, + "step": 74695 + }, + { + "epoch": 8.217821782178218, + "grad_norm": 0.015901535749435425, + "learning_rate": 3.666849698410611e-05, + "loss": 0.0845, + "num_input_tokens_seen": 15763920, + "step": 74700 + }, + { + "epoch": 8.218371837183719, + "grad_norm": 0.006426754407584667, + "learning_rate": 3.666637432329559e-05, + "loss": 0.015, + "num_input_tokens_seen": 15764912, + "step": 74705 + }, + { + "epoch": 8.218921892189218, + "grad_norm": 1.1100000143051147, + "learning_rate": 3.666425155496169e-05, + "loss": 0.0443, + "num_input_tokens_seen": 15765968, + "step": 74710 + }, + { + "epoch": 8.21947194719472, + "grad_norm": 0.08244870603084564, + "learning_rate": 3.666212867912398e-05, + "loss": 0.0092, + "num_input_tokens_seen": 15767024, + "step": 74715 + }, + { + "epoch": 8.22002200220022, + "grad_norm": 1.1086117029190063, + "learning_rate": 3.666000569580202e-05, + "loss": 0.055, + "num_input_tokens_seen": 15768048, + "step": 74720 + }, + { + "epoch": 8.22057205720572, + "grad_norm": 0.023897672072052956, + "learning_rate": 3.665788260501539e-05, + "loss": 0.0132, + "num_input_tokens_seen": 15769072, + "step": 74725 + }, + { + "epoch": 8.221122112211221, + "grad_norm": 0.011738136410713196, + "learning_rate": 3.665575940678366e-05, + "loss": 0.0273, + "num_input_tokens_seen": 15770128, + "step": 74730 + }, + { + "epoch": 8.221672167216722, + "grad_norm": 0.007534438278526068, + "learning_rate": 3.665363610112637e-05, + "loss": 0.0281, + "num_input_tokens_seen": 15771184, + "step": 74735 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.06766775995492935, + "learning_rate": 3.665151268806312e-05, + "loss": 0.0081, + "num_input_tokens_seen": 15772272, + "step": 74740 + }, + { + "epoch": 8.222772277227723, + "grad_norm": 0.3775279223918915, + "learning_rate": 3.664938916761346e-05, + "loss": 0.0198, + "num_input_tokens_seen": 15773392, + "step": 74745 + }, + { + "epoch": 8.223322332233224, + "grad_norm": 1.8049083948135376, + "learning_rate": 3.6647265539796975e-05, + "loss": 0.047, + "num_input_tokens_seen": 15774416, + "step": 74750 + }, + { + "epoch": 8.223872387238725, + "grad_norm": 0.2820225954055786, + "learning_rate": 3.664514180463324e-05, + "loss": 0.0742, + "num_input_tokens_seen": 15775440, + "step": 74755 + }, + { + "epoch": 8.224422442244224, + "grad_norm": 0.015187487006187439, + "learning_rate": 3.664301796214181e-05, + "loss": 0.0015, + "num_input_tokens_seen": 15776400, + "step": 74760 + }, + { + "epoch": 8.224972497249725, + "grad_norm": 0.20404356718063354, + "learning_rate": 3.6640894012342267e-05, + "loss": 0.0165, + "num_input_tokens_seen": 15777456, + "step": 74765 + }, + { + "epoch": 8.225522552255226, + "grad_norm": 0.024408681318163872, + "learning_rate": 3.66387699552542e-05, + "loss": 0.1036, + "num_input_tokens_seen": 15778544, + "step": 74770 + }, + { + "epoch": 8.226072607260726, + "grad_norm": 0.07506877183914185, + "learning_rate": 3.6636645790897164e-05, + "loss": 0.0144, + "num_input_tokens_seen": 15779536, + "step": 74775 + }, + { + "epoch": 8.226622662266227, + "grad_norm": 0.16583935916423798, + "learning_rate": 3.663452151929076e-05, + "loss": 0.0081, + "num_input_tokens_seen": 15780560, + "step": 74780 + }, + { + "epoch": 8.227172717271728, + "grad_norm": 0.010156551375985146, + "learning_rate": 3.663239714045455e-05, + "loss": 0.0083, + "num_input_tokens_seen": 15781584, + "step": 74785 + }, + { + "epoch": 8.227722772277227, + "grad_norm": 0.11682573705911636, + "learning_rate": 3.663027265440812e-05, + "loss": 0.027, + "num_input_tokens_seen": 15782672, + "step": 74790 + }, + { + "epoch": 8.228272827282728, + "grad_norm": 0.08446171879768372, + "learning_rate": 3.662814806117103e-05, + "loss": 0.0199, + "num_input_tokens_seen": 15783728, + "step": 74795 + }, + { + "epoch": 8.22882288228823, + "grad_norm": 0.018232295289635658, + "learning_rate": 3.66260233607629e-05, + "loss": 0.0727, + "num_input_tokens_seen": 15784720, + "step": 74800 + }, + { + "epoch": 8.229372937293729, + "grad_norm": 0.020774956792593002, + "learning_rate": 3.662389855320329e-05, + "loss": 0.0256, + "num_input_tokens_seen": 15785776, + "step": 74805 + }, + { + "epoch": 8.22992299229923, + "grad_norm": 0.11151614785194397, + "learning_rate": 3.662177363851177e-05, + "loss": 0.0739, + "num_input_tokens_seen": 15786864, + "step": 74810 + }, + { + "epoch": 8.23047304730473, + "grad_norm": 0.02262742817401886, + "learning_rate": 3.6619648616707954e-05, + "loss": 0.1521, + "num_input_tokens_seen": 15787920, + "step": 74815 + }, + { + "epoch": 8.231023102310232, + "grad_norm": 0.07986040413379669, + "learning_rate": 3.66175234878114e-05, + "loss": 0.0502, + "num_input_tokens_seen": 15788976, + "step": 74820 + }, + { + "epoch": 8.231573157315731, + "grad_norm": 0.14275309443473816, + "learning_rate": 3.6615398251841716e-05, + "loss": 0.0207, + "num_input_tokens_seen": 15790032, + "step": 74825 + }, + { + "epoch": 8.232123212321232, + "grad_norm": 0.05849220231175423, + "learning_rate": 3.661327290881847e-05, + "loss": 0.0187, + "num_input_tokens_seen": 15791088, + "step": 74830 + }, + { + "epoch": 8.232673267326733, + "grad_norm": 0.034639813005924225, + "learning_rate": 3.661114745876127e-05, + "loss": 0.1521, + "num_input_tokens_seen": 15792144, + "step": 74835 + }, + { + "epoch": 8.233223322332233, + "grad_norm": 0.06102989986538887, + "learning_rate": 3.660902190168969e-05, + "loss": 0.0077, + "num_input_tokens_seen": 15793296, + "step": 74840 + }, + { + "epoch": 8.233773377337734, + "grad_norm": 0.042906444519758224, + "learning_rate": 3.6606896237623336e-05, + "loss": 0.0221, + "num_input_tokens_seen": 15794416, + "step": 74845 + }, + { + "epoch": 8.234323432343235, + "grad_norm": 0.8642537593841553, + "learning_rate": 3.660477046658178e-05, + "loss": 0.0541, + "num_input_tokens_seen": 15795440, + "step": 74850 + }, + { + "epoch": 8.234873487348734, + "grad_norm": 0.11945988982915878, + "learning_rate": 3.660264458858462e-05, + "loss": 0.0145, + "num_input_tokens_seen": 15796496, + "step": 74855 + }, + { + "epoch": 8.235423542354235, + "grad_norm": 0.04690886661410332, + "learning_rate": 3.660051860365146e-05, + "loss": 0.0212, + "num_input_tokens_seen": 15797584, + "step": 74860 + }, + { + "epoch": 8.235973597359736, + "grad_norm": 0.07045800983905792, + "learning_rate": 3.659839251180188e-05, + "loss": 0.0369, + "num_input_tokens_seen": 15798608, + "step": 74865 + }, + { + "epoch": 8.236523652365236, + "grad_norm": 0.02486533485352993, + "learning_rate": 3.6596266313055484e-05, + "loss": 0.0079, + "num_input_tokens_seen": 15799728, + "step": 74870 + }, + { + "epoch": 8.237073707370737, + "grad_norm": 4.176649570465088, + "learning_rate": 3.659414000743187e-05, + "loss": 0.075, + "num_input_tokens_seen": 15800720, + "step": 74875 + }, + { + "epoch": 8.237623762376238, + "grad_norm": 0.02952933870255947, + "learning_rate": 3.659201359495062e-05, + "loss": 0.0189, + "num_input_tokens_seen": 15801808, + "step": 74880 + }, + { + "epoch": 8.238173817381739, + "grad_norm": 0.047068871557712555, + "learning_rate": 3.658988707563135e-05, + "loss": 0.021, + "num_input_tokens_seen": 15802864, + "step": 74885 + }, + { + "epoch": 8.238723872387238, + "grad_norm": 0.015115920454263687, + "learning_rate": 3.658776044949366e-05, + "loss": 0.0105, + "num_input_tokens_seen": 15803920, + "step": 74890 + }, + { + "epoch": 8.23927392739274, + "grad_norm": 0.06047413498163223, + "learning_rate": 3.658563371655713e-05, + "loss": 0.0069, + "num_input_tokens_seen": 15804944, + "step": 74895 + }, + { + "epoch": 8.23982398239824, + "grad_norm": 0.02970319241285324, + "learning_rate": 3.658350687684138e-05, + "loss": 0.0267, + "num_input_tokens_seen": 15806000, + "step": 74900 + }, + { + "epoch": 8.24037403740374, + "grad_norm": 0.4473764896392822, + "learning_rate": 3.6581379930366005e-05, + "loss": 0.012, + "num_input_tokens_seen": 15806960, + "step": 74905 + }, + { + "epoch": 8.24092409240924, + "grad_norm": 0.054306261241436005, + "learning_rate": 3.6579252877150606e-05, + "loss": 0.0029, + "num_input_tokens_seen": 15808016, + "step": 74910 + }, + { + "epoch": 8.241474147414742, + "grad_norm": 0.12022329866886139, + "learning_rate": 3.657712571721479e-05, + "loss": 0.0027, + "num_input_tokens_seen": 15809104, + "step": 74915 + }, + { + "epoch": 8.242024202420241, + "grad_norm": 0.01708090491592884, + "learning_rate": 3.657499845057817e-05, + "loss": 0.0386, + "num_input_tokens_seen": 15810160, + "step": 74920 + }, + { + "epoch": 8.242574257425742, + "grad_norm": 1.2037885189056396, + "learning_rate": 3.6572871077260326e-05, + "loss": 0.215, + "num_input_tokens_seen": 15811184, + "step": 74925 + }, + { + "epoch": 8.243124312431243, + "grad_norm": 0.025135671719908714, + "learning_rate": 3.6570743597280895e-05, + "loss": 0.0079, + "num_input_tokens_seen": 15812208, + "step": 74930 + }, + { + "epoch": 8.243674367436745, + "grad_norm": 0.04132189601659775, + "learning_rate": 3.656861601065947e-05, + "loss": 0.0629, + "num_input_tokens_seen": 15813296, + "step": 74935 + }, + { + "epoch": 8.244224422442244, + "grad_norm": 0.06382520496845245, + "learning_rate": 3.656648831741566e-05, + "loss": 0.0104, + "num_input_tokens_seen": 15814384, + "step": 74940 + }, + { + "epoch": 8.244774477447745, + "grad_norm": 0.2836191654205322, + "learning_rate": 3.6564360517569076e-05, + "loss": 0.0121, + "num_input_tokens_seen": 15815440, + "step": 74945 + }, + { + "epoch": 8.245324532453246, + "grad_norm": 1.8639010190963745, + "learning_rate": 3.656223261113934e-05, + "loss": 0.1057, + "num_input_tokens_seen": 15816496, + "step": 74950 + }, + { + "epoch": 8.245874587458745, + "grad_norm": 0.02941872738301754, + "learning_rate": 3.656010459814604e-05, + "loss": 0.1502, + "num_input_tokens_seen": 15817552, + "step": 74955 + }, + { + "epoch": 8.246424642464246, + "grad_norm": 1.26988685131073, + "learning_rate": 3.655797647860881e-05, + "loss": 0.0528, + "num_input_tokens_seen": 15818576, + "step": 74960 + }, + { + "epoch": 8.246974697469748, + "grad_norm": 1.6867587566375732, + "learning_rate": 3.655584825254726e-05, + "loss": 0.0693, + "num_input_tokens_seen": 15819632, + "step": 74965 + }, + { + "epoch": 8.247524752475247, + "grad_norm": 0.4701750874519348, + "learning_rate": 3.6553719919981e-05, + "loss": 0.0164, + "num_input_tokens_seen": 15820688, + "step": 74970 + }, + { + "epoch": 8.248074807480748, + "grad_norm": 0.02659609168767929, + "learning_rate": 3.655159148092965e-05, + "loss": 0.0489, + "num_input_tokens_seen": 15821712, + "step": 74975 + }, + { + "epoch": 8.248624862486249, + "grad_norm": 0.27531254291534424, + "learning_rate": 3.654946293541282e-05, + "loss": 0.0267, + "num_input_tokens_seen": 15822800, + "step": 74980 + }, + { + "epoch": 8.249174917491748, + "grad_norm": 0.0364299975335598, + "learning_rate": 3.654733428345013e-05, + "loss": 0.0596, + "num_input_tokens_seen": 15823824, + "step": 74985 + }, + { + "epoch": 8.24972497249725, + "grad_norm": 0.09298881143331528, + "learning_rate": 3.654520552506121e-05, + "loss": 0.0773, + "num_input_tokens_seen": 15824912, + "step": 74990 + }, + { + "epoch": 8.25027502750275, + "grad_norm": 0.0837704986333847, + "learning_rate": 3.6543076660265666e-05, + "loss": 0.004, + "num_input_tokens_seen": 15825936, + "step": 74995 + }, + { + "epoch": 8.250825082508252, + "grad_norm": 0.5325207114219666, + "learning_rate": 3.654094768908312e-05, + "loss": 0.0371, + "num_input_tokens_seen": 15827024, + "step": 75000 + }, + { + "epoch": 8.251375137513751, + "grad_norm": 0.20391790568828583, + "learning_rate": 3.6538818611533196e-05, + "loss": 0.0069, + "num_input_tokens_seen": 15828048, + "step": 75005 + }, + { + "epoch": 8.251925192519252, + "grad_norm": 2.7040936946868896, + "learning_rate": 3.6536689427635516e-05, + "loss": 0.0924, + "num_input_tokens_seen": 15829136, + "step": 75010 + }, + { + "epoch": 8.252475247524753, + "grad_norm": 0.23579977452754974, + "learning_rate": 3.6534560137409715e-05, + "loss": 0.0982, + "num_input_tokens_seen": 15830224, + "step": 75015 + }, + { + "epoch": 8.253025302530252, + "grad_norm": 0.33334434032440186, + "learning_rate": 3.6532430740875404e-05, + "loss": 0.0854, + "num_input_tokens_seen": 15831248, + "step": 75020 + }, + { + "epoch": 8.253575357535754, + "grad_norm": 0.9218437671661377, + "learning_rate": 3.6530301238052214e-05, + "loss": 0.0659, + "num_input_tokens_seen": 15832336, + "step": 75025 + }, + { + "epoch": 8.254125412541255, + "grad_norm": 0.042958471924066544, + "learning_rate": 3.652817162895976e-05, + "loss": 0.008, + "num_input_tokens_seen": 15833392, + "step": 75030 + }, + { + "epoch": 8.254675467546754, + "grad_norm": 0.02251617982983589, + "learning_rate": 3.6526041913617687e-05, + "loss": 0.0929, + "num_input_tokens_seen": 15834448, + "step": 75035 + }, + { + "epoch": 8.255225522552255, + "grad_norm": 0.006862882524728775, + "learning_rate": 3.652391209204562e-05, + "loss": 0.0134, + "num_input_tokens_seen": 15835536, + "step": 75040 + }, + { + "epoch": 8.255775577557756, + "grad_norm": 0.004125946667045355, + "learning_rate": 3.6521782164263186e-05, + "loss": 0.0285, + "num_input_tokens_seen": 15836656, + "step": 75045 + }, + { + "epoch": 8.256325632563255, + "grad_norm": 0.024139896035194397, + "learning_rate": 3.651965213029e-05, + "loss": 0.0362, + "num_input_tokens_seen": 15837712, + "step": 75050 + }, + { + "epoch": 8.256875687568757, + "grad_norm": 0.03226299583911896, + "learning_rate": 3.651752199014572e-05, + "loss": 0.0402, + "num_input_tokens_seen": 15838800, + "step": 75055 + }, + { + "epoch": 8.257425742574258, + "grad_norm": 0.0255307424813509, + "learning_rate": 3.651539174384997e-05, + "loss": 0.0196, + "num_input_tokens_seen": 15839856, + "step": 75060 + }, + { + "epoch": 8.257975797579759, + "grad_norm": 0.08963066339492798, + "learning_rate": 3.6513261391422376e-05, + "loss": 0.0104, + "num_input_tokens_seen": 15840976, + "step": 75065 + }, + { + "epoch": 8.258525852585258, + "grad_norm": 0.9974145293235779, + "learning_rate": 3.6511130932882575e-05, + "loss": 0.1178, + "num_input_tokens_seen": 15842032, + "step": 75070 + }, + { + "epoch": 8.25907590759076, + "grad_norm": 1.446153163909912, + "learning_rate": 3.65090003682502e-05, + "loss": 0.055, + "num_input_tokens_seen": 15843120, + "step": 75075 + }, + { + "epoch": 8.25962596259626, + "grad_norm": 0.04594692587852478, + "learning_rate": 3.6506869697544896e-05, + "loss": 0.0171, + "num_input_tokens_seen": 15844112, + "step": 75080 + }, + { + "epoch": 8.26017601760176, + "grad_norm": 1.226117730140686, + "learning_rate": 3.6504738920786295e-05, + "loss": 0.0251, + "num_input_tokens_seen": 15845200, + "step": 75085 + }, + { + "epoch": 8.26072607260726, + "grad_norm": 0.028809864073991776, + "learning_rate": 3.650260803799404e-05, + "loss": 0.0827, + "num_input_tokens_seen": 15846256, + "step": 75090 + }, + { + "epoch": 8.261276127612762, + "grad_norm": 0.4662822186946869, + "learning_rate": 3.650047704918776e-05, + "loss": 0.1675, + "num_input_tokens_seen": 15847344, + "step": 75095 + }, + { + "epoch": 8.261826182618261, + "grad_norm": 0.02380370907485485, + "learning_rate": 3.64983459543871e-05, + "loss": 0.0114, + "num_input_tokens_seen": 15848336, + "step": 75100 + }, + { + "epoch": 8.262376237623762, + "grad_norm": 0.08472955971956253, + "learning_rate": 3.649621475361171e-05, + "loss": 0.1125, + "num_input_tokens_seen": 15849360, + "step": 75105 + }, + { + "epoch": 8.262926292629263, + "grad_norm": 0.03232446685433388, + "learning_rate": 3.649408344688122e-05, + "loss": 0.0303, + "num_input_tokens_seen": 15850448, + "step": 75110 + }, + { + "epoch": 8.263476347634764, + "grad_norm": 0.017008427530527115, + "learning_rate": 3.6491952034215284e-05, + "loss": 0.0079, + "num_input_tokens_seen": 15851472, + "step": 75115 + }, + { + "epoch": 8.264026402640264, + "grad_norm": 0.026301810517907143, + "learning_rate": 3.6489820515633535e-05, + "loss": 0.0482, + "num_input_tokens_seen": 15852496, + "step": 75120 + }, + { + "epoch": 8.264576457645765, + "grad_norm": 0.03935360535979271, + "learning_rate": 3.6487688891155625e-05, + "loss": 0.0443, + "num_input_tokens_seen": 15853584, + "step": 75125 + }, + { + "epoch": 8.265126512651266, + "grad_norm": 0.02579510770738125, + "learning_rate": 3.648555716080119e-05, + "loss": 0.0166, + "num_input_tokens_seen": 15854672, + "step": 75130 + }, + { + "epoch": 8.265676567656765, + "grad_norm": 0.44260385632514954, + "learning_rate": 3.64834253245899e-05, + "loss": 0.0112, + "num_input_tokens_seen": 15855760, + "step": 75135 + }, + { + "epoch": 8.266226622662266, + "grad_norm": 0.009931623004376888, + "learning_rate": 3.648129338254139e-05, + "loss": 0.0171, + "num_input_tokens_seen": 15856752, + "step": 75140 + }, + { + "epoch": 8.266776677667767, + "grad_norm": 0.019947540014982224, + "learning_rate": 3.6479161334675296e-05, + "loss": 0.0283, + "num_input_tokens_seen": 15857840, + "step": 75145 + }, + { + "epoch": 8.267326732673267, + "grad_norm": 0.09232159703969955, + "learning_rate": 3.647702918101128e-05, + "loss": 0.0412, + "num_input_tokens_seen": 15858960, + "step": 75150 + }, + { + "epoch": 8.267876787678768, + "grad_norm": 0.242302805185318, + "learning_rate": 3.6474896921569e-05, + "loss": 0.0204, + "num_input_tokens_seen": 15860048, + "step": 75155 + }, + { + "epoch": 8.268426842684269, + "grad_norm": 0.03109661117196083, + "learning_rate": 3.6472764556368105e-05, + "loss": 0.0492, + "num_input_tokens_seen": 15861104, + "step": 75160 + }, + { + "epoch": 8.268976897689768, + "grad_norm": 0.1025395542383194, + "learning_rate": 3.647063208542824e-05, + "loss": 0.1016, + "num_input_tokens_seen": 15862192, + "step": 75165 + }, + { + "epoch": 8.26952695269527, + "grad_norm": 1.6157665252685547, + "learning_rate": 3.646849950876906e-05, + "loss": 0.0614, + "num_input_tokens_seen": 15863216, + "step": 75170 + }, + { + "epoch": 8.27007700770077, + "grad_norm": 0.06498632580041885, + "learning_rate": 3.646636682641022e-05, + "loss": 0.0335, + "num_input_tokens_seen": 15864240, + "step": 75175 + }, + { + "epoch": 8.270627062706271, + "grad_norm": 0.02232314646244049, + "learning_rate": 3.6464234038371394e-05, + "loss": 0.0143, + "num_input_tokens_seen": 15865328, + "step": 75180 + }, + { + "epoch": 8.27117711771177, + "grad_norm": 0.2294072061777115, + "learning_rate": 3.646210114467221e-05, + "loss": 0.0246, + "num_input_tokens_seen": 15866416, + "step": 75185 + }, + { + "epoch": 8.271727172717272, + "grad_norm": 1.203190803527832, + "learning_rate": 3.645996814533235e-05, + "loss": 0.0971, + "num_input_tokens_seen": 15867536, + "step": 75190 + }, + { + "epoch": 8.272277227722773, + "grad_norm": 0.3560183048248291, + "learning_rate": 3.645783504037146e-05, + "loss": 0.0088, + "num_input_tokens_seen": 15868592, + "step": 75195 + }, + { + "epoch": 8.272827282728272, + "grad_norm": 0.8945441246032715, + "learning_rate": 3.6455701829809196e-05, + "loss": 0.0369, + "num_input_tokens_seen": 15869712, + "step": 75200 + }, + { + "epoch": 8.273377337733773, + "grad_norm": 1.634368896484375, + "learning_rate": 3.6453568513665227e-05, + "loss": 0.0928, + "num_input_tokens_seen": 15870768, + "step": 75205 + }, + { + "epoch": 8.273927392739274, + "grad_norm": 0.016756055876612663, + "learning_rate": 3.645143509195922e-05, + "loss": 0.0404, + "num_input_tokens_seen": 15871856, + "step": 75210 + }, + { + "epoch": 8.274477447744774, + "grad_norm": 0.03780865669250488, + "learning_rate": 3.644930156471082e-05, + "loss": 0.0037, + "num_input_tokens_seen": 15872976, + "step": 75215 + }, + { + "epoch": 8.275027502750275, + "grad_norm": 0.05931974574923515, + "learning_rate": 3.644716793193971e-05, + "loss": 0.0021, + "num_input_tokens_seen": 15874000, + "step": 75220 + }, + { + "epoch": 8.275577557755776, + "grad_norm": 1.2785837650299072, + "learning_rate": 3.644503419366554e-05, + "loss": 0.0572, + "num_input_tokens_seen": 15875024, + "step": 75225 + }, + { + "epoch": 8.276127612761275, + "grad_norm": 0.023136582225561142, + "learning_rate": 3.6442900349907985e-05, + "loss": 0.0107, + "num_input_tokens_seen": 15876144, + "step": 75230 + }, + { + "epoch": 8.276677667766776, + "grad_norm": 0.38206803798675537, + "learning_rate": 3.644076640068671e-05, + "loss": 0.0577, + "num_input_tokens_seen": 15877232, + "step": 75235 + }, + { + "epoch": 8.277227722772277, + "grad_norm": 0.9347240328788757, + "learning_rate": 3.6438632346021376e-05, + "loss": 0.0567, + "num_input_tokens_seen": 15878256, + "step": 75240 + }, + { + "epoch": 8.277777777777779, + "grad_norm": 0.27452918887138367, + "learning_rate": 3.643649818593166e-05, + "loss": 0.0567, + "num_input_tokens_seen": 15879344, + "step": 75245 + }, + { + "epoch": 8.278327832783278, + "grad_norm": 0.05434121936559677, + "learning_rate": 3.6434363920437226e-05, + "loss": 0.0104, + "num_input_tokens_seen": 15880368, + "step": 75250 + }, + { + "epoch": 8.278877887788779, + "grad_norm": 1.474405288696289, + "learning_rate": 3.6432229549557754e-05, + "loss": 0.0843, + "num_input_tokens_seen": 15881456, + "step": 75255 + }, + { + "epoch": 8.27942794279428, + "grad_norm": 0.021234726533293724, + "learning_rate": 3.64300950733129e-05, + "loss": 0.0683, + "num_input_tokens_seen": 15882512, + "step": 75260 + }, + { + "epoch": 8.27997799779978, + "grad_norm": 0.7281951308250427, + "learning_rate": 3.6427960491722346e-05, + "loss": 0.0344, + "num_input_tokens_seen": 15883600, + "step": 75265 + }, + { + "epoch": 8.28052805280528, + "grad_norm": 1.828455924987793, + "learning_rate": 3.642582580480576e-05, + "loss": 0.0291, + "num_input_tokens_seen": 15884688, + "step": 75270 + }, + { + "epoch": 8.281078107810782, + "grad_norm": 0.2367856353521347, + "learning_rate": 3.642369101258283e-05, + "loss": 0.0207, + "num_input_tokens_seen": 15885744, + "step": 75275 + }, + { + "epoch": 8.281628162816281, + "grad_norm": 0.8248580694198608, + "learning_rate": 3.642155611507321e-05, + "loss": 0.0235, + "num_input_tokens_seen": 15886832, + "step": 75280 + }, + { + "epoch": 8.282178217821782, + "grad_norm": 0.017012761905789375, + "learning_rate": 3.6419421112296596e-05, + "loss": 0.0503, + "num_input_tokens_seen": 15887888, + "step": 75285 + }, + { + "epoch": 8.282728272827283, + "grad_norm": 1.4368559122085571, + "learning_rate": 3.6417286004272654e-05, + "loss": 0.0376, + "num_input_tokens_seen": 15888944, + "step": 75290 + }, + { + "epoch": 8.283278327832782, + "grad_norm": 1.4891687631607056, + "learning_rate": 3.641515079102107e-05, + "loss": 0.0128, + "num_input_tokens_seen": 15890064, + "step": 75295 + }, + { + "epoch": 8.283828382838283, + "grad_norm": 0.8377586603164673, + "learning_rate": 3.641301547256151e-05, + "loss": 0.0168, + "num_input_tokens_seen": 15891120, + "step": 75300 + }, + { + "epoch": 8.284378437843785, + "grad_norm": 0.10825410485267639, + "learning_rate": 3.641088004891367e-05, + "loss": 0.0323, + "num_input_tokens_seen": 15892144, + "step": 75305 + }, + { + "epoch": 8.284928492849286, + "grad_norm": 0.25088709592819214, + "learning_rate": 3.6408744520097226e-05, + "loss": 0.0968, + "num_input_tokens_seen": 15893232, + "step": 75310 + }, + { + "epoch": 8.285478547854785, + "grad_norm": 1.269460916519165, + "learning_rate": 3.640660888613185e-05, + "loss": 0.1052, + "num_input_tokens_seen": 15894288, + "step": 75315 + }, + { + "epoch": 8.286028602860286, + "grad_norm": 1.0369168519973755, + "learning_rate": 3.6404473147037235e-05, + "loss": 0.1841, + "num_input_tokens_seen": 15895312, + "step": 75320 + }, + { + "epoch": 8.286578657865787, + "grad_norm": 0.3800342082977295, + "learning_rate": 3.640233730283307e-05, + "loss": 0.0209, + "num_input_tokens_seen": 15896336, + "step": 75325 + }, + { + "epoch": 8.287128712871286, + "grad_norm": 0.19540096819400787, + "learning_rate": 3.640020135353902e-05, + "loss": 0.0152, + "num_input_tokens_seen": 15897424, + "step": 75330 + }, + { + "epoch": 8.287678767876788, + "grad_norm": 0.41419661045074463, + "learning_rate": 3.63980652991748e-05, + "loss": 0.1211, + "num_input_tokens_seen": 15898448, + "step": 75335 + }, + { + "epoch": 8.288228822882289, + "grad_norm": 0.06757443398237228, + "learning_rate": 3.639592913976008e-05, + "loss": 0.0473, + "num_input_tokens_seen": 15899536, + "step": 75340 + }, + { + "epoch": 8.288778877887788, + "grad_norm": 0.5262376666069031, + "learning_rate": 3.639379287531454e-05, + "loss": 0.0234, + "num_input_tokens_seen": 15900592, + "step": 75345 + }, + { + "epoch": 8.289328932893289, + "grad_norm": 0.1128017008304596, + "learning_rate": 3.639165650585789e-05, + "loss": 0.0115, + "num_input_tokens_seen": 15901680, + "step": 75350 + }, + { + "epoch": 8.28987898789879, + "grad_norm": 0.05173776298761368, + "learning_rate": 3.6389520031409794e-05, + "loss": 0.03, + "num_input_tokens_seen": 15902800, + "step": 75355 + }, + { + "epoch": 8.290429042904291, + "grad_norm": 0.013200867921113968, + "learning_rate": 3.6387383451989964e-05, + "loss": 0.0058, + "num_input_tokens_seen": 15903824, + "step": 75360 + }, + { + "epoch": 8.29097909790979, + "grad_norm": 0.2992584705352783, + "learning_rate": 3.6385246767618083e-05, + "loss": 0.0076, + "num_input_tokens_seen": 15904880, + "step": 75365 + }, + { + "epoch": 8.291529152915292, + "grad_norm": 0.06070028245449066, + "learning_rate": 3.638310997831386e-05, + "loss": 0.008, + "num_input_tokens_seen": 15905904, + "step": 75370 + }, + { + "epoch": 8.292079207920793, + "grad_norm": 0.017915841192007065, + "learning_rate": 3.638097308409696e-05, + "loss": 0.071, + "num_input_tokens_seen": 15906928, + "step": 75375 + }, + { + "epoch": 8.292629262926292, + "grad_norm": 0.10098917037248611, + "learning_rate": 3.637883608498709e-05, + "loss": 0.043, + "num_input_tokens_seen": 15907952, + "step": 75380 + }, + { + "epoch": 8.293179317931793, + "grad_norm": 0.04532467573881149, + "learning_rate": 3.6376698981003955e-05, + "loss": 0.0034, + "num_input_tokens_seen": 15909040, + "step": 75385 + }, + { + "epoch": 8.293729372937294, + "grad_norm": 0.04450271651148796, + "learning_rate": 3.637456177216724e-05, + "loss": 0.0202, + "num_input_tokens_seen": 15910064, + "step": 75390 + }, + { + "epoch": 8.294279427942794, + "grad_norm": 0.3359130918979645, + "learning_rate": 3.6372424458496654e-05, + "loss": 0.0203, + "num_input_tokens_seen": 15911152, + "step": 75395 + }, + { + "epoch": 8.294829482948295, + "grad_norm": 0.049469318240880966, + "learning_rate": 3.637028704001188e-05, + "loss": 0.0104, + "num_input_tokens_seen": 15912208, + "step": 75400 + }, + { + "epoch": 8.295379537953796, + "grad_norm": 0.00931565836071968, + "learning_rate": 3.636814951673263e-05, + "loss": 0.0013, + "num_input_tokens_seen": 15913264, + "step": 75405 + }, + { + "epoch": 8.295929592959295, + "grad_norm": 0.9938356280326843, + "learning_rate": 3.63660118886786e-05, + "loss": 0.0623, + "num_input_tokens_seen": 15914320, + "step": 75410 + }, + { + "epoch": 8.296479647964796, + "grad_norm": 0.023418355733156204, + "learning_rate": 3.636387415586949e-05, + "loss": 0.0212, + "num_input_tokens_seen": 15915312, + "step": 75415 + }, + { + "epoch": 8.297029702970297, + "grad_norm": 0.1637638360261917, + "learning_rate": 3.636173631832501e-05, + "loss": 0.012, + "num_input_tokens_seen": 15916400, + "step": 75420 + }, + { + "epoch": 8.297579757975798, + "grad_norm": 0.5032601952552795, + "learning_rate": 3.6359598376064866e-05, + "loss": 0.0327, + "num_input_tokens_seen": 15917456, + "step": 75425 + }, + { + "epoch": 8.298129812981298, + "grad_norm": 0.1655227392911911, + "learning_rate": 3.635746032910874e-05, + "loss": 0.061, + "num_input_tokens_seen": 15918544, + "step": 75430 + }, + { + "epoch": 8.298679867986799, + "grad_norm": 0.3011734187602997, + "learning_rate": 3.635532217747636e-05, + "loss": 0.0499, + "num_input_tokens_seen": 15919568, + "step": 75435 + }, + { + "epoch": 8.2992299229923, + "grad_norm": 0.2688937187194824, + "learning_rate": 3.635318392118742e-05, + "loss": 0.0266, + "num_input_tokens_seen": 15920560, + "step": 75440 + }, + { + "epoch": 8.2997799779978, + "grad_norm": 0.02074517123401165, + "learning_rate": 3.6351045560261636e-05, + "loss": 0.0854, + "num_input_tokens_seen": 15921616, + "step": 75445 + }, + { + "epoch": 8.3003300330033, + "grad_norm": 0.08154340833425522, + "learning_rate": 3.6348907094718707e-05, + "loss": 0.0103, + "num_input_tokens_seen": 15922736, + "step": 75450 + }, + { + "epoch": 8.300880088008801, + "grad_norm": 0.8860952258110046, + "learning_rate": 3.634676852457835e-05, + "loss": 0.013, + "num_input_tokens_seen": 15923792, + "step": 75455 + }, + { + "epoch": 8.3014301430143, + "grad_norm": 1.576364278793335, + "learning_rate": 3.6344629849860266e-05, + "loss": 0.1167, + "num_input_tokens_seen": 15924848, + "step": 75460 + }, + { + "epoch": 8.301980198019802, + "grad_norm": 0.04831930994987488, + "learning_rate": 3.6342491070584186e-05, + "loss": 0.0275, + "num_input_tokens_seen": 15925904, + "step": 75465 + }, + { + "epoch": 8.302530253025303, + "grad_norm": 0.04121790826320648, + "learning_rate": 3.63403521867698e-05, + "loss": 0.0114, + "num_input_tokens_seen": 15926992, + "step": 75470 + }, + { + "epoch": 8.303080308030804, + "grad_norm": 0.005071187391877174, + "learning_rate": 3.633821319843683e-05, + "loss": 0.006, + "num_input_tokens_seen": 15928048, + "step": 75475 + }, + { + "epoch": 8.303630363036303, + "grad_norm": 0.007495210971683264, + "learning_rate": 3.6336074105604986e-05, + "loss": 0.0063, + "num_input_tokens_seen": 15929104, + "step": 75480 + }, + { + "epoch": 8.304180418041804, + "grad_norm": 0.6779502630233765, + "learning_rate": 3.633393490829399e-05, + "loss": 0.0197, + "num_input_tokens_seen": 15930160, + "step": 75485 + }, + { + "epoch": 8.304730473047305, + "grad_norm": 0.04881009832024574, + "learning_rate": 3.6331795606523556e-05, + "loss": 0.0044, + "num_input_tokens_seen": 15931184, + "step": 75490 + }, + { + "epoch": 8.305280528052805, + "grad_norm": 0.012501051649451256, + "learning_rate": 3.63296562003134e-05, + "loss": 0.0193, + "num_input_tokens_seen": 15932240, + "step": 75495 + }, + { + "epoch": 8.305830583058306, + "grad_norm": 0.02587384544312954, + "learning_rate": 3.632751668968323e-05, + "loss": 0.0197, + "num_input_tokens_seen": 15933328, + "step": 75500 + }, + { + "epoch": 8.306380638063807, + "grad_norm": 0.016725078225135803, + "learning_rate": 3.632537707465278e-05, + "loss": 0.0656, + "num_input_tokens_seen": 15934384, + "step": 75505 + }, + { + "epoch": 8.306930693069306, + "grad_norm": 0.03262435272336006, + "learning_rate": 3.632323735524177e-05, + "loss": 0.0133, + "num_input_tokens_seen": 15935504, + "step": 75510 + }, + { + "epoch": 8.307480748074807, + "grad_norm": 0.021750710904598236, + "learning_rate": 3.6321097531469913e-05, + "loss": 0.0121, + "num_input_tokens_seen": 15936528, + "step": 75515 + }, + { + "epoch": 8.308030803080309, + "grad_norm": 1.7309441566467285, + "learning_rate": 3.631895760335692e-05, + "loss": 0.1054, + "num_input_tokens_seen": 15937552, + "step": 75520 + }, + { + "epoch": 8.308580858085808, + "grad_norm": 0.010199707001447678, + "learning_rate": 3.631681757092254e-05, + "loss": 0.019, + "num_input_tokens_seen": 15938640, + "step": 75525 + }, + { + "epoch": 8.309130913091309, + "grad_norm": 0.027851639315485954, + "learning_rate": 3.631467743418648e-05, + "loss": 0.0015, + "num_input_tokens_seen": 15939600, + "step": 75530 + }, + { + "epoch": 8.30968096809681, + "grad_norm": 0.179750457406044, + "learning_rate": 3.631253719316847e-05, + "loss": 0.0119, + "num_input_tokens_seen": 15940656, + "step": 75535 + }, + { + "epoch": 8.310231023102311, + "grad_norm": 0.9016746282577515, + "learning_rate": 3.631039684788822e-05, + "loss": 0.0852, + "num_input_tokens_seen": 15941744, + "step": 75540 + }, + { + "epoch": 8.31078107810781, + "grad_norm": 0.018553907051682472, + "learning_rate": 3.630825639836548e-05, + "loss": 0.026, + "num_input_tokens_seen": 15942864, + "step": 75545 + }, + { + "epoch": 8.311331133113312, + "grad_norm": 0.0202504713088274, + "learning_rate": 3.630611584461996e-05, + "loss": 0.0063, + "num_input_tokens_seen": 15943856, + "step": 75550 + }, + { + "epoch": 8.311881188118813, + "grad_norm": 2.533334255218506, + "learning_rate": 3.63039751866714e-05, + "loss": 0.0512, + "num_input_tokens_seen": 15944912, + "step": 75555 + }, + { + "epoch": 8.312431243124312, + "grad_norm": 0.19135841727256775, + "learning_rate": 3.630183442453954e-05, + "loss": 0.0867, + "num_input_tokens_seen": 15946000, + "step": 75560 + }, + { + "epoch": 8.312981298129813, + "grad_norm": 0.0462048202753067, + "learning_rate": 3.629969355824407e-05, + "loss": 0.0022, + "num_input_tokens_seen": 15947056, + "step": 75565 + }, + { + "epoch": 8.313531353135314, + "grad_norm": 0.005319869611412287, + "learning_rate": 3.629755258780476e-05, + "loss": 0.0044, + "num_input_tokens_seen": 15948048, + "step": 75570 + }, + { + "epoch": 8.314081408140813, + "grad_norm": 0.40407127141952515, + "learning_rate": 3.629541151324133e-05, + "loss": 0.0121, + "num_input_tokens_seen": 15949104, + "step": 75575 + }, + { + "epoch": 8.314631463146315, + "grad_norm": 0.0187925323843956, + "learning_rate": 3.6293270334573506e-05, + "loss": 0.0262, + "num_input_tokens_seen": 15950160, + "step": 75580 + }, + { + "epoch": 8.315181518151816, + "grad_norm": 0.031402140855789185, + "learning_rate": 3.629112905182103e-05, + "loss": 0.0031, + "num_input_tokens_seen": 15951216, + "step": 75585 + }, + { + "epoch": 8.315731573157315, + "grad_norm": 0.016689464449882507, + "learning_rate": 3.628898766500364e-05, + "loss": 0.0181, + "num_input_tokens_seen": 15952336, + "step": 75590 + }, + { + "epoch": 8.316281628162816, + "grad_norm": 0.030083224177360535, + "learning_rate": 3.628684617414106e-05, + "loss": 0.0966, + "num_input_tokens_seen": 15953360, + "step": 75595 + }, + { + "epoch": 8.316831683168317, + "grad_norm": 2.590965747833252, + "learning_rate": 3.6284704579253035e-05, + "loss": 0.2327, + "num_input_tokens_seen": 15954384, + "step": 75600 + }, + { + "epoch": 8.317381738173818, + "grad_norm": 1.5812751054763794, + "learning_rate": 3.628256288035931e-05, + "loss": 0.1677, + "num_input_tokens_seen": 15955408, + "step": 75605 + }, + { + "epoch": 8.317931793179318, + "grad_norm": 0.06079591065645218, + "learning_rate": 3.628042107747962e-05, + "loss": 0.0983, + "num_input_tokens_seen": 15956432, + "step": 75610 + }, + { + "epoch": 8.318481848184819, + "grad_norm": 0.24051518738269806, + "learning_rate": 3.6278279170633687e-05, + "loss": 0.0863, + "num_input_tokens_seen": 15957488, + "step": 75615 + }, + { + "epoch": 8.31903190319032, + "grad_norm": 0.008886204101145267, + "learning_rate": 3.627613715984128e-05, + "loss": 0.0063, + "num_input_tokens_seen": 15958544, + "step": 75620 + }, + { + "epoch": 8.319581958195819, + "grad_norm": 0.03842274099588394, + "learning_rate": 3.6273995045122114e-05, + "loss": 0.0732, + "num_input_tokens_seen": 15959632, + "step": 75625 + }, + { + "epoch": 8.32013201320132, + "grad_norm": 0.2554386258125305, + "learning_rate": 3.627185282649596e-05, + "loss": 0.1398, + "num_input_tokens_seen": 15960720, + "step": 75630 + }, + { + "epoch": 8.320682068206821, + "grad_norm": 0.010837722569704056, + "learning_rate": 3.626971050398254e-05, + "loss": 0.0485, + "num_input_tokens_seen": 15961744, + "step": 75635 + }, + { + "epoch": 8.32123212321232, + "grad_norm": 0.9995863437652588, + "learning_rate": 3.6267568077601605e-05, + "loss": 0.0455, + "num_input_tokens_seen": 15962832, + "step": 75640 + }, + { + "epoch": 8.321782178217822, + "grad_norm": 2.3050482273101807, + "learning_rate": 3.6265425547372906e-05, + "loss": 0.1323, + "num_input_tokens_seen": 15963888, + "step": 75645 + }, + { + "epoch": 8.322332233223323, + "grad_norm": 0.733871340751648, + "learning_rate": 3.626328291331618e-05, + "loss": 0.0427, + "num_input_tokens_seen": 15964944, + "step": 75650 + }, + { + "epoch": 8.322882288228822, + "grad_norm": 0.8168248534202576, + "learning_rate": 3.626114017545119e-05, + "loss": 0.0171, + "num_input_tokens_seen": 15965968, + "step": 75655 + }, + { + "epoch": 8.323432343234323, + "grad_norm": 1.3746235370635986, + "learning_rate": 3.6258997333797664e-05, + "loss": 0.0236, + "num_input_tokens_seen": 15967120, + "step": 75660 + }, + { + "epoch": 8.323982398239824, + "grad_norm": 0.7420557737350464, + "learning_rate": 3.625685438837537e-05, + "loss": 0.0105, + "num_input_tokens_seen": 15968176, + "step": 75665 + }, + { + "epoch": 8.324532453245325, + "grad_norm": 1.1219825744628906, + "learning_rate": 3.625471133920404e-05, + "loss": 0.0422, + "num_input_tokens_seen": 15969232, + "step": 75670 + }, + { + "epoch": 8.325082508250825, + "grad_norm": 0.04920033738017082, + "learning_rate": 3.625256818630345e-05, + "loss": 0.0667, + "num_input_tokens_seen": 15970224, + "step": 75675 + }, + { + "epoch": 8.325632563256326, + "grad_norm": 0.07367262989282608, + "learning_rate": 3.625042492969333e-05, + "loss": 0.006, + "num_input_tokens_seen": 15971216, + "step": 75680 + }, + { + "epoch": 8.326182618261827, + "grad_norm": 0.15083052217960358, + "learning_rate": 3.624828156939344e-05, + "loss": 0.0179, + "num_input_tokens_seen": 15972240, + "step": 75685 + }, + { + "epoch": 8.326732673267326, + "grad_norm": 1.0761924982070923, + "learning_rate": 3.6246138105423546e-05, + "loss": 0.0447, + "num_input_tokens_seen": 15973296, + "step": 75690 + }, + { + "epoch": 8.327282728272827, + "grad_norm": 0.24976561963558197, + "learning_rate": 3.624399453780338e-05, + "loss": 0.0126, + "num_input_tokens_seen": 15974352, + "step": 75695 + }, + { + "epoch": 8.327832783278328, + "grad_norm": 0.8045252561569214, + "learning_rate": 3.624185086655272e-05, + "loss": 0.0184, + "num_input_tokens_seen": 15975376, + "step": 75700 + }, + { + "epoch": 8.328382838283828, + "grad_norm": 0.20276175439357758, + "learning_rate": 3.623970709169132e-05, + "loss": 0.0067, + "num_input_tokens_seen": 15976368, + "step": 75705 + }, + { + "epoch": 8.328932893289329, + "grad_norm": 0.08757639676332474, + "learning_rate": 3.6237563213238924e-05, + "loss": 0.0046, + "num_input_tokens_seen": 15977488, + "step": 75710 + }, + { + "epoch": 8.32948294829483, + "grad_norm": 0.02432985045015812, + "learning_rate": 3.62354192312153e-05, + "loss": 0.0297, + "num_input_tokens_seen": 15978544, + "step": 75715 + }, + { + "epoch": 8.33003300330033, + "grad_norm": 0.04387567192316055, + "learning_rate": 3.623327514564021e-05, + "loss": 0.0368, + "num_input_tokens_seen": 15979632, + "step": 75720 + }, + { + "epoch": 8.33058305830583, + "grad_norm": 1.0365169048309326, + "learning_rate": 3.623113095653341e-05, + "loss": 0.0533, + "num_input_tokens_seen": 15980624, + "step": 75725 + }, + { + "epoch": 8.331133113311331, + "grad_norm": 0.05118507519364357, + "learning_rate": 3.622898666391468e-05, + "loss": 0.0027, + "num_input_tokens_seen": 15981648, + "step": 75730 + }, + { + "epoch": 8.331683168316832, + "grad_norm": 0.022041460499167442, + "learning_rate": 3.6226842267803756e-05, + "loss": 0.0061, + "num_input_tokens_seen": 15982672, + "step": 75735 + }, + { + "epoch": 8.332233223322332, + "grad_norm": 0.027344470843672752, + "learning_rate": 3.622469776822041e-05, + "loss": 0.0132, + "num_input_tokens_seen": 15983728, + "step": 75740 + }, + { + "epoch": 8.332783278327833, + "grad_norm": 0.018717747181653976, + "learning_rate": 3.622255316518442e-05, + "loss": 0.1165, + "num_input_tokens_seen": 15984752, + "step": 75745 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 1.4643640518188477, + "learning_rate": 3.622040845871554e-05, + "loss": 0.0251, + "num_input_tokens_seen": 15985840, + "step": 75750 + }, + { + "epoch": 8.333883388338833, + "grad_norm": 0.03064906597137451, + "learning_rate": 3.621826364883354e-05, + "loss": 0.0342, + "num_input_tokens_seen": 15986928, + "step": 75755 + }, + { + "epoch": 8.334433443344334, + "grad_norm": 0.3787134289741516, + "learning_rate": 3.621611873555818e-05, + "loss": 0.006, + "num_input_tokens_seen": 15987984, + "step": 75760 + }, + { + "epoch": 8.334983498349835, + "grad_norm": 1.2186193466186523, + "learning_rate": 3.621397371890924e-05, + "loss": 0.0639, + "num_input_tokens_seen": 15989040, + "step": 75765 + }, + { + "epoch": 8.335533553355335, + "grad_norm": 0.043394893407821655, + "learning_rate": 3.621182859890649e-05, + "loss": 0.0975, + "num_input_tokens_seen": 15990160, + "step": 75770 + }, + { + "epoch": 8.336083608360836, + "grad_norm": 0.022447094321250916, + "learning_rate": 3.62096833755697e-05, + "loss": 0.0335, + "num_input_tokens_seen": 15991216, + "step": 75775 + }, + { + "epoch": 8.336633663366337, + "grad_norm": 0.11024000495672226, + "learning_rate": 3.6207538048918625e-05, + "loss": 0.0157, + "num_input_tokens_seen": 15992272, + "step": 75780 + }, + { + "epoch": 8.337183718371838, + "grad_norm": 0.08033095300197601, + "learning_rate": 3.620539261897305e-05, + "loss": 0.0033, + "num_input_tokens_seen": 15993360, + "step": 75785 + }, + { + "epoch": 8.337733773377337, + "grad_norm": 0.20454005897045135, + "learning_rate": 3.6203247085752756e-05, + "loss": 0.0285, + "num_input_tokens_seen": 15994352, + "step": 75790 + }, + { + "epoch": 8.338283828382838, + "grad_norm": 1.0426604747772217, + "learning_rate": 3.62011014492775e-05, + "loss": 0.0527, + "num_input_tokens_seen": 15995376, + "step": 75795 + }, + { + "epoch": 8.33883388338834, + "grad_norm": 0.013905170373618603, + "learning_rate": 3.6198955709567076e-05, + "loss": 0.0223, + "num_input_tokens_seen": 15996336, + "step": 75800 + }, + { + "epoch": 8.339383938393839, + "grad_norm": 0.025766145437955856, + "learning_rate": 3.619680986664125e-05, + "loss": 0.0142, + "num_input_tokens_seen": 15997456, + "step": 75805 + }, + { + "epoch": 8.33993399339934, + "grad_norm": 1.7350318431854248, + "learning_rate": 3.61946639205198e-05, + "loss": 0.1331, + "num_input_tokens_seen": 15998544, + "step": 75810 + }, + { + "epoch": 8.340484048404841, + "grad_norm": 0.00584501912817359, + "learning_rate": 3.6192517871222497e-05, + "loss": 0.0142, + "num_input_tokens_seen": 15999664, + "step": 75815 + }, + { + "epoch": 8.34103410341034, + "grad_norm": 0.02602429687976837, + "learning_rate": 3.619037171876913e-05, + "loss": 0.1305, + "num_input_tokens_seen": 16000688, + "step": 75820 + }, + { + "epoch": 8.341584158415841, + "grad_norm": 0.9706298112869263, + "learning_rate": 3.6188225463179485e-05, + "loss": 0.166, + "num_input_tokens_seen": 16001712, + "step": 75825 + }, + { + "epoch": 8.342134213421343, + "grad_norm": 0.7131415009498596, + "learning_rate": 3.6186079104473327e-05, + "loss": 0.0283, + "num_input_tokens_seen": 16002832, + "step": 75830 + }, + { + "epoch": 8.342684268426842, + "grad_norm": 0.021915698423981667, + "learning_rate": 3.618393264267045e-05, + "loss": 0.0103, + "num_input_tokens_seen": 16003920, + "step": 75835 + }, + { + "epoch": 8.343234323432343, + "grad_norm": 0.1617671102285385, + "learning_rate": 3.618178607779062e-05, + "loss": 0.0078, + "num_input_tokens_seen": 16004944, + "step": 75840 + }, + { + "epoch": 8.343784378437844, + "grad_norm": 0.030785391107201576, + "learning_rate": 3.617963940985365e-05, + "loss": 0.043, + "num_input_tokens_seen": 16005968, + "step": 75845 + }, + { + "epoch": 8.344334433443345, + "grad_norm": 1.4659416675567627, + "learning_rate": 3.61774926388793e-05, + "loss": 0.1195, + "num_input_tokens_seen": 16007088, + "step": 75850 + }, + { + "epoch": 8.344884488448844, + "grad_norm": 0.01973528228700161, + "learning_rate": 3.6175345764887355e-05, + "loss": 0.0091, + "num_input_tokens_seen": 16008176, + "step": 75855 + }, + { + "epoch": 8.345434543454346, + "grad_norm": 1.3814836740493774, + "learning_rate": 3.617319878789763e-05, + "loss": 0.0429, + "num_input_tokens_seen": 16009168, + "step": 75860 + }, + { + "epoch": 8.345984598459847, + "grad_norm": 0.13965541124343872, + "learning_rate": 3.617105170792988e-05, + "loss": 0.032, + "num_input_tokens_seen": 16010160, + "step": 75865 + }, + { + "epoch": 8.346534653465346, + "grad_norm": 0.32123881578445435, + "learning_rate": 3.616890452500391e-05, + "loss": 0.0116, + "num_input_tokens_seen": 16011216, + "step": 75870 + }, + { + "epoch": 8.347084708470847, + "grad_norm": 0.02525283582508564, + "learning_rate": 3.616675723913951e-05, + "loss": 0.1489, + "num_input_tokens_seen": 16012272, + "step": 75875 + }, + { + "epoch": 8.347634763476348, + "grad_norm": 0.07268557697534561, + "learning_rate": 3.616460985035646e-05, + "loss": 0.0379, + "num_input_tokens_seen": 16013360, + "step": 75880 + }, + { + "epoch": 8.348184818481847, + "grad_norm": 0.08317580074071884, + "learning_rate": 3.616246235867456e-05, + "loss": 0.0356, + "num_input_tokens_seen": 16014448, + "step": 75885 + }, + { + "epoch": 8.348734873487349, + "grad_norm": 0.5068822503089905, + "learning_rate": 3.6160314764113604e-05, + "loss": 0.0113, + "num_input_tokens_seen": 16015472, + "step": 75890 + }, + { + "epoch": 8.34928492849285, + "grad_norm": 0.5630333423614502, + "learning_rate": 3.615816706669338e-05, + "loss": 0.0301, + "num_input_tokens_seen": 16016528, + "step": 75895 + }, + { + "epoch": 8.34983498349835, + "grad_norm": 0.12357749789953232, + "learning_rate": 3.615601926643368e-05, + "loss": 0.067, + "num_input_tokens_seen": 16017520, + "step": 75900 + }, + { + "epoch": 8.35038503850385, + "grad_norm": 1.1552884578704834, + "learning_rate": 3.615387136335431e-05, + "loss": 0.1004, + "num_input_tokens_seen": 16018608, + "step": 75905 + }, + { + "epoch": 8.350935093509351, + "grad_norm": 0.5019649863243103, + "learning_rate": 3.615172335747507e-05, + "loss": 0.0167, + "num_input_tokens_seen": 16019632, + "step": 75910 + }, + { + "epoch": 8.351485148514852, + "grad_norm": 0.06249217316508293, + "learning_rate": 3.614957524881573e-05, + "loss": 0.0838, + "num_input_tokens_seen": 16020720, + "step": 75915 + }, + { + "epoch": 8.352035203520352, + "grad_norm": 0.1094096228480339, + "learning_rate": 3.614742703739611e-05, + "loss": 0.0033, + "num_input_tokens_seen": 16021808, + "step": 75920 + }, + { + "epoch": 8.352585258525853, + "grad_norm": 0.14117975533008575, + "learning_rate": 3.6145278723236006e-05, + "loss": 0.1292, + "num_input_tokens_seen": 16022864, + "step": 75925 + }, + { + "epoch": 8.353135313531354, + "grad_norm": 0.016262419521808624, + "learning_rate": 3.6143130306355213e-05, + "loss": 0.0086, + "num_input_tokens_seen": 16024048, + "step": 75930 + }, + { + "epoch": 8.353685368536853, + "grad_norm": 1.9012463092803955, + "learning_rate": 3.614098178677353e-05, + "loss": 0.0743, + "num_input_tokens_seen": 16025072, + "step": 75935 + }, + { + "epoch": 8.354235423542354, + "grad_norm": 1.0032310485839844, + "learning_rate": 3.613883316451077e-05, + "loss": 0.0587, + "num_input_tokens_seen": 16026128, + "step": 75940 + }, + { + "epoch": 8.354785478547855, + "grad_norm": 0.5079864263534546, + "learning_rate": 3.6136684439586734e-05, + "loss": 0.0248, + "num_input_tokens_seen": 16027216, + "step": 75945 + }, + { + "epoch": 8.355335533553355, + "grad_norm": 1.275291919708252, + "learning_rate": 3.613453561202122e-05, + "loss": 0.0744, + "num_input_tokens_seen": 16028304, + "step": 75950 + }, + { + "epoch": 8.355885588558856, + "grad_norm": 0.7036617994308472, + "learning_rate": 3.613238668183403e-05, + "loss": 0.0194, + "num_input_tokens_seen": 16029328, + "step": 75955 + }, + { + "epoch": 8.356435643564357, + "grad_norm": 0.03149453550577164, + "learning_rate": 3.613023764904497e-05, + "loss": 0.0156, + "num_input_tokens_seen": 16030352, + "step": 75960 + }, + { + "epoch": 8.356985698569858, + "grad_norm": 0.04063505306839943, + "learning_rate": 3.6128088513673865e-05, + "loss": 0.0646, + "num_input_tokens_seen": 16031408, + "step": 75965 + }, + { + "epoch": 8.357535753575357, + "grad_norm": 0.10355907678604126, + "learning_rate": 3.612593927574049e-05, + "loss": 0.0405, + "num_input_tokens_seen": 16032432, + "step": 75970 + }, + { + "epoch": 8.358085808580858, + "grad_norm": 0.6330344676971436, + "learning_rate": 3.612378993526468e-05, + "loss": 0.0232, + "num_input_tokens_seen": 16033488, + "step": 75975 + }, + { + "epoch": 8.35863586358636, + "grad_norm": 0.10141648352146149, + "learning_rate": 3.6121640492266234e-05, + "loss": 0.0059, + "num_input_tokens_seen": 16034544, + "step": 75980 + }, + { + "epoch": 8.359185918591859, + "grad_norm": 0.5934684872627258, + "learning_rate": 3.611949094676497e-05, + "loss": 0.0259, + "num_input_tokens_seen": 16035504, + "step": 75985 + }, + { + "epoch": 8.35973597359736, + "grad_norm": 0.04622602090239525, + "learning_rate": 3.611734129878069e-05, + "loss": 0.0646, + "num_input_tokens_seen": 16036592, + "step": 75990 + }, + { + "epoch": 8.36028602860286, + "grad_norm": 1.4162667989730835, + "learning_rate": 3.611519154833321e-05, + "loss": 0.0874, + "num_input_tokens_seen": 16037584, + "step": 75995 + }, + { + "epoch": 8.36083608360836, + "grad_norm": 1.3782970905303955, + "learning_rate": 3.611304169544234e-05, + "loss": 0.0923, + "num_input_tokens_seen": 16038672, + "step": 76000 + }, + { + "epoch": 8.361386138613861, + "grad_norm": 0.46312034130096436, + "learning_rate": 3.611089174012789e-05, + "loss": 0.0339, + "num_input_tokens_seen": 16039696, + "step": 76005 + }, + { + "epoch": 8.361936193619362, + "grad_norm": 0.10271339863538742, + "learning_rate": 3.6108741682409696e-05, + "loss": 0.023, + "num_input_tokens_seen": 16040688, + "step": 76010 + }, + { + "epoch": 8.362486248624862, + "grad_norm": 1.8223272562026978, + "learning_rate": 3.6106591522307566e-05, + "loss": 0.0846, + "num_input_tokens_seen": 16041712, + "step": 76015 + }, + { + "epoch": 8.363036303630363, + "grad_norm": 2.0655410289764404, + "learning_rate": 3.61044412598413e-05, + "loss": 0.0314, + "num_input_tokens_seen": 16042704, + "step": 76020 + }, + { + "epoch": 8.363586358635864, + "grad_norm": 0.02721225842833519, + "learning_rate": 3.610229089503073e-05, + "loss": 0.0103, + "num_input_tokens_seen": 16043792, + "step": 76025 + }, + { + "epoch": 8.364136413641365, + "grad_norm": 0.4821479022502899, + "learning_rate": 3.610014042789567e-05, + "loss": 0.1051, + "num_input_tokens_seen": 16044848, + "step": 76030 + }, + { + "epoch": 8.364686468646864, + "grad_norm": 0.21835432946681976, + "learning_rate": 3.609798985845594e-05, + "loss": 0.0171, + "num_input_tokens_seen": 16045904, + "step": 76035 + }, + { + "epoch": 8.365236523652365, + "grad_norm": 0.06897953897714615, + "learning_rate": 3.6095839186731375e-05, + "loss": 0.007, + "num_input_tokens_seen": 16046992, + "step": 76040 + }, + { + "epoch": 8.365786578657866, + "grad_norm": 0.3424520790576935, + "learning_rate": 3.6093688412741764e-05, + "loss": 0.022, + "num_input_tokens_seen": 16048016, + "step": 76045 + }, + { + "epoch": 8.366336633663366, + "grad_norm": 0.06691224128007889, + "learning_rate": 3.609153753650696e-05, + "loss": 0.0104, + "num_input_tokens_seen": 16049104, + "step": 76050 + }, + { + "epoch": 8.366886688668867, + "grad_norm": 0.6452820897102356, + "learning_rate": 3.608938655804678e-05, + "loss": 0.0528, + "num_input_tokens_seen": 16050160, + "step": 76055 + }, + { + "epoch": 8.367436743674368, + "grad_norm": 0.1231108233332634, + "learning_rate": 3.6087235477381045e-05, + "loss": 0.035, + "num_input_tokens_seen": 16051152, + "step": 76060 + }, + { + "epoch": 8.367986798679867, + "grad_norm": 0.43486306071281433, + "learning_rate": 3.6085084294529586e-05, + "loss": 0.0137, + "num_input_tokens_seen": 16052272, + "step": 76065 + }, + { + "epoch": 8.368536853685368, + "grad_norm": 0.030585044994950294, + "learning_rate": 3.6082933009512224e-05, + "loss": 0.0141, + "num_input_tokens_seen": 16053264, + "step": 76070 + }, + { + "epoch": 8.36908690869087, + "grad_norm": 1.2199584245681763, + "learning_rate": 3.608078162234877e-05, + "loss": 0.0756, + "num_input_tokens_seen": 16054288, + "step": 76075 + }, + { + "epoch": 8.369636963696369, + "grad_norm": 0.08786312490701675, + "learning_rate": 3.6078630133059083e-05, + "loss": 0.011, + "num_input_tokens_seen": 16055408, + "step": 76080 + }, + { + "epoch": 8.37018701870187, + "grad_norm": 0.02084345556795597, + "learning_rate": 3.607647854166297e-05, + "loss": 0.0394, + "num_input_tokens_seen": 16056464, + "step": 76085 + }, + { + "epoch": 8.370737073707371, + "grad_norm": 0.09701933711767197, + "learning_rate": 3.6074326848180276e-05, + "loss": 0.0097, + "num_input_tokens_seen": 16057520, + "step": 76090 + }, + { + "epoch": 8.371287128712872, + "grad_norm": 0.027372237294912338, + "learning_rate": 3.607217505263083e-05, + "loss": 0.0273, + "num_input_tokens_seen": 16058544, + "step": 76095 + }, + { + "epoch": 8.371837183718371, + "grad_norm": 0.10090412944555283, + "learning_rate": 3.607002315503445e-05, + "loss": 0.0037, + "num_input_tokens_seen": 16059568, + "step": 76100 + }, + { + "epoch": 8.372387238723872, + "grad_norm": 0.29692718386650085, + "learning_rate": 3.606787115541098e-05, + "loss": 0.0607, + "num_input_tokens_seen": 16060560, + "step": 76105 + }, + { + "epoch": 8.372937293729374, + "grad_norm": 2.2208452224731445, + "learning_rate": 3.606571905378026e-05, + "loss": 0.0265, + "num_input_tokens_seen": 16061584, + "step": 76110 + }, + { + "epoch": 8.373487348734873, + "grad_norm": 0.06161876022815704, + "learning_rate": 3.60635668501621e-05, + "loss": 0.0024, + "num_input_tokens_seen": 16062672, + "step": 76115 + }, + { + "epoch": 8.374037403740374, + "grad_norm": 0.12254180759191513, + "learning_rate": 3.6061414544576366e-05, + "loss": 0.0105, + "num_input_tokens_seen": 16063696, + "step": 76120 + }, + { + "epoch": 8.374587458745875, + "grad_norm": 0.07035832107067108, + "learning_rate": 3.605926213704288e-05, + "loss": 0.0214, + "num_input_tokens_seen": 16064752, + "step": 76125 + }, + { + "epoch": 8.375137513751374, + "grad_norm": 0.019931156188249588, + "learning_rate": 3.605710962758148e-05, + "loss": 0.0018, + "num_input_tokens_seen": 16065872, + "step": 76130 + }, + { + "epoch": 8.375687568756875, + "grad_norm": 2.342130184173584, + "learning_rate": 3.605495701621201e-05, + "loss": 0.1376, + "num_input_tokens_seen": 16066928, + "step": 76135 + }, + { + "epoch": 8.376237623762377, + "grad_norm": 1.029689908027649, + "learning_rate": 3.605280430295431e-05, + "loss": 0.0723, + "num_input_tokens_seen": 16067952, + "step": 76140 + }, + { + "epoch": 8.376787678767876, + "grad_norm": 0.06376917660236359, + "learning_rate": 3.6050651487828205e-05, + "loss": 0.0484, + "num_input_tokens_seen": 16068976, + "step": 76145 + }, + { + "epoch": 8.377337733773377, + "grad_norm": 0.026498721912503242, + "learning_rate": 3.604849857085355e-05, + "loss": 0.0393, + "num_input_tokens_seen": 16070000, + "step": 76150 + }, + { + "epoch": 8.377887788778878, + "grad_norm": 0.198114812374115, + "learning_rate": 3.60463455520502e-05, + "loss": 0.1241, + "num_input_tokens_seen": 16071056, + "step": 76155 + }, + { + "epoch": 8.37843784378438, + "grad_norm": 0.7511430382728577, + "learning_rate": 3.604419243143796e-05, + "loss": 0.1072, + "num_input_tokens_seen": 16072144, + "step": 76160 + }, + { + "epoch": 8.378987898789878, + "grad_norm": 0.9719456434249878, + "learning_rate": 3.604203920903672e-05, + "loss": 0.1001, + "num_input_tokens_seen": 16073264, + "step": 76165 + }, + { + "epoch": 8.37953795379538, + "grad_norm": 1.588708519935608, + "learning_rate": 3.603988588486629e-05, + "loss": 0.0247, + "num_input_tokens_seen": 16074288, + "step": 76170 + }, + { + "epoch": 8.38008800880088, + "grad_norm": 0.13845039904117584, + "learning_rate": 3.603773245894653e-05, + "loss": 0.0259, + "num_input_tokens_seen": 16075280, + "step": 76175 + }, + { + "epoch": 8.38063806380638, + "grad_norm": 0.029342040419578552, + "learning_rate": 3.603557893129729e-05, + "loss": 0.0744, + "num_input_tokens_seen": 16076368, + "step": 76180 + }, + { + "epoch": 8.381188118811881, + "grad_norm": 0.3578011393547058, + "learning_rate": 3.6033425301938414e-05, + "loss": 0.0128, + "num_input_tokens_seen": 16077360, + "step": 76185 + }, + { + "epoch": 8.381738173817382, + "grad_norm": 0.08867353945970535, + "learning_rate": 3.6031271570889746e-05, + "loss": 0.0131, + "num_input_tokens_seen": 16078352, + "step": 76190 + }, + { + "epoch": 8.382288228822881, + "grad_norm": 0.011966142803430557, + "learning_rate": 3.6029117738171145e-05, + "loss": 0.0032, + "num_input_tokens_seen": 16079376, + "step": 76195 + }, + { + "epoch": 8.382838283828383, + "grad_norm": 0.8176658153533936, + "learning_rate": 3.602696380380246e-05, + "loss": 0.0849, + "num_input_tokens_seen": 16080400, + "step": 76200 + }, + { + "epoch": 8.383388338833884, + "grad_norm": 0.08129048347473145, + "learning_rate": 3.602480976780354e-05, + "loss": 0.0292, + "num_input_tokens_seen": 16081424, + "step": 76205 + }, + { + "epoch": 8.383938393839385, + "grad_norm": 0.7910082936286926, + "learning_rate": 3.6022655630194234e-05, + "loss": 0.0236, + "num_input_tokens_seen": 16082416, + "step": 76210 + }, + { + "epoch": 8.384488448844884, + "grad_norm": 0.0928170382976532, + "learning_rate": 3.602050139099441e-05, + "loss": 0.0671, + "num_input_tokens_seen": 16083408, + "step": 76215 + }, + { + "epoch": 8.385038503850385, + "grad_norm": 0.9861994981765747, + "learning_rate": 3.60183470502239e-05, + "loss": 0.0375, + "num_input_tokens_seen": 16084464, + "step": 76220 + }, + { + "epoch": 8.385588558855886, + "grad_norm": 0.04070356860756874, + "learning_rate": 3.6016192607902584e-05, + "loss": 0.0465, + "num_input_tokens_seen": 16085456, + "step": 76225 + }, + { + "epoch": 8.386138613861386, + "grad_norm": 1.375126838684082, + "learning_rate": 3.60140380640503e-05, + "loss": 0.0928, + "num_input_tokens_seen": 16086576, + "step": 76230 + }, + { + "epoch": 8.386688668866887, + "grad_norm": 2.307786226272583, + "learning_rate": 3.6011883418686915e-05, + "loss": 0.0693, + "num_input_tokens_seen": 16087664, + "step": 76235 + }, + { + "epoch": 8.387238723872388, + "grad_norm": 0.16444186866283417, + "learning_rate": 3.6009728671832285e-05, + "loss": 0.1563, + "num_input_tokens_seen": 16088656, + "step": 76240 + }, + { + "epoch": 8.387788778877887, + "grad_norm": 0.19875985383987427, + "learning_rate": 3.600757382350627e-05, + "loss": 0.0893, + "num_input_tokens_seen": 16089744, + "step": 76245 + }, + { + "epoch": 8.388338833883388, + "grad_norm": 0.04365672171115875, + "learning_rate": 3.6005418873728724e-05, + "loss": 0.0052, + "num_input_tokens_seen": 16090864, + "step": 76250 + }, + { + "epoch": 8.38888888888889, + "grad_norm": 0.06668580323457718, + "learning_rate": 3.6003263822519516e-05, + "loss": 0.0196, + "num_input_tokens_seen": 16091952, + "step": 76255 + }, + { + "epoch": 8.389438943894389, + "grad_norm": 0.1547253429889679, + "learning_rate": 3.600110866989851e-05, + "loss": 0.0708, + "num_input_tokens_seen": 16093008, + "step": 76260 + }, + { + "epoch": 8.38998899889989, + "grad_norm": 0.09442584216594696, + "learning_rate": 3.599895341588556e-05, + "loss": 0.0182, + "num_input_tokens_seen": 16094064, + "step": 76265 + }, + { + "epoch": 8.39053905390539, + "grad_norm": 0.034804895520210266, + "learning_rate": 3.599679806050052e-05, + "loss": 0.0151, + "num_input_tokens_seen": 16095120, + "step": 76270 + }, + { + "epoch": 8.391089108910892, + "grad_norm": 0.058043453842401505, + "learning_rate": 3.599464260376329e-05, + "loss": 0.04, + "num_input_tokens_seen": 16096176, + "step": 76275 + }, + { + "epoch": 8.391639163916391, + "grad_norm": 1.0293259620666504, + "learning_rate": 3.599248704569371e-05, + "loss": 0.0406, + "num_input_tokens_seen": 16097232, + "step": 76280 + }, + { + "epoch": 8.392189218921892, + "grad_norm": 0.027743998914957047, + "learning_rate": 3.599033138631165e-05, + "loss": 0.0593, + "num_input_tokens_seen": 16098288, + "step": 76285 + }, + { + "epoch": 8.392739273927393, + "grad_norm": 0.027985960245132446, + "learning_rate": 3.598817562563698e-05, + "loss": 0.0017, + "num_input_tokens_seen": 16099376, + "step": 76290 + }, + { + "epoch": 8.393289328932893, + "grad_norm": 2.5481560230255127, + "learning_rate": 3.598601976368956e-05, + "loss": 0.1735, + "num_input_tokens_seen": 16100432, + "step": 76295 + }, + { + "epoch": 8.393839383938394, + "grad_norm": 0.10912753641605377, + "learning_rate": 3.5983863800489285e-05, + "loss": 0.0141, + "num_input_tokens_seen": 16101584, + "step": 76300 + }, + { + "epoch": 8.394389438943895, + "grad_norm": 0.6275713443756104, + "learning_rate": 3.5981707736055985e-05, + "loss": 0.0143, + "num_input_tokens_seen": 16102608, + "step": 76305 + }, + { + "epoch": 8.394939493949394, + "grad_norm": 0.24856293201446533, + "learning_rate": 3.597955157040957e-05, + "loss": 0.0215, + "num_input_tokens_seen": 16103728, + "step": 76310 + }, + { + "epoch": 8.395489548954895, + "grad_norm": 0.07162129878997803, + "learning_rate": 3.597739530356989e-05, + "loss": 0.0132, + "num_input_tokens_seen": 16104752, + "step": 76315 + }, + { + "epoch": 8.396039603960396, + "grad_norm": 0.26288315653800964, + "learning_rate": 3.597523893555683e-05, + "loss": 0.0435, + "num_input_tokens_seen": 16105808, + "step": 76320 + }, + { + "epoch": 8.396589658965897, + "grad_norm": 0.27265119552612305, + "learning_rate": 3.597308246639026e-05, + "loss": 0.0633, + "num_input_tokens_seen": 16106896, + "step": 76325 + }, + { + "epoch": 8.397139713971397, + "grad_norm": 0.1274338811635971, + "learning_rate": 3.597092589609005e-05, + "loss": 0.043, + "num_input_tokens_seen": 16107984, + "step": 76330 + }, + { + "epoch": 8.397689768976898, + "grad_norm": 0.0669013038277626, + "learning_rate": 3.596876922467608e-05, + "loss": 0.0534, + "num_input_tokens_seen": 16109040, + "step": 76335 + }, + { + "epoch": 8.398239823982399, + "grad_norm": 0.40340495109558105, + "learning_rate": 3.5966612452168225e-05, + "loss": 0.0337, + "num_input_tokens_seen": 16110160, + "step": 76340 + }, + { + "epoch": 8.398789878987898, + "grad_norm": 0.9499438405036926, + "learning_rate": 3.5964455578586374e-05, + "loss": 0.0224, + "num_input_tokens_seen": 16111280, + "step": 76345 + }, + { + "epoch": 8.3993399339934, + "grad_norm": 0.05029138922691345, + "learning_rate": 3.5962298603950396e-05, + "loss": 0.0131, + "num_input_tokens_seen": 16112368, + "step": 76350 + }, + { + "epoch": 8.3998899889989, + "grad_norm": 0.06202174723148346, + "learning_rate": 3.5960141528280165e-05, + "loss": 0.0098, + "num_input_tokens_seen": 16113392, + "step": 76355 + }, + { + "epoch": 8.4004400440044, + "grad_norm": 0.12387356907129288, + "learning_rate": 3.595798435159558e-05, + "loss": 0.006, + "num_input_tokens_seen": 16114512, + "step": 76360 + }, + { + "epoch": 8.400990099009901, + "grad_norm": 2.2256884574890137, + "learning_rate": 3.595582707391649e-05, + "loss": 0.0878, + "num_input_tokens_seen": 16115600, + "step": 76365 + }, + { + "epoch": 8.401540154015402, + "grad_norm": 1.1310757398605347, + "learning_rate": 3.595366969526282e-05, + "loss": 0.0222, + "num_input_tokens_seen": 16116688, + "step": 76370 + }, + { + "epoch": 8.402090209020901, + "grad_norm": 0.9096247553825378, + "learning_rate": 3.595151221565443e-05, + "loss": 0.0361, + "num_input_tokens_seen": 16117680, + "step": 76375 + }, + { + "epoch": 8.402640264026402, + "grad_norm": 0.5076618194580078, + "learning_rate": 3.594935463511119e-05, + "loss": 0.0207, + "num_input_tokens_seen": 16118736, + "step": 76380 + }, + { + "epoch": 8.403190319031903, + "grad_norm": 0.368605375289917, + "learning_rate": 3.594719695365302e-05, + "loss": 0.0194, + "num_input_tokens_seen": 16119792, + "step": 76385 + }, + { + "epoch": 8.403740374037405, + "grad_norm": 0.8437393307685852, + "learning_rate": 3.5945039171299785e-05, + "loss": 0.0377, + "num_input_tokens_seen": 16120848, + "step": 76390 + }, + { + "epoch": 8.404290429042904, + "grad_norm": 0.05617457628250122, + "learning_rate": 3.594288128807137e-05, + "loss": 0.0089, + "num_input_tokens_seen": 16121904, + "step": 76395 + }, + { + "epoch": 8.404840484048405, + "grad_norm": 0.033403180539608, + "learning_rate": 3.594072330398768e-05, + "loss": 0.0288, + "num_input_tokens_seen": 16122864, + "step": 76400 + }, + { + "epoch": 8.405390539053906, + "grad_norm": 0.1561257392168045, + "learning_rate": 3.5938565219068585e-05, + "loss": 0.0108, + "num_input_tokens_seen": 16123920, + "step": 76405 + }, + { + "epoch": 8.405940594059405, + "grad_norm": 0.18056629598140717, + "learning_rate": 3.5936407033333975e-05, + "loss": 0.0224, + "num_input_tokens_seen": 16125008, + "step": 76410 + }, + { + "epoch": 8.406490649064907, + "grad_norm": 0.021249236539006233, + "learning_rate": 3.593424874680376e-05, + "loss": 0.0106, + "num_input_tokens_seen": 16126064, + "step": 76415 + }, + { + "epoch": 8.407040704070408, + "grad_norm": 0.055520955473184586, + "learning_rate": 3.593209035949782e-05, + "loss": 0.0032, + "num_input_tokens_seen": 16127088, + "step": 76420 + }, + { + "epoch": 8.407590759075907, + "grad_norm": 0.21765220165252686, + "learning_rate": 3.592993187143605e-05, + "loss": 0.0964, + "num_input_tokens_seen": 16128144, + "step": 76425 + }, + { + "epoch": 8.408140814081408, + "grad_norm": 0.013609141111373901, + "learning_rate": 3.5927773282638345e-05, + "loss": 0.0012, + "num_input_tokens_seen": 16129296, + "step": 76430 + }, + { + "epoch": 8.408690869086909, + "grad_norm": 0.22028523683547974, + "learning_rate": 3.592561459312459e-05, + "loss": 0.0123, + "num_input_tokens_seen": 16130384, + "step": 76435 + }, + { + "epoch": 8.409240924092408, + "grad_norm": 0.045088328421115875, + "learning_rate": 3.5923455802914694e-05, + "loss": 0.0234, + "num_input_tokens_seen": 16131440, + "step": 76440 + }, + { + "epoch": 8.40979097909791, + "grad_norm": 0.17516601085662842, + "learning_rate": 3.5921296912028543e-05, + "loss": 0.0471, + "num_input_tokens_seen": 16132528, + "step": 76445 + }, + { + "epoch": 8.41034103410341, + "grad_norm": 0.05396687984466553, + "learning_rate": 3.591913792048604e-05, + "loss": 0.0667, + "num_input_tokens_seen": 16133584, + "step": 76450 + }, + { + "epoch": 8.410891089108912, + "grad_norm": 0.024219172075390816, + "learning_rate": 3.5916978828307077e-05, + "loss": 0.0292, + "num_input_tokens_seen": 16134672, + "step": 76455 + }, + { + "epoch": 8.411441144114411, + "grad_norm": 0.0283723846077919, + "learning_rate": 3.591481963551157e-05, + "loss": 0.0488, + "num_input_tokens_seen": 16135728, + "step": 76460 + }, + { + "epoch": 8.411991199119912, + "grad_norm": 0.011844807304441929, + "learning_rate": 3.59126603421194e-05, + "loss": 0.0131, + "num_input_tokens_seen": 16136816, + "step": 76465 + }, + { + "epoch": 8.412541254125413, + "grad_norm": 0.029625747352838516, + "learning_rate": 3.591050094815048e-05, + "loss": 0.0098, + "num_input_tokens_seen": 16137808, + "step": 76470 + }, + { + "epoch": 8.413091309130913, + "grad_norm": 0.011073323898017406, + "learning_rate": 3.590834145362471e-05, + "loss": 0.0047, + "num_input_tokens_seen": 16138832, + "step": 76475 + }, + { + "epoch": 8.413641364136414, + "grad_norm": 0.014384944923222065, + "learning_rate": 3.590618185856198e-05, + "loss": 0.0136, + "num_input_tokens_seen": 16139856, + "step": 76480 + }, + { + "epoch": 8.414191419141915, + "grad_norm": 0.015063491649925709, + "learning_rate": 3.590402216298222e-05, + "loss": 0.0093, + "num_input_tokens_seen": 16140880, + "step": 76485 + }, + { + "epoch": 8.414741474147414, + "grad_norm": 0.016922229900956154, + "learning_rate": 3.5901862366905317e-05, + "loss": 0.0023, + "num_input_tokens_seen": 16141872, + "step": 76490 + }, + { + "epoch": 8.415291529152915, + "grad_norm": 0.19574497640132904, + "learning_rate": 3.589970247035117e-05, + "loss": 0.0755, + "num_input_tokens_seen": 16142896, + "step": 76495 + }, + { + "epoch": 8.415841584158416, + "grad_norm": 0.020663337782025337, + "learning_rate": 3.58975424733397e-05, + "loss": 0.0058, + "num_input_tokens_seen": 16143952, + "step": 76500 + }, + { + "epoch": 8.416391639163916, + "grad_norm": 0.008092569187283516, + "learning_rate": 3.589538237589081e-05, + "loss": 0.0034, + "num_input_tokens_seen": 16145008, + "step": 76505 + }, + { + "epoch": 8.416941694169417, + "grad_norm": 0.030338317155838013, + "learning_rate": 3.589322217802441e-05, + "loss": 0.1263, + "num_input_tokens_seen": 16146096, + "step": 76510 + }, + { + "epoch": 8.417491749174918, + "grad_norm": 0.026458214968442917, + "learning_rate": 3.5891061879760416e-05, + "loss": 0.0062, + "num_input_tokens_seen": 16147120, + "step": 76515 + }, + { + "epoch": 8.418041804180419, + "grad_norm": 0.04565688967704773, + "learning_rate": 3.588890148111872e-05, + "loss": 0.0074, + "num_input_tokens_seen": 16148176, + "step": 76520 + }, + { + "epoch": 8.418591859185918, + "grad_norm": 0.05287855863571167, + "learning_rate": 3.5886740982119245e-05, + "loss": 0.0056, + "num_input_tokens_seen": 16149168, + "step": 76525 + }, + { + "epoch": 8.41914191419142, + "grad_norm": 0.21334506571292877, + "learning_rate": 3.5884580382781904e-05, + "loss": 0.0174, + "num_input_tokens_seen": 16150256, + "step": 76530 + }, + { + "epoch": 8.41969196919692, + "grad_norm": 0.026465149596333504, + "learning_rate": 3.588241968312661e-05, + "loss": 0.0917, + "num_input_tokens_seen": 16151280, + "step": 76535 + }, + { + "epoch": 8.42024202420242, + "grad_norm": 0.015705635771155357, + "learning_rate": 3.5880258883173276e-05, + "loss": 0.0062, + "num_input_tokens_seen": 16152368, + "step": 76540 + }, + { + "epoch": 8.42079207920792, + "grad_norm": 0.6767091155052185, + "learning_rate": 3.587809798294182e-05, + "loss": 0.0283, + "num_input_tokens_seen": 16153424, + "step": 76545 + }, + { + "epoch": 8.421342134213422, + "grad_norm": 0.6840074062347412, + "learning_rate": 3.587593698245215e-05, + "loss": 0.0228, + "num_input_tokens_seen": 16154448, + "step": 76550 + }, + { + "epoch": 8.421892189218921, + "grad_norm": 0.011942334473133087, + "learning_rate": 3.587377588172418e-05, + "loss": 0.0038, + "num_input_tokens_seen": 16155472, + "step": 76555 + }, + { + "epoch": 8.422442244224422, + "grad_norm": 0.8409450650215149, + "learning_rate": 3.587161468077785e-05, + "loss": 0.0138, + "num_input_tokens_seen": 16156464, + "step": 76560 + }, + { + "epoch": 8.422992299229923, + "grad_norm": 0.008179121650755405, + "learning_rate": 3.5869453379633056e-05, + "loss": 0.0115, + "num_input_tokens_seen": 16157520, + "step": 76565 + }, + { + "epoch": 8.423542354235423, + "grad_norm": 0.024948906153440475, + "learning_rate": 3.586729197830972e-05, + "loss": 0.0992, + "num_input_tokens_seen": 16158544, + "step": 76570 + }, + { + "epoch": 8.424092409240924, + "grad_norm": 0.03701730817556381, + "learning_rate": 3.586513047682778e-05, + "loss": 0.0164, + "num_input_tokens_seen": 16159600, + "step": 76575 + }, + { + "epoch": 8.424642464246425, + "grad_norm": 0.9692330360412598, + "learning_rate": 3.586296887520714e-05, + "loss": 0.0452, + "num_input_tokens_seen": 16160656, + "step": 76580 + }, + { + "epoch": 8.425192519251926, + "grad_norm": 0.005793618969619274, + "learning_rate": 3.586080717346773e-05, + "loss": 0.0583, + "num_input_tokens_seen": 16161712, + "step": 76585 + }, + { + "epoch": 8.425742574257425, + "grad_norm": 0.0025210415478795767, + "learning_rate": 3.585864537162947e-05, + "loss": 0.058, + "num_input_tokens_seen": 16162736, + "step": 76590 + }, + { + "epoch": 8.426292629262926, + "grad_norm": 0.09327645599842072, + "learning_rate": 3.5856483469712293e-05, + "loss": 0.0094, + "num_input_tokens_seen": 16163792, + "step": 76595 + }, + { + "epoch": 8.426842684268427, + "grad_norm": 0.030826760455965996, + "learning_rate": 3.585432146773611e-05, + "loss": 0.0013, + "num_input_tokens_seen": 16164784, + "step": 76600 + }, + { + "epoch": 8.427392739273927, + "grad_norm": 0.055434249341487885, + "learning_rate": 3.585215936572086e-05, + "loss": 0.004, + "num_input_tokens_seen": 16165872, + "step": 76605 + }, + { + "epoch": 8.427942794279428, + "grad_norm": 1.2882035970687866, + "learning_rate": 3.5849997163686456e-05, + "loss": 0.0476, + "num_input_tokens_seen": 16166832, + "step": 76610 + }, + { + "epoch": 8.428492849284929, + "grad_norm": 0.025111006572842598, + "learning_rate": 3.584783486165284e-05, + "loss": 0.0768, + "num_input_tokens_seen": 16167856, + "step": 76615 + }, + { + "epoch": 8.429042904290428, + "grad_norm": 0.025669526308774948, + "learning_rate": 3.584567245963993e-05, + "loss": 0.0516, + "num_input_tokens_seen": 16168848, + "step": 76620 + }, + { + "epoch": 8.42959295929593, + "grad_norm": 1.6054598093032837, + "learning_rate": 3.584350995766767e-05, + "loss": 0.1642, + "num_input_tokens_seen": 16169904, + "step": 76625 + }, + { + "epoch": 8.43014301430143, + "grad_norm": 1.8224527835845947, + "learning_rate": 3.5841347355755986e-05, + "loss": 0.0695, + "num_input_tokens_seen": 16170992, + "step": 76630 + }, + { + "epoch": 8.430693069306932, + "grad_norm": 0.07898608595132828, + "learning_rate": 3.5839184653924794e-05, + "loss": 0.0434, + "num_input_tokens_seen": 16172048, + "step": 76635 + }, + { + "epoch": 8.43124312431243, + "grad_norm": 1.2061657905578613, + "learning_rate": 3.583702185219404e-05, + "loss": 0.0533, + "num_input_tokens_seen": 16173072, + "step": 76640 + }, + { + "epoch": 8.431793179317932, + "grad_norm": 0.727881133556366, + "learning_rate": 3.583485895058366e-05, + "loss": 0.0264, + "num_input_tokens_seen": 16174224, + "step": 76645 + }, + { + "epoch": 8.432343234323433, + "grad_norm": 1.4690001010894775, + "learning_rate": 3.583269594911358e-05, + "loss": 0.0306, + "num_input_tokens_seen": 16175280, + "step": 76650 + }, + { + "epoch": 8.432893289328932, + "grad_norm": 0.0792536810040474, + "learning_rate": 3.583053284780374e-05, + "loss": 0.0131, + "num_input_tokens_seen": 16176304, + "step": 76655 + }, + { + "epoch": 8.433443344334433, + "grad_norm": 0.0706048458814621, + "learning_rate": 3.582836964667408e-05, + "loss": 0.0023, + "num_input_tokens_seen": 16177360, + "step": 76660 + }, + { + "epoch": 8.433993399339935, + "grad_norm": 0.6420537233352661, + "learning_rate": 3.5826206345744526e-05, + "loss": 0.1031, + "num_input_tokens_seen": 16178480, + "step": 76665 + }, + { + "epoch": 8.434543454345434, + "grad_norm": 0.007665031589567661, + "learning_rate": 3.582404294503502e-05, + "loss": 0.0101, + "num_input_tokens_seen": 16179536, + "step": 76670 + }, + { + "epoch": 8.435093509350935, + "grad_norm": 1.1890082359313965, + "learning_rate": 3.582187944456552e-05, + "loss": 0.0857, + "num_input_tokens_seen": 16180656, + "step": 76675 + }, + { + "epoch": 8.435643564356436, + "grad_norm": 0.32098081707954407, + "learning_rate": 3.581971584435594e-05, + "loss": 0.0235, + "num_input_tokens_seen": 16181744, + "step": 76680 + }, + { + "epoch": 8.436193619361935, + "grad_norm": 0.028707433491945267, + "learning_rate": 3.581755214442623e-05, + "loss": 0.0075, + "num_input_tokens_seen": 16182832, + "step": 76685 + }, + { + "epoch": 8.436743674367436, + "grad_norm": 0.01707669533789158, + "learning_rate": 3.581538834479634e-05, + "loss": 0.029, + "num_input_tokens_seen": 16183920, + "step": 76690 + }, + { + "epoch": 8.437293729372938, + "grad_norm": 1.5814008712768555, + "learning_rate": 3.58132244454862e-05, + "loss": 0.1158, + "num_input_tokens_seen": 16184944, + "step": 76695 + }, + { + "epoch": 8.437843784378439, + "grad_norm": 0.05919097363948822, + "learning_rate": 3.581106044651575e-05, + "loss": 0.1126, + "num_input_tokens_seen": 16186064, + "step": 76700 + }, + { + "epoch": 8.438393839383938, + "grad_norm": 0.016151105985045433, + "learning_rate": 3.5808896347904964e-05, + "loss": 0.0912, + "num_input_tokens_seen": 16187184, + "step": 76705 + }, + { + "epoch": 8.438943894389439, + "grad_norm": 0.9512628316879272, + "learning_rate": 3.580673214967375e-05, + "loss": 0.021, + "num_input_tokens_seen": 16188240, + "step": 76710 + }, + { + "epoch": 8.43949394939494, + "grad_norm": 0.01479171309620142, + "learning_rate": 3.580456785184208e-05, + "loss": 0.0155, + "num_input_tokens_seen": 16189328, + "step": 76715 + }, + { + "epoch": 8.44004400440044, + "grad_norm": 0.009995147585868835, + "learning_rate": 3.5802403454429886e-05, + "loss": 0.0105, + "num_input_tokens_seen": 16190352, + "step": 76720 + }, + { + "epoch": 8.44059405940594, + "grad_norm": 0.036740344017744064, + "learning_rate": 3.580023895745713e-05, + "loss": 0.0509, + "num_input_tokens_seen": 16191440, + "step": 76725 + }, + { + "epoch": 8.441144114411442, + "grad_norm": 0.25600379705429077, + "learning_rate": 3.5798074360943745e-05, + "loss": 0.0463, + "num_input_tokens_seen": 16192496, + "step": 76730 + }, + { + "epoch": 8.441694169416941, + "grad_norm": 0.10055586695671082, + "learning_rate": 3.57959096649097e-05, + "loss": 0.1646, + "num_input_tokens_seen": 16193552, + "step": 76735 + }, + { + "epoch": 8.442244224422442, + "grad_norm": 0.010625387541949749, + "learning_rate": 3.579374486937493e-05, + "loss": 0.0666, + "num_input_tokens_seen": 16194704, + "step": 76740 + }, + { + "epoch": 8.442794279427943, + "grad_norm": 0.017116935923695564, + "learning_rate": 3.579157997435939e-05, + "loss": 0.0555, + "num_input_tokens_seen": 16195728, + "step": 76745 + }, + { + "epoch": 8.443344334433444, + "grad_norm": 0.061137109994888306, + "learning_rate": 3.578941497988304e-05, + "loss": 0.1294, + "num_input_tokens_seen": 16196784, + "step": 76750 + }, + { + "epoch": 8.443894389438944, + "grad_norm": 0.05211133882403374, + "learning_rate": 3.578724988596583e-05, + "loss": 0.0043, + "num_input_tokens_seen": 16197808, + "step": 76755 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 0.12678855657577515, + "learning_rate": 3.578508469262771e-05, + "loss": 0.0779, + "num_input_tokens_seen": 16198800, + "step": 76760 + }, + { + "epoch": 8.444994499449946, + "grad_norm": 0.38711780309677124, + "learning_rate": 3.578291939988865e-05, + "loss": 0.013, + "num_input_tokens_seen": 16199792, + "step": 76765 + }, + { + "epoch": 8.445544554455445, + "grad_norm": 0.04606496915221214, + "learning_rate": 3.578075400776859e-05, + "loss": 0.0087, + "num_input_tokens_seen": 16200848, + "step": 76770 + }, + { + "epoch": 8.446094609460946, + "grad_norm": 0.09712561219930649, + "learning_rate": 3.577858851628749e-05, + "loss": 0.0105, + "num_input_tokens_seen": 16201904, + "step": 76775 + }, + { + "epoch": 8.446644664466447, + "grad_norm": 0.40461552143096924, + "learning_rate": 3.577642292546532e-05, + "loss": 0.0086, + "num_input_tokens_seen": 16202960, + "step": 76780 + }, + { + "epoch": 8.447194719471947, + "grad_norm": 0.04946631193161011, + "learning_rate": 3.577425723532202e-05, + "loss": 0.0134, + "num_input_tokens_seen": 16203984, + "step": 76785 + }, + { + "epoch": 8.447744774477448, + "grad_norm": 0.009271426126360893, + "learning_rate": 3.5772091445877565e-05, + "loss": 0.0333, + "num_input_tokens_seen": 16205008, + "step": 76790 + }, + { + "epoch": 8.448294829482949, + "grad_norm": 0.018131718039512634, + "learning_rate": 3.576992555715191e-05, + "loss": 0.1111, + "num_input_tokens_seen": 16206000, + "step": 76795 + }, + { + "epoch": 8.448844884488448, + "grad_norm": 0.3116335868835449, + "learning_rate": 3.576775956916503e-05, + "loss": 0.0064, + "num_input_tokens_seen": 16207024, + "step": 76800 + }, + { + "epoch": 8.44939493949395, + "grad_norm": 0.9199342131614685, + "learning_rate": 3.576559348193687e-05, + "loss": 0.0518, + "num_input_tokens_seen": 16208144, + "step": 76805 + }, + { + "epoch": 8.44994499449945, + "grad_norm": 0.10202000290155411, + "learning_rate": 3.5763427295487394e-05, + "loss": 0.0749, + "num_input_tokens_seen": 16209200, + "step": 76810 + }, + { + "epoch": 8.450495049504951, + "grad_norm": 2.6297566890716553, + "learning_rate": 3.576126100983658e-05, + "loss": 0.0844, + "num_input_tokens_seen": 16210192, + "step": 76815 + }, + { + "epoch": 8.45104510451045, + "grad_norm": 0.21331775188446045, + "learning_rate": 3.575909462500439e-05, + "loss": 0.0139, + "num_input_tokens_seen": 16211216, + "step": 76820 + }, + { + "epoch": 8.451595159515952, + "grad_norm": 0.028898324817419052, + "learning_rate": 3.575692814101078e-05, + "loss": 0.0109, + "num_input_tokens_seen": 16212272, + "step": 76825 + }, + { + "epoch": 8.452145214521453, + "grad_norm": 1.4140632152557373, + "learning_rate": 3.5754761557875725e-05, + "loss": 0.0393, + "num_input_tokens_seen": 16213264, + "step": 76830 + }, + { + "epoch": 8.452695269526952, + "grad_norm": 0.46477094292640686, + "learning_rate": 3.57525948756192e-05, + "loss": 0.2042, + "num_input_tokens_seen": 16214320, + "step": 76835 + }, + { + "epoch": 8.453245324532453, + "grad_norm": 0.07616469264030457, + "learning_rate": 3.575042809426116e-05, + "loss": 0.0261, + "num_input_tokens_seen": 16215376, + "step": 76840 + }, + { + "epoch": 8.453795379537954, + "grad_norm": 0.13836172223091125, + "learning_rate": 3.574826121382159e-05, + "loss": 0.0658, + "num_input_tokens_seen": 16216464, + "step": 76845 + }, + { + "epoch": 8.454345434543454, + "grad_norm": 0.09203208982944489, + "learning_rate": 3.5746094234320454e-05, + "loss": 0.0167, + "num_input_tokens_seen": 16217552, + "step": 76850 + }, + { + "epoch": 8.454895489548955, + "grad_norm": 0.049892038106918335, + "learning_rate": 3.574392715577772e-05, + "loss": 0.0602, + "num_input_tokens_seen": 16218608, + "step": 76855 + }, + { + "epoch": 8.455445544554456, + "grad_norm": 0.10361051559448242, + "learning_rate": 3.5741759978213365e-05, + "loss": 0.0865, + "num_input_tokens_seen": 16219664, + "step": 76860 + }, + { + "epoch": 8.455995599559955, + "grad_norm": 0.021679572761058807, + "learning_rate": 3.573959270164736e-05, + "loss": 0.0091, + "num_input_tokens_seen": 16220720, + "step": 76865 + }, + { + "epoch": 8.456545654565456, + "grad_norm": 1.462918758392334, + "learning_rate": 3.573742532609969e-05, + "loss": 0.1228, + "num_input_tokens_seen": 16221808, + "step": 76870 + }, + { + "epoch": 8.457095709570957, + "grad_norm": 0.07596004754304886, + "learning_rate": 3.573525785159032e-05, + "loss": 0.0391, + "num_input_tokens_seen": 16222864, + "step": 76875 + }, + { + "epoch": 8.457645764576458, + "grad_norm": 1.447553038597107, + "learning_rate": 3.573309027813923e-05, + "loss": 0.0755, + "num_input_tokens_seen": 16223856, + "step": 76880 + }, + { + "epoch": 8.458195819581958, + "grad_norm": 0.39725807309150696, + "learning_rate": 3.57309226057664e-05, + "loss": 0.0121, + "num_input_tokens_seen": 16224880, + "step": 76885 + }, + { + "epoch": 8.458745874587459, + "grad_norm": 0.11061672866344452, + "learning_rate": 3.5728754834491794e-05, + "loss": 0.0049, + "num_input_tokens_seen": 16225904, + "step": 76890 + }, + { + "epoch": 8.45929592959296, + "grad_norm": 1.9232677221298218, + "learning_rate": 3.5726586964335416e-05, + "loss": 0.094, + "num_input_tokens_seen": 16226896, + "step": 76895 + }, + { + "epoch": 8.45984598459846, + "grad_norm": 0.014656673185527325, + "learning_rate": 3.572441899531722e-05, + "loss": 0.0656, + "num_input_tokens_seen": 16227984, + "step": 76900 + }, + { + "epoch": 8.46039603960396, + "grad_norm": 0.19515366852283478, + "learning_rate": 3.572225092745721e-05, + "loss": 0.0365, + "num_input_tokens_seen": 16229072, + "step": 76905 + }, + { + "epoch": 8.460946094609461, + "grad_norm": 0.009719409048557281, + "learning_rate": 3.572008276077536e-05, + "loss": 0.1612, + "num_input_tokens_seen": 16230064, + "step": 76910 + }, + { + "epoch": 8.46149614961496, + "grad_norm": 0.15746568143367767, + "learning_rate": 3.571791449529165e-05, + "loss": 0.017, + "num_input_tokens_seen": 16231184, + "step": 76915 + }, + { + "epoch": 8.462046204620462, + "grad_norm": 0.018863091245293617, + "learning_rate": 3.571574613102606e-05, + "loss": 0.0197, + "num_input_tokens_seen": 16232208, + "step": 76920 + }, + { + "epoch": 8.462596259625963, + "grad_norm": 0.20745845139026642, + "learning_rate": 3.571357766799858e-05, + "loss": 0.068, + "num_input_tokens_seen": 16233264, + "step": 76925 + }, + { + "epoch": 8.463146314631462, + "grad_norm": 0.7520591616630554, + "learning_rate": 3.57114091062292e-05, + "loss": 0.0301, + "num_input_tokens_seen": 16234320, + "step": 76930 + }, + { + "epoch": 8.463696369636963, + "grad_norm": 2.6929783821105957, + "learning_rate": 3.57092404457379e-05, + "loss": 0.1205, + "num_input_tokens_seen": 16235408, + "step": 76935 + }, + { + "epoch": 8.464246424642464, + "grad_norm": 0.16989560425281525, + "learning_rate": 3.5707071686544676e-05, + "loss": 0.0417, + "num_input_tokens_seen": 16236528, + "step": 76940 + }, + { + "epoch": 8.464796479647966, + "grad_norm": 0.5393053293228149, + "learning_rate": 3.570490282866949e-05, + "loss": 0.039, + "num_input_tokens_seen": 16237584, + "step": 76945 + }, + { + "epoch": 8.465346534653465, + "grad_norm": 0.07264658063650131, + "learning_rate": 3.570273387213237e-05, + "loss": 0.0766, + "num_input_tokens_seen": 16238576, + "step": 76950 + }, + { + "epoch": 8.465896589658966, + "grad_norm": 0.6594369411468506, + "learning_rate": 3.5700564816953286e-05, + "loss": 0.0146, + "num_input_tokens_seen": 16239568, + "step": 76955 + }, + { + "epoch": 8.466446644664467, + "grad_norm": 0.21765194833278656, + "learning_rate": 3.5698395663152215e-05, + "loss": 0.0211, + "num_input_tokens_seen": 16240592, + "step": 76960 + }, + { + "epoch": 8.466996699669966, + "grad_norm": 0.06577308475971222, + "learning_rate": 3.569622641074919e-05, + "loss": 0.0167, + "num_input_tokens_seen": 16241648, + "step": 76965 + }, + { + "epoch": 8.467546754675467, + "grad_norm": 0.0881945863366127, + "learning_rate": 3.569405705976416e-05, + "loss": 0.0708, + "num_input_tokens_seen": 16242672, + "step": 76970 + }, + { + "epoch": 8.468096809680969, + "grad_norm": 0.07422143220901489, + "learning_rate": 3.5691887610217146e-05, + "loss": 0.0894, + "num_input_tokens_seen": 16243696, + "step": 76975 + }, + { + "epoch": 8.468646864686468, + "grad_norm": 0.039969827979803085, + "learning_rate": 3.568971806212813e-05, + "loss": 0.0717, + "num_input_tokens_seen": 16244720, + "step": 76980 + }, + { + "epoch": 8.469196919691969, + "grad_norm": 0.04496322572231293, + "learning_rate": 3.568754841551711e-05, + "loss": 0.0209, + "num_input_tokens_seen": 16245808, + "step": 76985 + }, + { + "epoch": 8.46974697469747, + "grad_norm": 0.04549531638622284, + "learning_rate": 3.568537867040409e-05, + "loss": 0.0079, + "num_input_tokens_seen": 16246832, + "step": 76990 + }, + { + "epoch": 8.47029702970297, + "grad_norm": 0.024769935756921768, + "learning_rate": 3.568320882680906e-05, + "loss": 0.0341, + "num_input_tokens_seen": 16247888, + "step": 76995 + }, + { + "epoch": 8.47084708470847, + "grad_norm": 0.6393251419067383, + "learning_rate": 3.5681038884752026e-05, + "loss": 0.0213, + "num_input_tokens_seen": 16248944, + "step": 77000 + }, + { + "epoch": 8.471397139713972, + "grad_norm": 0.28715625405311584, + "learning_rate": 3.567886884425297e-05, + "loss": 0.0066, + "num_input_tokens_seen": 16250000, + "step": 77005 + }, + { + "epoch": 8.471947194719473, + "grad_norm": 0.046647511422634125, + "learning_rate": 3.5676698705331916e-05, + "loss": 0.0168, + "num_input_tokens_seen": 16251088, + "step": 77010 + }, + { + "epoch": 8.472497249724972, + "grad_norm": 1.07176673412323, + "learning_rate": 3.567452846800885e-05, + "loss": 0.0206, + "num_input_tokens_seen": 16252208, + "step": 77015 + }, + { + "epoch": 8.473047304730473, + "grad_norm": 0.026669956743717194, + "learning_rate": 3.5672358132303774e-05, + "loss": 0.044, + "num_input_tokens_seen": 16253264, + "step": 77020 + }, + { + "epoch": 8.473597359735974, + "grad_norm": 0.1853017508983612, + "learning_rate": 3.567018769823669e-05, + "loss": 0.0054, + "num_input_tokens_seen": 16254320, + "step": 77025 + }, + { + "epoch": 8.474147414741473, + "grad_norm": 0.7183611392974854, + "learning_rate": 3.5668017165827616e-05, + "loss": 0.0733, + "num_input_tokens_seen": 16255344, + "step": 77030 + }, + { + "epoch": 8.474697469746975, + "grad_norm": 0.46018877625465393, + "learning_rate": 3.566584653509654e-05, + "loss": 0.0108, + "num_input_tokens_seen": 16256400, + "step": 77035 + }, + { + "epoch": 8.475247524752476, + "grad_norm": 0.014711749739944935, + "learning_rate": 3.566367580606348e-05, + "loss": 0.0202, + "num_input_tokens_seen": 16257488, + "step": 77040 + }, + { + "epoch": 8.475797579757975, + "grad_norm": 0.11878994107246399, + "learning_rate": 3.5661504978748436e-05, + "loss": 0.0071, + "num_input_tokens_seen": 16258544, + "step": 77045 + }, + { + "epoch": 8.476347634763476, + "grad_norm": 0.2096596211194992, + "learning_rate": 3.5659334053171415e-05, + "loss": 0.0115, + "num_input_tokens_seen": 16259632, + "step": 77050 + }, + { + "epoch": 8.476897689768977, + "grad_norm": 0.4983924925327301, + "learning_rate": 3.565716302935242e-05, + "loss": 0.0145, + "num_input_tokens_seen": 16260752, + "step": 77055 + }, + { + "epoch": 8.477447744774478, + "grad_norm": 0.008334661833941936, + "learning_rate": 3.5654991907311476e-05, + "loss": 0.0082, + "num_input_tokens_seen": 16261808, + "step": 77060 + }, + { + "epoch": 8.477997799779978, + "grad_norm": 0.04371487721800804, + "learning_rate": 3.565282068706858e-05, + "loss": 0.0161, + "num_input_tokens_seen": 16262832, + "step": 77065 + }, + { + "epoch": 8.478547854785479, + "grad_norm": 0.07364027947187424, + "learning_rate": 3.565064936864375e-05, + "loss": 0.0105, + "num_input_tokens_seen": 16263952, + "step": 77070 + }, + { + "epoch": 8.47909790979098, + "grad_norm": 0.8357188701629639, + "learning_rate": 3.5648477952056996e-05, + "loss": 0.0696, + "num_input_tokens_seen": 16265008, + "step": 77075 + }, + { + "epoch": 8.479647964796479, + "grad_norm": 0.013070475310087204, + "learning_rate": 3.564630643732832e-05, + "loss": 0.0396, + "num_input_tokens_seen": 16266064, + "step": 77080 + }, + { + "epoch": 8.48019801980198, + "grad_norm": 0.17871788144111633, + "learning_rate": 3.564413482447777e-05, + "loss": 0.0431, + "num_input_tokens_seen": 16267120, + "step": 77085 + }, + { + "epoch": 8.480748074807481, + "grad_norm": 1.9435749053955078, + "learning_rate": 3.564196311352531e-05, + "loss": 0.0306, + "num_input_tokens_seen": 16268240, + "step": 77090 + }, + { + "epoch": 8.48129812981298, + "grad_norm": 0.012470483779907227, + "learning_rate": 3.5639791304490997e-05, + "loss": 0.0552, + "num_input_tokens_seen": 16269296, + "step": 77095 + }, + { + "epoch": 8.481848184818482, + "grad_norm": 0.23213887214660645, + "learning_rate": 3.563761939739483e-05, + "loss": 0.0777, + "num_input_tokens_seen": 16270352, + "step": 77100 + }, + { + "epoch": 8.482398239823983, + "grad_norm": 1.3167937994003296, + "learning_rate": 3.5635447392256834e-05, + "loss": 0.03, + "num_input_tokens_seen": 16271376, + "step": 77105 + }, + { + "epoch": 8.482948294829482, + "grad_norm": 0.2231789082288742, + "learning_rate": 3.5633275289097015e-05, + "loss": 0.0252, + "num_input_tokens_seen": 16272368, + "step": 77110 + }, + { + "epoch": 8.483498349834983, + "grad_norm": 1.6445043087005615, + "learning_rate": 3.563110308793541e-05, + "loss": 0.0888, + "num_input_tokens_seen": 16273424, + "step": 77115 + }, + { + "epoch": 8.484048404840484, + "grad_norm": 0.013481500558555126, + "learning_rate": 3.5628930788792015e-05, + "loss": 0.075, + "num_input_tokens_seen": 16274512, + "step": 77120 + }, + { + "epoch": 8.484598459845985, + "grad_norm": 0.02983796037733555, + "learning_rate": 3.5626758391686874e-05, + "loss": 0.0105, + "num_input_tokens_seen": 16275536, + "step": 77125 + }, + { + "epoch": 8.485148514851485, + "grad_norm": 0.2504322826862335, + "learning_rate": 3.5624585896640004e-05, + "loss": 0.0469, + "num_input_tokens_seen": 16276560, + "step": 77130 + }, + { + "epoch": 8.485698569856986, + "grad_norm": 0.1369626224040985, + "learning_rate": 3.562241330367142e-05, + "loss": 0.0454, + "num_input_tokens_seen": 16277584, + "step": 77135 + }, + { + "epoch": 8.486248624862487, + "grad_norm": 0.6413862109184265, + "learning_rate": 3.562024061280115e-05, + "loss": 0.0283, + "num_input_tokens_seen": 16278608, + "step": 77140 + }, + { + "epoch": 8.486798679867986, + "grad_norm": 0.062287408858537674, + "learning_rate": 3.561806782404921e-05, + "loss": 0.0062, + "num_input_tokens_seen": 16279568, + "step": 77145 + }, + { + "epoch": 8.487348734873487, + "grad_norm": 0.052256666123867035, + "learning_rate": 3.561589493743564e-05, + "loss": 0.0959, + "num_input_tokens_seen": 16280624, + "step": 77150 + }, + { + "epoch": 8.487898789878988, + "grad_norm": 0.3716357350349426, + "learning_rate": 3.5613721952980464e-05, + "loss": 0.0744, + "num_input_tokens_seen": 16281680, + "step": 77155 + }, + { + "epoch": 8.488448844884488, + "grad_norm": 0.03154532238841057, + "learning_rate": 3.5611548870703704e-05, + "loss": 0.0029, + "num_input_tokens_seen": 16282736, + "step": 77160 + }, + { + "epoch": 8.488998899889989, + "grad_norm": 0.012479268014431, + "learning_rate": 3.560937569062538e-05, + "loss": 0.0844, + "num_input_tokens_seen": 16283824, + "step": 77165 + }, + { + "epoch": 8.48954895489549, + "grad_norm": 0.018800679594278336, + "learning_rate": 3.560720241276555e-05, + "loss": 0.0229, + "num_input_tokens_seen": 16284880, + "step": 77170 + }, + { + "epoch": 8.490099009900991, + "grad_norm": 0.028507104143500328, + "learning_rate": 3.560502903714421e-05, + "loss": 0.1445, + "num_input_tokens_seen": 16285936, + "step": 77175 + }, + { + "epoch": 8.49064906490649, + "grad_norm": 0.3730229437351227, + "learning_rate": 3.560285556378142e-05, + "loss": 0.0545, + "num_input_tokens_seen": 16286960, + "step": 77180 + }, + { + "epoch": 8.491199119911991, + "grad_norm": 0.11291227489709854, + "learning_rate": 3.560068199269719e-05, + "loss": 0.0245, + "num_input_tokens_seen": 16288048, + "step": 77185 + }, + { + "epoch": 8.491749174917492, + "grad_norm": 0.1346154808998108, + "learning_rate": 3.5598508323911555e-05, + "loss": 0.0651, + "num_input_tokens_seen": 16289104, + "step": 77190 + }, + { + "epoch": 8.492299229922992, + "grad_norm": 1.800547480583191, + "learning_rate": 3.5596334557444556e-05, + "loss": 0.083, + "num_input_tokens_seen": 16290160, + "step": 77195 + }, + { + "epoch": 8.492849284928493, + "grad_norm": 0.47873160243034363, + "learning_rate": 3.5594160693316236e-05, + "loss": 0.0797, + "num_input_tokens_seen": 16291184, + "step": 77200 + }, + { + "epoch": 8.493399339933994, + "grad_norm": 0.12179160863161087, + "learning_rate": 3.5591986731546614e-05, + "loss": 0.005, + "num_input_tokens_seen": 16292272, + "step": 77205 + }, + { + "epoch": 8.493949394939493, + "grad_norm": 0.018327124416828156, + "learning_rate": 3.558981267215573e-05, + "loss": 0.038, + "num_input_tokens_seen": 16293328, + "step": 77210 + }, + { + "epoch": 8.494499449944994, + "grad_norm": 1.0073177814483643, + "learning_rate": 3.558763851516363e-05, + "loss": 0.0293, + "num_input_tokens_seen": 16294416, + "step": 77215 + }, + { + "epoch": 8.495049504950495, + "grad_norm": 0.07274558395147324, + "learning_rate": 3.558546426059034e-05, + "loss": 0.0534, + "num_input_tokens_seen": 16295440, + "step": 77220 + }, + { + "epoch": 8.495599559955995, + "grad_norm": 0.020774902775883675, + "learning_rate": 3.558328990845591e-05, + "loss": 0.0191, + "num_input_tokens_seen": 16296592, + "step": 77225 + }, + { + "epoch": 8.496149614961496, + "grad_norm": 0.15734297037124634, + "learning_rate": 3.558111545878038e-05, + "loss": 0.0273, + "num_input_tokens_seen": 16297648, + "step": 77230 + }, + { + "epoch": 8.496699669966997, + "grad_norm": 0.043595071882009506, + "learning_rate": 3.557894091158377e-05, + "loss": 0.0091, + "num_input_tokens_seen": 16298640, + "step": 77235 + }, + { + "epoch": 8.497249724972498, + "grad_norm": 1.3544801473617554, + "learning_rate": 3.557676626688615e-05, + "loss": 0.0195, + "num_input_tokens_seen": 16299632, + "step": 77240 + }, + { + "epoch": 8.497799779977997, + "grad_norm": 0.0059545873664319515, + "learning_rate": 3.557459152470755e-05, + "loss": 0.0053, + "num_input_tokens_seen": 16300688, + "step": 77245 + }, + { + "epoch": 8.498349834983498, + "grad_norm": 0.05704791843891144, + "learning_rate": 3.557241668506801e-05, + "loss": 0.0297, + "num_input_tokens_seen": 16301744, + "step": 77250 + }, + { + "epoch": 8.498899889989, + "grad_norm": 0.10891655832529068, + "learning_rate": 3.557024174798759e-05, + "loss": 0.0043, + "num_input_tokens_seen": 16302768, + "step": 77255 + }, + { + "epoch": 8.499449944994499, + "grad_norm": 0.046716511249542236, + "learning_rate": 3.5568066713486314e-05, + "loss": 0.0455, + "num_input_tokens_seen": 16303760, + "step": 77260 + }, + { + "epoch": 8.5, + "grad_norm": 0.7477333545684814, + "learning_rate": 3.556589158158423e-05, + "loss": 0.0134, + "num_input_tokens_seen": 16304880, + "step": 77265 + }, + { + "epoch": 8.500550055005501, + "grad_norm": 0.4253392517566681, + "learning_rate": 3.556371635230141e-05, + "loss": 0.0387, + "num_input_tokens_seen": 16305936, + "step": 77270 + }, + { + "epoch": 8.501100110011, + "grad_norm": 0.028022587299346924, + "learning_rate": 3.556154102565788e-05, + "loss": 0.0223, + "num_input_tokens_seen": 16307024, + "step": 77275 + }, + { + "epoch": 8.501650165016502, + "grad_norm": 0.20779934525489807, + "learning_rate": 3.555936560167369e-05, + "loss": 0.0472, + "num_input_tokens_seen": 16308048, + "step": 77280 + }, + { + "epoch": 8.502200220022003, + "grad_norm": 1.3798582553863525, + "learning_rate": 3.55571900803689e-05, + "loss": 0.0415, + "num_input_tokens_seen": 16309104, + "step": 77285 + }, + { + "epoch": 8.502750275027502, + "grad_norm": 0.013109364546835423, + "learning_rate": 3.555501446176355e-05, + "loss": 0.004, + "num_input_tokens_seen": 16310160, + "step": 77290 + }, + { + "epoch": 8.503300330033003, + "grad_norm": 0.04390016570687294, + "learning_rate": 3.555283874587769e-05, + "loss": 0.0371, + "num_input_tokens_seen": 16311312, + "step": 77295 + }, + { + "epoch": 8.503850385038504, + "grad_norm": 0.8822153210639954, + "learning_rate": 3.555066293273139e-05, + "loss": 0.0375, + "num_input_tokens_seen": 16312336, + "step": 77300 + }, + { + "epoch": 8.504400440044005, + "grad_norm": 0.529008150100708, + "learning_rate": 3.5548487022344686e-05, + "loss": 0.0094, + "num_input_tokens_seen": 16313360, + "step": 77305 + }, + { + "epoch": 8.504950495049505, + "grad_norm": 0.28467264771461487, + "learning_rate": 3.554631101473764e-05, + "loss": 0.0173, + "num_input_tokens_seen": 16314384, + "step": 77310 + }, + { + "epoch": 8.505500550055006, + "grad_norm": 0.03393472358584404, + "learning_rate": 3.554413490993031e-05, + "loss": 0.013, + "num_input_tokens_seen": 16315376, + "step": 77315 + }, + { + "epoch": 8.506050605060507, + "grad_norm": 0.2924419045448303, + "learning_rate": 3.554195870794275e-05, + "loss": 0.0042, + "num_input_tokens_seen": 16316464, + "step": 77320 + }, + { + "epoch": 8.506600660066006, + "grad_norm": 0.507510244846344, + "learning_rate": 3.5539782408795006e-05, + "loss": 0.0202, + "num_input_tokens_seen": 16317488, + "step": 77325 + }, + { + "epoch": 8.507150715071507, + "grad_norm": 0.04667123407125473, + "learning_rate": 3.553760601250715e-05, + "loss": 0.025, + "num_input_tokens_seen": 16318544, + "step": 77330 + }, + { + "epoch": 8.507700770077008, + "grad_norm": 0.5629301071166992, + "learning_rate": 3.553542951909924e-05, + "loss": 0.0503, + "num_input_tokens_seen": 16319632, + "step": 77335 + }, + { + "epoch": 8.508250825082508, + "grad_norm": 1.4884977340698242, + "learning_rate": 3.553325292859133e-05, + "loss": 0.0384, + "num_input_tokens_seen": 16320656, + "step": 77340 + }, + { + "epoch": 8.508800880088009, + "grad_norm": 2.1238229274749756, + "learning_rate": 3.553107624100348e-05, + "loss": 0.1047, + "num_input_tokens_seen": 16321744, + "step": 77345 + }, + { + "epoch": 8.50935093509351, + "grad_norm": 1.032566785812378, + "learning_rate": 3.552889945635575e-05, + "loss": 0.0252, + "num_input_tokens_seen": 16322832, + "step": 77350 + }, + { + "epoch": 8.509900990099009, + "grad_norm": 0.0725005641579628, + "learning_rate": 3.552672257466821e-05, + "loss": 0.0097, + "num_input_tokens_seen": 16323888, + "step": 77355 + }, + { + "epoch": 8.51045104510451, + "grad_norm": 0.08627284318208694, + "learning_rate": 3.5524545595960914e-05, + "loss": 0.0539, + "num_input_tokens_seen": 16324976, + "step": 77360 + }, + { + "epoch": 8.511001100110011, + "grad_norm": 0.4581940174102783, + "learning_rate": 3.552236852025394e-05, + "loss": 0.0144, + "num_input_tokens_seen": 16326032, + "step": 77365 + }, + { + "epoch": 8.511551155115512, + "grad_norm": 0.20129063725471497, + "learning_rate": 3.552019134756734e-05, + "loss": 0.0069, + "num_input_tokens_seen": 16327088, + "step": 77370 + }, + { + "epoch": 8.512101210121012, + "grad_norm": 0.009580830112099648, + "learning_rate": 3.551801407792118e-05, + "loss": 0.0069, + "num_input_tokens_seen": 16328176, + "step": 77375 + }, + { + "epoch": 8.512651265126513, + "grad_norm": 0.017858784645795822, + "learning_rate": 3.551583671133554e-05, + "loss": 0.004, + "num_input_tokens_seen": 16329232, + "step": 77380 + }, + { + "epoch": 8.513201320132014, + "grad_norm": 1.4551652669906616, + "learning_rate": 3.551365924783047e-05, + "loss": 0.0763, + "num_input_tokens_seen": 16330352, + "step": 77385 + }, + { + "epoch": 8.513751375137513, + "grad_norm": 0.09628507494926453, + "learning_rate": 3.551148168742606e-05, + "loss": 0.0536, + "num_input_tokens_seen": 16331376, + "step": 77390 + }, + { + "epoch": 8.514301430143014, + "grad_norm": 0.0036605354398489, + "learning_rate": 3.550930403014235e-05, + "loss": 0.0323, + "num_input_tokens_seen": 16332400, + "step": 77395 + }, + { + "epoch": 8.514851485148515, + "grad_norm": 0.10726982355117798, + "learning_rate": 3.550712627599945e-05, + "loss": 0.0039, + "num_input_tokens_seen": 16333424, + "step": 77400 + }, + { + "epoch": 8.515401540154015, + "grad_norm": 0.8609127998352051, + "learning_rate": 3.550494842501739e-05, + "loss": 0.0522, + "num_input_tokens_seen": 16334544, + "step": 77405 + }, + { + "epoch": 8.515951595159516, + "grad_norm": 0.03251037746667862, + "learning_rate": 3.550277047721627e-05, + "loss": 0.0033, + "num_input_tokens_seen": 16335536, + "step": 77410 + }, + { + "epoch": 8.516501650165017, + "grad_norm": 0.9664223194122314, + "learning_rate": 3.550059243261615e-05, + "loss": 0.0097, + "num_input_tokens_seen": 16336528, + "step": 77415 + }, + { + "epoch": 8.517051705170516, + "grad_norm": 0.011145494878292084, + "learning_rate": 3.549841429123712e-05, + "loss": 0.0202, + "num_input_tokens_seen": 16337520, + "step": 77420 + }, + { + "epoch": 8.517601760176017, + "grad_norm": 0.012672023847699165, + "learning_rate": 3.549623605309922e-05, + "loss": 0.0035, + "num_input_tokens_seen": 16338640, + "step": 77425 + }, + { + "epoch": 8.518151815181518, + "grad_norm": 0.05269239842891693, + "learning_rate": 3.5494057718222565e-05, + "loss": 0.0169, + "num_input_tokens_seen": 16339728, + "step": 77430 + }, + { + "epoch": 8.51870187018702, + "grad_norm": 0.0397847555577755, + "learning_rate": 3.549187928662721e-05, + "loss": 0.0115, + "num_input_tokens_seen": 16340784, + "step": 77435 + }, + { + "epoch": 8.519251925192519, + "grad_norm": 0.4477028250694275, + "learning_rate": 3.548970075833324e-05, + "loss": 0.0161, + "num_input_tokens_seen": 16341808, + "step": 77440 + }, + { + "epoch": 8.51980198019802, + "grad_norm": 0.2837647497653961, + "learning_rate": 3.548752213336074e-05, + "loss": 0.1405, + "num_input_tokens_seen": 16342896, + "step": 77445 + }, + { + "epoch": 8.520352035203521, + "grad_norm": 1.1329138278961182, + "learning_rate": 3.5485343411729775e-05, + "loss": 0.0665, + "num_input_tokens_seen": 16343952, + "step": 77450 + }, + { + "epoch": 8.52090209020902, + "grad_norm": 0.18852779269218445, + "learning_rate": 3.548316459346042e-05, + "loss": 0.0556, + "num_input_tokens_seen": 16345104, + "step": 77455 + }, + { + "epoch": 8.521452145214521, + "grad_norm": 0.03753461316227913, + "learning_rate": 3.548098567857278e-05, + "loss": 0.0177, + "num_input_tokens_seen": 16346096, + "step": 77460 + }, + { + "epoch": 8.522002200220022, + "grad_norm": 0.10248568654060364, + "learning_rate": 3.547880666708692e-05, + "loss": 0.0036, + "num_input_tokens_seen": 16347120, + "step": 77465 + }, + { + "epoch": 8.522552255225522, + "grad_norm": 0.08084788173437119, + "learning_rate": 3.5476627559022926e-05, + "loss": 0.0092, + "num_input_tokens_seen": 16348208, + "step": 77470 + }, + { + "epoch": 8.523102310231023, + "grad_norm": 0.017604511231184006, + "learning_rate": 3.547444835440089e-05, + "loss": 0.0704, + "num_input_tokens_seen": 16349264, + "step": 77475 + }, + { + "epoch": 8.523652365236524, + "grad_norm": 0.10762005299329758, + "learning_rate": 3.5472269053240884e-05, + "loss": 0.0281, + "num_input_tokens_seen": 16350288, + "step": 77480 + }, + { + "epoch": 8.524202420242025, + "grad_norm": 0.008378633297979832, + "learning_rate": 3.5470089655563e-05, + "loss": 0.043, + "num_input_tokens_seen": 16351312, + "step": 77485 + }, + { + "epoch": 8.524752475247524, + "grad_norm": 0.10479265451431274, + "learning_rate": 3.546791016138732e-05, + "loss": 0.0573, + "num_input_tokens_seen": 16352336, + "step": 77490 + }, + { + "epoch": 8.525302530253025, + "grad_norm": 0.10430337488651276, + "learning_rate": 3.546573057073394e-05, + "loss": 0.0074, + "num_input_tokens_seen": 16353360, + "step": 77495 + }, + { + "epoch": 8.525852585258527, + "grad_norm": 0.18275609612464905, + "learning_rate": 3.546355088362294e-05, + "loss": 0.0179, + "num_input_tokens_seen": 16354416, + "step": 77500 + }, + { + "epoch": 8.526402640264026, + "grad_norm": 0.00817374512553215, + "learning_rate": 3.5461371100074415e-05, + "loss": 0.0197, + "num_input_tokens_seen": 16355408, + "step": 77505 + }, + { + "epoch": 8.526952695269527, + "grad_norm": 0.08939614146947861, + "learning_rate": 3.545919122010845e-05, + "loss": 0.0548, + "num_input_tokens_seen": 16356432, + "step": 77510 + }, + { + "epoch": 8.527502750275028, + "grad_norm": 0.03631952404975891, + "learning_rate": 3.545701124374514e-05, + "loss": 0.0767, + "num_input_tokens_seen": 16357520, + "step": 77515 + }, + { + "epoch": 8.528052805280527, + "grad_norm": 1.184607744216919, + "learning_rate": 3.545483117100458e-05, + "loss": 0.0318, + "num_input_tokens_seen": 16358576, + "step": 77520 + }, + { + "epoch": 8.528602860286028, + "grad_norm": 0.029002346098423004, + "learning_rate": 3.5452651001906854e-05, + "loss": 0.0287, + "num_input_tokens_seen": 16359664, + "step": 77525 + }, + { + "epoch": 8.52915291529153, + "grad_norm": 0.011021620593965054, + "learning_rate": 3.5450470736472054e-05, + "loss": 0.0017, + "num_input_tokens_seen": 16360720, + "step": 77530 + }, + { + "epoch": 8.52970297029703, + "grad_norm": 0.050404734909534454, + "learning_rate": 3.5448290374720285e-05, + "loss": 0.0571, + "num_input_tokens_seen": 16361776, + "step": 77535 + }, + { + "epoch": 8.53025302530253, + "grad_norm": 0.012189891189336777, + "learning_rate": 3.544610991667163e-05, + "loss": 0.0728, + "num_input_tokens_seen": 16362800, + "step": 77540 + }, + { + "epoch": 8.530803080308031, + "grad_norm": 0.00984843261539936, + "learning_rate": 3.54439293623462e-05, + "loss": 0.0295, + "num_input_tokens_seen": 16363824, + "step": 77545 + }, + { + "epoch": 8.531353135313532, + "grad_norm": 0.0457456037402153, + "learning_rate": 3.544174871176409e-05, + "loss": 0.0159, + "num_input_tokens_seen": 16364912, + "step": 77550 + }, + { + "epoch": 8.531903190319031, + "grad_norm": 1.2093216180801392, + "learning_rate": 3.543956796494538e-05, + "loss": 0.0445, + "num_input_tokens_seen": 16365904, + "step": 77555 + }, + { + "epoch": 8.532453245324533, + "grad_norm": 0.025180600583553314, + "learning_rate": 3.543738712191019e-05, + "loss": 0.0099, + "num_input_tokens_seen": 16366960, + "step": 77560 + }, + { + "epoch": 8.533003300330034, + "grad_norm": 0.021753447130322456, + "learning_rate": 3.543520618267861e-05, + "loss": 0.0098, + "num_input_tokens_seen": 16367952, + "step": 77565 + }, + { + "epoch": 8.533553355335533, + "grad_norm": 0.0688653215765953, + "learning_rate": 3.543302514727074e-05, + "loss": 0.056, + "num_input_tokens_seen": 16368944, + "step": 77570 + }, + { + "epoch": 8.534103410341034, + "grad_norm": 0.014022916555404663, + "learning_rate": 3.5430844015706674e-05, + "loss": 0.012, + "num_input_tokens_seen": 16370000, + "step": 77575 + }, + { + "epoch": 8.534653465346535, + "grad_norm": 0.09574058651924133, + "learning_rate": 3.542866278800654e-05, + "loss": 0.0035, + "num_input_tokens_seen": 16371024, + "step": 77580 + }, + { + "epoch": 8.535203520352034, + "grad_norm": 0.49161022901535034, + "learning_rate": 3.542648146419042e-05, + "loss": 0.0286, + "num_input_tokens_seen": 16372016, + "step": 77585 + }, + { + "epoch": 8.535753575357536, + "grad_norm": 0.26933160424232483, + "learning_rate": 3.542430004427842e-05, + "loss": 0.1209, + "num_input_tokens_seen": 16373008, + "step": 77590 + }, + { + "epoch": 8.536303630363037, + "grad_norm": 0.22531531751155853, + "learning_rate": 3.542211852829065e-05, + "loss": 0.0091, + "num_input_tokens_seen": 16374000, + "step": 77595 + }, + { + "epoch": 8.536853685368538, + "grad_norm": 0.07288932055234909, + "learning_rate": 3.5419936916247216e-05, + "loss": 0.0027, + "num_input_tokens_seen": 16375088, + "step": 77600 + }, + { + "epoch": 8.537403740374037, + "grad_norm": 1.3378734588623047, + "learning_rate": 3.541775520816823e-05, + "loss": 0.063, + "num_input_tokens_seen": 16376176, + "step": 77605 + }, + { + "epoch": 8.537953795379538, + "grad_norm": 1.5791215896606445, + "learning_rate": 3.541557340407379e-05, + "loss": 0.0428, + "num_input_tokens_seen": 16377296, + "step": 77610 + }, + { + "epoch": 8.53850385038504, + "grad_norm": 0.04074333235621452, + "learning_rate": 3.5413391503984e-05, + "loss": 0.0018, + "num_input_tokens_seen": 16378384, + "step": 77615 + }, + { + "epoch": 8.539053905390539, + "grad_norm": 0.10022515058517456, + "learning_rate": 3.541120950791897e-05, + "loss": 0.055, + "num_input_tokens_seen": 16379440, + "step": 77620 + }, + { + "epoch": 8.53960396039604, + "grad_norm": 0.6525088548660278, + "learning_rate": 3.5409027415898835e-05, + "loss": 0.0065, + "num_input_tokens_seen": 16380560, + "step": 77625 + }, + { + "epoch": 8.54015401540154, + "grad_norm": 0.03086453303694725, + "learning_rate": 3.5406845227943686e-05, + "loss": 0.0019, + "num_input_tokens_seen": 16381680, + "step": 77630 + }, + { + "epoch": 8.54070407040704, + "grad_norm": 0.2811312973499298, + "learning_rate": 3.5404662944073635e-05, + "loss": 0.0241, + "num_input_tokens_seen": 16382800, + "step": 77635 + }, + { + "epoch": 8.541254125412541, + "grad_norm": 1.0042318105697632, + "learning_rate": 3.5402480564308797e-05, + "loss": 0.0284, + "num_input_tokens_seen": 16383856, + "step": 77640 + }, + { + "epoch": 8.541804180418042, + "grad_norm": 0.020670799538493156, + "learning_rate": 3.540029808866929e-05, + "loss": 0.0278, + "num_input_tokens_seen": 16384944, + "step": 77645 + }, + { + "epoch": 8.542354235423542, + "grad_norm": 0.5583385825157166, + "learning_rate": 3.539811551717522e-05, + "loss": 0.0267, + "num_input_tokens_seen": 16386000, + "step": 77650 + }, + { + "epoch": 8.542904290429043, + "grad_norm": 0.020849954336881638, + "learning_rate": 3.539593284984672e-05, + "loss": 0.02, + "num_input_tokens_seen": 16387024, + "step": 77655 + }, + { + "epoch": 8.543454345434544, + "grad_norm": 0.06661270558834076, + "learning_rate": 3.539375008670389e-05, + "loss": 0.05, + "num_input_tokens_seen": 16388080, + "step": 77660 + }, + { + "epoch": 8.544004400440045, + "grad_norm": 0.010594209656119347, + "learning_rate": 3.539156722776685e-05, + "loss": 0.0092, + "num_input_tokens_seen": 16389104, + "step": 77665 + }, + { + "epoch": 8.544554455445544, + "grad_norm": 0.012624708004295826, + "learning_rate": 3.538938427305573e-05, + "loss": 0.096, + "num_input_tokens_seen": 16390192, + "step": 77670 + }, + { + "epoch": 8.545104510451045, + "grad_norm": 0.05484096705913544, + "learning_rate": 3.5387201222590624e-05, + "loss": 0.0315, + "num_input_tokens_seen": 16391280, + "step": 77675 + }, + { + "epoch": 8.545654565456546, + "grad_norm": 0.9371052980422974, + "learning_rate": 3.538501807639168e-05, + "loss": 0.1236, + "num_input_tokens_seen": 16392336, + "step": 77680 + }, + { + "epoch": 8.546204620462046, + "grad_norm": 1.181807041168213, + "learning_rate": 3.538283483447901e-05, + "loss": 0.0861, + "num_input_tokens_seen": 16393456, + "step": 77685 + }, + { + "epoch": 8.546754675467547, + "grad_norm": 2.616697311401367, + "learning_rate": 3.538065149687272e-05, + "loss": 0.0904, + "num_input_tokens_seen": 16394544, + "step": 77690 + }, + { + "epoch": 8.547304730473048, + "grad_norm": 0.02795715071260929, + "learning_rate": 3.537846806359296e-05, + "loss": 0.0181, + "num_input_tokens_seen": 16395568, + "step": 77695 + }, + { + "epoch": 8.547854785478547, + "grad_norm": 0.077248215675354, + "learning_rate": 3.537628453465984e-05, + "loss": 0.0407, + "num_input_tokens_seen": 16396688, + "step": 77700 + }, + { + "epoch": 8.548404840484048, + "grad_norm": 0.042975522577762604, + "learning_rate": 3.537410091009348e-05, + "loss": 0.0111, + "num_input_tokens_seen": 16397808, + "step": 77705 + }, + { + "epoch": 8.54895489548955, + "grad_norm": 0.04171828553080559, + "learning_rate": 3.537191718991401e-05, + "loss": 0.1062, + "num_input_tokens_seen": 16398896, + "step": 77710 + }, + { + "epoch": 8.549504950495049, + "grad_norm": 0.008259271271526814, + "learning_rate": 3.536973337414155e-05, + "loss": 0.0075, + "num_input_tokens_seen": 16399952, + "step": 77715 + }, + { + "epoch": 8.55005500550055, + "grad_norm": 0.15240983664989471, + "learning_rate": 3.536754946279624e-05, + "loss": 0.0146, + "num_input_tokens_seen": 16401072, + "step": 77720 + }, + { + "epoch": 8.55060506050605, + "grad_norm": 0.020602088421583176, + "learning_rate": 3.53653654558982e-05, + "loss": 0.0968, + "num_input_tokens_seen": 16402096, + "step": 77725 + }, + { + "epoch": 8.551155115511552, + "grad_norm": 0.4483696520328522, + "learning_rate": 3.536318135346756e-05, + "loss": 0.029, + "num_input_tokens_seen": 16403184, + "step": 77730 + }, + { + "epoch": 8.551705170517051, + "grad_norm": 1.3352702856063843, + "learning_rate": 3.536099715552446e-05, + "loss": 0.0553, + "num_input_tokens_seen": 16404272, + "step": 77735 + }, + { + "epoch": 8.552255225522552, + "grad_norm": 0.02829432114958763, + "learning_rate": 3.535881286208901e-05, + "loss": 0.075, + "num_input_tokens_seen": 16405296, + "step": 77740 + }, + { + "epoch": 8.552805280528053, + "grad_norm": 0.03936151787638664, + "learning_rate": 3.5356628473181345e-05, + "loss": 0.0067, + "num_input_tokens_seen": 16406352, + "step": 77745 + }, + { + "epoch": 8.553355335533553, + "grad_norm": 0.10025884956121445, + "learning_rate": 3.535444398882162e-05, + "loss": 0.0033, + "num_input_tokens_seen": 16407408, + "step": 77750 + }, + { + "epoch": 8.553905390539054, + "grad_norm": 0.2398112267255783, + "learning_rate": 3.535225940902995e-05, + "loss": 0.029, + "num_input_tokens_seen": 16408400, + "step": 77755 + }, + { + "epoch": 8.554455445544555, + "grad_norm": 0.014133263379335403, + "learning_rate": 3.5350074733826466e-05, + "loss": 0.0195, + "num_input_tokens_seen": 16409456, + "step": 77760 + }, + { + "epoch": 8.555005500550054, + "grad_norm": 1.0416994094848633, + "learning_rate": 3.5347889963231316e-05, + "loss": 0.0603, + "num_input_tokens_seen": 16410576, + "step": 77765 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 0.09665122628211975, + "learning_rate": 3.5345705097264634e-05, + "loss": 0.1571, + "num_input_tokens_seen": 16411568, + "step": 77770 + }, + { + "epoch": 8.556105610561056, + "grad_norm": 0.022042647004127502, + "learning_rate": 3.5343520135946544e-05, + "loss": 0.1002, + "num_input_tokens_seen": 16412592, + "step": 77775 + }, + { + "epoch": 8.556655665566556, + "grad_norm": 0.020621871575713158, + "learning_rate": 3.5341335079297194e-05, + "loss": 0.027, + "num_input_tokens_seen": 16413680, + "step": 77780 + }, + { + "epoch": 8.557205720572057, + "grad_norm": 0.041548311710357666, + "learning_rate": 3.533914992733672e-05, + "loss": 0.0325, + "num_input_tokens_seen": 16414768, + "step": 77785 + }, + { + "epoch": 8.557755775577558, + "grad_norm": 0.08212076127529144, + "learning_rate": 3.533696468008526e-05, + "loss": 0.0028, + "num_input_tokens_seen": 16415728, + "step": 77790 + }, + { + "epoch": 8.558305830583059, + "grad_norm": 0.23109818994998932, + "learning_rate": 3.533477933756297e-05, + "loss": 0.0724, + "num_input_tokens_seen": 16416880, + "step": 77795 + }, + { + "epoch": 8.558855885588558, + "grad_norm": 0.1924991011619568, + "learning_rate": 3.533259389978997e-05, + "loss": 0.0354, + "num_input_tokens_seen": 16417968, + "step": 77800 + }, + { + "epoch": 8.55940594059406, + "grad_norm": 0.052249182015657425, + "learning_rate": 3.5330408366786406e-05, + "loss": 0.0131, + "num_input_tokens_seen": 16419056, + "step": 77805 + }, + { + "epoch": 8.55995599559956, + "grad_norm": 0.06299193948507309, + "learning_rate": 3.532822273857242e-05, + "loss": 0.0499, + "num_input_tokens_seen": 16420112, + "step": 77810 + }, + { + "epoch": 8.56050605060506, + "grad_norm": 0.040010835975408554, + "learning_rate": 3.532603701516817e-05, + "loss": 0.0272, + "num_input_tokens_seen": 16421136, + "step": 77815 + }, + { + "epoch": 8.561056105610561, + "grad_norm": 0.009086553938686848, + "learning_rate": 3.5323851196593794e-05, + "loss": 0.0028, + "num_input_tokens_seen": 16422224, + "step": 77820 + }, + { + "epoch": 8.561606160616062, + "grad_norm": 0.044834356755018234, + "learning_rate": 3.532166528286943e-05, + "loss": 0.0085, + "num_input_tokens_seen": 16423344, + "step": 77825 + }, + { + "epoch": 8.562156215621561, + "grad_norm": 0.09878598153591156, + "learning_rate": 3.5319479274015235e-05, + "loss": 0.0159, + "num_input_tokens_seen": 16424336, + "step": 77830 + }, + { + "epoch": 8.562706270627062, + "grad_norm": 0.9135596752166748, + "learning_rate": 3.531729317005134e-05, + "loss": 0.0712, + "num_input_tokens_seen": 16425424, + "step": 77835 + }, + { + "epoch": 8.563256325632564, + "grad_norm": 0.14220581948757172, + "learning_rate": 3.531510697099791e-05, + "loss": 0.0228, + "num_input_tokens_seen": 16426480, + "step": 77840 + }, + { + "epoch": 8.563806380638063, + "grad_norm": 0.09038415551185608, + "learning_rate": 3.53129206768751e-05, + "loss": 0.1499, + "num_input_tokens_seen": 16427568, + "step": 77845 + }, + { + "epoch": 8.564356435643564, + "grad_norm": 0.01267685741186142, + "learning_rate": 3.531073428770304e-05, + "loss": 0.0045, + "num_input_tokens_seen": 16428592, + "step": 77850 + }, + { + "epoch": 8.564906490649065, + "grad_norm": 0.08859993517398834, + "learning_rate": 3.530854780350189e-05, + "loss": 0.0048, + "num_input_tokens_seen": 16429680, + "step": 77855 + }, + { + "epoch": 8.565456545654566, + "grad_norm": 0.02935151755809784, + "learning_rate": 3.53063612242918e-05, + "loss": 0.0763, + "num_input_tokens_seen": 16430704, + "step": 77860 + }, + { + "epoch": 8.566006600660065, + "grad_norm": 0.029731741175055504, + "learning_rate": 3.5304174550092926e-05, + "loss": 0.0057, + "num_input_tokens_seen": 16431792, + "step": 77865 + }, + { + "epoch": 8.566556655665567, + "grad_norm": 2.730011463165283, + "learning_rate": 3.5301987780925413e-05, + "loss": 0.0392, + "num_input_tokens_seen": 16432848, + "step": 77870 + }, + { + "epoch": 8.567106710671068, + "grad_norm": 1.6566269397735596, + "learning_rate": 3.5299800916809425e-05, + "loss": 0.0235, + "num_input_tokens_seen": 16433936, + "step": 77875 + }, + { + "epoch": 8.567656765676567, + "grad_norm": 0.333832323551178, + "learning_rate": 3.529761395776512e-05, + "loss": 0.1178, + "num_input_tokens_seen": 16435024, + "step": 77880 + }, + { + "epoch": 8.568206820682068, + "grad_norm": 0.273032009601593, + "learning_rate": 3.529542690381265e-05, + "loss": 0.018, + "num_input_tokens_seen": 16436080, + "step": 77885 + }, + { + "epoch": 8.56875687568757, + "grad_norm": 0.16195173561573029, + "learning_rate": 3.5293239754972165e-05, + "loss": 0.0136, + "num_input_tokens_seen": 16437200, + "step": 77890 + }, + { + "epoch": 8.569306930693068, + "grad_norm": 0.09387620538473129, + "learning_rate": 3.529105251126383e-05, + "loss": 0.0359, + "num_input_tokens_seen": 16438192, + "step": 77895 + }, + { + "epoch": 8.56985698569857, + "grad_norm": 1.9870429039001465, + "learning_rate": 3.52888651727078e-05, + "loss": 0.113, + "num_input_tokens_seen": 16439152, + "step": 77900 + }, + { + "epoch": 8.57040704070407, + "grad_norm": 2.393500804901123, + "learning_rate": 3.528667773932424e-05, + "loss": 0.0807, + "num_input_tokens_seen": 16440240, + "step": 77905 + }, + { + "epoch": 8.570957095709572, + "grad_norm": 0.2823098599910736, + "learning_rate": 3.5284490211133306e-05, + "loss": 0.0129, + "num_input_tokens_seen": 16441264, + "step": 77910 + }, + { + "epoch": 8.571507150715071, + "grad_norm": 0.963179349899292, + "learning_rate": 3.528230258815516e-05, + "loss": 0.0286, + "num_input_tokens_seen": 16442320, + "step": 77915 + }, + { + "epoch": 8.572057205720572, + "grad_norm": 0.2200954109430313, + "learning_rate": 3.5280114870409965e-05, + "loss": 0.0105, + "num_input_tokens_seen": 16443408, + "step": 77920 + }, + { + "epoch": 8.572607260726073, + "grad_norm": 0.1539694368839264, + "learning_rate": 3.527792705791788e-05, + "loss": 0.0059, + "num_input_tokens_seen": 16444464, + "step": 77925 + }, + { + "epoch": 8.573157315731573, + "grad_norm": 0.2735476791858673, + "learning_rate": 3.527573915069908e-05, + "loss": 0.0074, + "num_input_tokens_seen": 16445520, + "step": 77930 + }, + { + "epoch": 8.573707370737074, + "grad_norm": 0.10554080456495285, + "learning_rate": 3.5273551148773714e-05, + "loss": 0.0469, + "num_input_tokens_seen": 16446576, + "step": 77935 + }, + { + "epoch": 8.574257425742575, + "grad_norm": 0.10425763577222824, + "learning_rate": 3.527136305216197e-05, + "loss": 0.1515, + "num_input_tokens_seen": 16447696, + "step": 77940 + }, + { + "epoch": 8.574807480748074, + "grad_norm": 0.025257114320993423, + "learning_rate": 3.5269174860883996e-05, + "loss": 0.1674, + "num_input_tokens_seen": 16448720, + "step": 77945 + }, + { + "epoch": 8.575357535753575, + "grad_norm": 1.10409414768219, + "learning_rate": 3.526698657495996e-05, + "loss": 0.0453, + "num_input_tokens_seen": 16449808, + "step": 77950 + }, + { + "epoch": 8.575907590759076, + "grad_norm": 0.017411598935723305, + "learning_rate": 3.5264798194410036e-05, + "loss": 0.0315, + "num_input_tokens_seen": 16450864, + "step": 77955 + }, + { + "epoch": 8.576457645764577, + "grad_norm": 0.5986855626106262, + "learning_rate": 3.5262609719254397e-05, + "loss": 0.0526, + "num_input_tokens_seen": 16451952, + "step": 77960 + }, + { + "epoch": 8.577007700770077, + "grad_norm": 0.050228673964738846, + "learning_rate": 3.5260421149513204e-05, + "loss": 0.0039, + "num_input_tokens_seen": 16453040, + "step": 77965 + }, + { + "epoch": 8.577557755775578, + "grad_norm": 0.21520677208900452, + "learning_rate": 3.5258232485206644e-05, + "loss": 0.0607, + "num_input_tokens_seen": 16454000, + "step": 77970 + }, + { + "epoch": 8.578107810781079, + "grad_norm": 0.33162790536880493, + "learning_rate": 3.525604372635486e-05, + "loss": 0.0267, + "num_input_tokens_seen": 16455024, + "step": 77975 + }, + { + "epoch": 8.578657865786578, + "grad_norm": 0.03406409174203873, + "learning_rate": 3.525385487297805e-05, + "loss": 0.0052, + "num_input_tokens_seen": 16456112, + "step": 77980 + }, + { + "epoch": 8.57920792079208, + "grad_norm": 0.22972379624843597, + "learning_rate": 3.525166592509639e-05, + "loss": 0.0407, + "num_input_tokens_seen": 16457168, + "step": 77985 + }, + { + "epoch": 8.57975797579758, + "grad_norm": 0.39126455783843994, + "learning_rate": 3.524947688273003e-05, + "loss": 0.0639, + "num_input_tokens_seen": 16458224, + "step": 77990 + }, + { + "epoch": 8.58030803080308, + "grad_norm": 0.0241800956428051, + "learning_rate": 3.524728774589917e-05, + "loss": 0.0693, + "num_input_tokens_seen": 16459312, + "step": 77995 + }, + { + "epoch": 8.58085808580858, + "grad_norm": 0.35724112391471863, + "learning_rate": 3.524509851462397e-05, + "loss": 0.0161, + "num_input_tokens_seen": 16460368, + "step": 78000 + }, + { + "epoch": 8.581408140814082, + "grad_norm": 0.10449403524398804, + "learning_rate": 3.524290918892462e-05, + "loss": 0.0241, + "num_input_tokens_seen": 16461488, + "step": 78005 + }, + { + "epoch": 8.581958195819581, + "grad_norm": 0.33856165409088135, + "learning_rate": 3.524071976882129e-05, + "loss": 0.0062, + "num_input_tokens_seen": 16462512, + "step": 78010 + }, + { + "epoch": 8.582508250825082, + "grad_norm": 0.12648984789848328, + "learning_rate": 3.523853025433415e-05, + "loss": 0.0122, + "num_input_tokens_seen": 16463568, + "step": 78015 + }, + { + "epoch": 8.583058305830583, + "grad_norm": 0.5566087961196899, + "learning_rate": 3.523634064548339e-05, + "loss": 0.0107, + "num_input_tokens_seen": 16464656, + "step": 78020 + }, + { + "epoch": 8.583608360836084, + "grad_norm": 0.2680712342262268, + "learning_rate": 3.5234150942289194e-05, + "loss": 0.0573, + "num_input_tokens_seen": 16465776, + "step": 78025 + }, + { + "epoch": 8.584158415841584, + "grad_norm": 0.10747965425252914, + "learning_rate": 3.523196114477174e-05, + "loss": 0.0045, + "num_input_tokens_seen": 16466864, + "step": 78030 + }, + { + "epoch": 8.584708470847085, + "grad_norm": 0.09760571271181107, + "learning_rate": 3.522977125295122e-05, + "loss": 0.0023, + "num_input_tokens_seen": 16467888, + "step": 78035 + }, + { + "epoch": 8.585258525852586, + "grad_norm": 0.13647128641605377, + "learning_rate": 3.522758126684779e-05, + "loss": 0.0034, + "num_input_tokens_seen": 16468912, + "step": 78040 + }, + { + "epoch": 8.585808580858085, + "grad_norm": 0.06974995881319046, + "learning_rate": 3.522539118648166e-05, + "loss": 0.0301, + "num_input_tokens_seen": 16470032, + "step": 78045 + }, + { + "epoch": 8.586358635863586, + "grad_norm": 0.18988537788391113, + "learning_rate": 3.5223201011873004e-05, + "loss": 0.0517, + "num_input_tokens_seen": 16471088, + "step": 78050 + }, + { + "epoch": 8.586908690869087, + "grad_norm": 0.6936371326446533, + "learning_rate": 3.522101074304201e-05, + "loss": 0.0129, + "num_input_tokens_seen": 16472144, + "step": 78055 + }, + { + "epoch": 8.587458745874587, + "grad_norm": 0.039435259997844696, + "learning_rate": 3.5218820380008874e-05, + "loss": 0.0066, + "num_input_tokens_seen": 16473264, + "step": 78060 + }, + { + "epoch": 8.588008800880088, + "grad_norm": 0.6856012940406799, + "learning_rate": 3.521662992279376e-05, + "loss": 0.0239, + "num_input_tokens_seen": 16474288, + "step": 78065 + }, + { + "epoch": 8.588558855885589, + "grad_norm": 0.09664983302354813, + "learning_rate": 3.521443937141687e-05, + "loss": 0.0734, + "num_input_tokens_seen": 16475312, + "step": 78070 + }, + { + "epoch": 8.589108910891088, + "grad_norm": 0.027577828615903854, + "learning_rate": 3.5212248725898406e-05, + "loss": 0.0059, + "num_input_tokens_seen": 16476336, + "step": 78075 + }, + { + "epoch": 8.58965896589659, + "grad_norm": 0.12234149873256683, + "learning_rate": 3.521005798625854e-05, + "loss": 0.0043, + "num_input_tokens_seen": 16477392, + "step": 78080 + }, + { + "epoch": 8.59020902090209, + "grad_norm": 0.1148025318980217, + "learning_rate": 3.520786715251747e-05, + "loss": 0.0372, + "num_input_tokens_seen": 16478448, + "step": 78085 + }, + { + "epoch": 8.590759075907592, + "grad_norm": 0.38240399956703186, + "learning_rate": 3.5205676224695386e-05, + "loss": 0.0541, + "num_input_tokens_seen": 16479536, + "step": 78090 + }, + { + "epoch": 8.591309130913091, + "grad_norm": 0.09000736474990845, + "learning_rate": 3.520348520281247e-05, + "loss": 0.0239, + "num_input_tokens_seen": 16480592, + "step": 78095 + }, + { + "epoch": 8.591859185918592, + "grad_norm": 0.20961260795593262, + "learning_rate": 3.520129408688894e-05, + "loss": 0.0122, + "num_input_tokens_seen": 16481712, + "step": 78100 + }, + { + "epoch": 8.592409240924093, + "grad_norm": 0.29615411162376404, + "learning_rate": 3.5199102876944986e-05, + "loss": 0.0094, + "num_input_tokens_seen": 16482768, + "step": 78105 + }, + { + "epoch": 8.592959295929592, + "grad_norm": 0.7077049016952515, + "learning_rate": 3.519691157300077e-05, + "loss": 0.0174, + "num_input_tokens_seen": 16483920, + "step": 78110 + }, + { + "epoch": 8.593509350935093, + "grad_norm": 1.2465591430664062, + "learning_rate": 3.519472017507653e-05, + "loss": 0.0307, + "num_input_tokens_seen": 16485008, + "step": 78115 + }, + { + "epoch": 8.594059405940595, + "grad_norm": 1.4309751987457275, + "learning_rate": 3.5192528683192444e-05, + "loss": 0.0969, + "num_input_tokens_seen": 16486128, + "step": 78120 + }, + { + "epoch": 8.594609460946094, + "grad_norm": 0.006468247622251511, + "learning_rate": 3.519033709736871e-05, + "loss": 0.0109, + "num_input_tokens_seen": 16487152, + "step": 78125 + }, + { + "epoch": 8.595159515951595, + "grad_norm": 0.9144055843353271, + "learning_rate": 3.5188145417625535e-05, + "loss": 0.0234, + "num_input_tokens_seen": 16488176, + "step": 78130 + }, + { + "epoch": 8.595709570957096, + "grad_norm": 0.09040768444538116, + "learning_rate": 3.518595364398311e-05, + "loss": 0.0243, + "num_input_tokens_seen": 16489200, + "step": 78135 + }, + { + "epoch": 8.596259625962595, + "grad_norm": 0.06601057946681976, + "learning_rate": 3.5183761776461625e-05, + "loss": 0.0526, + "num_input_tokens_seen": 16490288, + "step": 78140 + }, + { + "epoch": 8.596809680968097, + "grad_norm": 0.18646694719791412, + "learning_rate": 3.51815698150813e-05, + "loss": 0.0215, + "num_input_tokens_seen": 16491344, + "step": 78145 + }, + { + "epoch": 8.597359735973598, + "grad_norm": 0.23311658203601837, + "learning_rate": 3.517937775986234e-05, + "loss": 0.0105, + "num_input_tokens_seen": 16492432, + "step": 78150 + }, + { + "epoch": 8.597909790979099, + "grad_norm": 0.038368914276361465, + "learning_rate": 3.517718561082493e-05, + "loss": 0.0495, + "num_input_tokens_seen": 16493552, + "step": 78155 + }, + { + "epoch": 8.598459845984598, + "grad_norm": 0.12938810884952545, + "learning_rate": 3.517499336798929e-05, + "loss": 0.0784, + "num_input_tokens_seen": 16494576, + "step": 78160 + }, + { + "epoch": 8.599009900990099, + "grad_norm": 2.0262064933776855, + "learning_rate": 3.5172801031375624e-05, + "loss": 0.0629, + "num_input_tokens_seen": 16495568, + "step": 78165 + }, + { + "epoch": 8.5995599559956, + "grad_norm": 2.4233243465423584, + "learning_rate": 3.5170608601004124e-05, + "loss": 0.0452, + "num_input_tokens_seen": 16496656, + "step": 78170 + }, + { + "epoch": 8.6001100110011, + "grad_norm": 0.4691665470600128, + "learning_rate": 3.516841607689501e-05, + "loss": 0.023, + "num_input_tokens_seen": 16497712, + "step": 78175 + }, + { + "epoch": 8.6006600660066, + "grad_norm": 1.411423921585083, + "learning_rate": 3.5166223459068484e-05, + "loss": 0.1023, + "num_input_tokens_seen": 16498800, + "step": 78180 + }, + { + "epoch": 8.601210121012102, + "grad_norm": 0.09116777777671814, + "learning_rate": 3.5164030747544746e-05, + "loss": 0.0604, + "num_input_tokens_seen": 16499792, + "step": 78185 + }, + { + "epoch": 8.601760176017601, + "grad_norm": 0.3172941207885742, + "learning_rate": 3.516183794234402e-05, + "loss": 0.0086, + "num_input_tokens_seen": 16500848, + "step": 78190 + }, + { + "epoch": 8.602310231023102, + "grad_norm": 0.005304002668708563, + "learning_rate": 3.515964504348651e-05, + "loss": 0.0175, + "num_input_tokens_seen": 16501872, + "step": 78195 + }, + { + "epoch": 8.602860286028603, + "grad_norm": 0.4062674343585968, + "learning_rate": 3.5157452050992426e-05, + "loss": 0.0096, + "num_input_tokens_seen": 16502992, + "step": 78200 + }, + { + "epoch": 8.603410341034103, + "grad_norm": 0.05669749528169632, + "learning_rate": 3.515525896488198e-05, + "loss": 0.0635, + "num_input_tokens_seen": 16504048, + "step": 78205 + }, + { + "epoch": 8.603960396039604, + "grad_norm": 0.00464971549808979, + "learning_rate": 3.515306578517538e-05, + "loss": 0.0041, + "num_input_tokens_seen": 16505104, + "step": 78210 + }, + { + "epoch": 8.604510451045105, + "grad_norm": 0.016103800386190414, + "learning_rate": 3.5150872511892855e-05, + "loss": 0.0177, + "num_input_tokens_seen": 16506160, + "step": 78215 + }, + { + "epoch": 8.605060506050606, + "grad_norm": 0.3384944200515747, + "learning_rate": 3.51486791450546e-05, + "loss": 0.0054, + "num_input_tokens_seen": 16507248, + "step": 78220 + }, + { + "epoch": 8.605610561056105, + "grad_norm": 0.025531137362122536, + "learning_rate": 3.514648568468085e-05, + "loss": 0.0032, + "num_input_tokens_seen": 16508304, + "step": 78225 + }, + { + "epoch": 8.606160616061606, + "grad_norm": 1.7790669202804565, + "learning_rate": 3.51442921307918e-05, + "loss": 0.1725, + "num_input_tokens_seen": 16509328, + "step": 78230 + }, + { + "epoch": 8.606710671067107, + "grad_norm": 0.5796021223068237, + "learning_rate": 3.5142098483407684e-05, + "loss": 0.0461, + "num_input_tokens_seen": 16510416, + "step": 78235 + }, + { + "epoch": 8.607260726072607, + "grad_norm": 1.2364505529403687, + "learning_rate": 3.51399047425487e-05, + "loss": 0.0615, + "num_input_tokens_seen": 16511472, + "step": 78240 + }, + { + "epoch": 8.607810781078108, + "grad_norm": 0.047275952994823456, + "learning_rate": 3.51377109082351e-05, + "loss": 0.0433, + "num_input_tokens_seen": 16512528, + "step": 78245 + }, + { + "epoch": 8.608360836083609, + "grad_norm": 0.017435144633054733, + "learning_rate": 3.5135516980487074e-05, + "loss": 0.0019, + "num_input_tokens_seen": 16513520, + "step": 78250 + }, + { + "epoch": 8.608910891089108, + "grad_norm": 0.007346757687628269, + "learning_rate": 3.513332295932484e-05, + "loss": 0.0254, + "num_input_tokens_seen": 16514640, + "step": 78255 + }, + { + "epoch": 8.60946094609461, + "grad_norm": 0.015628831461071968, + "learning_rate": 3.513112884476865e-05, + "loss": 0.0049, + "num_input_tokens_seen": 16515664, + "step": 78260 + }, + { + "epoch": 8.61001100110011, + "grad_norm": 1.0712268352508545, + "learning_rate": 3.512893463683869e-05, + "loss": 0.0741, + "num_input_tokens_seen": 16516784, + "step": 78265 + }, + { + "epoch": 8.61056105610561, + "grad_norm": 0.32644641399383545, + "learning_rate": 3.5126740335555205e-05, + "loss": 0.0139, + "num_input_tokens_seen": 16517872, + "step": 78270 + }, + { + "epoch": 8.61111111111111, + "grad_norm": 0.11146122217178345, + "learning_rate": 3.512454594093842e-05, + "loss": 0.0161, + "num_input_tokens_seen": 16518896, + "step": 78275 + }, + { + "epoch": 8.611661166116612, + "grad_norm": 0.03130768984556198, + "learning_rate": 3.512235145300855e-05, + "loss": 0.0013, + "num_input_tokens_seen": 16519952, + "step": 78280 + }, + { + "epoch": 8.612211221122113, + "grad_norm": 0.05558885633945465, + "learning_rate": 3.5120156871785814e-05, + "loss": 0.0675, + "num_input_tokens_seen": 16521008, + "step": 78285 + }, + { + "epoch": 8.612761276127612, + "grad_norm": 0.2025480717420578, + "learning_rate": 3.511796219729046e-05, + "loss": 0.1006, + "num_input_tokens_seen": 16522064, + "step": 78290 + }, + { + "epoch": 8.613311331133113, + "grad_norm": 0.009908421896398067, + "learning_rate": 3.51157674295427e-05, + "loss": 0.0034, + "num_input_tokens_seen": 16523152, + "step": 78295 + }, + { + "epoch": 8.613861386138614, + "grad_norm": 0.7925743460655212, + "learning_rate": 3.511357256856277e-05, + "loss": 0.1032, + "num_input_tokens_seen": 16524240, + "step": 78300 + }, + { + "epoch": 8.614411441144114, + "grad_norm": 0.0992671474814415, + "learning_rate": 3.511137761437089e-05, + "loss": 0.0049, + "num_input_tokens_seen": 16525296, + "step": 78305 + }, + { + "epoch": 8.614961496149615, + "grad_norm": 0.2296203374862671, + "learning_rate": 3.51091825669873e-05, + "loss": 0.0224, + "num_input_tokens_seen": 16526320, + "step": 78310 + }, + { + "epoch": 8.615511551155116, + "grad_norm": 0.330990195274353, + "learning_rate": 3.510698742643221e-05, + "loss": 0.0566, + "num_input_tokens_seen": 16527344, + "step": 78315 + }, + { + "epoch": 8.616061606160617, + "grad_norm": 0.16268585622310638, + "learning_rate": 3.510479219272588e-05, + "loss": 0.009, + "num_input_tokens_seen": 16528432, + "step": 78320 + }, + { + "epoch": 8.616611661166116, + "grad_norm": 0.004573538899421692, + "learning_rate": 3.5102596865888525e-05, + "loss": 0.1462, + "num_input_tokens_seen": 16529488, + "step": 78325 + }, + { + "epoch": 8.617161716171617, + "grad_norm": 0.12510709464550018, + "learning_rate": 3.5100401445940386e-05, + "loss": 0.043, + "num_input_tokens_seen": 16530544, + "step": 78330 + }, + { + "epoch": 8.617711771177119, + "grad_norm": 0.0225303266197443, + "learning_rate": 3.509820593290169e-05, + "loss": 0.0562, + "num_input_tokens_seen": 16531664, + "step": 78335 + }, + { + "epoch": 8.618261826182618, + "grad_norm": 0.1995607167482376, + "learning_rate": 3.5096010326792675e-05, + "loss": 0.0206, + "num_input_tokens_seen": 16532816, + "step": 78340 + }, + { + "epoch": 8.618811881188119, + "grad_norm": 0.47610601782798767, + "learning_rate": 3.509381462763358e-05, + "loss": 0.1009, + "num_input_tokens_seen": 16533936, + "step": 78345 + }, + { + "epoch": 8.61936193619362, + "grad_norm": 0.044652871787548065, + "learning_rate": 3.509161883544464e-05, + "loss": 0.0358, + "num_input_tokens_seen": 16534928, + "step": 78350 + }, + { + "epoch": 8.61991199119912, + "grad_norm": 0.5988028645515442, + "learning_rate": 3.5089422950246084e-05, + "loss": 0.0109, + "num_input_tokens_seen": 16535952, + "step": 78355 + }, + { + "epoch": 8.62046204620462, + "grad_norm": 1.721900463104248, + "learning_rate": 3.5087226972058164e-05, + "loss": 0.1181, + "num_input_tokens_seen": 16536976, + "step": 78360 + }, + { + "epoch": 8.621012101210122, + "grad_norm": 0.0647556334733963, + "learning_rate": 3.5085030900901115e-05, + "loss": 0.0079, + "num_input_tokens_seen": 16538064, + "step": 78365 + }, + { + "epoch": 8.62156215621562, + "grad_norm": 1.359382152557373, + "learning_rate": 3.5082834736795175e-05, + "loss": 0.0949, + "num_input_tokens_seen": 16539088, + "step": 78370 + }, + { + "epoch": 8.622112211221122, + "grad_norm": 0.10500167310237885, + "learning_rate": 3.508063847976058e-05, + "loss": 0.066, + "num_input_tokens_seen": 16540144, + "step": 78375 + }, + { + "epoch": 8.622662266226623, + "grad_norm": 0.9306854009628296, + "learning_rate": 3.5078442129817585e-05, + "loss": 0.0271, + "num_input_tokens_seen": 16541168, + "step": 78380 + }, + { + "epoch": 8.623212321232124, + "grad_norm": 2.143528461456299, + "learning_rate": 3.5076245686986415e-05, + "loss": 0.0602, + "num_input_tokens_seen": 16542224, + "step": 78385 + }, + { + "epoch": 8.623762376237623, + "grad_norm": 0.23979122936725616, + "learning_rate": 3.507404915128734e-05, + "loss": 0.1037, + "num_input_tokens_seen": 16543312, + "step": 78390 + }, + { + "epoch": 8.624312431243125, + "grad_norm": 0.05667031928896904, + "learning_rate": 3.507185252274057e-05, + "loss": 0.0964, + "num_input_tokens_seen": 16544400, + "step": 78395 + }, + { + "epoch": 8.624862486248626, + "grad_norm": 0.016381777822971344, + "learning_rate": 3.5069655801366376e-05, + "loss": 0.0069, + "num_input_tokens_seen": 16545488, + "step": 78400 + }, + { + "epoch": 8.625412541254125, + "grad_norm": 0.053025469183921814, + "learning_rate": 3.5067458987185e-05, + "loss": 0.0478, + "num_input_tokens_seen": 16546576, + "step": 78405 + }, + { + "epoch": 8.625962596259626, + "grad_norm": 0.15153314173221588, + "learning_rate": 3.506526208021668e-05, + "loss": 0.021, + "num_input_tokens_seen": 16547568, + "step": 78410 + }, + { + "epoch": 8.626512651265127, + "grad_norm": 0.023385033011436462, + "learning_rate": 3.506306508048167e-05, + "loss": 0.0959, + "num_input_tokens_seen": 16548592, + "step": 78415 + }, + { + "epoch": 8.627062706270626, + "grad_norm": 0.2700863480567932, + "learning_rate": 3.506086798800022e-05, + "loss": 0.0585, + "num_input_tokens_seen": 16549680, + "step": 78420 + }, + { + "epoch": 8.627612761276128, + "grad_norm": 1.2422236204147339, + "learning_rate": 3.5058670802792574e-05, + "loss": 0.0644, + "num_input_tokens_seen": 16550736, + "step": 78425 + }, + { + "epoch": 8.628162816281629, + "grad_norm": 0.0670536458492279, + "learning_rate": 3.505647352487899e-05, + "loss": 0.0101, + "num_input_tokens_seen": 16551888, + "step": 78430 + }, + { + "epoch": 8.628712871287128, + "grad_norm": 0.09925433248281479, + "learning_rate": 3.505427615427971e-05, + "loss": 0.037, + "num_input_tokens_seen": 16552912, + "step": 78435 + }, + { + "epoch": 8.629262926292629, + "grad_norm": 0.19554001092910767, + "learning_rate": 3.5052078691015e-05, + "loss": 0.071, + "num_input_tokens_seen": 16553968, + "step": 78440 + }, + { + "epoch": 8.62981298129813, + "grad_norm": 1.0081641674041748, + "learning_rate": 3.504988113510509e-05, + "loss": 0.0465, + "num_input_tokens_seen": 16554992, + "step": 78445 + }, + { + "epoch": 8.630363036303631, + "grad_norm": 1.2460455894470215, + "learning_rate": 3.504768348657026e-05, + "loss": 0.0355, + "num_input_tokens_seen": 16556048, + "step": 78450 + }, + { + "epoch": 8.63091309130913, + "grad_norm": 0.012524005956947803, + "learning_rate": 3.504548574543075e-05, + "loss": 0.0124, + "num_input_tokens_seen": 16557168, + "step": 78455 + }, + { + "epoch": 8.631463146314632, + "grad_norm": 0.04408273100852966, + "learning_rate": 3.5043287911706815e-05, + "loss": 0.1363, + "num_input_tokens_seen": 16558192, + "step": 78460 + }, + { + "epoch": 8.632013201320133, + "grad_norm": 0.09639880806207657, + "learning_rate": 3.504108998541872e-05, + "loss": 0.088, + "num_input_tokens_seen": 16559280, + "step": 78465 + }, + { + "epoch": 8.632563256325632, + "grad_norm": 1.3995401859283447, + "learning_rate": 3.5038891966586704e-05, + "loss": 0.0165, + "num_input_tokens_seen": 16560272, + "step": 78470 + }, + { + "epoch": 8.633113311331133, + "grad_norm": 0.1842757910490036, + "learning_rate": 3.503669385523105e-05, + "loss": 0.0104, + "num_input_tokens_seen": 16561264, + "step": 78475 + }, + { + "epoch": 8.633663366336634, + "grad_norm": 0.6056830286979675, + "learning_rate": 3.5034495651372e-05, + "loss": 0.1514, + "num_input_tokens_seen": 16562288, + "step": 78480 + }, + { + "epoch": 8.634213421342134, + "grad_norm": 0.8340206742286682, + "learning_rate": 3.5032297355029826e-05, + "loss": 0.0731, + "num_input_tokens_seen": 16563376, + "step": 78485 + }, + { + "epoch": 8.634763476347635, + "grad_norm": 0.27271562814712524, + "learning_rate": 3.503009896622478e-05, + "loss": 0.0104, + "num_input_tokens_seen": 16564336, + "step": 78490 + }, + { + "epoch": 8.635313531353136, + "grad_norm": 0.13603055477142334, + "learning_rate": 3.5027900484977114e-05, + "loss": 0.0114, + "num_input_tokens_seen": 16565296, + "step": 78495 + }, + { + "epoch": 8.635863586358635, + "grad_norm": 0.008969253860414028, + "learning_rate": 3.502570191130711e-05, + "loss": 0.0863, + "num_input_tokens_seen": 16566320, + "step": 78500 + }, + { + "epoch": 8.636413641364136, + "grad_norm": 1.0562573671340942, + "learning_rate": 3.5023503245235014e-05, + "loss": 0.0481, + "num_input_tokens_seen": 16567376, + "step": 78505 + }, + { + "epoch": 8.636963696369637, + "grad_norm": 0.20862556993961334, + "learning_rate": 3.5021304486781114e-05, + "loss": 0.0081, + "num_input_tokens_seen": 16568400, + "step": 78510 + }, + { + "epoch": 8.637513751375138, + "grad_norm": 0.7637332081794739, + "learning_rate": 3.5019105635965644e-05, + "loss": 0.0195, + "num_input_tokens_seen": 16569488, + "step": 78515 + }, + { + "epoch": 8.638063806380638, + "grad_norm": 0.15425778925418854, + "learning_rate": 3.501690669280889e-05, + "loss": 0.0378, + "num_input_tokens_seen": 16570608, + "step": 78520 + }, + { + "epoch": 8.638613861386139, + "grad_norm": 0.11966682225465775, + "learning_rate": 3.501470765733111e-05, + "loss": 0.0252, + "num_input_tokens_seen": 16571664, + "step": 78525 + }, + { + "epoch": 8.63916391639164, + "grad_norm": 0.038839567452669144, + "learning_rate": 3.501250852955258e-05, + "loss": 0.0455, + "num_input_tokens_seen": 16572688, + "step": 78530 + }, + { + "epoch": 8.63971397139714, + "grad_norm": 0.01513929944485426, + "learning_rate": 3.501030930949357e-05, + "loss": 0.0162, + "num_input_tokens_seen": 16573776, + "step": 78535 + }, + { + "epoch": 8.64026402640264, + "grad_norm": 0.09593411535024643, + "learning_rate": 3.5008109997174334e-05, + "loss": 0.0888, + "num_input_tokens_seen": 16574896, + "step": 78540 + }, + { + "epoch": 8.640814081408141, + "grad_norm": 0.3781953752040863, + "learning_rate": 3.500591059261515e-05, + "loss": 0.0694, + "num_input_tokens_seen": 16575952, + "step": 78545 + }, + { + "epoch": 8.64136413641364, + "grad_norm": 0.06977799534797668, + "learning_rate": 3.500371109583629e-05, + "loss": 0.0078, + "num_input_tokens_seen": 16577008, + "step": 78550 + }, + { + "epoch": 8.641914191419142, + "grad_norm": 0.039735570549964905, + "learning_rate": 3.5001511506858034e-05, + "loss": 0.0138, + "num_input_tokens_seen": 16578032, + "step": 78555 + }, + { + "epoch": 8.642464246424643, + "grad_norm": 0.036711450666189194, + "learning_rate": 3.499931182570063e-05, + "loss": 0.0069, + "num_input_tokens_seen": 16579056, + "step": 78560 + }, + { + "epoch": 8.643014301430142, + "grad_norm": 0.0283210389316082, + "learning_rate": 3.499711205238437e-05, + "loss": 0.006, + "num_input_tokens_seen": 16580016, + "step": 78565 + }, + { + "epoch": 8.643564356435643, + "grad_norm": 0.2149711698293686, + "learning_rate": 3.499491218692953e-05, + "loss": 0.0587, + "num_input_tokens_seen": 16581104, + "step": 78570 + }, + { + "epoch": 8.644114411441144, + "grad_norm": 0.07754281163215637, + "learning_rate": 3.4992712229356384e-05, + "loss": 0.0205, + "num_input_tokens_seen": 16582192, + "step": 78575 + }, + { + "epoch": 8.644664466446645, + "grad_norm": 0.607178270816803, + "learning_rate": 3.49905121796852e-05, + "loss": 0.0475, + "num_input_tokens_seen": 16583248, + "step": 78580 + }, + { + "epoch": 8.645214521452145, + "grad_norm": 0.031443703919649124, + "learning_rate": 3.4988312037936266e-05, + "loss": 0.0038, + "num_input_tokens_seen": 16584272, + "step": 78585 + }, + { + "epoch": 8.645764576457646, + "grad_norm": 0.965370774269104, + "learning_rate": 3.498611180412984e-05, + "loss": 0.058, + "num_input_tokens_seen": 16585360, + "step": 78590 + }, + { + "epoch": 8.646314631463147, + "grad_norm": 0.7136552333831787, + "learning_rate": 3.4983911478286226e-05, + "loss": 0.052, + "num_input_tokens_seen": 16586416, + "step": 78595 + }, + { + "epoch": 8.646864686468646, + "grad_norm": 0.6550730466842651, + "learning_rate": 3.4981711060425685e-05, + "loss": 0.0418, + "num_input_tokens_seen": 16587472, + "step": 78600 + }, + { + "epoch": 8.647414741474147, + "grad_norm": 0.010750534012913704, + "learning_rate": 3.49795105505685e-05, + "loss": 0.0086, + "num_input_tokens_seen": 16588592, + "step": 78605 + }, + { + "epoch": 8.647964796479648, + "grad_norm": 1.492231011390686, + "learning_rate": 3.497730994873496e-05, + "loss": 0.0867, + "num_input_tokens_seen": 16589680, + "step": 78610 + }, + { + "epoch": 8.648514851485148, + "grad_norm": 0.17875608801841736, + "learning_rate": 3.497510925494534e-05, + "loss": 0.0499, + "num_input_tokens_seen": 16590736, + "step": 78615 + }, + { + "epoch": 8.649064906490649, + "grad_norm": 0.09278911352157593, + "learning_rate": 3.497290846921992e-05, + "loss": 0.0255, + "num_input_tokens_seen": 16591792, + "step": 78620 + }, + { + "epoch": 8.64961496149615, + "grad_norm": 0.020516324788331985, + "learning_rate": 3.497070759157899e-05, + "loss": 0.0293, + "num_input_tokens_seen": 16592848, + "step": 78625 + }, + { + "epoch": 8.65016501650165, + "grad_norm": 0.10616154223680496, + "learning_rate": 3.4968506622042835e-05, + "loss": 0.0086, + "num_input_tokens_seen": 16593840, + "step": 78630 + }, + { + "epoch": 8.65071507150715, + "grad_norm": 0.04969535395503044, + "learning_rate": 3.496630556063174e-05, + "loss": 0.005, + "num_input_tokens_seen": 16594928, + "step": 78635 + }, + { + "epoch": 8.651265126512651, + "grad_norm": 0.03520430997014046, + "learning_rate": 3.4964104407365985e-05, + "loss": 0.0032, + "num_input_tokens_seen": 16595952, + "step": 78640 + }, + { + "epoch": 8.651815181518153, + "grad_norm": 1.0291013717651367, + "learning_rate": 3.496190316226586e-05, + "loss": 0.0836, + "num_input_tokens_seen": 16596976, + "step": 78645 + }, + { + "epoch": 8.652365236523652, + "grad_norm": 0.08477122336626053, + "learning_rate": 3.495970182535166e-05, + "loss": 0.0416, + "num_input_tokens_seen": 16598000, + "step": 78650 + }, + { + "epoch": 8.652915291529153, + "grad_norm": 0.6397399306297302, + "learning_rate": 3.495750039664366e-05, + "loss": 0.0222, + "num_input_tokens_seen": 16599056, + "step": 78655 + }, + { + "epoch": 8.653465346534654, + "grad_norm": 0.0649985522031784, + "learning_rate": 3.4955298876162155e-05, + "loss": 0.1107, + "num_input_tokens_seen": 16600208, + "step": 78660 + }, + { + "epoch": 8.654015401540153, + "grad_norm": 0.010206460021436214, + "learning_rate": 3.495309726392745e-05, + "loss": 0.1343, + "num_input_tokens_seen": 16601264, + "step": 78665 + }, + { + "epoch": 8.654565456545654, + "grad_norm": 0.0551985464990139, + "learning_rate": 3.495089555995981e-05, + "loss": 0.0327, + "num_input_tokens_seen": 16602352, + "step": 78670 + }, + { + "epoch": 8.655115511551156, + "grad_norm": 0.11286740005016327, + "learning_rate": 3.494869376427955e-05, + "loss": 0.0762, + "num_input_tokens_seen": 16603472, + "step": 78675 + }, + { + "epoch": 8.655665566556655, + "grad_norm": 0.12951326370239258, + "learning_rate": 3.494649187690695e-05, + "loss": 0.0622, + "num_input_tokens_seen": 16604560, + "step": 78680 + }, + { + "epoch": 8.656215621562156, + "grad_norm": 0.07008267939090729, + "learning_rate": 3.494428989786231e-05, + "loss": 0.0229, + "num_input_tokens_seen": 16605584, + "step": 78685 + }, + { + "epoch": 8.656765676567657, + "grad_norm": 0.06388085335493088, + "learning_rate": 3.4942087827165915e-05, + "loss": 0.0051, + "num_input_tokens_seen": 16606608, + "step": 78690 + }, + { + "epoch": 8.657315731573158, + "grad_norm": 0.02611161395907402, + "learning_rate": 3.493988566483807e-05, + "loss": 0.006, + "num_input_tokens_seen": 16607664, + "step": 78695 + }, + { + "epoch": 8.657865786578657, + "grad_norm": 0.5737731456756592, + "learning_rate": 3.4937683410899074e-05, + "loss": 0.0794, + "num_input_tokens_seen": 16608720, + "step": 78700 + }, + { + "epoch": 8.658415841584159, + "grad_norm": 0.05477001518011093, + "learning_rate": 3.4935481065369204e-05, + "loss": 0.0715, + "num_input_tokens_seen": 16609744, + "step": 78705 + }, + { + "epoch": 8.65896589658966, + "grad_norm": 0.014843503013253212, + "learning_rate": 3.493327862826879e-05, + "loss": 0.011, + "num_input_tokens_seen": 16610800, + "step": 78710 + }, + { + "epoch": 8.659515951595159, + "grad_norm": 0.17439988255500793, + "learning_rate": 3.4931076099618104e-05, + "loss": 0.0298, + "num_input_tokens_seen": 16611792, + "step": 78715 + }, + { + "epoch": 8.66006600660066, + "grad_norm": 0.023743534460663795, + "learning_rate": 3.492887347943745e-05, + "loss": 0.0724, + "num_input_tokens_seen": 16612784, + "step": 78720 + }, + { + "epoch": 8.660616061606161, + "grad_norm": 1.6288949251174927, + "learning_rate": 3.492667076774715e-05, + "loss": 0.0323, + "num_input_tokens_seen": 16613840, + "step": 78725 + }, + { + "epoch": 8.66116611661166, + "grad_norm": 0.019682656973600388, + "learning_rate": 3.492446796456747e-05, + "loss": 0.0109, + "num_input_tokens_seen": 16614864, + "step": 78730 + }, + { + "epoch": 8.661716171617162, + "grad_norm": 0.06819703429937363, + "learning_rate": 3.492226506991874e-05, + "loss": 0.0215, + "num_input_tokens_seen": 16615920, + "step": 78735 + }, + { + "epoch": 8.662266226622663, + "grad_norm": 0.021941274404525757, + "learning_rate": 3.4920062083821256e-05, + "loss": 0.0475, + "num_input_tokens_seen": 16616976, + "step": 78740 + }, + { + "epoch": 8.662816281628164, + "grad_norm": 0.011086814105510712, + "learning_rate": 3.491785900629532e-05, + "loss": 0.0063, + "num_input_tokens_seen": 16618000, + "step": 78745 + }, + { + "epoch": 8.663366336633663, + "grad_norm": 0.190848708152771, + "learning_rate": 3.491565583736124e-05, + "loss": 0.007, + "num_input_tokens_seen": 16619024, + "step": 78750 + }, + { + "epoch": 8.663916391639164, + "grad_norm": 0.08017883449792862, + "learning_rate": 3.491345257703931e-05, + "loss": 0.0064, + "num_input_tokens_seen": 16620048, + "step": 78755 + }, + { + "epoch": 8.664466446644665, + "grad_norm": 0.9512629508972168, + "learning_rate": 3.491124922534985e-05, + "loss": 0.0255, + "num_input_tokens_seen": 16621136, + "step": 78760 + }, + { + "epoch": 8.665016501650165, + "grad_norm": 0.5579646825790405, + "learning_rate": 3.4909045782313156e-05, + "loss": 0.0156, + "num_input_tokens_seen": 16622192, + "step": 78765 + }, + { + "epoch": 8.665566556655666, + "grad_norm": 0.10388471186161041, + "learning_rate": 3.4906842247949547e-05, + "loss": 0.0358, + "num_input_tokens_seen": 16623152, + "step": 78770 + }, + { + "epoch": 8.666116611661167, + "grad_norm": 0.013476276770234108, + "learning_rate": 3.4904638622279316e-05, + "loss": 0.0023, + "num_input_tokens_seen": 16624240, + "step": 78775 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.13050812482833862, + "learning_rate": 3.490243490532279e-05, + "loss": 0.0038, + "num_input_tokens_seen": 16625296, + "step": 78780 + }, + { + "epoch": 8.667216721672167, + "grad_norm": 0.7238597869873047, + "learning_rate": 3.4900231097100286e-05, + "loss": 0.0234, + "num_input_tokens_seen": 16626448, + "step": 78785 + }, + { + "epoch": 8.667766776677668, + "grad_norm": 0.5714496374130249, + "learning_rate": 3.489802719763209e-05, + "loss": 0.0245, + "num_input_tokens_seen": 16627408, + "step": 78790 + }, + { + "epoch": 8.668316831683168, + "grad_norm": 0.04719632863998413, + "learning_rate": 3.4895823206938526e-05, + "loss": 0.0394, + "num_input_tokens_seen": 16628496, + "step": 78795 + }, + { + "epoch": 8.668866886688669, + "grad_norm": 1.4420831203460693, + "learning_rate": 3.489361912503991e-05, + "loss": 0.1456, + "num_input_tokens_seen": 16629488, + "step": 78800 + }, + { + "epoch": 8.66941694169417, + "grad_norm": 0.009065430611371994, + "learning_rate": 3.489141495195655e-05, + "loss": 0.0051, + "num_input_tokens_seen": 16630576, + "step": 78805 + }, + { + "epoch": 8.66996699669967, + "grad_norm": 0.49241840839385986, + "learning_rate": 3.488921068770877e-05, + "loss": 0.0106, + "num_input_tokens_seen": 16631632, + "step": 78810 + }, + { + "epoch": 8.67051705170517, + "grad_norm": 0.013541734777390957, + "learning_rate": 3.4887006332316886e-05, + "loss": 0.0135, + "num_input_tokens_seen": 16632720, + "step": 78815 + }, + { + "epoch": 8.671067106710671, + "grad_norm": 0.09843797981739044, + "learning_rate": 3.48848018858012e-05, + "loss": 0.0739, + "num_input_tokens_seen": 16633808, + "step": 78820 + }, + { + "epoch": 8.671617161716172, + "grad_norm": 0.6365883350372314, + "learning_rate": 3.488259734818204e-05, + "loss": 0.0498, + "num_input_tokens_seen": 16634864, + "step": 78825 + }, + { + "epoch": 8.672167216721672, + "grad_norm": 0.17181219160556793, + "learning_rate": 3.488039271947972e-05, + "loss": 0.0464, + "num_input_tokens_seen": 16635920, + "step": 78830 + }, + { + "epoch": 8.672717271727173, + "grad_norm": 0.0485805869102478, + "learning_rate": 3.4878187999714565e-05, + "loss": 0.0485, + "num_input_tokens_seen": 16637008, + "step": 78835 + }, + { + "epoch": 8.673267326732674, + "grad_norm": 0.010898067615926266, + "learning_rate": 3.48759831889069e-05, + "loss": 0.0367, + "num_input_tokens_seen": 16638096, + "step": 78840 + }, + { + "epoch": 8.673817381738173, + "grad_norm": 0.007861215621232986, + "learning_rate": 3.487377828707703e-05, + "loss": 0.0754, + "num_input_tokens_seen": 16639152, + "step": 78845 + }, + { + "epoch": 8.674367436743674, + "grad_norm": 0.02044663205742836, + "learning_rate": 3.487157329424527e-05, + "loss": 0.0026, + "num_input_tokens_seen": 16640208, + "step": 78850 + }, + { + "epoch": 8.674917491749175, + "grad_norm": 1.1802982091903687, + "learning_rate": 3.486936821043197e-05, + "loss": 0.0696, + "num_input_tokens_seen": 16641232, + "step": 78855 + }, + { + "epoch": 8.675467546754675, + "grad_norm": 0.5599712133407593, + "learning_rate": 3.4867163035657433e-05, + "loss": 0.1046, + "num_input_tokens_seen": 16642352, + "step": 78860 + }, + { + "epoch": 8.676017601760176, + "grad_norm": 0.7382585406303406, + "learning_rate": 3.4864957769942e-05, + "loss": 0.0476, + "num_input_tokens_seen": 16643376, + "step": 78865 + }, + { + "epoch": 8.676567656765677, + "grad_norm": 0.04791708663105965, + "learning_rate": 3.4862752413305976e-05, + "loss": 0.005, + "num_input_tokens_seen": 16644400, + "step": 78870 + }, + { + "epoch": 8.677117711771178, + "grad_norm": 0.323231965303421, + "learning_rate": 3.4860546965769696e-05, + "loss": 0.0148, + "num_input_tokens_seen": 16645456, + "step": 78875 + }, + { + "epoch": 8.677667766776677, + "grad_norm": 0.011272959411144257, + "learning_rate": 3.485834142735348e-05, + "loss": 0.0088, + "num_input_tokens_seen": 16646544, + "step": 78880 + }, + { + "epoch": 8.678217821782178, + "grad_norm": 0.047154221683740616, + "learning_rate": 3.485613579807767e-05, + "loss": 0.0605, + "num_input_tokens_seen": 16647600, + "step": 78885 + }, + { + "epoch": 8.67876787678768, + "grad_norm": 0.13175523281097412, + "learning_rate": 3.485393007796259e-05, + "loss": 0.1011, + "num_input_tokens_seen": 16648720, + "step": 78890 + }, + { + "epoch": 8.679317931793179, + "grad_norm": 0.3664989769458771, + "learning_rate": 3.485172426702855e-05, + "loss": 0.0634, + "num_input_tokens_seen": 16649744, + "step": 78895 + }, + { + "epoch": 8.67986798679868, + "grad_norm": 0.0439513735473156, + "learning_rate": 3.484951836529591e-05, + "loss": 0.0228, + "num_input_tokens_seen": 16650768, + "step": 78900 + }, + { + "epoch": 8.680418041804181, + "grad_norm": 0.05381641909480095, + "learning_rate": 3.484731237278498e-05, + "loss": 0.0091, + "num_input_tokens_seen": 16651888, + "step": 78905 + }, + { + "epoch": 8.68096809680968, + "grad_norm": 0.1685577929019928, + "learning_rate": 3.484510628951609e-05, + "loss": 0.0108, + "num_input_tokens_seen": 16652880, + "step": 78910 + }, + { + "epoch": 8.681518151815181, + "grad_norm": 0.7953194379806519, + "learning_rate": 3.484290011550959e-05, + "loss": 0.1955, + "num_input_tokens_seen": 16653968, + "step": 78915 + }, + { + "epoch": 8.682068206820682, + "grad_norm": 0.22570951282978058, + "learning_rate": 3.48406938507858e-05, + "loss": 0.0393, + "num_input_tokens_seen": 16655024, + "step": 78920 + }, + { + "epoch": 8.682618261826182, + "grad_norm": 0.8269532918930054, + "learning_rate": 3.4838487495365054e-05, + "loss": 0.018, + "num_input_tokens_seen": 16656080, + "step": 78925 + }, + { + "epoch": 8.683168316831683, + "grad_norm": 0.02605700120329857, + "learning_rate": 3.483628104926769e-05, + "loss": 0.004, + "num_input_tokens_seen": 16657136, + "step": 78930 + }, + { + "epoch": 8.683718371837184, + "grad_norm": 0.017850937321782112, + "learning_rate": 3.4834074512514044e-05, + "loss": 0.0832, + "num_input_tokens_seen": 16658256, + "step": 78935 + }, + { + "epoch": 8.684268426842685, + "grad_norm": 0.11007342487573624, + "learning_rate": 3.483186788512445e-05, + "loss": 0.0189, + "num_input_tokens_seen": 16659280, + "step": 78940 + }, + { + "epoch": 8.684818481848184, + "grad_norm": 0.03362999111413956, + "learning_rate": 3.482966116711925e-05, + "loss": 0.0132, + "num_input_tokens_seen": 16660400, + "step": 78945 + }, + { + "epoch": 8.685368536853685, + "grad_norm": 1.3674026727676392, + "learning_rate": 3.482745435851878e-05, + "loss": 0.072, + "num_input_tokens_seen": 16661456, + "step": 78950 + }, + { + "epoch": 8.685918591859187, + "grad_norm": 1.710021734237671, + "learning_rate": 3.482524745934338e-05, + "loss": 0.0767, + "num_input_tokens_seen": 16662480, + "step": 78955 + }, + { + "epoch": 8.686468646864686, + "grad_norm": 0.1983146369457245, + "learning_rate": 3.482304046961339e-05, + "loss": 0.0051, + "num_input_tokens_seen": 16663568, + "step": 78960 + }, + { + "epoch": 8.687018701870187, + "grad_norm": 0.01988956891000271, + "learning_rate": 3.482083338934914e-05, + "loss": 0.013, + "num_input_tokens_seen": 16664688, + "step": 78965 + }, + { + "epoch": 8.687568756875688, + "grad_norm": 0.33407533168792725, + "learning_rate": 3.4818626218570994e-05, + "loss": 0.0417, + "num_input_tokens_seen": 16665808, + "step": 78970 + }, + { + "epoch": 8.688118811881187, + "grad_norm": 0.1964011788368225, + "learning_rate": 3.481641895729927e-05, + "loss": 0.02, + "num_input_tokens_seen": 16666896, + "step": 78975 + }, + { + "epoch": 8.688668866886688, + "grad_norm": 0.012853553518652916, + "learning_rate": 3.481421160555433e-05, + "loss": 0.0134, + "num_input_tokens_seen": 16667888, + "step": 78980 + }, + { + "epoch": 8.68921892189219, + "grad_norm": 0.013552406802773476, + "learning_rate": 3.4812004163356507e-05, + "loss": 0.0116, + "num_input_tokens_seen": 16668912, + "step": 78985 + }, + { + "epoch": 8.689768976897689, + "grad_norm": 5.797325134277344, + "learning_rate": 3.480979663072615e-05, + "loss": 0.0906, + "num_input_tokens_seen": 16670000, + "step": 78990 + }, + { + "epoch": 8.69031903190319, + "grad_norm": 0.26759660243988037, + "learning_rate": 3.480758900768361e-05, + "loss": 0.0423, + "num_input_tokens_seen": 16671152, + "step": 78995 + }, + { + "epoch": 8.690869086908691, + "grad_norm": 0.09494911879301071, + "learning_rate": 3.480538129424923e-05, + "loss": 0.0291, + "num_input_tokens_seen": 16672272, + "step": 79000 + }, + { + "epoch": 8.691419141914192, + "grad_norm": 0.6922582983970642, + "learning_rate": 3.4803173490443344e-05, + "loss": 0.2019, + "num_input_tokens_seen": 16673328, + "step": 79005 + }, + { + "epoch": 8.691969196919691, + "grad_norm": 0.011613857932388783, + "learning_rate": 3.4800965596286325e-05, + "loss": 0.082, + "num_input_tokens_seen": 16674416, + "step": 79010 + }, + { + "epoch": 8.692519251925193, + "grad_norm": 0.04940392076969147, + "learning_rate": 3.47987576117985e-05, + "loss": 0.0345, + "num_input_tokens_seen": 16675504, + "step": 79015 + }, + { + "epoch": 8.693069306930694, + "grad_norm": 1.2830207347869873, + "learning_rate": 3.479654953700023e-05, + "loss": 0.0797, + "num_input_tokens_seen": 16676528, + "step": 79020 + }, + { + "epoch": 8.693619361936193, + "grad_norm": 0.19907186925411224, + "learning_rate": 3.4794341371911864e-05, + "loss": 0.0301, + "num_input_tokens_seen": 16677552, + "step": 79025 + }, + { + "epoch": 8.694169416941694, + "grad_norm": 0.02348397485911846, + "learning_rate": 3.479213311655376e-05, + "loss": 0.0028, + "num_input_tokens_seen": 16678608, + "step": 79030 + }, + { + "epoch": 8.694719471947195, + "grad_norm": 0.16661928594112396, + "learning_rate": 3.478992477094626e-05, + "loss": 0.0074, + "num_input_tokens_seen": 16679600, + "step": 79035 + }, + { + "epoch": 8.695269526952695, + "grad_norm": 0.7956116199493408, + "learning_rate": 3.478771633510971e-05, + "loss": 0.0317, + "num_input_tokens_seen": 16680720, + "step": 79040 + }, + { + "epoch": 8.695819581958196, + "grad_norm": 1.1302964687347412, + "learning_rate": 3.4785507809064485e-05, + "loss": 0.0877, + "num_input_tokens_seen": 16681840, + "step": 79045 + }, + { + "epoch": 8.696369636963697, + "grad_norm": 0.5824523568153381, + "learning_rate": 3.478329919283093e-05, + "loss": 0.0212, + "num_input_tokens_seen": 16682896, + "step": 79050 + }, + { + "epoch": 8.696919691969196, + "grad_norm": 0.9870553016662598, + "learning_rate": 3.47810904864294e-05, + "loss": 0.0877, + "num_input_tokens_seen": 16683984, + "step": 79055 + }, + { + "epoch": 8.697469746974697, + "grad_norm": 0.22328908741474152, + "learning_rate": 3.477888168988025e-05, + "loss": 0.0328, + "num_input_tokens_seen": 16685040, + "step": 79060 + }, + { + "epoch": 8.698019801980198, + "grad_norm": 0.9906550049781799, + "learning_rate": 3.477667280320385e-05, + "loss": 0.0917, + "num_input_tokens_seen": 16686096, + "step": 79065 + }, + { + "epoch": 8.6985698569857, + "grad_norm": 0.09921608865261078, + "learning_rate": 3.477446382642053e-05, + "loss": 0.0233, + "num_input_tokens_seen": 16687216, + "step": 79070 + }, + { + "epoch": 8.699119911991199, + "grad_norm": 0.1693081259727478, + "learning_rate": 3.477225475955068e-05, + "loss": 0.0791, + "num_input_tokens_seen": 16688272, + "step": 79075 + }, + { + "epoch": 8.6996699669967, + "grad_norm": 0.09053883701562881, + "learning_rate": 3.477004560261464e-05, + "loss": 0.1337, + "num_input_tokens_seen": 16689392, + "step": 79080 + }, + { + "epoch": 8.7002200220022, + "grad_norm": 0.53180330991745, + "learning_rate": 3.476783635563279e-05, + "loss": 0.0446, + "num_input_tokens_seen": 16690480, + "step": 79085 + }, + { + "epoch": 8.7007700770077, + "grad_norm": 0.9166305661201477, + "learning_rate": 3.476562701862547e-05, + "loss": 0.057, + "num_input_tokens_seen": 16691472, + "step": 79090 + }, + { + "epoch": 8.701320132013201, + "grad_norm": 1.8847553730010986, + "learning_rate": 3.476341759161305e-05, + "loss": 0.1249, + "num_input_tokens_seen": 16692528, + "step": 79095 + }, + { + "epoch": 8.701870187018702, + "grad_norm": 0.04198358580470085, + "learning_rate": 3.4761208074615895e-05, + "loss": 0.0108, + "num_input_tokens_seen": 16693552, + "step": 79100 + }, + { + "epoch": 8.702420242024202, + "grad_norm": 0.02908283658325672, + "learning_rate": 3.475899846765438e-05, + "loss": 0.0582, + "num_input_tokens_seen": 16694608, + "step": 79105 + }, + { + "epoch": 8.702970297029703, + "grad_norm": 1.0981558561325073, + "learning_rate": 3.475678877074885e-05, + "loss": 0.0363, + "num_input_tokens_seen": 16695760, + "step": 79110 + }, + { + "epoch": 8.703520352035204, + "grad_norm": 0.08816169202327728, + "learning_rate": 3.475457898391968e-05, + "loss": 0.0446, + "num_input_tokens_seen": 16696944, + "step": 79115 + }, + { + "epoch": 8.704070407040705, + "grad_norm": 0.022171134129166603, + "learning_rate": 3.4752369107187233e-05, + "loss": 0.0405, + "num_input_tokens_seen": 16697968, + "step": 79120 + }, + { + "epoch": 8.704620462046204, + "grad_norm": 0.3746749460697174, + "learning_rate": 3.4750159140571886e-05, + "loss": 0.1307, + "num_input_tokens_seen": 16699120, + "step": 79125 + }, + { + "epoch": 8.705170517051705, + "grad_norm": 0.07881952822208405, + "learning_rate": 3.4747949084094e-05, + "loss": 0.0031, + "num_input_tokens_seen": 16700144, + "step": 79130 + }, + { + "epoch": 8.705720572057206, + "grad_norm": 0.03822803869843483, + "learning_rate": 3.474573893777394e-05, + "loss": 0.0252, + "num_input_tokens_seen": 16701200, + "step": 79135 + }, + { + "epoch": 8.706270627062706, + "grad_norm": 0.5521162152290344, + "learning_rate": 3.4743528701632085e-05, + "loss": 0.0235, + "num_input_tokens_seen": 16702288, + "step": 79140 + }, + { + "epoch": 8.706820682068207, + "grad_norm": 1.035874366760254, + "learning_rate": 3.47413183756888e-05, + "loss": 0.0441, + "num_input_tokens_seen": 16703312, + "step": 79145 + }, + { + "epoch": 8.707370737073708, + "grad_norm": 0.11071349680423737, + "learning_rate": 3.473910795996446e-05, + "loss": 0.0233, + "num_input_tokens_seen": 16704400, + "step": 79150 + }, + { + "epoch": 8.707920792079207, + "grad_norm": 0.015691401436924934, + "learning_rate": 3.473689745447943e-05, + "loss": 0.0776, + "num_input_tokens_seen": 16705456, + "step": 79155 + }, + { + "epoch": 8.708470847084708, + "grad_norm": 0.06740128248929977, + "learning_rate": 3.4734686859254094e-05, + "loss": 0.0847, + "num_input_tokens_seen": 16706512, + "step": 79160 + }, + { + "epoch": 8.70902090209021, + "grad_norm": 0.3363695740699768, + "learning_rate": 3.473247617430882e-05, + "loss": 0.0232, + "num_input_tokens_seen": 16707568, + "step": 79165 + }, + { + "epoch": 8.70957095709571, + "grad_norm": 0.6215263605117798, + "learning_rate": 3.473026539966398e-05, + "loss": 0.0256, + "num_input_tokens_seen": 16708688, + "step": 79170 + }, + { + "epoch": 8.71012101210121, + "grad_norm": 1.2000178098678589, + "learning_rate": 3.472805453533995e-05, + "loss": 0.062, + "num_input_tokens_seen": 16709712, + "step": 79175 + }, + { + "epoch": 8.710671067106711, + "grad_norm": 0.015046716667711735, + "learning_rate": 3.472584358135712e-05, + "loss": 0.005, + "num_input_tokens_seen": 16710736, + "step": 79180 + }, + { + "epoch": 8.711221122112212, + "grad_norm": 0.025227192789316177, + "learning_rate": 3.4723632537735846e-05, + "loss": 0.0422, + "num_input_tokens_seen": 16711824, + "step": 79185 + }, + { + "epoch": 8.711771177117711, + "grad_norm": 0.09406048059463501, + "learning_rate": 3.4721421404496525e-05, + "loss": 0.0102, + "num_input_tokens_seen": 16712848, + "step": 79190 + }, + { + "epoch": 8.712321232123212, + "grad_norm": 0.5746518969535828, + "learning_rate": 3.471921018165952e-05, + "loss": 0.0321, + "num_input_tokens_seen": 16713936, + "step": 79195 + }, + { + "epoch": 8.712871287128714, + "grad_norm": 0.06869913637638092, + "learning_rate": 3.471699886924522e-05, + "loss": 0.0045, + "num_input_tokens_seen": 16714992, + "step": 79200 + }, + { + "epoch": 8.713421342134213, + "grad_norm": 0.9256543517112732, + "learning_rate": 3.471478746727401e-05, + "loss": 0.0856, + "num_input_tokens_seen": 16715984, + "step": 79205 + }, + { + "epoch": 8.713971397139714, + "grad_norm": 0.3366243541240692, + "learning_rate": 3.471257597576626e-05, + "loss": 0.0479, + "num_input_tokens_seen": 16717040, + "step": 79210 + }, + { + "epoch": 8.714521452145215, + "grad_norm": 0.03208545967936516, + "learning_rate": 3.471036439474236e-05, + "loss": 0.0083, + "num_input_tokens_seen": 16718096, + "step": 79215 + }, + { + "epoch": 8.715071507150714, + "grad_norm": 0.014088563621044159, + "learning_rate": 3.470815272422269e-05, + "loss": 0.0464, + "num_input_tokens_seen": 16719184, + "step": 79220 + }, + { + "epoch": 8.715621562156215, + "grad_norm": 0.07267727702856064, + "learning_rate": 3.470594096422764e-05, + "loss": 0.014, + "num_input_tokens_seen": 16720208, + "step": 79225 + }, + { + "epoch": 8.716171617161717, + "grad_norm": 0.022342145442962646, + "learning_rate": 3.470372911477758e-05, + "loss": 0.0064, + "num_input_tokens_seen": 16721200, + "step": 79230 + }, + { + "epoch": 8.716721672167218, + "grad_norm": 0.19777020812034607, + "learning_rate": 3.470151717589291e-05, + "loss": 0.0179, + "num_input_tokens_seen": 16722224, + "step": 79235 + }, + { + "epoch": 8.717271727172717, + "grad_norm": 0.19031959772109985, + "learning_rate": 3.4699305147594016e-05, + "loss": 0.0129, + "num_input_tokens_seen": 16723280, + "step": 79240 + }, + { + "epoch": 8.717821782178218, + "grad_norm": 1.5791432857513428, + "learning_rate": 3.469709302990127e-05, + "loss": 0.0941, + "num_input_tokens_seen": 16724400, + "step": 79245 + }, + { + "epoch": 8.718371837183719, + "grad_norm": 0.03762770816683769, + "learning_rate": 3.469488082283508e-05, + "loss": 0.0455, + "num_input_tokens_seen": 16725456, + "step": 79250 + }, + { + "epoch": 8.718921892189218, + "grad_norm": 0.03422249108552933, + "learning_rate": 3.469266852641582e-05, + "loss": 0.073, + "num_input_tokens_seen": 16726576, + "step": 79255 + }, + { + "epoch": 8.71947194719472, + "grad_norm": 0.014880463480949402, + "learning_rate": 3.4690456140663884e-05, + "loss": 0.1117, + "num_input_tokens_seen": 16727696, + "step": 79260 + }, + { + "epoch": 8.72002200220022, + "grad_norm": 0.006633324082940817, + "learning_rate": 3.468824366559967e-05, + "loss": 0.0052, + "num_input_tokens_seen": 16728720, + "step": 79265 + }, + { + "epoch": 8.72057205720572, + "grad_norm": 0.6552591323852539, + "learning_rate": 3.468603110124356e-05, + "loss": 0.0855, + "num_input_tokens_seen": 16729808, + "step": 79270 + }, + { + "epoch": 8.721122112211221, + "grad_norm": 0.14725786447525024, + "learning_rate": 3.468381844761595e-05, + "loss": 0.0135, + "num_input_tokens_seen": 16730800, + "step": 79275 + }, + { + "epoch": 8.721672167216722, + "grad_norm": 0.06255123764276505, + "learning_rate": 3.468160570473723e-05, + "loss": 0.0858, + "num_input_tokens_seen": 16731856, + "step": 79280 + }, + { + "epoch": 8.722222222222221, + "grad_norm": 0.1486452966928482, + "learning_rate": 3.467939287262779e-05, + "loss": 0.0493, + "num_input_tokens_seen": 16732880, + "step": 79285 + }, + { + "epoch": 8.722772277227723, + "grad_norm": 0.5603165626525879, + "learning_rate": 3.467717995130804e-05, + "loss": 0.0872, + "num_input_tokens_seen": 16733968, + "step": 79290 + }, + { + "epoch": 8.723322332233224, + "grad_norm": 1.1179922819137573, + "learning_rate": 3.467496694079837e-05, + "loss": 0.0949, + "num_input_tokens_seen": 16735024, + "step": 79295 + }, + { + "epoch": 8.723872387238725, + "grad_norm": 0.07518211752176285, + "learning_rate": 3.467275384111916e-05, + "loss": 0.045, + "num_input_tokens_seen": 16735984, + "step": 79300 + }, + { + "epoch": 8.724422442244224, + "grad_norm": 0.7307694554328918, + "learning_rate": 3.4670540652290825e-05, + "loss": 0.0186, + "num_input_tokens_seen": 16736976, + "step": 79305 + }, + { + "epoch": 8.724972497249725, + "grad_norm": 1.0982558727264404, + "learning_rate": 3.466832737433376e-05, + "loss": 0.0134, + "num_input_tokens_seen": 16738128, + "step": 79310 + }, + { + "epoch": 8.725522552255226, + "grad_norm": 0.08018963038921356, + "learning_rate": 3.466611400726835e-05, + "loss": 0.0375, + "num_input_tokens_seen": 16739280, + "step": 79315 + }, + { + "epoch": 8.726072607260726, + "grad_norm": 0.43358564376831055, + "learning_rate": 3.4663900551115026e-05, + "loss": 0.0161, + "num_input_tokens_seen": 16740304, + "step": 79320 + }, + { + "epoch": 8.726622662266227, + "grad_norm": 0.019340185448527336, + "learning_rate": 3.466168700589415e-05, + "loss": 0.0352, + "num_input_tokens_seen": 16741360, + "step": 79325 + }, + { + "epoch": 8.727172717271728, + "grad_norm": 0.018558170646429062, + "learning_rate": 3.4659473371626146e-05, + "loss": 0.0151, + "num_input_tokens_seen": 16742448, + "step": 79330 + }, + { + "epoch": 8.727722772277227, + "grad_norm": 1.068954586982727, + "learning_rate": 3.465725964833141e-05, + "loss": 0.049, + "num_input_tokens_seen": 16743440, + "step": 79335 + }, + { + "epoch": 8.728272827282728, + "grad_norm": 0.33720237016677856, + "learning_rate": 3.465504583603035e-05, + "loss": 0.0075, + "num_input_tokens_seen": 16744464, + "step": 79340 + }, + { + "epoch": 8.72882288228823, + "grad_norm": 0.06909679621458054, + "learning_rate": 3.465283193474336e-05, + "loss": 0.055, + "num_input_tokens_seen": 16745552, + "step": 79345 + }, + { + "epoch": 8.729372937293729, + "grad_norm": 0.06049763783812523, + "learning_rate": 3.4650617944490854e-05, + "loss": 0.0689, + "num_input_tokens_seen": 16746576, + "step": 79350 + }, + { + "epoch": 8.72992299229923, + "grad_norm": 0.09681464731693268, + "learning_rate": 3.4648403865293236e-05, + "loss": 0.0297, + "num_input_tokens_seen": 16747600, + "step": 79355 + }, + { + "epoch": 8.73047304730473, + "grad_norm": 1.1861711740493774, + "learning_rate": 3.4646189697170895e-05, + "loss": 0.0291, + "num_input_tokens_seen": 16748656, + "step": 79360 + }, + { + "epoch": 8.731023102310232, + "grad_norm": 0.03679193556308746, + "learning_rate": 3.464397544014427e-05, + "loss": 0.0139, + "num_input_tokens_seen": 16749744, + "step": 79365 + }, + { + "epoch": 8.731573157315731, + "grad_norm": 0.1241527572274208, + "learning_rate": 3.464176109423374e-05, + "loss": 0.0348, + "num_input_tokens_seen": 16750800, + "step": 79370 + }, + { + "epoch": 8.732123212321232, + "grad_norm": 0.03173863887786865, + "learning_rate": 3.463954665945973e-05, + "loss": 0.0514, + "num_input_tokens_seen": 16751856, + "step": 79375 + }, + { + "epoch": 8.732673267326733, + "grad_norm": 1.0213340520858765, + "learning_rate": 3.463733213584264e-05, + "loss": 0.0387, + "num_input_tokens_seen": 16752944, + "step": 79380 + }, + { + "epoch": 8.733223322332233, + "grad_norm": 0.22884151339530945, + "learning_rate": 3.463511752340289e-05, + "loss": 0.0149, + "num_input_tokens_seen": 16754000, + "step": 79385 + }, + { + "epoch": 8.733773377337734, + "grad_norm": 0.06354113668203354, + "learning_rate": 3.463290282216088e-05, + "loss": 0.0818, + "num_input_tokens_seen": 16754960, + "step": 79390 + }, + { + "epoch": 8.734323432343235, + "grad_norm": 0.041255176067352295, + "learning_rate": 3.463068803213703e-05, + "loss": 0.0232, + "num_input_tokens_seen": 16756048, + "step": 79395 + }, + { + "epoch": 8.734873487348734, + "grad_norm": 0.9586905241012573, + "learning_rate": 3.4628473153351746e-05, + "loss": 0.0458, + "num_input_tokens_seen": 16757136, + "step": 79400 + }, + { + "epoch": 8.735423542354235, + "grad_norm": 0.3413161039352417, + "learning_rate": 3.4626258185825445e-05, + "loss": 0.0923, + "num_input_tokens_seen": 16758224, + "step": 79405 + }, + { + "epoch": 8.735973597359736, + "grad_norm": 0.3983265161514282, + "learning_rate": 3.462404312957854e-05, + "loss": 0.0185, + "num_input_tokens_seen": 16759312, + "step": 79410 + }, + { + "epoch": 8.736523652365236, + "grad_norm": 0.6554847359657288, + "learning_rate": 3.462182798463145e-05, + "loss": 0.0259, + "num_input_tokens_seen": 16760336, + "step": 79415 + }, + { + "epoch": 8.737073707370737, + "grad_norm": 0.0057141827419400215, + "learning_rate": 3.4619612751004594e-05, + "loss": 0.0313, + "num_input_tokens_seen": 16761424, + "step": 79420 + }, + { + "epoch": 8.737623762376238, + "grad_norm": 0.01845364458858967, + "learning_rate": 3.461739742871838e-05, + "loss": 0.0064, + "num_input_tokens_seen": 16762480, + "step": 79425 + }, + { + "epoch": 8.738173817381739, + "grad_norm": 0.9143269658088684, + "learning_rate": 3.4615182017793226e-05, + "loss": 0.0368, + "num_input_tokens_seen": 16763568, + "step": 79430 + }, + { + "epoch": 8.738723872387238, + "grad_norm": 0.17053306102752686, + "learning_rate": 3.4612966518249556e-05, + "loss": 0.0257, + "num_input_tokens_seen": 16764624, + "step": 79435 + }, + { + "epoch": 8.73927392739274, + "grad_norm": 0.09498105943202972, + "learning_rate": 3.4610750930107784e-05, + "loss": 0.0063, + "num_input_tokens_seen": 16765712, + "step": 79440 + }, + { + "epoch": 8.73982398239824, + "grad_norm": 1.8035115003585815, + "learning_rate": 3.460853525338833e-05, + "loss": 0.0801, + "num_input_tokens_seen": 16766800, + "step": 79445 + }, + { + "epoch": 8.74037403740374, + "grad_norm": 0.23883062601089478, + "learning_rate": 3.460631948811162e-05, + "loss": 0.0156, + "num_input_tokens_seen": 16767824, + "step": 79450 + }, + { + "epoch": 8.74092409240924, + "grad_norm": 0.00731825502589345, + "learning_rate": 3.460410363429807e-05, + "loss": 0.0115, + "num_input_tokens_seen": 16768848, + "step": 79455 + }, + { + "epoch": 8.741474147414742, + "grad_norm": 0.018084529787302017, + "learning_rate": 3.460188769196811e-05, + "loss": 0.0043, + "num_input_tokens_seen": 16769840, + "step": 79460 + }, + { + "epoch": 8.742024202420241, + "grad_norm": 0.015592687763273716, + "learning_rate": 3.459967166114216e-05, + "loss": 0.0253, + "num_input_tokens_seen": 16770864, + "step": 79465 + }, + { + "epoch": 8.742574257425742, + "grad_norm": 0.1388789862394333, + "learning_rate": 3.459745554184064e-05, + "loss": 0.0201, + "num_input_tokens_seen": 16771952, + "step": 79470 + }, + { + "epoch": 8.743124312431243, + "grad_norm": 3.459091901779175, + "learning_rate": 3.4595239334083974e-05, + "loss": 0.0364, + "num_input_tokens_seen": 16773040, + "step": 79475 + }, + { + "epoch": 8.743674367436743, + "grad_norm": 0.2517068088054657, + "learning_rate": 3.45930230378926e-05, + "loss": 0.0289, + "num_input_tokens_seen": 16774160, + "step": 79480 + }, + { + "epoch": 8.744224422442244, + "grad_norm": 0.20008915662765503, + "learning_rate": 3.459080665328693e-05, + "loss": 0.0088, + "num_input_tokens_seen": 16775184, + "step": 79485 + }, + { + "epoch": 8.744774477447745, + "grad_norm": 0.17789730429649353, + "learning_rate": 3.4588590180287394e-05, + "loss": 0.0642, + "num_input_tokens_seen": 16776208, + "step": 79490 + }, + { + "epoch": 8.745324532453246, + "grad_norm": 0.036822788417339325, + "learning_rate": 3.458637361891442e-05, + "loss": 0.0061, + "num_input_tokens_seen": 16777296, + "step": 79495 + }, + { + "epoch": 8.745874587458745, + "grad_norm": 0.015024245716631413, + "learning_rate": 3.458415696918846e-05, + "loss": 0.0714, + "num_input_tokens_seen": 16778384, + "step": 79500 + }, + { + "epoch": 8.746424642464246, + "grad_norm": 0.030171792954206467, + "learning_rate": 3.4581940231129905e-05, + "loss": 0.0584, + "num_input_tokens_seen": 16779472, + "step": 79505 + }, + { + "epoch": 8.746974697469748, + "grad_norm": 0.05564305558800697, + "learning_rate": 3.4579723404759213e-05, + "loss": 0.0118, + "num_input_tokens_seen": 16780560, + "step": 79510 + }, + { + "epoch": 8.747524752475247, + "grad_norm": 0.3332679271697998, + "learning_rate": 3.457750649009681e-05, + "loss": 0.0147, + "num_input_tokens_seen": 16781648, + "step": 79515 + }, + { + "epoch": 8.748074807480748, + "grad_norm": 0.07260643690824509, + "learning_rate": 3.4575289487163116e-05, + "loss": 0.0038, + "num_input_tokens_seen": 16782640, + "step": 79520 + }, + { + "epoch": 8.748624862486249, + "grad_norm": 0.036455605179071426, + "learning_rate": 3.457307239597858e-05, + "loss": 0.0258, + "num_input_tokens_seen": 16783664, + "step": 79525 + }, + { + "epoch": 8.749174917491748, + "grad_norm": 0.7441731691360474, + "learning_rate": 3.4570855216563626e-05, + "loss": 0.0234, + "num_input_tokens_seen": 16784688, + "step": 79530 + }, + { + "epoch": 8.74972497249725, + "grad_norm": 0.030433746054768562, + "learning_rate": 3.4568637948938696e-05, + "loss": 0.033, + "num_input_tokens_seen": 16785744, + "step": 79535 + }, + { + "epoch": 8.75027502750275, + "grad_norm": 0.010785712860524654, + "learning_rate": 3.456642059312421e-05, + "loss": 0.0019, + "num_input_tokens_seen": 16786768, + "step": 79540 + }, + { + "epoch": 8.750825082508252, + "grad_norm": 0.18774181604385376, + "learning_rate": 3.456420314914063e-05, + "loss": 0.0077, + "num_input_tokens_seen": 16787824, + "step": 79545 + }, + { + "epoch": 8.751375137513751, + "grad_norm": 0.09829894453287125, + "learning_rate": 3.4561985617008375e-05, + "loss": 0.087, + "num_input_tokens_seen": 16788880, + "step": 79550 + }, + { + "epoch": 8.751925192519252, + "grad_norm": 0.02323100157082081, + "learning_rate": 3.455976799674789e-05, + "loss": 0.082, + "num_input_tokens_seen": 16789936, + "step": 79555 + }, + { + "epoch": 8.752475247524753, + "grad_norm": 0.02237727865576744, + "learning_rate": 3.45575502883796e-05, + "loss": 0.0118, + "num_input_tokens_seen": 16790992, + "step": 79560 + }, + { + "epoch": 8.753025302530252, + "grad_norm": 0.756517231464386, + "learning_rate": 3.4555332491923964e-05, + "loss": 0.0304, + "num_input_tokens_seen": 16792016, + "step": 79565 + }, + { + "epoch": 8.753575357535754, + "grad_norm": 0.21111872792243958, + "learning_rate": 3.455311460740141e-05, + "loss": 0.0134, + "num_input_tokens_seen": 16793104, + "step": 79570 + }, + { + "epoch": 8.754125412541255, + "grad_norm": 0.702820360660553, + "learning_rate": 3.455089663483239e-05, + "loss": 0.0259, + "num_input_tokens_seen": 16794128, + "step": 79575 + }, + { + "epoch": 8.754675467546754, + "grad_norm": 0.20132476091384888, + "learning_rate": 3.454867857423733e-05, + "loss": 0.0427, + "num_input_tokens_seen": 16795216, + "step": 79580 + }, + { + "epoch": 8.755225522552255, + "grad_norm": 0.04436902329325676, + "learning_rate": 3.454646042563668e-05, + "loss": 0.007, + "num_input_tokens_seen": 16796272, + "step": 79585 + }, + { + "epoch": 8.755775577557756, + "grad_norm": 1.7614154815673828, + "learning_rate": 3.454424218905089e-05, + "loss": 0.026, + "num_input_tokens_seen": 16797456, + "step": 79590 + }, + { + "epoch": 8.756325632563257, + "grad_norm": 0.019708244130015373, + "learning_rate": 3.454202386450039e-05, + "loss": 0.0038, + "num_input_tokens_seen": 16798416, + "step": 79595 + }, + { + "epoch": 8.756875687568757, + "grad_norm": 0.45611125230789185, + "learning_rate": 3.453980545200565e-05, + "loss": 0.0284, + "num_input_tokens_seen": 16799472, + "step": 79600 + }, + { + "epoch": 8.757425742574258, + "grad_norm": 0.020423492416739464, + "learning_rate": 3.45375869515871e-05, + "loss": 0.0612, + "num_input_tokens_seen": 16800528, + "step": 79605 + }, + { + "epoch": 8.757975797579759, + "grad_norm": 0.01840556040406227, + "learning_rate": 3.453536836326518e-05, + "loss": 0.0077, + "num_input_tokens_seen": 16801552, + "step": 79610 + }, + { + "epoch": 8.758525852585258, + "grad_norm": 0.43369144201278687, + "learning_rate": 3.4533149687060344e-05, + "loss": 0.0464, + "num_input_tokens_seen": 16802672, + "step": 79615 + }, + { + "epoch": 8.75907590759076, + "grad_norm": 0.3558097183704376, + "learning_rate": 3.4530930922993046e-05, + "loss": 0.0528, + "num_input_tokens_seen": 16803760, + "step": 79620 + }, + { + "epoch": 8.75962596259626, + "grad_norm": 0.008226594887673855, + "learning_rate": 3.4528712071083737e-05, + "loss": 0.001, + "num_input_tokens_seen": 16804848, + "step": 79625 + }, + { + "epoch": 8.76017601760176, + "grad_norm": 0.04342498630285263, + "learning_rate": 3.452649313135285e-05, + "loss": 0.1651, + "num_input_tokens_seen": 16805872, + "step": 79630 + }, + { + "epoch": 8.76072607260726, + "grad_norm": 0.003584815189242363, + "learning_rate": 3.452427410382085e-05, + "loss": 0.1113, + "num_input_tokens_seen": 16806960, + "step": 79635 + }, + { + "epoch": 8.761276127612762, + "grad_norm": 0.1820000559091568, + "learning_rate": 3.4522054988508185e-05, + "loss": 0.026, + "num_input_tokens_seen": 16807984, + "step": 79640 + }, + { + "epoch": 8.761826182618261, + "grad_norm": 0.05926201492547989, + "learning_rate": 3.451983578543531e-05, + "loss": 0.0276, + "num_input_tokens_seen": 16809008, + "step": 79645 + }, + { + "epoch": 8.762376237623762, + "grad_norm": 1.7575446367263794, + "learning_rate": 3.451761649462268e-05, + "loss": 0.0504, + "num_input_tokens_seen": 16810064, + "step": 79650 + }, + { + "epoch": 8.762926292629263, + "grad_norm": 0.05234883725643158, + "learning_rate": 3.451539711609074e-05, + "loss": 0.0132, + "num_input_tokens_seen": 16811088, + "step": 79655 + }, + { + "epoch": 8.763476347634764, + "grad_norm": 0.037395212799310684, + "learning_rate": 3.451317764985995e-05, + "loss": 0.0322, + "num_input_tokens_seen": 16812208, + "step": 79660 + }, + { + "epoch": 8.764026402640264, + "grad_norm": 0.012266716919839382, + "learning_rate": 3.451095809595077e-05, + "loss": 0.0073, + "num_input_tokens_seen": 16813168, + "step": 79665 + }, + { + "epoch": 8.764576457645765, + "grad_norm": 0.060290321707725525, + "learning_rate": 3.4508738454383656e-05, + "loss": 0.1182, + "num_input_tokens_seen": 16814256, + "step": 79670 + }, + { + "epoch": 8.765126512651266, + "grad_norm": 0.28415554761886597, + "learning_rate": 3.450651872517906e-05, + "loss": 0.0111, + "num_input_tokens_seen": 16815280, + "step": 79675 + }, + { + "epoch": 8.765676567656765, + "grad_norm": 0.01841551810503006, + "learning_rate": 3.4504298908357444e-05, + "loss": 0.0087, + "num_input_tokens_seen": 16816336, + "step": 79680 + }, + { + "epoch": 8.766226622662266, + "grad_norm": 3.286612033843994, + "learning_rate": 3.450207900393926e-05, + "loss": 0.0499, + "num_input_tokens_seen": 16817392, + "step": 79685 + }, + { + "epoch": 8.766776677667767, + "grad_norm": 0.07725463062524796, + "learning_rate": 3.449985901194498e-05, + "loss": 0.0063, + "num_input_tokens_seen": 16818480, + "step": 79690 + }, + { + "epoch": 8.767326732673267, + "grad_norm": 1.5875906944274902, + "learning_rate": 3.449763893239505e-05, + "loss": 0.0548, + "num_input_tokens_seen": 16819472, + "step": 79695 + }, + { + "epoch": 8.767876787678768, + "grad_norm": 0.19446571171283722, + "learning_rate": 3.4495418765309946e-05, + "loss": 0.0033, + "num_input_tokens_seen": 16820496, + "step": 79700 + }, + { + "epoch": 8.768426842684269, + "grad_norm": 0.1997241973876953, + "learning_rate": 3.4493198510710125e-05, + "loss": 0.0121, + "num_input_tokens_seen": 16821552, + "step": 79705 + }, + { + "epoch": 8.768976897689768, + "grad_norm": 0.11837603151798248, + "learning_rate": 3.449097816861604e-05, + "loss": 0.0216, + "num_input_tokens_seen": 16822640, + "step": 79710 + }, + { + "epoch": 8.76952695269527, + "grad_norm": 1.2503175735473633, + "learning_rate": 3.448875773904817e-05, + "loss": 0.0199, + "num_input_tokens_seen": 16823728, + "step": 79715 + }, + { + "epoch": 8.77007700770077, + "grad_norm": 2.1687850952148438, + "learning_rate": 3.448653722202697e-05, + "loss": 0.1181, + "num_input_tokens_seen": 16824720, + "step": 79720 + }, + { + "epoch": 8.770627062706271, + "grad_norm": 0.016966652125120163, + "learning_rate": 3.448431661757291e-05, + "loss": 0.0126, + "num_input_tokens_seen": 16825776, + "step": 79725 + }, + { + "epoch": 8.77117711771177, + "grad_norm": 0.060246024280786514, + "learning_rate": 3.448209592570646e-05, + "loss": 0.0164, + "num_input_tokens_seen": 16826832, + "step": 79730 + }, + { + "epoch": 8.771727172717272, + "grad_norm": 0.08689632266759872, + "learning_rate": 3.447987514644807e-05, + "loss": 0.0029, + "num_input_tokens_seen": 16827920, + "step": 79735 + }, + { + "epoch": 8.772277227722773, + "grad_norm": 0.26668986678123474, + "learning_rate": 3.4477654279818226e-05, + "loss": 0.0484, + "num_input_tokens_seen": 16828912, + "step": 79740 + }, + { + "epoch": 8.772827282728272, + "grad_norm": 0.004976519383490086, + "learning_rate": 3.4475433325837395e-05, + "loss": 0.007, + "num_input_tokens_seen": 16829936, + "step": 79745 + }, + { + "epoch": 8.773377337733773, + "grad_norm": 0.04465243220329285, + "learning_rate": 3.447321228452604e-05, + "loss": 0.0429, + "num_input_tokens_seen": 16830960, + "step": 79750 + }, + { + "epoch": 8.773927392739274, + "grad_norm": 1.8712875843048096, + "learning_rate": 3.4470991155904625e-05, + "loss": 0.0243, + "num_input_tokens_seen": 16832016, + "step": 79755 + }, + { + "epoch": 8.774477447744774, + "grad_norm": 2.4676434993743896, + "learning_rate": 3.446876993999364e-05, + "loss": 0.0611, + "num_input_tokens_seen": 16833072, + "step": 79760 + }, + { + "epoch": 8.775027502750275, + "grad_norm": 0.299428254365921, + "learning_rate": 3.4466548636813536e-05, + "loss": 0.0098, + "num_input_tokens_seen": 16834160, + "step": 79765 + }, + { + "epoch": 8.775577557755776, + "grad_norm": 1.1132084131240845, + "learning_rate": 3.44643272463848e-05, + "loss": 0.0999, + "num_input_tokens_seen": 16835216, + "step": 79770 + }, + { + "epoch": 8.776127612761275, + "grad_norm": 0.0028867919463664293, + "learning_rate": 3.4462105768727906e-05, + "loss": 0.002, + "num_input_tokens_seen": 16836336, + "step": 79775 + }, + { + "epoch": 8.776677667766776, + "grad_norm": 0.007145373150706291, + "learning_rate": 3.4459884203863315e-05, + "loss": 0.0148, + "num_input_tokens_seen": 16837456, + "step": 79780 + }, + { + "epoch": 8.777227722772277, + "grad_norm": 0.4093456268310547, + "learning_rate": 3.4457662551811516e-05, + "loss": 0.0229, + "num_input_tokens_seen": 16838512, + "step": 79785 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 0.820112943649292, + "learning_rate": 3.445544081259298e-05, + "loss": 0.0079, + "num_input_tokens_seen": 16839536, + "step": 79790 + }, + { + "epoch": 8.778327832783278, + "grad_norm": 0.049323439598083496, + "learning_rate": 3.445321898622818e-05, + "loss": 0.0696, + "num_input_tokens_seen": 16840624, + "step": 79795 + }, + { + "epoch": 8.778877887788779, + "grad_norm": 0.0078545231372118, + "learning_rate": 3.4450997072737604e-05, + "loss": 0.115, + "num_input_tokens_seen": 16841712, + "step": 79800 + }, + { + "epoch": 8.77942794279428, + "grad_norm": 0.02019379660487175, + "learning_rate": 3.4448775072141714e-05, + "loss": 0.0473, + "num_input_tokens_seen": 16842800, + "step": 79805 + }, + { + "epoch": 8.77997799779978, + "grad_norm": 1.4345289468765259, + "learning_rate": 3.444655298446099e-05, + "loss": 0.0165, + "num_input_tokens_seen": 16843824, + "step": 79810 + }, + { + "epoch": 8.78052805280528, + "grad_norm": 0.023479165509343147, + "learning_rate": 3.444433080971594e-05, + "loss": 0.0189, + "num_input_tokens_seen": 16844848, + "step": 79815 + }, + { + "epoch": 8.781078107810782, + "grad_norm": 0.8843727707862854, + "learning_rate": 3.444210854792702e-05, + "loss": 0.032, + "num_input_tokens_seen": 16845872, + "step": 79820 + }, + { + "epoch": 8.781628162816281, + "grad_norm": 0.08405566960573196, + "learning_rate": 3.443988619911471e-05, + "loss": 0.008, + "num_input_tokens_seen": 16846928, + "step": 79825 + }, + { + "epoch": 8.782178217821782, + "grad_norm": 0.014420888386666775, + "learning_rate": 3.44376637632995e-05, + "loss": 0.0518, + "num_input_tokens_seen": 16848048, + "step": 79830 + }, + { + "epoch": 8.782728272827283, + "grad_norm": 0.003810558468103409, + "learning_rate": 3.443544124050187e-05, + "loss": 0.0111, + "num_input_tokens_seen": 16849104, + "step": 79835 + }, + { + "epoch": 8.783278327832782, + "grad_norm": 0.029802996665239334, + "learning_rate": 3.443321863074231e-05, + "loss": 0.0809, + "num_input_tokens_seen": 16850160, + "step": 79840 + }, + { + "epoch": 8.783828382838283, + "grad_norm": 0.009113684296607971, + "learning_rate": 3.44309959340413e-05, + "loss": 0.0054, + "num_input_tokens_seen": 16851120, + "step": 79845 + }, + { + "epoch": 8.784378437843785, + "grad_norm": 0.9273233413696289, + "learning_rate": 3.442877315041933e-05, + "loss": 0.1109, + "num_input_tokens_seen": 16852144, + "step": 79850 + }, + { + "epoch": 8.784928492849286, + "grad_norm": 0.40760985016822815, + "learning_rate": 3.442655027989687e-05, + "loss": 0.0154, + "num_input_tokens_seen": 16853232, + "step": 79855 + }, + { + "epoch": 8.785478547854785, + "grad_norm": 0.18808090686798096, + "learning_rate": 3.442432732249443e-05, + "loss": 0.0446, + "num_input_tokens_seen": 16854320, + "step": 79860 + }, + { + "epoch": 8.786028602860286, + "grad_norm": 0.00536151509732008, + "learning_rate": 3.442210427823248e-05, + "loss": 0.0445, + "num_input_tokens_seen": 16855440, + "step": 79865 + }, + { + "epoch": 8.786578657865787, + "grad_norm": 0.02125387452542782, + "learning_rate": 3.4419881147131514e-05, + "loss": 0.0692, + "num_input_tokens_seen": 16856496, + "step": 79870 + }, + { + "epoch": 8.787128712871286, + "grad_norm": 0.2797362208366394, + "learning_rate": 3.441765792921203e-05, + "loss": 0.0114, + "num_input_tokens_seen": 16857584, + "step": 79875 + }, + { + "epoch": 8.787678767876788, + "grad_norm": 0.14490917325019836, + "learning_rate": 3.441543462449451e-05, + "loss": 0.0033, + "num_input_tokens_seen": 16858672, + "step": 79880 + }, + { + "epoch": 8.788228822882289, + "grad_norm": 0.5524349212646484, + "learning_rate": 3.4413211232999446e-05, + "loss": 0.0351, + "num_input_tokens_seen": 16859664, + "step": 79885 + }, + { + "epoch": 8.788778877887788, + "grad_norm": 0.025438392534852028, + "learning_rate": 3.441098775474734e-05, + "loss": 0.0265, + "num_input_tokens_seen": 16860688, + "step": 79890 + }, + { + "epoch": 8.789328932893289, + "grad_norm": 0.40805694460868835, + "learning_rate": 3.440876418975867e-05, + "loss": 0.1225, + "num_input_tokens_seen": 16861744, + "step": 79895 + }, + { + "epoch": 8.78987898789879, + "grad_norm": 0.32548636198043823, + "learning_rate": 3.4406540538053935e-05, + "loss": 0.0318, + "num_input_tokens_seen": 16862832, + "step": 79900 + }, + { + "epoch": 8.79042904290429, + "grad_norm": 0.042873263359069824, + "learning_rate": 3.4404316799653626e-05, + "loss": 0.0592, + "num_input_tokens_seen": 16863920, + "step": 79905 + }, + { + "epoch": 8.79097909790979, + "grad_norm": 0.018399585038423538, + "learning_rate": 3.4402092974578245e-05, + "loss": 0.0809, + "num_input_tokens_seen": 16864944, + "step": 79910 + }, + { + "epoch": 8.791529152915292, + "grad_norm": 0.080714151263237, + "learning_rate": 3.4399869062848284e-05, + "loss": 0.0039, + "num_input_tokens_seen": 16866000, + "step": 79915 + }, + { + "epoch": 8.792079207920793, + "grad_norm": 0.8954883813858032, + "learning_rate": 3.439764506448424e-05, + "loss": 0.0542, + "num_input_tokens_seen": 16866992, + "step": 79920 + }, + { + "epoch": 8.792629262926292, + "grad_norm": 0.019640162587165833, + "learning_rate": 3.439542097950661e-05, + "loss": 0.0595, + "num_input_tokens_seen": 16868080, + "step": 79925 + }, + { + "epoch": 8.793179317931793, + "grad_norm": 0.2249729037284851, + "learning_rate": 3.439319680793589e-05, + "loss": 0.0037, + "num_input_tokens_seen": 16869104, + "step": 79930 + }, + { + "epoch": 8.793729372937294, + "grad_norm": 0.06816622614860535, + "learning_rate": 3.439097254979259e-05, + "loss": 0.0229, + "num_input_tokens_seen": 16870160, + "step": 79935 + }, + { + "epoch": 8.794279427942794, + "grad_norm": 2.5715198516845703, + "learning_rate": 3.43887482050972e-05, + "loss": 0.1138, + "num_input_tokens_seen": 16871248, + "step": 79940 + }, + { + "epoch": 8.794829482948295, + "grad_norm": 0.028483346104621887, + "learning_rate": 3.438652377387022e-05, + "loss": 0.0157, + "num_input_tokens_seen": 16872304, + "step": 79945 + }, + { + "epoch": 8.795379537953796, + "grad_norm": 0.13959860801696777, + "learning_rate": 3.438429925613216e-05, + "loss": 0.0116, + "num_input_tokens_seen": 16873360, + "step": 79950 + }, + { + "epoch": 8.795929592959295, + "grad_norm": 0.006173474248498678, + "learning_rate": 3.4382074651903505e-05, + "loss": 0.0074, + "num_input_tokens_seen": 16874384, + "step": 79955 + }, + { + "epoch": 8.796479647964796, + "grad_norm": 0.06371909379959106, + "learning_rate": 3.437984996120478e-05, + "loss": 0.006, + "num_input_tokens_seen": 16875408, + "step": 79960 + }, + { + "epoch": 8.797029702970297, + "grad_norm": 0.06784428656101227, + "learning_rate": 3.437762518405648e-05, + "loss": 0.1127, + "num_input_tokens_seen": 16876496, + "step": 79965 + }, + { + "epoch": 8.797579757975798, + "grad_norm": 0.15569403767585754, + "learning_rate": 3.43754003204791e-05, + "loss": 0.05, + "num_input_tokens_seen": 16877584, + "step": 79970 + }, + { + "epoch": 8.798129812981298, + "grad_norm": 0.014446832239627838, + "learning_rate": 3.437317537049316e-05, + "loss": 0.003, + "num_input_tokens_seen": 16878672, + "step": 79975 + }, + { + "epoch": 8.798679867986799, + "grad_norm": 0.01333670411258936, + "learning_rate": 3.437095033411916e-05, + "loss": 0.0051, + "num_input_tokens_seen": 16879664, + "step": 79980 + }, + { + "epoch": 8.7992299229923, + "grad_norm": 0.02524745836853981, + "learning_rate": 3.436872521137761e-05, + "loss": 0.0239, + "num_input_tokens_seen": 16880816, + "step": 79985 + }, + { + "epoch": 8.7997799779978, + "grad_norm": 1.8503222465515137, + "learning_rate": 3.436650000228901e-05, + "loss": 0.1092, + "num_input_tokens_seen": 16881808, + "step": 79990 + }, + { + "epoch": 8.8003300330033, + "grad_norm": 0.05118803307414055, + "learning_rate": 3.436427470687388e-05, + "loss": 0.0271, + "num_input_tokens_seen": 16882896, + "step": 79995 + }, + { + "epoch": 8.800880088008801, + "grad_norm": 1.2859055995941162, + "learning_rate": 3.4362049325152714e-05, + "loss": 0.0666, + "num_input_tokens_seen": 16884016, + "step": 80000 + }, + { + "epoch": 8.8014301430143, + "grad_norm": 0.546837329864502, + "learning_rate": 3.4359823857146045e-05, + "loss": 0.0292, + "num_input_tokens_seen": 16885072, + "step": 80005 + }, + { + "epoch": 8.801980198019802, + "grad_norm": 0.021150872111320496, + "learning_rate": 3.435759830287436e-05, + "loss": 0.0588, + "num_input_tokens_seen": 16886064, + "step": 80010 + }, + { + "epoch": 8.802530253025303, + "grad_norm": 0.012000366114079952, + "learning_rate": 3.435537266235818e-05, + "loss": 0.0671, + "num_input_tokens_seen": 16887120, + "step": 80015 + }, + { + "epoch": 8.803080308030804, + "grad_norm": 0.011555285193026066, + "learning_rate": 3.435314693561803e-05, + "loss": 0.055, + "num_input_tokens_seen": 16888112, + "step": 80020 + }, + { + "epoch": 8.803630363036303, + "grad_norm": 0.688054621219635, + "learning_rate": 3.435092112267441e-05, + "loss": 0.028, + "num_input_tokens_seen": 16889168, + "step": 80025 + }, + { + "epoch": 8.804180418041804, + "grad_norm": 1.1971534490585327, + "learning_rate": 3.434869522354783e-05, + "loss": 0.0286, + "num_input_tokens_seen": 16890224, + "step": 80030 + }, + { + "epoch": 8.804730473047305, + "grad_norm": 0.12402015179395676, + "learning_rate": 3.434646923825882e-05, + "loss": 0.1549, + "num_input_tokens_seen": 16891280, + "step": 80035 + }, + { + "epoch": 8.805280528052805, + "grad_norm": 0.2914939224720001, + "learning_rate": 3.434424316682787e-05, + "loss": 0.0074, + "num_input_tokens_seen": 16892304, + "step": 80040 + }, + { + "epoch": 8.805830583058306, + "grad_norm": 0.023141080513596535, + "learning_rate": 3.434201700927554e-05, + "loss": 0.0207, + "num_input_tokens_seen": 16893328, + "step": 80045 + }, + { + "epoch": 8.806380638063807, + "grad_norm": 0.015488474629819393, + "learning_rate": 3.4339790765622305e-05, + "loss": 0.0158, + "num_input_tokens_seen": 16894416, + "step": 80050 + }, + { + "epoch": 8.806930693069306, + "grad_norm": 0.497437059879303, + "learning_rate": 3.43375644358887e-05, + "loss": 0.0887, + "num_input_tokens_seen": 16895504, + "step": 80055 + }, + { + "epoch": 8.807480748074807, + "grad_norm": 0.06639939546585083, + "learning_rate": 3.4335338020095254e-05, + "loss": 0.0053, + "num_input_tokens_seen": 16896496, + "step": 80060 + }, + { + "epoch": 8.808030803080309, + "grad_norm": 1.4686787128448486, + "learning_rate": 3.433311151826247e-05, + "loss": 0.0824, + "num_input_tokens_seen": 16897488, + "step": 80065 + }, + { + "epoch": 8.808580858085808, + "grad_norm": 0.03347082436084747, + "learning_rate": 3.4330884930410876e-05, + "loss": 0.0074, + "num_input_tokens_seen": 16898512, + "step": 80070 + }, + { + "epoch": 8.809130913091309, + "grad_norm": 0.030063895508646965, + "learning_rate": 3.4328658256560995e-05, + "loss": 0.1722, + "num_input_tokens_seen": 16899568, + "step": 80075 + }, + { + "epoch": 8.80968096809681, + "grad_norm": 0.8549244403839111, + "learning_rate": 3.432643149673336e-05, + "loss": 0.0973, + "num_input_tokens_seen": 16900624, + "step": 80080 + }, + { + "epoch": 8.810231023102311, + "grad_norm": 0.08612419664859772, + "learning_rate": 3.432420465094846e-05, + "loss": 0.0231, + "num_input_tokens_seen": 16901616, + "step": 80085 + }, + { + "epoch": 8.81078107810781, + "grad_norm": 0.3289882242679596, + "learning_rate": 3.432197771922685e-05, + "loss": 0.0417, + "num_input_tokens_seen": 16902672, + "step": 80090 + }, + { + "epoch": 8.811331133113312, + "grad_norm": 0.04710911959409714, + "learning_rate": 3.431975070158904e-05, + "loss": 0.0079, + "num_input_tokens_seen": 16903696, + "step": 80095 + }, + { + "epoch": 8.811881188118813, + "grad_norm": 0.06237465515732765, + "learning_rate": 3.431752359805557e-05, + "loss": 0.0376, + "num_input_tokens_seen": 16904688, + "step": 80100 + }, + { + "epoch": 8.812431243124312, + "grad_norm": 0.008720616810023785, + "learning_rate": 3.431529640864695e-05, + "loss": 0.1031, + "num_input_tokens_seen": 16905744, + "step": 80105 + }, + { + "epoch": 8.812981298129813, + "grad_norm": 0.23941737413406372, + "learning_rate": 3.4313069133383705e-05, + "loss": 0.0122, + "num_input_tokens_seen": 16906736, + "step": 80110 + }, + { + "epoch": 8.813531353135314, + "grad_norm": 0.26393112540245056, + "learning_rate": 3.431084177228638e-05, + "loss": 0.0947, + "num_input_tokens_seen": 16907792, + "step": 80115 + }, + { + "epoch": 8.814081408140813, + "grad_norm": 0.4925239384174347, + "learning_rate": 3.4308614325375486e-05, + "loss": 0.0269, + "num_input_tokens_seen": 16908848, + "step": 80120 + }, + { + "epoch": 8.814631463146315, + "grad_norm": 0.2427767664194107, + "learning_rate": 3.4306386792671575e-05, + "loss": 0.0193, + "num_input_tokens_seen": 16909872, + "step": 80125 + }, + { + "epoch": 8.815181518151816, + "grad_norm": 0.018348418176174164, + "learning_rate": 3.430415917419515e-05, + "loss": 0.045, + "num_input_tokens_seen": 16910928, + "step": 80130 + }, + { + "epoch": 8.815731573157315, + "grad_norm": 0.3436030447483063, + "learning_rate": 3.430193146996676e-05, + "loss": 0.0131, + "num_input_tokens_seen": 16911952, + "step": 80135 + }, + { + "epoch": 8.816281628162816, + "grad_norm": 0.005231871735304594, + "learning_rate": 3.429970368000693e-05, + "loss": 0.0297, + "num_input_tokens_seen": 16913040, + "step": 80140 + }, + { + "epoch": 8.816831683168317, + "grad_norm": 0.36549076437950134, + "learning_rate": 3.4297475804336185e-05, + "loss": 0.0229, + "num_input_tokens_seen": 16914096, + "step": 80145 + }, + { + "epoch": 8.817381738173818, + "grad_norm": 0.042714301496744156, + "learning_rate": 3.429524784297508e-05, + "loss": 0.0057, + "num_input_tokens_seen": 16915152, + "step": 80150 + }, + { + "epoch": 8.817931793179318, + "grad_norm": 0.7303016185760498, + "learning_rate": 3.429301979594413e-05, + "loss": 0.0226, + "num_input_tokens_seen": 16916208, + "step": 80155 + }, + { + "epoch": 8.818481848184819, + "grad_norm": 0.10027716308832169, + "learning_rate": 3.429079166326388e-05, + "loss": 0.0535, + "num_input_tokens_seen": 16917200, + "step": 80160 + }, + { + "epoch": 8.81903190319032, + "grad_norm": 0.6354044675827026, + "learning_rate": 3.428856344495485e-05, + "loss": 0.0196, + "num_input_tokens_seen": 16918192, + "step": 80165 + }, + { + "epoch": 8.819581958195819, + "grad_norm": 0.0297868512570858, + "learning_rate": 3.428633514103759e-05, + "loss": 0.0934, + "num_input_tokens_seen": 16919216, + "step": 80170 + }, + { + "epoch": 8.82013201320132, + "grad_norm": 1.5108202695846558, + "learning_rate": 3.4284106751532655e-05, + "loss": 0.0627, + "num_input_tokens_seen": 16920272, + "step": 80175 + }, + { + "epoch": 8.820682068206821, + "grad_norm": 0.14746899902820587, + "learning_rate": 3.4281878276460544e-05, + "loss": 0.0211, + "num_input_tokens_seen": 16921296, + "step": 80180 + }, + { + "epoch": 8.82123212321232, + "grad_norm": 0.1649276316165924, + "learning_rate": 3.4279649715841825e-05, + "loss": 0.0106, + "num_input_tokens_seen": 16922448, + "step": 80185 + }, + { + "epoch": 8.821782178217822, + "grad_norm": 0.34690219163894653, + "learning_rate": 3.427742106969701e-05, + "loss": 0.0347, + "num_input_tokens_seen": 16923504, + "step": 80190 + }, + { + "epoch": 8.822332233223323, + "grad_norm": 0.08251472562551498, + "learning_rate": 3.427519233804667e-05, + "loss": 0.0027, + "num_input_tokens_seen": 16924592, + "step": 80195 + }, + { + "epoch": 8.822882288228822, + "grad_norm": 1.4291030168533325, + "learning_rate": 3.427296352091133e-05, + "loss": 0.0129, + "num_input_tokens_seen": 16925648, + "step": 80200 + }, + { + "epoch": 8.823432343234323, + "grad_norm": 0.029693003743886948, + "learning_rate": 3.427073461831154e-05, + "loss": 0.0584, + "num_input_tokens_seen": 16926704, + "step": 80205 + }, + { + "epoch": 8.823982398239824, + "grad_norm": 0.03113095462322235, + "learning_rate": 3.426850563026783e-05, + "loss": 0.0187, + "num_input_tokens_seen": 16927728, + "step": 80210 + }, + { + "epoch": 8.824532453245325, + "grad_norm": 0.5156809091567993, + "learning_rate": 3.4266276556800755e-05, + "loss": 0.0176, + "num_input_tokens_seen": 16928816, + "step": 80215 + }, + { + "epoch": 8.825082508250825, + "grad_norm": 0.01776672713458538, + "learning_rate": 3.426404739793086e-05, + "loss": 0.0231, + "num_input_tokens_seen": 16929840, + "step": 80220 + }, + { + "epoch": 8.825632563256326, + "grad_norm": 0.02690574899315834, + "learning_rate": 3.426181815367868e-05, + "loss": 0.035, + "num_input_tokens_seen": 16930928, + "step": 80225 + }, + { + "epoch": 8.826182618261827, + "grad_norm": 0.27689817547798157, + "learning_rate": 3.425958882406476e-05, + "loss": 0.0083, + "num_input_tokens_seen": 16931952, + "step": 80230 + }, + { + "epoch": 8.826732673267326, + "grad_norm": 0.10556793212890625, + "learning_rate": 3.4257359409109666e-05, + "loss": 0.0349, + "num_input_tokens_seen": 16932976, + "step": 80235 + }, + { + "epoch": 8.827282728272827, + "grad_norm": 0.08027739822864532, + "learning_rate": 3.4255129908833924e-05, + "loss": 0.0284, + "num_input_tokens_seen": 16933968, + "step": 80240 + }, + { + "epoch": 8.827832783278328, + "grad_norm": 0.04209787771105766, + "learning_rate": 3.4252900323258095e-05, + "loss": 0.0716, + "num_input_tokens_seen": 16935152, + "step": 80245 + }, + { + "epoch": 8.828382838283828, + "grad_norm": 0.05946826562285423, + "learning_rate": 3.4250670652402725e-05, + "loss": 0.0041, + "num_input_tokens_seen": 16936208, + "step": 80250 + }, + { + "epoch": 8.828932893289329, + "grad_norm": 0.01912675052881241, + "learning_rate": 3.424844089628836e-05, + "loss": 0.0032, + "num_input_tokens_seen": 16937264, + "step": 80255 + }, + { + "epoch": 8.82948294829483, + "grad_norm": 0.044333361089229584, + "learning_rate": 3.424621105493554e-05, + "loss": 0.0104, + "num_input_tokens_seen": 16938288, + "step": 80260 + }, + { + "epoch": 8.83003300330033, + "grad_norm": 0.019584888592362404, + "learning_rate": 3.424398112836485e-05, + "loss": 0.0143, + "num_input_tokens_seen": 16939376, + "step": 80265 + }, + { + "epoch": 8.83058305830583, + "grad_norm": 0.01174930389970541, + "learning_rate": 3.4241751116596823e-05, + "loss": 0.0026, + "num_input_tokens_seen": 16940400, + "step": 80270 + }, + { + "epoch": 8.831133113311331, + "grad_norm": 0.7178049683570862, + "learning_rate": 3.4239521019651985e-05, + "loss": 0.0123, + "num_input_tokens_seen": 16941456, + "step": 80275 + }, + { + "epoch": 8.831683168316832, + "grad_norm": 1.868322491645813, + "learning_rate": 3.423729083755094e-05, + "loss": 0.1961, + "num_input_tokens_seen": 16942512, + "step": 80280 + }, + { + "epoch": 8.832233223322332, + "grad_norm": 0.04262912645936012, + "learning_rate": 3.4235060570314206e-05, + "loss": 0.015, + "num_input_tokens_seen": 16943568, + "step": 80285 + }, + { + "epoch": 8.832783278327833, + "grad_norm": 0.01953381486237049, + "learning_rate": 3.423283021796235e-05, + "loss": 0.0671, + "num_input_tokens_seen": 16944656, + "step": 80290 + }, + { + "epoch": 8.833333333333334, + "grad_norm": 0.1739591509103775, + "learning_rate": 3.423059978051594e-05, + "loss": 0.0202, + "num_input_tokens_seen": 16945744, + "step": 80295 + }, + { + "epoch": 8.833883388338833, + "grad_norm": 1.1240150928497314, + "learning_rate": 3.422836925799551e-05, + "loss": 0.0844, + "num_input_tokens_seen": 16946800, + "step": 80300 + }, + { + "epoch": 8.834433443344334, + "grad_norm": 0.020888281986117363, + "learning_rate": 3.422613865042163e-05, + "loss": 0.0454, + "num_input_tokens_seen": 16947920, + "step": 80305 + }, + { + "epoch": 8.834983498349835, + "grad_norm": 0.046105898916721344, + "learning_rate": 3.422390795781486e-05, + "loss": 0.0095, + "num_input_tokens_seen": 16949040, + "step": 80310 + }, + { + "epoch": 8.835533553355335, + "grad_norm": 1.5372155904769897, + "learning_rate": 3.422167718019576e-05, + "loss": 0.1315, + "num_input_tokens_seen": 16950160, + "step": 80315 + }, + { + "epoch": 8.836083608360836, + "grad_norm": 0.4425387978553772, + "learning_rate": 3.421944631758487e-05, + "loss": 0.0252, + "num_input_tokens_seen": 16951248, + "step": 80320 + }, + { + "epoch": 8.836633663366337, + "grad_norm": 0.05548691004514694, + "learning_rate": 3.4217215370002785e-05, + "loss": 0.0049, + "num_input_tokens_seen": 16952208, + "step": 80325 + }, + { + "epoch": 8.837183718371836, + "grad_norm": 0.2779380977153778, + "learning_rate": 3.421498433747004e-05, + "loss": 0.0061, + "num_input_tokens_seen": 16953264, + "step": 80330 + }, + { + "epoch": 8.837733773377337, + "grad_norm": 0.05540691316127777, + "learning_rate": 3.421275322000721e-05, + "loss": 0.0115, + "num_input_tokens_seen": 16954352, + "step": 80335 + }, + { + "epoch": 8.838283828382838, + "grad_norm": 0.015220247209072113, + "learning_rate": 3.4210522017634854e-05, + "loss": 0.0387, + "num_input_tokens_seen": 16955376, + "step": 80340 + }, + { + "epoch": 8.83883388338834, + "grad_norm": 0.01798130013048649, + "learning_rate": 3.4208290730373534e-05, + "loss": 0.0043, + "num_input_tokens_seen": 16956496, + "step": 80345 + }, + { + "epoch": 8.839383938393839, + "grad_norm": 0.7357379198074341, + "learning_rate": 3.4206059358243815e-05, + "loss": 0.065, + "num_input_tokens_seen": 16957552, + "step": 80350 + }, + { + "epoch": 8.83993399339934, + "grad_norm": 1.657711148262024, + "learning_rate": 3.420382790126627e-05, + "loss": 0.1826, + "num_input_tokens_seen": 16958672, + "step": 80355 + }, + { + "epoch": 8.840484048404841, + "grad_norm": 1.36733078956604, + "learning_rate": 3.420159635946146e-05, + "loss": 0.0254, + "num_input_tokens_seen": 16959696, + "step": 80360 + }, + { + "epoch": 8.84103410341034, + "grad_norm": 0.47870171070098877, + "learning_rate": 3.419936473284995e-05, + "loss": 0.0122, + "num_input_tokens_seen": 16960752, + "step": 80365 + }, + { + "epoch": 8.841584158415841, + "grad_norm": 0.38314151763916016, + "learning_rate": 3.4197133021452304e-05, + "loss": 0.1193, + "num_input_tokens_seen": 16961872, + "step": 80370 + }, + { + "epoch": 8.842134213421343, + "grad_norm": 2.1330153942108154, + "learning_rate": 3.41949012252891e-05, + "loss": 0.1411, + "num_input_tokens_seen": 16962896, + "step": 80375 + }, + { + "epoch": 8.842684268426842, + "grad_norm": 0.9079635739326477, + "learning_rate": 3.41926693443809e-05, + "loss": 0.0322, + "num_input_tokens_seen": 16963984, + "step": 80380 + }, + { + "epoch": 8.843234323432343, + "grad_norm": 0.030637161806225777, + "learning_rate": 3.4190437378748294e-05, + "loss": 0.0097, + "num_input_tokens_seen": 16965008, + "step": 80385 + }, + { + "epoch": 8.843784378437844, + "grad_norm": 0.06599848717451096, + "learning_rate": 3.418820532841182e-05, + "loss": 0.0161, + "num_input_tokens_seen": 16966096, + "step": 80390 + }, + { + "epoch": 8.844334433443345, + "grad_norm": 0.06458351761102676, + "learning_rate": 3.418597319339207e-05, + "loss": 0.0149, + "num_input_tokens_seen": 16967152, + "step": 80395 + }, + { + "epoch": 8.844884488448844, + "grad_norm": 1.1219096183776855, + "learning_rate": 3.418374097370961e-05, + "loss": 0.1342, + "num_input_tokens_seen": 16968112, + "step": 80400 + }, + { + "epoch": 8.845434543454346, + "grad_norm": 0.3996323347091675, + "learning_rate": 3.418150866938502e-05, + "loss": 0.0351, + "num_input_tokens_seen": 16969200, + "step": 80405 + }, + { + "epoch": 8.845984598459847, + "grad_norm": 0.002777776448056102, + "learning_rate": 3.4179276280438874e-05, + "loss": 0.0082, + "num_input_tokens_seen": 16970288, + "step": 80410 + }, + { + "epoch": 8.846534653465346, + "grad_norm": 0.5608092546463013, + "learning_rate": 3.4177043806891737e-05, + "loss": 0.0151, + "num_input_tokens_seen": 16971248, + "step": 80415 + }, + { + "epoch": 8.847084708470847, + "grad_norm": 0.020092487335205078, + "learning_rate": 3.417481124876419e-05, + "loss": 0.0179, + "num_input_tokens_seen": 16972272, + "step": 80420 + }, + { + "epoch": 8.847634763476348, + "grad_norm": 0.38822928071022034, + "learning_rate": 3.417257860607682e-05, + "loss": 0.0771, + "num_input_tokens_seen": 16973296, + "step": 80425 + }, + { + "epoch": 8.848184818481847, + "grad_norm": 0.04835726320743561, + "learning_rate": 3.4170345878850185e-05, + "loss": 0.0506, + "num_input_tokens_seen": 16974320, + "step": 80430 + }, + { + "epoch": 8.848734873487349, + "grad_norm": 0.05337083712220192, + "learning_rate": 3.4168113067104875e-05, + "loss": 0.0057, + "num_input_tokens_seen": 16975312, + "step": 80435 + }, + { + "epoch": 8.84928492849285, + "grad_norm": 0.044206682592630386, + "learning_rate": 3.416588017086146e-05, + "loss": 0.0165, + "num_input_tokens_seen": 16976368, + "step": 80440 + }, + { + "epoch": 8.84983498349835, + "grad_norm": 0.07162921875715256, + "learning_rate": 3.416364719014054e-05, + "loss": 0.0706, + "num_input_tokens_seen": 16977392, + "step": 80445 + }, + { + "epoch": 8.85038503850385, + "grad_norm": 0.07594238966703415, + "learning_rate": 3.416141412496266e-05, + "loss": 0.1063, + "num_input_tokens_seen": 16978448, + "step": 80450 + }, + { + "epoch": 8.850935093509351, + "grad_norm": 0.204824760556221, + "learning_rate": 3.4159180975348446e-05, + "loss": 0.0433, + "num_input_tokens_seen": 16979472, + "step": 80455 + }, + { + "epoch": 8.851485148514852, + "grad_norm": 0.20744773745536804, + "learning_rate": 3.4156947741318445e-05, + "loss": 0.0285, + "num_input_tokens_seen": 16980528, + "step": 80460 + }, + { + "epoch": 8.852035203520352, + "grad_norm": 0.025361428037285805, + "learning_rate": 3.415471442289325e-05, + "loss": 0.0072, + "num_input_tokens_seen": 16981616, + "step": 80465 + }, + { + "epoch": 8.852585258525853, + "grad_norm": 1.210677981376648, + "learning_rate": 3.415248102009345e-05, + "loss": 0.0924, + "num_input_tokens_seen": 16982640, + "step": 80470 + }, + { + "epoch": 8.853135313531354, + "grad_norm": 0.25399309396743774, + "learning_rate": 3.415024753293962e-05, + "loss": 0.0676, + "num_input_tokens_seen": 16983664, + "step": 80475 + }, + { + "epoch": 8.853685368536853, + "grad_norm": 0.07358720153570175, + "learning_rate": 3.414801396145235e-05, + "loss": 0.0491, + "num_input_tokens_seen": 16984720, + "step": 80480 + }, + { + "epoch": 8.854235423542354, + "grad_norm": 1.7024827003479004, + "learning_rate": 3.414578030565222e-05, + "loss": 0.0657, + "num_input_tokens_seen": 16985776, + "step": 80485 + }, + { + "epoch": 8.854785478547855, + "grad_norm": 0.6949647068977356, + "learning_rate": 3.414354656555983e-05, + "loss": 0.0235, + "num_input_tokens_seen": 16986832, + "step": 80490 + }, + { + "epoch": 8.855335533553355, + "grad_norm": 0.06368928402662277, + "learning_rate": 3.4141312741195744e-05, + "loss": 0.0039, + "num_input_tokens_seen": 16987888, + "step": 80495 + }, + { + "epoch": 8.855885588558856, + "grad_norm": 0.26555585861206055, + "learning_rate": 3.413907883258058e-05, + "loss": 0.0116, + "num_input_tokens_seen": 16988912, + "step": 80500 + }, + { + "epoch": 8.856435643564357, + "grad_norm": 0.3502563238143921, + "learning_rate": 3.4136844839734907e-05, + "loss": 0.0115, + "num_input_tokens_seen": 16989968, + "step": 80505 + }, + { + "epoch": 8.856985698569858, + "grad_norm": 0.00433448888361454, + "learning_rate": 3.413461076267932e-05, + "loss": 0.0142, + "num_input_tokens_seen": 16991024, + "step": 80510 + }, + { + "epoch": 8.857535753575357, + "grad_norm": 0.06027129665017128, + "learning_rate": 3.413237660143441e-05, + "loss": 0.0395, + "num_input_tokens_seen": 16992080, + "step": 80515 + }, + { + "epoch": 8.858085808580858, + "grad_norm": 1.263112187385559, + "learning_rate": 3.4130142356020764e-05, + "loss": 0.0379, + "num_input_tokens_seen": 16993200, + "step": 80520 + }, + { + "epoch": 8.85863586358636, + "grad_norm": 0.05493545159697533, + "learning_rate": 3.4127908026458976e-05, + "loss": 0.0048, + "num_input_tokens_seen": 16994288, + "step": 80525 + }, + { + "epoch": 8.859185918591859, + "grad_norm": 0.029277721419930458, + "learning_rate": 3.412567361276965e-05, + "loss": 0.0743, + "num_input_tokens_seen": 16995344, + "step": 80530 + }, + { + "epoch": 8.85973597359736, + "grad_norm": 0.05907958745956421, + "learning_rate": 3.412343911497335e-05, + "loss": 0.0044, + "num_input_tokens_seen": 16996368, + "step": 80535 + }, + { + "epoch": 8.86028602860286, + "grad_norm": 0.008425279520452023, + "learning_rate": 3.412120453309071e-05, + "loss": 0.0024, + "num_input_tokens_seen": 16997456, + "step": 80540 + }, + { + "epoch": 8.86083608360836, + "grad_norm": 0.008501953445374966, + "learning_rate": 3.4118969867142294e-05, + "loss": 0.02, + "num_input_tokens_seen": 16998448, + "step": 80545 + }, + { + "epoch": 8.861386138613861, + "grad_norm": 2.1635196208953857, + "learning_rate": 3.411673511714871e-05, + "loss": 0.0999, + "num_input_tokens_seen": 16999568, + "step": 80550 + }, + { + "epoch": 8.861936193619362, + "grad_norm": 0.024732207879424095, + "learning_rate": 3.4114500283130554e-05, + "loss": 0.0068, + "num_input_tokens_seen": 17000656, + "step": 80555 + }, + { + "epoch": 8.862486248624862, + "grad_norm": 2.639744997024536, + "learning_rate": 3.411226536510842e-05, + "loss": 0.0596, + "num_input_tokens_seen": 17001744, + "step": 80560 + }, + { + "epoch": 8.863036303630363, + "grad_norm": 1.005967140197754, + "learning_rate": 3.4110030363102905e-05, + "loss": 0.0282, + "num_input_tokens_seen": 17002800, + "step": 80565 + }, + { + "epoch": 8.863586358635864, + "grad_norm": 0.9664910435676575, + "learning_rate": 3.410779527713461e-05, + "loss": 0.0217, + "num_input_tokens_seen": 17003824, + "step": 80570 + }, + { + "epoch": 8.864136413641365, + "grad_norm": 0.012231981381773949, + "learning_rate": 3.410556010722416e-05, + "loss": 0.0083, + "num_input_tokens_seen": 17004880, + "step": 80575 + }, + { + "epoch": 8.864686468646864, + "grad_norm": 0.2012341022491455, + "learning_rate": 3.4103324853392106e-05, + "loss": 0.0112, + "num_input_tokens_seen": 17005904, + "step": 80580 + }, + { + "epoch": 8.865236523652365, + "grad_norm": 0.018720269203186035, + "learning_rate": 3.4101089515659084e-05, + "loss": 0.0145, + "num_input_tokens_seen": 17006928, + "step": 80585 + }, + { + "epoch": 8.865786578657866, + "grad_norm": 1.6716439723968506, + "learning_rate": 3.409885409404568e-05, + "loss": 0.0474, + "num_input_tokens_seen": 17008048, + "step": 80590 + }, + { + "epoch": 8.866336633663366, + "grad_norm": 0.31204140186309814, + "learning_rate": 3.4096618588572516e-05, + "loss": 0.0232, + "num_input_tokens_seen": 17009072, + "step": 80595 + }, + { + "epoch": 8.866886688668867, + "grad_norm": 0.011394036933779716, + "learning_rate": 3.409438299926018e-05, + "loss": 0.0061, + "num_input_tokens_seen": 17010064, + "step": 80600 + }, + { + "epoch": 8.867436743674368, + "grad_norm": 0.0808420181274414, + "learning_rate": 3.4092147326129274e-05, + "loss": 0.014, + "num_input_tokens_seen": 17011120, + "step": 80605 + }, + { + "epoch": 8.867986798679867, + "grad_norm": 1.1963938474655151, + "learning_rate": 3.408991156920041e-05, + "loss": 0.1321, + "num_input_tokens_seen": 17012112, + "step": 80610 + }, + { + "epoch": 8.868536853685368, + "grad_norm": 0.6852591037750244, + "learning_rate": 3.4087675728494204e-05, + "loss": 0.0133, + "num_input_tokens_seen": 17013168, + "step": 80615 + }, + { + "epoch": 8.86908690869087, + "grad_norm": 0.033372994512319565, + "learning_rate": 3.408543980403124e-05, + "loss": 0.0846, + "num_input_tokens_seen": 17014288, + "step": 80620 + }, + { + "epoch": 8.869636963696369, + "grad_norm": 0.09252085536718369, + "learning_rate": 3.4083203795832144e-05, + "loss": 0.0294, + "num_input_tokens_seen": 17015344, + "step": 80625 + }, + { + "epoch": 8.87018701870187, + "grad_norm": 0.11612076312303543, + "learning_rate": 3.408096770391751e-05, + "loss": 0.0125, + "num_input_tokens_seen": 17016400, + "step": 80630 + }, + { + "epoch": 8.870737073707371, + "grad_norm": 1.0052987337112427, + "learning_rate": 3.4078731528307955e-05, + "loss": 0.0561, + "num_input_tokens_seen": 17017424, + "step": 80635 + }, + { + "epoch": 8.871287128712872, + "grad_norm": 1.6677597761154175, + "learning_rate": 3.407649526902409e-05, + "loss": 0.0965, + "num_input_tokens_seen": 17018480, + "step": 80640 + }, + { + "epoch": 8.871837183718371, + "grad_norm": 0.8640955686569214, + "learning_rate": 3.4074258926086524e-05, + "loss": 0.0334, + "num_input_tokens_seen": 17019536, + "step": 80645 + }, + { + "epoch": 8.872387238723872, + "grad_norm": 0.5143478512763977, + "learning_rate": 3.407202249951586e-05, + "loss": 0.0182, + "num_input_tokens_seen": 17020688, + "step": 80650 + }, + { + "epoch": 8.872937293729374, + "grad_norm": 0.08757630735635757, + "learning_rate": 3.406978598933273e-05, + "loss": 0.0146, + "num_input_tokens_seen": 17021776, + "step": 80655 + }, + { + "epoch": 8.873487348734873, + "grad_norm": 0.6331567764282227, + "learning_rate": 3.406754939555773e-05, + "loss": 0.0127, + "num_input_tokens_seen": 17022768, + "step": 80660 + }, + { + "epoch": 8.874037403740374, + "grad_norm": 3.0461690425872803, + "learning_rate": 3.406531271821148e-05, + "loss": 0.0274, + "num_input_tokens_seen": 17023792, + "step": 80665 + }, + { + "epoch": 8.874587458745875, + "grad_norm": 0.20729757845401764, + "learning_rate": 3.406307595731459e-05, + "loss": 0.1053, + "num_input_tokens_seen": 17024816, + "step": 80670 + }, + { + "epoch": 8.875137513751374, + "grad_norm": 0.0977560505270958, + "learning_rate": 3.4060839112887683e-05, + "loss": 0.0043, + "num_input_tokens_seen": 17025872, + "step": 80675 + }, + { + "epoch": 8.875687568756875, + "grad_norm": 1.257399082183838, + "learning_rate": 3.4058602184951363e-05, + "loss": 0.0224, + "num_input_tokens_seen": 17026960, + "step": 80680 + }, + { + "epoch": 8.876237623762377, + "grad_norm": 0.038955722004175186, + "learning_rate": 3.405636517352625e-05, + "loss": 0.0461, + "num_input_tokens_seen": 17028048, + "step": 80685 + }, + { + "epoch": 8.876787678767876, + "grad_norm": 0.1780872344970703, + "learning_rate": 3.4054128078632975e-05, + "loss": 0.0465, + "num_input_tokens_seen": 17029104, + "step": 80690 + }, + { + "epoch": 8.877337733773377, + "grad_norm": 0.1392184942960739, + "learning_rate": 3.405189090029214e-05, + "loss": 0.0074, + "num_input_tokens_seen": 17030192, + "step": 80695 + }, + { + "epoch": 8.877887788778878, + "grad_norm": 0.006280157249420881, + "learning_rate": 3.404965363852437e-05, + "loss": 0.0425, + "num_input_tokens_seen": 17031280, + "step": 80700 + }, + { + "epoch": 8.87843784378438, + "grad_norm": 0.009534318000078201, + "learning_rate": 3.404741629335029e-05, + "loss": 0.0239, + "num_input_tokens_seen": 17032272, + "step": 80705 + }, + { + "epoch": 8.878987898789878, + "grad_norm": 0.041930750012397766, + "learning_rate": 3.404517886479051e-05, + "loss": 0.0673, + "num_input_tokens_seen": 17033296, + "step": 80710 + }, + { + "epoch": 8.87953795379538, + "grad_norm": 0.10484930127859116, + "learning_rate": 3.4042941352865657e-05, + "loss": 0.0082, + "num_input_tokens_seen": 17034352, + "step": 80715 + }, + { + "epoch": 8.88008800880088, + "grad_norm": 0.01633645035326481, + "learning_rate": 3.404070375759636e-05, + "loss": 0.023, + "num_input_tokens_seen": 17035440, + "step": 80720 + }, + { + "epoch": 8.88063806380638, + "grad_norm": 0.026302270591259003, + "learning_rate": 3.403846607900322e-05, + "loss": 0.0072, + "num_input_tokens_seen": 17036560, + "step": 80725 + }, + { + "epoch": 8.881188118811881, + "grad_norm": 0.06576701998710632, + "learning_rate": 3.403622831710689e-05, + "loss": 0.0067, + "num_input_tokens_seen": 17037616, + "step": 80730 + }, + { + "epoch": 8.881738173817382, + "grad_norm": 0.16522130370140076, + "learning_rate": 3.403399047192798e-05, + "loss": 0.0036, + "num_input_tokens_seen": 17038576, + "step": 80735 + }, + { + "epoch": 8.882288228822881, + "grad_norm": 0.006483507342636585, + "learning_rate": 3.4031752543487093e-05, + "loss": 0.1848, + "num_input_tokens_seen": 17039632, + "step": 80740 + }, + { + "epoch": 8.882838283828383, + "grad_norm": 1.0259342193603516, + "learning_rate": 3.40295145318049e-05, + "loss": 0.0346, + "num_input_tokens_seen": 17040688, + "step": 80745 + }, + { + "epoch": 8.883388338833884, + "grad_norm": 0.18561407923698425, + "learning_rate": 3.4027276436902e-05, + "loss": 0.0362, + "num_input_tokens_seen": 17041744, + "step": 80750 + }, + { + "epoch": 8.883938393839383, + "grad_norm": 0.18449808657169342, + "learning_rate": 3.4025038258799015e-05, + "loss": 0.0744, + "num_input_tokens_seen": 17042896, + "step": 80755 + }, + { + "epoch": 8.884488448844884, + "grad_norm": 0.13068479299545288, + "learning_rate": 3.402279999751659e-05, + "loss": 0.1055, + "num_input_tokens_seen": 17044080, + "step": 80760 + }, + { + "epoch": 8.885038503850385, + "grad_norm": 0.20537146925926208, + "learning_rate": 3.4020561653075356e-05, + "loss": 0.0128, + "num_input_tokens_seen": 17045104, + "step": 80765 + }, + { + "epoch": 8.885588558855886, + "grad_norm": 0.47817468643188477, + "learning_rate": 3.401832322549592e-05, + "loss": 0.0087, + "num_input_tokens_seen": 17046224, + "step": 80770 + }, + { + "epoch": 8.886138613861386, + "grad_norm": 0.22733592987060547, + "learning_rate": 3.401608471479893e-05, + "loss": 0.01, + "num_input_tokens_seen": 17047280, + "step": 80775 + }, + { + "epoch": 8.886688668866887, + "grad_norm": 1.14747953414917, + "learning_rate": 3.401384612100501e-05, + "loss": 0.0327, + "num_input_tokens_seen": 17048304, + "step": 80780 + }, + { + "epoch": 8.887238723872388, + "grad_norm": 0.014855019748210907, + "learning_rate": 3.40116074441348e-05, + "loss": 0.0206, + "num_input_tokens_seen": 17049360, + "step": 80785 + }, + { + "epoch": 8.887788778877887, + "grad_norm": 0.7455343008041382, + "learning_rate": 3.400936868420893e-05, + "loss": 0.1044, + "num_input_tokens_seen": 17050448, + "step": 80790 + }, + { + "epoch": 8.888338833883388, + "grad_norm": 0.5170316100120544, + "learning_rate": 3.400712984124803e-05, + "loss": 0.0967, + "num_input_tokens_seen": 17051472, + "step": 80795 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.2565501630306244, + "learning_rate": 3.400489091527274e-05, + "loss": 0.0098, + "num_input_tokens_seen": 17052496, + "step": 80800 + }, + { + "epoch": 8.88943894389439, + "grad_norm": 0.0423508919775486, + "learning_rate": 3.4002651906303676e-05, + "loss": 0.0094, + "num_input_tokens_seen": 17053488, + "step": 80805 + }, + { + "epoch": 8.88998899889989, + "grad_norm": 0.01797974295914173, + "learning_rate": 3.4000412814361504e-05, + "loss": 0.0301, + "num_input_tokens_seen": 17054608, + "step": 80810 + }, + { + "epoch": 8.89053905390539, + "grad_norm": 0.5160008668899536, + "learning_rate": 3.399817363946685e-05, + "loss": 0.0323, + "num_input_tokens_seen": 17055600, + "step": 80815 + }, + { + "epoch": 8.891089108910892, + "grad_norm": 0.12605270743370056, + "learning_rate": 3.3995934381640334e-05, + "loss": 0.0109, + "num_input_tokens_seen": 17056688, + "step": 80820 + }, + { + "epoch": 8.891639163916391, + "grad_norm": 0.2760677933692932, + "learning_rate": 3.399369504090262e-05, + "loss": 0.011, + "num_input_tokens_seen": 17057712, + "step": 80825 + }, + { + "epoch": 8.892189218921892, + "grad_norm": 0.39534440636634827, + "learning_rate": 3.3991455617274323e-05, + "loss": 0.1097, + "num_input_tokens_seen": 17058736, + "step": 80830 + }, + { + "epoch": 8.892739273927393, + "grad_norm": 0.006985912565141916, + "learning_rate": 3.3989216110776094e-05, + "loss": 0.0142, + "num_input_tokens_seen": 17059728, + "step": 80835 + }, + { + "epoch": 8.893289328932893, + "grad_norm": 0.014161384664475918, + "learning_rate": 3.398697652142858e-05, + "loss": 0.0188, + "num_input_tokens_seen": 17060816, + "step": 80840 + }, + { + "epoch": 8.893839383938394, + "grad_norm": 0.021299749612808228, + "learning_rate": 3.3984736849252416e-05, + "loss": 0.0058, + "num_input_tokens_seen": 17061840, + "step": 80845 + }, + { + "epoch": 8.894389438943895, + "grad_norm": 0.008152272552251816, + "learning_rate": 3.398249709426824e-05, + "loss": 0.0201, + "num_input_tokens_seen": 17062992, + "step": 80850 + }, + { + "epoch": 8.894939493949394, + "grad_norm": 0.028517330065369606, + "learning_rate": 3.3980257256496694e-05, + "loss": 0.1009, + "num_input_tokens_seen": 17064048, + "step": 80855 + }, + { + "epoch": 8.895489548954895, + "grad_norm": 0.8911230564117432, + "learning_rate": 3.397801733595843e-05, + "loss": 0.0164, + "num_input_tokens_seen": 17065104, + "step": 80860 + }, + { + "epoch": 8.896039603960396, + "grad_norm": 0.009027899242937565, + "learning_rate": 3.3975777332674106e-05, + "loss": 0.1844, + "num_input_tokens_seen": 17066160, + "step": 80865 + }, + { + "epoch": 8.896589658965897, + "grad_norm": 1.142063021659851, + "learning_rate": 3.3973537246664326e-05, + "loss": 0.0311, + "num_input_tokens_seen": 17067184, + "step": 80870 + }, + { + "epoch": 8.897139713971397, + "grad_norm": 0.08631671965122223, + "learning_rate": 3.397129707794977e-05, + "loss": 0.0571, + "num_input_tokens_seen": 17068272, + "step": 80875 + }, + { + "epoch": 8.897689768976898, + "grad_norm": 0.022464238107204437, + "learning_rate": 3.396905682655107e-05, + "loss": 0.0357, + "num_input_tokens_seen": 17069328, + "step": 80880 + }, + { + "epoch": 8.898239823982399, + "grad_norm": 0.01393450889736414, + "learning_rate": 3.396681649248888e-05, + "loss": 0.0487, + "num_input_tokens_seen": 17070384, + "step": 80885 + }, + { + "epoch": 8.898789878987898, + "grad_norm": 1.5945550203323364, + "learning_rate": 3.3964576075783846e-05, + "loss": 0.1221, + "num_input_tokens_seen": 17071440, + "step": 80890 + }, + { + "epoch": 8.8993399339934, + "grad_norm": 0.5080977082252502, + "learning_rate": 3.396233557645661e-05, + "loss": 0.0154, + "num_input_tokens_seen": 17072464, + "step": 80895 + }, + { + "epoch": 8.8998899889989, + "grad_norm": 0.0457625612616539, + "learning_rate": 3.396009499452782e-05, + "loss": 0.1136, + "num_input_tokens_seen": 17073488, + "step": 80900 + }, + { + "epoch": 8.9004400440044, + "grad_norm": 2.2098379135131836, + "learning_rate": 3.3957854330018154e-05, + "loss": 0.0616, + "num_input_tokens_seen": 17074512, + "step": 80905 + }, + { + "epoch": 8.900990099009901, + "grad_norm": 0.027332978323101997, + "learning_rate": 3.395561358294824e-05, + "loss": 0.0488, + "num_input_tokens_seen": 17075568, + "step": 80910 + }, + { + "epoch": 8.901540154015402, + "grad_norm": 0.107828788459301, + "learning_rate": 3.395337275333872e-05, + "loss": 0.0075, + "num_input_tokens_seen": 17076624, + "step": 80915 + }, + { + "epoch": 8.902090209020901, + "grad_norm": 0.018986687064170837, + "learning_rate": 3.3951131841210266e-05, + "loss": 0.003, + "num_input_tokens_seen": 17077680, + "step": 80920 + }, + { + "epoch": 8.902640264026402, + "grad_norm": 0.06658707559108734, + "learning_rate": 3.394889084658353e-05, + "loss": 0.0442, + "num_input_tokens_seen": 17078640, + "step": 80925 + }, + { + "epoch": 8.903190319031903, + "grad_norm": 0.0890030786395073, + "learning_rate": 3.3946649769479144e-05, + "loss": 0.0509, + "num_input_tokens_seen": 17079664, + "step": 80930 + }, + { + "epoch": 8.903740374037405, + "grad_norm": 0.2497488558292389, + "learning_rate": 3.39444086099178e-05, + "loss": 0.0206, + "num_input_tokens_seen": 17080656, + "step": 80935 + }, + { + "epoch": 8.904290429042904, + "grad_norm": 0.22019915282726288, + "learning_rate": 3.394216736792012e-05, + "loss": 0.0233, + "num_input_tokens_seen": 17081712, + "step": 80940 + }, + { + "epoch": 8.904840484048405, + "grad_norm": 0.01245495118200779, + "learning_rate": 3.393992604350678e-05, + "loss": 0.1366, + "num_input_tokens_seen": 17082800, + "step": 80945 + }, + { + "epoch": 8.905390539053906, + "grad_norm": 1.5203163623809814, + "learning_rate": 3.393768463669844e-05, + "loss": 0.0401, + "num_input_tokens_seen": 17083856, + "step": 80950 + }, + { + "epoch": 8.905940594059405, + "grad_norm": 3.523703098297119, + "learning_rate": 3.393544314751573e-05, + "loss": 0.0519, + "num_input_tokens_seen": 17084912, + "step": 80955 + }, + { + "epoch": 8.906490649064907, + "grad_norm": 0.7882192134857178, + "learning_rate": 3.3933201575979346e-05, + "loss": 0.166, + "num_input_tokens_seen": 17085936, + "step": 80960 + }, + { + "epoch": 8.907040704070408, + "grad_norm": 0.046455614268779755, + "learning_rate": 3.393095992210992e-05, + "loss": 0.0122, + "num_input_tokens_seen": 17087024, + "step": 80965 + }, + { + "epoch": 8.907590759075907, + "grad_norm": 0.015596335753798485, + "learning_rate": 3.3928718185928125e-05, + "loss": 0.0681, + "num_input_tokens_seen": 17088112, + "step": 80970 + }, + { + "epoch": 8.908140814081408, + "grad_norm": 0.1691405475139618, + "learning_rate": 3.392647636745462e-05, + "loss": 0.0179, + "num_input_tokens_seen": 17089168, + "step": 80975 + }, + { + "epoch": 8.908690869086909, + "grad_norm": 0.0690692737698555, + "learning_rate": 3.392423446671007e-05, + "loss": 0.0207, + "num_input_tokens_seen": 17090192, + "step": 80980 + }, + { + "epoch": 8.909240924092408, + "grad_norm": 0.472685843706131, + "learning_rate": 3.392199248371512e-05, + "loss": 0.0337, + "num_input_tokens_seen": 17091280, + "step": 80985 + }, + { + "epoch": 8.90979097909791, + "grad_norm": 0.13095568120479584, + "learning_rate": 3.3919750418490455e-05, + "loss": 0.0032, + "num_input_tokens_seen": 17092304, + "step": 80990 + }, + { + "epoch": 8.91034103410341, + "grad_norm": 0.0682823657989502, + "learning_rate": 3.3917508271056734e-05, + "loss": 0.005, + "num_input_tokens_seen": 17093360, + "step": 80995 + }, + { + "epoch": 8.910891089108912, + "grad_norm": 0.34248650074005127, + "learning_rate": 3.391526604143461e-05, + "loss": 0.0092, + "num_input_tokens_seen": 17094416, + "step": 81000 + }, + { + "epoch": 8.911441144114411, + "grad_norm": 0.09069689363241196, + "learning_rate": 3.391302372964476e-05, + "loss": 0.0043, + "num_input_tokens_seen": 17095376, + "step": 81005 + }, + { + "epoch": 8.911991199119912, + "grad_norm": 0.03472862020134926, + "learning_rate": 3.391078133570785e-05, + "loss": 0.033, + "num_input_tokens_seen": 17096400, + "step": 81010 + }, + { + "epoch": 8.912541254125413, + "grad_norm": 0.800744891166687, + "learning_rate": 3.390853885964454e-05, + "loss": 0.0327, + "num_input_tokens_seen": 17097488, + "step": 81015 + }, + { + "epoch": 8.913091309130913, + "grad_norm": 0.9339114427566528, + "learning_rate": 3.3906296301475506e-05, + "loss": 0.0553, + "num_input_tokens_seen": 17098576, + "step": 81020 + }, + { + "epoch": 8.913641364136414, + "grad_norm": 0.013638902455568314, + "learning_rate": 3.390405366122141e-05, + "loss": 0.0364, + "num_input_tokens_seen": 17099664, + "step": 81025 + }, + { + "epoch": 8.914191419141915, + "grad_norm": 0.044512998312711716, + "learning_rate": 3.390181093890292e-05, + "loss": 0.0023, + "num_input_tokens_seen": 17100752, + "step": 81030 + }, + { + "epoch": 8.914741474147414, + "grad_norm": 0.04397301375865936, + "learning_rate": 3.389956813454072e-05, + "loss": 0.0584, + "num_input_tokens_seen": 17101808, + "step": 81035 + }, + { + "epoch": 8.915291529152915, + "grad_norm": 0.04480822756886482, + "learning_rate": 3.3897325248155466e-05, + "loss": 0.0948, + "num_input_tokens_seen": 17102864, + "step": 81040 + }, + { + "epoch": 8.915841584158416, + "grad_norm": 0.06003805994987488, + "learning_rate": 3.389508227976783e-05, + "loss": 0.025, + "num_input_tokens_seen": 17103952, + "step": 81045 + }, + { + "epoch": 8.916391639163916, + "grad_norm": 0.015158343128859997, + "learning_rate": 3.389283922939849e-05, + "loss": 0.0096, + "num_input_tokens_seen": 17105040, + "step": 81050 + }, + { + "epoch": 8.916941694169417, + "grad_norm": 0.1947702020406723, + "learning_rate": 3.3890596097068125e-05, + "loss": 0.0319, + "num_input_tokens_seen": 17106128, + "step": 81055 + }, + { + "epoch": 8.917491749174918, + "grad_norm": 0.09002555161714554, + "learning_rate": 3.3888352882797397e-05, + "loss": 0.0061, + "num_input_tokens_seen": 17107152, + "step": 81060 + }, + { + "epoch": 8.918041804180419, + "grad_norm": 0.8579202890396118, + "learning_rate": 3.388610958660699e-05, + "loss": 0.0235, + "num_input_tokens_seen": 17108144, + "step": 81065 + }, + { + "epoch": 8.918591859185918, + "grad_norm": 0.01826510578393936, + "learning_rate": 3.3883866208517566e-05, + "loss": 0.0443, + "num_input_tokens_seen": 17109200, + "step": 81070 + }, + { + "epoch": 8.91914191419142, + "grad_norm": 0.04756290465593338, + "learning_rate": 3.388162274854983e-05, + "loss": 0.0287, + "num_input_tokens_seen": 17110224, + "step": 81075 + }, + { + "epoch": 8.91969196919692, + "grad_norm": 0.012906387448310852, + "learning_rate": 3.387937920672442e-05, + "loss": 0.0232, + "num_input_tokens_seen": 17111248, + "step": 81080 + }, + { + "epoch": 8.92024202420242, + "grad_norm": 0.033107463270425797, + "learning_rate": 3.387713558306204e-05, + "loss": 0.0792, + "num_input_tokens_seen": 17112272, + "step": 81085 + }, + { + "epoch": 8.92079207920792, + "grad_norm": 0.021927786991000175, + "learning_rate": 3.387489187758335e-05, + "loss": 0.0752, + "num_input_tokens_seen": 17113328, + "step": 81090 + }, + { + "epoch": 8.921342134213422, + "grad_norm": 1.2964258193969727, + "learning_rate": 3.387264809030904e-05, + "loss": 0.1223, + "num_input_tokens_seen": 17114448, + "step": 81095 + }, + { + "epoch": 8.921892189218921, + "grad_norm": 0.05549916252493858, + "learning_rate": 3.38704042212598e-05, + "loss": 0.0052, + "num_input_tokens_seen": 17115472, + "step": 81100 + }, + { + "epoch": 8.922442244224422, + "grad_norm": 0.03613827005028725, + "learning_rate": 3.3868160270456305e-05, + "loss": 0.0166, + "num_input_tokens_seen": 17116464, + "step": 81105 + }, + { + "epoch": 8.922992299229923, + "grad_norm": 0.017803341150283813, + "learning_rate": 3.386591623791922e-05, + "loss": 0.0014, + "num_input_tokens_seen": 17117520, + "step": 81110 + }, + { + "epoch": 8.923542354235423, + "grad_norm": 0.016022324562072754, + "learning_rate": 3.386367212366925e-05, + "loss": 0.0207, + "num_input_tokens_seen": 17118608, + "step": 81115 + }, + { + "epoch": 8.924092409240924, + "grad_norm": 0.08729075640439987, + "learning_rate": 3.386142792772706e-05, + "loss": 0.0168, + "num_input_tokens_seen": 17119728, + "step": 81120 + }, + { + "epoch": 8.924642464246425, + "grad_norm": 0.021901054307818413, + "learning_rate": 3.3859183650113355e-05, + "loss": 0.0257, + "num_input_tokens_seen": 17120816, + "step": 81125 + }, + { + "epoch": 8.925192519251926, + "grad_norm": 1.0007634162902832, + "learning_rate": 3.3856939290848784e-05, + "loss": 0.0612, + "num_input_tokens_seen": 17121904, + "step": 81130 + }, + { + "epoch": 8.925742574257425, + "grad_norm": 0.12729601562023163, + "learning_rate": 3.385469484995407e-05, + "loss": 0.042, + "num_input_tokens_seen": 17122960, + "step": 81135 + }, + { + "epoch": 8.926292629262926, + "grad_norm": 2.018359899520874, + "learning_rate": 3.385245032744987e-05, + "loss": 0.0256, + "num_input_tokens_seen": 17124016, + "step": 81140 + }, + { + "epoch": 8.926842684268427, + "grad_norm": 0.14829017221927643, + "learning_rate": 3.385020572335689e-05, + "loss": 0.0067, + "num_input_tokens_seen": 17125136, + "step": 81145 + }, + { + "epoch": 8.927392739273927, + "grad_norm": 1.2961291074752808, + "learning_rate": 3.384796103769581e-05, + "loss": 0.0656, + "num_input_tokens_seen": 17126192, + "step": 81150 + }, + { + "epoch": 8.927942794279428, + "grad_norm": 0.10095018893480301, + "learning_rate": 3.384571627048732e-05, + "loss": 0.0159, + "num_input_tokens_seen": 17127248, + "step": 81155 + }, + { + "epoch": 8.928492849284929, + "grad_norm": 0.04864446818828583, + "learning_rate": 3.384347142175211e-05, + "loss": 0.0586, + "num_input_tokens_seen": 17128208, + "step": 81160 + }, + { + "epoch": 8.929042904290428, + "grad_norm": 0.029995154589414597, + "learning_rate": 3.384122649151086e-05, + "loss": 0.0153, + "num_input_tokens_seen": 17129232, + "step": 81165 + }, + { + "epoch": 8.92959295929593, + "grad_norm": 0.0275319442152977, + "learning_rate": 3.383898147978427e-05, + "loss": 0.0025, + "num_input_tokens_seen": 17130256, + "step": 81170 + }, + { + "epoch": 8.93014301430143, + "grad_norm": 0.0078025758266448975, + "learning_rate": 3.3836736386593026e-05, + "loss": 0.0363, + "num_input_tokens_seen": 17131312, + "step": 81175 + }, + { + "epoch": 8.930693069306932, + "grad_norm": 1.0560448169708252, + "learning_rate": 3.383449121195783e-05, + "loss": 0.0224, + "num_input_tokens_seen": 17132464, + "step": 81180 + }, + { + "epoch": 8.93124312431243, + "grad_norm": 1.6287133693695068, + "learning_rate": 3.383224595589937e-05, + "loss": 0.0194, + "num_input_tokens_seen": 17133520, + "step": 81185 + }, + { + "epoch": 8.931793179317932, + "grad_norm": 0.06131140515208244, + "learning_rate": 3.383000061843833e-05, + "loss": 0.0264, + "num_input_tokens_seen": 17134544, + "step": 81190 + }, + { + "epoch": 8.932343234323433, + "grad_norm": 0.696057915687561, + "learning_rate": 3.382775519959541e-05, + "loss": 0.0419, + "num_input_tokens_seen": 17135536, + "step": 81195 + }, + { + "epoch": 8.932893289328932, + "grad_norm": 1.6478767395019531, + "learning_rate": 3.382550969939132e-05, + "loss": 0.1334, + "num_input_tokens_seen": 17136624, + "step": 81200 + }, + { + "epoch": 8.933443344334433, + "grad_norm": 0.4959489703178406, + "learning_rate": 3.382326411784672e-05, + "loss": 0.0979, + "num_input_tokens_seen": 17137616, + "step": 81205 + }, + { + "epoch": 8.933993399339935, + "grad_norm": 0.23235230147838593, + "learning_rate": 3.382101845498235e-05, + "loss": 0.026, + "num_input_tokens_seen": 17138672, + "step": 81210 + }, + { + "epoch": 8.934543454345434, + "grad_norm": 1.2711559534072876, + "learning_rate": 3.381877271081887e-05, + "loss": 0.0711, + "num_input_tokens_seen": 17139728, + "step": 81215 + }, + { + "epoch": 8.935093509350935, + "grad_norm": 0.01724550500512123, + "learning_rate": 3.3816526885377006e-05, + "loss": 0.0555, + "num_input_tokens_seen": 17140848, + "step": 81220 + }, + { + "epoch": 8.935643564356436, + "grad_norm": 0.32943257689476013, + "learning_rate": 3.381428097867744e-05, + "loss": 0.0235, + "num_input_tokens_seen": 17141936, + "step": 81225 + }, + { + "epoch": 8.936193619361937, + "grad_norm": 0.03868846967816353, + "learning_rate": 3.381203499074087e-05, + "loss": 0.0737, + "num_input_tokens_seen": 17142992, + "step": 81230 + }, + { + "epoch": 8.936743674367436, + "grad_norm": 0.04134064167737961, + "learning_rate": 3.3809788921588e-05, + "loss": 0.0269, + "num_input_tokens_seen": 17144048, + "step": 81235 + }, + { + "epoch": 8.937293729372938, + "grad_norm": 0.04892521724104881, + "learning_rate": 3.3807542771239544e-05, + "loss": 0.01, + "num_input_tokens_seen": 17145072, + "step": 81240 + }, + { + "epoch": 8.937843784378439, + "grad_norm": 0.019215602427721024, + "learning_rate": 3.380529653971619e-05, + "loss": 0.0134, + "num_input_tokens_seen": 17146128, + "step": 81245 + }, + { + "epoch": 8.938393839383938, + "grad_norm": 1.5185048580169678, + "learning_rate": 3.3803050227038635e-05, + "loss": 0.0532, + "num_input_tokens_seen": 17147216, + "step": 81250 + }, + { + "epoch": 8.938943894389439, + "grad_norm": 0.06784386187791824, + "learning_rate": 3.3800803833227596e-05, + "loss": 0.0679, + "num_input_tokens_seen": 17148304, + "step": 81255 + }, + { + "epoch": 8.93949394939494, + "grad_norm": 0.015805697068572044, + "learning_rate": 3.379855735830377e-05, + "loss": 0.0837, + "num_input_tokens_seen": 17149360, + "step": 81260 + }, + { + "epoch": 8.94004400440044, + "grad_norm": 0.07420874387025833, + "learning_rate": 3.3796310802287864e-05, + "loss": 0.0358, + "num_input_tokens_seen": 17150448, + "step": 81265 + }, + { + "epoch": 8.94059405940594, + "grad_norm": 0.037294793874025345, + "learning_rate": 3.379406416520058e-05, + "loss": 0.014, + "num_input_tokens_seen": 17151440, + "step": 81270 + }, + { + "epoch": 8.941144114411442, + "grad_norm": 0.01456552091985941, + "learning_rate": 3.379181744706263e-05, + "loss": 0.0433, + "num_input_tokens_seen": 17152496, + "step": 81275 + }, + { + "epoch": 8.941694169416941, + "grad_norm": 0.04649248346686363, + "learning_rate": 3.3789570647894706e-05, + "loss": 0.0171, + "num_input_tokens_seen": 17153520, + "step": 81280 + }, + { + "epoch": 8.942244224422442, + "grad_norm": 1.1609923839569092, + "learning_rate": 3.378732376771754e-05, + "loss": 0.0366, + "num_input_tokens_seen": 17154608, + "step": 81285 + }, + { + "epoch": 8.942794279427943, + "grad_norm": 0.16348965466022491, + "learning_rate": 3.3785076806551824e-05, + "loss": 0.02, + "num_input_tokens_seen": 17155632, + "step": 81290 + }, + { + "epoch": 8.943344334433444, + "grad_norm": 0.67772376537323, + "learning_rate": 3.378282976441827e-05, + "loss": 0.0924, + "num_input_tokens_seen": 17156656, + "step": 81295 + }, + { + "epoch": 8.943894389438944, + "grad_norm": 0.07913567870855331, + "learning_rate": 3.3780582641337586e-05, + "loss": 0.0683, + "num_input_tokens_seen": 17157744, + "step": 81300 + }, + { + "epoch": 8.944444444444445, + "grad_norm": 0.07350532710552216, + "learning_rate": 3.3778335437330494e-05, + "loss": 0.0114, + "num_input_tokens_seen": 17158832, + "step": 81305 + }, + { + "epoch": 8.944994499449946, + "grad_norm": 0.06629416346549988, + "learning_rate": 3.377608815241769e-05, + "loss": 0.0378, + "num_input_tokens_seen": 17159888, + "step": 81310 + }, + { + "epoch": 8.945544554455445, + "grad_norm": 0.10121463239192963, + "learning_rate": 3.3773840786619896e-05, + "loss": 0.0308, + "num_input_tokens_seen": 17160976, + "step": 81315 + }, + { + "epoch": 8.946094609460946, + "grad_norm": 0.06366744637489319, + "learning_rate": 3.3771593339957806e-05, + "loss": 0.0307, + "num_input_tokens_seen": 17162032, + "step": 81320 + }, + { + "epoch": 8.946644664466447, + "grad_norm": 1.0920028686523438, + "learning_rate": 3.376934581245217e-05, + "loss": 0.0332, + "num_input_tokens_seen": 17163120, + "step": 81325 + }, + { + "epoch": 8.947194719471947, + "grad_norm": 0.5865985155105591, + "learning_rate": 3.376709820412367e-05, + "loss": 0.0456, + "num_input_tokens_seen": 17164176, + "step": 81330 + }, + { + "epoch": 8.947744774477448, + "grad_norm": 0.5255133509635925, + "learning_rate": 3.376485051499303e-05, + "loss": 0.0634, + "num_input_tokens_seen": 17165232, + "step": 81335 + }, + { + "epoch": 8.948294829482949, + "grad_norm": 1.0536054372787476, + "learning_rate": 3.3762602745080985e-05, + "loss": 0.1507, + "num_input_tokens_seen": 17166192, + "step": 81340 + }, + { + "epoch": 8.948844884488448, + "grad_norm": 0.024072933942079544, + "learning_rate": 3.376035489440822e-05, + "loss": 0.0065, + "num_input_tokens_seen": 17167248, + "step": 81345 + }, + { + "epoch": 8.94939493949395, + "grad_norm": 0.02383730188012123, + "learning_rate": 3.3758106962995465e-05, + "loss": 0.0058, + "num_input_tokens_seen": 17168336, + "step": 81350 + }, + { + "epoch": 8.94994499449945, + "grad_norm": 0.03137257695198059, + "learning_rate": 3.375585895086345e-05, + "loss": 0.0037, + "num_input_tokens_seen": 17169424, + "step": 81355 + }, + { + "epoch": 8.950495049504951, + "grad_norm": 0.03711966425180435, + "learning_rate": 3.375361085803289e-05, + "loss": 0.0093, + "num_input_tokens_seen": 17170480, + "step": 81360 + }, + { + "epoch": 8.95104510451045, + "grad_norm": 0.05280962958931923, + "learning_rate": 3.375136268452449e-05, + "loss": 0.0115, + "num_input_tokens_seen": 17171536, + "step": 81365 + }, + { + "epoch": 8.951595159515952, + "grad_norm": 0.060280293226242065, + "learning_rate": 3.374911443035898e-05, + "loss": 0.0083, + "num_input_tokens_seen": 17172624, + "step": 81370 + }, + { + "epoch": 8.952145214521453, + "grad_norm": 0.31695756316185, + "learning_rate": 3.374686609555708e-05, + "loss": 0.0846, + "num_input_tokens_seen": 17173680, + "step": 81375 + }, + { + "epoch": 8.952695269526952, + "grad_norm": 0.27431824803352356, + "learning_rate": 3.374461768013951e-05, + "loss": 0.0051, + "num_input_tokens_seen": 17174704, + "step": 81380 + }, + { + "epoch": 8.953245324532453, + "grad_norm": 0.07058743387460709, + "learning_rate": 3.3742369184127005e-05, + "loss": 0.1189, + "num_input_tokens_seen": 17175792, + "step": 81385 + }, + { + "epoch": 8.953795379537954, + "grad_norm": 0.9736048579216003, + "learning_rate": 3.374012060754027e-05, + "loss": 0.0183, + "num_input_tokens_seen": 17176880, + "step": 81390 + }, + { + "epoch": 8.954345434543454, + "grad_norm": 0.17703311145305634, + "learning_rate": 3.373787195040003e-05, + "loss": 0.1919, + "num_input_tokens_seen": 17177968, + "step": 81395 + }, + { + "epoch": 8.954895489548955, + "grad_norm": 0.24320487678050995, + "learning_rate": 3.373562321272704e-05, + "loss": 0.0128, + "num_input_tokens_seen": 17179056, + "step": 81400 + }, + { + "epoch": 8.955445544554456, + "grad_norm": 0.06848824769258499, + "learning_rate": 3.3733374394541984e-05, + "loss": 0.0078, + "num_input_tokens_seen": 17180144, + "step": 81405 + }, + { + "epoch": 8.955995599559955, + "grad_norm": 1.0471620559692383, + "learning_rate": 3.373112549586561e-05, + "loss": 0.0441, + "num_input_tokens_seen": 17181168, + "step": 81410 + }, + { + "epoch": 8.956545654565456, + "grad_norm": 0.04101433977484703, + "learning_rate": 3.372887651671864e-05, + "loss": 0.0848, + "num_input_tokens_seen": 17182288, + "step": 81415 + }, + { + "epoch": 8.957095709570957, + "grad_norm": 0.04078705608844757, + "learning_rate": 3.372662745712182e-05, + "loss": 0.0105, + "num_input_tokens_seen": 17183344, + "step": 81420 + }, + { + "epoch": 8.957645764576458, + "grad_norm": 1.4385290145874023, + "learning_rate": 3.372437831709584e-05, + "loss": 0.0786, + "num_input_tokens_seen": 17184400, + "step": 81425 + }, + { + "epoch": 8.958195819581958, + "grad_norm": 0.14558926224708557, + "learning_rate": 3.372212909666146e-05, + "loss": 0.0338, + "num_input_tokens_seen": 17185424, + "step": 81430 + }, + { + "epoch": 8.958745874587459, + "grad_norm": 0.01881900615990162, + "learning_rate": 3.3719879795839406e-05, + "loss": 0.0431, + "num_input_tokens_seen": 17186448, + "step": 81435 + }, + { + "epoch": 8.95929592959296, + "grad_norm": 0.7687557339668274, + "learning_rate": 3.37176304146504e-05, + "loss": 0.0209, + "num_input_tokens_seen": 17187440, + "step": 81440 + }, + { + "epoch": 8.95984598459846, + "grad_norm": 0.008822948671877384, + "learning_rate": 3.3715380953115175e-05, + "loss": 0.0073, + "num_input_tokens_seen": 17188624, + "step": 81445 + }, + { + "epoch": 8.96039603960396, + "grad_norm": 0.03764931112527847, + "learning_rate": 3.371313141125447e-05, + "loss": 0.0247, + "num_input_tokens_seen": 17189744, + "step": 81450 + }, + { + "epoch": 8.960946094609461, + "grad_norm": 0.9983534216880798, + "learning_rate": 3.371088178908901e-05, + "loss": 0.1495, + "num_input_tokens_seen": 17190800, + "step": 81455 + }, + { + "epoch": 8.96149614961496, + "grad_norm": 0.22816088795661926, + "learning_rate": 3.370863208663954e-05, + "loss": 0.0209, + "num_input_tokens_seen": 17191824, + "step": 81460 + }, + { + "epoch": 8.962046204620462, + "grad_norm": 0.34537363052368164, + "learning_rate": 3.370638230392678e-05, + "loss": 0.0063, + "num_input_tokens_seen": 17192880, + "step": 81465 + }, + { + "epoch": 8.962596259625963, + "grad_norm": 1.280818223953247, + "learning_rate": 3.3704132440971474e-05, + "loss": 0.0412, + "num_input_tokens_seen": 17193936, + "step": 81470 + }, + { + "epoch": 8.963146314631462, + "grad_norm": 0.0046029966324567795, + "learning_rate": 3.370188249779436e-05, + "loss": 0.0345, + "num_input_tokens_seen": 17194960, + "step": 81475 + }, + { + "epoch": 8.963696369636963, + "grad_norm": 0.10552777349948883, + "learning_rate": 3.369963247441617e-05, + "loss": 0.0065, + "num_input_tokens_seen": 17195984, + "step": 81480 + }, + { + "epoch": 8.964246424642464, + "grad_norm": 0.5756309628486633, + "learning_rate": 3.369738237085763e-05, + "loss": 0.0541, + "num_input_tokens_seen": 17197040, + "step": 81485 + }, + { + "epoch": 8.964796479647966, + "grad_norm": 0.9039140939712524, + "learning_rate": 3.36951321871395e-05, + "loss": 0.08, + "num_input_tokens_seen": 17198096, + "step": 81490 + }, + { + "epoch": 8.965346534653465, + "grad_norm": 0.05059261620044708, + "learning_rate": 3.3692881923282506e-05, + "loss": 0.0091, + "num_input_tokens_seen": 17199120, + "step": 81495 + }, + { + "epoch": 8.965896589658966, + "grad_norm": 0.014379896223545074, + "learning_rate": 3.369063157930739e-05, + "loss": 0.027, + "num_input_tokens_seen": 17200112, + "step": 81500 + }, + { + "epoch": 8.966446644664467, + "grad_norm": 1.757084846496582, + "learning_rate": 3.3688381155234904e-05, + "loss": 0.1687, + "num_input_tokens_seen": 17201200, + "step": 81505 + }, + { + "epoch": 8.966996699669966, + "grad_norm": 1.1107522249221802, + "learning_rate": 3.3686130651085764e-05, + "loss": 0.0713, + "num_input_tokens_seen": 17202224, + "step": 81510 + }, + { + "epoch": 8.967546754675467, + "grad_norm": 0.01905072294175625, + "learning_rate": 3.368388006688073e-05, + "loss": 0.0249, + "num_input_tokens_seen": 17203312, + "step": 81515 + }, + { + "epoch": 8.968096809680969, + "grad_norm": 0.07777831703424454, + "learning_rate": 3.368162940264054e-05, + "loss": 0.0682, + "num_input_tokens_seen": 17204368, + "step": 81520 + }, + { + "epoch": 8.968646864686468, + "grad_norm": 1.8694642782211304, + "learning_rate": 3.367937865838594e-05, + "loss": 0.0372, + "num_input_tokens_seen": 17205392, + "step": 81525 + }, + { + "epoch": 8.969196919691969, + "grad_norm": 0.021307390183210373, + "learning_rate": 3.3677127834137674e-05, + "loss": 0.0459, + "num_input_tokens_seen": 17206416, + "step": 81530 + }, + { + "epoch": 8.96974697469747, + "grad_norm": 0.007033415604382753, + "learning_rate": 3.3674876929916476e-05, + "loss": 0.0038, + "num_input_tokens_seen": 17207440, + "step": 81535 + }, + { + "epoch": 8.97029702970297, + "grad_norm": 0.5914103984832764, + "learning_rate": 3.3672625945743106e-05, + "loss": 0.0228, + "num_input_tokens_seen": 17208432, + "step": 81540 + }, + { + "epoch": 8.97084708470847, + "grad_norm": 0.6785745024681091, + "learning_rate": 3.36703748816383e-05, + "loss": 0.0207, + "num_input_tokens_seen": 17209520, + "step": 81545 + }, + { + "epoch": 8.971397139713972, + "grad_norm": 0.03732145577669144, + "learning_rate": 3.366812373762282e-05, + "loss": 0.012, + "num_input_tokens_seen": 17210544, + "step": 81550 + }, + { + "epoch": 8.971947194719473, + "grad_norm": 0.45687517523765564, + "learning_rate": 3.366587251371739e-05, + "loss": 0.0934, + "num_input_tokens_seen": 17211632, + "step": 81555 + }, + { + "epoch": 8.972497249724972, + "grad_norm": 0.7521078586578369, + "learning_rate": 3.366362120994277e-05, + "loss": 0.0462, + "num_input_tokens_seen": 17212688, + "step": 81560 + }, + { + "epoch": 8.973047304730473, + "grad_norm": 1.2961879968643188, + "learning_rate": 3.366136982631973e-05, + "loss": 0.0342, + "num_input_tokens_seen": 17213744, + "step": 81565 + }, + { + "epoch": 8.973597359735974, + "grad_norm": 0.0634499043226242, + "learning_rate": 3.365911836286898e-05, + "loss": 0.0825, + "num_input_tokens_seen": 17214736, + "step": 81570 + }, + { + "epoch": 8.974147414741473, + "grad_norm": 0.011278395541012287, + "learning_rate": 3.3656866819611306e-05, + "loss": 0.0163, + "num_input_tokens_seen": 17215824, + "step": 81575 + }, + { + "epoch": 8.974697469746975, + "grad_norm": 0.07230111956596375, + "learning_rate": 3.3654615196567435e-05, + "loss": 0.0069, + "num_input_tokens_seen": 17216912, + "step": 81580 + }, + { + "epoch": 8.975247524752476, + "grad_norm": 0.13574835658073425, + "learning_rate": 3.3652363493758126e-05, + "loss": 0.0072, + "num_input_tokens_seen": 17217904, + "step": 81585 + }, + { + "epoch": 8.975797579757975, + "grad_norm": 1.3252025842666626, + "learning_rate": 3.365011171120414e-05, + "loss": 0.1511, + "num_input_tokens_seen": 17218928, + "step": 81590 + }, + { + "epoch": 8.976347634763476, + "grad_norm": 0.10303164273500443, + "learning_rate": 3.364785984892622e-05, + "loss": 0.1391, + "num_input_tokens_seen": 17220016, + "step": 81595 + }, + { + "epoch": 8.976897689768977, + "grad_norm": 0.09691662341356277, + "learning_rate": 3.3645607906945134e-05, + "loss": 0.0072, + "num_input_tokens_seen": 17221008, + "step": 81600 + }, + { + "epoch": 8.977447744774478, + "grad_norm": 0.004636889323592186, + "learning_rate": 3.364335588528162e-05, + "loss": 0.018, + "num_input_tokens_seen": 17222032, + "step": 81605 + }, + { + "epoch": 8.977997799779978, + "grad_norm": 0.7188406586647034, + "learning_rate": 3.364110378395645e-05, + "loss": 0.0891, + "num_input_tokens_seen": 17223056, + "step": 81610 + }, + { + "epoch": 8.978547854785479, + "grad_norm": 1.940394401550293, + "learning_rate": 3.363885160299037e-05, + "loss": 0.0985, + "num_input_tokens_seen": 17224176, + "step": 81615 + }, + { + "epoch": 8.97909790979098, + "grad_norm": 1.0499929189682007, + "learning_rate": 3.363659934240414e-05, + "loss": 0.0648, + "num_input_tokens_seen": 17225232, + "step": 81620 + }, + { + "epoch": 8.979647964796479, + "grad_norm": 0.03305945172905922, + "learning_rate": 3.3634347002218514e-05, + "loss": 0.0542, + "num_input_tokens_seen": 17226256, + "step": 81625 + }, + { + "epoch": 8.98019801980198, + "grad_norm": 0.33730071783065796, + "learning_rate": 3.363209458245426e-05, + "loss": 0.1231, + "num_input_tokens_seen": 17227280, + "step": 81630 + }, + { + "epoch": 8.980748074807481, + "grad_norm": 0.05558359995484352, + "learning_rate": 3.3629842083132126e-05, + "loss": 0.0441, + "num_input_tokens_seen": 17228368, + "step": 81635 + }, + { + "epoch": 8.98129812981298, + "grad_norm": 0.2042393684387207, + "learning_rate": 3.3627589504272886e-05, + "loss": 0.0387, + "num_input_tokens_seen": 17229424, + "step": 81640 + }, + { + "epoch": 8.981848184818482, + "grad_norm": 0.8511958718299866, + "learning_rate": 3.3625336845897296e-05, + "loss": 0.0385, + "num_input_tokens_seen": 17230544, + "step": 81645 + }, + { + "epoch": 8.982398239823983, + "grad_norm": 0.01880895532667637, + "learning_rate": 3.362308410802611e-05, + "loss": 0.0102, + "num_input_tokens_seen": 17231664, + "step": 81650 + }, + { + "epoch": 8.982948294829484, + "grad_norm": 0.14906415343284607, + "learning_rate": 3.362083129068009e-05, + "loss": 0.0141, + "num_input_tokens_seen": 17232720, + "step": 81655 + }, + { + "epoch": 8.983498349834983, + "grad_norm": 0.9376741051673889, + "learning_rate": 3.361857839388001e-05, + "loss": 0.0673, + "num_input_tokens_seen": 17233712, + "step": 81660 + }, + { + "epoch": 8.984048404840484, + "grad_norm": 0.5151171088218689, + "learning_rate": 3.3616325417646624e-05, + "loss": 0.0267, + "num_input_tokens_seen": 17234768, + "step": 81665 + }, + { + "epoch": 8.984598459845985, + "grad_norm": 0.04124909266829491, + "learning_rate": 3.361407236200071e-05, + "loss": 0.0076, + "num_input_tokens_seen": 17235824, + "step": 81670 + }, + { + "epoch": 8.985148514851485, + "grad_norm": 0.5930317640304565, + "learning_rate": 3.361181922696302e-05, + "loss": 0.0199, + "num_input_tokens_seen": 17236880, + "step": 81675 + }, + { + "epoch": 8.985698569856986, + "grad_norm": 2.7953639030456543, + "learning_rate": 3.360956601255433e-05, + "loss": 0.1151, + "num_input_tokens_seen": 17237936, + "step": 81680 + }, + { + "epoch": 8.986248624862487, + "grad_norm": 0.16565053164958954, + "learning_rate": 3.360731271879538e-05, + "loss": 0.0814, + "num_input_tokens_seen": 17238992, + "step": 81685 + }, + { + "epoch": 8.986798679867986, + "grad_norm": 1.7802187204360962, + "learning_rate": 3.360505934570698e-05, + "loss": 0.0538, + "num_input_tokens_seen": 17239984, + "step": 81690 + }, + { + "epoch": 8.987348734873487, + "grad_norm": 0.4406736493110657, + "learning_rate": 3.360280589330988e-05, + "loss": 0.0246, + "num_input_tokens_seen": 17241104, + "step": 81695 + }, + { + "epoch": 8.987898789878988, + "grad_norm": 0.670829713344574, + "learning_rate": 3.3600552361624833e-05, + "loss": 0.0442, + "num_input_tokens_seen": 17242128, + "step": 81700 + }, + { + "epoch": 8.988448844884488, + "grad_norm": 0.11553215235471725, + "learning_rate": 3.359829875067263e-05, + "loss": 0.0067, + "num_input_tokens_seen": 17243216, + "step": 81705 + }, + { + "epoch": 8.988998899889989, + "grad_norm": 0.04394998401403427, + "learning_rate": 3.359604506047403e-05, + "loss": 0.0048, + "num_input_tokens_seen": 17244368, + "step": 81710 + }, + { + "epoch": 8.98954895489549, + "grad_norm": 0.8849867582321167, + "learning_rate": 3.35937912910498e-05, + "loss": 0.0357, + "num_input_tokens_seen": 17245456, + "step": 81715 + }, + { + "epoch": 8.990099009900991, + "grad_norm": 0.18081139028072357, + "learning_rate": 3.359153744242073e-05, + "loss": 0.0074, + "num_input_tokens_seen": 17246480, + "step": 81720 + }, + { + "epoch": 8.99064906490649, + "grad_norm": 0.024219371378421783, + "learning_rate": 3.358928351460758e-05, + "loss": 0.0074, + "num_input_tokens_seen": 17247504, + "step": 81725 + }, + { + "epoch": 8.991199119911991, + "grad_norm": 0.036065809428691864, + "learning_rate": 3.358702950763112e-05, + "loss": 0.0128, + "num_input_tokens_seen": 17248560, + "step": 81730 + }, + { + "epoch": 8.991749174917492, + "grad_norm": 0.06519243121147156, + "learning_rate": 3.358477542151214e-05, + "loss": 0.0027, + "num_input_tokens_seen": 17249616, + "step": 81735 + }, + { + "epoch": 8.992299229922992, + "grad_norm": 1.1422996520996094, + "learning_rate": 3.35825212562714e-05, + "loss": 0.0993, + "num_input_tokens_seen": 17250704, + "step": 81740 + }, + { + "epoch": 8.992849284928493, + "grad_norm": 0.022308295592665672, + "learning_rate": 3.358026701192968e-05, + "loss": 0.0343, + "num_input_tokens_seen": 17251856, + "step": 81745 + }, + { + "epoch": 8.993399339933994, + "grad_norm": 1.4698256254196167, + "learning_rate": 3.357801268850775e-05, + "loss": 0.0476, + "num_input_tokens_seen": 17252912, + "step": 81750 + }, + { + "epoch": 8.993949394939493, + "grad_norm": 0.2722921371459961, + "learning_rate": 3.35757582860264e-05, + "loss": 0.0442, + "num_input_tokens_seen": 17254064, + "step": 81755 + }, + { + "epoch": 8.994499449944994, + "grad_norm": 1.4327874183654785, + "learning_rate": 3.357350380450639e-05, + "loss": 0.1434, + "num_input_tokens_seen": 17255088, + "step": 81760 + }, + { + "epoch": 8.995049504950495, + "grad_norm": 1.702820062637329, + "learning_rate": 3.357124924396853e-05, + "loss": 0.0342, + "num_input_tokens_seen": 17256112, + "step": 81765 + }, + { + "epoch": 8.995599559955995, + "grad_norm": 0.1488395482301712, + "learning_rate": 3.356899460443356e-05, + "loss": 0.0219, + "num_input_tokens_seen": 17257104, + "step": 81770 + }, + { + "epoch": 8.996149614961496, + "grad_norm": 0.01939750462770462, + "learning_rate": 3.356673988592228e-05, + "loss": 0.0555, + "num_input_tokens_seen": 17258096, + "step": 81775 + }, + { + "epoch": 8.996699669966997, + "grad_norm": 0.003983430564403534, + "learning_rate": 3.356448508845548e-05, + "loss": 0.0292, + "num_input_tokens_seen": 17259216, + "step": 81780 + }, + { + "epoch": 8.997249724972498, + "grad_norm": 1.4421852827072144, + "learning_rate": 3.356223021205392e-05, + "loss": 0.0559, + "num_input_tokens_seen": 17260336, + "step": 81785 + }, + { + "epoch": 8.997799779977997, + "grad_norm": 0.29333850741386414, + "learning_rate": 3.35599752567384e-05, + "loss": 0.0066, + "num_input_tokens_seen": 17261328, + "step": 81790 + }, + { + "epoch": 8.998349834983498, + "grad_norm": 0.191339373588562, + "learning_rate": 3.355772022252969e-05, + "loss": 0.044, + "num_input_tokens_seen": 17262320, + "step": 81795 + }, + { + "epoch": 8.998899889989, + "grad_norm": 0.09212575852870941, + "learning_rate": 3.355546510944858e-05, + "loss": 0.0481, + "num_input_tokens_seen": 17263376, + "step": 81800 + }, + { + "epoch": 8.999449944994499, + "grad_norm": 0.3021642863750458, + "learning_rate": 3.355320991751586e-05, + "loss": 0.0064, + "num_input_tokens_seen": 17264432, + "step": 81805 + }, + { + "epoch": 9.0, + "grad_norm": 0.007250255439430475, + "learning_rate": 3.3550954646752304e-05, + "loss": 0.0722, + "num_input_tokens_seen": 17265344, + "step": 81810 + }, + { + "epoch": 9.0, + "eval_loss": 0.06155921891331673, + "eval_runtime": 37.0833, + "eval_samples_per_second": 108.944, + "eval_steps_per_second": 27.236, + "num_input_tokens_seen": 17265344, + "step": 81810 + }, + { + "epoch": 9.000550055005501, + "grad_norm": 0.031099241226911545, + "learning_rate": 3.35486992971787e-05, + "loss": 0.0094, + "num_input_tokens_seen": 17266336, + "step": 81815 + }, + { + "epoch": 9.001100110011, + "grad_norm": 0.37087133526802063, + "learning_rate": 3.354644386881584e-05, + "loss": 0.0073, + "num_input_tokens_seen": 17267392, + "step": 81820 + }, + { + "epoch": 9.001650165016502, + "grad_norm": 0.1389748454093933, + "learning_rate": 3.354418836168451e-05, + "loss": 0.0073, + "num_input_tokens_seen": 17268416, + "step": 81825 + }, + { + "epoch": 9.002200220022003, + "grad_norm": 0.0758371576666832, + "learning_rate": 3.3541932775805485e-05, + "loss": 0.0555, + "num_input_tokens_seen": 17269376, + "step": 81830 + }, + { + "epoch": 9.002750275027502, + "grad_norm": 0.2697429955005646, + "learning_rate": 3.3539677111199573e-05, + "loss": 0.0081, + "num_input_tokens_seen": 17270368, + "step": 81835 + }, + { + "epoch": 9.003300330033003, + "grad_norm": 0.05068430304527283, + "learning_rate": 3.353742136788756e-05, + "loss": 0.0409, + "num_input_tokens_seen": 17271392, + "step": 81840 + }, + { + "epoch": 9.003850385038504, + "grad_norm": 0.2905871272087097, + "learning_rate": 3.353516554589022e-05, + "loss": 0.0991, + "num_input_tokens_seen": 17272480, + "step": 81845 + }, + { + "epoch": 9.004400440044005, + "grad_norm": 0.0074938395991921425, + "learning_rate": 3.3532909645228354e-05, + "loss": 0.0031, + "num_input_tokens_seen": 17273600, + "step": 81850 + }, + { + "epoch": 9.004950495049505, + "grad_norm": 0.04074529558420181, + "learning_rate": 3.353065366592276e-05, + "loss": 0.0127, + "num_input_tokens_seen": 17274720, + "step": 81855 + }, + { + "epoch": 9.005500550055006, + "grad_norm": 0.015924612060189247, + "learning_rate": 3.3528397607994226e-05, + "loss": 0.0371, + "num_input_tokens_seen": 17275776, + "step": 81860 + }, + { + "epoch": 9.006050605060507, + "grad_norm": 0.04469268396496773, + "learning_rate": 3.352614147146355e-05, + "loss": 0.0879, + "num_input_tokens_seen": 17276832, + "step": 81865 + }, + { + "epoch": 9.006600660066006, + "grad_norm": 0.05981505289673805, + "learning_rate": 3.3523885256351504e-05, + "loss": 0.0281, + "num_input_tokens_seen": 17277888, + "step": 81870 + }, + { + "epoch": 9.007150715071507, + "grad_norm": 0.01278897374868393, + "learning_rate": 3.35216289626789e-05, + "loss": 0.0295, + "num_input_tokens_seen": 17279040, + "step": 81875 + }, + { + "epoch": 9.007700770077008, + "grad_norm": 0.0048940363340079784, + "learning_rate": 3.351937259046654e-05, + "loss": 0.0573, + "num_input_tokens_seen": 17280096, + "step": 81880 + }, + { + "epoch": 9.008250825082508, + "grad_norm": 0.31201988458633423, + "learning_rate": 3.35171161397352e-05, + "loss": 0.0061, + "num_input_tokens_seen": 17281184, + "step": 81885 + }, + { + "epoch": 9.008800880088009, + "grad_norm": 0.9752101898193359, + "learning_rate": 3.35148596105057e-05, + "loss": 0.0497, + "num_input_tokens_seen": 17282176, + "step": 81890 + }, + { + "epoch": 9.00935093509351, + "grad_norm": 0.09733681380748749, + "learning_rate": 3.351260300279882e-05, + "loss": 0.0111, + "num_input_tokens_seen": 17283232, + "step": 81895 + }, + { + "epoch": 9.009900990099009, + "grad_norm": 0.013950885273516178, + "learning_rate": 3.351034631663536e-05, + "loss": 0.0202, + "num_input_tokens_seen": 17284288, + "step": 81900 + }, + { + "epoch": 9.01045104510451, + "grad_norm": 0.3457321524620056, + "learning_rate": 3.3508089552036114e-05, + "loss": 0.0109, + "num_input_tokens_seen": 17285344, + "step": 81905 + }, + { + "epoch": 9.011001100110011, + "grad_norm": 1.6684330701828003, + "learning_rate": 3.3505832709021906e-05, + "loss": 0.0663, + "num_input_tokens_seen": 17286400, + "step": 81910 + }, + { + "epoch": 9.011551155115512, + "grad_norm": 0.586482584476471, + "learning_rate": 3.3503575787613505e-05, + "loss": 0.0523, + "num_input_tokens_seen": 17287424, + "step": 81915 + }, + { + "epoch": 9.012101210121012, + "grad_norm": 0.01722581684589386, + "learning_rate": 3.3501318787831734e-05, + "loss": 0.0604, + "num_input_tokens_seen": 17288480, + "step": 81920 + }, + { + "epoch": 9.012651265126513, + "grad_norm": 1.6200169324874878, + "learning_rate": 3.349906170969739e-05, + "loss": 0.1174, + "num_input_tokens_seen": 17289536, + "step": 81925 + }, + { + "epoch": 9.013201320132014, + "grad_norm": 0.07638876885175705, + "learning_rate": 3.349680455323127e-05, + "loss": 0.0039, + "num_input_tokens_seen": 17290592, + "step": 81930 + }, + { + "epoch": 9.013751375137513, + "grad_norm": 0.20208455622196198, + "learning_rate": 3.349454731845418e-05, + "loss": 0.0718, + "num_input_tokens_seen": 17291680, + "step": 81935 + }, + { + "epoch": 9.014301430143014, + "grad_norm": 0.09774113446474075, + "learning_rate": 3.3492290005386917e-05, + "loss": 0.0118, + "num_input_tokens_seen": 17292704, + "step": 81940 + }, + { + "epoch": 9.014851485148515, + "grad_norm": 0.02083342708647251, + "learning_rate": 3.3490032614050295e-05, + "loss": 0.0047, + "num_input_tokens_seen": 17293760, + "step": 81945 + }, + { + "epoch": 9.015401540154015, + "grad_norm": 0.9354941844940186, + "learning_rate": 3.3487775144465115e-05, + "loss": 0.1509, + "num_input_tokens_seen": 17294752, + "step": 81950 + }, + { + "epoch": 9.015951595159516, + "grad_norm": 0.228677898645401, + "learning_rate": 3.348551759665219e-05, + "loss": 0.0733, + "num_input_tokens_seen": 17295776, + "step": 81955 + }, + { + "epoch": 9.016501650165017, + "grad_norm": 0.44935673475265503, + "learning_rate": 3.348325997063232e-05, + "loss": 0.0528, + "num_input_tokens_seen": 17296832, + "step": 81960 + }, + { + "epoch": 9.017051705170518, + "grad_norm": 0.20627456903457642, + "learning_rate": 3.348100226642631e-05, + "loss": 0.0569, + "num_input_tokens_seen": 17297952, + "step": 81965 + }, + { + "epoch": 9.017601760176017, + "grad_norm": 0.013721705414354801, + "learning_rate": 3.347874448405498e-05, + "loss": 0.0063, + "num_input_tokens_seen": 17299040, + "step": 81970 + }, + { + "epoch": 9.018151815181518, + "grad_norm": 0.08989851921796799, + "learning_rate": 3.3476486623539115e-05, + "loss": 0.0095, + "num_input_tokens_seen": 17300160, + "step": 81975 + }, + { + "epoch": 9.01870187018702, + "grad_norm": 0.0645793229341507, + "learning_rate": 3.3474228684899554e-05, + "loss": 0.0226, + "num_input_tokens_seen": 17301216, + "step": 81980 + }, + { + "epoch": 9.019251925192519, + "grad_norm": 2.3582417964935303, + "learning_rate": 3.347197066815709e-05, + "loss": 0.0419, + "num_input_tokens_seen": 17302272, + "step": 81985 + }, + { + "epoch": 9.01980198019802, + "grad_norm": 0.14381860196590424, + "learning_rate": 3.3469712573332535e-05, + "loss": 0.0726, + "num_input_tokens_seen": 17303328, + "step": 81990 + }, + { + "epoch": 9.020352035203521, + "grad_norm": 0.07609947770833969, + "learning_rate": 3.346745440044671e-05, + "loss": 0.0191, + "num_input_tokens_seen": 17304416, + "step": 81995 + }, + { + "epoch": 9.02090209020902, + "grad_norm": 0.049778010696172714, + "learning_rate": 3.346519614952041e-05, + "loss": 0.005, + "num_input_tokens_seen": 17305440, + "step": 82000 + }, + { + "epoch": 9.021452145214521, + "grad_norm": 0.18518459796905518, + "learning_rate": 3.3462937820574466e-05, + "loss": 0.0463, + "num_input_tokens_seen": 17306528, + "step": 82005 + }, + { + "epoch": 9.022002200220022, + "grad_norm": 0.03715097904205322, + "learning_rate": 3.346067941362969e-05, + "loss": 0.0047, + "num_input_tokens_seen": 17307616, + "step": 82010 + }, + { + "epoch": 9.022552255225522, + "grad_norm": 0.32321101427078247, + "learning_rate": 3.345842092870688e-05, + "loss": 0.0191, + "num_input_tokens_seen": 17308640, + "step": 82015 + }, + { + "epoch": 9.023102310231023, + "grad_norm": 0.01999463327229023, + "learning_rate": 3.3456162365826865e-05, + "loss": 0.007, + "num_input_tokens_seen": 17309728, + "step": 82020 + }, + { + "epoch": 9.023652365236524, + "grad_norm": 0.012538348324596882, + "learning_rate": 3.345390372501047e-05, + "loss": 0.0949, + "num_input_tokens_seen": 17310752, + "step": 82025 + }, + { + "epoch": 9.024202420242025, + "grad_norm": 0.5907494425773621, + "learning_rate": 3.345164500627849e-05, + "loss": 0.0858, + "num_input_tokens_seen": 17311840, + "step": 82030 + }, + { + "epoch": 9.024752475247524, + "grad_norm": 0.5295901298522949, + "learning_rate": 3.3449386209651754e-05, + "loss": 0.0098, + "num_input_tokens_seen": 17312896, + "step": 82035 + }, + { + "epoch": 9.025302530253025, + "grad_norm": 0.12233420461416245, + "learning_rate": 3.3447127335151084e-05, + "loss": 0.0064, + "num_input_tokens_seen": 17313920, + "step": 82040 + }, + { + "epoch": 9.025852585258527, + "grad_norm": 0.03894643113017082, + "learning_rate": 3.34448683827973e-05, + "loss": 0.0462, + "num_input_tokens_seen": 17314976, + "step": 82045 + }, + { + "epoch": 9.026402640264026, + "grad_norm": 0.007512051612138748, + "learning_rate": 3.3442609352611205e-05, + "loss": 0.0289, + "num_input_tokens_seen": 17316064, + "step": 82050 + }, + { + "epoch": 9.026952695269527, + "grad_norm": 1.0501606464385986, + "learning_rate": 3.344035024461364e-05, + "loss": 0.0182, + "num_input_tokens_seen": 17317152, + "step": 82055 + }, + { + "epoch": 9.027502750275028, + "grad_norm": 0.12393690645694733, + "learning_rate": 3.343809105882541e-05, + "loss": 0.0103, + "num_input_tokens_seen": 17318208, + "step": 82060 + }, + { + "epoch": 9.028052805280527, + "grad_norm": 0.09206175804138184, + "learning_rate": 3.343583179526734e-05, + "loss": 0.0046, + "num_input_tokens_seen": 17319296, + "step": 82065 + }, + { + "epoch": 9.028602860286028, + "grad_norm": 0.11510506272315979, + "learning_rate": 3.343357245396026e-05, + "loss": 0.0524, + "num_input_tokens_seen": 17320320, + "step": 82070 + }, + { + "epoch": 9.02915291529153, + "grad_norm": 0.03071272186934948, + "learning_rate": 3.343131303492499e-05, + "loss": 0.0025, + "num_input_tokens_seen": 17321408, + "step": 82075 + }, + { + "epoch": 9.029702970297029, + "grad_norm": 0.002530602738261223, + "learning_rate": 3.3429053538182366e-05, + "loss": 0.004, + "num_input_tokens_seen": 17322432, + "step": 82080 + }, + { + "epoch": 9.03025302530253, + "grad_norm": 0.009891165420413017, + "learning_rate": 3.342679396375319e-05, + "loss": 0.0027, + "num_input_tokens_seen": 17323424, + "step": 82085 + }, + { + "epoch": 9.030803080308031, + "grad_norm": 0.4241426885128021, + "learning_rate": 3.3424534311658297e-05, + "loss": 0.0052, + "num_input_tokens_seen": 17324512, + "step": 82090 + }, + { + "epoch": 9.031353135313532, + "grad_norm": 0.0162222757935524, + "learning_rate": 3.342227458191851e-05, + "loss": 0.0801, + "num_input_tokens_seen": 17325600, + "step": 82095 + }, + { + "epoch": 9.031903190319031, + "grad_norm": 0.05223049223423004, + "learning_rate": 3.342001477455466e-05, + "loss": 0.0343, + "num_input_tokens_seen": 17326656, + "step": 82100 + }, + { + "epoch": 9.032453245324533, + "grad_norm": 0.313060462474823, + "learning_rate": 3.3417754889587576e-05, + "loss": 0.1319, + "num_input_tokens_seen": 17327776, + "step": 82105 + }, + { + "epoch": 9.033003300330034, + "grad_norm": 0.02430952526628971, + "learning_rate": 3.3415494927038085e-05, + "loss": 0.0037, + "num_input_tokens_seen": 17328864, + "step": 82110 + }, + { + "epoch": 9.033553355335533, + "grad_norm": 0.2634817063808441, + "learning_rate": 3.341323488692702e-05, + "loss": 0.0488, + "num_input_tokens_seen": 17329920, + "step": 82115 + }, + { + "epoch": 9.034103410341034, + "grad_norm": 0.06700845807790756, + "learning_rate": 3.3410974769275205e-05, + "loss": 0.0411, + "num_input_tokens_seen": 17330912, + "step": 82120 + }, + { + "epoch": 9.034653465346535, + "grad_norm": 0.11086703091859818, + "learning_rate": 3.340871457410347e-05, + "loss": 0.0405, + "num_input_tokens_seen": 17331968, + "step": 82125 + }, + { + "epoch": 9.035203520352034, + "grad_norm": 0.01160870585590601, + "learning_rate": 3.3406454301432646e-05, + "loss": 0.0682, + "num_input_tokens_seen": 17332960, + "step": 82130 + }, + { + "epoch": 9.035753575357536, + "grad_norm": 0.5860046148300171, + "learning_rate": 3.340419395128356e-05, + "loss": 0.0186, + "num_input_tokens_seen": 17333984, + "step": 82135 + }, + { + "epoch": 9.036303630363037, + "grad_norm": 0.017607729882001877, + "learning_rate": 3.340193352367706e-05, + "loss": 0.0876, + "num_input_tokens_seen": 17335104, + "step": 82140 + }, + { + "epoch": 9.036853685368538, + "grad_norm": 0.7013111710548401, + "learning_rate": 3.339967301863398e-05, + "loss": 0.011, + "num_input_tokens_seen": 17336096, + "step": 82145 + }, + { + "epoch": 9.037403740374037, + "grad_norm": 1.8766005039215088, + "learning_rate": 3.339741243617512e-05, + "loss": 0.0597, + "num_input_tokens_seen": 17337152, + "step": 82150 + }, + { + "epoch": 9.037953795379538, + "grad_norm": 0.029902709648013115, + "learning_rate": 3.339515177632136e-05, + "loss": 0.0275, + "num_input_tokens_seen": 17338240, + "step": 82155 + }, + { + "epoch": 9.03850385038504, + "grad_norm": 0.18253354728221893, + "learning_rate": 3.3392891039093503e-05, + "loss": 0.0079, + "num_input_tokens_seen": 17339264, + "step": 82160 + }, + { + "epoch": 9.039053905390539, + "grad_norm": 0.563772439956665, + "learning_rate": 3.33906302245124e-05, + "loss": 0.0146, + "num_input_tokens_seen": 17340320, + "step": 82165 + }, + { + "epoch": 9.03960396039604, + "grad_norm": 0.33473241329193115, + "learning_rate": 3.338836933259889e-05, + "loss": 0.0181, + "num_input_tokens_seen": 17341376, + "step": 82170 + }, + { + "epoch": 9.04015401540154, + "grad_norm": 1.0316165685653687, + "learning_rate": 3.33861083633738e-05, + "loss": 0.0609, + "num_input_tokens_seen": 17342496, + "step": 82175 + }, + { + "epoch": 9.04070407040704, + "grad_norm": 0.045368846505880356, + "learning_rate": 3.338384731685796e-05, + "loss": 0.0061, + "num_input_tokens_seen": 17343584, + "step": 82180 + }, + { + "epoch": 9.041254125412541, + "grad_norm": 0.3516688346862793, + "learning_rate": 3.3381586193072244e-05, + "loss": 0.0111, + "num_input_tokens_seen": 17344672, + "step": 82185 + }, + { + "epoch": 9.041804180418042, + "grad_norm": 0.07751023024320602, + "learning_rate": 3.337932499203746e-05, + "loss": 0.0027, + "num_input_tokens_seen": 17345792, + "step": 82190 + }, + { + "epoch": 9.042354235423542, + "grad_norm": 0.39380937814712524, + "learning_rate": 3.337706371377446e-05, + "loss": 0.0083, + "num_input_tokens_seen": 17346784, + "step": 82195 + }, + { + "epoch": 9.042904290429043, + "grad_norm": 0.11000064015388489, + "learning_rate": 3.337480235830408e-05, + "loss": 0.0125, + "num_input_tokens_seen": 17347808, + "step": 82200 + }, + { + "epoch": 9.043454345434544, + "grad_norm": 0.07563070207834244, + "learning_rate": 3.3372540925647175e-05, + "loss": 0.0332, + "num_input_tokens_seen": 17348896, + "step": 82205 + }, + { + "epoch": 9.044004400440045, + "grad_norm": 0.47461336851119995, + "learning_rate": 3.3370279415824565e-05, + "loss": 0.0715, + "num_input_tokens_seen": 17349952, + "step": 82210 + }, + { + "epoch": 9.044554455445544, + "grad_norm": 0.03221384435892105, + "learning_rate": 3.336801782885712e-05, + "loss": 0.0018, + "num_input_tokens_seen": 17351008, + "step": 82215 + }, + { + "epoch": 9.045104510451045, + "grad_norm": 2.2913095951080322, + "learning_rate": 3.336575616476565e-05, + "loss": 0.0459, + "num_input_tokens_seen": 17352032, + "step": 82220 + }, + { + "epoch": 9.045654565456546, + "grad_norm": 0.9099123477935791, + "learning_rate": 3.336349442357103e-05, + "loss": 0.0437, + "num_input_tokens_seen": 17353088, + "step": 82225 + }, + { + "epoch": 9.046204620462046, + "grad_norm": 0.01150805875658989, + "learning_rate": 3.3361232605294104e-05, + "loss": 0.0197, + "num_input_tokens_seen": 17354112, + "step": 82230 + }, + { + "epoch": 9.046754675467547, + "grad_norm": 2.1135103702545166, + "learning_rate": 3.33589707099557e-05, + "loss": 0.0554, + "num_input_tokens_seen": 17355232, + "step": 82235 + }, + { + "epoch": 9.047304730473048, + "grad_norm": 0.2909627854824066, + "learning_rate": 3.335670873757668e-05, + "loss": 0.032, + "num_input_tokens_seen": 17356320, + "step": 82240 + }, + { + "epoch": 9.047854785478547, + "grad_norm": 0.005850657820701599, + "learning_rate": 3.335444668817788e-05, + "loss": 0.0227, + "num_input_tokens_seen": 17357344, + "step": 82245 + }, + { + "epoch": 9.048404840484048, + "grad_norm": 0.9700080752372742, + "learning_rate": 3.3352184561780154e-05, + "loss": 0.0797, + "num_input_tokens_seen": 17358400, + "step": 82250 + }, + { + "epoch": 9.04895489548955, + "grad_norm": 0.12738008797168732, + "learning_rate": 3.334992235840435e-05, + "loss": 0.0836, + "num_input_tokens_seen": 17359456, + "step": 82255 + }, + { + "epoch": 9.049504950495049, + "grad_norm": 0.020227674394845963, + "learning_rate": 3.334766007807132e-05, + "loss": 0.0474, + "num_input_tokens_seen": 17360576, + "step": 82260 + }, + { + "epoch": 9.05005500550055, + "grad_norm": 0.008675554767251015, + "learning_rate": 3.334539772080191e-05, + "loss": 0.0099, + "num_input_tokens_seen": 17361632, + "step": 82265 + }, + { + "epoch": 9.05060506050605, + "grad_norm": 0.4931621849536896, + "learning_rate": 3.334313528661698e-05, + "loss": 0.0122, + "num_input_tokens_seen": 17362720, + "step": 82270 + }, + { + "epoch": 9.051155115511552, + "grad_norm": 0.007865473628044128, + "learning_rate": 3.334087277553737e-05, + "loss": 0.0304, + "num_input_tokens_seen": 17363744, + "step": 82275 + }, + { + "epoch": 9.051705170517051, + "grad_norm": 0.013694512657821178, + "learning_rate": 3.3338610187583944e-05, + "loss": 0.1302, + "num_input_tokens_seen": 17364800, + "step": 82280 + }, + { + "epoch": 9.052255225522552, + "grad_norm": 0.026676258072257042, + "learning_rate": 3.3336347522777544e-05, + "loss": 0.0137, + "num_input_tokens_seen": 17365856, + "step": 82285 + }, + { + "epoch": 9.052805280528053, + "grad_norm": 0.1279408484697342, + "learning_rate": 3.3334084781139035e-05, + "loss": 0.0129, + "num_input_tokens_seen": 17366848, + "step": 82290 + }, + { + "epoch": 9.053355335533553, + "grad_norm": 0.010673115029931068, + "learning_rate": 3.333182196268926e-05, + "loss": 0.0057, + "num_input_tokens_seen": 17367904, + "step": 82295 + }, + { + "epoch": 9.053905390539054, + "grad_norm": 0.9698657393455505, + "learning_rate": 3.332955906744908e-05, + "loss": 0.0285, + "num_input_tokens_seen": 17368992, + "step": 82300 + }, + { + "epoch": 9.054455445544555, + "grad_norm": 0.017782021313905716, + "learning_rate": 3.332729609543935e-05, + "loss": 0.0034, + "num_input_tokens_seen": 17370080, + "step": 82305 + }, + { + "epoch": 9.055005500550054, + "grad_norm": 0.032889533787965775, + "learning_rate": 3.332503304668093e-05, + "loss": 0.0747, + "num_input_tokens_seen": 17371104, + "step": 82310 + }, + { + "epoch": 9.055555555555555, + "grad_norm": 0.11756368726491928, + "learning_rate": 3.3322769921194674e-05, + "loss": 0.0056, + "num_input_tokens_seen": 17372192, + "step": 82315 + }, + { + "epoch": 9.056105610561056, + "grad_norm": 0.129241943359375, + "learning_rate": 3.332050671900144e-05, + "loss": 0.0063, + "num_input_tokens_seen": 17373248, + "step": 82320 + }, + { + "epoch": 9.056655665566556, + "grad_norm": 0.38709914684295654, + "learning_rate": 3.331824344012209e-05, + "loss": 0.0103, + "num_input_tokens_seen": 17374336, + "step": 82325 + }, + { + "epoch": 9.057205720572057, + "grad_norm": 0.5485463738441467, + "learning_rate": 3.331598008457748e-05, + "loss": 0.0681, + "num_input_tokens_seen": 17375360, + "step": 82330 + }, + { + "epoch": 9.057755775577558, + "grad_norm": 0.0384538359940052, + "learning_rate": 3.3313716652388486e-05, + "loss": 0.0329, + "num_input_tokens_seen": 17376416, + "step": 82335 + }, + { + "epoch": 9.058305830583059, + "grad_norm": 0.034126196056604385, + "learning_rate": 3.331145314357593e-05, + "loss": 0.0138, + "num_input_tokens_seen": 17377440, + "step": 82340 + }, + { + "epoch": 9.058855885588558, + "grad_norm": 0.008888364769518375, + "learning_rate": 3.3309189558160715e-05, + "loss": 0.0157, + "num_input_tokens_seen": 17378432, + "step": 82345 + }, + { + "epoch": 9.05940594059406, + "grad_norm": 0.006920452229678631, + "learning_rate": 3.3306925896163683e-05, + "loss": 0.0533, + "num_input_tokens_seen": 17379488, + "step": 82350 + }, + { + "epoch": 9.05995599559956, + "grad_norm": 0.308767169713974, + "learning_rate": 3.3304662157605695e-05, + "loss": 0.0071, + "num_input_tokens_seen": 17380576, + "step": 82355 + }, + { + "epoch": 9.06050605060506, + "grad_norm": 0.0057795289903879166, + "learning_rate": 3.3302398342507634e-05, + "loss": 0.0666, + "num_input_tokens_seen": 17381664, + "step": 82360 + }, + { + "epoch": 9.061056105610561, + "grad_norm": 0.060902368277311325, + "learning_rate": 3.3300134450890345e-05, + "loss": 0.0872, + "num_input_tokens_seen": 17382784, + "step": 82365 + }, + { + "epoch": 9.061606160616062, + "grad_norm": 0.053681451827287674, + "learning_rate": 3.32978704827747e-05, + "loss": 0.0041, + "num_input_tokens_seen": 17383776, + "step": 82370 + }, + { + "epoch": 9.062156215621561, + "grad_norm": 0.013414602726697922, + "learning_rate": 3.329560643818156e-05, + "loss": 0.0339, + "num_input_tokens_seen": 17384832, + "step": 82375 + }, + { + "epoch": 9.062706270627062, + "grad_norm": 0.0032120593823492527, + "learning_rate": 3.32933423171318e-05, + "loss": 0.0232, + "num_input_tokens_seen": 17385920, + "step": 82380 + }, + { + "epoch": 9.063256325632564, + "grad_norm": 0.045598238706588745, + "learning_rate": 3.329107811964629e-05, + "loss": 0.015, + "num_input_tokens_seen": 17386976, + "step": 82385 + }, + { + "epoch": 9.063806380638065, + "grad_norm": 0.01204645074903965, + "learning_rate": 3.328881384574588e-05, + "loss": 0.0131, + "num_input_tokens_seen": 17388064, + "step": 82390 + }, + { + "epoch": 9.064356435643564, + "grad_norm": 0.4450951814651489, + "learning_rate": 3.328654949545146e-05, + "loss": 0.0397, + "num_input_tokens_seen": 17389056, + "step": 82395 + }, + { + "epoch": 9.064906490649065, + "grad_norm": 0.16339775919914246, + "learning_rate": 3.3284285068783874e-05, + "loss": 0.0082, + "num_input_tokens_seen": 17390112, + "step": 82400 + }, + { + "epoch": 9.065456545654566, + "grad_norm": 0.06721407175064087, + "learning_rate": 3.328202056576403e-05, + "loss": 0.0498, + "num_input_tokens_seen": 17391168, + "step": 82405 + }, + { + "epoch": 9.066006600660065, + "grad_norm": 0.03446102887392044, + "learning_rate": 3.327975598641275e-05, + "loss": 0.0016, + "num_input_tokens_seen": 17392256, + "step": 82410 + }, + { + "epoch": 9.066556655665567, + "grad_norm": 0.2691519856452942, + "learning_rate": 3.327749133075095e-05, + "loss": 0.0635, + "num_input_tokens_seen": 17393312, + "step": 82415 + }, + { + "epoch": 9.067106710671068, + "grad_norm": 0.009099757298827171, + "learning_rate": 3.327522659879948e-05, + "loss": 0.0021, + "num_input_tokens_seen": 17394400, + "step": 82420 + }, + { + "epoch": 9.067656765676567, + "grad_norm": 1.318840742111206, + "learning_rate": 3.327296179057922e-05, + "loss": 0.056, + "num_input_tokens_seen": 17395488, + "step": 82425 + }, + { + "epoch": 9.068206820682068, + "grad_norm": 0.15593795478343964, + "learning_rate": 3.3270696906111045e-05, + "loss": 0.0094, + "num_input_tokens_seen": 17396544, + "step": 82430 + }, + { + "epoch": 9.06875687568757, + "grad_norm": 0.2674836814403534, + "learning_rate": 3.326843194541582e-05, + "loss": 0.0715, + "num_input_tokens_seen": 17397664, + "step": 82435 + }, + { + "epoch": 9.069306930693068, + "grad_norm": 0.12751466035842896, + "learning_rate": 3.326616690851442e-05, + "loss": 0.01, + "num_input_tokens_seen": 17398752, + "step": 82440 + }, + { + "epoch": 9.06985698569857, + "grad_norm": 0.8709967732429504, + "learning_rate": 3.3263901795427724e-05, + "loss": 0.0917, + "num_input_tokens_seen": 17399808, + "step": 82445 + }, + { + "epoch": 9.07040704070407, + "grad_norm": 0.07526807487010956, + "learning_rate": 3.326163660617662e-05, + "loss": 0.0822, + "num_input_tokens_seen": 17400832, + "step": 82450 + }, + { + "epoch": 9.070957095709572, + "grad_norm": 0.017195066437125206, + "learning_rate": 3.325937134078198e-05, + "loss": 0.0074, + "num_input_tokens_seen": 17401888, + "step": 82455 + }, + { + "epoch": 9.071507150715071, + "grad_norm": 0.9767832159996033, + "learning_rate": 3.325710599926467e-05, + "loss": 0.0577, + "num_input_tokens_seen": 17402944, + "step": 82460 + }, + { + "epoch": 9.072057205720572, + "grad_norm": 0.045614153146743774, + "learning_rate": 3.325484058164557e-05, + "loss": 0.0239, + "num_input_tokens_seen": 17404032, + "step": 82465 + }, + { + "epoch": 9.072607260726073, + "grad_norm": 0.09904016554355621, + "learning_rate": 3.325257508794557e-05, + "loss": 0.0237, + "num_input_tokens_seen": 17405088, + "step": 82470 + }, + { + "epoch": 9.073157315731573, + "grad_norm": 0.013496318832039833, + "learning_rate": 3.325030951818555e-05, + "loss": 0.0077, + "num_input_tokens_seen": 17406080, + "step": 82475 + }, + { + "epoch": 9.073707370737074, + "grad_norm": 0.025144852697849274, + "learning_rate": 3.3248043872386393e-05, + "loss": 0.0025, + "num_input_tokens_seen": 17407168, + "step": 82480 + }, + { + "epoch": 9.074257425742575, + "grad_norm": 0.029842855408787727, + "learning_rate": 3.3245778150568955e-05, + "loss": 0.0051, + "num_input_tokens_seen": 17408256, + "step": 82485 + }, + { + "epoch": 9.074807480748074, + "grad_norm": 0.06539254635572433, + "learning_rate": 3.324351235275415e-05, + "loss": 0.0039, + "num_input_tokens_seen": 17409376, + "step": 82490 + }, + { + "epoch": 9.075357535753575, + "grad_norm": 0.1457553654909134, + "learning_rate": 3.324124647896284e-05, + "loss": 0.0857, + "num_input_tokens_seen": 17410432, + "step": 82495 + }, + { + "epoch": 9.075907590759076, + "grad_norm": 1.0819952487945557, + "learning_rate": 3.3238980529215916e-05, + "loss": 0.0624, + "num_input_tokens_seen": 17411520, + "step": 82500 + }, + { + "epoch": 9.076457645764576, + "grad_norm": 0.04933995380997658, + "learning_rate": 3.323671450353427e-05, + "loss": 0.0071, + "num_input_tokens_seen": 17412544, + "step": 82505 + }, + { + "epoch": 9.077007700770077, + "grad_norm": 0.7153427004814148, + "learning_rate": 3.323444840193877e-05, + "loss": 0.2027, + "num_input_tokens_seen": 17413600, + "step": 82510 + }, + { + "epoch": 9.077557755775578, + "grad_norm": 0.4074699878692627, + "learning_rate": 3.3232182224450315e-05, + "loss": 0.0219, + "num_input_tokens_seen": 17414656, + "step": 82515 + }, + { + "epoch": 9.078107810781079, + "grad_norm": 1.1394720077514648, + "learning_rate": 3.322991597108978e-05, + "loss": 0.1287, + "num_input_tokens_seen": 17415648, + "step": 82520 + }, + { + "epoch": 9.078657865786578, + "grad_norm": 1.1183726787567139, + "learning_rate": 3.3227649641878065e-05, + "loss": 0.0513, + "num_input_tokens_seen": 17416736, + "step": 82525 + }, + { + "epoch": 9.07920792079208, + "grad_norm": 0.016869282349944115, + "learning_rate": 3.322538323683605e-05, + "loss": 0.0404, + "num_input_tokens_seen": 17417760, + "step": 82530 + }, + { + "epoch": 9.07975797579758, + "grad_norm": 1.2679682970046997, + "learning_rate": 3.322311675598463e-05, + "loss": 0.1705, + "num_input_tokens_seen": 17418848, + "step": 82535 + }, + { + "epoch": 9.08030803080308, + "grad_norm": 0.20156416296958923, + "learning_rate": 3.322085019934468e-05, + "loss": 0.003, + "num_input_tokens_seen": 17419936, + "step": 82540 + }, + { + "epoch": 9.08085808580858, + "grad_norm": 0.017769303172826767, + "learning_rate": 3.32185835669371e-05, + "loss": 0.0421, + "num_input_tokens_seen": 17420992, + "step": 82545 + }, + { + "epoch": 9.081408140814082, + "grad_norm": 0.1693076342344284, + "learning_rate": 3.3216316858782785e-05, + "loss": 0.0074, + "num_input_tokens_seen": 17422048, + "step": 82550 + }, + { + "epoch": 9.081958195819581, + "grad_norm": 0.05194106325507164, + "learning_rate": 3.3214050074902616e-05, + "loss": 0.0202, + "num_input_tokens_seen": 17423136, + "step": 82555 + }, + { + "epoch": 9.082508250825082, + "grad_norm": 1.1771587133407593, + "learning_rate": 3.321178321531749e-05, + "loss": 0.1641, + "num_input_tokens_seen": 17424192, + "step": 82560 + }, + { + "epoch": 9.083058305830583, + "grad_norm": 0.6610504388809204, + "learning_rate": 3.3209516280048294e-05, + "loss": 0.0369, + "num_input_tokens_seen": 17425184, + "step": 82565 + }, + { + "epoch": 9.083608360836084, + "grad_norm": 0.03319437801837921, + "learning_rate": 3.320724926911593e-05, + "loss": 0.002, + "num_input_tokens_seen": 17426272, + "step": 82570 + }, + { + "epoch": 9.084158415841584, + "grad_norm": 0.02944183722138405, + "learning_rate": 3.320498218254129e-05, + "loss": 0.063, + "num_input_tokens_seen": 17427360, + "step": 82575 + }, + { + "epoch": 9.084708470847085, + "grad_norm": 0.13814391195774078, + "learning_rate": 3.320271502034526e-05, + "loss": 0.0308, + "num_input_tokens_seen": 17428384, + "step": 82580 + }, + { + "epoch": 9.085258525852586, + "grad_norm": 0.030070362612605095, + "learning_rate": 3.320044778254874e-05, + "loss": 0.0093, + "num_input_tokens_seen": 17429472, + "step": 82585 + }, + { + "epoch": 9.085808580858085, + "grad_norm": 0.30774083733558655, + "learning_rate": 3.3198180469172636e-05, + "loss": 0.0607, + "num_input_tokens_seen": 17430560, + "step": 82590 + }, + { + "epoch": 9.086358635863586, + "grad_norm": 0.036431264132261276, + "learning_rate": 3.319591308023783e-05, + "loss": 0.0108, + "num_input_tokens_seen": 17431648, + "step": 82595 + }, + { + "epoch": 9.086908690869087, + "grad_norm": 0.051687248051166534, + "learning_rate": 3.3193645615765234e-05, + "loss": 0.0782, + "num_input_tokens_seen": 17432736, + "step": 82600 + }, + { + "epoch": 9.087458745874587, + "grad_norm": 0.08941815048456192, + "learning_rate": 3.319137807577573e-05, + "loss": 0.0121, + "num_input_tokens_seen": 17433760, + "step": 82605 + }, + { + "epoch": 9.088008800880088, + "grad_norm": 0.06927099823951721, + "learning_rate": 3.318911046029023e-05, + "loss": 0.0105, + "num_input_tokens_seen": 17434752, + "step": 82610 + }, + { + "epoch": 9.088558855885589, + "grad_norm": 0.01532810926437378, + "learning_rate": 3.318684276932962e-05, + "loss": 0.0035, + "num_input_tokens_seen": 17435776, + "step": 82615 + }, + { + "epoch": 9.089108910891088, + "grad_norm": 0.02586839720606804, + "learning_rate": 3.318457500291482e-05, + "loss": 0.0282, + "num_input_tokens_seen": 17436864, + "step": 82620 + }, + { + "epoch": 9.08965896589659, + "grad_norm": 0.030112795531749725, + "learning_rate": 3.318230716106672e-05, + "loss": 0.0052, + "num_input_tokens_seen": 17438016, + "step": 82625 + }, + { + "epoch": 9.09020902090209, + "grad_norm": 1.0355020761489868, + "learning_rate": 3.318003924380621e-05, + "loss": 0.1, + "num_input_tokens_seen": 17438976, + "step": 82630 + }, + { + "epoch": 9.090759075907592, + "grad_norm": 0.07015194743871689, + "learning_rate": 3.3177771251154214e-05, + "loss": 0.0179, + "num_input_tokens_seen": 17439968, + "step": 82635 + }, + { + "epoch": 9.091309130913091, + "grad_norm": 0.0498969629406929, + "learning_rate": 3.3175503183131616e-05, + "loss": 0.1411, + "num_input_tokens_seen": 17440960, + "step": 82640 + }, + { + "epoch": 9.091859185918592, + "grad_norm": 0.04473031684756279, + "learning_rate": 3.317323503975933e-05, + "loss": 0.054, + "num_input_tokens_seen": 17442080, + "step": 82645 + }, + { + "epoch": 9.092409240924093, + "grad_norm": 0.027690257877111435, + "learning_rate": 3.317096682105827e-05, + "loss": 0.0072, + "num_input_tokens_seen": 17443136, + "step": 82650 + }, + { + "epoch": 9.092959295929592, + "grad_norm": 1.526234745979309, + "learning_rate": 3.316869852704931e-05, + "loss": 0.076, + "num_input_tokens_seen": 17444128, + "step": 82655 + }, + { + "epoch": 9.093509350935093, + "grad_norm": 0.015917310491204262, + "learning_rate": 3.316643015775338e-05, + "loss": 0.0044, + "num_input_tokens_seen": 17445184, + "step": 82660 + }, + { + "epoch": 9.094059405940595, + "grad_norm": 0.10423168540000916, + "learning_rate": 3.316416171319139e-05, + "loss": 0.0963, + "num_input_tokens_seen": 17446304, + "step": 82665 + }, + { + "epoch": 9.094609460946094, + "grad_norm": 0.01068944577127695, + "learning_rate": 3.316189319338424e-05, + "loss": 0.0156, + "num_input_tokens_seen": 17447296, + "step": 82670 + }, + { + "epoch": 9.095159515951595, + "grad_norm": 0.0902625173330307, + "learning_rate": 3.3159624598352835e-05, + "loss": 0.0221, + "num_input_tokens_seen": 17448352, + "step": 82675 + }, + { + "epoch": 9.095709570957096, + "grad_norm": 0.13428322970867157, + "learning_rate": 3.315735592811808e-05, + "loss": 0.0057, + "num_input_tokens_seen": 17449408, + "step": 82680 + }, + { + "epoch": 9.096259625962595, + "grad_norm": 0.12555952370166779, + "learning_rate": 3.3155087182700895e-05, + "loss": 0.0093, + "num_input_tokens_seen": 17450496, + "step": 82685 + }, + { + "epoch": 9.096809680968097, + "grad_norm": 1.3570301532745361, + "learning_rate": 3.315281836212218e-05, + "loss": 0.0558, + "num_input_tokens_seen": 17451488, + "step": 82690 + }, + { + "epoch": 9.097359735973598, + "grad_norm": 0.11078417301177979, + "learning_rate": 3.315054946640286e-05, + "loss": 0.0062, + "num_input_tokens_seen": 17452448, + "step": 82695 + }, + { + "epoch": 9.097909790979099, + "grad_norm": 0.027589401230216026, + "learning_rate": 3.314828049556382e-05, + "loss": 0.0231, + "num_input_tokens_seen": 17453472, + "step": 82700 + }, + { + "epoch": 9.098459845984598, + "grad_norm": 0.03673780709505081, + "learning_rate": 3.3146011449626e-05, + "loss": 0.0267, + "num_input_tokens_seen": 17454464, + "step": 82705 + }, + { + "epoch": 9.099009900990099, + "grad_norm": 0.9218559861183167, + "learning_rate": 3.31437423286103e-05, + "loss": 0.015, + "num_input_tokens_seen": 17455552, + "step": 82710 + }, + { + "epoch": 9.0995599559956, + "grad_norm": 0.042908426374197006, + "learning_rate": 3.314147313253764e-05, + "loss": 0.0085, + "num_input_tokens_seen": 17456576, + "step": 82715 + }, + { + "epoch": 9.1001100110011, + "grad_norm": 0.005179561674594879, + "learning_rate": 3.313920386142892e-05, + "loss": 0.0831, + "num_input_tokens_seen": 17457632, + "step": 82720 + }, + { + "epoch": 9.1006600660066, + "grad_norm": 0.027108611539006233, + "learning_rate": 3.313693451530507e-05, + "loss": 0.0476, + "num_input_tokens_seen": 17458624, + "step": 82725 + }, + { + "epoch": 9.101210121012102, + "grad_norm": 0.9439674615859985, + "learning_rate": 3.313466509418699e-05, + "loss": 0.0689, + "num_input_tokens_seen": 17459712, + "step": 82730 + }, + { + "epoch": 9.101760176017601, + "grad_norm": 0.4216955006122589, + "learning_rate": 3.313239559809561e-05, + "loss": 0.0121, + "num_input_tokens_seen": 17460768, + "step": 82735 + }, + { + "epoch": 9.102310231023102, + "grad_norm": 0.016673803329467773, + "learning_rate": 3.313012602705185e-05, + "loss": 0.0085, + "num_input_tokens_seen": 17461792, + "step": 82740 + }, + { + "epoch": 9.102860286028603, + "grad_norm": 0.02319304645061493, + "learning_rate": 3.3127856381076604e-05, + "loss": 0.0318, + "num_input_tokens_seen": 17462848, + "step": 82745 + }, + { + "epoch": 9.103410341034103, + "grad_norm": 0.6658835411071777, + "learning_rate": 3.312558666019082e-05, + "loss": 0.0175, + "num_input_tokens_seen": 17463872, + "step": 82750 + }, + { + "epoch": 9.103960396039604, + "grad_norm": 0.014717929065227509, + "learning_rate": 3.31233168644154e-05, + "loss": 0.0232, + "num_input_tokens_seen": 17464992, + "step": 82755 + }, + { + "epoch": 9.104510451045105, + "grad_norm": 0.34417372941970825, + "learning_rate": 3.312104699377126e-05, + "loss": 0.0108, + "num_input_tokens_seen": 17466016, + "step": 82760 + }, + { + "epoch": 9.105060506050606, + "grad_norm": 0.5453968048095703, + "learning_rate": 3.311877704827933e-05, + "loss": 0.1262, + "num_input_tokens_seen": 17467104, + "step": 82765 + }, + { + "epoch": 9.105610561056105, + "grad_norm": 0.019855298101902008, + "learning_rate": 3.311650702796053e-05, + "loss": 0.0043, + "num_input_tokens_seen": 17468192, + "step": 82770 + }, + { + "epoch": 9.106160616061606, + "grad_norm": 0.06111036241054535, + "learning_rate": 3.311423693283577e-05, + "loss": 0.0086, + "num_input_tokens_seen": 17469184, + "step": 82775 + }, + { + "epoch": 9.106710671067107, + "grad_norm": 0.5862002372741699, + "learning_rate": 3.3111966762925995e-05, + "loss": 0.0822, + "num_input_tokens_seen": 17470272, + "step": 82780 + }, + { + "epoch": 9.107260726072607, + "grad_norm": 0.5899654030799866, + "learning_rate": 3.310969651825211e-05, + "loss": 0.0204, + "num_input_tokens_seen": 17471392, + "step": 82785 + }, + { + "epoch": 9.107810781078108, + "grad_norm": 0.010363166220486164, + "learning_rate": 3.310742619883504e-05, + "loss": 0.0258, + "num_input_tokens_seen": 17472448, + "step": 82790 + }, + { + "epoch": 9.108360836083609, + "grad_norm": 0.03161235153675079, + "learning_rate": 3.310515580469572e-05, + "loss": 0.0237, + "num_input_tokens_seen": 17473440, + "step": 82795 + }, + { + "epoch": 9.108910891089108, + "grad_norm": 0.2900920510292053, + "learning_rate": 3.310288533585506e-05, + "loss": 0.0103, + "num_input_tokens_seen": 17474528, + "step": 82800 + }, + { + "epoch": 9.10946094609461, + "grad_norm": 0.522786021232605, + "learning_rate": 3.3100614792334e-05, + "loss": 0.0114, + "num_input_tokens_seen": 17475616, + "step": 82805 + }, + { + "epoch": 9.11001100110011, + "grad_norm": 0.41350531578063965, + "learning_rate": 3.309834417415346e-05, + "loss": 0.0798, + "num_input_tokens_seen": 17476704, + "step": 82810 + }, + { + "epoch": 9.110561056105611, + "grad_norm": 0.05092993378639221, + "learning_rate": 3.309607348133437e-05, + "loss": 0.019, + "num_input_tokens_seen": 17477760, + "step": 82815 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 0.013380957767367363, + "learning_rate": 3.309380271389764e-05, + "loss": 0.0021, + "num_input_tokens_seen": 17478752, + "step": 82820 + }, + { + "epoch": 9.111661166116612, + "grad_norm": 0.016000570729374886, + "learning_rate": 3.309153187186423e-05, + "loss": 0.0409, + "num_input_tokens_seen": 17479776, + "step": 82825 + }, + { + "epoch": 9.112211221122113, + "grad_norm": 0.09606754034757614, + "learning_rate": 3.308926095525505e-05, + "loss": 0.0028, + "num_input_tokens_seen": 17480832, + "step": 82830 + }, + { + "epoch": 9.112761276127612, + "grad_norm": 0.9603784084320068, + "learning_rate": 3.308698996409103e-05, + "loss": 0.0376, + "num_input_tokens_seen": 17481952, + "step": 82835 + }, + { + "epoch": 9.113311331133113, + "grad_norm": 1.6121610403060913, + "learning_rate": 3.3084718898393116e-05, + "loss": 0.1533, + "num_input_tokens_seen": 17482976, + "step": 82840 + }, + { + "epoch": 9.113861386138614, + "grad_norm": 0.03047752194106579, + "learning_rate": 3.308244775818221e-05, + "loss": 0.0819, + "num_input_tokens_seen": 17484032, + "step": 82845 + }, + { + "epoch": 9.114411441144114, + "grad_norm": 0.9965717792510986, + "learning_rate": 3.3080176543479266e-05, + "loss": 0.0416, + "num_input_tokens_seen": 17485056, + "step": 82850 + }, + { + "epoch": 9.114961496149615, + "grad_norm": 0.03769354149699211, + "learning_rate": 3.307790525430522e-05, + "loss": 0.019, + "num_input_tokens_seen": 17486048, + "step": 82855 + }, + { + "epoch": 9.115511551155116, + "grad_norm": 0.017251048237085342, + "learning_rate": 3.307563389068099e-05, + "loss": 0.0057, + "num_input_tokens_seen": 17487168, + "step": 82860 + }, + { + "epoch": 9.116061606160615, + "grad_norm": 0.09220254421234131, + "learning_rate": 3.307336245262752e-05, + "loss": 0.0253, + "num_input_tokens_seen": 17488224, + "step": 82865 + }, + { + "epoch": 9.116611661166116, + "grad_norm": 1.3291093111038208, + "learning_rate": 3.307109094016575e-05, + "loss": 0.0561, + "num_input_tokens_seen": 17489312, + "step": 82870 + }, + { + "epoch": 9.117161716171617, + "grad_norm": 1.4651986360549927, + "learning_rate": 3.30688193533166e-05, + "loss": 0.0525, + "num_input_tokens_seen": 17490400, + "step": 82875 + }, + { + "epoch": 9.117711771177119, + "grad_norm": 0.024247245863080025, + "learning_rate": 3.306654769210101e-05, + "loss": 0.0038, + "num_input_tokens_seen": 17491456, + "step": 82880 + }, + { + "epoch": 9.118261826182618, + "grad_norm": 0.033206354826688766, + "learning_rate": 3.306427595653993e-05, + "loss": 0.015, + "num_input_tokens_seen": 17492480, + "step": 82885 + }, + { + "epoch": 9.118811881188119, + "grad_norm": 0.04505898803472519, + "learning_rate": 3.306200414665428e-05, + "loss": 0.0188, + "num_input_tokens_seen": 17493536, + "step": 82890 + }, + { + "epoch": 9.11936193619362, + "grad_norm": 0.0028582881204783916, + "learning_rate": 3.3059732262465e-05, + "loss": 0.001, + "num_input_tokens_seen": 17494656, + "step": 82895 + }, + { + "epoch": 9.11991199119912, + "grad_norm": 0.22008302807807922, + "learning_rate": 3.3057460303993054e-05, + "loss": 0.0391, + "num_input_tokens_seen": 17495744, + "step": 82900 + }, + { + "epoch": 9.12046204620462, + "grad_norm": 0.042256876826286316, + "learning_rate": 3.305518827125935e-05, + "loss": 0.0992, + "num_input_tokens_seen": 17496768, + "step": 82905 + }, + { + "epoch": 9.121012101210122, + "grad_norm": 0.4302562475204468, + "learning_rate": 3.3052916164284846e-05, + "loss": 0.0207, + "num_input_tokens_seen": 17497856, + "step": 82910 + }, + { + "epoch": 9.12156215621562, + "grad_norm": 0.019460683688521385, + "learning_rate": 3.305064398309048e-05, + "loss": 0.0118, + "num_input_tokens_seen": 17498880, + "step": 82915 + }, + { + "epoch": 9.122112211221122, + "grad_norm": 0.380584180355072, + "learning_rate": 3.304837172769718e-05, + "loss": 0.0802, + "num_input_tokens_seen": 17500000, + "step": 82920 + }, + { + "epoch": 9.122662266226623, + "grad_norm": 0.30888667702674866, + "learning_rate": 3.304609939812591e-05, + "loss": 0.0157, + "num_input_tokens_seen": 17501024, + "step": 82925 + }, + { + "epoch": 9.123212321232122, + "grad_norm": 0.10678455978631973, + "learning_rate": 3.3043826994397606e-05, + "loss": 0.0277, + "num_input_tokens_seen": 17502080, + "step": 82930 + }, + { + "epoch": 9.123762376237623, + "grad_norm": 0.44780445098876953, + "learning_rate": 3.304155451653319e-05, + "loss": 0.0692, + "num_input_tokens_seen": 17503168, + "step": 82935 + }, + { + "epoch": 9.124312431243125, + "grad_norm": 0.20941179990768433, + "learning_rate": 3.303928196455364e-05, + "loss": 0.0082, + "num_input_tokens_seen": 17504256, + "step": 82940 + }, + { + "epoch": 9.124862486248626, + "grad_norm": 0.27522018551826477, + "learning_rate": 3.303700933847988e-05, + "loss": 0.0112, + "num_input_tokens_seen": 17505312, + "step": 82945 + }, + { + "epoch": 9.125412541254125, + "grad_norm": 0.007987326011061668, + "learning_rate": 3.303473663833286e-05, + "loss": 0.0572, + "num_input_tokens_seen": 17506400, + "step": 82950 + }, + { + "epoch": 9.125962596259626, + "grad_norm": 0.042774226516485214, + "learning_rate": 3.3032463864133544e-05, + "loss": 0.0022, + "num_input_tokens_seen": 17507392, + "step": 82955 + }, + { + "epoch": 9.126512651265127, + "grad_norm": 0.004909568931907415, + "learning_rate": 3.303019101590286e-05, + "loss": 0.008, + "num_input_tokens_seen": 17508448, + "step": 82960 + }, + { + "epoch": 9.127062706270626, + "grad_norm": 0.32584884762763977, + "learning_rate": 3.302791809366174e-05, + "loss": 0.0185, + "num_input_tokens_seen": 17509568, + "step": 82965 + }, + { + "epoch": 9.127612761276128, + "grad_norm": 1.6600340604782104, + "learning_rate": 3.302564509743116e-05, + "loss": 0.0542, + "num_input_tokens_seen": 17510624, + "step": 82970 + }, + { + "epoch": 9.128162816281629, + "grad_norm": 0.5722423195838928, + "learning_rate": 3.302337202723206e-05, + "loss": 0.0364, + "num_input_tokens_seen": 17511712, + "step": 82975 + }, + { + "epoch": 9.128712871287128, + "grad_norm": 0.042562857270240784, + "learning_rate": 3.3021098883085396e-05, + "loss": 0.059, + "num_input_tokens_seen": 17512832, + "step": 82980 + }, + { + "epoch": 9.129262926292629, + "grad_norm": 0.07319381088018417, + "learning_rate": 3.3018825665012104e-05, + "loss": 0.0025, + "num_input_tokens_seen": 17513920, + "step": 82985 + }, + { + "epoch": 9.12981298129813, + "grad_norm": 0.17780764400959015, + "learning_rate": 3.301655237303315e-05, + "loss": 0.0098, + "num_input_tokens_seen": 17514944, + "step": 82990 + }, + { + "epoch": 9.130363036303631, + "grad_norm": 0.045189063996076584, + "learning_rate": 3.3014279007169466e-05, + "loss": 0.0135, + "num_input_tokens_seen": 17515968, + "step": 82995 + }, + { + "epoch": 9.13091309130913, + "grad_norm": 0.10638294368982315, + "learning_rate": 3.301200556744204e-05, + "loss": 0.0529, + "num_input_tokens_seen": 17517056, + "step": 83000 + }, + { + "epoch": 9.131463146314632, + "grad_norm": 0.02522529661655426, + "learning_rate": 3.300973205387179e-05, + "loss": 0.0036, + "num_input_tokens_seen": 17518080, + "step": 83005 + }, + { + "epoch": 9.132013201320133, + "grad_norm": 0.07268126308917999, + "learning_rate": 3.300745846647968e-05, + "loss": 0.0105, + "num_input_tokens_seen": 17519168, + "step": 83010 + }, + { + "epoch": 9.132563256325632, + "grad_norm": 0.004171936307102442, + "learning_rate": 3.3005184805286674e-05, + "loss": 0.0052, + "num_input_tokens_seen": 17520288, + "step": 83015 + }, + { + "epoch": 9.133113311331133, + "grad_norm": 0.023198327049613, + "learning_rate": 3.300291107031372e-05, + "loss": 0.0161, + "num_input_tokens_seen": 17521408, + "step": 83020 + }, + { + "epoch": 9.133663366336634, + "grad_norm": 0.022641325369477272, + "learning_rate": 3.300063726158178e-05, + "loss": 0.0028, + "num_input_tokens_seen": 17522496, + "step": 83025 + }, + { + "epoch": 9.134213421342134, + "grad_norm": 0.03897479921579361, + "learning_rate": 3.29983633791118e-05, + "loss": 0.0047, + "num_input_tokens_seen": 17523616, + "step": 83030 + }, + { + "epoch": 9.134763476347635, + "grad_norm": 0.0959305465221405, + "learning_rate": 3.2996089422924734e-05, + "loss": 0.0031, + "num_input_tokens_seen": 17524640, + "step": 83035 + }, + { + "epoch": 9.135313531353136, + "grad_norm": 0.7566993236541748, + "learning_rate": 3.299381539304156e-05, + "loss": 0.0808, + "num_input_tokens_seen": 17525728, + "step": 83040 + }, + { + "epoch": 9.135863586358635, + "grad_norm": 1.1605950593948364, + "learning_rate": 3.299154128948322e-05, + "loss": 0.0238, + "num_input_tokens_seen": 17526784, + "step": 83045 + }, + { + "epoch": 9.136413641364136, + "grad_norm": 0.004846260882914066, + "learning_rate": 3.298926711227068e-05, + "loss": 0.0179, + "num_input_tokens_seen": 17527840, + "step": 83050 + }, + { + "epoch": 9.136963696369637, + "grad_norm": 0.06675233691930771, + "learning_rate": 3.2986992861424904e-05, + "loss": 0.0807, + "num_input_tokens_seen": 17528832, + "step": 83055 + }, + { + "epoch": 9.137513751375138, + "grad_norm": 0.24167002737522125, + "learning_rate": 3.2984718536966846e-05, + "loss": 0.129, + "num_input_tokens_seen": 17529920, + "step": 83060 + }, + { + "epoch": 9.138063806380638, + "grad_norm": 0.00793936662375927, + "learning_rate": 3.298244413891746e-05, + "loss": 0.0023, + "num_input_tokens_seen": 17530976, + "step": 83065 + }, + { + "epoch": 9.138613861386139, + "grad_norm": 0.00799570232629776, + "learning_rate": 3.298016966729772e-05, + "loss": 0.0094, + "num_input_tokens_seen": 17532000, + "step": 83070 + }, + { + "epoch": 9.13916391639164, + "grad_norm": 0.08682357519865036, + "learning_rate": 3.29778951221286e-05, + "loss": 0.0868, + "num_input_tokens_seen": 17533152, + "step": 83075 + }, + { + "epoch": 9.13971397139714, + "grad_norm": 0.382008820772171, + "learning_rate": 3.2975620503431035e-05, + "loss": 0.0092, + "num_input_tokens_seen": 17534208, + "step": 83080 + }, + { + "epoch": 9.14026402640264, + "grad_norm": 1.5601091384887695, + "learning_rate": 3.2973345811226006e-05, + "loss": 0.1647, + "num_input_tokens_seen": 17535232, + "step": 83085 + }, + { + "epoch": 9.140814081408141, + "grad_norm": 1.8482394218444824, + "learning_rate": 3.2971071045534466e-05, + "loss": 0.0743, + "num_input_tokens_seen": 17536320, + "step": 83090 + }, + { + "epoch": 9.14136413641364, + "grad_norm": 0.010179135948419571, + "learning_rate": 3.29687962063774e-05, + "loss": 0.1532, + "num_input_tokens_seen": 17537408, + "step": 83095 + }, + { + "epoch": 9.141914191419142, + "grad_norm": 0.28815969824790955, + "learning_rate": 3.296652129377576e-05, + "loss": 0.0139, + "num_input_tokens_seen": 17538496, + "step": 83100 + }, + { + "epoch": 9.142464246424643, + "grad_norm": 0.041682954877614975, + "learning_rate": 3.296424630775052e-05, + "loss": 0.0714, + "num_input_tokens_seen": 17539520, + "step": 83105 + }, + { + "epoch": 9.143014301430142, + "grad_norm": 0.14044931530952454, + "learning_rate": 3.296197124832263e-05, + "loss": 0.0281, + "num_input_tokens_seen": 17540640, + "step": 83110 + }, + { + "epoch": 9.143564356435643, + "grad_norm": 0.39709004759788513, + "learning_rate": 3.2959696115513086e-05, + "loss": 0.0219, + "num_input_tokens_seen": 17541664, + "step": 83115 + }, + { + "epoch": 9.144114411441144, + "grad_norm": 0.07573328167200089, + "learning_rate": 3.295742090934284e-05, + "loss": 0.0451, + "num_input_tokens_seen": 17542720, + "step": 83120 + }, + { + "epoch": 9.144664466446645, + "grad_norm": 0.009435238316655159, + "learning_rate": 3.295514562983286e-05, + "loss": 0.0035, + "num_input_tokens_seen": 17543872, + "step": 83125 + }, + { + "epoch": 9.145214521452145, + "grad_norm": 0.4512721002101898, + "learning_rate": 3.295287027700412e-05, + "loss": 0.0233, + "num_input_tokens_seen": 17544960, + "step": 83130 + }, + { + "epoch": 9.145764576457646, + "grad_norm": 0.1847534477710724, + "learning_rate": 3.295059485087759e-05, + "loss": 0.0896, + "num_input_tokens_seen": 17546016, + "step": 83135 + }, + { + "epoch": 9.146314631463147, + "grad_norm": 0.09011123329401016, + "learning_rate": 3.294831935147424e-05, + "loss": 0.0256, + "num_input_tokens_seen": 17547104, + "step": 83140 + }, + { + "epoch": 9.146864686468646, + "grad_norm": 0.09667883813381195, + "learning_rate": 3.294604377881505e-05, + "loss": 0.003, + "num_input_tokens_seen": 17548128, + "step": 83145 + }, + { + "epoch": 9.147414741474147, + "grad_norm": 0.006394405383616686, + "learning_rate": 3.294376813292099e-05, + "loss": 0.0027, + "num_input_tokens_seen": 17549216, + "step": 83150 + }, + { + "epoch": 9.147964796479648, + "grad_norm": 0.0613832101225853, + "learning_rate": 3.294149241381302e-05, + "loss": 0.0096, + "num_input_tokens_seen": 17550272, + "step": 83155 + }, + { + "epoch": 9.148514851485148, + "grad_norm": 0.014255943708121777, + "learning_rate": 3.2939216621512134e-05, + "loss": 0.0129, + "num_input_tokens_seen": 17551328, + "step": 83160 + }, + { + "epoch": 9.149064906490649, + "grad_norm": 0.018205901607871056, + "learning_rate": 3.29369407560393e-05, + "loss": 0.0513, + "num_input_tokens_seen": 17552384, + "step": 83165 + }, + { + "epoch": 9.14961496149615, + "grad_norm": 0.01992347091436386, + "learning_rate": 3.293466481741548e-05, + "loss": 0.1161, + "num_input_tokens_seen": 17553472, + "step": 83170 + }, + { + "epoch": 9.150165016501651, + "grad_norm": 0.02853976935148239, + "learning_rate": 3.293238880566167e-05, + "loss": 0.0088, + "num_input_tokens_seen": 17554528, + "step": 83175 + }, + { + "epoch": 9.15071507150715, + "grad_norm": 0.09209251403808594, + "learning_rate": 3.2930112720798846e-05, + "loss": 0.0224, + "num_input_tokens_seen": 17555584, + "step": 83180 + }, + { + "epoch": 9.151265126512651, + "grad_norm": 0.2584065794944763, + "learning_rate": 3.2927836562847964e-05, + "loss": 0.0098, + "num_input_tokens_seen": 17556576, + "step": 83185 + }, + { + "epoch": 9.151815181518153, + "grad_norm": 0.4475342035293579, + "learning_rate": 3.292556033183003e-05, + "loss": 0.007, + "num_input_tokens_seen": 17557600, + "step": 83190 + }, + { + "epoch": 9.152365236523652, + "grad_norm": 0.019226403906941414, + "learning_rate": 3.292328402776599e-05, + "loss": 0.0091, + "num_input_tokens_seen": 17558656, + "step": 83195 + }, + { + "epoch": 9.152915291529153, + "grad_norm": 0.006519859656691551, + "learning_rate": 3.292100765067686e-05, + "loss": 0.0033, + "num_input_tokens_seen": 17559712, + "step": 83200 + }, + { + "epoch": 9.153465346534654, + "grad_norm": 0.02292523719370365, + "learning_rate": 3.2918731200583594e-05, + "loss": 0.0989, + "num_input_tokens_seen": 17560736, + "step": 83205 + }, + { + "epoch": 9.154015401540153, + "grad_norm": 0.2392256259918213, + "learning_rate": 3.2916454677507194e-05, + "loss": 0.0068, + "num_input_tokens_seen": 17561792, + "step": 83210 + }, + { + "epoch": 9.154565456545654, + "grad_norm": 0.03747357428073883, + "learning_rate": 3.291417808146862e-05, + "loss": 0.0094, + "num_input_tokens_seen": 17562752, + "step": 83215 + }, + { + "epoch": 9.155115511551156, + "grad_norm": 0.07268942892551422, + "learning_rate": 3.291190141248887e-05, + "loss": 0.0058, + "num_input_tokens_seen": 17563840, + "step": 83220 + }, + { + "epoch": 9.155665566556655, + "grad_norm": 0.11050526052713394, + "learning_rate": 3.290962467058891e-05, + "loss": 0.0069, + "num_input_tokens_seen": 17564832, + "step": 83225 + }, + { + "epoch": 9.156215621562156, + "grad_norm": 0.05546412616968155, + "learning_rate": 3.290734785578975e-05, + "loss": 0.0088, + "num_input_tokens_seen": 17565888, + "step": 83230 + }, + { + "epoch": 9.156765676567657, + "grad_norm": 1.0810215473175049, + "learning_rate": 3.290507096811235e-05, + "loss": 0.0976, + "num_input_tokens_seen": 17566944, + "step": 83235 + }, + { + "epoch": 9.157315731573158, + "grad_norm": 0.22394637763500214, + "learning_rate": 3.290279400757771e-05, + "loss": 0.0056, + "num_input_tokens_seen": 17568032, + "step": 83240 + }, + { + "epoch": 9.157865786578657, + "grad_norm": 0.009000211954116821, + "learning_rate": 3.2900516974206804e-05, + "loss": 0.0099, + "num_input_tokens_seen": 17568992, + "step": 83245 + }, + { + "epoch": 9.158415841584159, + "grad_norm": 0.9830825328826904, + "learning_rate": 3.2898239868020624e-05, + "loss": 0.0846, + "num_input_tokens_seen": 17569984, + "step": 83250 + }, + { + "epoch": 9.15896589658966, + "grad_norm": 1.089016079902649, + "learning_rate": 3.289596268904016e-05, + "loss": 0.0492, + "num_input_tokens_seen": 17571104, + "step": 83255 + }, + { + "epoch": 9.159515951595159, + "grad_norm": 0.009204954840242863, + "learning_rate": 3.2893685437286396e-05, + "loss": 0.0102, + "num_input_tokens_seen": 17572128, + "step": 83260 + }, + { + "epoch": 9.16006600660066, + "grad_norm": 0.1363781839609146, + "learning_rate": 3.2891408112780326e-05, + "loss": 0.0105, + "num_input_tokens_seen": 17573184, + "step": 83265 + }, + { + "epoch": 9.160616061606161, + "grad_norm": 0.06110754236578941, + "learning_rate": 3.2889130715542926e-05, + "loss": 0.0113, + "num_input_tokens_seen": 17574240, + "step": 83270 + }, + { + "epoch": 9.16116611661166, + "grad_norm": 0.04209807515144348, + "learning_rate": 3.2886853245595194e-05, + "loss": 0.0271, + "num_input_tokens_seen": 17575328, + "step": 83275 + }, + { + "epoch": 9.161716171617162, + "grad_norm": 0.07833049446344376, + "learning_rate": 3.288457570295812e-05, + "loss": 0.1245, + "num_input_tokens_seen": 17576320, + "step": 83280 + }, + { + "epoch": 9.162266226622663, + "grad_norm": 1.1254429817199707, + "learning_rate": 3.28822980876527e-05, + "loss": 0.0494, + "num_input_tokens_seen": 17577312, + "step": 83285 + }, + { + "epoch": 9.162816281628162, + "grad_norm": 2.4914793968200684, + "learning_rate": 3.2880020399699925e-05, + "loss": 0.0275, + "num_input_tokens_seen": 17578400, + "step": 83290 + }, + { + "epoch": 9.163366336633663, + "grad_norm": 0.06295929104089737, + "learning_rate": 3.2877742639120776e-05, + "loss": 0.0286, + "num_input_tokens_seen": 17579392, + "step": 83295 + }, + { + "epoch": 9.163916391639164, + "grad_norm": 1.2962592840194702, + "learning_rate": 3.287546480593625e-05, + "loss": 0.2746, + "num_input_tokens_seen": 17580480, + "step": 83300 + }, + { + "epoch": 9.164466446644665, + "grad_norm": 0.2090766727924347, + "learning_rate": 3.2873186900167355e-05, + "loss": 0.0073, + "num_input_tokens_seen": 17581600, + "step": 83305 + }, + { + "epoch": 9.165016501650165, + "grad_norm": 0.11936111748218536, + "learning_rate": 3.287090892183506e-05, + "loss": 0.0744, + "num_input_tokens_seen": 17582688, + "step": 83310 + }, + { + "epoch": 9.165566556655666, + "grad_norm": 0.134373277425766, + "learning_rate": 3.286863087096038e-05, + "loss": 0.0058, + "num_input_tokens_seen": 17583744, + "step": 83315 + }, + { + "epoch": 9.166116611661167, + "grad_norm": 0.042950671166181564, + "learning_rate": 3.286635274756431e-05, + "loss": 0.0034, + "num_input_tokens_seen": 17584832, + "step": 83320 + }, + { + "epoch": 9.166666666666666, + "grad_norm": 0.03810778632760048, + "learning_rate": 3.286407455166784e-05, + "loss": 0.0084, + "num_input_tokens_seen": 17585888, + "step": 83325 + }, + { + "epoch": 9.167216721672167, + "grad_norm": 0.404617577791214, + "learning_rate": 3.2861796283291965e-05, + "loss": 0.0205, + "num_input_tokens_seen": 17586880, + "step": 83330 + }, + { + "epoch": 9.167766776677668, + "grad_norm": 2.6258370876312256, + "learning_rate": 3.285951794245769e-05, + "loss": 0.0326, + "num_input_tokens_seen": 17587968, + "step": 83335 + }, + { + "epoch": 9.168316831683168, + "grad_norm": 0.09610308706760406, + "learning_rate": 3.2857239529186e-05, + "loss": 0.0591, + "num_input_tokens_seen": 17589120, + "step": 83340 + }, + { + "epoch": 9.168866886688669, + "grad_norm": 0.22184529900550842, + "learning_rate": 3.285496104349791e-05, + "loss": 0.0096, + "num_input_tokens_seen": 17590208, + "step": 83345 + }, + { + "epoch": 9.16941694169417, + "grad_norm": 0.8341006636619568, + "learning_rate": 3.285268248541442e-05, + "loss": 0.0245, + "num_input_tokens_seen": 17591296, + "step": 83350 + }, + { + "epoch": 9.16996699669967, + "grad_norm": 0.05870533362030983, + "learning_rate": 3.285040385495652e-05, + "loss": 0.0426, + "num_input_tokens_seen": 17592352, + "step": 83355 + }, + { + "epoch": 9.17051705170517, + "grad_norm": 0.05355369299650192, + "learning_rate": 3.2848125152145206e-05, + "loss": 0.1206, + "num_input_tokens_seen": 17593376, + "step": 83360 + }, + { + "epoch": 9.171067106710671, + "grad_norm": 0.0421321727335453, + "learning_rate": 3.28458463770015e-05, + "loss": 0.0128, + "num_input_tokens_seen": 17594496, + "step": 83365 + }, + { + "epoch": 9.171617161716172, + "grad_norm": 0.8899160623550415, + "learning_rate": 3.284356752954638e-05, + "loss": 0.1319, + "num_input_tokens_seen": 17595520, + "step": 83370 + }, + { + "epoch": 9.172167216721672, + "grad_norm": 0.0459073930978775, + "learning_rate": 3.284128860980087e-05, + "loss": 0.0121, + "num_input_tokens_seen": 17596640, + "step": 83375 + }, + { + "epoch": 9.172717271727173, + "grad_norm": 0.021907487884163857, + "learning_rate": 3.2839009617785974e-05, + "loss": 0.025, + "num_input_tokens_seen": 17597696, + "step": 83380 + }, + { + "epoch": 9.173267326732674, + "grad_norm": 0.2802891731262207, + "learning_rate": 3.2836730553522674e-05, + "loss": 0.0098, + "num_input_tokens_seen": 17598784, + "step": 83385 + }, + { + "epoch": 9.173817381738173, + "grad_norm": 0.01843162812292576, + "learning_rate": 3.2834451417031994e-05, + "loss": 0.0038, + "num_input_tokens_seen": 17599808, + "step": 83390 + }, + { + "epoch": 9.174367436743674, + "grad_norm": 0.015690414234995842, + "learning_rate": 3.2832172208334933e-05, + "loss": 0.0023, + "num_input_tokens_seen": 17600896, + "step": 83395 + }, + { + "epoch": 9.174917491749175, + "grad_norm": 0.21322636306285858, + "learning_rate": 3.28298929274525e-05, + "loss": 0.029, + "num_input_tokens_seen": 17601920, + "step": 83400 + }, + { + "epoch": 9.175467546754675, + "grad_norm": 0.004233343992382288, + "learning_rate": 3.28276135744057e-05, + "loss": 0.0373, + "num_input_tokens_seen": 17602944, + "step": 83405 + }, + { + "epoch": 9.176017601760176, + "grad_norm": 0.004681181628257036, + "learning_rate": 3.2825334149215545e-05, + "loss": 0.0069, + "num_input_tokens_seen": 17604032, + "step": 83410 + }, + { + "epoch": 9.176567656765677, + "grad_norm": 0.024219784885644913, + "learning_rate": 3.282305465190303e-05, + "loss": 0.0026, + "num_input_tokens_seen": 17605120, + "step": 83415 + }, + { + "epoch": 9.177117711771178, + "grad_norm": 0.02077723853290081, + "learning_rate": 3.2820775082489185e-05, + "loss": 0.0128, + "num_input_tokens_seen": 17606176, + "step": 83420 + }, + { + "epoch": 9.177667766776677, + "grad_norm": 1.1607286930084229, + "learning_rate": 3.281849544099501e-05, + "loss": 0.0404, + "num_input_tokens_seen": 17607232, + "step": 83425 + }, + { + "epoch": 9.178217821782178, + "grad_norm": 1.4931432008743286, + "learning_rate": 3.281621572744151e-05, + "loss": 0.0882, + "num_input_tokens_seen": 17608352, + "step": 83430 + }, + { + "epoch": 9.17876787678768, + "grad_norm": 0.023524755612015724, + "learning_rate": 3.28139359418497e-05, + "loss": 0.0275, + "num_input_tokens_seen": 17609312, + "step": 83435 + }, + { + "epoch": 9.179317931793179, + "grad_norm": 0.45145538449287415, + "learning_rate": 3.2811656084240594e-05, + "loss": 0.0261, + "num_input_tokens_seen": 17610368, + "step": 83440 + }, + { + "epoch": 9.17986798679868, + "grad_norm": 0.14801733195781708, + "learning_rate": 3.2809376154635194e-05, + "loss": 0.0853, + "num_input_tokens_seen": 17611456, + "step": 83445 + }, + { + "epoch": 9.180418041804181, + "grad_norm": 0.0015257032355293632, + "learning_rate": 3.2807096153054526e-05, + "loss": 0.1036, + "num_input_tokens_seen": 17612512, + "step": 83450 + }, + { + "epoch": 9.18096809680968, + "grad_norm": 1.7505114078521729, + "learning_rate": 3.2804816079519614e-05, + "loss": 0.0338, + "num_input_tokens_seen": 17613504, + "step": 83455 + }, + { + "epoch": 9.181518151815181, + "grad_norm": 0.02358069270849228, + "learning_rate": 3.280253593405144e-05, + "loss": 0.0485, + "num_input_tokens_seen": 17614560, + "step": 83460 + }, + { + "epoch": 9.182068206820682, + "grad_norm": 0.07022460550069809, + "learning_rate": 3.2800255716671035e-05, + "loss": 0.1182, + "num_input_tokens_seen": 17615616, + "step": 83465 + }, + { + "epoch": 9.182618261826182, + "grad_norm": 0.14789587259292603, + "learning_rate": 3.279797542739943e-05, + "loss": 0.0066, + "num_input_tokens_seen": 17616704, + "step": 83470 + }, + { + "epoch": 9.183168316831683, + "grad_norm": 0.2254732996225357, + "learning_rate": 3.2795695066257616e-05, + "loss": 0.0134, + "num_input_tokens_seen": 17617824, + "step": 83475 + }, + { + "epoch": 9.183718371837184, + "grad_norm": 0.04207180067896843, + "learning_rate": 3.279341463326662e-05, + "loss": 0.0072, + "num_input_tokens_seen": 17618880, + "step": 83480 + }, + { + "epoch": 9.184268426842685, + "grad_norm": 0.9907525181770325, + "learning_rate": 3.279113412844746e-05, + "loss": 0.0209, + "num_input_tokens_seen": 17619872, + "step": 83485 + }, + { + "epoch": 9.184818481848184, + "grad_norm": 0.011400642804801464, + "learning_rate": 3.278885355182116e-05, + "loss": 0.0227, + "num_input_tokens_seen": 17620960, + "step": 83490 + }, + { + "epoch": 9.185368536853685, + "grad_norm": 0.04733598977327347, + "learning_rate": 3.278657290340874e-05, + "loss": 0.0081, + "num_input_tokens_seen": 17621952, + "step": 83495 + }, + { + "epoch": 9.185918591859187, + "grad_norm": 1.0048528909683228, + "learning_rate": 3.2784292183231206e-05, + "loss": 0.1278, + "num_input_tokens_seen": 17623072, + "step": 83500 + }, + { + "epoch": 9.186468646864686, + "grad_norm": 0.1325683444738388, + "learning_rate": 3.278201139130959e-05, + "loss": 0.0359, + "num_input_tokens_seen": 17624096, + "step": 83505 + }, + { + "epoch": 9.187018701870187, + "grad_norm": 1.628527283668518, + "learning_rate": 3.277973052766491e-05, + "loss": 0.0374, + "num_input_tokens_seen": 17625088, + "step": 83510 + }, + { + "epoch": 9.187568756875688, + "grad_norm": 0.02137899212539196, + "learning_rate": 3.2777449592318176e-05, + "loss": 0.0212, + "num_input_tokens_seen": 17626112, + "step": 83515 + }, + { + "epoch": 9.188118811881187, + "grad_norm": 0.03193928301334381, + "learning_rate": 3.277516858529043e-05, + "loss": 0.0072, + "num_input_tokens_seen": 17627168, + "step": 83520 + }, + { + "epoch": 9.188668866886688, + "grad_norm": 0.06551096588373184, + "learning_rate": 3.27728875066027e-05, + "loss": 0.0037, + "num_input_tokens_seen": 17628160, + "step": 83525 + }, + { + "epoch": 9.18921892189219, + "grad_norm": 0.004661389626562595, + "learning_rate": 3.277060635627597e-05, + "loss": 0.0056, + "num_input_tokens_seen": 17629184, + "step": 83530 + }, + { + "epoch": 9.189768976897689, + "grad_norm": 0.5056986212730408, + "learning_rate": 3.27683251343313e-05, + "loss": 0.0455, + "num_input_tokens_seen": 17630240, + "step": 83535 + }, + { + "epoch": 9.19031903190319, + "grad_norm": 0.19568856060504913, + "learning_rate": 3.276604384078971e-05, + "loss": 0.0237, + "num_input_tokens_seen": 17631328, + "step": 83540 + }, + { + "epoch": 9.190869086908691, + "grad_norm": 0.07404179126024246, + "learning_rate": 3.276376247567221e-05, + "loss": 0.0547, + "num_input_tokens_seen": 17632384, + "step": 83545 + }, + { + "epoch": 9.191419141914192, + "grad_norm": 0.3344661593437195, + "learning_rate": 3.276148103899985e-05, + "loss": 0.006, + "num_input_tokens_seen": 17633408, + "step": 83550 + }, + { + "epoch": 9.191969196919691, + "grad_norm": 0.00990899745374918, + "learning_rate": 3.275919953079364e-05, + "loss": 0.0119, + "num_input_tokens_seen": 17634496, + "step": 83555 + }, + { + "epoch": 9.192519251925193, + "grad_norm": 0.009465563111007214, + "learning_rate": 3.2756917951074605e-05, + "loss": 0.0451, + "num_input_tokens_seen": 17635616, + "step": 83560 + }, + { + "epoch": 9.193069306930694, + "grad_norm": 0.09172270447015762, + "learning_rate": 3.275463629986378e-05, + "loss": 0.0016, + "num_input_tokens_seen": 17636608, + "step": 83565 + }, + { + "epoch": 9.193619361936193, + "grad_norm": 0.06726249307394028, + "learning_rate": 3.2752354577182205e-05, + "loss": 0.0035, + "num_input_tokens_seen": 17637696, + "step": 83570 + }, + { + "epoch": 9.194169416941694, + "grad_norm": 0.2963559627532959, + "learning_rate": 3.2750072783050884e-05, + "loss": 0.0789, + "num_input_tokens_seen": 17638720, + "step": 83575 + }, + { + "epoch": 9.194719471947195, + "grad_norm": 1.4679763317108154, + "learning_rate": 3.274779091749086e-05, + "loss": 0.0494, + "num_input_tokens_seen": 17639840, + "step": 83580 + }, + { + "epoch": 9.195269526952695, + "grad_norm": 0.16044436395168304, + "learning_rate": 3.2745508980523173e-05, + "loss": 0.011, + "num_input_tokens_seen": 17640928, + "step": 83585 + }, + { + "epoch": 9.195819581958196, + "grad_norm": 0.013305000960826874, + "learning_rate": 3.274322697216884e-05, + "loss": 0.0035, + "num_input_tokens_seen": 17642016, + "step": 83590 + }, + { + "epoch": 9.196369636963697, + "grad_norm": 0.07327401638031006, + "learning_rate": 3.274094489244891e-05, + "loss": 0.0455, + "num_input_tokens_seen": 17643072, + "step": 83595 + }, + { + "epoch": 9.196919691969198, + "grad_norm": 0.21296538412570953, + "learning_rate": 3.273866274138439e-05, + "loss": 0.0139, + "num_input_tokens_seen": 17644160, + "step": 83600 + }, + { + "epoch": 9.197469746974697, + "grad_norm": 0.011182058602571487, + "learning_rate": 3.2736380518996344e-05, + "loss": 0.0504, + "num_input_tokens_seen": 17645152, + "step": 83605 + }, + { + "epoch": 9.198019801980198, + "grad_norm": 0.7281731963157654, + "learning_rate": 3.273409822530578e-05, + "loss": 0.0162, + "num_input_tokens_seen": 17646176, + "step": 83610 + }, + { + "epoch": 9.1985698569857, + "grad_norm": 0.85654616355896, + "learning_rate": 3.2731815860333755e-05, + "loss": 0.0606, + "num_input_tokens_seen": 17647264, + "step": 83615 + }, + { + "epoch": 9.199119911991199, + "grad_norm": 0.04383733123540878, + "learning_rate": 3.272953342410128e-05, + "loss": 0.0039, + "num_input_tokens_seen": 17648384, + "step": 83620 + }, + { + "epoch": 9.1996699669967, + "grad_norm": 0.014457770623266697, + "learning_rate": 3.2727250916629414e-05, + "loss": 0.0017, + "num_input_tokens_seen": 17649472, + "step": 83625 + }, + { + "epoch": 9.2002200220022, + "grad_norm": 0.002821115544065833, + "learning_rate": 3.2724968337939185e-05, + "loss": 0.0142, + "num_input_tokens_seen": 17650560, + "step": 83630 + }, + { + "epoch": 9.2007700770077, + "grad_norm": 0.010286812670528889, + "learning_rate": 3.272268568805162e-05, + "loss": 0.0032, + "num_input_tokens_seen": 17651616, + "step": 83635 + }, + { + "epoch": 9.201320132013201, + "grad_norm": 2.3996472358703613, + "learning_rate": 3.272040296698777e-05, + "loss": 0.0199, + "num_input_tokens_seen": 17652672, + "step": 83640 + }, + { + "epoch": 9.201870187018702, + "grad_norm": 0.23619522154331207, + "learning_rate": 3.2718120174768676e-05, + "loss": 0.0043, + "num_input_tokens_seen": 17653696, + "step": 83645 + }, + { + "epoch": 9.202420242024202, + "grad_norm": 0.7825743556022644, + "learning_rate": 3.2715837311415374e-05, + "loss": 0.1489, + "num_input_tokens_seen": 17654688, + "step": 83650 + }, + { + "epoch": 9.202970297029703, + "grad_norm": 0.038202084600925446, + "learning_rate": 3.27135543769489e-05, + "loss": 0.0293, + "num_input_tokens_seen": 17655744, + "step": 83655 + }, + { + "epoch": 9.203520352035204, + "grad_norm": 1.3634984493255615, + "learning_rate": 3.271127137139029e-05, + "loss": 0.1425, + "num_input_tokens_seen": 17656832, + "step": 83660 + }, + { + "epoch": 9.204070407040705, + "grad_norm": 0.025919470936059952, + "learning_rate": 3.2708988294760596e-05, + "loss": 0.0087, + "num_input_tokens_seen": 17657856, + "step": 83665 + }, + { + "epoch": 9.204620462046204, + "grad_norm": 0.13897459208965302, + "learning_rate": 3.270670514708086e-05, + "loss": 0.0039, + "num_input_tokens_seen": 17658880, + "step": 83670 + }, + { + "epoch": 9.205170517051705, + "grad_norm": 0.004893856588751078, + "learning_rate": 3.270442192837211e-05, + "loss": 0.0938, + "num_input_tokens_seen": 17659936, + "step": 83675 + }, + { + "epoch": 9.205720572057206, + "grad_norm": 1.4336494207382202, + "learning_rate": 3.2702138638655406e-05, + "loss": 0.0158, + "num_input_tokens_seen": 17660992, + "step": 83680 + }, + { + "epoch": 9.206270627062706, + "grad_norm": 0.5748856067657471, + "learning_rate": 3.269985527795179e-05, + "loss": 0.0096, + "num_input_tokens_seen": 17662080, + "step": 83685 + }, + { + "epoch": 9.206820682068207, + "grad_norm": 0.018432658165693283, + "learning_rate": 3.269757184628229e-05, + "loss": 0.0211, + "num_input_tokens_seen": 17663168, + "step": 83690 + }, + { + "epoch": 9.207370737073708, + "grad_norm": 0.023410120978951454, + "learning_rate": 3.269528834366798e-05, + "loss": 0.0078, + "num_input_tokens_seen": 17664256, + "step": 83695 + }, + { + "epoch": 9.207920792079207, + "grad_norm": 0.03818771615624428, + "learning_rate": 3.269300477012988e-05, + "loss": 0.0016, + "num_input_tokens_seen": 17665280, + "step": 83700 + }, + { + "epoch": 9.208470847084708, + "grad_norm": 0.011769530363380909, + "learning_rate": 3.269072112568905e-05, + "loss": 0.0909, + "num_input_tokens_seen": 17666304, + "step": 83705 + }, + { + "epoch": 9.20902090209021, + "grad_norm": 0.021515298634767532, + "learning_rate": 3.268843741036653e-05, + "loss": 0.0212, + "num_input_tokens_seen": 17667424, + "step": 83710 + }, + { + "epoch": 9.209570957095709, + "grad_norm": 0.08973892778158188, + "learning_rate": 3.268615362418338e-05, + "loss": 0.0832, + "num_input_tokens_seen": 17668448, + "step": 83715 + }, + { + "epoch": 9.21012101210121, + "grad_norm": 0.27593275904655457, + "learning_rate": 3.268386976716063e-05, + "loss": 0.0135, + "num_input_tokens_seen": 17669504, + "step": 83720 + }, + { + "epoch": 9.210671067106711, + "grad_norm": 0.015278897248208523, + "learning_rate": 3.2681585839319345e-05, + "loss": 0.0889, + "num_input_tokens_seen": 17670496, + "step": 83725 + }, + { + "epoch": 9.211221122112212, + "grad_norm": 0.07198019325733185, + "learning_rate": 3.267930184068057e-05, + "loss": 0.0888, + "num_input_tokens_seen": 17671520, + "step": 83730 + }, + { + "epoch": 9.211771177117711, + "grad_norm": 0.021537380293011665, + "learning_rate": 3.267701777126535e-05, + "loss": 0.0213, + "num_input_tokens_seen": 17672512, + "step": 83735 + }, + { + "epoch": 9.212321232123212, + "grad_norm": 0.008381843566894531, + "learning_rate": 3.267473363109475e-05, + "loss": 0.0065, + "num_input_tokens_seen": 17673504, + "step": 83740 + }, + { + "epoch": 9.212871287128714, + "grad_norm": 0.025537606328725815, + "learning_rate": 3.267244942018981e-05, + "loss": 0.0433, + "num_input_tokens_seen": 17674528, + "step": 83745 + }, + { + "epoch": 9.213421342134213, + "grad_norm": 0.053175732493400574, + "learning_rate": 3.267016513857158e-05, + "loss": 0.0256, + "num_input_tokens_seen": 17675552, + "step": 83750 + }, + { + "epoch": 9.213971397139714, + "grad_norm": 0.005615684669464827, + "learning_rate": 3.2667880786261116e-05, + "loss": 0.0017, + "num_input_tokens_seen": 17676640, + "step": 83755 + }, + { + "epoch": 9.214521452145215, + "grad_norm": 1.23282790184021, + "learning_rate": 3.2665596363279485e-05, + "loss": 0.1962, + "num_input_tokens_seen": 17677728, + "step": 83760 + }, + { + "epoch": 9.215071507150714, + "grad_norm": 0.03913973271846771, + "learning_rate": 3.2663311869647724e-05, + "loss": 0.0239, + "num_input_tokens_seen": 17678720, + "step": 83765 + }, + { + "epoch": 9.215621562156215, + "grad_norm": 0.03734038397669792, + "learning_rate": 3.266102730538689e-05, + "loss": 0.0014, + "num_input_tokens_seen": 17679776, + "step": 83770 + }, + { + "epoch": 9.216171617161717, + "grad_norm": 0.12058380991220474, + "learning_rate": 3.265874267051806e-05, + "loss": 0.0094, + "num_input_tokens_seen": 17680832, + "step": 83775 + }, + { + "epoch": 9.216721672167218, + "grad_norm": 0.024878347292542458, + "learning_rate": 3.2656457965062256e-05, + "loss": 0.0387, + "num_input_tokens_seen": 17681888, + "step": 83780 + }, + { + "epoch": 9.217271727172717, + "grad_norm": 0.0390155091881752, + "learning_rate": 3.265417318904056e-05, + "loss": 0.0294, + "num_input_tokens_seen": 17683008, + "step": 83785 + }, + { + "epoch": 9.217821782178218, + "grad_norm": 0.004819189198315144, + "learning_rate": 3.265188834247402e-05, + "loss": 0.021, + "num_input_tokens_seen": 17684064, + "step": 83790 + }, + { + "epoch": 9.218371837183719, + "grad_norm": 0.1418527364730835, + "learning_rate": 3.26496034253837e-05, + "loss": 0.0227, + "num_input_tokens_seen": 17685056, + "step": 83795 + }, + { + "epoch": 9.218921892189218, + "grad_norm": 0.03352469578385353, + "learning_rate": 3.264731843779065e-05, + "loss": 0.0023, + "num_input_tokens_seen": 17686144, + "step": 83800 + }, + { + "epoch": 9.21947194719472, + "grad_norm": 0.29645484685897827, + "learning_rate": 3.2645033379715946e-05, + "loss": 0.0562, + "num_input_tokens_seen": 17687168, + "step": 83805 + }, + { + "epoch": 9.22002200220022, + "grad_norm": 3.148132562637329, + "learning_rate": 3.264274825118063e-05, + "loss": 0.054, + "num_input_tokens_seen": 17688160, + "step": 83810 + }, + { + "epoch": 9.22057205720572, + "grad_norm": 0.015781709924340248, + "learning_rate": 3.264046305220578e-05, + "loss": 0.0305, + "num_input_tokens_seen": 17689248, + "step": 83815 + }, + { + "epoch": 9.221122112211221, + "grad_norm": 0.04057827964425087, + "learning_rate": 3.263817778281243e-05, + "loss": 0.0503, + "num_input_tokens_seen": 17690272, + "step": 83820 + }, + { + "epoch": 9.221672167216722, + "grad_norm": 0.02848030813038349, + "learning_rate": 3.263589244302168e-05, + "loss": 0.026, + "num_input_tokens_seen": 17691328, + "step": 83825 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 0.011785940267145634, + "learning_rate": 3.263360703285456e-05, + "loss": 0.1684, + "num_input_tokens_seen": 17692416, + "step": 83830 + }, + { + "epoch": 9.222772277227723, + "grad_norm": 0.39516907930374146, + "learning_rate": 3.263132155233216e-05, + "loss": 0.1229, + "num_input_tokens_seen": 17693472, + "step": 83835 + }, + { + "epoch": 9.223322332233224, + "grad_norm": 0.00869562104344368, + "learning_rate": 3.262903600147552e-05, + "loss": 0.0615, + "num_input_tokens_seen": 17694432, + "step": 83840 + }, + { + "epoch": 9.223872387238725, + "grad_norm": 0.9473168253898621, + "learning_rate": 3.262675038030571e-05, + "loss": 0.0755, + "num_input_tokens_seen": 17695424, + "step": 83845 + }, + { + "epoch": 9.224422442244224, + "grad_norm": 0.053447164595127106, + "learning_rate": 3.262446468884381e-05, + "loss": 0.0598, + "num_input_tokens_seen": 17696512, + "step": 83850 + }, + { + "epoch": 9.224972497249725, + "grad_norm": 0.1161348819732666, + "learning_rate": 3.2622178927110894e-05, + "loss": 0.1091, + "num_input_tokens_seen": 17697536, + "step": 83855 + }, + { + "epoch": 9.225522552255226, + "grad_norm": 0.03311781585216522, + "learning_rate": 3.261989309512799e-05, + "loss": 0.0337, + "num_input_tokens_seen": 17698592, + "step": 83860 + }, + { + "epoch": 9.226072607260726, + "grad_norm": 0.1671553999185562, + "learning_rate": 3.261760719291619e-05, + "loss": 0.0906, + "num_input_tokens_seen": 17699680, + "step": 83865 + }, + { + "epoch": 9.226622662266227, + "grad_norm": 1.009209156036377, + "learning_rate": 3.2615321220496564e-05, + "loss": 0.119, + "num_input_tokens_seen": 17700704, + "step": 83870 + }, + { + "epoch": 9.227172717271728, + "grad_norm": 0.2897104024887085, + "learning_rate": 3.261303517789018e-05, + "loss": 0.0333, + "num_input_tokens_seen": 17701728, + "step": 83875 + }, + { + "epoch": 9.227722772277227, + "grad_norm": 0.057303208857774734, + "learning_rate": 3.26107490651181e-05, + "loss": 0.0127, + "num_input_tokens_seen": 17702784, + "step": 83880 + }, + { + "epoch": 9.228272827282728, + "grad_norm": 0.0370657779276371, + "learning_rate": 3.26084628822014e-05, + "loss": 0.0111, + "num_input_tokens_seen": 17703840, + "step": 83885 + }, + { + "epoch": 9.22882288228823, + "grad_norm": 0.019171863794326782, + "learning_rate": 3.2606176629161146e-05, + "loss": 0.0055, + "num_input_tokens_seen": 17704928, + "step": 83890 + }, + { + "epoch": 9.229372937293729, + "grad_norm": 0.05208968743681908, + "learning_rate": 3.260389030601841e-05, + "loss": 0.0307, + "num_input_tokens_seen": 17705952, + "step": 83895 + }, + { + "epoch": 9.22992299229923, + "grad_norm": 0.4411156177520752, + "learning_rate": 3.2601603912794266e-05, + "loss": 0.0234, + "num_input_tokens_seen": 17707104, + "step": 83900 + }, + { + "epoch": 9.23047304730473, + "grad_norm": 0.5921734571456909, + "learning_rate": 3.259931744950979e-05, + "loss": 0.0118, + "num_input_tokens_seen": 17708160, + "step": 83905 + }, + { + "epoch": 9.231023102310232, + "grad_norm": 0.45294204354286194, + "learning_rate": 3.259703091618605e-05, + "loss": 0.0215, + "num_input_tokens_seen": 17709184, + "step": 83910 + }, + { + "epoch": 9.231573157315731, + "grad_norm": 0.04693060740828514, + "learning_rate": 3.259474431284412e-05, + "loss": 0.0194, + "num_input_tokens_seen": 17710208, + "step": 83915 + }, + { + "epoch": 9.232123212321232, + "grad_norm": 0.023090502247214317, + "learning_rate": 3.2592457639505083e-05, + "loss": 0.0316, + "num_input_tokens_seen": 17711296, + "step": 83920 + }, + { + "epoch": 9.232673267326733, + "grad_norm": 0.07859771698713303, + "learning_rate": 3.259017089619e-05, + "loss": 0.007, + "num_input_tokens_seen": 17712416, + "step": 83925 + }, + { + "epoch": 9.233223322332233, + "grad_norm": 0.011369533836841583, + "learning_rate": 3.2587884082919955e-05, + "loss": 0.0132, + "num_input_tokens_seen": 17713440, + "step": 83930 + }, + { + "epoch": 9.233773377337734, + "grad_norm": 0.03404267504811287, + "learning_rate": 3.258559719971603e-05, + "loss": 0.0724, + "num_input_tokens_seen": 17714528, + "step": 83935 + }, + { + "epoch": 9.234323432343235, + "grad_norm": 0.011877351440489292, + "learning_rate": 3.258331024659928e-05, + "loss": 0.0115, + "num_input_tokens_seen": 17715616, + "step": 83940 + }, + { + "epoch": 9.234873487348734, + "grad_norm": 0.577052652835846, + "learning_rate": 3.2581023223590814e-05, + "loss": 0.0128, + "num_input_tokens_seen": 17716704, + "step": 83945 + }, + { + "epoch": 9.235423542354235, + "grad_norm": 0.05318447947502136, + "learning_rate": 3.2578736130711684e-05, + "loss": 0.0042, + "num_input_tokens_seen": 17717728, + "step": 83950 + }, + { + "epoch": 9.235973597359736, + "grad_norm": 0.07651949673891068, + "learning_rate": 3.2576448967982984e-05, + "loss": 0.1172, + "num_input_tokens_seen": 17718784, + "step": 83955 + }, + { + "epoch": 9.236523652365236, + "grad_norm": 0.012019779533147812, + "learning_rate": 3.2574161735425796e-05, + "loss": 0.002, + "num_input_tokens_seen": 17719840, + "step": 83960 + }, + { + "epoch": 9.237073707370737, + "grad_norm": 0.10229241102933884, + "learning_rate": 3.257187443306118e-05, + "loss": 0.0106, + "num_input_tokens_seen": 17720896, + "step": 83965 + }, + { + "epoch": 9.237623762376238, + "grad_norm": 0.7770996689796448, + "learning_rate": 3.256958706091024e-05, + "loss": 0.015, + "num_input_tokens_seen": 17721952, + "step": 83970 + }, + { + "epoch": 9.238173817381739, + "grad_norm": 0.007697508670389652, + "learning_rate": 3.256729961899405e-05, + "loss": 0.01, + "num_input_tokens_seen": 17723040, + "step": 83975 + }, + { + "epoch": 9.238723872387238, + "grad_norm": 0.9507391452789307, + "learning_rate": 3.256501210733368e-05, + "loss": 0.0239, + "num_input_tokens_seen": 17724096, + "step": 83980 + }, + { + "epoch": 9.23927392739274, + "grad_norm": 0.023320527747273445, + "learning_rate": 3.256272452595023e-05, + "loss": 0.0317, + "num_input_tokens_seen": 17725184, + "step": 83985 + }, + { + "epoch": 9.23982398239824, + "grad_norm": 0.23734115064144135, + "learning_rate": 3.256043687486479e-05, + "loss": 0.029, + "num_input_tokens_seen": 17726272, + "step": 83990 + }, + { + "epoch": 9.24037403740374, + "grad_norm": 0.009142708033323288, + "learning_rate": 3.255814915409841e-05, + "loss": 0.0124, + "num_input_tokens_seen": 17727328, + "step": 83995 + }, + { + "epoch": 9.24092409240924, + "grad_norm": 0.013621115125715733, + "learning_rate": 3.2555861363672205e-05, + "loss": 0.003, + "num_input_tokens_seen": 17728352, + "step": 84000 + }, + { + "epoch": 9.241474147414742, + "grad_norm": 0.697983980178833, + "learning_rate": 3.255357350360725e-05, + "loss": 0.0087, + "num_input_tokens_seen": 17729408, + "step": 84005 + }, + { + "epoch": 9.242024202420241, + "grad_norm": 1.5288840532302856, + "learning_rate": 3.2551285573924636e-05, + "loss": 0.115, + "num_input_tokens_seen": 17730400, + "step": 84010 + }, + { + "epoch": 9.242574257425742, + "grad_norm": 0.19237129390239716, + "learning_rate": 3.254899757464545e-05, + "loss": 0.0045, + "num_input_tokens_seen": 17731488, + "step": 84015 + }, + { + "epoch": 9.243124312431243, + "grad_norm": 0.7078284025192261, + "learning_rate": 3.2546709505790765e-05, + "loss": 0.1036, + "num_input_tokens_seen": 17732480, + "step": 84020 + }, + { + "epoch": 9.243674367436745, + "grad_norm": 0.39903348684310913, + "learning_rate": 3.2544421367381686e-05, + "loss": 0.0905, + "num_input_tokens_seen": 17733504, + "step": 84025 + }, + { + "epoch": 9.244224422442244, + "grad_norm": 0.03222096338868141, + "learning_rate": 3.2542133159439294e-05, + "loss": 0.0076, + "num_input_tokens_seen": 17734560, + "step": 84030 + }, + { + "epoch": 9.244774477447745, + "grad_norm": 0.023194393143057823, + "learning_rate": 3.2539844881984674e-05, + "loss": 0.0434, + "num_input_tokens_seen": 17735648, + "step": 84035 + }, + { + "epoch": 9.245324532453246, + "grad_norm": 0.06822782009840012, + "learning_rate": 3.253755653503893e-05, + "loss": 0.008, + "num_input_tokens_seen": 17736672, + "step": 84040 + }, + { + "epoch": 9.245874587458745, + "grad_norm": 0.004714827984571457, + "learning_rate": 3.253526811862314e-05, + "loss": 0.0136, + "num_input_tokens_seen": 17737792, + "step": 84045 + }, + { + "epoch": 9.246424642464246, + "grad_norm": 0.013426641933619976, + "learning_rate": 3.253297963275841e-05, + "loss": 0.0135, + "num_input_tokens_seen": 17738848, + "step": 84050 + }, + { + "epoch": 9.246974697469748, + "grad_norm": 0.02629159763455391, + "learning_rate": 3.2530691077465804e-05, + "loss": 0.1885, + "num_input_tokens_seen": 17739904, + "step": 84055 + }, + { + "epoch": 9.247524752475247, + "grad_norm": 0.008176122792065144, + "learning_rate": 3.2528402452766436e-05, + "loss": 0.0043, + "num_input_tokens_seen": 17740896, + "step": 84060 + }, + { + "epoch": 9.248074807480748, + "grad_norm": 0.01907394453883171, + "learning_rate": 3.25261137586814e-05, + "loss": 0.0463, + "num_input_tokens_seen": 17741920, + "step": 84065 + }, + { + "epoch": 9.248624862486249, + "grad_norm": 0.016295073553919792, + "learning_rate": 3.252382499523179e-05, + "loss": 0.0117, + "num_input_tokens_seen": 17742944, + "step": 84070 + }, + { + "epoch": 9.249174917491748, + "grad_norm": 0.0992501899600029, + "learning_rate": 3.2521536162438695e-05, + "loss": 0.0083, + "num_input_tokens_seen": 17744000, + "step": 84075 + }, + { + "epoch": 9.24972497249725, + "grad_norm": 0.07167519629001617, + "learning_rate": 3.2519247260323203e-05, + "loss": 0.0083, + "num_input_tokens_seen": 17745088, + "step": 84080 + }, + { + "epoch": 9.25027502750275, + "grad_norm": 0.016163060441613197, + "learning_rate": 3.251695828890641e-05, + "loss": 0.0056, + "num_input_tokens_seen": 17746112, + "step": 84085 + }, + { + "epoch": 9.250825082508252, + "grad_norm": 0.11263903975486755, + "learning_rate": 3.2514669248209437e-05, + "loss": 0.0399, + "num_input_tokens_seen": 17747168, + "step": 84090 + }, + { + "epoch": 9.251375137513751, + "grad_norm": 0.02368089370429516, + "learning_rate": 3.251238013825335e-05, + "loss": 0.0143, + "num_input_tokens_seen": 17748256, + "step": 84095 + }, + { + "epoch": 9.251925192519252, + "grad_norm": 0.017369473353028297, + "learning_rate": 3.251009095905927e-05, + "loss": 0.0088, + "num_input_tokens_seen": 17749376, + "step": 84100 + }, + { + "epoch": 9.252475247524753, + "grad_norm": 0.007617889903485775, + "learning_rate": 3.250780171064828e-05, + "loss": 0.0952, + "num_input_tokens_seen": 17750528, + "step": 84105 + }, + { + "epoch": 9.253025302530252, + "grad_norm": 0.1767856925725937, + "learning_rate": 3.2505512393041484e-05, + "loss": 0.0382, + "num_input_tokens_seen": 17751552, + "step": 84110 + }, + { + "epoch": 9.253575357535754, + "grad_norm": 0.03760726377367973, + "learning_rate": 3.2503223006259974e-05, + "loss": 0.083, + "num_input_tokens_seen": 17752576, + "step": 84115 + }, + { + "epoch": 9.254125412541255, + "grad_norm": 0.4036818742752075, + "learning_rate": 3.250093355032488e-05, + "loss": 0.0054, + "num_input_tokens_seen": 17753632, + "step": 84120 + }, + { + "epoch": 9.254675467546754, + "grad_norm": 0.04412250593304634, + "learning_rate": 3.2498644025257257e-05, + "loss": 0.0113, + "num_input_tokens_seen": 17754688, + "step": 84125 + }, + { + "epoch": 9.255225522552255, + "grad_norm": 1.1609441041946411, + "learning_rate": 3.249635443107824e-05, + "loss": 0.0372, + "num_input_tokens_seen": 17755744, + "step": 84130 + }, + { + "epoch": 9.255775577557756, + "grad_norm": 0.5415851473808289, + "learning_rate": 3.2494064767808925e-05, + "loss": 0.0087, + "num_input_tokens_seen": 17756832, + "step": 84135 + }, + { + "epoch": 9.256325632563255, + "grad_norm": 0.007721336092799902, + "learning_rate": 3.2491775035470404e-05, + "loss": 0.0617, + "num_input_tokens_seen": 17757792, + "step": 84140 + }, + { + "epoch": 9.256875687568757, + "grad_norm": 2.5329742431640625, + "learning_rate": 3.2489485234083796e-05, + "loss": 0.0563, + "num_input_tokens_seen": 17758944, + "step": 84145 + }, + { + "epoch": 9.257425742574258, + "grad_norm": 0.038171540945768356, + "learning_rate": 3.248719536367019e-05, + "loss": 0.1805, + "num_input_tokens_seen": 17760000, + "step": 84150 + }, + { + "epoch": 9.257975797579759, + "grad_norm": 1.2826528549194336, + "learning_rate": 3.2484905424250694e-05, + "loss": 0.0201, + "num_input_tokens_seen": 17761056, + "step": 84155 + }, + { + "epoch": 9.258525852585258, + "grad_norm": 0.027373841032385826, + "learning_rate": 3.248261541584642e-05, + "loss": 0.0064, + "num_input_tokens_seen": 17762048, + "step": 84160 + }, + { + "epoch": 9.25907590759076, + "grad_norm": 0.9245233535766602, + "learning_rate": 3.248032533847848e-05, + "loss": 0.1789, + "num_input_tokens_seen": 17763072, + "step": 84165 + }, + { + "epoch": 9.25962596259626, + "grad_norm": 7.236125946044922, + "learning_rate": 3.2478035192167956e-05, + "loss": 0.1031, + "num_input_tokens_seen": 17764160, + "step": 84170 + }, + { + "epoch": 9.26017601760176, + "grad_norm": 0.0317518413066864, + "learning_rate": 3.247574497693597e-05, + "loss": 0.0823, + "num_input_tokens_seen": 17765216, + "step": 84175 + }, + { + "epoch": 9.26072607260726, + "grad_norm": 0.17411479353904724, + "learning_rate": 3.247345469280364e-05, + "loss": 0.0735, + "num_input_tokens_seen": 17766304, + "step": 84180 + }, + { + "epoch": 9.261276127612762, + "grad_norm": 0.03852854669094086, + "learning_rate": 3.247116433979206e-05, + "loss": 0.0328, + "num_input_tokens_seen": 17767360, + "step": 84185 + }, + { + "epoch": 9.261826182618261, + "grad_norm": 1.1473592519760132, + "learning_rate": 3.246887391792235e-05, + "loss": 0.0817, + "num_input_tokens_seen": 17768448, + "step": 84190 + }, + { + "epoch": 9.262376237623762, + "grad_norm": 0.6358537673950195, + "learning_rate": 3.24665834272156e-05, + "loss": 0.0183, + "num_input_tokens_seen": 17769504, + "step": 84195 + }, + { + "epoch": 9.262926292629263, + "grad_norm": 0.045667145401239395, + "learning_rate": 3.246429286769293e-05, + "loss": 0.0634, + "num_input_tokens_seen": 17770560, + "step": 84200 + }, + { + "epoch": 9.263476347634764, + "grad_norm": 0.37361451983451843, + "learning_rate": 3.246200223937547e-05, + "loss": 0.0528, + "num_input_tokens_seen": 17771680, + "step": 84205 + }, + { + "epoch": 9.264026402640264, + "grad_norm": 0.5987895727157593, + "learning_rate": 3.245971154228431e-05, + "loss": 0.021, + "num_input_tokens_seen": 17772768, + "step": 84210 + }, + { + "epoch": 9.264576457645765, + "grad_norm": 0.014322303235530853, + "learning_rate": 3.245742077644057e-05, + "loss": 0.0729, + "num_input_tokens_seen": 17773824, + "step": 84215 + }, + { + "epoch": 9.265126512651266, + "grad_norm": 0.3461485207080841, + "learning_rate": 3.245512994186536e-05, + "loss": 0.0057, + "num_input_tokens_seen": 17774880, + "step": 84220 + }, + { + "epoch": 9.265676567656765, + "grad_norm": 0.41733068227767944, + "learning_rate": 3.24528390385798e-05, + "loss": 0.0938, + "num_input_tokens_seen": 17776000, + "step": 84225 + }, + { + "epoch": 9.266226622662266, + "grad_norm": 0.04693623632192612, + "learning_rate": 3.245054806660499e-05, + "loss": 0.1078, + "num_input_tokens_seen": 17777024, + "step": 84230 + }, + { + "epoch": 9.266776677667767, + "grad_norm": 1.147627353668213, + "learning_rate": 3.244825702596205e-05, + "loss": 0.0336, + "num_input_tokens_seen": 17778048, + "step": 84235 + }, + { + "epoch": 9.267326732673267, + "grad_norm": 0.038443826138973236, + "learning_rate": 3.244596591667211e-05, + "loss": 0.0049, + "num_input_tokens_seen": 17779008, + "step": 84240 + }, + { + "epoch": 9.267876787678768, + "grad_norm": 0.17654117941856384, + "learning_rate": 3.244367473875627e-05, + "loss": 0.0101, + "num_input_tokens_seen": 17780032, + "step": 84245 + }, + { + "epoch": 9.268426842684269, + "grad_norm": 0.6929320096969604, + "learning_rate": 3.244138349223565e-05, + "loss": 0.0454, + "num_input_tokens_seen": 17781088, + "step": 84250 + }, + { + "epoch": 9.268976897689768, + "grad_norm": 0.39881080389022827, + "learning_rate": 3.243909217713137e-05, + "loss": 0.0171, + "num_input_tokens_seen": 17782112, + "step": 84255 + }, + { + "epoch": 9.26952695269527, + "grad_norm": 0.07163599878549576, + "learning_rate": 3.243680079346455e-05, + "loss": 0.0333, + "num_input_tokens_seen": 17783136, + "step": 84260 + }, + { + "epoch": 9.27007700770077, + "grad_norm": 0.22451993823051453, + "learning_rate": 3.24345093412563e-05, + "loss": 0.0192, + "num_input_tokens_seen": 17784192, + "step": 84265 + }, + { + "epoch": 9.270627062706271, + "grad_norm": 0.10870729386806488, + "learning_rate": 3.243221782052775e-05, + "loss": 0.004, + "num_input_tokens_seen": 17785248, + "step": 84270 + }, + { + "epoch": 9.27117711771177, + "grad_norm": 1.13564133644104, + "learning_rate": 3.242992623130001e-05, + "loss": 0.0628, + "num_input_tokens_seen": 17786304, + "step": 84275 + }, + { + "epoch": 9.271727172717272, + "grad_norm": 0.31040674448013306, + "learning_rate": 3.242763457359421e-05, + "loss": 0.0279, + "num_input_tokens_seen": 17787392, + "step": 84280 + }, + { + "epoch": 9.272277227722773, + "grad_norm": 0.042697545140981674, + "learning_rate": 3.2425342847431464e-05, + "loss": 0.0119, + "num_input_tokens_seen": 17788416, + "step": 84285 + }, + { + "epoch": 9.272827282728272, + "grad_norm": 0.01078131515532732, + "learning_rate": 3.24230510528329e-05, + "loss": 0.0085, + "num_input_tokens_seen": 17789568, + "step": 84290 + }, + { + "epoch": 9.273377337733773, + "grad_norm": 0.3744588792324066, + "learning_rate": 3.242075918981963e-05, + "loss": 0.0091, + "num_input_tokens_seen": 17790656, + "step": 84295 + }, + { + "epoch": 9.273927392739274, + "grad_norm": 0.015420658513903618, + "learning_rate": 3.241846725841279e-05, + "loss": 0.0204, + "num_input_tokens_seen": 17791776, + "step": 84300 + }, + { + "epoch": 9.274477447744774, + "grad_norm": 0.01500534825026989, + "learning_rate": 3.241617525863349e-05, + "loss": 0.0191, + "num_input_tokens_seen": 17792800, + "step": 84305 + }, + { + "epoch": 9.275027502750275, + "grad_norm": 0.5244342088699341, + "learning_rate": 3.241388319050287e-05, + "loss": 0.1198, + "num_input_tokens_seen": 17793792, + "step": 84310 + }, + { + "epoch": 9.275577557755776, + "grad_norm": 0.3529583811759949, + "learning_rate": 3.241159105404203e-05, + "loss": 0.0265, + "num_input_tokens_seen": 17794848, + "step": 84315 + }, + { + "epoch": 9.276127612761275, + "grad_norm": 0.4827617406845093, + "learning_rate": 3.240929884927213e-05, + "loss": 0.025, + "num_input_tokens_seen": 17795936, + "step": 84320 + }, + { + "epoch": 9.276677667766776, + "grad_norm": 0.944393515586853, + "learning_rate": 3.2407006576214274e-05, + "loss": 0.0773, + "num_input_tokens_seen": 17796928, + "step": 84325 + }, + { + "epoch": 9.277227722772277, + "grad_norm": 0.038963682949543, + "learning_rate": 3.240471423488958e-05, + "loss": 0.0048, + "num_input_tokens_seen": 17798080, + "step": 84330 + }, + { + "epoch": 9.277777777777779, + "grad_norm": 0.011513813398778439, + "learning_rate": 3.24024218253192e-05, + "loss": 0.0508, + "num_input_tokens_seen": 17799168, + "step": 84335 + }, + { + "epoch": 9.278327832783278, + "grad_norm": 0.06767912954092026, + "learning_rate": 3.240012934752425e-05, + "loss": 0.0191, + "num_input_tokens_seen": 17800160, + "step": 84340 + }, + { + "epoch": 9.278877887788779, + "grad_norm": 1.160835862159729, + "learning_rate": 3.2397836801525847e-05, + "loss": 0.0813, + "num_input_tokens_seen": 17801216, + "step": 84345 + }, + { + "epoch": 9.27942794279428, + "grad_norm": 0.18325404822826385, + "learning_rate": 3.239554418734515e-05, + "loss": 0.0331, + "num_input_tokens_seen": 17802336, + "step": 84350 + }, + { + "epoch": 9.27997799779978, + "grad_norm": 1.4305731058120728, + "learning_rate": 3.2393251505003265e-05, + "loss": 0.0735, + "num_input_tokens_seen": 17803360, + "step": 84355 + }, + { + "epoch": 9.28052805280528, + "grad_norm": 0.08210454881191254, + "learning_rate": 3.239095875452132e-05, + "loss": 0.0575, + "num_input_tokens_seen": 17804416, + "step": 84360 + }, + { + "epoch": 9.281078107810782, + "grad_norm": 0.07184679806232452, + "learning_rate": 3.238866593592046e-05, + "loss": 0.0262, + "num_input_tokens_seen": 17805472, + "step": 84365 + }, + { + "epoch": 9.281628162816281, + "grad_norm": 0.008129909634590149, + "learning_rate": 3.2386373049221815e-05, + "loss": 0.0402, + "num_input_tokens_seen": 17806528, + "step": 84370 + }, + { + "epoch": 9.282178217821782, + "grad_norm": 0.16779501736164093, + "learning_rate": 3.2384080094446504e-05, + "loss": 0.0058, + "num_input_tokens_seen": 17807584, + "step": 84375 + }, + { + "epoch": 9.282728272827283, + "grad_norm": 0.5281998515129089, + "learning_rate": 3.238178707161568e-05, + "loss": 0.0231, + "num_input_tokens_seen": 17808576, + "step": 84380 + }, + { + "epoch": 9.283278327832782, + "grad_norm": 0.0047174980863928795, + "learning_rate": 3.2379493980750466e-05, + "loss": 0.0527, + "num_input_tokens_seen": 17809664, + "step": 84385 + }, + { + "epoch": 9.283828382838283, + "grad_norm": 1.6355286836624146, + "learning_rate": 3.237720082187199e-05, + "loss": 0.0602, + "num_input_tokens_seen": 17810752, + "step": 84390 + }, + { + "epoch": 9.284378437843785, + "grad_norm": 0.08602950721979141, + "learning_rate": 3.23749075950014e-05, + "loss": 0.0464, + "num_input_tokens_seen": 17811776, + "step": 84395 + }, + { + "epoch": 9.284928492849286, + "grad_norm": 1.8147135972976685, + "learning_rate": 3.237261430015982e-05, + "loss": 0.0232, + "num_input_tokens_seen": 17812800, + "step": 84400 + }, + { + "epoch": 9.285478547854785, + "grad_norm": 0.21895888447761536, + "learning_rate": 3.23703209373684e-05, + "loss": 0.0063, + "num_input_tokens_seen": 17813824, + "step": 84405 + }, + { + "epoch": 9.286028602860286, + "grad_norm": 0.012674323283135891, + "learning_rate": 3.2368027506648264e-05, + "loss": 0.158, + "num_input_tokens_seen": 17814944, + "step": 84410 + }, + { + "epoch": 9.286578657865787, + "grad_norm": 0.014585491269826889, + "learning_rate": 3.2365734008020555e-05, + "loss": 0.0082, + "num_input_tokens_seen": 17816032, + "step": 84415 + }, + { + "epoch": 9.287128712871286, + "grad_norm": 0.31003981828689575, + "learning_rate": 3.23634404415064e-05, + "loss": 0.005, + "num_input_tokens_seen": 17817056, + "step": 84420 + }, + { + "epoch": 9.287678767876788, + "grad_norm": 0.04558207467198372, + "learning_rate": 3.236114680712696e-05, + "loss": 0.0127, + "num_input_tokens_seen": 17818048, + "step": 84425 + }, + { + "epoch": 9.288228822882289, + "grad_norm": 0.051551416516304016, + "learning_rate": 3.235885310490336e-05, + "loss": 0.0025, + "num_input_tokens_seen": 17819072, + "step": 84430 + }, + { + "epoch": 9.288778877887788, + "grad_norm": 0.11407404392957687, + "learning_rate": 3.235655933485674e-05, + "loss": 0.0119, + "num_input_tokens_seen": 17820096, + "step": 84435 + }, + { + "epoch": 9.289328932893289, + "grad_norm": 0.031470511108636856, + "learning_rate": 3.235426549700824e-05, + "loss": 0.068, + "num_input_tokens_seen": 17821120, + "step": 84440 + }, + { + "epoch": 9.28987898789879, + "grad_norm": 0.013641035184264183, + "learning_rate": 3.235197159137901e-05, + "loss": 0.0133, + "num_input_tokens_seen": 17822144, + "step": 84445 + }, + { + "epoch": 9.290429042904291, + "grad_norm": 0.04417509585618973, + "learning_rate": 3.234967761799018e-05, + "loss": 0.0293, + "num_input_tokens_seen": 17823232, + "step": 84450 + }, + { + "epoch": 9.29097909790979, + "grad_norm": 1.6283286809921265, + "learning_rate": 3.2347383576862907e-05, + "loss": 0.0322, + "num_input_tokens_seen": 17824256, + "step": 84455 + }, + { + "epoch": 9.291529152915292, + "grad_norm": 0.012859055772423744, + "learning_rate": 3.234508946801831e-05, + "loss": 0.0096, + "num_input_tokens_seen": 17825344, + "step": 84460 + }, + { + "epoch": 9.292079207920793, + "grad_norm": 0.48703494668006897, + "learning_rate": 3.234279529147756e-05, + "loss": 0.0167, + "num_input_tokens_seen": 17826336, + "step": 84465 + }, + { + "epoch": 9.292629262926292, + "grad_norm": 0.07071700692176819, + "learning_rate": 3.234050104726179e-05, + "loss": 0.0173, + "num_input_tokens_seen": 17827360, + "step": 84470 + }, + { + "epoch": 9.293179317931793, + "grad_norm": 0.3559115529060364, + "learning_rate": 3.233820673539214e-05, + "loss": 0.0145, + "num_input_tokens_seen": 17828384, + "step": 84475 + }, + { + "epoch": 9.293729372937294, + "grad_norm": 0.02086290903389454, + "learning_rate": 3.233591235588976e-05, + "loss": 0.006, + "num_input_tokens_seen": 17829440, + "step": 84480 + }, + { + "epoch": 9.294279427942794, + "grad_norm": 0.0018783232662826777, + "learning_rate": 3.23336179087758e-05, + "loss": 0.0202, + "num_input_tokens_seen": 17830528, + "step": 84485 + }, + { + "epoch": 9.294829482948295, + "grad_norm": 0.3833808898925781, + "learning_rate": 3.233132339407139e-05, + "loss": 0.0675, + "num_input_tokens_seen": 17831552, + "step": 84490 + }, + { + "epoch": 9.295379537953796, + "grad_norm": 0.0524030365049839, + "learning_rate": 3.23290288117977e-05, + "loss": 0.0099, + "num_input_tokens_seen": 17832640, + "step": 84495 + }, + { + "epoch": 9.295929592959295, + "grad_norm": 0.26295456290245056, + "learning_rate": 3.232673416197588e-05, + "loss": 0.0173, + "num_input_tokens_seen": 17833664, + "step": 84500 + }, + { + "epoch": 9.296479647964796, + "grad_norm": 0.12168747186660767, + "learning_rate": 3.232443944462704e-05, + "loss": 0.0055, + "num_input_tokens_seen": 17834752, + "step": 84505 + }, + { + "epoch": 9.297029702970297, + "grad_norm": 0.05060910806059837, + "learning_rate": 3.2322144659772374e-05, + "loss": 0.0078, + "num_input_tokens_seen": 17835840, + "step": 84510 + }, + { + "epoch": 9.297579757975798, + "grad_norm": 0.013598868623375893, + "learning_rate": 3.2319849807433014e-05, + "loss": 0.0548, + "num_input_tokens_seen": 17836832, + "step": 84515 + }, + { + "epoch": 9.298129812981298, + "grad_norm": 0.9220375418663025, + "learning_rate": 3.2317554887630096e-05, + "loss": 0.0299, + "num_input_tokens_seen": 17837824, + "step": 84520 + }, + { + "epoch": 9.298679867986799, + "grad_norm": 0.007473624311387539, + "learning_rate": 3.231525990038481e-05, + "loss": 0.0042, + "num_input_tokens_seen": 17838784, + "step": 84525 + }, + { + "epoch": 9.2992299229923, + "grad_norm": 0.11957547813653946, + "learning_rate": 3.2312964845718266e-05, + "loss": 0.0541, + "num_input_tokens_seen": 17839840, + "step": 84530 + }, + { + "epoch": 9.2997799779978, + "grad_norm": 0.03770443797111511, + "learning_rate": 3.231066972365164e-05, + "loss": 0.0384, + "num_input_tokens_seen": 17840896, + "step": 84535 + }, + { + "epoch": 9.3003300330033, + "grad_norm": 0.03630802780389786, + "learning_rate": 3.2308374534206075e-05, + "loss": 0.0185, + "num_input_tokens_seen": 17842016, + "step": 84540 + }, + { + "epoch": 9.300880088008801, + "grad_norm": 0.038093723356723785, + "learning_rate": 3.2306079277402736e-05, + "loss": 0.0965, + "num_input_tokens_seen": 17843104, + "step": 84545 + }, + { + "epoch": 9.3014301430143, + "grad_norm": 0.019997458904981613, + "learning_rate": 3.2303783953262765e-05, + "loss": 0.0627, + "num_input_tokens_seen": 17844224, + "step": 84550 + }, + { + "epoch": 9.301980198019802, + "grad_norm": 0.2480689436197281, + "learning_rate": 3.2301488561807324e-05, + "loss": 0.0114, + "num_input_tokens_seen": 17845248, + "step": 84555 + }, + { + "epoch": 9.302530253025303, + "grad_norm": 0.008795857429504395, + "learning_rate": 3.229919310305757e-05, + "loss": 0.0016, + "num_input_tokens_seen": 17846336, + "step": 84560 + }, + { + "epoch": 9.303080308030804, + "grad_norm": 0.07639532536268234, + "learning_rate": 3.229689757703465e-05, + "loss": 0.0029, + "num_input_tokens_seen": 17847456, + "step": 84565 + }, + { + "epoch": 9.303630363036303, + "grad_norm": 0.11145813018083572, + "learning_rate": 3.229460198375973e-05, + "loss": 0.0102, + "num_input_tokens_seen": 17848512, + "step": 84570 + }, + { + "epoch": 9.304180418041804, + "grad_norm": 0.11958182603120804, + "learning_rate": 3.229230632325395e-05, + "loss": 0.0629, + "num_input_tokens_seen": 17849568, + "step": 84575 + }, + { + "epoch": 9.304730473047305, + "grad_norm": 0.010293245315551758, + "learning_rate": 3.22900105955385e-05, + "loss": 0.0268, + "num_input_tokens_seen": 17850656, + "step": 84580 + }, + { + "epoch": 9.305280528052805, + "grad_norm": 0.1278316080570221, + "learning_rate": 3.228771480063451e-05, + "loss": 0.0556, + "num_input_tokens_seen": 17851712, + "step": 84585 + }, + { + "epoch": 9.305830583058306, + "grad_norm": 0.11639579385519028, + "learning_rate": 3.2285418938563155e-05, + "loss": 0.0025, + "num_input_tokens_seen": 17852800, + "step": 84590 + }, + { + "epoch": 9.306380638063807, + "grad_norm": 0.04944611340761185, + "learning_rate": 3.228312300934559e-05, + "loss": 0.1553, + "num_input_tokens_seen": 17853856, + "step": 84595 + }, + { + "epoch": 9.306930693069306, + "grad_norm": 0.08094210922718048, + "learning_rate": 3.228082701300297e-05, + "loss": 0.0064, + "num_input_tokens_seen": 17854944, + "step": 84600 + }, + { + "epoch": 9.307480748074807, + "grad_norm": 0.025704024359583855, + "learning_rate": 3.227853094955646e-05, + "loss": 0.0039, + "num_input_tokens_seen": 17856032, + "step": 84605 + }, + { + "epoch": 9.308030803080309, + "grad_norm": 0.014880074188113213, + "learning_rate": 3.227623481902723e-05, + "loss": 0.0617, + "num_input_tokens_seen": 17857024, + "step": 84610 + }, + { + "epoch": 9.308580858085808, + "grad_norm": 0.2226734459400177, + "learning_rate": 3.2273938621436435e-05, + "loss": 0.0055, + "num_input_tokens_seen": 17858144, + "step": 84615 + }, + { + "epoch": 9.309130913091309, + "grad_norm": 1.3666027784347534, + "learning_rate": 3.227164235680523e-05, + "loss": 0.1066, + "num_input_tokens_seen": 17859200, + "step": 84620 + }, + { + "epoch": 9.30968096809681, + "grad_norm": 0.10157950222492218, + "learning_rate": 3.22693460251548e-05, + "loss": 0.0698, + "num_input_tokens_seen": 17860288, + "step": 84625 + }, + { + "epoch": 9.310231023102311, + "grad_norm": 0.033899690955877304, + "learning_rate": 3.226704962650629e-05, + "loss": 0.0915, + "num_input_tokens_seen": 17861312, + "step": 84630 + }, + { + "epoch": 9.31078107810781, + "grad_norm": 0.023973383009433746, + "learning_rate": 3.226475316088086e-05, + "loss": 0.0353, + "num_input_tokens_seen": 17862368, + "step": 84635 + }, + { + "epoch": 9.311331133113312, + "grad_norm": 0.02439921163022518, + "learning_rate": 3.22624566282997e-05, + "loss": 0.0109, + "num_input_tokens_seen": 17863392, + "step": 84640 + }, + { + "epoch": 9.311881188118813, + "grad_norm": 0.08975294232368469, + "learning_rate": 3.226016002878396e-05, + "loss": 0.0134, + "num_input_tokens_seen": 17864512, + "step": 84645 + }, + { + "epoch": 9.312431243124312, + "grad_norm": 0.019438521936535835, + "learning_rate": 3.225786336235479e-05, + "loss": 0.0783, + "num_input_tokens_seen": 17865600, + "step": 84650 + }, + { + "epoch": 9.312981298129813, + "grad_norm": 0.04104485362768173, + "learning_rate": 3.22555666290334e-05, + "loss": 0.0773, + "num_input_tokens_seen": 17866656, + "step": 84655 + }, + { + "epoch": 9.313531353135314, + "grad_norm": 0.02045419067144394, + "learning_rate": 3.225326982884093e-05, + "loss": 0.0573, + "num_input_tokens_seen": 17867680, + "step": 84660 + }, + { + "epoch": 9.314081408140813, + "grad_norm": 0.02117406390607357, + "learning_rate": 3.225097296179854e-05, + "loss": 0.0024, + "num_input_tokens_seen": 17868768, + "step": 84665 + }, + { + "epoch": 9.314631463146315, + "grad_norm": 1.3254598379135132, + "learning_rate": 3.2248676027927424e-05, + "loss": 0.0244, + "num_input_tokens_seen": 17869760, + "step": 84670 + }, + { + "epoch": 9.315181518151816, + "grad_norm": 0.1712150275707245, + "learning_rate": 3.224637902724874e-05, + "loss": 0.0098, + "num_input_tokens_seen": 17870784, + "step": 84675 + }, + { + "epoch": 9.315731573157315, + "grad_norm": 0.11417299509048462, + "learning_rate": 3.2244081959783644e-05, + "loss": 0.0144, + "num_input_tokens_seen": 17871808, + "step": 84680 + }, + { + "epoch": 9.316281628162816, + "grad_norm": 0.10521993786096573, + "learning_rate": 3.2241784825553326e-05, + "loss": 0.0178, + "num_input_tokens_seen": 17872896, + "step": 84685 + }, + { + "epoch": 9.316831683168317, + "grad_norm": 0.039642173796892166, + "learning_rate": 3.223948762457896e-05, + "loss": 0.0633, + "num_input_tokens_seen": 17873952, + "step": 84690 + }, + { + "epoch": 9.317381738173818, + "grad_norm": 0.02449057810008526, + "learning_rate": 3.22371903568817e-05, + "loss": 0.0036, + "num_input_tokens_seen": 17875072, + "step": 84695 + }, + { + "epoch": 9.317931793179318, + "grad_norm": 0.16026867926120758, + "learning_rate": 3.2234893022482734e-05, + "loss": 0.0184, + "num_input_tokens_seen": 17876160, + "step": 84700 + }, + { + "epoch": 9.318481848184819, + "grad_norm": 0.054834865033626556, + "learning_rate": 3.223259562140324e-05, + "loss": 0.0122, + "num_input_tokens_seen": 17877248, + "step": 84705 + }, + { + "epoch": 9.31903190319032, + "grad_norm": 0.588992178440094, + "learning_rate": 3.223029815366437e-05, + "loss": 0.0883, + "num_input_tokens_seen": 17878272, + "step": 84710 + }, + { + "epoch": 9.319581958195819, + "grad_norm": 0.021564681082963943, + "learning_rate": 3.222800061928732e-05, + "loss": 0.0067, + "num_input_tokens_seen": 17879296, + "step": 84715 + }, + { + "epoch": 9.32013201320132, + "grad_norm": 0.010596396401524544, + "learning_rate": 3.222570301829325e-05, + "loss": 0.0052, + "num_input_tokens_seen": 17880320, + "step": 84720 + }, + { + "epoch": 9.320682068206821, + "grad_norm": 1.2572815418243408, + "learning_rate": 3.222340535070334e-05, + "loss": 0.0789, + "num_input_tokens_seen": 17881440, + "step": 84725 + }, + { + "epoch": 9.32123212321232, + "grad_norm": 0.010866782627999783, + "learning_rate": 3.222110761653878e-05, + "loss": 0.0075, + "num_input_tokens_seen": 17882528, + "step": 84730 + }, + { + "epoch": 9.321782178217822, + "grad_norm": 0.20586808025836945, + "learning_rate": 3.221880981582073e-05, + "loss": 0.0617, + "num_input_tokens_seen": 17883520, + "step": 84735 + }, + { + "epoch": 9.322332233223323, + "grad_norm": 0.015637526288628578, + "learning_rate": 3.2216511948570374e-05, + "loss": 0.0096, + "num_input_tokens_seen": 17884544, + "step": 84740 + }, + { + "epoch": 9.322882288228822, + "grad_norm": 0.05529433861374855, + "learning_rate": 3.2214214014808886e-05, + "loss": 0.0124, + "num_input_tokens_seen": 17885568, + "step": 84745 + }, + { + "epoch": 9.323432343234323, + "grad_norm": 0.03417232632637024, + "learning_rate": 3.221191601455745e-05, + "loss": 0.0039, + "num_input_tokens_seen": 17886624, + "step": 84750 + }, + { + "epoch": 9.323982398239824, + "grad_norm": 0.1193995475769043, + "learning_rate": 3.2209617947837245e-05, + "loss": 0.1607, + "num_input_tokens_seen": 17887680, + "step": 84755 + }, + { + "epoch": 9.324532453245325, + "grad_norm": 0.009870360605418682, + "learning_rate": 3.220731981466946e-05, + "loss": 0.0023, + "num_input_tokens_seen": 17888736, + "step": 84760 + }, + { + "epoch": 9.325082508250825, + "grad_norm": 0.9215027689933777, + "learning_rate": 3.2205021615075256e-05, + "loss": 0.0107, + "num_input_tokens_seen": 17889792, + "step": 84765 + }, + { + "epoch": 9.325632563256326, + "grad_norm": 0.012515355832874775, + "learning_rate": 3.220272334907583e-05, + "loss": 0.003, + "num_input_tokens_seen": 17890848, + "step": 84770 + }, + { + "epoch": 9.326182618261827, + "grad_norm": 1.1365289688110352, + "learning_rate": 3.220042501669236e-05, + "loss": 0.0293, + "num_input_tokens_seen": 17891936, + "step": 84775 + }, + { + "epoch": 9.326732673267326, + "grad_norm": 0.17304183542728424, + "learning_rate": 3.2198126617946026e-05, + "loss": 0.0157, + "num_input_tokens_seen": 17893024, + "step": 84780 + }, + { + "epoch": 9.327282728272827, + "grad_norm": 0.20734253525733948, + "learning_rate": 3.2195828152858016e-05, + "loss": 0.0535, + "num_input_tokens_seen": 17894080, + "step": 84785 + }, + { + "epoch": 9.327832783278328, + "grad_norm": 0.017480576410889626, + "learning_rate": 3.219352962144951e-05, + "loss": 0.0269, + "num_input_tokens_seen": 17895136, + "step": 84790 + }, + { + "epoch": 9.328382838283828, + "grad_norm": 0.011449174024164677, + "learning_rate": 3.2191231023741686e-05, + "loss": 0.0043, + "num_input_tokens_seen": 17896288, + "step": 84795 + }, + { + "epoch": 9.328932893289329, + "grad_norm": 0.1246580258011818, + "learning_rate": 3.2188932359755744e-05, + "loss": 0.0221, + "num_input_tokens_seen": 17897376, + "step": 84800 + }, + { + "epoch": 9.32948294829483, + "grad_norm": 0.9555838704109192, + "learning_rate": 3.218663362951286e-05, + "loss": 0.0618, + "num_input_tokens_seen": 17898464, + "step": 84805 + }, + { + "epoch": 9.33003300330033, + "grad_norm": 1.0315731763839722, + "learning_rate": 3.218433483303422e-05, + "loss": 0.0175, + "num_input_tokens_seen": 17899552, + "step": 84810 + }, + { + "epoch": 9.33058305830583, + "grad_norm": 0.06015878543257713, + "learning_rate": 3.2182035970341016e-05, + "loss": 0.0384, + "num_input_tokens_seen": 17900576, + "step": 84815 + }, + { + "epoch": 9.331133113311331, + "grad_norm": 0.9444252848625183, + "learning_rate": 3.217973704145443e-05, + "loss": 0.051, + "num_input_tokens_seen": 17901632, + "step": 84820 + }, + { + "epoch": 9.331683168316832, + "grad_norm": 3.542330265045166, + "learning_rate": 3.217743804639565e-05, + "loss": 0.103, + "num_input_tokens_seen": 17902656, + "step": 84825 + }, + { + "epoch": 9.332233223322332, + "grad_norm": 1.2596724033355713, + "learning_rate": 3.2175138985185885e-05, + "loss": 0.1923, + "num_input_tokens_seen": 17903712, + "step": 84830 + }, + { + "epoch": 9.332783278327833, + "grad_norm": 0.16136260330677032, + "learning_rate": 3.217283985784629e-05, + "loss": 0.0092, + "num_input_tokens_seen": 17904704, + "step": 84835 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 0.15349939465522766, + "learning_rate": 3.217054066439807e-05, + "loss": 0.0055, + "num_input_tokens_seen": 17905728, + "step": 84840 + }, + { + "epoch": 9.333883388338833, + "grad_norm": 0.04195336252450943, + "learning_rate": 3.216824140486242e-05, + "loss": 0.068, + "num_input_tokens_seen": 17906752, + "step": 84845 + }, + { + "epoch": 9.334433443344334, + "grad_norm": 0.02703562192618847, + "learning_rate": 3.216594207926054e-05, + "loss": 0.0194, + "num_input_tokens_seen": 17907840, + "step": 84850 + }, + { + "epoch": 9.334983498349835, + "grad_norm": 0.02546250820159912, + "learning_rate": 3.21636426876136e-05, + "loss": 0.0055, + "num_input_tokens_seen": 17908928, + "step": 84855 + }, + { + "epoch": 9.335533553355335, + "grad_norm": 0.013451892882585526, + "learning_rate": 3.216134322994281e-05, + "loss": 0.156, + "num_input_tokens_seen": 17909952, + "step": 84860 + }, + { + "epoch": 9.336083608360836, + "grad_norm": 0.13918288052082062, + "learning_rate": 3.2159043706269345e-05, + "loss": 0.0247, + "num_input_tokens_seen": 17910944, + "step": 84865 + }, + { + "epoch": 9.336633663366337, + "grad_norm": 0.9168707728385925, + "learning_rate": 3.215674411661441e-05, + "loss": 0.0102, + "num_input_tokens_seen": 17912000, + "step": 84870 + }, + { + "epoch": 9.337183718371838, + "grad_norm": 1.9675860404968262, + "learning_rate": 3.215444446099921e-05, + "loss": 0.064, + "num_input_tokens_seen": 17913120, + "step": 84875 + }, + { + "epoch": 9.337733773377337, + "grad_norm": 0.26015666127204895, + "learning_rate": 3.215214473944492e-05, + "loss": 0.0293, + "num_input_tokens_seen": 17914240, + "step": 84880 + }, + { + "epoch": 9.338283828382838, + "grad_norm": 0.047901131212711334, + "learning_rate": 3.214984495197275e-05, + "loss": 0.0054, + "num_input_tokens_seen": 17915264, + "step": 84885 + }, + { + "epoch": 9.33883388338834, + "grad_norm": 0.04729511961340904, + "learning_rate": 3.2147545098603885e-05, + "loss": 0.0623, + "num_input_tokens_seen": 17916352, + "step": 84890 + }, + { + "epoch": 9.339383938393839, + "grad_norm": 0.009825768880546093, + "learning_rate": 3.2145245179359526e-05, + "loss": 0.002, + "num_input_tokens_seen": 17917440, + "step": 84895 + }, + { + "epoch": 9.33993399339934, + "grad_norm": 0.06164541468024254, + "learning_rate": 3.214294519426087e-05, + "loss": 0.0051, + "num_input_tokens_seen": 17918496, + "step": 84900 + }, + { + "epoch": 9.340484048404841, + "grad_norm": 0.008700745180249214, + "learning_rate": 3.214064514332913e-05, + "loss": 0.0808, + "num_input_tokens_seen": 17919616, + "step": 84905 + }, + { + "epoch": 9.34103410341034, + "grad_norm": 0.07601913064718246, + "learning_rate": 3.213834502658547e-05, + "loss": 0.1135, + "num_input_tokens_seen": 17920672, + "step": 84910 + }, + { + "epoch": 9.341584158415841, + "grad_norm": 0.4149380326271057, + "learning_rate": 3.213604484405112e-05, + "loss": 0.027, + "num_input_tokens_seen": 17921792, + "step": 84915 + }, + { + "epoch": 9.342134213421343, + "grad_norm": 0.038152892142534256, + "learning_rate": 3.213374459574727e-05, + "loss": 0.0019, + "num_input_tokens_seen": 17922848, + "step": 84920 + }, + { + "epoch": 9.342684268426842, + "grad_norm": 0.11849330365657806, + "learning_rate": 3.2131444281695114e-05, + "loss": 0.0958, + "num_input_tokens_seen": 17923840, + "step": 84925 + }, + { + "epoch": 9.343234323432343, + "grad_norm": 0.032499898225069046, + "learning_rate": 3.212914390191586e-05, + "loss": 0.0087, + "num_input_tokens_seen": 17924896, + "step": 84930 + }, + { + "epoch": 9.343784378437844, + "grad_norm": 0.019402287900447845, + "learning_rate": 3.2126843456430713e-05, + "loss": 0.0076, + "num_input_tokens_seen": 17925920, + "step": 84935 + }, + { + "epoch": 9.344334433443345, + "grad_norm": 0.019137704744935036, + "learning_rate": 3.212454294526085e-05, + "loss": 0.0147, + "num_input_tokens_seen": 17926944, + "step": 84940 + }, + { + "epoch": 9.344884488448844, + "grad_norm": 1.3467109203338623, + "learning_rate": 3.2122242368427515e-05, + "loss": 0.1025, + "num_input_tokens_seen": 17927936, + "step": 84945 + }, + { + "epoch": 9.345434543454346, + "grad_norm": 0.010442005470395088, + "learning_rate": 3.211994172595189e-05, + "loss": 0.0083, + "num_input_tokens_seen": 17928960, + "step": 84950 + }, + { + "epoch": 9.345984598459847, + "grad_norm": 0.03854084759950638, + "learning_rate": 3.211764101785516e-05, + "loss": 0.0259, + "num_input_tokens_seen": 17930016, + "step": 84955 + }, + { + "epoch": 9.346534653465346, + "grad_norm": 1.294386625289917, + "learning_rate": 3.211534024415856e-05, + "loss": 0.0299, + "num_input_tokens_seen": 17931040, + "step": 84960 + }, + { + "epoch": 9.347084708470847, + "grad_norm": 0.004096901044249535, + "learning_rate": 3.211303940488328e-05, + "loss": 0.0871, + "num_input_tokens_seen": 17932160, + "step": 84965 + }, + { + "epoch": 9.347634763476348, + "grad_norm": 0.21056869626045227, + "learning_rate": 3.211073850005053e-05, + "loss": 0.0234, + "num_input_tokens_seen": 17933184, + "step": 84970 + }, + { + "epoch": 9.348184818481847, + "grad_norm": 0.21526290476322174, + "learning_rate": 3.2108437529681524e-05, + "loss": 0.0118, + "num_input_tokens_seen": 17934176, + "step": 84975 + }, + { + "epoch": 9.348734873487349, + "grad_norm": 0.009817622601985931, + "learning_rate": 3.210613649379745e-05, + "loss": 0.0645, + "num_input_tokens_seen": 17935200, + "step": 84980 + }, + { + "epoch": 9.34928492849285, + "grad_norm": 0.01036609522998333, + "learning_rate": 3.210383539241952e-05, + "loss": 0.0319, + "num_input_tokens_seen": 17936320, + "step": 84985 + }, + { + "epoch": 9.34983498349835, + "grad_norm": 0.4759994447231293, + "learning_rate": 3.2101534225568964e-05, + "loss": 0.0346, + "num_input_tokens_seen": 17937440, + "step": 84990 + }, + { + "epoch": 9.35038503850385, + "grad_norm": 0.01725420355796814, + "learning_rate": 3.2099232993266965e-05, + "loss": 0.0737, + "num_input_tokens_seen": 17938432, + "step": 84995 + }, + { + "epoch": 9.350935093509351, + "grad_norm": 0.013649823144078255, + "learning_rate": 3.209693169553474e-05, + "loss": 0.0106, + "num_input_tokens_seen": 17939488, + "step": 85000 + }, + { + "epoch": 9.351485148514852, + "grad_norm": 1.0820560455322266, + "learning_rate": 3.209463033239351e-05, + "loss": 0.0785, + "num_input_tokens_seen": 17940512, + "step": 85005 + }, + { + "epoch": 9.352035203520352, + "grad_norm": 2.437030076980591, + "learning_rate": 3.2092328903864475e-05, + "loss": 0.1098, + "num_input_tokens_seen": 17941568, + "step": 85010 + }, + { + "epoch": 9.352585258525853, + "grad_norm": 1.1886038780212402, + "learning_rate": 3.209002740996884e-05, + "loss": 0.0187, + "num_input_tokens_seen": 17942592, + "step": 85015 + }, + { + "epoch": 9.353135313531354, + "grad_norm": 0.026651546359062195, + "learning_rate": 3.2087725850727837e-05, + "loss": 0.0055, + "num_input_tokens_seen": 17943648, + "step": 85020 + }, + { + "epoch": 9.353685368536853, + "grad_norm": 0.010939770378172398, + "learning_rate": 3.208542422616265e-05, + "loss": 0.0055, + "num_input_tokens_seen": 17944640, + "step": 85025 + }, + { + "epoch": 9.354235423542354, + "grad_norm": 0.14060157537460327, + "learning_rate": 3.208312253629451e-05, + "loss": 0.0803, + "num_input_tokens_seen": 17945664, + "step": 85030 + }, + { + "epoch": 9.354785478547855, + "grad_norm": 0.9579302072525024, + "learning_rate": 3.208082078114464e-05, + "loss": 0.1115, + "num_input_tokens_seen": 17946720, + "step": 85035 + }, + { + "epoch": 9.355335533553355, + "grad_norm": 0.18938304483890533, + "learning_rate": 3.2078518960734236e-05, + "loss": 0.0061, + "num_input_tokens_seen": 17947808, + "step": 85040 + }, + { + "epoch": 9.355885588558856, + "grad_norm": 0.041489314287900925, + "learning_rate": 3.207621707508452e-05, + "loss": 0.0117, + "num_input_tokens_seen": 17948864, + "step": 85045 + }, + { + "epoch": 9.356435643564357, + "grad_norm": 0.26016679406166077, + "learning_rate": 3.207391512421671e-05, + "loss": 0.0079, + "num_input_tokens_seen": 17949920, + "step": 85050 + }, + { + "epoch": 9.356985698569858, + "grad_norm": 0.017453161999583244, + "learning_rate": 3.207161310815201e-05, + "loss": 0.0052, + "num_input_tokens_seen": 17950944, + "step": 85055 + }, + { + "epoch": 9.357535753575357, + "grad_norm": 0.00885981135070324, + "learning_rate": 3.206931102691165e-05, + "loss": 0.0061, + "num_input_tokens_seen": 17952000, + "step": 85060 + }, + { + "epoch": 9.358085808580858, + "grad_norm": 0.04581783339381218, + "learning_rate": 3.206700888051685e-05, + "loss": 0.0161, + "num_input_tokens_seen": 17953024, + "step": 85065 + }, + { + "epoch": 9.35863586358636, + "grad_norm": 0.007541521918028593, + "learning_rate": 3.206470666898881e-05, + "loss": 0.0107, + "num_input_tokens_seen": 17954080, + "step": 85070 + }, + { + "epoch": 9.359185918591859, + "grad_norm": 0.01810218207538128, + "learning_rate": 3.206240439234876e-05, + "loss": 0.0068, + "num_input_tokens_seen": 17955136, + "step": 85075 + }, + { + "epoch": 9.35973597359736, + "grad_norm": 0.014069879427552223, + "learning_rate": 3.206010205061792e-05, + "loss": 0.0252, + "num_input_tokens_seen": 17956160, + "step": 85080 + }, + { + "epoch": 9.36028602860286, + "grad_norm": 0.10868760198354721, + "learning_rate": 3.20577996438175e-05, + "loss": 0.0102, + "num_input_tokens_seen": 17957184, + "step": 85085 + }, + { + "epoch": 9.36083608360836, + "grad_norm": 0.925698459148407, + "learning_rate": 3.205549717196873e-05, + "loss": 0.0333, + "num_input_tokens_seen": 17958208, + "step": 85090 + }, + { + "epoch": 9.361386138613861, + "grad_norm": 0.016470221802592278, + "learning_rate": 3.205319463509284e-05, + "loss": 0.0105, + "num_input_tokens_seen": 17959328, + "step": 85095 + }, + { + "epoch": 9.361936193619362, + "grad_norm": 0.03455328941345215, + "learning_rate": 3.205089203321103e-05, + "loss": 0.0638, + "num_input_tokens_seen": 17960448, + "step": 85100 + }, + { + "epoch": 9.362486248624862, + "grad_norm": 0.16088150441646576, + "learning_rate": 3.2048589366344535e-05, + "loss": 0.0509, + "num_input_tokens_seen": 17961536, + "step": 85105 + }, + { + "epoch": 9.363036303630363, + "grad_norm": 5.224169731140137, + "learning_rate": 3.204628663451457e-05, + "loss": 0.0385, + "num_input_tokens_seen": 17962624, + "step": 85110 + }, + { + "epoch": 9.363586358635864, + "grad_norm": 0.29600560665130615, + "learning_rate": 3.2043983837742365e-05, + "loss": 0.0879, + "num_input_tokens_seen": 17963744, + "step": 85115 + }, + { + "epoch": 9.364136413641365, + "grad_norm": 0.033882107585668564, + "learning_rate": 3.2041680976049144e-05, + "loss": 0.0984, + "num_input_tokens_seen": 17964896, + "step": 85120 + }, + { + "epoch": 9.364686468646864, + "grad_norm": 0.0970921739935875, + "learning_rate": 3.203937804945612e-05, + "loss": 0.0175, + "num_input_tokens_seen": 17965984, + "step": 85125 + }, + { + "epoch": 9.365236523652365, + "grad_norm": 0.04406716302037239, + "learning_rate": 3.203707505798453e-05, + "loss": 0.0804, + "num_input_tokens_seen": 17966976, + "step": 85130 + }, + { + "epoch": 9.365786578657866, + "grad_norm": 0.012850696220993996, + "learning_rate": 3.20347720016556e-05, + "loss": 0.0099, + "num_input_tokens_seen": 17968032, + "step": 85135 + }, + { + "epoch": 9.366336633663366, + "grad_norm": 1.5724290609359741, + "learning_rate": 3.203246888049055e-05, + "loss": 0.0945, + "num_input_tokens_seen": 17969056, + "step": 85140 + }, + { + "epoch": 9.366886688668867, + "grad_norm": 0.05915834382176399, + "learning_rate": 3.2030165694510606e-05, + "loss": 0.0199, + "num_input_tokens_seen": 17970112, + "step": 85145 + }, + { + "epoch": 9.367436743674368, + "grad_norm": 0.01797613687813282, + "learning_rate": 3.2027862443737e-05, + "loss": 0.0069, + "num_input_tokens_seen": 17971200, + "step": 85150 + }, + { + "epoch": 9.367986798679867, + "grad_norm": 0.03760964423418045, + "learning_rate": 3.2025559128190964e-05, + "loss": 0.0271, + "num_input_tokens_seen": 17972288, + "step": 85155 + }, + { + "epoch": 9.368536853685368, + "grad_norm": 0.04034378007054329, + "learning_rate": 3.20232557478937e-05, + "loss": 0.0622, + "num_input_tokens_seen": 17973376, + "step": 85160 + }, + { + "epoch": 9.36908690869087, + "grad_norm": 0.10913130640983582, + "learning_rate": 3.202095230286649e-05, + "loss": 0.0114, + "num_input_tokens_seen": 17974464, + "step": 85165 + }, + { + "epoch": 9.369636963696369, + "grad_norm": 1.6520222425460815, + "learning_rate": 3.201864879313051e-05, + "loss": 0.0608, + "num_input_tokens_seen": 17975488, + "step": 85170 + }, + { + "epoch": 9.37018701870187, + "grad_norm": 0.3216583728790283, + "learning_rate": 3.2016345218707014e-05, + "loss": 0.0061, + "num_input_tokens_seen": 17976480, + "step": 85175 + }, + { + "epoch": 9.370737073707371, + "grad_norm": 0.5441031455993652, + "learning_rate": 3.2014041579617234e-05, + "loss": 0.0695, + "num_input_tokens_seen": 17977536, + "step": 85180 + }, + { + "epoch": 9.371287128712872, + "grad_norm": 0.15752655267715454, + "learning_rate": 3.20117378758824e-05, + "loss": 0.005, + "num_input_tokens_seen": 17978528, + "step": 85185 + }, + { + "epoch": 9.371837183718371, + "grad_norm": 0.020920224487781525, + "learning_rate": 3.2009434107523736e-05, + "loss": 0.0045, + "num_input_tokens_seen": 17979648, + "step": 85190 + }, + { + "epoch": 9.372387238723872, + "grad_norm": 0.0363255999982357, + "learning_rate": 3.200713027456249e-05, + "loss": 0.1184, + "num_input_tokens_seen": 17980768, + "step": 85195 + }, + { + "epoch": 9.372937293729374, + "grad_norm": 0.3903667628765106, + "learning_rate": 3.200482637701988e-05, + "loss": 0.0147, + "num_input_tokens_seen": 17981760, + "step": 85200 + }, + { + "epoch": 9.373487348734873, + "grad_norm": 0.018394285812973976, + "learning_rate": 3.200252241491714e-05, + "loss": 0.0812, + "num_input_tokens_seen": 17982816, + "step": 85205 + }, + { + "epoch": 9.374037403740374, + "grad_norm": 0.15471743047237396, + "learning_rate": 3.200021838827553e-05, + "loss": 0.0081, + "num_input_tokens_seen": 17983904, + "step": 85210 + }, + { + "epoch": 9.374587458745875, + "grad_norm": 0.04983612522482872, + "learning_rate": 3.1997914297116246e-05, + "loss": 0.0041, + "num_input_tokens_seen": 17984992, + "step": 85215 + }, + { + "epoch": 9.375137513751374, + "grad_norm": 0.023598095402121544, + "learning_rate": 3.1995610141460557e-05, + "loss": 0.0054, + "num_input_tokens_seen": 17985984, + "step": 85220 + }, + { + "epoch": 9.375687568756875, + "grad_norm": 1.154049038887024, + "learning_rate": 3.199330592132969e-05, + "loss": 0.0364, + "num_input_tokens_seen": 17987072, + "step": 85225 + }, + { + "epoch": 9.376237623762377, + "grad_norm": 0.6930754780769348, + "learning_rate": 3.1991001636744863e-05, + "loss": 0.024, + "num_input_tokens_seen": 17988224, + "step": 85230 + }, + { + "epoch": 9.376787678767876, + "grad_norm": 0.03541795536875725, + "learning_rate": 3.198869728772734e-05, + "loss": 0.0087, + "num_input_tokens_seen": 17989280, + "step": 85235 + }, + { + "epoch": 9.377337733773377, + "grad_norm": 0.1348501294851303, + "learning_rate": 3.198639287429834e-05, + "loss": 0.012, + "num_input_tokens_seen": 17990304, + "step": 85240 + }, + { + "epoch": 9.377887788778878, + "grad_norm": 0.03253847360610962, + "learning_rate": 3.198408839647911e-05, + "loss": 0.0374, + "num_input_tokens_seen": 17991392, + "step": 85245 + }, + { + "epoch": 9.37843784378438, + "grad_norm": 0.08337228000164032, + "learning_rate": 3.1981783854290894e-05, + "loss": 0.0038, + "num_input_tokens_seen": 17992448, + "step": 85250 + }, + { + "epoch": 9.378987898789878, + "grad_norm": 0.38122767210006714, + "learning_rate": 3.1979479247754925e-05, + "loss": 0.0511, + "num_input_tokens_seen": 17993504, + "step": 85255 + }, + { + "epoch": 9.37953795379538, + "grad_norm": 0.018760258331894875, + "learning_rate": 3.197717457689244e-05, + "loss": 0.0385, + "num_input_tokens_seen": 17994624, + "step": 85260 + }, + { + "epoch": 9.38008800880088, + "grad_norm": 0.7091531157493591, + "learning_rate": 3.19748698417247e-05, + "loss": 0.1576, + "num_input_tokens_seen": 17995744, + "step": 85265 + }, + { + "epoch": 9.38063806380638, + "grad_norm": 0.026137232780456543, + "learning_rate": 3.197256504227292e-05, + "loss": 0.0566, + "num_input_tokens_seen": 17996768, + "step": 85270 + }, + { + "epoch": 9.381188118811881, + "grad_norm": 0.29610520601272583, + "learning_rate": 3.1970260178558345e-05, + "loss": 0.0458, + "num_input_tokens_seen": 17997824, + "step": 85275 + }, + { + "epoch": 9.381738173817382, + "grad_norm": 0.41037997603416443, + "learning_rate": 3.196795525060223e-05, + "loss": 0.0229, + "num_input_tokens_seen": 17998880, + "step": 85280 + }, + { + "epoch": 9.382288228822881, + "grad_norm": 2.147735118865967, + "learning_rate": 3.196565025842583e-05, + "loss": 0.2458, + "num_input_tokens_seen": 18000032, + "step": 85285 + }, + { + "epoch": 9.382838283828383, + "grad_norm": 0.2980963885784149, + "learning_rate": 3.196334520205037e-05, + "loss": 0.0157, + "num_input_tokens_seen": 18001152, + "step": 85290 + }, + { + "epoch": 9.383388338833884, + "grad_norm": 0.04737177863717079, + "learning_rate": 3.196104008149708e-05, + "loss": 0.0318, + "num_input_tokens_seen": 18002208, + "step": 85295 + }, + { + "epoch": 9.383938393839385, + "grad_norm": 0.9401470422744751, + "learning_rate": 3.1958734896787245e-05, + "loss": 0.0268, + "num_input_tokens_seen": 18003328, + "step": 85300 + }, + { + "epoch": 9.384488448844884, + "grad_norm": 1.1938921213150024, + "learning_rate": 3.1956429647942085e-05, + "loss": 0.0156, + "num_input_tokens_seen": 18004384, + "step": 85305 + }, + { + "epoch": 9.385038503850385, + "grad_norm": 0.2689906656742096, + "learning_rate": 3.195412433498284e-05, + "loss": 0.0876, + "num_input_tokens_seen": 18005440, + "step": 85310 + }, + { + "epoch": 9.385588558855886, + "grad_norm": 0.20774054527282715, + "learning_rate": 3.195181895793078e-05, + "loss": 0.0071, + "num_input_tokens_seen": 18006496, + "step": 85315 + }, + { + "epoch": 9.386138613861386, + "grad_norm": 0.11218445748090744, + "learning_rate": 3.1949513516807136e-05, + "loss": 0.0118, + "num_input_tokens_seen": 18007488, + "step": 85320 + }, + { + "epoch": 9.386688668866887, + "grad_norm": 0.020875656977295876, + "learning_rate": 3.194720801163316e-05, + "loss": 0.0024, + "num_input_tokens_seen": 18008544, + "step": 85325 + }, + { + "epoch": 9.387238723872388, + "grad_norm": 0.03506544604897499, + "learning_rate": 3.19449024424301e-05, + "loss": 0.0213, + "num_input_tokens_seen": 18009536, + "step": 85330 + }, + { + "epoch": 9.387788778877887, + "grad_norm": 1.656783103942871, + "learning_rate": 3.194259680921921e-05, + "loss": 0.0991, + "num_input_tokens_seen": 18010624, + "step": 85335 + }, + { + "epoch": 9.388338833883388, + "grad_norm": 0.02407105080783367, + "learning_rate": 3.194029111202174e-05, + "loss": 0.116, + "num_input_tokens_seen": 18011712, + "step": 85340 + }, + { + "epoch": 9.38888888888889, + "grad_norm": 0.06685490161180496, + "learning_rate": 3.1937985350858926e-05, + "loss": 0.0661, + "num_input_tokens_seen": 18012768, + "step": 85345 + }, + { + "epoch": 9.389438943894389, + "grad_norm": 0.04559958726167679, + "learning_rate": 3.193567952575204e-05, + "loss": 0.0248, + "num_input_tokens_seen": 18013824, + "step": 85350 + }, + { + "epoch": 9.38998899889989, + "grad_norm": 0.4694240093231201, + "learning_rate": 3.193337363672232e-05, + "loss": 0.0445, + "num_input_tokens_seen": 18014848, + "step": 85355 + }, + { + "epoch": 9.39053905390539, + "grad_norm": 0.024800969287753105, + "learning_rate": 3.193106768379102e-05, + "loss": 0.0041, + "num_input_tokens_seen": 18015904, + "step": 85360 + }, + { + "epoch": 9.391089108910892, + "grad_norm": 0.020394591614603996, + "learning_rate": 3.19287616669794e-05, + "loss": 0.0138, + "num_input_tokens_seen": 18017024, + "step": 85365 + }, + { + "epoch": 9.391639163916391, + "grad_norm": 0.06550053507089615, + "learning_rate": 3.192645558630871e-05, + "loss": 0.0046, + "num_input_tokens_seen": 18018080, + "step": 85370 + }, + { + "epoch": 9.392189218921892, + "grad_norm": 0.02321590855717659, + "learning_rate": 3.19241494418002e-05, + "loss": 0.0693, + "num_input_tokens_seen": 18019104, + "step": 85375 + }, + { + "epoch": 9.392739273927393, + "grad_norm": 0.10730087012052536, + "learning_rate": 3.1921843233475126e-05, + "loss": 0.0352, + "num_input_tokens_seen": 18020160, + "step": 85380 + }, + { + "epoch": 9.393289328932893, + "grad_norm": 0.47419923543930054, + "learning_rate": 3.191953696135474e-05, + "loss": 0.0431, + "num_input_tokens_seen": 18021184, + "step": 85385 + }, + { + "epoch": 9.393839383938394, + "grad_norm": 0.46236786246299744, + "learning_rate": 3.1917230625460304e-05, + "loss": 0.0429, + "num_input_tokens_seen": 18022208, + "step": 85390 + }, + { + "epoch": 9.394389438943895, + "grad_norm": 0.03940632566809654, + "learning_rate": 3.191492422581307e-05, + "loss": 0.0119, + "num_input_tokens_seen": 18023296, + "step": 85395 + }, + { + "epoch": 9.394939493949394, + "grad_norm": 0.0705946534872055, + "learning_rate": 3.191261776243431e-05, + "loss": 0.0393, + "num_input_tokens_seen": 18024352, + "step": 85400 + }, + { + "epoch": 9.395489548954895, + "grad_norm": 0.3126180171966553, + "learning_rate": 3.191031123534526e-05, + "loss": 0.1191, + "num_input_tokens_seen": 18025312, + "step": 85405 + }, + { + "epoch": 9.396039603960396, + "grad_norm": 0.013214897364377975, + "learning_rate": 3.190800464456719e-05, + "loss": 0.0858, + "num_input_tokens_seen": 18026336, + "step": 85410 + }, + { + "epoch": 9.396589658965897, + "grad_norm": 0.1543148308992386, + "learning_rate": 3.1905697990121356e-05, + "loss": 0.0627, + "num_input_tokens_seen": 18027392, + "step": 85415 + }, + { + "epoch": 9.397139713971397, + "grad_norm": 0.05474017933011055, + "learning_rate": 3.190339127202901e-05, + "loss": 0.0078, + "num_input_tokens_seen": 18028448, + "step": 85420 + }, + { + "epoch": 9.397689768976898, + "grad_norm": 0.2522256672382355, + "learning_rate": 3.1901084490311427e-05, + "loss": 0.0306, + "num_input_tokens_seen": 18029536, + "step": 85425 + }, + { + "epoch": 9.398239823982399, + "grad_norm": 0.36041900515556335, + "learning_rate": 3.189877764498986e-05, + "loss": 0.0185, + "num_input_tokens_seen": 18030624, + "step": 85430 + }, + { + "epoch": 9.398789878987898, + "grad_norm": 0.023251252248883247, + "learning_rate": 3.189647073608556e-05, + "loss": 0.0124, + "num_input_tokens_seen": 18031616, + "step": 85435 + }, + { + "epoch": 9.3993399339934, + "grad_norm": 0.5015141367912292, + "learning_rate": 3.189416376361981e-05, + "loss": 0.0521, + "num_input_tokens_seen": 18032672, + "step": 85440 + }, + { + "epoch": 9.3998899889989, + "grad_norm": 0.42572861909866333, + "learning_rate": 3.1891856727613856e-05, + "loss": 0.1245, + "num_input_tokens_seen": 18033728, + "step": 85445 + }, + { + "epoch": 9.4004400440044, + "grad_norm": 0.04671899974346161, + "learning_rate": 3.1889549628088965e-05, + "loss": 0.0838, + "num_input_tokens_seen": 18034784, + "step": 85450 + }, + { + "epoch": 9.400990099009901, + "grad_norm": 0.033406298607587814, + "learning_rate": 3.1887242465066404e-05, + "loss": 0.0114, + "num_input_tokens_seen": 18035872, + "step": 85455 + }, + { + "epoch": 9.401540154015402, + "grad_norm": 2.635491371154785, + "learning_rate": 3.188493523856744e-05, + "loss": 0.0572, + "num_input_tokens_seen": 18036864, + "step": 85460 + }, + { + "epoch": 9.402090209020901, + "grad_norm": 2.4015798568725586, + "learning_rate": 3.188262794861331e-05, + "loss": 0.0297, + "num_input_tokens_seen": 18037952, + "step": 85465 + }, + { + "epoch": 9.402640264026402, + "grad_norm": 0.2447659820318222, + "learning_rate": 3.188032059522532e-05, + "loss": 0.0493, + "num_input_tokens_seen": 18039008, + "step": 85470 + }, + { + "epoch": 9.403190319031903, + "grad_norm": 0.04776788502931595, + "learning_rate": 3.1878013178424716e-05, + "loss": 0.0095, + "num_input_tokens_seen": 18040096, + "step": 85475 + }, + { + "epoch": 9.403740374037405, + "grad_norm": 1.4214609861373901, + "learning_rate": 3.1875705698232756e-05, + "loss": 0.0351, + "num_input_tokens_seen": 18041248, + "step": 85480 + }, + { + "epoch": 9.404290429042904, + "grad_norm": 0.05272432044148445, + "learning_rate": 3.1873398154670717e-05, + "loss": 0.0621, + "num_input_tokens_seen": 18042304, + "step": 85485 + }, + { + "epoch": 9.404840484048405, + "grad_norm": 0.28259167075157166, + "learning_rate": 3.1871090547759876e-05, + "loss": 0.0474, + "num_input_tokens_seen": 18043328, + "step": 85490 + }, + { + "epoch": 9.405390539053906, + "grad_norm": 2.5692367553710938, + "learning_rate": 3.186878287752148e-05, + "loss": 0.0661, + "num_input_tokens_seen": 18044384, + "step": 85495 + }, + { + "epoch": 9.405940594059405, + "grad_norm": 1.6968281269073486, + "learning_rate": 3.186647514397681e-05, + "loss": 0.1756, + "num_input_tokens_seen": 18045408, + "step": 85500 + }, + { + "epoch": 9.406490649064907, + "grad_norm": 0.1580231785774231, + "learning_rate": 3.186416734714714e-05, + "loss": 0.0913, + "num_input_tokens_seen": 18046464, + "step": 85505 + }, + { + "epoch": 9.407040704070408, + "grad_norm": 0.7596298456192017, + "learning_rate": 3.1861859487053725e-05, + "loss": 0.0597, + "num_input_tokens_seen": 18047520, + "step": 85510 + }, + { + "epoch": 9.407590759075907, + "grad_norm": 0.29005616903305054, + "learning_rate": 3.185955156371784e-05, + "loss": 0.0604, + "num_input_tokens_seen": 18048512, + "step": 85515 + }, + { + "epoch": 9.408140814081408, + "grad_norm": 0.012977128848433495, + "learning_rate": 3.185724357716077e-05, + "loss": 0.074, + "num_input_tokens_seen": 18049536, + "step": 85520 + }, + { + "epoch": 9.408690869086909, + "grad_norm": 0.9140024781227112, + "learning_rate": 3.185493552740377e-05, + "loss": 0.0365, + "num_input_tokens_seen": 18050592, + "step": 85525 + }, + { + "epoch": 9.409240924092408, + "grad_norm": 0.766633927822113, + "learning_rate": 3.185262741446813e-05, + "loss": 0.0601, + "num_input_tokens_seen": 18051648, + "step": 85530 + }, + { + "epoch": 9.40979097909791, + "grad_norm": 0.6443178653717041, + "learning_rate": 3.1850319238375096e-05, + "loss": 0.0183, + "num_input_tokens_seen": 18052768, + "step": 85535 + }, + { + "epoch": 9.41034103410341, + "grad_norm": 0.07831866294145584, + "learning_rate": 3.184801099914596e-05, + "loss": 0.0403, + "num_input_tokens_seen": 18053856, + "step": 85540 + }, + { + "epoch": 9.410891089108912, + "grad_norm": 0.17615091800689697, + "learning_rate": 3.1845702696802e-05, + "loss": 0.0206, + "num_input_tokens_seen": 18054880, + "step": 85545 + }, + { + "epoch": 9.411441144114411, + "grad_norm": 0.020864959806203842, + "learning_rate": 3.184339433136448e-05, + "loss": 0.043, + "num_input_tokens_seen": 18056000, + "step": 85550 + }, + { + "epoch": 9.411991199119912, + "grad_norm": 0.11092457175254822, + "learning_rate": 3.1841085902854676e-05, + "loss": 0.0687, + "num_input_tokens_seen": 18056992, + "step": 85555 + }, + { + "epoch": 9.412541254125413, + "grad_norm": 0.3170144855976105, + "learning_rate": 3.183877741129387e-05, + "loss": 0.0267, + "num_input_tokens_seen": 18058048, + "step": 85560 + }, + { + "epoch": 9.413091309130913, + "grad_norm": 0.07793726772069931, + "learning_rate": 3.183646885670333e-05, + "loss": 0.0483, + "num_input_tokens_seen": 18059136, + "step": 85565 + }, + { + "epoch": 9.413641364136414, + "grad_norm": 1.2829928398132324, + "learning_rate": 3.183416023910434e-05, + "loss": 0.0489, + "num_input_tokens_seen": 18060160, + "step": 85570 + }, + { + "epoch": 9.414191419141915, + "grad_norm": 0.34175652265548706, + "learning_rate": 3.183185155851818e-05, + "loss": 0.01, + "num_input_tokens_seen": 18061184, + "step": 85575 + }, + { + "epoch": 9.414741474147414, + "grad_norm": 0.161203071475029, + "learning_rate": 3.182954281496611e-05, + "loss": 0.0574, + "num_input_tokens_seen": 18062272, + "step": 85580 + }, + { + "epoch": 9.415291529152915, + "grad_norm": 0.056895557790994644, + "learning_rate": 3.182723400846942e-05, + "loss": 0.003, + "num_input_tokens_seen": 18063264, + "step": 85585 + }, + { + "epoch": 9.415841584158416, + "grad_norm": 0.01789597049355507, + "learning_rate": 3.1824925139049406e-05, + "loss": 0.0099, + "num_input_tokens_seen": 18064288, + "step": 85590 + }, + { + "epoch": 9.416391639163916, + "grad_norm": 0.054454099386930466, + "learning_rate": 3.1822616206727315e-05, + "loss": 0.0108, + "num_input_tokens_seen": 18065344, + "step": 85595 + }, + { + "epoch": 9.416941694169417, + "grad_norm": 0.0885392352938652, + "learning_rate": 3.182030721152445e-05, + "loss": 0.0331, + "num_input_tokens_seen": 18066400, + "step": 85600 + }, + { + "epoch": 9.417491749174918, + "grad_norm": 0.047366030514240265, + "learning_rate": 3.181799815346209e-05, + "loss": 0.0154, + "num_input_tokens_seen": 18067424, + "step": 85605 + }, + { + "epoch": 9.418041804180419, + "grad_norm": 0.6058638095855713, + "learning_rate": 3.181568903256151e-05, + "loss": 0.028, + "num_input_tokens_seen": 18068416, + "step": 85610 + }, + { + "epoch": 9.418591859185918, + "grad_norm": 0.15728044509887695, + "learning_rate": 3.1813379848844e-05, + "loss": 0.0176, + "num_input_tokens_seen": 18069472, + "step": 85615 + }, + { + "epoch": 9.41914191419142, + "grad_norm": 0.05424187704920769, + "learning_rate": 3.181107060233084e-05, + "loss": 0.0246, + "num_input_tokens_seen": 18070528, + "step": 85620 + }, + { + "epoch": 9.41969196919692, + "grad_norm": 0.03244819492101669, + "learning_rate": 3.1808761293043285e-05, + "loss": 0.0075, + "num_input_tokens_seen": 18071552, + "step": 85625 + }, + { + "epoch": 9.42024202420242, + "grad_norm": 0.05612356215715408, + "learning_rate": 3.180645192100267e-05, + "loss": 0.0755, + "num_input_tokens_seen": 18072576, + "step": 85630 + }, + { + "epoch": 9.42079207920792, + "grad_norm": 0.1509954333305359, + "learning_rate": 3.180414248623025e-05, + "loss": 0.0397, + "num_input_tokens_seen": 18073600, + "step": 85635 + }, + { + "epoch": 9.421342134213422, + "grad_norm": 0.006912339944392443, + "learning_rate": 3.1801832988747304e-05, + "loss": 0.0134, + "num_input_tokens_seen": 18074656, + "step": 85640 + }, + { + "epoch": 9.421892189218921, + "grad_norm": 0.26947352290153503, + "learning_rate": 3.179952342857513e-05, + "loss": 0.0132, + "num_input_tokens_seen": 18075744, + "step": 85645 + }, + { + "epoch": 9.422442244224422, + "grad_norm": 0.9048940539360046, + "learning_rate": 3.179721380573502e-05, + "loss": 0.0537, + "num_input_tokens_seen": 18076800, + "step": 85650 + }, + { + "epoch": 9.422992299229923, + "grad_norm": 0.011979722417891026, + "learning_rate": 3.179490412024824e-05, + "loss": 0.0121, + "num_input_tokens_seen": 18077856, + "step": 85655 + }, + { + "epoch": 9.423542354235423, + "grad_norm": 0.05676080286502838, + "learning_rate": 3.17925943721361e-05, + "loss": 0.0228, + "num_input_tokens_seen": 18078912, + "step": 85660 + }, + { + "epoch": 9.424092409240924, + "grad_norm": 0.30223456025123596, + "learning_rate": 3.179028456141987e-05, + "loss": 0.0098, + "num_input_tokens_seen": 18079936, + "step": 85665 + }, + { + "epoch": 9.424642464246425, + "grad_norm": 0.006365286651998758, + "learning_rate": 3.178797468812085e-05, + "loss": 0.0069, + "num_input_tokens_seen": 18081024, + "step": 85670 + }, + { + "epoch": 9.425192519251926, + "grad_norm": 0.7495414614677429, + "learning_rate": 3.1785664752260326e-05, + "loss": 0.0307, + "num_input_tokens_seen": 18082048, + "step": 85675 + }, + { + "epoch": 9.425742574257425, + "grad_norm": 1.295485496520996, + "learning_rate": 3.178335475385958e-05, + "loss": 0.0133, + "num_input_tokens_seen": 18083104, + "step": 85680 + }, + { + "epoch": 9.426292629262926, + "grad_norm": 0.9494967460632324, + "learning_rate": 3.178104469293991e-05, + "loss": 0.053, + "num_input_tokens_seen": 18084160, + "step": 85685 + }, + { + "epoch": 9.426842684268427, + "grad_norm": 0.1431708186864853, + "learning_rate": 3.1778734569522606e-05, + "loss": 0.0059, + "num_input_tokens_seen": 18085248, + "step": 85690 + }, + { + "epoch": 9.427392739273927, + "grad_norm": 0.015081570483744144, + "learning_rate": 3.1776424383628955e-05, + "loss": 0.0083, + "num_input_tokens_seen": 18086304, + "step": 85695 + }, + { + "epoch": 9.427942794279428, + "grad_norm": 0.008070548065006733, + "learning_rate": 3.177411413528025e-05, + "loss": 0.0005, + "num_input_tokens_seen": 18087328, + "step": 85700 + }, + { + "epoch": 9.428492849284929, + "grad_norm": 0.029830798506736755, + "learning_rate": 3.177180382449779e-05, + "loss": 0.0059, + "num_input_tokens_seen": 18088416, + "step": 85705 + }, + { + "epoch": 9.429042904290428, + "grad_norm": 0.463156133890152, + "learning_rate": 3.176949345130287e-05, + "loss": 0.0111, + "num_input_tokens_seen": 18089536, + "step": 85710 + }, + { + "epoch": 9.42959295929593, + "grad_norm": 0.13180111348628998, + "learning_rate": 3.1767183015716774e-05, + "loss": 0.0042, + "num_input_tokens_seen": 18090592, + "step": 85715 + }, + { + "epoch": 9.43014301430143, + "grad_norm": 1.0022670030593872, + "learning_rate": 3.176487251776079e-05, + "loss": 0.0468, + "num_input_tokens_seen": 18091616, + "step": 85720 + }, + { + "epoch": 9.430693069306932, + "grad_norm": 1.116380214691162, + "learning_rate": 3.176256195745623e-05, + "loss": 0.0409, + "num_input_tokens_seen": 18092704, + "step": 85725 + }, + { + "epoch": 9.43124312431243, + "grad_norm": 0.33165690302848816, + "learning_rate": 3.176025133482437e-05, + "loss": 0.007, + "num_input_tokens_seen": 18093760, + "step": 85730 + }, + { + "epoch": 9.431793179317932, + "grad_norm": 0.006738670170307159, + "learning_rate": 3.175794064988654e-05, + "loss": 0.1267, + "num_input_tokens_seen": 18094784, + "step": 85735 + }, + { + "epoch": 9.432343234323433, + "grad_norm": 0.43589502573013306, + "learning_rate": 3.1755629902663995e-05, + "loss": 0.0071, + "num_input_tokens_seen": 18095904, + "step": 85740 + }, + { + "epoch": 9.432893289328932, + "grad_norm": 0.3396328091621399, + "learning_rate": 3.175331909317806e-05, + "loss": 0.0338, + "num_input_tokens_seen": 18096960, + "step": 85745 + }, + { + "epoch": 9.433443344334433, + "grad_norm": 0.007723303511738777, + "learning_rate": 3.1751008221450025e-05, + "loss": 0.0504, + "num_input_tokens_seen": 18098048, + "step": 85750 + }, + { + "epoch": 9.433993399339935, + "grad_norm": 1.2966604232788086, + "learning_rate": 3.174869728750117e-05, + "loss": 0.0357, + "num_input_tokens_seen": 18099136, + "step": 85755 + }, + { + "epoch": 9.434543454345434, + "grad_norm": 0.04071378707885742, + "learning_rate": 3.174638629135284e-05, + "loss": 0.1681, + "num_input_tokens_seen": 18100192, + "step": 85760 + }, + { + "epoch": 9.435093509350935, + "grad_norm": 0.007704790681600571, + "learning_rate": 3.1744075233026284e-05, + "loss": 0.0582, + "num_input_tokens_seen": 18101248, + "step": 85765 + }, + { + "epoch": 9.435643564356436, + "grad_norm": 2.3346304893493652, + "learning_rate": 3.174176411254282e-05, + "loss": 0.0599, + "num_input_tokens_seen": 18102336, + "step": 85770 + }, + { + "epoch": 9.436193619361935, + "grad_norm": 0.017225846648216248, + "learning_rate": 3.173945292992376e-05, + "loss": 0.0575, + "num_input_tokens_seen": 18103392, + "step": 85775 + }, + { + "epoch": 9.436743674367436, + "grad_norm": 0.04410369321703911, + "learning_rate": 3.173714168519039e-05, + "loss": 0.0224, + "num_input_tokens_seen": 18104480, + "step": 85780 + }, + { + "epoch": 9.437293729372938, + "grad_norm": 0.09759942442178726, + "learning_rate": 3.173483037836402e-05, + "loss": 0.0104, + "num_input_tokens_seen": 18105568, + "step": 85785 + }, + { + "epoch": 9.437843784378439, + "grad_norm": 2.1424031257629395, + "learning_rate": 3.173251900946595e-05, + "loss": 0.1081, + "num_input_tokens_seen": 18106624, + "step": 85790 + }, + { + "epoch": 9.438393839383938, + "grad_norm": 0.08565734326839447, + "learning_rate": 3.173020757851749e-05, + "loss": 0.0596, + "num_input_tokens_seen": 18107616, + "step": 85795 + }, + { + "epoch": 9.438943894389439, + "grad_norm": 0.7801436185836792, + "learning_rate": 3.172789608553993e-05, + "loss": 0.1187, + "num_input_tokens_seen": 18108640, + "step": 85800 + }, + { + "epoch": 9.43949394939494, + "grad_norm": 0.7909152507781982, + "learning_rate": 3.172558453055459e-05, + "loss": 0.0385, + "num_input_tokens_seen": 18109792, + "step": 85805 + }, + { + "epoch": 9.44004400440044, + "grad_norm": 0.1808466762304306, + "learning_rate": 3.1723272913582745e-05, + "loss": 0.0366, + "num_input_tokens_seen": 18110784, + "step": 85810 + }, + { + "epoch": 9.44059405940594, + "grad_norm": 0.14043568074703217, + "learning_rate": 3.172096123464573e-05, + "loss": 0.0046, + "num_input_tokens_seen": 18111872, + "step": 85815 + }, + { + "epoch": 9.441144114411442, + "grad_norm": 0.03180762007832527, + "learning_rate": 3.171864949376484e-05, + "loss": 0.0131, + "num_input_tokens_seen": 18112896, + "step": 85820 + }, + { + "epoch": 9.441694169416941, + "grad_norm": 1.9955904483795166, + "learning_rate": 3.171633769096138e-05, + "loss": 0.1384, + "num_input_tokens_seen": 18113952, + "step": 85825 + }, + { + "epoch": 9.442244224422442, + "grad_norm": 0.5754852890968323, + "learning_rate": 3.171402582625667e-05, + "loss": 0.014, + "num_input_tokens_seen": 18115040, + "step": 85830 + }, + { + "epoch": 9.442794279427943, + "grad_norm": 0.2495969980955124, + "learning_rate": 3.171171389967199e-05, + "loss": 0.0052, + "num_input_tokens_seen": 18116096, + "step": 85835 + }, + { + "epoch": 9.443344334433444, + "grad_norm": 0.0637882798910141, + "learning_rate": 3.170940191122867e-05, + "loss": 0.0693, + "num_input_tokens_seen": 18117152, + "step": 85840 + }, + { + "epoch": 9.443894389438944, + "grad_norm": 0.02926618419587612, + "learning_rate": 3.170708986094801e-05, + "loss": 0.0092, + "num_input_tokens_seen": 18118144, + "step": 85845 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.05078024044632912, + "learning_rate": 3.1704777748851324e-05, + "loss": 0.045, + "num_input_tokens_seen": 18119168, + "step": 85850 + }, + { + "epoch": 9.444994499449946, + "grad_norm": 0.10588140785694122, + "learning_rate": 3.1702465574959914e-05, + "loss": 0.0036, + "num_input_tokens_seen": 18120256, + "step": 85855 + }, + { + "epoch": 9.445544554455445, + "grad_norm": 0.3024112284183502, + "learning_rate": 3.17001533392951e-05, + "loss": 0.0123, + "num_input_tokens_seen": 18121280, + "step": 85860 + }, + { + "epoch": 9.446094609460946, + "grad_norm": 0.03108697012066841, + "learning_rate": 3.169784104187818e-05, + "loss": 0.0027, + "num_input_tokens_seen": 18122400, + "step": 85865 + }, + { + "epoch": 9.446644664466447, + "grad_norm": 0.013739907182753086, + "learning_rate": 3.169552868273048e-05, + "loss": 0.089, + "num_input_tokens_seen": 18123424, + "step": 85870 + }, + { + "epoch": 9.447194719471947, + "grad_norm": 0.022753585129976273, + "learning_rate": 3.16932162618733e-05, + "loss": 0.0076, + "num_input_tokens_seen": 18124512, + "step": 85875 + }, + { + "epoch": 9.447744774477448, + "grad_norm": 0.4861217439174652, + "learning_rate": 3.169090377932797e-05, + "loss": 0.0321, + "num_input_tokens_seen": 18125568, + "step": 85880 + }, + { + "epoch": 9.448294829482949, + "grad_norm": 0.005151254124939442, + "learning_rate": 3.168859123511578e-05, + "loss": 0.029, + "num_input_tokens_seen": 18126656, + "step": 85885 + }, + { + "epoch": 9.448844884488448, + "grad_norm": 0.05076019465923309, + "learning_rate": 3.168627862925806e-05, + "loss": 0.0037, + "num_input_tokens_seen": 18127744, + "step": 85890 + }, + { + "epoch": 9.44939493949395, + "grad_norm": 1.2996562719345093, + "learning_rate": 3.168396596177611e-05, + "loss": 0.0478, + "num_input_tokens_seen": 18128800, + "step": 85895 + }, + { + "epoch": 9.44994499449945, + "grad_norm": 0.11495041102170944, + "learning_rate": 3.1681653232691264e-05, + "loss": 0.0901, + "num_input_tokens_seen": 18129888, + "step": 85900 + }, + { + "epoch": 9.450495049504951, + "grad_norm": 0.014402237720787525, + "learning_rate": 3.167934044202482e-05, + "loss": 0.0612, + "num_input_tokens_seen": 18130944, + "step": 85905 + }, + { + "epoch": 9.45104510451045, + "grad_norm": 0.04822515696287155, + "learning_rate": 3.1677027589798097e-05, + "loss": 0.0565, + "num_input_tokens_seen": 18132000, + "step": 85910 + }, + { + "epoch": 9.451595159515952, + "grad_norm": 0.16694147884845734, + "learning_rate": 3.167471467603241e-05, + "loss": 0.0248, + "num_input_tokens_seen": 18133088, + "step": 85915 + }, + { + "epoch": 9.452145214521453, + "grad_norm": 1.3768867254257202, + "learning_rate": 3.167240170074909e-05, + "loss": 0.0642, + "num_input_tokens_seen": 18134144, + "step": 85920 + }, + { + "epoch": 9.452695269526952, + "grad_norm": 0.03403573855757713, + "learning_rate": 3.167008866396946e-05, + "loss": 0.0024, + "num_input_tokens_seen": 18135232, + "step": 85925 + }, + { + "epoch": 9.453245324532453, + "grad_norm": 0.02059815637767315, + "learning_rate": 3.16677755657148e-05, + "loss": 0.0529, + "num_input_tokens_seen": 18136256, + "step": 85930 + }, + { + "epoch": 9.453795379537954, + "grad_norm": 0.2511770725250244, + "learning_rate": 3.166546240600645e-05, + "loss": 0.0182, + "num_input_tokens_seen": 18137312, + "step": 85935 + }, + { + "epoch": 9.454345434543454, + "grad_norm": 0.1288338303565979, + "learning_rate": 3.1663149184865746e-05, + "loss": 0.0655, + "num_input_tokens_seen": 18138400, + "step": 85940 + }, + { + "epoch": 9.454895489548955, + "grad_norm": 0.2771686613559723, + "learning_rate": 3.166083590231398e-05, + "loss": 0.0333, + "num_input_tokens_seen": 18139456, + "step": 85945 + }, + { + "epoch": 9.455445544554456, + "grad_norm": 0.0917259007692337, + "learning_rate": 3.16585225583725e-05, + "loss": 0.0736, + "num_input_tokens_seen": 18140544, + "step": 85950 + }, + { + "epoch": 9.455995599559955, + "grad_norm": 1.1889197826385498, + "learning_rate": 3.165620915306261e-05, + "loss": 0.101, + "num_input_tokens_seen": 18141632, + "step": 85955 + }, + { + "epoch": 9.456545654565456, + "grad_norm": 0.4889051914215088, + "learning_rate": 3.165389568640562e-05, + "loss": 0.0144, + "num_input_tokens_seen": 18142656, + "step": 85960 + }, + { + "epoch": 9.457095709570957, + "grad_norm": 0.04774012044072151, + "learning_rate": 3.165158215842288e-05, + "loss": 0.0207, + "num_input_tokens_seen": 18143776, + "step": 85965 + }, + { + "epoch": 9.457645764576458, + "grad_norm": 0.03621159493923187, + "learning_rate": 3.1649268569135696e-05, + "loss": 0.041, + "num_input_tokens_seen": 18144864, + "step": 85970 + }, + { + "epoch": 9.458195819581958, + "grad_norm": 0.014616000466048717, + "learning_rate": 3.16469549185654e-05, + "loss": 0.0599, + "num_input_tokens_seen": 18145856, + "step": 85975 + }, + { + "epoch": 9.458745874587459, + "grad_norm": 0.03370937332510948, + "learning_rate": 3.16446412067333e-05, + "loss": 0.0625, + "num_input_tokens_seen": 18146848, + "step": 85980 + }, + { + "epoch": 9.45929592959296, + "grad_norm": 0.14567795395851135, + "learning_rate": 3.164232743366074e-05, + "loss": 0.0221, + "num_input_tokens_seen": 18147904, + "step": 85985 + }, + { + "epoch": 9.45984598459846, + "grad_norm": 0.11873938888311386, + "learning_rate": 3.164001359936902e-05, + "loss": 0.0044, + "num_input_tokens_seen": 18148928, + "step": 85990 + }, + { + "epoch": 9.46039603960396, + "grad_norm": 0.2782689332962036, + "learning_rate": 3.1637699703879494e-05, + "loss": 0.0332, + "num_input_tokens_seen": 18149952, + "step": 85995 + }, + { + "epoch": 9.460946094609461, + "grad_norm": 0.07228589057922363, + "learning_rate": 3.1635385747213466e-05, + "loss": 0.0537, + "num_input_tokens_seen": 18151008, + "step": 86000 + }, + { + "epoch": 9.46149614961496, + "grad_norm": 0.020185451954603195, + "learning_rate": 3.163307172939228e-05, + "loss": 0.0226, + "num_input_tokens_seen": 18152096, + "step": 86005 + }, + { + "epoch": 9.462046204620462, + "grad_norm": 0.03370169922709465, + "learning_rate": 3.1630757650437255e-05, + "loss": 0.0242, + "num_input_tokens_seen": 18153184, + "step": 86010 + }, + { + "epoch": 9.462596259625963, + "grad_norm": 0.022104406729340553, + "learning_rate": 3.162844351036971e-05, + "loss": 0.0735, + "num_input_tokens_seen": 18154208, + "step": 86015 + }, + { + "epoch": 9.463146314631462, + "grad_norm": 0.34188151359558105, + "learning_rate": 3.162612930921099e-05, + "loss": 0.0139, + "num_input_tokens_seen": 18155264, + "step": 86020 + }, + { + "epoch": 9.463696369636963, + "grad_norm": 0.27188193798065186, + "learning_rate": 3.162381504698241e-05, + "loss": 0.0381, + "num_input_tokens_seen": 18156288, + "step": 86025 + }, + { + "epoch": 9.464246424642464, + "grad_norm": 0.03159460052847862, + "learning_rate": 3.1621500723705305e-05, + "loss": 0.0229, + "num_input_tokens_seen": 18157344, + "step": 86030 + }, + { + "epoch": 9.464796479647966, + "grad_norm": 0.1733999103307724, + "learning_rate": 3.1619186339401005e-05, + "loss": 0.003, + "num_input_tokens_seen": 18158368, + "step": 86035 + }, + { + "epoch": 9.465346534653465, + "grad_norm": 0.0046635461039841175, + "learning_rate": 3.161687189409085e-05, + "loss": 0.0018, + "num_input_tokens_seen": 18159456, + "step": 86040 + }, + { + "epoch": 9.465896589658966, + "grad_norm": 0.3532625734806061, + "learning_rate": 3.1614557387796155e-05, + "loss": 0.0189, + "num_input_tokens_seen": 18160480, + "step": 86045 + }, + { + "epoch": 9.466446644664467, + "grad_norm": 0.19913147389888763, + "learning_rate": 3.1612242820538255e-05, + "loss": 0.0212, + "num_input_tokens_seen": 18161568, + "step": 86050 + }, + { + "epoch": 9.466996699669966, + "grad_norm": 0.017687659710645676, + "learning_rate": 3.1609928192338496e-05, + "loss": 0.0065, + "num_input_tokens_seen": 18162560, + "step": 86055 + }, + { + "epoch": 9.467546754675467, + "grad_norm": 0.10329677909612656, + "learning_rate": 3.160761350321819e-05, + "loss": 0.0191, + "num_input_tokens_seen": 18163680, + "step": 86060 + }, + { + "epoch": 9.468096809680969, + "grad_norm": 0.14199694991111755, + "learning_rate": 3.1605298753198686e-05, + "loss": 0.0079, + "num_input_tokens_seen": 18164704, + "step": 86065 + }, + { + "epoch": 9.468646864686468, + "grad_norm": 0.249395951628685, + "learning_rate": 3.160298394230132e-05, + "loss": 0.0459, + "num_input_tokens_seen": 18165760, + "step": 86070 + }, + { + "epoch": 9.469196919691969, + "grad_norm": 1.5640850067138672, + "learning_rate": 3.16006690705474e-05, + "loss": 0.0788, + "num_input_tokens_seen": 18166880, + "step": 86075 + }, + { + "epoch": 9.46974697469747, + "grad_norm": 0.012038829736411572, + "learning_rate": 3.1598354137958295e-05, + "loss": 0.0041, + "num_input_tokens_seen": 18167936, + "step": 86080 + }, + { + "epoch": 9.47029702970297, + "grad_norm": 0.014576968736946583, + "learning_rate": 3.159603914455532e-05, + "loss": 0.0269, + "num_input_tokens_seen": 18168896, + "step": 86085 + }, + { + "epoch": 9.47084708470847, + "grad_norm": 0.02583875134587288, + "learning_rate": 3.159372409035982e-05, + "loss": 0.0039, + "num_input_tokens_seen": 18169984, + "step": 86090 + }, + { + "epoch": 9.471397139713972, + "grad_norm": 1.5660980939865112, + "learning_rate": 3.159140897539313e-05, + "loss": 0.0358, + "num_input_tokens_seen": 18171072, + "step": 86095 + }, + { + "epoch": 9.471947194719473, + "grad_norm": 0.02344457618892193, + "learning_rate": 3.1589093799676595e-05, + "loss": 0.011, + "num_input_tokens_seen": 18172064, + "step": 86100 + }, + { + "epoch": 9.472497249724972, + "grad_norm": 0.0239595677703619, + "learning_rate": 3.158677856323153e-05, + "loss": 0.0144, + "num_input_tokens_seen": 18173152, + "step": 86105 + }, + { + "epoch": 9.473047304730473, + "grad_norm": 0.4498518407344818, + "learning_rate": 3.158446326607929e-05, + "loss": 0.028, + "num_input_tokens_seen": 18174208, + "step": 86110 + }, + { + "epoch": 9.473597359735974, + "grad_norm": 0.4980764091014862, + "learning_rate": 3.1582147908241214e-05, + "loss": 0.0375, + "num_input_tokens_seen": 18175264, + "step": 86115 + }, + { + "epoch": 9.474147414741473, + "grad_norm": 0.011517195031046867, + "learning_rate": 3.157983248973864e-05, + "loss": 0.0109, + "num_input_tokens_seen": 18176256, + "step": 86120 + }, + { + "epoch": 9.474697469746975, + "grad_norm": 0.1867605745792389, + "learning_rate": 3.1577517010592906e-05, + "loss": 0.0629, + "num_input_tokens_seen": 18177344, + "step": 86125 + }, + { + "epoch": 9.475247524752476, + "grad_norm": 0.11129403859376907, + "learning_rate": 3.1575201470825356e-05, + "loss": 0.095, + "num_input_tokens_seen": 18178400, + "step": 86130 + }, + { + "epoch": 9.475797579757975, + "grad_norm": 0.08206845819950104, + "learning_rate": 3.157288587045733e-05, + "loss": 0.009, + "num_input_tokens_seen": 18179424, + "step": 86135 + }, + { + "epoch": 9.476347634763476, + "grad_norm": 0.05903882533311844, + "learning_rate": 3.157057020951016e-05, + "loss": 0.0007, + "num_input_tokens_seen": 18180512, + "step": 86140 + }, + { + "epoch": 9.476897689768977, + "grad_norm": 0.6584845781326294, + "learning_rate": 3.1568254488005207e-05, + "loss": 0.0494, + "num_input_tokens_seen": 18181600, + "step": 86145 + }, + { + "epoch": 9.477447744774478, + "grad_norm": 0.11665046215057373, + "learning_rate": 3.15659387059638e-05, + "loss": 0.0822, + "num_input_tokens_seen": 18182688, + "step": 86150 + }, + { + "epoch": 9.477997799779978, + "grad_norm": 0.013619002886116505, + "learning_rate": 3.156362286340729e-05, + "loss": 0.0009, + "num_input_tokens_seen": 18183680, + "step": 86155 + }, + { + "epoch": 9.478547854785479, + "grad_norm": 0.4377985894680023, + "learning_rate": 3.156130696035702e-05, + "loss": 0.0615, + "num_input_tokens_seen": 18184832, + "step": 86160 + }, + { + "epoch": 9.47909790979098, + "grad_norm": 0.727353572845459, + "learning_rate": 3.155899099683433e-05, + "loss": 0.0201, + "num_input_tokens_seen": 18185856, + "step": 86165 + }, + { + "epoch": 9.479647964796479, + "grad_norm": 0.19148772954940796, + "learning_rate": 3.155667497286056e-05, + "loss": 0.0042, + "num_input_tokens_seen": 18186944, + "step": 86170 + }, + { + "epoch": 9.48019801980198, + "grad_norm": 0.07517523318529129, + "learning_rate": 3.1554358888457055e-05, + "loss": 0.1869, + "num_input_tokens_seen": 18188032, + "step": 86175 + }, + { + "epoch": 9.480748074807481, + "grad_norm": 0.021773748099803925, + "learning_rate": 3.155204274364519e-05, + "loss": 0.1401, + "num_input_tokens_seen": 18189056, + "step": 86180 + }, + { + "epoch": 9.48129812981298, + "grad_norm": 1.104835867881775, + "learning_rate": 3.1549726538446286e-05, + "loss": 0.0652, + "num_input_tokens_seen": 18190176, + "step": 86185 + }, + { + "epoch": 9.481848184818482, + "grad_norm": 0.016087761148810387, + "learning_rate": 3.154741027288169e-05, + "loss": 0.0198, + "num_input_tokens_seen": 18191168, + "step": 86190 + }, + { + "epoch": 9.482398239823983, + "grad_norm": 0.03515402227640152, + "learning_rate": 3.154509394697276e-05, + "loss": 0.0021, + "num_input_tokens_seen": 18192192, + "step": 86195 + }, + { + "epoch": 9.482948294829482, + "grad_norm": 0.583152174949646, + "learning_rate": 3.1542777560740835e-05, + "loss": 0.0984, + "num_input_tokens_seen": 18193280, + "step": 86200 + }, + { + "epoch": 9.483498349834983, + "grad_norm": 0.05326269939541817, + "learning_rate": 3.154046111420726e-05, + "loss": 0.0695, + "num_input_tokens_seen": 18194400, + "step": 86205 + }, + { + "epoch": 9.484048404840484, + "grad_norm": 1.5932302474975586, + "learning_rate": 3.153814460739342e-05, + "loss": 0.07, + "num_input_tokens_seen": 18195488, + "step": 86210 + }, + { + "epoch": 9.484598459845985, + "grad_norm": 0.33623501658439636, + "learning_rate": 3.153582804032062e-05, + "loss": 0.0547, + "num_input_tokens_seen": 18196576, + "step": 86215 + }, + { + "epoch": 9.485148514851485, + "grad_norm": 1.4617267847061157, + "learning_rate": 3.1533511413010225e-05, + "loss": 0.1675, + "num_input_tokens_seen": 18197600, + "step": 86220 + }, + { + "epoch": 9.485698569856986, + "grad_norm": 0.03813765570521355, + "learning_rate": 3.1531194725483595e-05, + "loss": 0.0104, + "num_input_tokens_seen": 18198720, + "step": 86225 + }, + { + "epoch": 9.486248624862487, + "grad_norm": 0.07875795662403107, + "learning_rate": 3.152887797776208e-05, + "loss": 0.0142, + "num_input_tokens_seen": 18199840, + "step": 86230 + }, + { + "epoch": 9.486798679867986, + "grad_norm": 0.02277985028922558, + "learning_rate": 3.1526561169867034e-05, + "loss": 0.0507, + "num_input_tokens_seen": 18200864, + "step": 86235 + }, + { + "epoch": 9.487348734873487, + "grad_norm": 0.20074564218521118, + "learning_rate": 3.15242443018198e-05, + "loss": 0.0103, + "num_input_tokens_seen": 18201984, + "step": 86240 + }, + { + "epoch": 9.487898789878988, + "grad_norm": 1.4940348863601685, + "learning_rate": 3.152192737364174e-05, + "loss": 0.0402, + "num_input_tokens_seen": 18203008, + "step": 86245 + }, + { + "epoch": 9.488448844884488, + "grad_norm": 0.019558077678084373, + "learning_rate": 3.15196103853542e-05, + "loss": 0.0727, + "num_input_tokens_seen": 18204064, + "step": 86250 + }, + { + "epoch": 9.488998899889989, + "grad_norm": 0.019493553787469864, + "learning_rate": 3.151729333697854e-05, + "loss": 0.0076, + "num_input_tokens_seen": 18205152, + "step": 86255 + }, + { + "epoch": 9.48954895489549, + "grad_norm": 0.5383972525596619, + "learning_rate": 3.1514976228536116e-05, + "loss": 0.0357, + "num_input_tokens_seen": 18206208, + "step": 86260 + }, + { + "epoch": 9.490099009900991, + "grad_norm": 0.02179563045501709, + "learning_rate": 3.151265906004828e-05, + "loss": 0.0033, + "num_input_tokens_seen": 18207296, + "step": 86265 + }, + { + "epoch": 9.49064906490649, + "grad_norm": 0.013682018034160137, + "learning_rate": 3.15103418315364e-05, + "loss": 0.0039, + "num_input_tokens_seen": 18208352, + "step": 86270 + }, + { + "epoch": 9.491199119911991, + "grad_norm": 0.17782293260097504, + "learning_rate": 3.1508024543021814e-05, + "loss": 0.0069, + "num_input_tokens_seen": 18209376, + "step": 86275 + }, + { + "epoch": 9.491749174917492, + "grad_norm": 0.12707626819610596, + "learning_rate": 3.150570719452589e-05, + "loss": 0.0084, + "num_input_tokens_seen": 18210560, + "step": 86280 + }, + { + "epoch": 9.492299229922992, + "grad_norm": 0.9271699786186218, + "learning_rate": 3.150338978606999e-05, + "loss": 0.0115, + "num_input_tokens_seen": 18211616, + "step": 86285 + }, + { + "epoch": 9.492849284928493, + "grad_norm": 0.034577831625938416, + "learning_rate": 3.1501072317675464e-05, + "loss": 0.0042, + "num_input_tokens_seen": 18212608, + "step": 86290 + }, + { + "epoch": 9.493399339933994, + "grad_norm": 0.02982030063867569, + "learning_rate": 3.149875478936366e-05, + "loss": 0.0304, + "num_input_tokens_seen": 18213664, + "step": 86295 + }, + { + "epoch": 9.493949394939493, + "grad_norm": 0.008794387802481651, + "learning_rate": 3.149643720115597e-05, + "loss": 0.0025, + "num_input_tokens_seen": 18214752, + "step": 86300 + }, + { + "epoch": 9.494499449944994, + "grad_norm": 2.5618770122528076, + "learning_rate": 3.149411955307373e-05, + "loss": 0.0714, + "num_input_tokens_seen": 18215872, + "step": 86305 + }, + { + "epoch": 9.495049504950495, + "grad_norm": 0.12263676524162292, + "learning_rate": 3.149180184513831e-05, + "loss": 0.0036, + "num_input_tokens_seen": 18216896, + "step": 86310 + }, + { + "epoch": 9.495599559955995, + "grad_norm": 2.1241202354431152, + "learning_rate": 3.148948407737106e-05, + "loss": 0.1889, + "num_input_tokens_seen": 18217920, + "step": 86315 + }, + { + "epoch": 9.496149614961496, + "grad_norm": 0.025714289397001266, + "learning_rate": 3.1487166249793356e-05, + "loss": 0.0465, + "num_input_tokens_seen": 18219040, + "step": 86320 + }, + { + "epoch": 9.496699669966997, + "grad_norm": 1.1806190013885498, + "learning_rate": 3.1484848362426545e-05, + "loss": 0.0784, + "num_input_tokens_seen": 18220096, + "step": 86325 + }, + { + "epoch": 9.497249724972498, + "grad_norm": 0.021584751084446907, + "learning_rate": 3.148253041529201e-05, + "loss": 0.0337, + "num_input_tokens_seen": 18221120, + "step": 86330 + }, + { + "epoch": 9.497799779977997, + "grad_norm": 0.06581201404333115, + "learning_rate": 3.1480212408411095e-05, + "loss": 0.1186, + "num_input_tokens_seen": 18222144, + "step": 86335 + }, + { + "epoch": 9.498349834983498, + "grad_norm": 0.09531406313180923, + "learning_rate": 3.147789434180517e-05, + "loss": 0.0076, + "num_input_tokens_seen": 18223168, + "step": 86340 + }, + { + "epoch": 9.498899889989, + "grad_norm": 0.004980073776096106, + "learning_rate": 3.1475576215495605e-05, + "loss": 0.0152, + "num_input_tokens_seen": 18224224, + "step": 86345 + }, + { + "epoch": 9.499449944994499, + "grad_norm": 0.3616233766078949, + "learning_rate": 3.147325802950376e-05, + "loss": 0.0351, + "num_input_tokens_seen": 18225280, + "step": 86350 + }, + { + "epoch": 9.5, + "grad_norm": 0.01977662742137909, + "learning_rate": 3.147093978385101e-05, + "loss": 0.0475, + "num_input_tokens_seen": 18226336, + "step": 86355 + }, + { + "epoch": 9.500550055005501, + "grad_norm": 0.2772860825061798, + "learning_rate": 3.1468621478558704e-05, + "loss": 0.0769, + "num_input_tokens_seen": 18227456, + "step": 86360 + }, + { + "epoch": 9.501100110011, + "grad_norm": 0.1252962201833725, + "learning_rate": 3.146630311364821e-05, + "loss": 0.005, + "num_input_tokens_seen": 18228576, + "step": 86365 + }, + { + "epoch": 9.501650165016502, + "grad_norm": 0.09192502498626709, + "learning_rate": 3.1463984689140915e-05, + "loss": 0.0043, + "num_input_tokens_seen": 18229600, + "step": 86370 + }, + { + "epoch": 9.502200220022003, + "grad_norm": 0.19427815079689026, + "learning_rate": 3.146166620505818e-05, + "loss": 0.1209, + "num_input_tokens_seen": 18230688, + "step": 86375 + }, + { + "epoch": 9.502750275027502, + "grad_norm": 0.030182262882590294, + "learning_rate": 3.145934766142135e-05, + "loss": 0.0217, + "num_input_tokens_seen": 18231744, + "step": 86380 + }, + { + "epoch": 9.503300330033003, + "grad_norm": 1.6512302160263062, + "learning_rate": 3.1457029058251824e-05, + "loss": 0.1158, + "num_input_tokens_seen": 18232832, + "step": 86385 + }, + { + "epoch": 9.503850385038504, + "grad_norm": 0.2766728401184082, + "learning_rate": 3.145471039557096e-05, + "loss": 0.0086, + "num_input_tokens_seen": 18233888, + "step": 86390 + }, + { + "epoch": 9.504400440044005, + "grad_norm": 0.47793111205101013, + "learning_rate": 3.1452391673400123e-05, + "loss": 0.0116, + "num_input_tokens_seen": 18234912, + "step": 86395 + }, + { + "epoch": 9.504950495049505, + "grad_norm": 0.05277711898088455, + "learning_rate": 3.1450072891760694e-05, + "loss": 0.0034, + "num_input_tokens_seen": 18235968, + "step": 86400 + }, + { + "epoch": 9.505500550055006, + "grad_norm": 0.03600500896573067, + "learning_rate": 3.144775405067404e-05, + "loss": 0.0122, + "num_input_tokens_seen": 18236992, + "step": 86405 + }, + { + "epoch": 9.506050605060507, + "grad_norm": 0.04200342670083046, + "learning_rate": 3.144543515016152e-05, + "loss": 0.0106, + "num_input_tokens_seen": 18238048, + "step": 86410 + }, + { + "epoch": 9.506600660066006, + "grad_norm": 0.15185049176216125, + "learning_rate": 3.144311619024452e-05, + "loss": 0.0542, + "num_input_tokens_seen": 18239136, + "step": 86415 + }, + { + "epoch": 9.507150715071507, + "grad_norm": 0.025305405259132385, + "learning_rate": 3.144079717094441e-05, + "loss": 0.0168, + "num_input_tokens_seen": 18240224, + "step": 86420 + }, + { + "epoch": 9.507700770077008, + "grad_norm": 0.05084151774644852, + "learning_rate": 3.143847809228257e-05, + "loss": 0.0303, + "num_input_tokens_seen": 18241248, + "step": 86425 + }, + { + "epoch": 9.508250825082508, + "grad_norm": 0.010757746174931526, + "learning_rate": 3.1436158954280363e-05, + "loss": 0.0296, + "num_input_tokens_seen": 18242272, + "step": 86430 + }, + { + "epoch": 9.508800880088009, + "grad_norm": 0.028822841122746468, + "learning_rate": 3.143383975695917e-05, + "loss": 0.0572, + "num_input_tokens_seen": 18243360, + "step": 86435 + }, + { + "epoch": 9.50935093509351, + "grad_norm": 0.04277975484728813, + "learning_rate": 3.1431520500340356e-05, + "loss": 0.0092, + "num_input_tokens_seen": 18244448, + "step": 86440 + }, + { + "epoch": 9.509900990099009, + "grad_norm": 0.7886396646499634, + "learning_rate": 3.1429201184445304e-05, + "loss": 0.0164, + "num_input_tokens_seen": 18245568, + "step": 86445 + }, + { + "epoch": 9.51045104510451, + "grad_norm": 0.02376275323331356, + "learning_rate": 3.14268818092954e-05, + "loss": 0.0334, + "num_input_tokens_seen": 18246688, + "step": 86450 + }, + { + "epoch": 9.511001100110011, + "grad_norm": 0.04318606108427048, + "learning_rate": 3.1424562374912e-05, + "loss": 0.0425, + "num_input_tokens_seen": 18247744, + "step": 86455 + }, + { + "epoch": 9.511551155115512, + "grad_norm": 0.0447084866464138, + "learning_rate": 3.14222428813165e-05, + "loss": 0.0714, + "num_input_tokens_seen": 18248800, + "step": 86460 + }, + { + "epoch": 9.512101210121012, + "grad_norm": 0.1920393407344818, + "learning_rate": 3.141992332853026e-05, + "loss": 0.0099, + "num_input_tokens_seen": 18249792, + "step": 86465 + }, + { + "epoch": 9.512651265126513, + "grad_norm": 0.23214617371559143, + "learning_rate": 3.141760371657467e-05, + "loss": 0.0175, + "num_input_tokens_seen": 18250816, + "step": 86470 + }, + { + "epoch": 9.513201320132014, + "grad_norm": 0.0724041685461998, + "learning_rate": 3.1415284045471114e-05, + "loss": 0.0055, + "num_input_tokens_seen": 18251904, + "step": 86475 + }, + { + "epoch": 9.513751375137513, + "grad_norm": 0.10086182504892349, + "learning_rate": 3.141296431524095e-05, + "loss": 0.0144, + "num_input_tokens_seen": 18252896, + "step": 86480 + }, + { + "epoch": 9.514301430143014, + "grad_norm": 1.468267798423767, + "learning_rate": 3.141064452590558e-05, + "loss": 0.0788, + "num_input_tokens_seen": 18254016, + "step": 86485 + }, + { + "epoch": 9.514851485148515, + "grad_norm": 0.019675238057971, + "learning_rate": 3.1408324677486375e-05, + "loss": 0.0062, + "num_input_tokens_seen": 18255072, + "step": 86490 + }, + { + "epoch": 9.515401540154015, + "grad_norm": 0.049390606582164764, + "learning_rate": 3.1406004770004714e-05, + "loss": 0.0018, + "num_input_tokens_seen": 18256128, + "step": 86495 + }, + { + "epoch": 9.515951595159516, + "grad_norm": 0.024102654308080673, + "learning_rate": 3.140368480348198e-05, + "loss": 0.0018, + "num_input_tokens_seen": 18257184, + "step": 86500 + }, + { + "epoch": 9.516501650165017, + "grad_norm": 0.008367469534277916, + "learning_rate": 3.1401364777939564e-05, + "loss": 0.0453, + "num_input_tokens_seen": 18258144, + "step": 86505 + }, + { + "epoch": 9.517051705170516, + "grad_norm": 0.38702231645584106, + "learning_rate": 3.1399044693398825e-05, + "loss": 0.1452, + "num_input_tokens_seen": 18259232, + "step": 86510 + }, + { + "epoch": 9.517601760176017, + "grad_norm": 0.027058077976107597, + "learning_rate": 3.1396724549881175e-05, + "loss": 0.022, + "num_input_tokens_seen": 18260256, + "step": 86515 + }, + { + "epoch": 9.518151815181518, + "grad_norm": 0.1526520550251007, + "learning_rate": 3.1394404347407985e-05, + "loss": 0.1322, + "num_input_tokens_seen": 18261312, + "step": 86520 + }, + { + "epoch": 9.51870187018702, + "grad_norm": 0.19280678033828735, + "learning_rate": 3.139208408600063e-05, + "loss": 0.0513, + "num_input_tokens_seen": 18262400, + "step": 86525 + }, + { + "epoch": 9.519251925192519, + "grad_norm": 0.05315448343753815, + "learning_rate": 3.138976376568051e-05, + "loss": 0.0033, + "num_input_tokens_seen": 18263424, + "step": 86530 + }, + { + "epoch": 9.51980198019802, + "grad_norm": 0.005158515181392431, + "learning_rate": 3.1387443386468995e-05, + "loss": 0.0032, + "num_input_tokens_seen": 18264512, + "step": 86535 + }, + { + "epoch": 9.520352035203521, + "grad_norm": 0.0212937593460083, + "learning_rate": 3.1385122948387486e-05, + "loss": 0.0152, + "num_input_tokens_seen": 18265568, + "step": 86540 + }, + { + "epoch": 9.52090209020902, + "grad_norm": 0.06190183013677597, + "learning_rate": 3.1382802451457366e-05, + "loss": 0.0039, + "num_input_tokens_seen": 18266688, + "step": 86545 + }, + { + "epoch": 9.521452145214521, + "grad_norm": 0.3973468244075775, + "learning_rate": 3.1380481895700015e-05, + "loss": 0.0098, + "num_input_tokens_seen": 18267808, + "step": 86550 + }, + { + "epoch": 9.522002200220022, + "grad_norm": 0.003902910742908716, + "learning_rate": 3.137816128113682e-05, + "loss": 0.0129, + "num_input_tokens_seen": 18268832, + "step": 86555 + }, + { + "epoch": 9.522552255225522, + "grad_norm": 0.8163953423500061, + "learning_rate": 3.137584060778918e-05, + "loss": 0.0462, + "num_input_tokens_seen": 18269888, + "step": 86560 + }, + { + "epoch": 9.523102310231023, + "grad_norm": 0.1036146730184555, + "learning_rate": 3.137351987567847e-05, + "loss": 0.0487, + "num_input_tokens_seen": 18271008, + "step": 86565 + }, + { + "epoch": 9.523652365236524, + "grad_norm": 0.1094936653971672, + "learning_rate": 3.137119908482609e-05, + "loss": 0.0063, + "num_input_tokens_seen": 18272064, + "step": 86570 + }, + { + "epoch": 9.524202420242025, + "grad_norm": 1.8751024007797241, + "learning_rate": 3.1368878235253417e-05, + "loss": 0.0467, + "num_input_tokens_seen": 18273056, + "step": 86575 + }, + { + "epoch": 9.524752475247524, + "grad_norm": 0.8729297518730164, + "learning_rate": 3.136655732698186e-05, + "loss": 0.0224, + "num_input_tokens_seen": 18274048, + "step": 86580 + }, + { + "epoch": 9.525302530253025, + "grad_norm": 0.016675105318427086, + "learning_rate": 3.136423636003279e-05, + "loss": 0.0133, + "num_input_tokens_seen": 18275104, + "step": 86585 + }, + { + "epoch": 9.525852585258527, + "grad_norm": 0.0038710935041308403, + "learning_rate": 3.136191533442762e-05, + "loss": 0.0156, + "num_input_tokens_seen": 18276224, + "step": 86590 + }, + { + "epoch": 9.526402640264026, + "grad_norm": 1.3264353275299072, + "learning_rate": 3.1359594250187716e-05, + "loss": 0.0467, + "num_input_tokens_seen": 18277248, + "step": 86595 + }, + { + "epoch": 9.526952695269527, + "grad_norm": 0.007225881330668926, + "learning_rate": 3.135727310733448e-05, + "loss": 0.0432, + "num_input_tokens_seen": 18278240, + "step": 86600 + }, + { + "epoch": 9.527502750275028, + "grad_norm": 0.48579633235931396, + "learning_rate": 3.135495190588932e-05, + "loss": 0.0902, + "num_input_tokens_seen": 18279264, + "step": 86605 + }, + { + "epoch": 9.528052805280527, + "grad_norm": 0.021071670576930046, + "learning_rate": 3.135263064587362e-05, + "loss": 0.0865, + "num_input_tokens_seen": 18280320, + "step": 86610 + }, + { + "epoch": 9.528602860286028, + "grad_norm": 0.012962882407009602, + "learning_rate": 3.135030932730877e-05, + "loss": 0.0389, + "num_input_tokens_seen": 18281408, + "step": 86615 + }, + { + "epoch": 9.52915291529153, + "grad_norm": 0.0033021883573383093, + "learning_rate": 3.134798795021616e-05, + "loss": 0.0049, + "num_input_tokens_seen": 18282432, + "step": 86620 + }, + { + "epoch": 9.52970297029703, + "grad_norm": 1.5014482736587524, + "learning_rate": 3.134566651461719e-05, + "loss": 0.1147, + "num_input_tokens_seen": 18283520, + "step": 86625 + }, + { + "epoch": 9.53025302530253, + "grad_norm": 0.22339360415935516, + "learning_rate": 3.134334502053326e-05, + "loss": 0.0685, + "num_input_tokens_seen": 18284512, + "step": 86630 + }, + { + "epoch": 9.530803080308031, + "grad_norm": 0.08625205606222153, + "learning_rate": 3.1341023467985764e-05, + "loss": 0.0072, + "num_input_tokens_seen": 18285600, + "step": 86635 + }, + { + "epoch": 9.531353135313532, + "grad_norm": 1.1177860498428345, + "learning_rate": 3.13387018569961e-05, + "loss": 0.1032, + "num_input_tokens_seen": 18286688, + "step": 86640 + }, + { + "epoch": 9.531903190319031, + "grad_norm": 0.2053627371788025, + "learning_rate": 3.133638018758566e-05, + "loss": 0.0691, + "num_input_tokens_seen": 18287712, + "step": 86645 + }, + { + "epoch": 9.532453245324533, + "grad_norm": 1.595988154411316, + "learning_rate": 3.133405845977584e-05, + "loss": 0.0929, + "num_input_tokens_seen": 18288800, + "step": 86650 + }, + { + "epoch": 9.533003300330034, + "grad_norm": 0.04401170462369919, + "learning_rate": 3.133173667358804e-05, + "loss": 0.0245, + "num_input_tokens_seen": 18289888, + "step": 86655 + }, + { + "epoch": 9.533553355335533, + "grad_norm": 0.01744718849658966, + "learning_rate": 3.132941482904367e-05, + "loss": 0.0071, + "num_input_tokens_seen": 18290976, + "step": 86660 + }, + { + "epoch": 9.534103410341034, + "grad_norm": 0.5527759194374084, + "learning_rate": 3.1327092926164126e-05, + "loss": 0.0102, + "num_input_tokens_seen": 18292064, + "step": 86665 + }, + { + "epoch": 9.534653465346535, + "grad_norm": 0.39325326681137085, + "learning_rate": 3.1324770964970786e-05, + "loss": 0.0124, + "num_input_tokens_seen": 18293120, + "step": 86670 + }, + { + "epoch": 9.535203520352034, + "grad_norm": 0.009320263750851154, + "learning_rate": 3.132244894548508e-05, + "loss": 0.0118, + "num_input_tokens_seen": 18294144, + "step": 86675 + }, + { + "epoch": 9.535753575357536, + "grad_norm": 0.12336353957653046, + "learning_rate": 3.132012686772839e-05, + "loss": 0.0321, + "num_input_tokens_seen": 18295200, + "step": 86680 + }, + { + "epoch": 9.536303630363037, + "grad_norm": 0.023013640195131302, + "learning_rate": 3.131780473172212e-05, + "loss": 0.0026, + "num_input_tokens_seen": 18296352, + "step": 86685 + }, + { + "epoch": 9.536853685368538, + "grad_norm": 0.017679117619991302, + "learning_rate": 3.131548253748768e-05, + "loss": 0.0173, + "num_input_tokens_seen": 18297344, + "step": 86690 + }, + { + "epoch": 9.537403740374037, + "grad_norm": 0.07157289236783981, + "learning_rate": 3.131316028504647e-05, + "loss": 0.0027, + "num_input_tokens_seen": 18298432, + "step": 86695 + }, + { + "epoch": 9.537953795379538, + "grad_norm": 0.012205448001623154, + "learning_rate": 3.1310837974419885e-05, + "loss": 0.0125, + "num_input_tokens_seen": 18299520, + "step": 86700 + }, + { + "epoch": 9.53850385038504, + "grad_norm": 0.011229592375457287, + "learning_rate": 3.130851560562934e-05, + "loss": 0.0046, + "num_input_tokens_seen": 18300672, + "step": 86705 + }, + { + "epoch": 9.539053905390539, + "grad_norm": 0.012102779932320118, + "learning_rate": 3.130619317869624e-05, + "loss": 0.0238, + "num_input_tokens_seen": 18301728, + "step": 86710 + }, + { + "epoch": 9.53960396039604, + "grad_norm": 0.006055423524230719, + "learning_rate": 3.130387069364197e-05, + "loss": 0.0205, + "num_input_tokens_seen": 18302816, + "step": 86715 + }, + { + "epoch": 9.54015401540154, + "grad_norm": 2.159069776535034, + "learning_rate": 3.1301548150487956e-05, + "loss": 0.0884, + "num_input_tokens_seen": 18303872, + "step": 86720 + }, + { + "epoch": 9.54070407040704, + "grad_norm": 0.020147530362010002, + "learning_rate": 3.1299225549255596e-05, + "loss": 0.0218, + "num_input_tokens_seen": 18304896, + "step": 86725 + }, + { + "epoch": 9.541254125412541, + "grad_norm": 0.11285814642906189, + "learning_rate": 3.129690288996629e-05, + "loss": 0.0056, + "num_input_tokens_seen": 18305984, + "step": 86730 + }, + { + "epoch": 9.541804180418042, + "grad_norm": 0.024432972073554993, + "learning_rate": 3.1294580172641465e-05, + "loss": 0.0034, + "num_input_tokens_seen": 18307008, + "step": 86735 + }, + { + "epoch": 9.542354235423542, + "grad_norm": 1.0534160137176514, + "learning_rate": 3.1292257397302506e-05, + "loss": 0.0208, + "num_input_tokens_seen": 18308064, + "step": 86740 + }, + { + "epoch": 9.542904290429043, + "grad_norm": 0.028685312718153, + "learning_rate": 3.1289934563970826e-05, + "loss": 0.0112, + "num_input_tokens_seen": 18309120, + "step": 86745 + }, + { + "epoch": 9.543454345434544, + "grad_norm": 0.11241747438907623, + "learning_rate": 3.1287611672667846e-05, + "loss": 0.0871, + "num_input_tokens_seen": 18310112, + "step": 86750 + }, + { + "epoch": 9.544004400440045, + "grad_norm": 0.0313241146504879, + "learning_rate": 3.1285288723414964e-05, + "loss": 0.0021, + "num_input_tokens_seen": 18311264, + "step": 86755 + }, + { + "epoch": 9.544554455445544, + "grad_norm": 0.04894028604030609, + "learning_rate": 3.1282965716233594e-05, + "loss": 0.0031, + "num_input_tokens_seen": 18312384, + "step": 86760 + }, + { + "epoch": 9.545104510451045, + "grad_norm": 0.29300829768180847, + "learning_rate": 3.1280642651145135e-05, + "loss": 0.006, + "num_input_tokens_seen": 18313440, + "step": 86765 + }, + { + "epoch": 9.545654565456546, + "grad_norm": 0.1195359081029892, + "learning_rate": 3.127831952817101e-05, + "loss": 0.0053, + "num_input_tokens_seen": 18314464, + "step": 86770 + }, + { + "epoch": 9.546204620462046, + "grad_norm": 0.7357729077339172, + "learning_rate": 3.127599634733263e-05, + "loss": 0.0443, + "num_input_tokens_seen": 18315488, + "step": 86775 + }, + { + "epoch": 9.546754675467547, + "grad_norm": 0.02992403134703636, + "learning_rate": 3.12736731086514e-05, + "loss": 0.0204, + "num_input_tokens_seen": 18316512, + "step": 86780 + }, + { + "epoch": 9.547304730473048, + "grad_norm": 2.702390432357788, + "learning_rate": 3.1271349812148736e-05, + "loss": 0.1416, + "num_input_tokens_seen": 18317536, + "step": 86785 + }, + { + "epoch": 9.547854785478547, + "grad_norm": 0.0667332336306572, + "learning_rate": 3.1269026457846054e-05, + "loss": 0.0274, + "num_input_tokens_seen": 18318560, + "step": 86790 + }, + { + "epoch": 9.548404840484048, + "grad_norm": 0.07468107342720032, + "learning_rate": 3.126670304576476e-05, + "loss": 0.0132, + "num_input_tokens_seen": 18319616, + "step": 86795 + }, + { + "epoch": 9.54895489548955, + "grad_norm": 0.01215369999408722, + "learning_rate": 3.126437957592627e-05, + "loss": 0.0119, + "num_input_tokens_seen": 18320704, + "step": 86800 + }, + { + "epoch": 9.549504950495049, + "grad_norm": 0.9021347761154175, + "learning_rate": 3.126205604835201e-05, + "loss": 0.1279, + "num_input_tokens_seen": 18321728, + "step": 86805 + }, + { + "epoch": 9.55005500550055, + "grad_norm": 0.04976046085357666, + "learning_rate": 3.125973246306337e-05, + "loss": 0.0033, + "num_input_tokens_seen": 18322720, + "step": 86810 + }, + { + "epoch": 9.55060506050605, + "grad_norm": 1.3364362716674805, + "learning_rate": 3.125740882008178e-05, + "loss": 0.0321, + "num_input_tokens_seen": 18323808, + "step": 86815 + }, + { + "epoch": 9.551155115511552, + "grad_norm": 0.039754025638103485, + "learning_rate": 3.125508511942866e-05, + "loss": 0.0023, + "num_input_tokens_seen": 18324832, + "step": 86820 + }, + { + "epoch": 9.551705170517051, + "grad_norm": 0.3424803912639618, + "learning_rate": 3.125276136112542e-05, + "loss": 0.1695, + "num_input_tokens_seen": 18325888, + "step": 86825 + }, + { + "epoch": 9.552255225522552, + "grad_norm": 0.023260755464434624, + "learning_rate": 3.1250437545193484e-05, + "loss": 0.0337, + "num_input_tokens_seen": 18326912, + "step": 86830 + }, + { + "epoch": 9.552805280528053, + "grad_norm": 0.803911030292511, + "learning_rate": 3.124811367165426e-05, + "loss": 0.0191, + "num_input_tokens_seen": 18328064, + "step": 86835 + }, + { + "epoch": 9.553355335533553, + "grad_norm": 0.12037836760282516, + "learning_rate": 3.1245789740529174e-05, + "loss": 0.0096, + "num_input_tokens_seen": 18329056, + "step": 86840 + }, + { + "epoch": 9.553905390539054, + "grad_norm": 0.24388475716114044, + "learning_rate": 3.1243465751839635e-05, + "loss": 0.0112, + "num_input_tokens_seen": 18330080, + "step": 86845 + }, + { + "epoch": 9.554455445544555, + "grad_norm": 0.007917952723801136, + "learning_rate": 3.124114170560708e-05, + "loss": 0.008, + "num_input_tokens_seen": 18331168, + "step": 86850 + }, + { + "epoch": 9.555005500550054, + "grad_norm": 0.03575631231069565, + "learning_rate": 3.123881760185291e-05, + "loss": 0.0091, + "num_input_tokens_seen": 18332160, + "step": 86855 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 0.41320565342903137, + "learning_rate": 3.123649344059854e-05, + "loss": 0.0282, + "num_input_tokens_seen": 18333184, + "step": 86860 + }, + { + "epoch": 9.556105610561056, + "grad_norm": 1.581335425376892, + "learning_rate": 3.1234169221865415e-05, + "loss": 0.2271, + "num_input_tokens_seen": 18334240, + "step": 86865 + }, + { + "epoch": 9.556655665566556, + "grad_norm": 0.02912134677171707, + "learning_rate": 3.1231844945674943e-05, + "loss": 0.0332, + "num_input_tokens_seen": 18335328, + "step": 86870 + }, + { + "epoch": 9.557205720572057, + "grad_norm": 0.018552294000983238, + "learning_rate": 3.122952061204854e-05, + "loss": 0.004, + "num_input_tokens_seen": 18336416, + "step": 86875 + }, + { + "epoch": 9.557755775577558, + "grad_norm": 0.2139059603214264, + "learning_rate": 3.122719622100765e-05, + "loss": 0.0097, + "num_input_tokens_seen": 18337472, + "step": 86880 + }, + { + "epoch": 9.558305830583059, + "grad_norm": 0.026759343221783638, + "learning_rate": 3.122487177257367e-05, + "loss": 0.0819, + "num_input_tokens_seen": 18338528, + "step": 86885 + }, + { + "epoch": 9.558855885588558, + "grad_norm": 0.0880570337176323, + "learning_rate": 3.122254726676803e-05, + "loss": 0.0856, + "num_input_tokens_seen": 18339648, + "step": 86890 + }, + { + "epoch": 9.55940594059406, + "grad_norm": 0.03979695588350296, + "learning_rate": 3.122022270361216e-05, + "loss": 0.0545, + "num_input_tokens_seen": 18340672, + "step": 86895 + }, + { + "epoch": 9.55995599559956, + "grad_norm": 0.445593923330307, + "learning_rate": 3.121789808312748e-05, + "loss": 0.0298, + "num_input_tokens_seen": 18341664, + "step": 86900 + }, + { + "epoch": 9.56050605060506, + "grad_norm": 1.4266717433929443, + "learning_rate": 3.121557340533543e-05, + "loss": 0.0252, + "num_input_tokens_seen": 18342720, + "step": 86905 + }, + { + "epoch": 9.561056105610561, + "grad_norm": 0.021173961460590363, + "learning_rate": 3.1213248670257414e-05, + "loss": 0.0167, + "num_input_tokens_seen": 18343808, + "step": 86910 + }, + { + "epoch": 9.561606160616062, + "grad_norm": 0.009243609383702278, + "learning_rate": 3.1210923877914866e-05, + "loss": 0.0031, + "num_input_tokens_seen": 18344832, + "step": 86915 + }, + { + "epoch": 9.562156215621561, + "grad_norm": 1.9529694318771362, + "learning_rate": 3.12085990283292e-05, + "loss": 0.0709, + "num_input_tokens_seen": 18345952, + "step": 86920 + }, + { + "epoch": 9.562706270627062, + "grad_norm": 0.049559470266103745, + "learning_rate": 3.120627412152188e-05, + "loss": 0.0413, + "num_input_tokens_seen": 18347008, + "step": 86925 + }, + { + "epoch": 9.563256325632564, + "grad_norm": 1.7595833539962769, + "learning_rate": 3.1203949157514296e-05, + "loss": 0.0484, + "num_input_tokens_seen": 18348096, + "step": 86930 + }, + { + "epoch": 9.563806380638063, + "grad_norm": 0.039528995752334595, + "learning_rate": 3.120162413632789e-05, + "loss": 0.0263, + "num_input_tokens_seen": 18349152, + "step": 86935 + }, + { + "epoch": 9.564356435643564, + "grad_norm": 0.8423325419425964, + "learning_rate": 3.1199299057984096e-05, + "loss": 0.011, + "num_input_tokens_seen": 18350208, + "step": 86940 + }, + { + "epoch": 9.564906490649065, + "grad_norm": 3.0258800983428955, + "learning_rate": 3.119697392250433e-05, + "loss": 0.0878, + "num_input_tokens_seen": 18351200, + "step": 86945 + }, + { + "epoch": 9.565456545654566, + "grad_norm": 0.9333046078681946, + "learning_rate": 3.1194648729910035e-05, + "loss": 0.0388, + "num_input_tokens_seen": 18352224, + "step": 86950 + }, + { + "epoch": 9.566006600660065, + "grad_norm": 0.15060995519161224, + "learning_rate": 3.119232348022263e-05, + "loss": 0.0086, + "num_input_tokens_seen": 18353248, + "step": 86955 + }, + { + "epoch": 9.566556655665567, + "grad_norm": 0.024097571149468422, + "learning_rate": 3.118999817346355e-05, + "loss": 0.0245, + "num_input_tokens_seen": 18354304, + "step": 86960 + }, + { + "epoch": 9.567106710671068, + "grad_norm": 0.11938399821519852, + "learning_rate": 3.118767280965423e-05, + "loss": 0.0669, + "num_input_tokens_seen": 18355360, + "step": 86965 + }, + { + "epoch": 9.567656765676567, + "grad_norm": 0.8522059917449951, + "learning_rate": 3.1185347388816105e-05, + "loss": 0.021, + "num_input_tokens_seen": 18356448, + "step": 86970 + }, + { + "epoch": 9.568206820682068, + "grad_norm": 0.23535968363285065, + "learning_rate": 3.1183021910970586e-05, + "loss": 0.011, + "num_input_tokens_seen": 18357504, + "step": 86975 + }, + { + "epoch": 9.56875687568757, + "grad_norm": 1.2856812477111816, + "learning_rate": 3.1180696376139136e-05, + "loss": 0.062, + "num_input_tokens_seen": 18358592, + "step": 86980 + }, + { + "epoch": 9.569306930693068, + "grad_norm": 0.0057466402649879456, + "learning_rate": 3.117837078434317e-05, + "loss": 0.0308, + "num_input_tokens_seen": 18359616, + "step": 86985 + }, + { + "epoch": 9.56985698569857, + "grad_norm": 0.005110189318656921, + "learning_rate": 3.117604513560412e-05, + "loss": 0.0379, + "num_input_tokens_seen": 18360640, + "step": 86990 + }, + { + "epoch": 9.57040704070407, + "grad_norm": 0.031129110604524612, + "learning_rate": 3.117371942994343e-05, + "loss": 0.0074, + "num_input_tokens_seen": 18361728, + "step": 86995 + }, + { + "epoch": 9.570957095709572, + "grad_norm": 0.08144344389438629, + "learning_rate": 3.117139366738253e-05, + "loss": 0.0125, + "num_input_tokens_seen": 18362720, + "step": 87000 + }, + { + "epoch": 9.571507150715071, + "grad_norm": 0.009857158176600933, + "learning_rate": 3.116906784794285e-05, + "loss": 0.0022, + "num_input_tokens_seen": 18363840, + "step": 87005 + }, + { + "epoch": 9.572057205720572, + "grad_norm": 0.04212663322687149, + "learning_rate": 3.116674197164583e-05, + "loss": 0.0431, + "num_input_tokens_seen": 18364896, + "step": 87010 + }, + { + "epoch": 9.572607260726073, + "grad_norm": 0.07021015882492065, + "learning_rate": 3.116441603851292e-05, + "loss": 0.0768, + "num_input_tokens_seen": 18366016, + "step": 87015 + }, + { + "epoch": 9.573157315731573, + "grad_norm": 1.0778566598892212, + "learning_rate": 3.1162090048565546e-05, + "loss": 0.0422, + "num_input_tokens_seen": 18367008, + "step": 87020 + }, + { + "epoch": 9.573707370737074, + "grad_norm": 0.03266467526555061, + "learning_rate": 3.115976400182514e-05, + "loss": 0.0119, + "num_input_tokens_seen": 18368064, + "step": 87025 + }, + { + "epoch": 9.574257425742575, + "grad_norm": 0.008046189323067665, + "learning_rate": 3.1157437898313144e-05, + "loss": 0.0255, + "num_input_tokens_seen": 18369152, + "step": 87030 + }, + { + "epoch": 9.574807480748074, + "grad_norm": 0.03900143876671791, + "learning_rate": 3.115511173805099e-05, + "loss": 0.0176, + "num_input_tokens_seen": 18370144, + "step": 87035 + }, + { + "epoch": 9.575357535753575, + "grad_norm": 0.007391551975160837, + "learning_rate": 3.115278552106014e-05, + "loss": 0.0008, + "num_input_tokens_seen": 18371232, + "step": 87040 + }, + { + "epoch": 9.575907590759076, + "grad_norm": 0.0694219321012497, + "learning_rate": 3.115045924736201e-05, + "loss": 0.005, + "num_input_tokens_seen": 18372320, + "step": 87045 + }, + { + "epoch": 9.576457645764577, + "grad_norm": 0.17457082867622375, + "learning_rate": 3.114813291697804e-05, + "loss": 0.1052, + "num_input_tokens_seen": 18373376, + "step": 87050 + }, + { + "epoch": 9.577007700770077, + "grad_norm": 0.022079624235630035, + "learning_rate": 3.114580652992969e-05, + "loss": 0.003, + "num_input_tokens_seen": 18374368, + "step": 87055 + }, + { + "epoch": 9.577557755775578, + "grad_norm": 2.289792537689209, + "learning_rate": 3.114348008623839e-05, + "loss": 0.2084, + "num_input_tokens_seen": 18375392, + "step": 87060 + }, + { + "epoch": 9.578107810781079, + "grad_norm": 0.02041902020573616, + "learning_rate": 3.114115358592558e-05, + "loss": 0.1, + "num_input_tokens_seen": 18376448, + "step": 87065 + }, + { + "epoch": 9.578657865786578, + "grad_norm": 0.015250672586262226, + "learning_rate": 3.113882702901271e-05, + "loss": 0.0043, + "num_input_tokens_seen": 18377408, + "step": 87070 + }, + { + "epoch": 9.57920792079208, + "grad_norm": 0.41146335005760193, + "learning_rate": 3.113650041552121e-05, + "loss": 0.0105, + "num_input_tokens_seen": 18378432, + "step": 87075 + }, + { + "epoch": 9.57975797579758, + "grad_norm": 0.10892889648675919, + "learning_rate": 3.113417374547252e-05, + "loss": 0.0035, + "num_input_tokens_seen": 18379488, + "step": 87080 + }, + { + "epoch": 9.58030803080308, + "grad_norm": 1.5861886739730835, + "learning_rate": 3.113184701888811e-05, + "loss": 0.0623, + "num_input_tokens_seen": 18380512, + "step": 87085 + }, + { + "epoch": 9.58085808580858, + "grad_norm": 0.9071252942085266, + "learning_rate": 3.1129520235789404e-05, + "loss": 0.0135, + "num_input_tokens_seen": 18381568, + "step": 87090 + }, + { + "epoch": 9.581408140814082, + "grad_norm": 1.0152456760406494, + "learning_rate": 3.112719339619785e-05, + "loss": 0.0172, + "num_input_tokens_seen": 18382592, + "step": 87095 + }, + { + "epoch": 9.581958195819581, + "grad_norm": 0.03676783666014671, + "learning_rate": 3.11248665001349e-05, + "loss": 0.0663, + "num_input_tokens_seen": 18383584, + "step": 87100 + }, + { + "epoch": 9.582508250825082, + "grad_norm": 0.02099418267607689, + "learning_rate": 3.1122539547621985e-05, + "loss": 0.0074, + "num_input_tokens_seen": 18384640, + "step": 87105 + }, + { + "epoch": 9.583058305830583, + "grad_norm": 0.039766788482666016, + "learning_rate": 3.1120212538680564e-05, + "loss": 0.0141, + "num_input_tokens_seen": 18385728, + "step": 87110 + }, + { + "epoch": 9.583608360836084, + "grad_norm": 0.047673556953668594, + "learning_rate": 3.1117885473332095e-05, + "loss": 0.1294, + "num_input_tokens_seen": 18386752, + "step": 87115 + }, + { + "epoch": 9.584158415841584, + "grad_norm": 0.054924964904785156, + "learning_rate": 3.111555835159799e-05, + "loss": 0.0025, + "num_input_tokens_seen": 18387744, + "step": 87120 + }, + { + "epoch": 9.584708470847085, + "grad_norm": 0.013151383027434349, + "learning_rate": 3.111323117349973e-05, + "loss": 0.0099, + "num_input_tokens_seen": 18388864, + "step": 87125 + }, + { + "epoch": 9.585258525852586, + "grad_norm": 0.23690515756607056, + "learning_rate": 3.111090393905874e-05, + "loss": 0.0316, + "num_input_tokens_seen": 18389920, + "step": 87130 + }, + { + "epoch": 9.585808580858085, + "grad_norm": 0.028663206845521927, + "learning_rate": 3.11085766482965e-05, + "loss": 0.0127, + "num_input_tokens_seen": 18390976, + "step": 87135 + }, + { + "epoch": 9.586358635863586, + "grad_norm": 0.23416925966739655, + "learning_rate": 3.1106249301234424e-05, + "loss": 0.0126, + "num_input_tokens_seen": 18392032, + "step": 87140 + }, + { + "epoch": 9.586908690869087, + "grad_norm": 0.032354533672332764, + "learning_rate": 3.110392189789399e-05, + "loss": 0.003, + "num_input_tokens_seen": 18393088, + "step": 87145 + }, + { + "epoch": 9.587458745874587, + "grad_norm": 0.01640629768371582, + "learning_rate": 3.110159443829662e-05, + "loss": 0.0752, + "num_input_tokens_seen": 18394144, + "step": 87150 + }, + { + "epoch": 9.588008800880088, + "grad_norm": 0.6109902858734131, + "learning_rate": 3.10992669224638e-05, + "loss": 0.0105, + "num_input_tokens_seen": 18395200, + "step": 87155 + }, + { + "epoch": 9.588558855885589, + "grad_norm": 1.5759526491165161, + "learning_rate": 3.109693935041696e-05, + "loss": 0.0777, + "num_input_tokens_seen": 18396256, + "step": 87160 + }, + { + "epoch": 9.589108910891088, + "grad_norm": 0.011274825781583786, + "learning_rate": 3.109461172217755e-05, + "loss": 0.0242, + "num_input_tokens_seen": 18397312, + "step": 87165 + }, + { + "epoch": 9.58965896589659, + "grad_norm": 0.013794980011880398, + "learning_rate": 3.109228403776703e-05, + "loss": 0.0032, + "num_input_tokens_seen": 18398368, + "step": 87170 + }, + { + "epoch": 9.59020902090209, + "grad_norm": 1.1674940586090088, + "learning_rate": 3.1089956297206855e-05, + "loss": 0.0908, + "num_input_tokens_seen": 18399456, + "step": 87175 + }, + { + "epoch": 9.590759075907592, + "grad_norm": 0.005585734732449055, + "learning_rate": 3.1087628500518464e-05, + "loss": 0.0205, + "num_input_tokens_seen": 18400512, + "step": 87180 + }, + { + "epoch": 9.591309130913091, + "grad_norm": 0.4462507665157318, + "learning_rate": 3.1085300647723335e-05, + "loss": 0.0138, + "num_input_tokens_seen": 18401568, + "step": 87185 + }, + { + "epoch": 9.591859185918592, + "grad_norm": 0.22640815377235413, + "learning_rate": 3.10829727388429e-05, + "loss": 0.0082, + "num_input_tokens_seen": 18402624, + "step": 87190 + }, + { + "epoch": 9.592409240924093, + "grad_norm": 0.07247261703014374, + "learning_rate": 3.108064477389863e-05, + "loss": 0.0091, + "num_input_tokens_seen": 18403712, + "step": 87195 + }, + { + "epoch": 9.592959295929592, + "grad_norm": 0.04299221560359001, + "learning_rate": 3.1078316752911975e-05, + "loss": 0.0229, + "num_input_tokens_seen": 18404736, + "step": 87200 + }, + { + "epoch": 9.593509350935093, + "grad_norm": 0.032254938036203384, + "learning_rate": 3.1075988675904386e-05, + "loss": 0.0125, + "num_input_tokens_seen": 18405760, + "step": 87205 + }, + { + "epoch": 9.594059405940595, + "grad_norm": 0.1292048692703247, + "learning_rate": 3.107366054289733e-05, + "loss": 0.0133, + "num_input_tokens_seen": 18406848, + "step": 87210 + }, + { + "epoch": 9.594609460946094, + "grad_norm": 0.008327485993504524, + "learning_rate": 3.1071332353912255e-05, + "loss": 0.0027, + "num_input_tokens_seen": 18407936, + "step": 87215 + }, + { + "epoch": 9.595159515951595, + "grad_norm": 0.1347479671239853, + "learning_rate": 3.1069004108970634e-05, + "loss": 0.0033, + "num_input_tokens_seen": 18408960, + "step": 87220 + }, + { + "epoch": 9.595709570957096, + "grad_norm": 0.004352455493062735, + "learning_rate": 3.10666758080939e-05, + "loss": 0.0388, + "num_input_tokens_seen": 18410112, + "step": 87225 + }, + { + "epoch": 9.596259625962595, + "grad_norm": 1.855201244354248, + "learning_rate": 3.106434745130354e-05, + "loss": 0.0734, + "num_input_tokens_seen": 18411104, + "step": 87230 + }, + { + "epoch": 9.596809680968097, + "grad_norm": 0.7672745585441589, + "learning_rate": 3.106201903862099e-05, + "loss": 0.0605, + "num_input_tokens_seen": 18412192, + "step": 87235 + }, + { + "epoch": 9.597359735973598, + "grad_norm": 1.3780806064605713, + "learning_rate": 3.105969057006772e-05, + "loss": 0.123, + "num_input_tokens_seen": 18413216, + "step": 87240 + }, + { + "epoch": 9.597909790979099, + "grad_norm": 0.08591301739215851, + "learning_rate": 3.105736204566519e-05, + "loss": 0.0164, + "num_input_tokens_seen": 18414304, + "step": 87245 + }, + { + "epoch": 9.598459845984598, + "grad_norm": 0.12620070576667786, + "learning_rate": 3.1055033465434864e-05, + "loss": 0.0054, + "num_input_tokens_seen": 18415360, + "step": 87250 + }, + { + "epoch": 9.599009900990099, + "grad_norm": 0.03213068097829819, + "learning_rate": 3.1052704829398195e-05, + "loss": 0.0026, + "num_input_tokens_seen": 18416384, + "step": 87255 + }, + { + "epoch": 9.5995599559956, + "grad_norm": 0.6424281001091003, + "learning_rate": 3.1050376137576654e-05, + "loss": 0.0626, + "num_input_tokens_seen": 18417472, + "step": 87260 + }, + { + "epoch": 9.6001100110011, + "grad_norm": 0.7039114236831665, + "learning_rate": 3.104804738999169e-05, + "loss": 0.0095, + "num_input_tokens_seen": 18418496, + "step": 87265 + }, + { + "epoch": 9.6006600660066, + "grad_norm": 0.049020808190107346, + "learning_rate": 3.1045718586664786e-05, + "loss": 0.0035, + "num_input_tokens_seen": 18419520, + "step": 87270 + }, + { + "epoch": 9.601210121012102, + "grad_norm": 0.08539260923862457, + "learning_rate": 3.1043389727617393e-05, + "loss": 0.004, + "num_input_tokens_seen": 18420544, + "step": 87275 + }, + { + "epoch": 9.601760176017601, + "grad_norm": 0.02992607280611992, + "learning_rate": 3.104106081287098e-05, + "loss": 0.0861, + "num_input_tokens_seen": 18421568, + "step": 87280 + }, + { + "epoch": 9.602310231023102, + "grad_norm": 0.007285719737410545, + "learning_rate": 3.1038731842447005e-05, + "loss": 0.0099, + "num_input_tokens_seen": 18422656, + "step": 87285 + }, + { + "epoch": 9.602860286028603, + "grad_norm": 0.17132225632667542, + "learning_rate": 3.103640281636693e-05, + "loss": 0.0117, + "num_input_tokens_seen": 18423680, + "step": 87290 + }, + { + "epoch": 9.603410341034103, + "grad_norm": 0.0047781141474843025, + "learning_rate": 3.103407373465223e-05, + "loss": 0.0037, + "num_input_tokens_seen": 18424736, + "step": 87295 + }, + { + "epoch": 9.603960396039604, + "grad_norm": 0.003310298779979348, + "learning_rate": 3.103174459732437e-05, + "loss": 0.0658, + "num_input_tokens_seen": 18425792, + "step": 87300 + }, + { + "epoch": 9.604510451045105, + "grad_norm": 0.08301179111003876, + "learning_rate": 3.102941540440482e-05, + "loss": 0.0499, + "num_input_tokens_seen": 18426816, + "step": 87305 + }, + { + "epoch": 9.605060506050606, + "grad_norm": 0.6305369734764099, + "learning_rate": 3.102708615591504e-05, + "loss": 0.0177, + "num_input_tokens_seen": 18427808, + "step": 87310 + }, + { + "epoch": 9.605610561056105, + "grad_norm": 1.902684211730957, + "learning_rate": 3.102475685187649e-05, + "loss": 0.0199, + "num_input_tokens_seen": 18428864, + "step": 87315 + }, + { + "epoch": 9.606160616061606, + "grad_norm": 0.07451727986335754, + "learning_rate": 3.102242749231065e-05, + "loss": 0.0569, + "num_input_tokens_seen": 18429888, + "step": 87320 + }, + { + "epoch": 9.606710671067107, + "grad_norm": 0.3949057161808014, + "learning_rate": 3.102009807723899e-05, + "loss": 0.0042, + "num_input_tokens_seen": 18430912, + "step": 87325 + }, + { + "epoch": 9.607260726072607, + "grad_norm": 0.06798399239778519, + "learning_rate": 3.101776860668297e-05, + "loss": 0.0687, + "num_input_tokens_seen": 18431936, + "step": 87330 + }, + { + "epoch": 9.607810781078108, + "grad_norm": 0.0174795463681221, + "learning_rate": 3.1015439080664074e-05, + "loss": 0.0058, + "num_input_tokens_seen": 18432928, + "step": 87335 + }, + { + "epoch": 9.608360836083609, + "grad_norm": 0.04000735655426979, + "learning_rate": 3.101310949920375e-05, + "loss": 0.0092, + "num_input_tokens_seen": 18433952, + "step": 87340 + }, + { + "epoch": 9.608910891089108, + "grad_norm": 0.2310987412929535, + "learning_rate": 3.1010779862323485e-05, + "loss": 0.0121, + "num_input_tokens_seen": 18434976, + "step": 87345 + }, + { + "epoch": 9.60946094609461, + "grad_norm": 0.0666140541434288, + "learning_rate": 3.1008450170044754e-05, + "loss": 0.1665, + "num_input_tokens_seen": 18436032, + "step": 87350 + }, + { + "epoch": 9.61001100110011, + "grad_norm": 0.2844596207141876, + "learning_rate": 3.100612042238901e-05, + "loss": 0.0699, + "num_input_tokens_seen": 18437088, + "step": 87355 + }, + { + "epoch": 9.61056105610561, + "grad_norm": 0.023079246282577515, + "learning_rate": 3.100379061937774e-05, + "loss": 0.1387, + "num_input_tokens_seen": 18438176, + "step": 87360 + }, + { + "epoch": 9.61111111111111, + "grad_norm": 0.12825922667980194, + "learning_rate": 3.100146076103241e-05, + "loss": 0.003, + "num_input_tokens_seen": 18439232, + "step": 87365 + }, + { + "epoch": 9.611661166116612, + "grad_norm": 0.025420550256967545, + "learning_rate": 3.09991308473745e-05, + "loss": 0.0135, + "num_input_tokens_seen": 18440256, + "step": 87370 + }, + { + "epoch": 9.612211221122113, + "grad_norm": 0.03453334420919418, + "learning_rate": 3.099680087842549e-05, + "loss": 0.0058, + "num_input_tokens_seen": 18441344, + "step": 87375 + }, + { + "epoch": 9.612761276127612, + "grad_norm": 0.007377226836979389, + "learning_rate": 3.099447085420682e-05, + "loss": 0.0417, + "num_input_tokens_seen": 18442304, + "step": 87380 + }, + { + "epoch": 9.613311331133113, + "grad_norm": 0.307388037443161, + "learning_rate": 3.099214077474e-05, + "loss": 0.1042, + "num_input_tokens_seen": 18443296, + "step": 87385 + }, + { + "epoch": 9.613861386138614, + "grad_norm": 0.21002525091171265, + "learning_rate": 3.09898106400465e-05, + "loss": 0.0249, + "num_input_tokens_seen": 18444288, + "step": 87390 + }, + { + "epoch": 9.614411441144114, + "grad_norm": 0.025084707885980606, + "learning_rate": 3.0987480450147785e-05, + "loss": 0.094, + "num_input_tokens_seen": 18445408, + "step": 87395 + }, + { + "epoch": 9.614961496149615, + "grad_norm": 0.0257914699614048, + "learning_rate": 3.098515020506534e-05, + "loss": 0.0067, + "num_input_tokens_seen": 18446464, + "step": 87400 + }, + { + "epoch": 9.615511551155116, + "grad_norm": 0.008178438059985638, + "learning_rate": 3.0982819904820635e-05, + "loss": 0.0161, + "num_input_tokens_seen": 18447456, + "step": 87405 + }, + { + "epoch": 9.616061606160617, + "grad_norm": 0.09199056774377823, + "learning_rate": 3.098048954943514e-05, + "loss": 0.0037, + "num_input_tokens_seen": 18448480, + "step": 87410 + }, + { + "epoch": 9.616611661166116, + "grad_norm": 0.011553994379937649, + "learning_rate": 3.097815913893035e-05, + "loss": 0.0346, + "num_input_tokens_seen": 18449568, + "step": 87415 + }, + { + "epoch": 9.617161716171617, + "grad_norm": 0.007678708992898464, + "learning_rate": 3.097582867332773e-05, + "loss": 0.126, + "num_input_tokens_seen": 18450624, + "step": 87420 + }, + { + "epoch": 9.617711771177119, + "grad_norm": 0.58309406042099, + "learning_rate": 3.097349815264878e-05, + "loss": 0.0169, + "num_input_tokens_seen": 18451712, + "step": 87425 + }, + { + "epoch": 9.618261826182618, + "grad_norm": 0.016533907502889633, + "learning_rate": 3.097116757691495e-05, + "loss": 0.002, + "num_input_tokens_seen": 18452800, + "step": 87430 + }, + { + "epoch": 9.618811881188119, + "grad_norm": 0.03949359431862831, + "learning_rate": 3.096883694614774e-05, + "loss": 0.0691, + "num_input_tokens_seen": 18453920, + "step": 87435 + }, + { + "epoch": 9.61936193619362, + "grad_norm": 0.03198855742812157, + "learning_rate": 3.096650626036862e-05, + "loss": 0.0074, + "num_input_tokens_seen": 18455040, + "step": 87440 + }, + { + "epoch": 9.61991199119912, + "grad_norm": 0.0899810940027237, + "learning_rate": 3.096417551959908e-05, + "loss": 0.0025, + "num_input_tokens_seen": 18456128, + "step": 87445 + }, + { + "epoch": 9.62046204620462, + "grad_norm": 0.02607041224837303, + "learning_rate": 3.0961844723860593e-05, + "loss": 0.0251, + "num_input_tokens_seen": 18457152, + "step": 87450 + }, + { + "epoch": 9.621012101210122, + "grad_norm": 0.012367467395961285, + "learning_rate": 3.095951387317464e-05, + "loss": 0.0582, + "num_input_tokens_seen": 18458176, + "step": 87455 + }, + { + "epoch": 9.62156215621562, + "grad_norm": 0.11058414727449417, + "learning_rate": 3.095718296756271e-05, + "loss": 0.0534, + "num_input_tokens_seen": 18459264, + "step": 87460 + }, + { + "epoch": 9.622112211221122, + "grad_norm": 0.03339982405304909, + "learning_rate": 3.0954852007046284e-05, + "loss": 0.0082, + "num_input_tokens_seen": 18460288, + "step": 87465 + }, + { + "epoch": 9.622662266226623, + "grad_norm": 0.09105416387319565, + "learning_rate": 3.095252099164685e-05, + "loss": 0.0491, + "num_input_tokens_seen": 18461344, + "step": 87470 + }, + { + "epoch": 9.623212321232124, + "grad_norm": 0.009698357433080673, + "learning_rate": 3.095018992138588e-05, + "loss": 0.1479, + "num_input_tokens_seen": 18462368, + "step": 87475 + }, + { + "epoch": 9.623762376237623, + "grad_norm": 0.02209620736539364, + "learning_rate": 3.0947858796284866e-05, + "loss": 0.0052, + "num_input_tokens_seen": 18463456, + "step": 87480 + }, + { + "epoch": 9.624312431243125, + "grad_norm": 0.016369586810469627, + "learning_rate": 3.094552761636529e-05, + "loss": 0.004, + "num_input_tokens_seen": 18464544, + "step": 87485 + }, + { + "epoch": 9.624862486248626, + "grad_norm": 0.047194160521030426, + "learning_rate": 3.0943196381648646e-05, + "loss": 0.0411, + "num_input_tokens_seen": 18465664, + "step": 87490 + }, + { + "epoch": 9.625412541254125, + "grad_norm": 1.4283617734909058, + "learning_rate": 3.094086509215641e-05, + "loss": 0.0724, + "num_input_tokens_seen": 18466752, + "step": 87495 + }, + { + "epoch": 9.625962596259626, + "grad_norm": 0.019589446485042572, + "learning_rate": 3.093853374791006e-05, + "loss": 0.0746, + "num_input_tokens_seen": 18467808, + "step": 87500 + }, + { + "epoch": 9.626512651265127, + "grad_norm": 0.2549223005771637, + "learning_rate": 3.0936202348931105e-05, + "loss": 0.0555, + "num_input_tokens_seen": 18468864, + "step": 87505 + }, + { + "epoch": 9.627062706270626, + "grad_norm": 0.04084727540612221, + "learning_rate": 3.093387089524102e-05, + "loss": 0.017, + "num_input_tokens_seen": 18469856, + "step": 87510 + }, + { + "epoch": 9.627612761276128, + "grad_norm": 0.060018375515937805, + "learning_rate": 3.0931539386861286e-05, + "loss": 0.0105, + "num_input_tokens_seen": 18470976, + "step": 87515 + }, + { + "epoch": 9.628162816281629, + "grad_norm": 0.054960947483778, + "learning_rate": 3.092920782381342e-05, + "loss": 0.1018, + "num_input_tokens_seen": 18472032, + "step": 87520 + }, + { + "epoch": 9.628712871287128, + "grad_norm": 0.03147469460964203, + "learning_rate": 3.092687620611887e-05, + "loss": 0.0367, + "num_input_tokens_seen": 18473024, + "step": 87525 + }, + { + "epoch": 9.629262926292629, + "grad_norm": 0.020574769005179405, + "learning_rate": 3.0924544533799146e-05, + "loss": 0.0337, + "num_input_tokens_seen": 18474048, + "step": 87530 + }, + { + "epoch": 9.62981298129813, + "grad_norm": 0.03796513006091118, + "learning_rate": 3.092221280687575e-05, + "loss": 0.0474, + "num_input_tokens_seen": 18475104, + "step": 87535 + }, + { + "epoch": 9.630363036303631, + "grad_norm": 0.053658440709114075, + "learning_rate": 3.091988102537016e-05, + "loss": 0.0839, + "num_input_tokens_seen": 18476192, + "step": 87540 + }, + { + "epoch": 9.63091309130913, + "grad_norm": 0.6571788787841797, + "learning_rate": 3.0917549189303865e-05, + "loss": 0.0305, + "num_input_tokens_seen": 18477312, + "step": 87545 + }, + { + "epoch": 9.631463146314632, + "grad_norm": 0.07101672887802124, + "learning_rate": 3.0915217298698354e-05, + "loss": 0.0085, + "num_input_tokens_seen": 18478400, + "step": 87550 + }, + { + "epoch": 9.632013201320133, + "grad_norm": 0.03118731640279293, + "learning_rate": 3.0912885353575126e-05, + "loss": 0.0067, + "num_input_tokens_seen": 18479456, + "step": 87555 + }, + { + "epoch": 9.632563256325632, + "grad_norm": 1.6077934503555298, + "learning_rate": 3.091055335395567e-05, + "loss": 0.047, + "num_input_tokens_seen": 18480512, + "step": 87560 + }, + { + "epoch": 9.633113311331133, + "grad_norm": 0.05398445948958397, + "learning_rate": 3.090822129986148e-05, + "loss": 0.0696, + "num_input_tokens_seen": 18481632, + "step": 87565 + }, + { + "epoch": 9.633663366336634, + "grad_norm": 0.015387334860861301, + "learning_rate": 3.090588919131405e-05, + "loss": 0.0072, + "num_input_tokens_seen": 18482688, + "step": 87570 + }, + { + "epoch": 9.634213421342134, + "grad_norm": 0.27411824464797974, + "learning_rate": 3.0903557028334874e-05, + "loss": 0.0578, + "num_input_tokens_seen": 18483744, + "step": 87575 + }, + { + "epoch": 9.634763476347635, + "grad_norm": 0.03097851574420929, + "learning_rate": 3.090122481094545e-05, + "loss": 0.0064, + "num_input_tokens_seen": 18484768, + "step": 87580 + }, + { + "epoch": 9.635313531353136, + "grad_norm": 0.029866628348827362, + "learning_rate": 3.089889253916726e-05, + "loss": 0.0818, + "num_input_tokens_seen": 18485792, + "step": 87585 + }, + { + "epoch": 9.635863586358635, + "grad_norm": 0.019665220752358437, + "learning_rate": 3.0896560213021816e-05, + "loss": 0.0092, + "num_input_tokens_seen": 18486784, + "step": 87590 + }, + { + "epoch": 9.636413641364136, + "grad_norm": 0.8464083075523376, + "learning_rate": 3.08942278325306e-05, + "loss": 0.0496, + "num_input_tokens_seen": 18487840, + "step": 87595 + }, + { + "epoch": 9.636963696369637, + "grad_norm": 0.9161196947097778, + "learning_rate": 3.089189539771511e-05, + "loss": 0.0479, + "num_input_tokens_seen": 18488928, + "step": 87600 + }, + { + "epoch": 9.637513751375138, + "grad_norm": 0.07205689698457718, + "learning_rate": 3.088956290859685e-05, + "loss": 0.0318, + "num_input_tokens_seen": 18489952, + "step": 87605 + }, + { + "epoch": 9.638063806380638, + "grad_norm": 0.03918277472257614, + "learning_rate": 3.088723036519732e-05, + "loss": 0.0157, + "num_input_tokens_seen": 18491008, + "step": 87610 + }, + { + "epoch": 9.638613861386139, + "grad_norm": 0.023617327213287354, + "learning_rate": 3.0884897767538016e-05, + "loss": 0.0344, + "num_input_tokens_seen": 18492160, + "step": 87615 + }, + { + "epoch": 9.63916391639164, + "grad_norm": 0.19702310860157013, + "learning_rate": 3.088256511564043e-05, + "loss": 0.0499, + "num_input_tokens_seen": 18493216, + "step": 87620 + }, + { + "epoch": 9.63971397139714, + "grad_norm": 0.058051470667123795, + "learning_rate": 3.088023240952606e-05, + "loss": 0.026, + "num_input_tokens_seen": 18494208, + "step": 87625 + }, + { + "epoch": 9.64026402640264, + "grad_norm": 0.006703355349600315, + "learning_rate": 3.087789964921641e-05, + "loss": 0.0264, + "num_input_tokens_seen": 18495232, + "step": 87630 + }, + { + "epoch": 9.640814081408141, + "grad_norm": 0.008991760201752186, + "learning_rate": 3.0875566834732985e-05, + "loss": 0.1447, + "num_input_tokens_seen": 18496256, + "step": 87635 + }, + { + "epoch": 9.64136413641364, + "grad_norm": 0.01123223640024662, + "learning_rate": 3.087323396609728e-05, + "loss": 0.0116, + "num_input_tokens_seen": 18497376, + "step": 87640 + }, + { + "epoch": 9.641914191419142, + "grad_norm": 0.8193345069885254, + "learning_rate": 3.087090104333078e-05, + "loss": 0.0548, + "num_input_tokens_seen": 18498432, + "step": 87645 + }, + { + "epoch": 9.642464246424643, + "grad_norm": 1.5827823877334595, + "learning_rate": 3.086856806645502e-05, + "loss": 0.0226, + "num_input_tokens_seen": 18499456, + "step": 87650 + }, + { + "epoch": 9.643014301430142, + "grad_norm": 0.281241238117218, + "learning_rate": 3.0866235035491475e-05, + "loss": 0.0687, + "num_input_tokens_seen": 18500416, + "step": 87655 + }, + { + "epoch": 9.643564356435643, + "grad_norm": 0.11930669844150543, + "learning_rate": 3.086390195046166e-05, + "loss": 0.0168, + "num_input_tokens_seen": 18501504, + "step": 87660 + }, + { + "epoch": 9.644114411441144, + "grad_norm": 0.1660318821668625, + "learning_rate": 3.086156881138708e-05, + "loss": 0.005, + "num_input_tokens_seen": 18502528, + "step": 87665 + }, + { + "epoch": 9.644664466446645, + "grad_norm": 0.04759567975997925, + "learning_rate": 3.085923561828922e-05, + "loss": 0.0051, + "num_input_tokens_seen": 18503520, + "step": 87670 + }, + { + "epoch": 9.645214521452145, + "grad_norm": 0.013484633527696133, + "learning_rate": 3.08569023711896e-05, + "loss": 0.0017, + "num_input_tokens_seen": 18504608, + "step": 87675 + }, + { + "epoch": 9.645764576457646, + "grad_norm": 0.05609915778040886, + "learning_rate": 3.0854569070109726e-05, + "loss": 0.0272, + "num_input_tokens_seen": 18505600, + "step": 87680 + }, + { + "epoch": 9.646314631463147, + "grad_norm": 0.018986579030752182, + "learning_rate": 3.08522357150711e-05, + "loss": 0.0153, + "num_input_tokens_seen": 18506592, + "step": 87685 + }, + { + "epoch": 9.646864686468646, + "grad_norm": 0.016232000663876534, + "learning_rate": 3.0849902306095227e-05, + "loss": 0.0629, + "num_input_tokens_seen": 18507648, + "step": 87690 + }, + { + "epoch": 9.647414741474147, + "grad_norm": 0.4548753499984741, + "learning_rate": 3.0847568843203605e-05, + "loss": 0.0852, + "num_input_tokens_seen": 18508736, + "step": 87695 + }, + { + "epoch": 9.647964796479648, + "grad_norm": 0.09457702934741974, + "learning_rate": 3.084523532641775e-05, + "loss": 0.0051, + "num_input_tokens_seen": 18509760, + "step": 87700 + }, + { + "epoch": 9.648514851485148, + "grad_norm": 0.02524356171488762, + "learning_rate": 3.0842901755759155e-05, + "loss": 0.0042, + "num_input_tokens_seen": 18510848, + "step": 87705 + }, + { + "epoch": 9.649064906490649, + "grad_norm": 0.02201918512582779, + "learning_rate": 3.084056813124936e-05, + "loss": 0.0165, + "num_input_tokens_seen": 18511968, + "step": 87710 + }, + { + "epoch": 9.64961496149615, + "grad_norm": 0.061422571539878845, + "learning_rate": 3.083823445290983e-05, + "loss": 0.0292, + "num_input_tokens_seen": 18513024, + "step": 87715 + }, + { + "epoch": 9.65016501650165, + "grad_norm": 0.0627995952963829, + "learning_rate": 3.083590072076211e-05, + "loss": 0.0036, + "num_input_tokens_seen": 18514080, + "step": 87720 + }, + { + "epoch": 9.65071507150715, + "grad_norm": 1.0872091054916382, + "learning_rate": 3.0833566934827685e-05, + "loss": 0.039, + "num_input_tokens_seen": 18515104, + "step": 87725 + }, + { + "epoch": 9.651265126512651, + "grad_norm": 0.007976262830197811, + "learning_rate": 3.0831233095128076e-05, + "loss": 0.0226, + "num_input_tokens_seen": 18516160, + "step": 87730 + }, + { + "epoch": 9.651815181518153, + "grad_norm": 0.015241175889968872, + "learning_rate": 3.082889920168479e-05, + "loss": 0.004, + "num_input_tokens_seen": 18517184, + "step": 87735 + }, + { + "epoch": 9.652365236523652, + "grad_norm": 0.009678234346210957, + "learning_rate": 3.082656525451934e-05, + "loss": 0.0197, + "num_input_tokens_seen": 18518272, + "step": 87740 + }, + { + "epoch": 9.652915291529153, + "grad_norm": 0.03435397148132324, + "learning_rate": 3.0824231253653225e-05, + "loss": 0.0363, + "num_input_tokens_seen": 18519328, + "step": 87745 + }, + { + "epoch": 9.653465346534654, + "grad_norm": 0.16591380536556244, + "learning_rate": 3.082189719910797e-05, + "loss": 0.0039, + "num_input_tokens_seen": 18520384, + "step": 87750 + }, + { + "epoch": 9.654015401540153, + "grad_norm": 0.04270721599459648, + "learning_rate": 3.0819563090905085e-05, + "loss": 0.043, + "num_input_tokens_seen": 18521376, + "step": 87755 + }, + { + "epoch": 9.654565456545654, + "grad_norm": 0.05132758617401123, + "learning_rate": 3.081722892906607e-05, + "loss": 0.009, + "num_input_tokens_seen": 18522368, + "step": 87760 + }, + { + "epoch": 9.655115511551156, + "grad_norm": 0.2737801671028137, + "learning_rate": 3.081489471361246e-05, + "loss": 0.029, + "num_input_tokens_seen": 18523456, + "step": 87765 + }, + { + "epoch": 9.655665566556655, + "grad_norm": 0.04603572189807892, + "learning_rate": 3.0812560444565745e-05, + "loss": 0.094, + "num_input_tokens_seen": 18524512, + "step": 87770 + }, + { + "epoch": 9.656215621562156, + "grad_norm": 0.032453522086143494, + "learning_rate": 3.081022612194745e-05, + "loss": 0.0402, + "num_input_tokens_seen": 18525568, + "step": 87775 + }, + { + "epoch": 9.656765676567657, + "grad_norm": 0.012779979035258293, + "learning_rate": 3.08078917457791e-05, + "loss": 0.011, + "num_input_tokens_seen": 18526592, + "step": 87780 + }, + { + "epoch": 9.657315731573158, + "grad_norm": 0.0531441867351532, + "learning_rate": 3.080555731608219e-05, + "loss": 0.0389, + "num_input_tokens_seen": 18527648, + "step": 87785 + }, + { + "epoch": 9.657865786578657, + "grad_norm": 1.5105198621749878, + "learning_rate": 3.0803222832878244e-05, + "loss": 0.0223, + "num_input_tokens_seen": 18528704, + "step": 87790 + }, + { + "epoch": 9.658415841584159, + "grad_norm": 0.02329801395535469, + "learning_rate": 3.080088829618878e-05, + "loss": 0.0265, + "num_input_tokens_seen": 18529760, + "step": 87795 + }, + { + "epoch": 9.65896589658966, + "grad_norm": 0.13369493186473846, + "learning_rate": 3.079855370603531e-05, + "loss": 0.009, + "num_input_tokens_seen": 18530720, + "step": 87800 + }, + { + "epoch": 9.659515951595159, + "grad_norm": 0.5725614428520203, + "learning_rate": 3.079621906243936e-05, + "loss": 0.0135, + "num_input_tokens_seen": 18531744, + "step": 87805 + }, + { + "epoch": 9.66006600660066, + "grad_norm": 2.1775145530700684, + "learning_rate": 3.0793884365422434e-05, + "loss": 0.0287, + "num_input_tokens_seen": 18532800, + "step": 87810 + }, + { + "epoch": 9.660616061606161, + "grad_norm": 0.053290557116270065, + "learning_rate": 3.079154961500605e-05, + "loss": 0.0338, + "num_input_tokens_seen": 18533856, + "step": 87815 + }, + { + "epoch": 9.66116611661166, + "grad_norm": 0.01876085437834263, + "learning_rate": 3.078921481121174e-05, + "loss": 0.166, + "num_input_tokens_seen": 18534944, + "step": 87820 + }, + { + "epoch": 9.661716171617162, + "grad_norm": 0.11377575993537903, + "learning_rate": 3.078687995406102e-05, + "loss": 0.036, + "num_input_tokens_seen": 18536032, + "step": 87825 + }, + { + "epoch": 9.662266226622663, + "grad_norm": 1.59989333152771, + "learning_rate": 3.07845450435754e-05, + "loss": 0.0291, + "num_input_tokens_seen": 18537056, + "step": 87830 + }, + { + "epoch": 9.662816281628164, + "grad_norm": 0.9693703651428223, + "learning_rate": 3.0782210079776394e-05, + "loss": 0.1028, + "num_input_tokens_seen": 18538080, + "step": 87835 + }, + { + "epoch": 9.663366336633663, + "grad_norm": 0.020942851901054382, + "learning_rate": 3.0779875062685545e-05, + "loss": 0.0876, + "num_input_tokens_seen": 18539104, + "step": 87840 + }, + { + "epoch": 9.663916391639164, + "grad_norm": 0.04102719947695732, + "learning_rate": 3.0777539992324354e-05, + "loss": 0.005, + "num_input_tokens_seen": 18540128, + "step": 87845 + }, + { + "epoch": 9.664466446644665, + "grad_norm": 0.21626177430152893, + "learning_rate": 3.0775204868714355e-05, + "loss": 0.0115, + "num_input_tokens_seen": 18541184, + "step": 87850 + }, + { + "epoch": 9.665016501650165, + "grad_norm": 0.07005270570516586, + "learning_rate": 3.077286969187706e-05, + "loss": 0.0406, + "num_input_tokens_seen": 18542272, + "step": 87855 + }, + { + "epoch": 9.665566556655666, + "grad_norm": 0.07059846818447113, + "learning_rate": 3.0770534461833994e-05, + "loss": 0.0115, + "num_input_tokens_seen": 18543296, + "step": 87860 + }, + { + "epoch": 9.666116611661167, + "grad_norm": 0.023475736379623413, + "learning_rate": 3.076819917860668e-05, + "loss": 0.036, + "num_input_tokens_seen": 18544384, + "step": 87865 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 3.5483932495117188, + "learning_rate": 3.0765863842216645e-05, + "loss": 0.0409, + "num_input_tokens_seen": 18545408, + "step": 87870 + }, + { + "epoch": 9.667216721672167, + "grad_norm": 0.016333863139152527, + "learning_rate": 3.076352845268541e-05, + "loss": 0.0048, + "num_input_tokens_seen": 18546432, + "step": 87875 + }, + { + "epoch": 9.667766776677668, + "grad_norm": 0.013548551127314568, + "learning_rate": 3.07611930100345e-05, + "loss": 0.0376, + "num_input_tokens_seen": 18547456, + "step": 87880 + }, + { + "epoch": 9.668316831683168, + "grad_norm": 0.054684873670339584, + "learning_rate": 3.0758857514285436e-05, + "loss": 0.0236, + "num_input_tokens_seen": 18548640, + "step": 87885 + }, + { + "epoch": 9.668866886688669, + "grad_norm": 0.04136604070663452, + "learning_rate": 3.075652196545975e-05, + "loss": 0.0341, + "num_input_tokens_seen": 18549632, + "step": 87890 + }, + { + "epoch": 9.66941694169417, + "grad_norm": 0.5730682611465454, + "learning_rate": 3.075418636357895e-05, + "loss": 0.021, + "num_input_tokens_seen": 18550720, + "step": 87895 + }, + { + "epoch": 9.66996699669967, + "grad_norm": 0.04214133322238922, + "learning_rate": 3.0751850708664595e-05, + "loss": 0.0362, + "num_input_tokens_seen": 18551808, + "step": 87900 + }, + { + "epoch": 9.67051705170517, + "grad_norm": 0.023886438459157944, + "learning_rate": 3.0749515000738174e-05, + "loss": 0.0191, + "num_input_tokens_seen": 18552928, + "step": 87905 + }, + { + "epoch": 9.671067106710671, + "grad_norm": 0.007862373255193233, + "learning_rate": 3.074717923982124e-05, + "loss": 0.0434, + "num_input_tokens_seen": 18554016, + "step": 87910 + }, + { + "epoch": 9.671617161716172, + "grad_norm": 1.3682878017425537, + "learning_rate": 3.074484342593532e-05, + "loss": 0.1003, + "num_input_tokens_seen": 18555072, + "step": 87915 + }, + { + "epoch": 9.672167216721672, + "grad_norm": 1.1769195795059204, + "learning_rate": 3.0742507559101923e-05, + "loss": 0.01, + "num_input_tokens_seen": 18556128, + "step": 87920 + }, + { + "epoch": 9.672717271727173, + "grad_norm": 0.005520292557775974, + "learning_rate": 3.07401716393426e-05, + "loss": 0.0029, + "num_input_tokens_seen": 18557184, + "step": 87925 + }, + { + "epoch": 9.673267326732674, + "grad_norm": 0.019992481917142868, + "learning_rate": 3.073783566667886e-05, + "loss": 0.0021, + "num_input_tokens_seen": 18558240, + "step": 87930 + }, + { + "epoch": 9.673817381738173, + "grad_norm": 0.12021160125732422, + "learning_rate": 3.073549964113224e-05, + "loss": 0.0066, + "num_input_tokens_seen": 18559232, + "step": 87935 + }, + { + "epoch": 9.674367436743674, + "grad_norm": 0.9905127286911011, + "learning_rate": 3.073316356272428e-05, + "loss": 0.0384, + "num_input_tokens_seen": 18560288, + "step": 87940 + }, + { + "epoch": 9.674917491749175, + "grad_norm": 0.25027942657470703, + "learning_rate": 3.0730827431476504e-05, + "loss": 0.01, + "num_input_tokens_seen": 18561408, + "step": 87945 + }, + { + "epoch": 9.675467546754675, + "grad_norm": 0.008064618334174156, + "learning_rate": 3.0728491247410434e-05, + "loss": 0.0373, + "num_input_tokens_seen": 18562496, + "step": 87950 + }, + { + "epoch": 9.676017601760176, + "grad_norm": 0.06556591391563416, + "learning_rate": 3.0726155010547614e-05, + "loss": 0.0041, + "num_input_tokens_seen": 18563488, + "step": 87955 + }, + { + "epoch": 9.676567656765677, + "grad_norm": 0.007574700750410557, + "learning_rate": 3.072381872090957e-05, + "loss": 0.053, + "num_input_tokens_seen": 18564544, + "step": 87960 + }, + { + "epoch": 9.677117711771178, + "grad_norm": 0.7310102581977844, + "learning_rate": 3.072148237851783e-05, + "loss": 0.0292, + "num_input_tokens_seen": 18565600, + "step": 87965 + }, + { + "epoch": 9.677667766776677, + "grad_norm": 0.03504157066345215, + "learning_rate": 3.071914598339394e-05, + "loss": 0.09, + "num_input_tokens_seen": 18566656, + "step": 87970 + }, + { + "epoch": 9.678217821782178, + "grad_norm": 0.5193737149238586, + "learning_rate": 3.0716809535559425e-05, + "loss": 0.0737, + "num_input_tokens_seen": 18567744, + "step": 87975 + }, + { + "epoch": 9.67876787678768, + "grad_norm": 0.10413894802331924, + "learning_rate": 3.0714473035035815e-05, + "loss": 0.0411, + "num_input_tokens_seen": 18568768, + "step": 87980 + }, + { + "epoch": 9.679317931793179, + "grad_norm": 0.015266955830156803, + "learning_rate": 3.071213648184465e-05, + "loss": 0.0148, + "num_input_tokens_seen": 18569824, + "step": 87985 + }, + { + "epoch": 9.67986798679868, + "grad_norm": 2.161167860031128, + "learning_rate": 3.0709799876007463e-05, + "loss": 0.0872, + "num_input_tokens_seen": 18570944, + "step": 87990 + }, + { + "epoch": 9.680418041804181, + "grad_norm": 0.04851224645972252, + "learning_rate": 3.07074632175458e-05, + "loss": 0.0434, + "num_input_tokens_seen": 18571968, + "step": 87995 + }, + { + "epoch": 9.68096809680968, + "grad_norm": 0.644694447517395, + "learning_rate": 3.070512650648118e-05, + "loss": 0.0101, + "num_input_tokens_seen": 18573088, + "step": 88000 + }, + { + "epoch": 9.681518151815181, + "grad_norm": 0.012261958792805672, + "learning_rate": 3.0702789742835145e-05, + "loss": 0.0078, + "num_input_tokens_seen": 18574208, + "step": 88005 + }, + { + "epoch": 9.682068206820682, + "grad_norm": 0.00619908794760704, + "learning_rate": 3.0700452926629234e-05, + "loss": 0.0036, + "num_input_tokens_seen": 18575296, + "step": 88010 + }, + { + "epoch": 9.682618261826182, + "grad_norm": 0.32089564204216003, + "learning_rate": 3.069811605788499e-05, + "loss": 0.0067, + "num_input_tokens_seen": 18576352, + "step": 88015 + }, + { + "epoch": 9.683168316831683, + "grad_norm": 0.010046195238828659, + "learning_rate": 3.0695779136623935e-05, + "loss": 0.0722, + "num_input_tokens_seen": 18577440, + "step": 88020 + }, + { + "epoch": 9.683718371837184, + "grad_norm": 1.8350197076797485, + "learning_rate": 3.069344216286762e-05, + "loss": 0.0297, + "num_input_tokens_seen": 18578528, + "step": 88025 + }, + { + "epoch": 9.684268426842685, + "grad_norm": 0.008746235631406307, + "learning_rate": 3.0691105136637584e-05, + "loss": 0.0021, + "num_input_tokens_seen": 18579616, + "step": 88030 + }, + { + "epoch": 9.684818481848184, + "grad_norm": 0.5175702571868896, + "learning_rate": 3.068876805795536e-05, + "loss": 0.19, + "num_input_tokens_seen": 18580640, + "step": 88035 + }, + { + "epoch": 9.685368536853685, + "grad_norm": 0.016224274411797523, + "learning_rate": 3.068643092684249e-05, + "loss": 0.055, + "num_input_tokens_seen": 18581696, + "step": 88040 + }, + { + "epoch": 9.685918591859187, + "grad_norm": 0.19079160690307617, + "learning_rate": 3.068409374332052e-05, + "loss": 0.0087, + "num_input_tokens_seen": 18582752, + "step": 88045 + }, + { + "epoch": 9.686468646864686, + "grad_norm": 0.04754923656582832, + "learning_rate": 3.0681756507410975e-05, + "loss": 0.0331, + "num_input_tokens_seen": 18583776, + "step": 88050 + }, + { + "epoch": 9.687018701870187, + "grad_norm": 0.03273848816752434, + "learning_rate": 3.0679419219135414e-05, + "loss": 0.1268, + "num_input_tokens_seen": 18584800, + "step": 88055 + }, + { + "epoch": 9.687568756875688, + "grad_norm": 0.01569535583257675, + "learning_rate": 3.067708187851537e-05, + "loss": 0.0646, + "num_input_tokens_seen": 18585888, + "step": 88060 + }, + { + "epoch": 9.688118811881187, + "grad_norm": 0.013198619708418846, + "learning_rate": 3.067474448557239e-05, + "loss": 0.0655, + "num_input_tokens_seen": 18586912, + "step": 88065 + }, + { + "epoch": 9.688668866886688, + "grad_norm": 1.8589199781417847, + "learning_rate": 3.067240704032801e-05, + "loss": 0.1182, + "num_input_tokens_seen": 18587968, + "step": 88070 + }, + { + "epoch": 9.68921892189219, + "grad_norm": 0.01755991205573082, + "learning_rate": 3.0670069542803776e-05, + "loss": 0.0205, + "num_input_tokens_seen": 18588928, + "step": 88075 + }, + { + "epoch": 9.689768976897689, + "grad_norm": 2.3270103931427, + "learning_rate": 3.0667731993021226e-05, + "loss": 0.0574, + "num_input_tokens_seen": 18589952, + "step": 88080 + }, + { + "epoch": 9.69031903190319, + "grad_norm": 1.700919270515442, + "learning_rate": 3.066539439100192e-05, + "loss": 0.0811, + "num_input_tokens_seen": 18590976, + "step": 88085 + }, + { + "epoch": 9.690869086908691, + "grad_norm": 0.19462570548057556, + "learning_rate": 3.066305673676739e-05, + "loss": 0.0042, + "num_input_tokens_seen": 18592096, + "step": 88090 + }, + { + "epoch": 9.691419141914192, + "grad_norm": 0.04670512303709984, + "learning_rate": 3.066071903033918e-05, + "loss": 0.097, + "num_input_tokens_seen": 18593216, + "step": 88095 + }, + { + "epoch": 9.691969196919691, + "grad_norm": 0.27427005767822266, + "learning_rate": 3.065838127173884e-05, + "loss": 0.0627, + "num_input_tokens_seen": 18594272, + "step": 88100 + }, + { + "epoch": 9.692519251925193, + "grad_norm": 0.08723163604736328, + "learning_rate": 3.0656043460987914e-05, + "loss": 0.0099, + "num_input_tokens_seen": 18595360, + "step": 88105 + }, + { + "epoch": 9.693069306930694, + "grad_norm": 0.024882718920707703, + "learning_rate": 3.065370559810795e-05, + "loss": 0.0066, + "num_input_tokens_seen": 18596384, + "step": 88110 + }, + { + "epoch": 9.693619361936193, + "grad_norm": 0.010191936045885086, + "learning_rate": 3.0651367683120504e-05, + "loss": 0.0172, + "num_input_tokens_seen": 18597440, + "step": 88115 + }, + { + "epoch": 9.694169416941694, + "grad_norm": 1.02420973777771, + "learning_rate": 3.06490297160471e-05, + "loss": 0.1024, + "num_input_tokens_seen": 18598592, + "step": 88120 + }, + { + "epoch": 9.694719471947195, + "grad_norm": 0.009739379398524761, + "learning_rate": 3.06466916969093e-05, + "loss": 0.0153, + "num_input_tokens_seen": 18599648, + "step": 88125 + }, + { + "epoch": 9.695269526952695, + "grad_norm": 0.33854174613952637, + "learning_rate": 3.064435362572866e-05, + "loss": 0.0483, + "num_input_tokens_seen": 18600672, + "step": 88130 + }, + { + "epoch": 9.695819581958196, + "grad_norm": 1.4345542192459106, + "learning_rate": 3.064201550252672e-05, + "loss": 0.0556, + "num_input_tokens_seen": 18601760, + "step": 88135 + }, + { + "epoch": 9.696369636963697, + "grad_norm": 0.12378238141536713, + "learning_rate": 3.063967732732503e-05, + "loss": 0.0467, + "num_input_tokens_seen": 18602880, + "step": 88140 + }, + { + "epoch": 9.696919691969196, + "grad_norm": 0.0371149405837059, + "learning_rate": 3.063733910014514e-05, + "loss": 0.0094, + "num_input_tokens_seen": 18603904, + "step": 88145 + }, + { + "epoch": 9.697469746974697, + "grad_norm": 0.020789477974176407, + "learning_rate": 3.06350008210086e-05, + "loss": 0.0021, + "num_input_tokens_seen": 18604960, + "step": 88150 + }, + { + "epoch": 9.698019801980198, + "grad_norm": 0.013779174536466599, + "learning_rate": 3.0632662489936954e-05, + "loss": 0.0824, + "num_input_tokens_seen": 18605984, + "step": 88155 + }, + { + "epoch": 9.6985698569857, + "grad_norm": 0.4097277522087097, + "learning_rate": 3.063032410695178e-05, + "loss": 0.0117, + "num_input_tokens_seen": 18607104, + "step": 88160 + }, + { + "epoch": 9.699119911991199, + "grad_norm": 0.0746045857667923, + "learning_rate": 3.06279856720746e-05, + "loss": 0.0072, + "num_input_tokens_seen": 18608192, + "step": 88165 + }, + { + "epoch": 9.6996699669967, + "grad_norm": 0.06156622990965843, + "learning_rate": 3.062564718532696e-05, + "loss": 0.0034, + "num_input_tokens_seen": 18609248, + "step": 88170 + }, + { + "epoch": 9.7002200220022, + "grad_norm": 0.14452031254768372, + "learning_rate": 3.062330864673045e-05, + "loss": 0.0065, + "num_input_tokens_seen": 18610208, + "step": 88175 + }, + { + "epoch": 9.7007700770077, + "grad_norm": 0.305618554353714, + "learning_rate": 3.06209700563066e-05, + "loss": 0.0681, + "num_input_tokens_seen": 18611264, + "step": 88180 + }, + { + "epoch": 9.701320132013201, + "grad_norm": 0.03985060006380081, + "learning_rate": 3.061863141407696e-05, + "loss": 0.0607, + "num_input_tokens_seen": 18612320, + "step": 88185 + }, + { + "epoch": 9.701870187018702, + "grad_norm": 0.3759288489818573, + "learning_rate": 3.0616292720063086e-05, + "loss": 0.0069, + "num_input_tokens_seen": 18613344, + "step": 88190 + }, + { + "epoch": 9.702420242024202, + "grad_norm": 0.07711007446050644, + "learning_rate": 3.061395397428655e-05, + "loss": 0.0054, + "num_input_tokens_seen": 18614432, + "step": 88195 + }, + { + "epoch": 9.702970297029703, + "grad_norm": 0.19430330395698547, + "learning_rate": 3.061161517676888e-05, + "loss": 0.0097, + "num_input_tokens_seen": 18615424, + "step": 88200 + }, + { + "epoch": 9.703520352035204, + "grad_norm": 0.5363724231719971, + "learning_rate": 3.060927632753166e-05, + "loss": 0.1154, + "num_input_tokens_seen": 18616480, + "step": 88205 + }, + { + "epoch": 9.704070407040705, + "grad_norm": 0.04921482130885124, + "learning_rate": 3.060693742659642e-05, + "loss": 0.0061, + "num_input_tokens_seen": 18617568, + "step": 88210 + }, + { + "epoch": 9.704620462046204, + "grad_norm": 0.011876242235302925, + "learning_rate": 3.060459847398473e-05, + "loss": 0.0335, + "num_input_tokens_seen": 18618592, + "step": 88215 + }, + { + "epoch": 9.705170517051705, + "grad_norm": 0.1738126575946808, + "learning_rate": 3.060225946971816e-05, + "loss": 0.0101, + "num_input_tokens_seen": 18619680, + "step": 88220 + }, + { + "epoch": 9.705720572057206, + "grad_norm": 0.008641520515084267, + "learning_rate": 3.059992041381824e-05, + "loss": 0.0045, + "num_input_tokens_seen": 18620736, + "step": 88225 + }, + { + "epoch": 9.706270627062706, + "grad_norm": 0.05348995700478554, + "learning_rate": 3.059758130630654e-05, + "loss": 0.055, + "num_input_tokens_seen": 18621760, + "step": 88230 + }, + { + "epoch": 9.706820682068207, + "grad_norm": 2.0189948081970215, + "learning_rate": 3.0595242147204626e-05, + "loss": 0.0484, + "num_input_tokens_seen": 18622848, + "step": 88235 + }, + { + "epoch": 9.707370737073708, + "grad_norm": 1.0609631538391113, + "learning_rate": 3.059290293653404e-05, + "loss": 0.0703, + "num_input_tokens_seen": 18623872, + "step": 88240 + }, + { + "epoch": 9.707920792079207, + "grad_norm": 0.04995926842093468, + "learning_rate": 3.059056367431635e-05, + "loss": 0.0301, + "num_input_tokens_seen": 18624928, + "step": 88245 + }, + { + "epoch": 9.708470847084708, + "grad_norm": 0.01939322240650654, + "learning_rate": 3.058822436057313e-05, + "loss": 0.0218, + "num_input_tokens_seen": 18625984, + "step": 88250 + }, + { + "epoch": 9.70902090209021, + "grad_norm": 0.027130432426929474, + "learning_rate": 3.058588499532592e-05, + "loss": 0.0029, + "num_input_tokens_seen": 18627040, + "step": 88255 + }, + { + "epoch": 9.70957095709571, + "grad_norm": 0.015199576504528522, + "learning_rate": 3.058354557859629e-05, + "loss": 0.0186, + "num_input_tokens_seen": 18628096, + "step": 88260 + }, + { + "epoch": 9.71012101210121, + "grad_norm": 0.01669195108115673, + "learning_rate": 3.05812061104058e-05, + "loss": 0.0022, + "num_input_tokens_seen": 18629152, + "step": 88265 + }, + { + "epoch": 9.710671067106711, + "grad_norm": 0.07879505306482315, + "learning_rate": 3.0578866590776004e-05, + "loss": 0.053, + "num_input_tokens_seen": 18630176, + "step": 88270 + }, + { + "epoch": 9.711221122112212, + "grad_norm": 0.020825911313295364, + "learning_rate": 3.057652701972848e-05, + "loss": 0.0312, + "num_input_tokens_seen": 18631296, + "step": 88275 + }, + { + "epoch": 9.711771177117711, + "grad_norm": 0.06376838684082031, + "learning_rate": 3.0574187397284785e-05, + "loss": 0.0386, + "num_input_tokens_seen": 18632352, + "step": 88280 + }, + { + "epoch": 9.712321232123212, + "grad_norm": 0.661945641040802, + "learning_rate": 3.057184772346647e-05, + "loss": 0.0268, + "num_input_tokens_seen": 18633440, + "step": 88285 + }, + { + "epoch": 9.712871287128714, + "grad_norm": 0.05546284839510918, + "learning_rate": 3.056950799829512e-05, + "loss": 0.1508, + "num_input_tokens_seen": 18634464, + "step": 88290 + }, + { + "epoch": 9.713421342134213, + "grad_norm": 0.025690833106637, + "learning_rate": 3.056716822179228e-05, + "loss": 0.0212, + "num_input_tokens_seen": 18635424, + "step": 88295 + }, + { + "epoch": 9.713971397139714, + "grad_norm": 0.45954611897468567, + "learning_rate": 3.056482839397952e-05, + "loss": 0.0369, + "num_input_tokens_seen": 18636512, + "step": 88300 + }, + { + "epoch": 9.714521452145215, + "grad_norm": 3.754598617553711, + "learning_rate": 3.0562488514878414e-05, + "loss": 0.03, + "num_input_tokens_seen": 18637632, + "step": 88305 + }, + { + "epoch": 9.715071507150714, + "grad_norm": 0.022046072408556938, + "learning_rate": 3.056014858451052e-05, + "loss": 0.0423, + "num_input_tokens_seen": 18638752, + "step": 88310 + }, + { + "epoch": 9.715621562156215, + "grad_norm": 2.210888624191284, + "learning_rate": 3.0557808602897394e-05, + "loss": 0.0257, + "num_input_tokens_seen": 18639776, + "step": 88315 + }, + { + "epoch": 9.716171617161717, + "grad_norm": 0.029063858091831207, + "learning_rate": 3.055546857006062e-05, + "loss": 0.0234, + "num_input_tokens_seen": 18640800, + "step": 88320 + }, + { + "epoch": 9.716721672167218, + "grad_norm": 0.22276103496551514, + "learning_rate": 3.055312848602175e-05, + "loss": 0.0156, + "num_input_tokens_seen": 18641888, + "step": 88325 + }, + { + "epoch": 9.717271727172717, + "grad_norm": 0.00829565804451704, + "learning_rate": 3.0550788350802374e-05, + "loss": 0.1405, + "num_input_tokens_seen": 18642976, + "step": 88330 + }, + { + "epoch": 9.717821782178218, + "grad_norm": 0.6146848797798157, + "learning_rate": 3.0548448164424035e-05, + "loss": 0.0361, + "num_input_tokens_seen": 18643968, + "step": 88335 + }, + { + "epoch": 9.718371837183719, + "grad_norm": 0.3467550575733185, + "learning_rate": 3.054610792690832e-05, + "loss": 0.0119, + "num_input_tokens_seen": 18645024, + "step": 88340 + }, + { + "epoch": 9.718921892189218, + "grad_norm": 0.09548171609640121, + "learning_rate": 3.054376763827678e-05, + "loss": 0.0104, + "num_input_tokens_seen": 18646048, + "step": 88345 + }, + { + "epoch": 9.71947194719472, + "grad_norm": 0.006953321397304535, + "learning_rate": 3.0541427298551e-05, + "loss": 0.0028, + "num_input_tokens_seen": 18647136, + "step": 88350 + }, + { + "epoch": 9.72002200220022, + "grad_norm": 0.03643646463751793, + "learning_rate": 3.053908690775254e-05, + "loss": 0.0106, + "num_input_tokens_seen": 18648192, + "step": 88355 + }, + { + "epoch": 9.72057205720572, + "grad_norm": 0.026037879288196564, + "learning_rate": 3.053674646590298e-05, + "loss": 0.0057, + "num_input_tokens_seen": 18649248, + "step": 88360 + }, + { + "epoch": 9.721122112211221, + "grad_norm": 0.0065471101552248, + "learning_rate": 3.053440597302388e-05, + "loss": 0.0245, + "num_input_tokens_seen": 18650336, + "step": 88365 + }, + { + "epoch": 9.721672167216722, + "grad_norm": 0.548627495765686, + "learning_rate": 3.053206542913681e-05, + "loss": 0.0067, + "num_input_tokens_seen": 18651392, + "step": 88370 + }, + { + "epoch": 9.722222222222221, + "grad_norm": 0.2921315133571625, + "learning_rate": 3.0529724834263355e-05, + "loss": 0.0186, + "num_input_tokens_seen": 18652448, + "step": 88375 + }, + { + "epoch": 9.722772277227723, + "grad_norm": 1.8223013877868652, + "learning_rate": 3.0527384188425086e-05, + "loss": 0.1272, + "num_input_tokens_seen": 18653600, + "step": 88380 + }, + { + "epoch": 9.723322332233224, + "grad_norm": 0.18510492146015167, + "learning_rate": 3.052504349164355e-05, + "loss": 0.0074, + "num_input_tokens_seen": 18654720, + "step": 88385 + }, + { + "epoch": 9.723872387238725, + "grad_norm": 0.08229570090770721, + "learning_rate": 3.0522702743940354e-05, + "loss": 0.1011, + "num_input_tokens_seen": 18655744, + "step": 88390 + }, + { + "epoch": 9.724422442244224, + "grad_norm": 0.09816920757293701, + "learning_rate": 3.0520361945337064e-05, + "loss": 0.0452, + "num_input_tokens_seen": 18656864, + "step": 88395 + }, + { + "epoch": 9.724972497249725, + "grad_norm": 1.5111101865768433, + "learning_rate": 3.051802109585523e-05, + "loss": 0.0707, + "num_input_tokens_seen": 18657920, + "step": 88400 + }, + { + "epoch": 9.725522552255226, + "grad_norm": 0.0275344867259264, + "learning_rate": 3.051568019551645e-05, + "loss": 0.005, + "num_input_tokens_seen": 18658944, + "step": 88405 + }, + { + "epoch": 9.726072607260726, + "grad_norm": 0.2186458557844162, + "learning_rate": 3.051333924434229e-05, + "loss": 0.0507, + "num_input_tokens_seen": 18660000, + "step": 88410 + }, + { + "epoch": 9.726622662266227, + "grad_norm": 0.0263666994869709, + "learning_rate": 3.0510998242354328e-05, + "loss": 0.047, + "num_input_tokens_seen": 18661120, + "step": 88415 + }, + { + "epoch": 9.727172717271728, + "grad_norm": 1.091704249382019, + "learning_rate": 3.0508657189574146e-05, + "loss": 0.1129, + "num_input_tokens_seen": 18662144, + "step": 88420 + }, + { + "epoch": 9.727722772277227, + "grad_norm": 0.16736461222171783, + "learning_rate": 3.050631608602331e-05, + "loss": 0.0184, + "num_input_tokens_seen": 18663168, + "step": 88425 + }, + { + "epoch": 9.728272827282728, + "grad_norm": 1.9991923570632935, + "learning_rate": 3.0503974931723394e-05, + "loss": 0.2007, + "num_input_tokens_seen": 18664160, + "step": 88430 + }, + { + "epoch": 9.72882288228823, + "grad_norm": 0.5942431092262268, + "learning_rate": 3.050163372669599e-05, + "loss": 0.0255, + "num_input_tokens_seen": 18665184, + "step": 88435 + }, + { + "epoch": 9.729372937293729, + "grad_norm": 0.010647319257259369, + "learning_rate": 3.049929247096267e-05, + "loss": 0.1112, + "num_input_tokens_seen": 18666240, + "step": 88440 + }, + { + "epoch": 9.72992299229923, + "grad_norm": 0.019590245559811592, + "learning_rate": 3.0496951164545005e-05, + "loss": 0.0447, + "num_input_tokens_seen": 18667296, + "step": 88445 + }, + { + "epoch": 9.73047304730473, + "grad_norm": 0.14283280074596405, + "learning_rate": 3.049460980746458e-05, + "loss": 0.0061, + "num_input_tokens_seen": 18668384, + "step": 88450 + }, + { + "epoch": 9.731023102310232, + "grad_norm": 0.04432215541601181, + "learning_rate": 3.0492268399742975e-05, + "loss": 0.0275, + "num_input_tokens_seen": 18669504, + "step": 88455 + }, + { + "epoch": 9.731573157315731, + "grad_norm": 0.88181072473526, + "learning_rate": 3.0489926941401757e-05, + "loss": 0.0248, + "num_input_tokens_seen": 18670560, + "step": 88460 + }, + { + "epoch": 9.732123212321232, + "grad_norm": 0.048027876764535904, + "learning_rate": 3.048758543246253e-05, + "loss": 0.002, + "num_input_tokens_seen": 18671616, + "step": 88465 + }, + { + "epoch": 9.732673267326733, + "grad_norm": 0.08320534229278564, + "learning_rate": 3.048524387294686e-05, + "loss": 0.0723, + "num_input_tokens_seen": 18672640, + "step": 88470 + }, + { + "epoch": 9.733223322332233, + "grad_norm": 0.1779009848833084, + "learning_rate": 3.0482902262876324e-05, + "loss": 0.0083, + "num_input_tokens_seen": 18673728, + "step": 88475 + }, + { + "epoch": 9.733773377337734, + "grad_norm": 0.15735158324241638, + "learning_rate": 3.0480560602272513e-05, + "loss": 0.0804, + "num_input_tokens_seen": 18674752, + "step": 88480 + }, + { + "epoch": 9.734323432343235, + "grad_norm": 0.4496743679046631, + "learning_rate": 3.0478218891157003e-05, + "loss": 0.0093, + "num_input_tokens_seen": 18675776, + "step": 88485 + }, + { + "epoch": 9.734873487348734, + "grad_norm": 0.16817641258239746, + "learning_rate": 3.0475877129551376e-05, + "loss": 0.0074, + "num_input_tokens_seen": 18676800, + "step": 88490 + }, + { + "epoch": 9.735423542354235, + "grad_norm": 0.05066850408911705, + "learning_rate": 3.047353531747722e-05, + "loss": 0.0134, + "num_input_tokens_seen": 18677920, + "step": 88495 + }, + { + "epoch": 9.735973597359736, + "grad_norm": 0.029798928648233414, + "learning_rate": 3.047119345495611e-05, + "loss": 0.0076, + "num_input_tokens_seen": 18679040, + "step": 88500 + }, + { + "epoch": 9.736523652365236, + "grad_norm": 1.9514975547790527, + "learning_rate": 3.0468851542009642e-05, + "loss": 0.0521, + "num_input_tokens_seen": 18680064, + "step": 88505 + }, + { + "epoch": 9.737073707370737, + "grad_norm": 0.02885627932846546, + "learning_rate": 3.0466509578659393e-05, + "loss": 0.0685, + "num_input_tokens_seen": 18681120, + "step": 88510 + }, + { + "epoch": 9.737623762376238, + "grad_norm": 0.2513777017593384, + "learning_rate": 3.046416756492695e-05, + "loss": 0.0042, + "num_input_tokens_seen": 18682208, + "step": 88515 + }, + { + "epoch": 9.738173817381739, + "grad_norm": 0.42363837361335754, + "learning_rate": 3.046182550083389e-05, + "loss": 0.0219, + "num_input_tokens_seen": 18683232, + "step": 88520 + }, + { + "epoch": 9.738723872387238, + "grad_norm": 0.15037091076374054, + "learning_rate": 3.045948338640181e-05, + "loss": 0.0058, + "num_input_tokens_seen": 18684224, + "step": 88525 + }, + { + "epoch": 9.73927392739274, + "grad_norm": 0.007027994375675917, + "learning_rate": 3.0457141221652285e-05, + "loss": 0.0619, + "num_input_tokens_seen": 18685248, + "step": 88530 + }, + { + "epoch": 9.73982398239824, + "grad_norm": 0.9003512859344482, + "learning_rate": 3.045479900660691e-05, + "loss": 0.0411, + "num_input_tokens_seen": 18686240, + "step": 88535 + }, + { + "epoch": 9.74037403740374, + "grad_norm": 0.07524389773607254, + "learning_rate": 3.0452456741287284e-05, + "loss": 0.0152, + "num_input_tokens_seen": 18687232, + "step": 88540 + }, + { + "epoch": 9.74092409240924, + "grad_norm": 0.14350152015686035, + "learning_rate": 3.045011442571496e-05, + "loss": 0.0029, + "num_input_tokens_seen": 18688288, + "step": 88545 + }, + { + "epoch": 9.741474147414742, + "grad_norm": 0.07799957692623138, + "learning_rate": 3.0447772059911555e-05, + "loss": 0.0632, + "num_input_tokens_seen": 18689408, + "step": 88550 + }, + { + "epoch": 9.742024202420241, + "grad_norm": 0.15022335946559906, + "learning_rate": 3.0445429643898655e-05, + "loss": 0.116, + "num_input_tokens_seen": 18690400, + "step": 88555 + }, + { + "epoch": 9.742574257425742, + "grad_norm": 0.07745618373155594, + "learning_rate": 3.0443087177697834e-05, + "loss": 0.0035, + "num_input_tokens_seen": 18691488, + "step": 88560 + }, + { + "epoch": 9.743124312431243, + "grad_norm": 0.06887546181678772, + "learning_rate": 3.0440744661330696e-05, + "loss": 0.0648, + "num_input_tokens_seen": 18692576, + "step": 88565 + }, + { + "epoch": 9.743674367436743, + "grad_norm": 0.2280641496181488, + "learning_rate": 3.0438402094818825e-05, + "loss": 0.0047, + "num_input_tokens_seen": 18693664, + "step": 88570 + }, + { + "epoch": 9.744224422442244, + "grad_norm": 1.3599555492401123, + "learning_rate": 3.0436059478183803e-05, + "loss": 0.0509, + "num_input_tokens_seen": 18694688, + "step": 88575 + }, + { + "epoch": 9.744774477447745, + "grad_norm": 0.030320079997181892, + "learning_rate": 3.0433716811447237e-05, + "loss": 0.0167, + "num_input_tokens_seen": 18695712, + "step": 88580 + }, + { + "epoch": 9.745324532453246, + "grad_norm": 0.059591617435216904, + "learning_rate": 3.0431374094630714e-05, + "loss": 0.0101, + "num_input_tokens_seen": 18696832, + "step": 88585 + }, + { + "epoch": 9.745874587458745, + "grad_norm": 0.27245384454727173, + "learning_rate": 3.0429031327755815e-05, + "loss": 0.0084, + "num_input_tokens_seen": 18697920, + "step": 88590 + }, + { + "epoch": 9.746424642464246, + "grad_norm": 0.7242639660835266, + "learning_rate": 3.0426688510844142e-05, + "loss": 0.0445, + "num_input_tokens_seen": 18698944, + "step": 88595 + }, + { + "epoch": 9.746974697469748, + "grad_norm": 0.4463611841201782, + "learning_rate": 3.0424345643917285e-05, + "loss": 0.0184, + "num_input_tokens_seen": 18699968, + "step": 88600 + }, + { + "epoch": 9.747524752475247, + "grad_norm": 0.007002130150794983, + "learning_rate": 3.0422002726996835e-05, + "loss": 0.0259, + "num_input_tokens_seen": 18700928, + "step": 88605 + }, + { + "epoch": 9.748074807480748, + "grad_norm": 0.0104306535795331, + "learning_rate": 3.0419659760104396e-05, + "loss": 0.0133, + "num_input_tokens_seen": 18702016, + "step": 88610 + }, + { + "epoch": 9.748624862486249, + "grad_norm": 0.039698995649814606, + "learning_rate": 3.0417316743261546e-05, + "loss": 0.0956, + "num_input_tokens_seen": 18703040, + "step": 88615 + }, + { + "epoch": 9.749174917491748, + "grad_norm": 0.1311122626066208, + "learning_rate": 3.0414973676489882e-05, + "loss": 0.0026, + "num_input_tokens_seen": 18704032, + "step": 88620 + }, + { + "epoch": 9.74972497249725, + "grad_norm": 0.023756029084324837, + "learning_rate": 3.0412630559811013e-05, + "loss": 0.0067, + "num_input_tokens_seen": 18705088, + "step": 88625 + }, + { + "epoch": 9.75027502750275, + "grad_norm": 0.0458507314324379, + "learning_rate": 3.041028739324652e-05, + "loss": 0.0043, + "num_input_tokens_seen": 18706176, + "step": 88630 + }, + { + "epoch": 9.750825082508252, + "grad_norm": 0.018849210813641548, + "learning_rate": 3.040794417681801e-05, + "loss": 0.1555, + "num_input_tokens_seen": 18707200, + "step": 88635 + }, + { + "epoch": 9.751375137513751, + "grad_norm": 0.015246543101966381, + "learning_rate": 3.040560091054707e-05, + "loss": 0.0041, + "num_input_tokens_seen": 18708192, + "step": 88640 + }, + { + "epoch": 9.751925192519252, + "grad_norm": 0.07664677500724792, + "learning_rate": 3.0403257594455305e-05, + "loss": 0.1207, + "num_input_tokens_seen": 18709248, + "step": 88645 + }, + { + "epoch": 9.752475247524753, + "grad_norm": 0.04232528805732727, + "learning_rate": 3.04009142285643e-05, + "loss": 0.0228, + "num_input_tokens_seen": 18710304, + "step": 88650 + }, + { + "epoch": 9.753025302530252, + "grad_norm": 0.012535389512777328, + "learning_rate": 3.039857081289566e-05, + "loss": 0.0132, + "num_input_tokens_seen": 18711392, + "step": 88655 + }, + { + "epoch": 9.753575357535754, + "grad_norm": 0.42151039838790894, + "learning_rate": 3.0396227347470983e-05, + "loss": 0.09, + "num_input_tokens_seen": 18712480, + "step": 88660 + }, + { + "epoch": 9.754125412541255, + "grad_norm": 0.016576824709773064, + "learning_rate": 3.0393883832311874e-05, + "loss": 0.0127, + "num_input_tokens_seen": 18713536, + "step": 88665 + }, + { + "epoch": 9.754675467546754, + "grad_norm": 0.029622776433825493, + "learning_rate": 3.0391540267439928e-05, + "loss": 0.0025, + "num_input_tokens_seen": 18714560, + "step": 88670 + }, + { + "epoch": 9.755225522552255, + "grad_norm": 1.8223552703857422, + "learning_rate": 3.038919665287674e-05, + "loss": 0.0499, + "num_input_tokens_seen": 18715680, + "step": 88675 + }, + { + "epoch": 9.755775577557756, + "grad_norm": 0.21502920985221863, + "learning_rate": 3.038685298864391e-05, + "loss": 0.0196, + "num_input_tokens_seen": 18716736, + "step": 88680 + }, + { + "epoch": 9.756325632563257, + "grad_norm": 0.02019420824944973, + "learning_rate": 3.038450927476304e-05, + "loss": 0.0415, + "num_input_tokens_seen": 18717824, + "step": 88685 + }, + { + "epoch": 9.756875687568757, + "grad_norm": 0.0531119629740715, + "learning_rate": 3.0382165511255727e-05, + "loss": 0.0416, + "num_input_tokens_seen": 18718912, + "step": 88690 + }, + { + "epoch": 9.757425742574258, + "grad_norm": 0.427588552236557, + "learning_rate": 3.0379821698143584e-05, + "loss": 0.0809, + "num_input_tokens_seen": 18719968, + "step": 88695 + }, + { + "epoch": 9.757975797579759, + "grad_norm": 0.013827512972056866, + "learning_rate": 3.0377477835448205e-05, + "loss": 0.0039, + "num_input_tokens_seen": 18721024, + "step": 88700 + }, + { + "epoch": 9.758525852585258, + "grad_norm": 0.012674279510974884, + "learning_rate": 3.0375133923191194e-05, + "loss": 0.0125, + "num_input_tokens_seen": 18722112, + "step": 88705 + }, + { + "epoch": 9.75907590759076, + "grad_norm": 0.01924818381667137, + "learning_rate": 3.0372789961394154e-05, + "loss": 0.0234, + "num_input_tokens_seen": 18723168, + "step": 88710 + }, + { + "epoch": 9.75962596259626, + "grad_norm": 0.04953411966562271, + "learning_rate": 3.0370445950078686e-05, + "loss": 0.0254, + "num_input_tokens_seen": 18724224, + "step": 88715 + }, + { + "epoch": 9.76017601760176, + "grad_norm": 0.0194682814180851, + "learning_rate": 3.0368101889266387e-05, + "loss": 0.0407, + "num_input_tokens_seen": 18725248, + "step": 88720 + }, + { + "epoch": 9.76072607260726, + "grad_norm": 0.8458698391914368, + "learning_rate": 3.0365757778978877e-05, + "loss": 0.0346, + "num_input_tokens_seen": 18726272, + "step": 88725 + }, + { + "epoch": 9.761276127612762, + "grad_norm": 0.05830617621541023, + "learning_rate": 3.0363413619237758e-05, + "loss": 0.0659, + "num_input_tokens_seen": 18727232, + "step": 88730 + }, + { + "epoch": 9.761826182618261, + "grad_norm": 0.029748672619462013, + "learning_rate": 3.0361069410064613e-05, + "loss": 0.0093, + "num_input_tokens_seen": 18728256, + "step": 88735 + }, + { + "epoch": 9.762376237623762, + "grad_norm": 0.7773471474647522, + "learning_rate": 3.0358725151481078e-05, + "loss": 0.0202, + "num_input_tokens_seen": 18729376, + "step": 88740 + }, + { + "epoch": 9.762926292629263, + "grad_norm": 0.07965958118438721, + "learning_rate": 3.0356380843508736e-05, + "loss": 0.005, + "num_input_tokens_seen": 18730528, + "step": 88745 + }, + { + "epoch": 9.763476347634764, + "grad_norm": 0.024553868919610977, + "learning_rate": 3.0354036486169203e-05, + "loss": 0.0364, + "num_input_tokens_seen": 18731584, + "step": 88750 + }, + { + "epoch": 9.764026402640264, + "grad_norm": 1.1213034391403198, + "learning_rate": 3.0351692079484095e-05, + "loss": 0.0767, + "num_input_tokens_seen": 18732608, + "step": 88755 + }, + { + "epoch": 9.764576457645765, + "grad_norm": 0.04701448604464531, + "learning_rate": 3.0349347623475e-05, + "loss": 0.0735, + "num_input_tokens_seen": 18733664, + "step": 88760 + }, + { + "epoch": 9.765126512651266, + "grad_norm": 0.03023926541209221, + "learning_rate": 3.034700311816353e-05, + "loss": 0.0501, + "num_input_tokens_seen": 18734656, + "step": 88765 + }, + { + "epoch": 9.765676567656765, + "grad_norm": 0.018050167709589005, + "learning_rate": 3.0344658563571304e-05, + "loss": 0.0089, + "num_input_tokens_seen": 18735680, + "step": 88770 + }, + { + "epoch": 9.766226622662266, + "grad_norm": 0.07474339753389359, + "learning_rate": 3.0342313959719925e-05, + "loss": 0.0117, + "num_input_tokens_seen": 18736704, + "step": 88775 + }, + { + "epoch": 9.766776677667767, + "grad_norm": 0.02428184263408184, + "learning_rate": 3.0339969306631005e-05, + "loss": 0.0042, + "num_input_tokens_seen": 18737792, + "step": 88780 + }, + { + "epoch": 9.767326732673267, + "grad_norm": 0.024162519723176956, + "learning_rate": 3.0337624604326147e-05, + "loss": 0.0816, + "num_input_tokens_seen": 18738912, + "step": 88785 + }, + { + "epoch": 9.767876787678768, + "grad_norm": 0.016720471903681755, + "learning_rate": 3.0335279852826964e-05, + "loss": 0.0493, + "num_input_tokens_seen": 18740000, + "step": 88790 + }, + { + "epoch": 9.768426842684269, + "grad_norm": 0.014409059658646584, + "learning_rate": 3.0332935052155066e-05, + "loss": 0.0876, + "num_input_tokens_seen": 18741024, + "step": 88795 + }, + { + "epoch": 9.768976897689768, + "grad_norm": 0.23223170638084412, + "learning_rate": 3.0330590202332064e-05, + "loss": 0.0077, + "num_input_tokens_seen": 18742080, + "step": 88800 + }, + { + "epoch": 9.76952695269527, + "grad_norm": 0.25605154037475586, + "learning_rate": 3.032824530337957e-05, + "loss": 0.0064, + "num_input_tokens_seen": 18743072, + "step": 88805 + }, + { + "epoch": 9.77007700770077, + "grad_norm": 0.016981787979602814, + "learning_rate": 3.03259003553192e-05, + "loss": 0.0071, + "num_input_tokens_seen": 18744192, + "step": 88810 + }, + { + "epoch": 9.770627062706271, + "grad_norm": 0.45233696699142456, + "learning_rate": 3.0323555358172556e-05, + "loss": 0.0122, + "num_input_tokens_seen": 18745216, + "step": 88815 + }, + { + "epoch": 9.77117711771177, + "grad_norm": 0.0721345916390419, + "learning_rate": 3.0321210311961263e-05, + "loss": 0.0123, + "num_input_tokens_seen": 18746208, + "step": 88820 + }, + { + "epoch": 9.771727172717272, + "grad_norm": 0.03288080170750618, + "learning_rate": 3.0318865216706928e-05, + "loss": 0.0128, + "num_input_tokens_seen": 18747264, + "step": 88825 + }, + { + "epoch": 9.772277227722773, + "grad_norm": 0.02263450436294079, + "learning_rate": 3.0316520072431158e-05, + "loss": 0.0018, + "num_input_tokens_seen": 18748352, + "step": 88830 + }, + { + "epoch": 9.772827282728272, + "grad_norm": 0.012815894559025764, + "learning_rate": 3.031417487915557e-05, + "loss": 0.0039, + "num_input_tokens_seen": 18749344, + "step": 88835 + }, + { + "epoch": 9.773377337733773, + "grad_norm": 0.006727264262735844, + "learning_rate": 3.031182963690179e-05, + "loss": 0.009, + "num_input_tokens_seen": 18750368, + "step": 88840 + }, + { + "epoch": 9.773927392739274, + "grad_norm": 0.012900753878057003, + "learning_rate": 3.0309484345691423e-05, + "loss": 0.1321, + "num_input_tokens_seen": 18751392, + "step": 88845 + }, + { + "epoch": 9.774477447744774, + "grad_norm": 0.08931342512369156, + "learning_rate": 3.0307139005546094e-05, + "loss": 0.0084, + "num_input_tokens_seen": 18752480, + "step": 88850 + }, + { + "epoch": 9.775027502750275, + "grad_norm": 0.048744235187768936, + "learning_rate": 3.0304793616487404e-05, + "loss": 0.0049, + "num_input_tokens_seen": 18753568, + "step": 88855 + }, + { + "epoch": 9.775577557755776, + "grad_norm": 0.18353480100631714, + "learning_rate": 3.030244817853698e-05, + "loss": 0.1008, + "num_input_tokens_seen": 18754560, + "step": 88860 + }, + { + "epoch": 9.776127612761275, + "grad_norm": 1.196994662284851, + "learning_rate": 3.0300102691716426e-05, + "loss": 0.0751, + "num_input_tokens_seen": 18755712, + "step": 88865 + }, + { + "epoch": 9.776677667766776, + "grad_norm": 1.6832019090652466, + "learning_rate": 3.0297757156047385e-05, + "loss": 0.0315, + "num_input_tokens_seen": 18756736, + "step": 88870 + }, + { + "epoch": 9.777227722772277, + "grad_norm": 0.7147802710533142, + "learning_rate": 3.0295411571551447e-05, + "loss": 0.036, + "num_input_tokens_seen": 18757856, + "step": 88875 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 1.370517373085022, + "learning_rate": 3.029306593825024e-05, + "loss": 0.1116, + "num_input_tokens_seen": 18758976, + "step": 88880 + }, + { + "epoch": 9.778327832783278, + "grad_norm": 0.008699026890099049, + "learning_rate": 3.029072025616539e-05, + "loss": 0.081, + "num_input_tokens_seen": 18760032, + "step": 88885 + }, + { + "epoch": 9.778877887788779, + "grad_norm": 0.07175138592720032, + "learning_rate": 3.028837452531851e-05, + "loss": 0.1286, + "num_input_tokens_seen": 18761152, + "step": 88890 + }, + { + "epoch": 9.77942794279428, + "grad_norm": 0.8174475431442261, + "learning_rate": 3.0286028745731216e-05, + "loss": 0.0133, + "num_input_tokens_seen": 18762272, + "step": 88895 + }, + { + "epoch": 9.77997799779978, + "grad_norm": 1.1545161008834839, + "learning_rate": 3.0283682917425133e-05, + "loss": 0.0187, + "num_input_tokens_seen": 18763328, + "step": 88900 + }, + { + "epoch": 9.78052805280528, + "grad_norm": 0.08264730125665665, + "learning_rate": 3.028133704042188e-05, + "loss": 0.0217, + "num_input_tokens_seen": 18764384, + "step": 88905 + }, + { + "epoch": 9.781078107810782, + "grad_norm": 0.07420869916677475, + "learning_rate": 3.0278991114743076e-05, + "loss": 0.0524, + "num_input_tokens_seen": 18765440, + "step": 88910 + }, + { + "epoch": 9.781628162816281, + "grad_norm": 0.10081017017364502, + "learning_rate": 3.0276645140410348e-05, + "loss": 0.017, + "num_input_tokens_seen": 18766528, + "step": 88915 + }, + { + "epoch": 9.782178217821782, + "grad_norm": 0.015054559335112572, + "learning_rate": 3.0274299117445316e-05, + "loss": 0.0209, + "num_input_tokens_seen": 18767552, + "step": 88920 + }, + { + "epoch": 9.782728272827283, + "grad_norm": 0.012485106475651264, + "learning_rate": 3.0271953045869588e-05, + "loss": 0.087, + "num_input_tokens_seen": 18768576, + "step": 88925 + }, + { + "epoch": 9.783278327832782, + "grad_norm": 0.09987100958824158, + "learning_rate": 3.0269606925704807e-05, + "loss": 0.0097, + "num_input_tokens_seen": 18769600, + "step": 88930 + }, + { + "epoch": 9.783828382838283, + "grad_norm": 1.7350808382034302, + "learning_rate": 3.0267260756972587e-05, + "loss": 0.1025, + "num_input_tokens_seen": 18770656, + "step": 88935 + }, + { + "epoch": 9.784378437843785, + "grad_norm": 0.43627727031707764, + "learning_rate": 3.026491453969455e-05, + "loss": 0.0239, + "num_input_tokens_seen": 18771712, + "step": 88940 + }, + { + "epoch": 9.784928492849286, + "grad_norm": 0.022655751556158066, + "learning_rate": 3.0262568273892328e-05, + "loss": 0.0188, + "num_input_tokens_seen": 18772832, + "step": 88945 + }, + { + "epoch": 9.785478547854785, + "grad_norm": 0.024139026179909706, + "learning_rate": 3.026022195958754e-05, + "loss": 0.1246, + "num_input_tokens_seen": 18773920, + "step": 88950 + }, + { + "epoch": 9.786028602860286, + "grad_norm": 0.879392683506012, + "learning_rate": 3.02578755968018e-05, + "loss": 0.035, + "num_input_tokens_seen": 18774912, + "step": 88955 + }, + { + "epoch": 9.786578657865787, + "grad_norm": 0.01808254048228264, + "learning_rate": 3.0255529185556748e-05, + "loss": 0.0071, + "num_input_tokens_seen": 18775968, + "step": 88960 + }, + { + "epoch": 9.787128712871286, + "grad_norm": 0.02834213897585869, + "learning_rate": 3.0253182725874006e-05, + "loss": 0.0218, + "num_input_tokens_seen": 18777024, + "step": 88965 + }, + { + "epoch": 9.787678767876788, + "grad_norm": 0.02372092194855213, + "learning_rate": 3.0250836217775203e-05, + "loss": 0.013, + "num_input_tokens_seen": 18778112, + "step": 88970 + }, + { + "epoch": 9.788228822882289, + "grad_norm": 0.3134019374847412, + "learning_rate": 3.024848966128196e-05, + "loss": 0.0898, + "num_input_tokens_seen": 18779200, + "step": 88975 + }, + { + "epoch": 9.788778877887788, + "grad_norm": 0.1528639942407608, + "learning_rate": 3.0246143056415904e-05, + "loss": 0.0209, + "num_input_tokens_seen": 18780352, + "step": 88980 + }, + { + "epoch": 9.789328932893289, + "grad_norm": 0.025337055325508118, + "learning_rate": 3.024379640319866e-05, + "loss": 0.0332, + "num_input_tokens_seen": 18781472, + "step": 88985 + }, + { + "epoch": 9.78987898789879, + "grad_norm": 0.9487817883491516, + "learning_rate": 3.0241449701651874e-05, + "loss": 0.016, + "num_input_tokens_seen": 18782464, + "step": 88990 + }, + { + "epoch": 9.79042904290429, + "grad_norm": 0.02266586199402809, + "learning_rate": 3.023910295179715e-05, + "loss": 0.1138, + "num_input_tokens_seen": 18783552, + "step": 88995 + }, + { + "epoch": 9.79097909790979, + "grad_norm": 3.8549041748046875, + "learning_rate": 3.023675615365613e-05, + "loss": 0.0554, + "num_input_tokens_seen": 18784640, + "step": 89000 + }, + { + "epoch": 9.791529152915292, + "grad_norm": 0.5972326397895813, + "learning_rate": 3.023440930725045e-05, + "loss": 0.0327, + "num_input_tokens_seen": 18785696, + "step": 89005 + }, + { + "epoch": 9.792079207920793, + "grad_norm": 0.035214636474847794, + "learning_rate": 3.0232062412601726e-05, + "loss": 0.0125, + "num_input_tokens_seen": 18786784, + "step": 89010 + }, + { + "epoch": 9.792629262926292, + "grad_norm": 0.04977854713797569, + "learning_rate": 3.022971546973159e-05, + "loss": 0.0044, + "num_input_tokens_seen": 18787872, + "step": 89015 + }, + { + "epoch": 9.793179317931793, + "grad_norm": 0.06031732261180878, + "learning_rate": 3.0227368478661677e-05, + "loss": 0.022, + "num_input_tokens_seen": 18788960, + "step": 89020 + }, + { + "epoch": 9.793729372937294, + "grad_norm": 1.7596306800842285, + "learning_rate": 3.0225021439413615e-05, + "loss": 0.0354, + "num_input_tokens_seen": 18790016, + "step": 89025 + }, + { + "epoch": 9.794279427942794, + "grad_norm": 0.14302827417850494, + "learning_rate": 3.022267435200904e-05, + "loss": 0.0292, + "num_input_tokens_seen": 18791040, + "step": 89030 + }, + { + "epoch": 9.794829482948295, + "grad_norm": 0.18474659323692322, + "learning_rate": 3.0220327216469586e-05, + "loss": 0.0231, + "num_input_tokens_seen": 18792128, + "step": 89035 + }, + { + "epoch": 9.795379537953796, + "grad_norm": 0.1289396733045578, + "learning_rate": 3.021798003281688e-05, + "loss": 0.0028, + "num_input_tokens_seen": 18793088, + "step": 89040 + }, + { + "epoch": 9.795929592959295, + "grad_norm": 0.7865747213363647, + "learning_rate": 3.0215632801072553e-05, + "loss": 0.0189, + "num_input_tokens_seen": 18794208, + "step": 89045 + }, + { + "epoch": 9.796479647964796, + "grad_norm": 0.011348862200975418, + "learning_rate": 3.0213285521258238e-05, + "loss": 0.0055, + "num_input_tokens_seen": 18795296, + "step": 89050 + }, + { + "epoch": 9.797029702970297, + "grad_norm": 0.019319437444210052, + "learning_rate": 3.0210938193395572e-05, + "loss": 0.0426, + "num_input_tokens_seen": 18796320, + "step": 89055 + }, + { + "epoch": 9.797579757975798, + "grad_norm": 0.8852151036262512, + "learning_rate": 3.0208590817506195e-05, + "loss": 0.0387, + "num_input_tokens_seen": 18797440, + "step": 89060 + }, + { + "epoch": 9.798129812981298, + "grad_norm": 0.03079572506248951, + "learning_rate": 3.0206243393611738e-05, + "loss": 0.0597, + "num_input_tokens_seen": 18798464, + "step": 89065 + }, + { + "epoch": 9.798679867986799, + "grad_norm": 0.00486680306494236, + "learning_rate": 3.020389592173382e-05, + "loss": 0.0235, + "num_input_tokens_seen": 18799488, + "step": 89070 + }, + { + "epoch": 9.7992299229923, + "grad_norm": 0.005669599398970604, + "learning_rate": 3.0201548401894104e-05, + "loss": 0.0524, + "num_input_tokens_seen": 18800544, + "step": 89075 + }, + { + "epoch": 9.7997799779978, + "grad_norm": 0.6721199154853821, + "learning_rate": 3.0199200834114205e-05, + "loss": 0.0248, + "num_input_tokens_seen": 18801568, + "step": 89080 + }, + { + "epoch": 9.8003300330033, + "grad_norm": 0.024490689858794212, + "learning_rate": 3.019685321841577e-05, + "loss": 0.0484, + "num_input_tokens_seen": 18802688, + "step": 89085 + }, + { + "epoch": 9.800880088008801, + "grad_norm": 0.03905654698610306, + "learning_rate": 3.0194505554820435e-05, + "loss": 0.0141, + "num_input_tokens_seen": 18803776, + "step": 89090 + }, + { + "epoch": 9.8014301430143, + "grad_norm": 0.029944123700261116, + "learning_rate": 3.0192157843349832e-05, + "loss": 0.0163, + "num_input_tokens_seen": 18804768, + "step": 89095 + }, + { + "epoch": 9.801980198019802, + "grad_norm": 0.1020478680729866, + "learning_rate": 3.0189810084025598e-05, + "loss": 0.0605, + "num_input_tokens_seen": 18805856, + "step": 89100 + }, + { + "epoch": 9.802530253025303, + "grad_norm": 0.020184675231575966, + "learning_rate": 3.0187462276869376e-05, + "loss": 0.0119, + "num_input_tokens_seen": 18806880, + "step": 89105 + }, + { + "epoch": 9.803080308030804, + "grad_norm": 0.4234898090362549, + "learning_rate": 3.018511442190281e-05, + "loss": 0.0578, + "num_input_tokens_seen": 18807936, + "step": 89110 + }, + { + "epoch": 9.803630363036303, + "grad_norm": 0.06611934304237366, + "learning_rate": 3.0182766519147525e-05, + "loss": 0.0058, + "num_input_tokens_seen": 18808896, + "step": 89115 + }, + { + "epoch": 9.804180418041804, + "grad_norm": 0.14510639011859894, + "learning_rate": 3.0180418568625173e-05, + "loss": 0.0082, + "num_input_tokens_seen": 18809952, + "step": 89120 + }, + { + "epoch": 9.804730473047305, + "grad_norm": 0.44046100974082947, + "learning_rate": 3.0178070570357387e-05, + "loss": 0.0202, + "num_input_tokens_seen": 18811008, + "step": 89125 + }, + { + "epoch": 9.805280528052805, + "grad_norm": 0.7112281918525696, + "learning_rate": 3.017572252436581e-05, + "loss": 0.0776, + "num_input_tokens_seen": 18812064, + "step": 89130 + }, + { + "epoch": 9.805830583058306, + "grad_norm": 0.003947795368731022, + "learning_rate": 3.0173374430672086e-05, + "loss": 0.0242, + "num_input_tokens_seen": 18813088, + "step": 89135 + }, + { + "epoch": 9.806380638063807, + "grad_norm": 0.05779806897044182, + "learning_rate": 3.017102628929784e-05, + "loss": 0.0387, + "num_input_tokens_seen": 18814080, + "step": 89140 + }, + { + "epoch": 9.806930693069306, + "grad_norm": 2.1859307289123535, + "learning_rate": 3.0168678100264734e-05, + "loss": 0.0207, + "num_input_tokens_seen": 18815104, + "step": 89145 + }, + { + "epoch": 9.807480748074807, + "grad_norm": 0.02873723953962326, + "learning_rate": 3.0166329863594402e-05, + "loss": 0.0098, + "num_input_tokens_seen": 18816160, + "step": 89150 + }, + { + "epoch": 9.808030803080309, + "grad_norm": 0.06437359005212784, + "learning_rate": 3.0163981579308488e-05, + "loss": 0.0043, + "num_input_tokens_seen": 18817216, + "step": 89155 + }, + { + "epoch": 9.808580858085808, + "grad_norm": 1.735568642616272, + "learning_rate": 3.0161633247428634e-05, + "loss": 0.02, + "num_input_tokens_seen": 18818304, + "step": 89160 + }, + { + "epoch": 9.809130913091309, + "grad_norm": 0.07022975385189056, + "learning_rate": 3.0159284867976478e-05, + "loss": 0.0977, + "num_input_tokens_seen": 18819360, + "step": 89165 + }, + { + "epoch": 9.80968096809681, + "grad_norm": 0.13114555180072784, + "learning_rate": 3.015693644097367e-05, + "loss": 0.0242, + "num_input_tokens_seen": 18820448, + "step": 89170 + }, + { + "epoch": 9.810231023102311, + "grad_norm": 0.062131091952323914, + "learning_rate": 3.0154587966441854e-05, + "loss": 0.038, + "num_input_tokens_seen": 18821472, + "step": 89175 + }, + { + "epoch": 9.81078107810781, + "grad_norm": 1.6948020458221436, + "learning_rate": 3.0152239444402686e-05, + "loss": 0.0514, + "num_input_tokens_seen": 18822496, + "step": 89180 + }, + { + "epoch": 9.811331133113312, + "grad_norm": 0.05103696510195732, + "learning_rate": 3.014989087487778e-05, + "loss": 0.0335, + "num_input_tokens_seen": 18823552, + "step": 89185 + }, + { + "epoch": 9.811881188118813, + "grad_norm": 0.01486042607575655, + "learning_rate": 3.014754225788881e-05, + "loss": 0.0079, + "num_input_tokens_seen": 18824576, + "step": 89190 + }, + { + "epoch": 9.812431243124312, + "grad_norm": 0.04345014691352844, + "learning_rate": 3.0145193593457417e-05, + "loss": 0.0462, + "num_input_tokens_seen": 18825664, + "step": 89195 + }, + { + "epoch": 9.812981298129813, + "grad_norm": 0.03125340864062309, + "learning_rate": 3.014284488160523e-05, + "loss": 0.0752, + "num_input_tokens_seen": 18826688, + "step": 89200 + }, + { + "epoch": 9.813531353135314, + "grad_norm": 0.031562089920043945, + "learning_rate": 3.014049612235393e-05, + "loss": 0.0372, + "num_input_tokens_seen": 18827680, + "step": 89205 + }, + { + "epoch": 9.814081408140813, + "grad_norm": 3.481257915496826, + "learning_rate": 3.0138147315725128e-05, + "loss": 0.1336, + "num_input_tokens_seen": 18828736, + "step": 89210 + }, + { + "epoch": 9.814631463146315, + "grad_norm": 0.6513822674751282, + "learning_rate": 3.0135798461740494e-05, + "loss": 0.1307, + "num_input_tokens_seen": 18829760, + "step": 89215 + }, + { + "epoch": 9.815181518151816, + "grad_norm": 0.02340318262577057, + "learning_rate": 3.0133449560421666e-05, + "loss": 0.107, + "num_input_tokens_seen": 18830784, + "step": 89220 + }, + { + "epoch": 9.815731573157315, + "grad_norm": 0.04869328439235687, + "learning_rate": 3.0131100611790302e-05, + "loss": 0.0043, + "num_input_tokens_seen": 18831872, + "step": 89225 + }, + { + "epoch": 9.816281628162816, + "grad_norm": 0.056974951177835464, + "learning_rate": 3.0128751615868045e-05, + "loss": 0.0026, + "num_input_tokens_seen": 18832864, + "step": 89230 + }, + { + "epoch": 9.816831683168317, + "grad_norm": 0.25047239661216736, + "learning_rate": 3.0126402572676542e-05, + "loss": 0.0408, + "num_input_tokens_seen": 18833888, + "step": 89235 + }, + { + "epoch": 9.817381738173818, + "grad_norm": 0.00860077328979969, + "learning_rate": 3.0124053482237446e-05, + "loss": 0.0499, + "num_input_tokens_seen": 18834912, + "step": 89240 + }, + { + "epoch": 9.817931793179318, + "grad_norm": 0.029820242896676064, + "learning_rate": 3.0121704344572404e-05, + "loss": 0.0286, + "num_input_tokens_seen": 18836000, + "step": 89245 + }, + { + "epoch": 9.818481848184819, + "grad_norm": 0.6024264693260193, + "learning_rate": 3.0119355159703077e-05, + "loss": 0.033, + "num_input_tokens_seen": 18837120, + "step": 89250 + }, + { + "epoch": 9.81903190319032, + "grad_norm": 1.0046069622039795, + "learning_rate": 3.011700592765112e-05, + "loss": 0.1106, + "num_input_tokens_seen": 18838176, + "step": 89255 + }, + { + "epoch": 9.819581958195819, + "grad_norm": 0.040373485535383224, + "learning_rate": 3.0114656648438156e-05, + "loss": 0.0064, + "num_input_tokens_seen": 18839264, + "step": 89260 + }, + { + "epoch": 9.82013201320132, + "grad_norm": 0.012752354145050049, + "learning_rate": 3.0112307322085863e-05, + "loss": 0.0122, + "num_input_tokens_seen": 18840288, + "step": 89265 + }, + { + "epoch": 9.820682068206821, + "grad_norm": 0.05900610610842705, + "learning_rate": 3.010995794861589e-05, + "loss": 0.0064, + "num_input_tokens_seen": 18841280, + "step": 89270 + }, + { + "epoch": 9.82123212321232, + "grad_norm": 0.7559463381767273, + "learning_rate": 3.0107608528049885e-05, + "loss": 0.0246, + "num_input_tokens_seen": 18842336, + "step": 89275 + }, + { + "epoch": 9.821782178217822, + "grad_norm": 1.474493384361267, + "learning_rate": 3.01052590604095e-05, + "loss": 0.0691, + "num_input_tokens_seen": 18843392, + "step": 89280 + }, + { + "epoch": 9.822332233223323, + "grad_norm": 0.05640121176838875, + "learning_rate": 3.0102909545716396e-05, + "loss": 0.0065, + "num_input_tokens_seen": 18844416, + "step": 89285 + }, + { + "epoch": 9.822882288228822, + "grad_norm": 0.43437114357948303, + "learning_rate": 3.010055998399222e-05, + "loss": 0.1118, + "num_input_tokens_seen": 18845472, + "step": 89290 + }, + { + "epoch": 9.823432343234323, + "grad_norm": 0.017729036509990692, + "learning_rate": 3.0098210375258628e-05, + "loss": 0.029, + "num_input_tokens_seen": 18846496, + "step": 89295 + }, + { + "epoch": 9.823982398239824, + "grad_norm": 0.01893557980656624, + "learning_rate": 3.0095860719537282e-05, + "loss": 0.008, + "num_input_tokens_seen": 18847552, + "step": 89300 + }, + { + "epoch": 9.824532453245325, + "grad_norm": 1.639184832572937, + "learning_rate": 3.0093511016849836e-05, + "loss": 0.0829, + "num_input_tokens_seen": 18848608, + "step": 89305 + }, + { + "epoch": 9.825082508250825, + "grad_norm": 0.011748291552066803, + "learning_rate": 3.0091161267217937e-05, + "loss": 0.0341, + "num_input_tokens_seen": 18849664, + "step": 89310 + }, + { + "epoch": 9.825632563256326, + "grad_norm": 0.4412324130535126, + "learning_rate": 3.0088811470663248e-05, + "loss": 0.1276, + "num_input_tokens_seen": 18850656, + "step": 89315 + }, + { + "epoch": 9.826182618261827, + "grad_norm": 0.05486862733960152, + "learning_rate": 3.008646162720743e-05, + "loss": 0.003, + "num_input_tokens_seen": 18851712, + "step": 89320 + }, + { + "epoch": 9.826732673267326, + "grad_norm": 0.010793830268085003, + "learning_rate": 3.0084111736872134e-05, + "loss": 0.0395, + "num_input_tokens_seen": 18852736, + "step": 89325 + }, + { + "epoch": 9.827282728272827, + "grad_norm": 0.08190260827541351, + "learning_rate": 3.0081761799679015e-05, + "loss": 0.0567, + "num_input_tokens_seen": 18853856, + "step": 89330 + }, + { + "epoch": 9.827832783278328, + "grad_norm": 0.0437684990465641, + "learning_rate": 3.007941181564974e-05, + "loss": 0.0155, + "num_input_tokens_seen": 18854912, + "step": 89335 + }, + { + "epoch": 9.828382838283828, + "grad_norm": 0.3697986304759979, + "learning_rate": 3.0077061784805967e-05, + "loss": 0.109, + "num_input_tokens_seen": 18855968, + "step": 89340 + }, + { + "epoch": 9.828932893289329, + "grad_norm": 0.06779962033033371, + "learning_rate": 3.007471170716934e-05, + "loss": 0.0093, + "num_input_tokens_seen": 18857024, + "step": 89345 + }, + { + "epoch": 9.82948294829483, + "grad_norm": 0.009720498695969582, + "learning_rate": 3.0072361582761544e-05, + "loss": 0.0083, + "num_input_tokens_seen": 18858080, + "step": 89350 + }, + { + "epoch": 9.83003300330033, + "grad_norm": 1.977346420288086, + "learning_rate": 3.007001141160422e-05, + "loss": 0.157, + "num_input_tokens_seen": 18859168, + "step": 89355 + }, + { + "epoch": 9.83058305830583, + "grad_norm": 0.11073964834213257, + "learning_rate": 3.0067661193719034e-05, + "loss": 0.0532, + "num_input_tokens_seen": 18860192, + "step": 89360 + }, + { + "epoch": 9.831133113311331, + "grad_norm": 0.023590637370944023, + "learning_rate": 3.0065310929127644e-05, + "loss": 0.006, + "num_input_tokens_seen": 18861216, + "step": 89365 + }, + { + "epoch": 9.831683168316832, + "grad_norm": 0.02270467020571232, + "learning_rate": 3.006296061785172e-05, + "loss": 0.0018, + "num_input_tokens_seen": 18862304, + "step": 89370 + }, + { + "epoch": 9.832233223322332, + "grad_norm": 0.8283246159553528, + "learning_rate": 3.006061025991291e-05, + "loss": 0.0083, + "num_input_tokens_seen": 18863296, + "step": 89375 + }, + { + "epoch": 9.832783278327833, + "grad_norm": 0.012090666219592094, + "learning_rate": 3.0058259855332882e-05, + "loss": 0.0375, + "num_input_tokens_seen": 18864352, + "step": 89380 + }, + { + "epoch": 9.833333333333334, + "grad_norm": 0.0620826780796051, + "learning_rate": 3.005590940413331e-05, + "loss": 0.0158, + "num_input_tokens_seen": 18865376, + "step": 89385 + }, + { + "epoch": 9.833883388338833, + "grad_norm": 0.009647748433053493, + "learning_rate": 3.0053558906335833e-05, + "loss": 0.0753, + "num_input_tokens_seen": 18866368, + "step": 89390 + }, + { + "epoch": 9.834433443344334, + "grad_norm": 0.36307409405708313, + "learning_rate": 3.0051208361962145e-05, + "loss": 0.048, + "num_input_tokens_seen": 18867392, + "step": 89395 + }, + { + "epoch": 9.834983498349835, + "grad_norm": 0.37635335326194763, + "learning_rate": 3.0048857771033884e-05, + "loss": 0.0883, + "num_input_tokens_seen": 18868416, + "step": 89400 + }, + { + "epoch": 9.835533553355335, + "grad_norm": 1.2400569915771484, + "learning_rate": 3.0046507133572722e-05, + "loss": 0.027, + "num_input_tokens_seen": 18869472, + "step": 89405 + }, + { + "epoch": 9.836083608360836, + "grad_norm": 2.179704189300537, + "learning_rate": 3.004415644960033e-05, + "loss": 0.0499, + "num_input_tokens_seen": 18870528, + "step": 89410 + }, + { + "epoch": 9.836633663366337, + "grad_norm": 0.02857600525021553, + "learning_rate": 3.0041805719138366e-05, + "loss": 0.0579, + "num_input_tokens_seen": 18871584, + "step": 89415 + }, + { + "epoch": 9.837183718371836, + "grad_norm": 0.3751947283744812, + "learning_rate": 3.00394549422085e-05, + "loss": 0.0088, + "num_input_tokens_seen": 18872608, + "step": 89420 + }, + { + "epoch": 9.837733773377337, + "grad_norm": 0.019103379920125008, + "learning_rate": 3.0037104118832392e-05, + "loss": 0.0111, + "num_input_tokens_seen": 18873664, + "step": 89425 + }, + { + "epoch": 9.838283828382838, + "grad_norm": 0.4982756972312927, + "learning_rate": 3.0034753249031716e-05, + "loss": 0.0144, + "num_input_tokens_seen": 18874720, + "step": 89430 + }, + { + "epoch": 9.83883388338834, + "grad_norm": 0.25585702061653137, + "learning_rate": 3.003240233282813e-05, + "loss": 0.0241, + "num_input_tokens_seen": 18875744, + "step": 89435 + }, + { + "epoch": 9.839383938393839, + "grad_norm": 0.11543484032154083, + "learning_rate": 3.003005137024331e-05, + "loss": 0.0324, + "num_input_tokens_seen": 18876768, + "step": 89440 + }, + { + "epoch": 9.83993399339934, + "grad_norm": 1.3834397792816162, + "learning_rate": 3.0027700361298926e-05, + "loss": 0.0526, + "num_input_tokens_seen": 18877856, + "step": 89445 + }, + { + "epoch": 9.840484048404841, + "grad_norm": 0.8668777942657471, + "learning_rate": 3.002534930601664e-05, + "loss": 0.121, + "num_input_tokens_seen": 18878912, + "step": 89450 + }, + { + "epoch": 9.84103410341034, + "grad_norm": 0.18909208476543427, + "learning_rate": 3.0022998204418112e-05, + "loss": 0.0132, + "num_input_tokens_seen": 18879968, + "step": 89455 + }, + { + "epoch": 9.841584158415841, + "grad_norm": 0.011272042989730835, + "learning_rate": 3.0020647056525026e-05, + "loss": 0.0107, + "num_input_tokens_seen": 18881056, + "step": 89460 + }, + { + "epoch": 9.842134213421343, + "grad_norm": 2.0147244930267334, + "learning_rate": 3.0018295862359043e-05, + "loss": 0.0411, + "num_input_tokens_seen": 18882048, + "step": 89465 + }, + { + "epoch": 9.842684268426842, + "grad_norm": 0.028056785464286804, + "learning_rate": 3.001594462194183e-05, + "loss": 0.0088, + "num_input_tokens_seen": 18883136, + "step": 89470 + }, + { + "epoch": 9.843234323432343, + "grad_norm": 0.0361853688955307, + "learning_rate": 3.0013593335295066e-05, + "loss": 0.0394, + "num_input_tokens_seen": 18884128, + "step": 89475 + }, + { + "epoch": 9.843784378437844, + "grad_norm": 0.012054584920406342, + "learning_rate": 3.001124200244042e-05, + "loss": 0.0061, + "num_input_tokens_seen": 18885216, + "step": 89480 + }, + { + "epoch": 9.844334433443345, + "grad_norm": 0.026140006259083748, + "learning_rate": 3.0008890623399562e-05, + "loss": 0.004, + "num_input_tokens_seen": 18886208, + "step": 89485 + }, + { + "epoch": 9.844884488448844, + "grad_norm": 1.9310444593429565, + "learning_rate": 3.000653919819416e-05, + "loss": 0.1364, + "num_input_tokens_seen": 18887232, + "step": 89490 + }, + { + "epoch": 9.845434543454346, + "grad_norm": 0.04904824122786522, + "learning_rate": 3.0004187726845895e-05, + "loss": 0.0678, + "num_input_tokens_seen": 18888256, + "step": 89495 + }, + { + "epoch": 9.845984598459847, + "grad_norm": 0.13938133418560028, + "learning_rate": 3.0001836209376427e-05, + "loss": 0.0063, + "num_input_tokens_seen": 18889248, + "step": 89500 + }, + { + "epoch": 9.846534653465346, + "grad_norm": 0.08990386128425598, + "learning_rate": 2.999948464580743e-05, + "loss": 0.0539, + "num_input_tokens_seen": 18890240, + "step": 89505 + }, + { + "epoch": 9.847084708470847, + "grad_norm": 0.030634308233857155, + "learning_rate": 2.9997133036160587e-05, + "loss": 0.0546, + "num_input_tokens_seen": 18891264, + "step": 89510 + }, + { + "epoch": 9.847634763476348, + "grad_norm": 0.018357105553150177, + "learning_rate": 2.9994781380457575e-05, + "loss": 0.0389, + "num_input_tokens_seen": 18892288, + "step": 89515 + }, + { + "epoch": 9.848184818481847, + "grad_norm": 0.45750370621681213, + "learning_rate": 2.9992429678720047e-05, + "loss": 0.0268, + "num_input_tokens_seen": 18893408, + "step": 89520 + }, + { + "epoch": 9.848734873487349, + "grad_norm": 0.005761000793427229, + "learning_rate": 2.9990077930969694e-05, + "loss": 0.0029, + "num_input_tokens_seen": 18894464, + "step": 89525 + }, + { + "epoch": 9.84928492849285, + "grad_norm": 0.6253534555435181, + "learning_rate": 2.998772613722819e-05, + "loss": 0.0503, + "num_input_tokens_seen": 18895552, + "step": 89530 + }, + { + "epoch": 9.84983498349835, + "grad_norm": 0.013183243572711945, + "learning_rate": 2.99853742975172e-05, + "loss": 0.0129, + "num_input_tokens_seen": 18896640, + "step": 89535 + }, + { + "epoch": 9.85038503850385, + "grad_norm": 0.7224751710891724, + "learning_rate": 2.9983022411858418e-05, + "loss": 0.0341, + "num_input_tokens_seen": 18897696, + "step": 89540 + }, + { + "epoch": 9.850935093509351, + "grad_norm": 0.020588479936122894, + "learning_rate": 2.998067048027351e-05, + "loss": 0.0887, + "num_input_tokens_seen": 18898720, + "step": 89545 + }, + { + "epoch": 9.851485148514852, + "grad_norm": 0.312815397977829, + "learning_rate": 2.997831850278414e-05, + "loss": 0.0071, + "num_input_tokens_seen": 18899712, + "step": 89550 + }, + { + "epoch": 9.852035203520352, + "grad_norm": 0.009056712500751019, + "learning_rate": 2.9975966479412004e-05, + "loss": 0.0185, + "num_input_tokens_seen": 18900736, + "step": 89555 + }, + { + "epoch": 9.852585258525853, + "grad_norm": 0.018604466691613197, + "learning_rate": 2.9973614410178773e-05, + "loss": 0.0242, + "num_input_tokens_seen": 18901824, + "step": 89560 + }, + { + "epoch": 9.853135313531354, + "grad_norm": 0.4727005958557129, + "learning_rate": 2.9971262295106128e-05, + "loss": 0.0144, + "num_input_tokens_seen": 18902816, + "step": 89565 + }, + { + "epoch": 9.853685368536853, + "grad_norm": 0.0771530494093895, + "learning_rate": 2.996891013421574e-05, + "loss": 0.0111, + "num_input_tokens_seen": 18903808, + "step": 89570 + }, + { + "epoch": 9.854235423542354, + "grad_norm": 1.2370883226394653, + "learning_rate": 2.996655792752929e-05, + "loss": 0.025, + "num_input_tokens_seen": 18904832, + "step": 89575 + }, + { + "epoch": 9.854785478547855, + "grad_norm": 1.4172940254211426, + "learning_rate": 2.9964205675068458e-05, + "loss": 0.071, + "num_input_tokens_seen": 18905888, + "step": 89580 + }, + { + "epoch": 9.855335533553355, + "grad_norm": 0.07317973673343658, + "learning_rate": 2.9961853376854933e-05, + "loss": 0.0048, + "num_input_tokens_seen": 18906880, + "step": 89585 + }, + { + "epoch": 9.855885588558856, + "grad_norm": 0.03704715147614479, + "learning_rate": 2.9959501032910377e-05, + "loss": 0.0046, + "num_input_tokens_seen": 18907904, + "step": 89590 + }, + { + "epoch": 9.856435643564357, + "grad_norm": 0.07288673520088196, + "learning_rate": 2.9957148643256484e-05, + "loss": 0.0549, + "num_input_tokens_seen": 18909024, + "step": 89595 + }, + { + "epoch": 9.856985698569858, + "grad_norm": 0.1176319420337677, + "learning_rate": 2.9954796207914925e-05, + "loss": 0.0031, + "num_input_tokens_seen": 18910080, + "step": 89600 + }, + { + "epoch": 9.857535753575357, + "grad_norm": 0.054584383964538574, + "learning_rate": 2.995244372690739e-05, + "loss": 0.0088, + "num_input_tokens_seen": 18911104, + "step": 89605 + }, + { + "epoch": 9.858085808580858, + "grad_norm": 0.4494907259941101, + "learning_rate": 2.9950091200255565e-05, + "loss": 0.043, + "num_input_tokens_seen": 18912096, + "step": 89610 + }, + { + "epoch": 9.85863586358636, + "grad_norm": 0.11015357077121735, + "learning_rate": 2.9947738627981115e-05, + "loss": 0.0072, + "num_input_tokens_seen": 18913056, + "step": 89615 + }, + { + "epoch": 9.859185918591859, + "grad_norm": 1.5726103782653809, + "learning_rate": 2.9945386010105735e-05, + "loss": 0.124, + "num_input_tokens_seen": 18914080, + "step": 89620 + }, + { + "epoch": 9.85973597359736, + "grad_norm": 0.017163243144750595, + "learning_rate": 2.9943033346651105e-05, + "loss": 0.0097, + "num_input_tokens_seen": 18915136, + "step": 89625 + }, + { + "epoch": 9.86028602860286, + "grad_norm": 0.1373954713344574, + "learning_rate": 2.9940680637638913e-05, + "loss": 0.0027, + "num_input_tokens_seen": 18916192, + "step": 89630 + }, + { + "epoch": 9.86083608360836, + "grad_norm": 0.03816504776477814, + "learning_rate": 2.9938327883090833e-05, + "loss": 0.0107, + "num_input_tokens_seen": 18917280, + "step": 89635 + }, + { + "epoch": 9.861386138613861, + "grad_norm": 0.7840295433998108, + "learning_rate": 2.9935975083028556e-05, + "loss": 0.0695, + "num_input_tokens_seen": 18918336, + "step": 89640 + }, + { + "epoch": 9.861936193619362, + "grad_norm": 0.025701312348246574, + "learning_rate": 2.993362223747377e-05, + "loss": 0.0118, + "num_input_tokens_seen": 18919360, + "step": 89645 + }, + { + "epoch": 9.862486248624862, + "grad_norm": 1.5736849308013916, + "learning_rate": 2.9931269346448142e-05, + "loss": 0.0421, + "num_input_tokens_seen": 18920416, + "step": 89650 + }, + { + "epoch": 9.863036303630363, + "grad_norm": 0.017636723816394806, + "learning_rate": 2.992891640997339e-05, + "loss": 0.0339, + "num_input_tokens_seen": 18921472, + "step": 89655 + }, + { + "epoch": 9.863586358635864, + "grad_norm": 1.4859256744384766, + "learning_rate": 2.992656342807117e-05, + "loss": 0.0966, + "num_input_tokens_seen": 18922528, + "step": 89660 + }, + { + "epoch": 9.864136413641365, + "grad_norm": 0.15760980546474457, + "learning_rate": 2.9924210400763174e-05, + "loss": 0.0096, + "num_input_tokens_seen": 18923616, + "step": 89665 + }, + { + "epoch": 9.864686468646864, + "grad_norm": 0.030994636937975883, + "learning_rate": 2.99218573280711e-05, + "loss": 0.0349, + "num_input_tokens_seen": 18924640, + "step": 89670 + }, + { + "epoch": 9.865236523652365, + "grad_norm": 0.05936555564403534, + "learning_rate": 2.9919504210016623e-05, + "loss": 0.0027, + "num_input_tokens_seen": 18925632, + "step": 89675 + }, + { + "epoch": 9.865786578657866, + "grad_norm": 0.14899399876594543, + "learning_rate": 2.9917151046621444e-05, + "loss": 0.0132, + "num_input_tokens_seen": 18926720, + "step": 89680 + }, + { + "epoch": 9.866336633663366, + "grad_norm": 1.9975290298461914, + "learning_rate": 2.9914797837907243e-05, + "loss": 0.1044, + "num_input_tokens_seen": 18927808, + "step": 89685 + }, + { + "epoch": 9.866886688668867, + "grad_norm": 0.020746730268001556, + "learning_rate": 2.9912444583895703e-05, + "loss": 0.0107, + "num_input_tokens_seen": 18928832, + "step": 89690 + }, + { + "epoch": 9.867436743674368, + "grad_norm": 0.009444315917789936, + "learning_rate": 2.9910091284608517e-05, + "loss": 0.0558, + "num_input_tokens_seen": 18929920, + "step": 89695 + }, + { + "epoch": 9.867986798679867, + "grad_norm": 0.06761520355939865, + "learning_rate": 2.9907737940067383e-05, + "loss": 0.0071, + "num_input_tokens_seen": 18930944, + "step": 89700 + }, + { + "epoch": 9.868536853685368, + "grad_norm": 0.2783500552177429, + "learning_rate": 2.9905384550293985e-05, + "loss": 0.0058, + "num_input_tokens_seen": 18932000, + "step": 89705 + }, + { + "epoch": 9.86908690869087, + "grad_norm": 0.050125397741794586, + "learning_rate": 2.9903031115310003e-05, + "loss": 0.131, + "num_input_tokens_seen": 18932992, + "step": 89710 + }, + { + "epoch": 9.869636963696369, + "grad_norm": 0.013097263872623444, + "learning_rate": 2.9900677635137142e-05, + "loss": 0.038, + "num_input_tokens_seen": 18934112, + "step": 89715 + }, + { + "epoch": 9.87018701870187, + "grad_norm": 0.03706180304288864, + "learning_rate": 2.9898324109797083e-05, + "loss": 0.0057, + "num_input_tokens_seen": 18935200, + "step": 89720 + }, + { + "epoch": 9.870737073707371, + "grad_norm": 0.26837074756622314, + "learning_rate": 2.989597053931152e-05, + "loss": 0.104, + "num_input_tokens_seen": 18936224, + "step": 89725 + }, + { + "epoch": 9.871287128712872, + "grad_norm": 1.4117035865783691, + "learning_rate": 2.989361692370215e-05, + "loss": 0.0722, + "num_input_tokens_seen": 18937248, + "step": 89730 + }, + { + "epoch": 9.871837183718371, + "grad_norm": 0.3010534346103668, + "learning_rate": 2.989126326299066e-05, + "loss": 0.0346, + "num_input_tokens_seen": 18938304, + "step": 89735 + }, + { + "epoch": 9.872387238723872, + "grad_norm": 0.04347900673747063, + "learning_rate": 2.988890955719874e-05, + "loss": 0.0502, + "num_input_tokens_seen": 18939360, + "step": 89740 + }, + { + "epoch": 9.872937293729374, + "grad_norm": 1.2407444715499878, + "learning_rate": 2.9886555806348088e-05, + "loss": 0.0245, + "num_input_tokens_seen": 18940416, + "step": 89745 + }, + { + "epoch": 9.873487348734873, + "grad_norm": 0.46824750304222107, + "learning_rate": 2.98842020104604e-05, + "loss": 0.0479, + "num_input_tokens_seen": 18941632, + "step": 89750 + }, + { + "epoch": 9.874037403740374, + "grad_norm": 0.018390756100416183, + "learning_rate": 2.9881848169557365e-05, + "loss": 0.0825, + "num_input_tokens_seen": 18942720, + "step": 89755 + }, + { + "epoch": 9.874587458745875, + "grad_norm": 0.580788791179657, + "learning_rate": 2.9879494283660676e-05, + "loss": 0.0305, + "num_input_tokens_seen": 18943776, + "step": 89760 + }, + { + "epoch": 9.875137513751374, + "grad_norm": 0.06064644455909729, + "learning_rate": 2.9877140352792033e-05, + "loss": 0.0048, + "num_input_tokens_seen": 18944896, + "step": 89765 + }, + { + "epoch": 9.875687568756875, + "grad_norm": 0.05070647969841957, + "learning_rate": 2.987478637697312e-05, + "loss": 0.0037, + "num_input_tokens_seen": 18945920, + "step": 89770 + }, + { + "epoch": 9.876237623762377, + "grad_norm": 0.0765320211648941, + "learning_rate": 2.987243235622565e-05, + "loss": 0.0482, + "num_input_tokens_seen": 18946976, + "step": 89775 + }, + { + "epoch": 9.876787678767876, + "grad_norm": 0.21431127190589905, + "learning_rate": 2.9870078290571295e-05, + "loss": 0.0082, + "num_input_tokens_seen": 18948032, + "step": 89780 + }, + { + "epoch": 9.877337733773377, + "grad_norm": 0.040521085262298584, + "learning_rate": 2.9867724180031775e-05, + "loss": 0.0834, + "num_input_tokens_seen": 18949056, + "step": 89785 + }, + { + "epoch": 9.877887788778878, + "grad_norm": 0.04605661705136299, + "learning_rate": 2.9865370024628775e-05, + "loss": 0.0161, + "num_input_tokens_seen": 18950048, + "step": 89790 + }, + { + "epoch": 9.87843784378438, + "grad_norm": 0.6153698563575745, + "learning_rate": 2.9863015824384e-05, + "loss": 0.0658, + "num_input_tokens_seen": 18951072, + "step": 89795 + }, + { + "epoch": 9.878987898789878, + "grad_norm": 0.5841554999351501, + "learning_rate": 2.986066157931913e-05, + "loss": 0.018, + "num_input_tokens_seen": 18952192, + "step": 89800 + }, + { + "epoch": 9.87953795379538, + "grad_norm": 0.09222576767206192, + "learning_rate": 2.985830728945589e-05, + "loss": 0.0345, + "num_input_tokens_seen": 18953280, + "step": 89805 + }, + { + "epoch": 9.88008800880088, + "grad_norm": 0.2140514999628067, + "learning_rate": 2.9855952954815947e-05, + "loss": 0.0123, + "num_input_tokens_seen": 18954336, + "step": 89810 + }, + { + "epoch": 9.88063806380638, + "grad_norm": 0.3152245581150055, + "learning_rate": 2.985359857542102e-05, + "loss": 0.027, + "num_input_tokens_seen": 18955488, + "step": 89815 + }, + { + "epoch": 9.881188118811881, + "grad_norm": 0.10558947175741196, + "learning_rate": 2.9851244151292814e-05, + "loss": 0.1066, + "num_input_tokens_seen": 18956576, + "step": 89820 + }, + { + "epoch": 9.881738173817382, + "grad_norm": 1.147542119026184, + "learning_rate": 2.9848889682453007e-05, + "loss": 0.0871, + "num_input_tokens_seen": 18957664, + "step": 89825 + }, + { + "epoch": 9.882288228822881, + "grad_norm": 0.3799956738948822, + "learning_rate": 2.984653516892332e-05, + "loss": 0.0458, + "num_input_tokens_seen": 18958656, + "step": 89830 + }, + { + "epoch": 9.882838283828383, + "grad_norm": 0.041897740215063095, + "learning_rate": 2.9844180610725436e-05, + "loss": 0.003, + "num_input_tokens_seen": 18959648, + "step": 89835 + }, + { + "epoch": 9.883388338833884, + "grad_norm": 1.9740467071533203, + "learning_rate": 2.9841826007881063e-05, + "loss": 0.0591, + "num_input_tokens_seen": 18960640, + "step": 89840 + }, + { + "epoch": 9.883938393839383, + "grad_norm": 1.1914470195770264, + "learning_rate": 2.9839471360411908e-05, + "loss": 0.0614, + "num_input_tokens_seen": 18961696, + "step": 89845 + }, + { + "epoch": 9.884488448844884, + "grad_norm": 1.3839938640594482, + "learning_rate": 2.983711666833967e-05, + "loss": 0.0533, + "num_input_tokens_seen": 18962720, + "step": 89850 + }, + { + "epoch": 9.885038503850385, + "grad_norm": 0.24728932976722717, + "learning_rate": 2.983476193168604e-05, + "loss": 0.0295, + "num_input_tokens_seen": 18963808, + "step": 89855 + }, + { + "epoch": 9.885588558855886, + "grad_norm": 1.532417893409729, + "learning_rate": 2.9832407150472735e-05, + "loss": 0.1212, + "num_input_tokens_seen": 18964896, + "step": 89860 + }, + { + "epoch": 9.886138613861386, + "grad_norm": 0.0743463858962059, + "learning_rate": 2.9830052324721454e-05, + "loss": 0.0051, + "num_input_tokens_seen": 18965920, + "step": 89865 + }, + { + "epoch": 9.886688668866887, + "grad_norm": 0.04271586611866951, + "learning_rate": 2.9827697454453895e-05, + "loss": 0.0065, + "num_input_tokens_seen": 18967008, + "step": 89870 + }, + { + "epoch": 9.887238723872388, + "grad_norm": 1.0335026979446411, + "learning_rate": 2.9825342539691764e-05, + "loss": 0.0707, + "num_input_tokens_seen": 18968128, + "step": 89875 + }, + { + "epoch": 9.887788778877887, + "grad_norm": 1.1653809547424316, + "learning_rate": 2.9822987580456768e-05, + "loss": 0.062, + "num_input_tokens_seen": 18969152, + "step": 89880 + }, + { + "epoch": 9.888338833883388, + "grad_norm": 0.7467712759971619, + "learning_rate": 2.9820632576770607e-05, + "loss": 0.0334, + "num_input_tokens_seen": 18970208, + "step": 89885 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 0.007866589352488518, + "learning_rate": 2.9818277528654993e-05, + "loss": 0.1105, + "num_input_tokens_seen": 18971232, + "step": 89890 + }, + { + "epoch": 9.88943894389439, + "grad_norm": 1.8670791387557983, + "learning_rate": 2.9815922436131627e-05, + "loss": 0.0312, + "num_input_tokens_seen": 18972384, + "step": 89895 + }, + { + "epoch": 9.88998899889989, + "grad_norm": 0.04633333161473274, + "learning_rate": 2.981356729922221e-05, + "loss": 0.0244, + "num_input_tokens_seen": 18973440, + "step": 89900 + }, + { + "epoch": 9.89053905390539, + "grad_norm": 0.031773269176483154, + "learning_rate": 2.9811212117948457e-05, + "loss": 0.0831, + "num_input_tokens_seen": 18974496, + "step": 89905 + }, + { + "epoch": 9.891089108910892, + "grad_norm": 0.3190244138240814, + "learning_rate": 2.980885689233207e-05, + "loss": 0.0249, + "num_input_tokens_seen": 18975552, + "step": 89910 + }, + { + "epoch": 9.891639163916391, + "grad_norm": 0.019829779863357544, + "learning_rate": 2.9806501622394757e-05, + "loss": 0.0109, + "num_input_tokens_seen": 18976544, + "step": 89915 + }, + { + "epoch": 9.892189218921892, + "grad_norm": 0.013326258398592472, + "learning_rate": 2.9804146308158227e-05, + "loss": 0.0039, + "num_input_tokens_seen": 18977664, + "step": 89920 + }, + { + "epoch": 9.892739273927393, + "grad_norm": 2.271423816680908, + "learning_rate": 2.9801790949644174e-05, + "loss": 0.1099, + "num_input_tokens_seen": 18978688, + "step": 89925 + }, + { + "epoch": 9.893289328932893, + "grad_norm": 2.996617317199707, + "learning_rate": 2.9799435546874334e-05, + "loss": 0.0533, + "num_input_tokens_seen": 18979744, + "step": 89930 + }, + { + "epoch": 9.893839383938394, + "grad_norm": 0.6396915316581726, + "learning_rate": 2.979708009987039e-05, + "loss": 0.0219, + "num_input_tokens_seen": 18980832, + "step": 89935 + }, + { + "epoch": 9.894389438943895, + "grad_norm": 0.07869867235422134, + "learning_rate": 2.9794724608654068e-05, + "loss": 0.1032, + "num_input_tokens_seen": 18981984, + "step": 89940 + }, + { + "epoch": 9.894939493949394, + "grad_norm": 0.2657735347747803, + "learning_rate": 2.979236907324706e-05, + "loss": 0.1176, + "num_input_tokens_seen": 18983072, + "step": 89945 + }, + { + "epoch": 9.895489548954895, + "grad_norm": 0.7374245524406433, + "learning_rate": 2.9790013493671093e-05, + "loss": 0.0142, + "num_input_tokens_seen": 18984096, + "step": 89950 + }, + { + "epoch": 9.896039603960396, + "grad_norm": 0.9013760685920715, + "learning_rate": 2.9787657869947867e-05, + "loss": 0.0099, + "num_input_tokens_seen": 18985152, + "step": 89955 + }, + { + "epoch": 9.896589658965897, + "grad_norm": 0.12859059870243073, + "learning_rate": 2.9785302202099096e-05, + "loss": 0.0083, + "num_input_tokens_seen": 18986240, + "step": 89960 + }, + { + "epoch": 9.897139713971397, + "grad_norm": 0.11399125307798386, + "learning_rate": 2.9782946490146496e-05, + "loss": 0.009, + "num_input_tokens_seen": 18987296, + "step": 89965 + }, + { + "epoch": 9.897689768976898, + "grad_norm": 1.650744915008545, + "learning_rate": 2.9780590734111763e-05, + "loss": 0.1405, + "num_input_tokens_seen": 18988384, + "step": 89970 + }, + { + "epoch": 9.898239823982399, + "grad_norm": 0.06156592071056366, + "learning_rate": 2.977823493401663e-05, + "loss": 0.0815, + "num_input_tokens_seen": 18989344, + "step": 89975 + }, + { + "epoch": 9.898789878987898, + "grad_norm": 0.030784152448177338, + "learning_rate": 2.9775879089882793e-05, + "loss": 0.0152, + "num_input_tokens_seen": 18990400, + "step": 89980 + }, + { + "epoch": 9.8993399339934, + "grad_norm": 0.053481582552194595, + "learning_rate": 2.9773523201731967e-05, + "loss": 0.0131, + "num_input_tokens_seen": 18991424, + "step": 89985 + }, + { + "epoch": 9.8998899889989, + "grad_norm": 0.3699589967727661, + "learning_rate": 2.9771167269585874e-05, + "loss": 0.0058, + "num_input_tokens_seen": 18992448, + "step": 89990 + }, + { + "epoch": 9.9004400440044, + "grad_norm": 0.036561548709869385, + "learning_rate": 2.976881129346622e-05, + "loss": 0.051, + "num_input_tokens_seen": 18993536, + "step": 89995 + }, + { + "epoch": 9.900990099009901, + "grad_norm": 0.11405983567237854, + "learning_rate": 2.9766455273394718e-05, + "loss": 0.0029, + "num_input_tokens_seen": 18994624, + "step": 90000 + }, + { + "epoch": 9.901540154015402, + "grad_norm": 0.11062934249639511, + "learning_rate": 2.9764099209393086e-05, + "loss": 0.0058, + "num_input_tokens_seen": 18995680, + "step": 90005 + }, + { + "epoch": 9.902090209020901, + "grad_norm": 0.04613054171204567, + "learning_rate": 2.9761743101483042e-05, + "loss": 0.1045, + "num_input_tokens_seen": 18996704, + "step": 90010 + }, + { + "epoch": 9.902640264026402, + "grad_norm": 0.025872817263007164, + "learning_rate": 2.975938694968629e-05, + "loss": 0.008, + "num_input_tokens_seen": 18997696, + "step": 90015 + }, + { + "epoch": 9.903190319031903, + "grad_norm": 0.14315365254878998, + "learning_rate": 2.975703075402456e-05, + "loss": 0.0539, + "num_input_tokens_seen": 18998752, + "step": 90020 + }, + { + "epoch": 9.903740374037405, + "grad_norm": 0.02744933031499386, + "learning_rate": 2.9754674514519553e-05, + "loss": 0.0052, + "num_input_tokens_seen": 18999808, + "step": 90025 + }, + { + "epoch": 9.904290429042904, + "grad_norm": 0.39358314871788025, + "learning_rate": 2.9752318231192993e-05, + "loss": 0.0283, + "num_input_tokens_seen": 19000800, + "step": 90030 + }, + { + "epoch": 9.904840484048405, + "grad_norm": 0.06031806021928787, + "learning_rate": 2.9749961904066604e-05, + "loss": 0.1216, + "num_input_tokens_seen": 19001920, + "step": 90035 + }, + { + "epoch": 9.905390539053906, + "grad_norm": 0.02157089300453663, + "learning_rate": 2.9747605533162093e-05, + "loss": 0.0243, + "num_input_tokens_seen": 19002912, + "step": 90040 + }, + { + "epoch": 9.905940594059405, + "grad_norm": 0.1327177733182907, + "learning_rate": 2.974524911850117e-05, + "loss": 0.0326, + "num_input_tokens_seen": 19003968, + "step": 90045 + }, + { + "epoch": 9.906490649064907, + "grad_norm": 0.72021484375, + "learning_rate": 2.974289266010557e-05, + "loss": 0.0126, + "num_input_tokens_seen": 19005024, + "step": 90050 + }, + { + "epoch": 9.907040704070408, + "grad_norm": 0.5148208737373352, + "learning_rate": 2.9740536157997008e-05, + "loss": 0.1432, + "num_input_tokens_seen": 19006048, + "step": 90055 + }, + { + "epoch": 9.907590759075907, + "grad_norm": 0.019405996426939964, + "learning_rate": 2.973817961219719e-05, + "loss": 0.0604, + "num_input_tokens_seen": 19007072, + "step": 90060 + }, + { + "epoch": 9.908140814081408, + "grad_norm": 0.5415815711021423, + "learning_rate": 2.9735823022727856e-05, + "loss": 0.0117, + "num_input_tokens_seen": 19008096, + "step": 90065 + }, + { + "epoch": 9.908690869086909, + "grad_norm": 0.14138725399971008, + "learning_rate": 2.9733466389610696e-05, + "loss": 0.0177, + "num_input_tokens_seen": 19009120, + "step": 90070 + }, + { + "epoch": 9.909240924092408, + "grad_norm": 0.24270381033420563, + "learning_rate": 2.973110971286746e-05, + "loss": 0.0097, + "num_input_tokens_seen": 19010112, + "step": 90075 + }, + { + "epoch": 9.90979097909791, + "grad_norm": 0.43071407079696655, + "learning_rate": 2.972875299251986e-05, + "loss": 0.0649, + "num_input_tokens_seen": 19011200, + "step": 90080 + }, + { + "epoch": 9.91034103410341, + "grad_norm": 0.7901273369789124, + "learning_rate": 2.9726396228589604e-05, + "loss": 0.0101, + "num_input_tokens_seen": 19012256, + "step": 90085 + }, + { + "epoch": 9.910891089108912, + "grad_norm": 0.06266496330499649, + "learning_rate": 2.9724039421098427e-05, + "loss": 0.0078, + "num_input_tokens_seen": 19013344, + "step": 90090 + }, + { + "epoch": 9.911441144114411, + "grad_norm": 0.0344405360519886, + "learning_rate": 2.9721682570068042e-05, + "loss": 0.0027, + "num_input_tokens_seen": 19014336, + "step": 90095 + }, + { + "epoch": 9.911991199119912, + "grad_norm": 0.313019722700119, + "learning_rate": 2.971932567552017e-05, + "loss": 0.0209, + "num_input_tokens_seen": 19015360, + "step": 90100 + }, + { + "epoch": 9.912541254125413, + "grad_norm": 2.08109188079834, + "learning_rate": 2.9716968737476546e-05, + "loss": 0.033, + "num_input_tokens_seen": 19016384, + "step": 90105 + }, + { + "epoch": 9.913091309130913, + "grad_norm": 0.12496212869882584, + "learning_rate": 2.971461175595889e-05, + "loss": 0.0039, + "num_input_tokens_seen": 19017440, + "step": 90110 + }, + { + "epoch": 9.913641364136414, + "grad_norm": 0.06634947657585144, + "learning_rate": 2.971225473098891e-05, + "loss": 0.0103, + "num_input_tokens_seen": 19018464, + "step": 90115 + }, + { + "epoch": 9.914191419141915, + "grad_norm": 0.9797850251197815, + "learning_rate": 2.9709897662588338e-05, + "loss": 0.0598, + "num_input_tokens_seen": 19019488, + "step": 90120 + }, + { + "epoch": 9.914741474147414, + "grad_norm": 0.1421726942062378, + "learning_rate": 2.9707540550778907e-05, + "loss": 0.0263, + "num_input_tokens_seen": 19020576, + "step": 90125 + }, + { + "epoch": 9.915291529152915, + "grad_norm": 0.05132288113236427, + "learning_rate": 2.9705183395582325e-05, + "loss": 0.009, + "num_input_tokens_seen": 19021568, + "step": 90130 + }, + { + "epoch": 9.915841584158416, + "grad_norm": 0.020596662536263466, + "learning_rate": 2.9702826197020335e-05, + "loss": 0.1022, + "num_input_tokens_seen": 19022560, + "step": 90135 + }, + { + "epoch": 9.916391639163916, + "grad_norm": 0.037330690771341324, + "learning_rate": 2.9700468955114647e-05, + "loss": 0.0755, + "num_input_tokens_seen": 19023584, + "step": 90140 + }, + { + "epoch": 9.916941694169417, + "grad_norm": 0.063625268638134, + "learning_rate": 2.9698111669886987e-05, + "loss": 0.0245, + "num_input_tokens_seen": 19024608, + "step": 90145 + }, + { + "epoch": 9.917491749174918, + "grad_norm": 0.015007032081484795, + "learning_rate": 2.9695754341359094e-05, + "loss": 0.0051, + "num_input_tokens_seen": 19025696, + "step": 90150 + }, + { + "epoch": 9.918041804180419, + "grad_norm": 1.3350152969360352, + "learning_rate": 2.969339696955269e-05, + "loss": 0.015, + "num_input_tokens_seen": 19026752, + "step": 90155 + }, + { + "epoch": 9.918591859185918, + "grad_norm": 0.2865830659866333, + "learning_rate": 2.9691039554489486e-05, + "loss": 0.0165, + "num_input_tokens_seen": 19027744, + "step": 90160 + }, + { + "epoch": 9.91914191419142, + "grad_norm": 1.5295050144195557, + "learning_rate": 2.968868209619123e-05, + "loss": 0.0588, + "num_input_tokens_seen": 19028832, + "step": 90165 + }, + { + "epoch": 9.91969196919692, + "grad_norm": 0.028307870030403137, + "learning_rate": 2.9686324594679636e-05, + "loss": 0.0297, + "num_input_tokens_seen": 19029888, + "step": 90170 + }, + { + "epoch": 9.92024202420242, + "grad_norm": 0.050450392067432404, + "learning_rate": 2.9683967049976437e-05, + "loss": 0.0093, + "num_input_tokens_seen": 19030912, + "step": 90175 + }, + { + "epoch": 9.92079207920792, + "grad_norm": 0.03150724247097969, + "learning_rate": 2.9681609462103366e-05, + "loss": 0.0364, + "num_input_tokens_seen": 19032000, + "step": 90180 + }, + { + "epoch": 9.921342134213422, + "grad_norm": 0.9721140265464783, + "learning_rate": 2.967925183108215e-05, + "loss": 0.0158, + "num_input_tokens_seen": 19033024, + "step": 90185 + }, + { + "epoch": 9.921892189218921, + "grad_norm": 1.1828944683074951, + "learning_rate": 2.9676894156934504e-05, + "loss": 0.0676, + "num_input_tokens_seen": 19034048, + "step": 90190 + }, + { + "epoch": 9.922442244224422, + "grad_norm": 0.016595715656876564, + "learning_rate": 2.9674536439682176e-05, + "loss": 0.0376, + "num_input_tokens_seen": 19035072, + "step": 90195 + }, + { + "epoch": 9.922992299229923, + "grad_norm": 0.014783571474254131, + "learning_rate": 2.9672178679346885e-05, + "loss": 0.1258, + "num_input_tokens_seen": 19036128, + "step": 90200 + }, + { + "epoch": 9.923542354235423, + "grad_norm": 0.03817633166909218, + "learning_rate": 2.966982087595037e-05, + "loss": 0.0048, + "num_input_tokens_seen": 19037248, + "step": 90205 + }, + { + "epoch": 9.924092409240924, + "grad_norm": 1.7213629484176636, + "learning_rate": 2.9667463029514354e-05, + "loss": 0.0248, + "num_input_tokens_seen": 19038272, + "step": 90210 + }, + { + "epoch": 9.924642464246425, + "grad_norm": 0.8120102286338806, + "learning_rate": 2.9665105140060573e-05, + "loss": 0.031, + "num_input_tokens_seen": 19039296, + "step": 90215 + }, + { + "epoch": 9.925192519251926, + "grad_norm": 0.022522903978824615, + "learning_rate": 2.9662747207610752e-05, + "loss": 0.0018, + "num_input_tokens_seen": 19040416, + "step": 90220 + }, + { + "epoch": 9.925742574257425, + "grad_norm": 0.03481810539960861, + "learning_rate": 2.9660389232186637e-05, + "loss": 0.0044, + "num_input_tokens_seen": 19041504, + "step": 90225 + }, + { + "epoch": 9.926292629262926, + "grad_norm": 0.017718471586704254, + "learning_rate": 2.9658031213809935e-05, + "loss": 0.0029, + "num_input_tokens_seen": 19042560, + "step": 90230 + }, + { + "epoch": 9.926842684268427, + "grad_norm": 0.22787988185882568, + "learning_rate": 2.9655673152502405e-05, + "loss": 0.044, + "num_input_tokens_seen": 19043616, + "step": 90235 + }, + { + "epoch": 9.927392739273927, + "grad_norm": 0.015161917544901371, + "learning_rate": 2.9653315048285768e-05, + "loss": 0.0413, + "num_input_tokens_seen": 19044672, + "step": 90240 + }, + { + "epoch": 9.927942794279428, + "grad_norm": 0.08143505454063416, + "learning_rate": 2.965095690118176e-05, + "loss": 0.0028, + "num_input_tokens_seen": 19045728, + "step": 90245 + }, + { + "epoch": 9.928492849284929, + "grad_norm": 0.12675833702087402, + "learning_rate": 2.9648598711212116e-05, + "loss": 0.1422, + "num_input_tokens_seen": 19046752, + "step": 90250 + }, + { + "epoch": 9.929042904290428, + "grad_norm": 0.02267569489777088, + "learning_rate": 2.964624047839856e-05, + "loss": 0.0029, + "num_input_tokens_seen": 19047872, + "step": 90255 + }, + { + "epoch": 9.92959295929593, + "grad_norm": 0.007436102256178856, + "learning_rate": 2.9643882202762842e-05, + "loss": 0.0703, + "num_input_tokens_seen": 19048960, + "step": 90260 + }, + { + "epoch": 9.93014301430143, + "grad_norm": 0.24303260445594788, + "learning_rate": 2.964152388432669e-05, + "loss": 0.0064, + "num_input_tokens_seen": 19049984, + "step": 90265 + }, + { + "epoch": 9.930693069306932, + "grad_norm": 1.149699330329895, + "learning_rate": 2.9639165523111834e-05, + "loss": 0.1588, + "num_input_tokens_seen": 19051040, + "step": 90270 + }, + { + "epoch": 9.93124312431243, + "grad_norm": 2.2961065769195557, + "learning_rate": 2.963680711914002e-05, + "loss": 0.0451, + "num_input_tokens_seen": 19052064, + "step": 90275 + }, + { + "epoch": 9.931793179317932, + "grad_norm": 0.03685633838176727, + "learning_rate": 2.963444867243298e-05, + "loss": 0.0556, + "num_input_tokens_seen": 19053088, + "step": 90280 + }, + { + "epoch": 9.932343234323433, + "grad_norm": 1.1269195079803467, + "learning_rate": 2.9632090183012452e-05, + "loss": 0.0295, + "num_input_tokens_seen": 19054176, + "step": 90285 + }, + { + "epoch": 9.932893289328932, + "grad_norm": 0.07515143603086472, + "learning_rate": 2.962973165090016e-05, + "loss": 0.023, + "num_input_tokens_seen": 19055200, + "step": 90290 + }, + { + "epoch": 9.933443344334433, + "grad_norm": 0.45371875166893005, + "learning_rate": 2.9627373076117863e-05, + "loss": 0.0279, + "num_input_tokens_seen": 19056256, + "step": 90295 + }, + { + "epoch": 9.933993399339935, + "grad_norm": 0.020739713683724403, + "learning_rate": 2.9625014458687294e-05, + "loss": 0.0746, + "num_input_tokens_seen": 19057344, + "step": 90300 + }, + { + "epoch": 9.934543454345434, + "grad_norm": 0.005820428486913443, + "learning_rate": 2.9622655798630178e-05, + "loss": 0.1569, + "num_input_tokens_seen": 19058496, + "step": 90305 + }, + { + "epoch": 9.935093509350935, + "grad_norm": 0.08902815729379654, + "learning_rate": 2.9620297095968264e-05, + "loss": 0.0086, + "num_input_tokens_seen": 19059552, + "step": 90310 + }, + { + "epoch": 9.935643564356436, + "grad_norm": 0.0070700435899198055, + "learning_rate": 2.961793835072329e-05, + "loss": 0.0066, + "num_input_tokens_seen": 19060640, + "step": 90315 + }, + { + "epoch": 9.936193619361937, + "grad_norm": 0.01910659670829773, + "learning_rate": 2.961557956291699e-05, + "loss": 0.1592, + "num_input_tokens_seen": 19061696, + "step": 90320 + }, + { + "epoch": 9.936743674367436, + "grad_norm": 0.022420600056648254, + "learning_rate": 2.9613220732571113e-05, + "loss": 0.0045, + "num_input_tokens_seen": 19062752, + "step": 90325 + }, + { + "epoch": 9.937293729372938, + "grad_norm": 0.024831358343362808, + "learning_rate": 2.961086185970739e-05, + "loss": 0.0318, + "num_input_tokens_seen": 19063744, + "step": 90330 + }, + { + "epoch": 9.937843784378439, + "grad_norm": 0.0984039306640625, + "learning_rate": 2.960850294434756e-05, + "loss": 0.0043, + "num_input_tokens_seen": 19064832, + "step": 90335 + }, + { + "epoch": 9.938393839383938, + "grad_norm": 0.008944557048380375, + "learning_rate": 2.960614398651338e-05, + "loss": 0.0037, + "num_input_tokens_seen": 19065888, + "step": 90340 + }, + { + "epoch": 9.938943894389439, + "grad_norm": 1.6224552392959595, + "learning_rate": 2.960378498622658e-05, + "loss": 0.059, + "num_input_tokens_seen": 19066944, + "step": 90345 + }, + { + "epoch": 9.93949394939494, + "grad_norm": 0.00867130421102047, + "learning_rate": 2.9601425943508898e-05, + "loss": 0.0047, + "num_input_tokens_seen": 19068000, + "step": 90350 + }, + { + "epoch": 9.94004400440044, + "grad_norm": 0.9137474298477173, + "learning_rate": 2.9599066858382084e-05, + "loss": 0.0434, + "num_input_tokens_seen": 19069056, + "step": 90355 + }, + { + "epoch": 9.94059405940594, + "grad_norm": 0.049425046890974045, + "learning_rate": 2.9596707730867877e-05, + "loss": 0.0017, + "num_input_tokens_seen": 19070144, + "step": 90360 + }, + { + "epoch": 9.941144114411442, + "grad_norm": 0.022050321102142334, + "learning_rate": 2.9594348560988017e-05, + "loss": 0.0619, + "num_input_tokens_seen": 19071232, + "step": 90365 + }, + { + "epoch": 9.941694169416941, + "grad_norm": 1.8753973245620728, + "learning_rate": 2.9591989348764265e-05, + "loss": 0.1123, + "num_input_tokens_seen": 19072256, + "step": 90370 + }, + { + "epoch": 9.942244224422442, + "grad_norm": 0.013042508624494076, + "learning_rate": 2.9589630094218335e-05, + "loss": 0.0059, + "num_input_tokens_seen": 19073280, + "step": 90375 + }, + { + "epoch": 9.942794279427943, + "grad_norm": 0.10884114354848862, + "learning_rate": 2.958727079737199e-05, + "loss": 0.0628, + "num_input_tokens_seen": 19074272, + "step": 90380 + }, + { + "epoch": 9.943344334433444, + "grad_norm": 0.30463525652885437, + "learning_rate": 2.9584911458246978e-05, + "loss": 0.0188, + "num_input_tokens_seen": 19075328, + "step": 90385 + }, + { + "epoch": 9.943894389438944, + "grad_norm": 0.028391709551215172, + "learning_rate": 2.9582552076865034e-05, + "loss": 0.0416, + "num_input_tokens_seen": 19076352, + "step": 90390 + }, + { + "epoch": 9.944444444444445, + "grad_norm": 0.43734419345855713, + "learning_rate": 2.9580192653247906e-05, + "loss": 0.0213, + "num_input_tokens_seen": 19077344, + "step": 90395 + }, + { + "epoch": 9.944994499449946, + "grad_norm": 0.032108280807733536, + "learning_rate": 2.9577833187417336e-05, + "loss": 0.0089, + "num_input_tokens_seen": 19078400, + "step": 90400 + }, + { + "epoch": 9.945544554455445, + "grad_norm": 0.2655552625656128, + "learning_rate": 2.9575473679395072e-05, + "loss": 0.0092, + "num_input_tokens_seen": 19079456, + "step": 90405 + }, + { + "epoch": 9.946094609460946, + "grad_norm": 0.06904277950525284, + "learning_rate": 2.9573114129202873e-05, + "loss": 0.0064, + "num_input_tokens_seen": 19080480, + "step": 90410 + }, + { + "epoch": 9.946644664466447, + "grad_norm": 0.052024491131305695, + "learning_rate": 2.9570754536862478e-05, + "loss": 0.003, + "num_input_tokens_seen": 19081504, + "step": 90415 + }, + { + "epoch": 9.947194719471947, + "grad_norm": 0.8590466380119324, + "learning_rate": 2.9568394902395614e-05, + "loss": 0.1122, + "num_input_tokens_seen": 19082528, + "step": 90420 + }, + { + "epoch": 9.947744774477448, + "grad_norm": 1.3813937902450562, + "learning_rate": 2.9566035225824056e-05, + "loss": 0.0472, + "num_input_tokens_seen": 19083584, + "step": 90425 + }, + { + "epoch": 9.948294829482949, + "grad_norm": 2.826694965362549, + "learning_rate": 2.9563675507169547e-05, + "loss": 0.0793, + "num_input_tokens_seen": 19084640, + "step": 90430 + }, + { + "epoch": 9.948844884488448, + "grad_norm": 2.482692241668701, + "learning_rate": 2.956131574645382e-05, + "loss": 0.0523, + "num_input_tokens_seen": 19085728, + "step": 90435 + }, + { + "epoch": 9.94939493949395, + "grad_norm": 0.1324498951435089, + "learning_rate": 2.9558955943698647e-05, + "loss": 0.009, + "num_input_tokens_seen": 19086784, + "step": 90440 + }, + { + "epoch": 9.94994499449945, + "grad_norm": 0.3149229884147644, + "learning_rate": 2.955659609892576e-05, + "loss": 0.0079, + "num_input_tokens_seen": 19087840, + "step": 90445 + }, + { + "epoch": 9.950495049504951, + "grad_norm": 1.1244628429412842, + "learning_rate": 2.9554236212156906e-05, + "loss": 0.0492, + "num_input_tokens_seen": 19088960, + "step": 90450 + }, + { + "epoch": 9.95104510451045, + "grad_norm": 0.01609473116695881, + "learning_rate": 2.9551876283413848e-05, + "loss": 0.1059, + "num_input_tokens_seen": 19090080, + "step": 90455 + }, + { + "epoch": 9.951595159515952, + "grad_norm": 0.0712178573012352, + "learning_rate": 2.9549516312718333e-05, + "loss": 0.0094, + "num_input_tokens_seen": 19091232, + "step": 90460 + }, + { + "epoch": 9.952145214521453, + "grad_norm": 0.0100482776761055, + "learning_rate": 2.9547156300092105e-05, + "loss": 0.024, + "num_input_tokens_seen": 19092288, + "step": 90465 + }, + { + "epoch": 9.952695269526952, + "grad_norm": 0.010667828842997551, + "learning_rate": 2.9544796245556926e-05, + "loss": 0.0611, + "num_input_tokens_seen": 19093376, + "step": 90470 + }, + { + "epoch": 9.953245324532453, + "grad_norm": 3.274385929107666, + "learning_rate": 2.9542436149134535e-05, + "loss": 0.1744, + "num_input_tokens_seen": 19094432, + "step": 90475 + }, + { + "epoch": 9.953795379537954, + "grad_norm": 0.030638668686151505, + "learning_rate": 2.9540076010846683e-05, + "loss": 0.0087, + "num_input_tokens_seen": 19095488, + "step": 90480 + }, + { + "epoch": 9.954345434543454, + "grad_norm": 0.8726003170013428, + "learning_rate": 2.953771583071514e-05, + "loss": 0.057, + "num_input_tokens_seen": 19096512, + "step": 90485 + }, + { + "epoch": 9.954895489548955, + "grad_norm": 0.03010333701968193, + "learning_rate": 2.953535560876165e-05, + "loss": 0.0158, + "num_input_tokens_seen": 19097632, + "step": 90490 + }, + { + "epoch": 9.955445544554456, + "grad_norm": 0.244313046336174, + "learning_rate": 2.953299534500795e-05, + "loss": 0.0377, + "num_input_tokens_seen": 19098688, + "step": 90495 + }, + { + "epoch": 9.955995599559955, + "grad_norm": 0.17014732956886292, + "learning_rate": 2.953063503947582e-05, + "loss": 0.0512, + "num_input_tokens_seen": 19099776, + "step": 90500 + }, + { + "epoch": 9.956545654565456, + "grad_norm": 0.5516676902770996, + "learning_rate": 2.9528274692186995e-05, + "loss": 0.1172, + "num_input_tokens_seen": 19100768, + "step": 90505 + }, + { + "epoch": 9.957095709570957, + "grad_norm": 0.10336193442344666, + "learning_rate": 2.952591430316324e-05, + "loss": 0.0493, + "num_input_tokens_seen": 19101824, + "step": 90510 + }, + { + "epoch": 9.957645764576458, + "grad_norm": 0.0390491746366024, + "learning_rate": 2.9523553872426295e-05, + "loss": 0.0161, + "num_input_tokens_seen": 19102912, + "step": 90515 + }, + { + "epoch": 9.958195819581958, + "grad_norm": 0.013023730367422104, + "learning_rate": 2.952119339999793e-05, + "loss": 0.0054, + "num_input_tokens_seen": 19103968, + "step": 90520 + }, + { + "epoch": 9.958745874587459, + "grad_norm": 1.1495506763458252, + "learning_rate": 2.9518832885899895e-05, + "loss": 0.0411, + "num_input_tokens_seen": 19105056, + "step": 90525 + }, + { + "epoch": 9.95929592959296, + "grad_norm": 0.030705442652106285, + "learning_rate": 2.9516472330153945e-05, + "loss": 0.0034, + "num_input_tokens_seen": 19106144, + "step": 90530 + }, + { + "epoch": 9.95984598459846, + "grad_norm": 0.058583252131938934, + "learning_rate": 2.9514111732781836e-05, + "loss": 0.0203, + "num_input_tokens_seen": 19107200, + "step": 90535 + }, + { + "epoch": 9.96039603960396, + "grad_norm": 0.1680014580488205, + "learning_rate": 2.9511751093805328e-05, + "loss": 0.0078, + "num_input_tokens_seen": 19108288, + "step": 90540 + }, + { + "epoch": 9.960946094609461, + "grad_norm": 1.6192036867141724, + "learning_rate": 2.9509390413246174e-05, + "loss": 0.0603, + "num_input_tokens_seen": 19109376, + "step": 90545 + }, + { + "epoch": 9.96149614961496, + "grad_norm": 0.30747371912002563, + "learning_rate": 2.950702969112613e-05, + "loss": 0.0521, + "num_input_tokens_seen": 19110400, + "step": 90550 + }, + { + "epoch": 9.962046204620462, + "grad_norm": 0.01120985671877861, + "learning_rate": 2.9504668927466957e-05, + "loss": 0.0158, + "num_input_tokens_seen": 19111424, + "step": 90555 + }, + { + "epoch": 9.962596259625963, + "grad_norm": 0.13127687573432922, + "learning_rate": 2.9502308122290418e-05, + "loss": 0.0035, + "num_input_tokens_seen": 19112448, + "step": 90560 + }, + { + "epoch": 9.963146314631462, + "grad_norm": 0.030819786712527275, + "learning_rate": 2.9499947275618257e-05, + "loss": 0.0438, + "num_input_tokens_seen": 19113472, + "step": 90565 + }, + { + "epoch": 9.963696369636963, + "grad_norm": 0.02625930681824684, + "learning_rate": 2.9497586387472242e-05, + "loss": 0.0134, + "num_input_tokens_seen": 19114560, + "step": 90570 + }, + { + "epoch": 9.964246424642464, + "grad_norm": 1.4803998470306396, + "learning_rate": 2.9495225457874136e-05, + "loss": 0.0164, + "num_input_tokens_seen": 19115680, + "step": 90575 + }, + { + "epoch": 9.964796479647966, + "grad_norm": 0.013763816095888615, + "learning_rate": 2.9492864486845697e-05, + "loss": 0.0116, + "num_input_tokens_seen": 19116736, + "step": 90580 + }, + { + "epoch": 9.965346534653465, + "grad_norm": 0.218108132481575, + "learning_rate": 2.9490503474408676e-05, + "loss": 0.0635, + "num_input_tokens_seen": 19117792, + "step": 90585 + }, + { + "epoch": 9.965896589658966, + "grad_norm": 0.07733089476823807, + "learning_rate": 2.948814242058484e-05, + "loss": 0.088, + "num_input_tokens_seen": 19118848, + "step": 90590 + }, + { + "epoch": 9.966446644664467, + "grad_norm": 0.00951389316469431, + "learning_rate": 2.9485781325395946e-05, + "loss": 0.0091, + "num_input_tokens_seen": 19119904, + "step": 90595 + }, + { + "epoch": 9.966996699669966, + "grad_norm": 0.00334900408051908, + "learning_rate": 2.9483420188863757e-05, + "loss": 0.022, + "num_input_tokens_seen": 19120928, + "step": 90600 + }, + { + "epoch": 9.967546754675467, + "grad_norm": 0.03988410905003548, + "learning_rate": 2.948105901101005e-05, + "loss": 0.005, + "num_input_tokens_seen": 19121952, + "step": 90605 + }, + { + "epoch": 9.968096809680969, + "grad_norm": 0.03001486510038376, + "learning_rate": 2.947869779185656e-05, + "loss": 0.0042, + "num_input_tokens_seen": 19122912, + "step": 90610 + }, + { + "epoch": 9.968646864686468, + "grad_norm": 0.010914390906691551, + "learning_rate": 2.947633653142506e-05, + "loss": 0.0107, + "num_input_tokens_seen": 19123968, + "step": 90615 + }, + { + "epoch": 9.969196919691969, + "grad_norm": 0.03054254688322544, + "learning_rate": 2.9473975229737315e-05, + "loss": 0.0056, + "num_input_tokens_seen": 19124960, + "step": 90620 + }, + { + "epoch": 9.96974697469747, + "grad_norm": 0.044944554567337036, + "learning_rate": 2.9471613886815086e-05, + "loss": 0.002, + "num_input_tokens_seen": 19126016, + "step": 90625 + }, + { + "epoch": 9.97029702970297, + "grad_norm": 0.15648771822452545, + "learning_rate": 2.946925250268015e-05, + "loss": 0.0328, + "num_input_tokens_seen": 19127104, + "step": 90630 + }, + { + "epoch": 9.97084708470847, + "grad_norm": 0.25397631525993347, + "learning_rate": 2.9466891077354247e-05, + "loss": 0.0084, + "num_input_tokens_seen": 19128192, + "step": 90635 + }, + { + "epoch": 9.971397139713972, + "grad_norm": 0.46708598732948303, + "learning_rate": 2.9464529610859153e-05, + "loss": 0.0872, + "num_input_tokens_seen": 19129216, + "step": 90640 + }, + { + "epoch": 9.971947194719473, + "grad_norm": 0.48583051562309265, + "learning_rate": 2.946216810321663e-05, + "loss": 0.0196, + "num_input_tokens_seen": 19130272, + "step": 90645 + }, + { + "epoch": 9.972497249724972, + "grad_norm": 1.2111740112304688, + "learning_rate": 2.9459806554448453e-05, + "loss": 0.1078, + "num_input_tokens_seen": 19131392, + "step": 90650 + }, + { + "epoch": 9.973047304730473, + "grad_norm": 0.0408848337829113, + "learning_rate": 2.9457444964576374e-05, + "loss": 0.0285, + "num_input_tokens_seen": 19132448, + "step": 90655 + }, + { + "epoch": 9.973597359735974, + "grad_norm": 0.04173312708735466, + "learning_rate": 2.9455083333622163e-05, + "loss": 0.0136, + "num_input_tokens_seen": 19133504, + "step": 90660 + }, + { + "epoch": 9.974147414741473, + "grad_norm": 1.673136830329895, + "learning_rate": 2.945272166160759e-05, + "loss": 0.0521, + "num_input_tokens_seen": 19134560, + "step": 90665 + }, + { + "epoch": 9.974697469746975, + "grad_norm": 1.3686590194702148, + "learning_rate": 2.9450359948554413e-05, + "loss": 0.1037, + "num_input_tokens_seen": 19135616, + "step": 90670 + }, + { + "epoch": 9.975247524752476, + "grad_norm": 0.012899966910481453, + "learning_rate": 2.9447998194484404e-05, + "loss": 0.0025, + "num_input_tokens_seen": 19136704, + "step": 90675 + }, + { + "epoch": 9.975797579757975, + "grad_norm": 2.1019797325134277, + "learning_rate": 2.9445636399419335e-05, + "loss": 0.0598, + "num_input_tokens_seen": 19137728, + "step": 90680 + }, + { + "epoch": 9.976347634763476, + "grad_norm": 0.017393289133906364, + "learning_rate": 2.9443274563380968e-05, + "loss": 0.028, + "num_input_tokens_seen": 19138752, + "step": 90685 + }, + { + "epoch": 9.976897689768977, + "grad_norm": 0.11814159154891968, + "learning_rate": 2.944091268639107e-05, + "loss": 0.0479, + "num_input_tokens_seen": 19139776, + "step": 90690 + }, + { + "epoch": 9.977447744774478, + "grad_norm": 0.0816606879234314, + "learning_rate": 2.9438550768471413e-05, + "loss": 0.0461, + "num_input_tokens_seen": 19140864, + "step": 90695 + }, + { + "epoch": 9.977997799779978, + "grad_norm": 0.014636289328336716, + "learning_rate": 2.9436188809643756e-05, + "loss": 0.0024, + "num_input_tokens_seen": 19142048, + "step": 90700 + }, + { + "epoch": 9.978547854785479, + "grad_norm": 0.19601425528526306, + "learning_rate": 2.9433826809929882e-05, + "loss": 0.1428, + "num_input_tokens_seen": 19143104, + "step": 90705 + }, + { + "epoch": 9.97909790979098, + "grad_norm": 0.7289232015609741, + "learning_rate": 2.9431464769351548e-05, + "loss": 0.0726, + "num_input_tokens_seen": 19144096, + "step": 90710 + }, + { + "epoch": 9.979647964796479, + "grad_norm": 1.813834547996521, + "learning_rate": 2.942910268793054e-05, + "loss": 0.1769, + "num_input_tokens_seen": 19145120, + "step": 90715 + }, + { + "epoch": 9.98019801980198, + "grad_norm": 1.9556840658187866, + "learning_rate": 2.942674056568861e-05, + "loss": 0.0145, + "num_input_tokens_seen": 19146304, + "step": 90720 + }, + { + "epoch": 9.980748074807481, + "grad_norm": 0.16343431174755096, + "learning_rate": 2.9424378402647536e-05, + "loss": 0.0696, + "num_input_tokens_seen": 19147360, + "step": 90725 + }, + { + "epoch": 9.98129812981298, + "grad_norm": 0.43763771653175354, + "learning_rate": 2.9422016198829093e-05, + "loss": 0.0074, + "num_input_tokens_seen": 19148448, + "step": 90730 + }, + { + "epoch": 9.981848184818482, + "grad_norm": 3.540494918823242, + "learning_rate": 2.9419653954255044e-05, + "loss": 0.0361, + "num_input_tokens_seen": 19149504, + "step": 90735 + }, + { + "epoch": 9.982398239823983, + "grad_norm": 0.9216538071632385, + "learning_rate": 2.9417291668947166e-05, + "loss": 0.0353, + "num_input_tokens_seen": 19150560, + "step": 90740 + }, + { + "epoch": 9.982948294829484, + "grad_norm": 1.0210648775100708, + "learning_rate": 2.9414929342927233e-05, + "loss": 0.0993, + "num_input_tokens_seen": 19151680, + "step": 90745 + }, + { + "epoch": 9.983498349834983, + "grad_norm": 0.10749822109937668, + "learning_rate": 2.9412566976217022e-05, + "loss": 0.0107, + "num_input_tokens_seen": 19152800, + "step": 90750 + }, + { + "epoch": 9.984048404840484, + "grad_norm": 1.495582103729248, + "learning_rate": 2.9410204568838284e-05, + "loss": 0.028, + "num_input_tokens_seen": 19153888, + "step": 90755 + }, + { + "epoch": 9.984598459845985, + "grad_norm": 0.048944149166345596, + "learning_rate": 2.9407842120812812e-05, + "loss": 0.0225, + "num_input_tokens_seen": 19154944, + "step": 90760 + }, + { + "epoch": 9.985148514851485, + "grad_norm": 1.3511310815811157, + "learning_rate": 2.940547963216238e-05, + "loss": 0.0598, + "num_input_tokens_seen": 19156000, + "step": 90765 + }, + { + "epoch": 9.985698569856986, + "grad_norm": 0.5729579329490662, + "learning_rate": 2.9403117102908746e-05, + "loss": 0.0416, + "num_input_tokens_seen": 19157120, + "step": 90770 + }, + { + "epoch": 9.986248624862487, + "grad_norm": 0.8855822086334229, + "learning_rate": 2.9400754533073703e-05, + "loss": 0.0264, + "num_input_tokens_seen": 19158176, + "step": 90775 + }, + { + "epoch": 9.986798679867986, + "grad_norm": 1.1795918941497803, + "learning_rate": 2.939839192267902e-05, + "loss": 0.2603, + "num_input_tokens_seen": 19159200, + "step": 90780 + }, + { + "epoch": 9.987348734873487, + "grad_norm": 0.06911344826221466, + "learning_rate": 2.9396029271746456e-05, + "loss": 0.09, + "num_input_tokens_seen": 19160224, + "step": 90785 + }, + { + "epoch": 9.987898789878988, + "grad_norm": 0.040721818804740906, + "learning_rate": 2.9393666580297806e-05, + "loss": 0.0023, + "num_input_tokens_seen": 19161248, + "step": 90790 + }, + { + "epoch": 9.988448844884488, + "grad_norm": 0.0956084132194519, + "learning_rate": 2.9391303848354846e-05, + "loss": 0.0563, + "num_input_tokens_seen": 19162240, + "step": 90795 + }, + { + "epoch": 9.988998899889989, + "grad_norm": 1.2570034265518188, + "learning_rate": 2.9388941075939334e-05, + "loss": 0.161, + "num_input_tokens_seen": 19163296, + "step": 90800 + }, + { + "epoch": 9.98954895489549, + "grad_norm": 0.11698795109987259, + "learning_rate": 2.9386578263073066e-05, + "loss": 0.0199, + "num_input_tokens_seen": 19164320, + "step": 90805 + }, + { + "epoch": 9.990099009900991, + "grad_norm": 0.3217791020870209, + "learning_rate": 2.9384215409777816e-05, + "loss": 0.0782, + "num_input_tokens_seen": 19165376, + "step": 90810 + }, + { + "epoch": 9.99064906490649, + "grad_norm": 0.06026297062635422, + "learning_rate": 2.938185251607534e-05, + "loss": 0.0108, + "num_input_tokens_seen": 19166432, + "step": 90815 + }, + { + "epoch": 9.991199119911991, + "grad_norm": 0.16573064029216766, + "learning_rate": 2.9379489581987453e-05, + "loss": 0.0188, + "num_input_tokens_seen": 19167520, + "step": 90820 + }, + { + "epoch": 9.991749174917492, + "grad_norm": 0.11979778856039047, + "learning_rate": 2.9377126607535897e-05, + "loss": 0.0473, + "num_input_tokens_seen": 19168672, + "step": 90825 + }, + { + "epoch": 9.992299229922992, + "grad_norm": 0.05045440047979355, + "learning_rate": 2.9374763592742466e-05, + "loss": 0.0036, + "num_input_tokens_seen": 19169792, + "step": 90830 + }, + { + "epoch": 9.992849284928493, + "grad_norm": 0.03219331055879593, + "learning_rate": 2.9372400537628942e-05, + "loss": 0.0233, + "num_input_tokens_seen": 19170816, + "step": 90835 + }, + { + "epoch": 9.993399339933994, + "grad_norm": 0.10213887691497803, + "learning_rate": 2.93700374422171e-05, + "loss": 0.0071, + "num_input_tokens_seen": 19171872, + "step": 90840 + }, + { + "epoch": 9.993949394939493, + "grad_norm": 0.49268287420272827, + "learning_rate": 2.936767430652872e-05, + "loss": 0.0946, + "num_input_tokens_seen": 19172928, + "step": 90845 + }, + { + "epoch": 9.994499449944994, + "grad_norm": 0.06600473076105118, + "learning_rate": 2.9365311130585584e-05, + "loss": 0.0392, + "num_input_tokens_seen": 19173984, + "step": 90850 + }, + { + "epoch": 9.995049504950495, + "grad_norm": 0.17681583762168884, + "learning_rate": 2.9362947914409466e-05, + "loss": 0.0301, + "num_input_tokens_seen": 19175040, + "step": 90855 + }, + { + "epoch": 9.995599559955995, + "grad_norm": 0.1067458763718605, + "learning_rate": 2.9360584658022144e-05, + "loss": 0.0063, + "num_input_tokens_seen": 19176032, + "step": 90860 + }, + { + "epoch": 9.996149614961496, + "grad_norm": 0.07790134102106094, + "learning_rate": 2.9358221361445415e-05, + "loss": 0.0783, + "num_input_tokens_seen": 19177056, + "step": 90865 + }, + { + "epoch": 9.996699669966997, + "grad_norm": 0.4164648652076721, + "learning_rate": 2.935585802470105e-05, + "loss": 0.0359, + "num_input_tokens_seen": 19178112, + "step": 90870 + }, + { + "epoch": 9.997249724972498, + "grad_norm": 0.03126996010541916, + "learning_rate": 2.935349464781083e-05, + "loss": 0.0424, + "num_input_tokens_seen": 19179136, + "step": 90875 + }, + { + "epoch": 9.997799779977997, + "grad_norm": 0.16817352175712585, + "learning_rate": 2.9351131230796548e-05, + "loss": 0.0059, + "num_input_tokens_seen": 19180192, + "step": 90880 + }, + { + "epoch": 9.998349834983498, + "grad_norm": 0.1383187472820282, + "learning_rate": 2.9348767773679957e-05, + "loss": 0.0089, + "num_input_tokens_seen": 19181280, + "step": 90885 + }, + { + "epoch": 9.998899889989, + "grad_norm": 0.5226117372512817, + "learning_rate": 2.9346404276482875e-05, + "loss": 0.1048, + "num_input_tokens_seen": 19182272, + "step": 90890 + }, + { + "epoch": 9.999449944994499, + "grad_norm": 0.19258616864681244, + "learning_rate": 2.9344040739227075e-05, + "loss": 0.0151, + "num_input_tokens_seen": 19183328, + "step": 90895 + }, + { + "epoch": 10.0, + "grad_norm": 1.6378551721572876, + "learning_rate": 2.9341677161934323e-05, + "loss": 0.0662, + "num_input_tokens_seen": 19184224, + "step": 90900 + }, + { + "epoch": 10.0, + "eval_loss": 0.060428909957408905, + "eval_runtime": 36.9778, + "eval_samples_per_second": 109.255, + "eval_steps_per_second": 27.314, + "num_input_tokens_seen": 19184224, + "step": 90900 + }, + { + "epoch": 10.000550055005501, + "grad_norm": 0.1653948277235031, + "learning_rate": 2.9339313544626425e-05, + "loss": 0.009, + "num_input_tokens_seen": 19185312, + "step": 90905 + }, + { + "epoch": 10.001100110011, + "grad_norm": 0.0678105354309082, + "learning_rate": 2.9336949887325154e-05, + "loss": 0.1196, + "num_input_tokens_seen": 19186304, + "step": 90910 + }, + { + "epoch": 10.001650165016502, + "grad_norm": 0.022475535050034523, + "learning_rate": 2.9334586190052293e-05, + "loss": 0.1014, + "num_input_tokens_seen": 19187360, + "step": 90915 + }, + { + "epoch": 10.002200220022003, + "grad_norm": 0.03740399330854416, + "learning_rate": 2.9332222452829638e-05, + "loss": 0.0639, + "num_input_tokens_seen": 19188448, + "step": 90920 + }, + { + "epoch": 10.002750275027502, + "grad_norm": 0.025672322139143944, + "learning_rate": 2.932985867567896e-05, + "loss": 0.0617, + "num_input_tokens_seen": 19189504, + "step": 90925 + }, + { + "epoch": 10.003300330033003, + "grad_norm": 0.026678306981921196, + "learning_rate": 2.9327494858622056e-05, + "loss": 0.0298, + "num_input_tokens_seen": 19190560, + "step": 90930 + }, + { + "epoch": 10.003850385038504, + "grad_norm": 0.05783578380942345, + "learning_rate": 2.9325131001680706e-05, + "loss": 0.0048, + "num_input_tokens_seen": 19191680, + "step": 90935 + }, + { + "epoch": 10.004400440044005, + "grad_norm": 0.026390356943011284, + "learning_rate": 2.932276710487671e-05, + "loss": 0.0866, + "num_input_tokens_seen": 19192768, + "step": 90940 + }, + { + "epoch": 10.004950495049505, + "grad_norm": 0.7243880033493042, + "learning_rate": 2.932040316823183e-05, + "loss": 0.0391, + "num_input_tokens_seen": 19193888, + "step": 90945 + }, + { + "epoch": 10.005500550055006, + "grad_norm": 0.291430801153183, + "learning_rate": 2.9318039191767873e-05, + "loss": 0.0774, + "num_input_tokens_seen": 19194944, + "step": 90950 + }, + { + "epoch": 10.006050605060507, + "grad_norm": 0.11385677009820938, + "learning_rate": 2.931567517550662e-05, + "loss": 0.0334, + "num_input_tokens_seen": 19196000, + "step": 90955 + }, + { + "epoch": 10.006600660066006, + "grad_norm": 1.394174337387085, + "learning_rate": 2.9313311119469856e-05, + "loss": 0.0575, + "num_input_tokens_seen": 19197024, + "step": 90960 + }, + { + "epoch": 10.007150715071507, + "grad_norm": 2.215794801712036, + "learning_rate": 2.931094702367938e-05, + "loss": 0.1829, + "num_input_tokens_seen": 19198080, + "step": 90965 + }, + { + "epoch": 10.007700770077008, + "grad_norm": 0.04299614578485489, + "learning_rate": 2.9308582888156973e-05, + "loss": 0.0604, + "num_input_tokens_seen": 19199200, + "step": 90970 + }, + { + "epoch": 10.008250825082508, + "grad_norm": 0.13133925199508667, + "learning_rate": 2.9306218712924416e-05, + "loss": 0.0529, + "num_input_tokens_seen": 19200320, + "step": 90975 + }, + { + "epoch": 10.008800880088009, + "grad_norm": 0.03330196067690849, + "learning_rate": 2.9303854498003514e-05, + "loss": 0.0369, + "num_input_tokens_seen": 19201408, + "step": 90980 + }, + { + "epoch": 10.00935093509351, + "grad_norm": 0.0654907152056694, + "learning_rate": 2.9301490243416052e-05, + "loss": 0.0073, + "num_input_tokens_seen": 19202560, + "step": 90985 + }, + { + "epoch": 10.009900990099009, + "grad_norm": 0.05225498229265213, + "learning_rate": 2.929912594918382e-05, + "loss": 0.0087, + "num_input_tokens_seen": 19203648, + "step": 90990 + }, + { + "epoch": 10.01045104510451, + "grad_norm": 0.14240844547748566, + "learning_rate": 2.9296761615328606e-05, + "loss": 0.0259, + "num_input_tokens_seen": 19204704, + "step": 90995 + }, + { + "epoch": 10.011001100110011, + "grad_norm": 1.243584394454956, + "learning_rate": 2.9294397241872196e-05, + "loss": 0.028, + "num_input_tokens_seen": 19205728, + "step": 91000 + }, + { + "epoch": 10.011551155115512, + "grad_norm": 0.18165133893489838, + "learning_rate": 2.9292032828836387e-05, + "loss": 0.0086, + "num_input_tokens_seen": 19206784, + "step": 91005 + }, + { + "epoch": 10.012101210121012, + "grad_norm": 0.08430048078298569, + "learning_rate": 2.928966837624298e-05, + "loss": 0.0088, + "num_input_tokens_seen": 19207808, + "step": 91010 + }, + { + "epoch": 10.012651265126513, + "grad_norm": 0.36638715863227844, + "learning_rate": 2.9287303884113747e-05, + "loss": 0.0183, + "num_input_tokens_seen": 19208864, + "step": 91015 + }, + { + "epoch": 10.013201320132014, + "grad_norm": 0.027754122391343117, + "learning_rate": 2.9284939352470502e-05, + "loss": 0.0131, + "num_input_tokens_seen": 19209920, + "step": 91020 + }, + { + "epoch": 10.013751375137513, + "grad_norm": 0.48199546337127686, + "learning_rate": 2.928257478133502e-05, + "loss": 0.0394, + "num_input_tokens_seen": 19210912, + "step": 91025 + }, + { + "epoch": 10.014301430143014, + "grad_norm": 0.2668371796607971, + "learning_rate": 2.9280210170729105e-05, + "loss": 0.0521, + "num_input_tokens_seen": 19211936, + "step": 91030 + }, + { + "epoch": 10.014851485148515, + "grad_norm": 0.04828263446688652, + "learning_rate": 2.9277845520674547e-05, + "loss": 0.1158, + "num_input_tokens_seen": 19213056, + "step": 91035 + }, + { + "epoch": 10.015401540154015, + "grad_norm": 0.16625873744487762, + "learning_rate": 2.9275480831193136e-05, + "loss": 0.0055, + "num_input_tokens_seen": 19214144, + "step": 91040 + }, + { + "epoch": 10.015951595159516, + "grad_norm": 0.15063124895095825, + "learning_rate": 2.927311610230667e-05, + "loss": 0.019, + "num_input_tokens_seen": 19215232, + "step": 91045 + }, + { + "epoch": 10.016501650165017, + "grad_norm": 0.07425396144390106, + "learning_rate": 2.9270751334036943e-05, + "loss": 0.028, + "num_input_tokens_seen": 19216288, + "step": 91050 + }, + { + "epoch": 10.017051705170518, + "grad_norm": 0.9327549338340759, + "learning_rate": 2.9268386526405756e-05, + "loss": 0.0574, + "num_input_tokens_seen": 19217440, + "step": 91055 + }, + { + "epoch": 10.017601760176017, + "grad_norm": 0.13115088641643524, + "learning_rate": 2.9266021679434897e-05, + "loss": 0.0104, + "num_input_tokens_seen": 19218464, + "step": 91060 + }, + { + "epoch": 10.018151815181518, + "grad_norm": 0.3456304669380188, + "learning_rate": 2.926365679314616e-05, + "loss": 0.0588, + "num_input_tokens_seen": 19219552, + "step": 91065 + }, + { + "epoch": 10.01870187018702, + "grad_norm": 0.2800377905368805, + "learning_rate": 2.9261291867561346e-05, + "loss": 0.0081, + "num_input_tokens_seen": 19220608, + "step": 91070 + }, + { + "epoch": 10.019251925192519, + "grad_norm": 0.033313822001218796, + "learning_rate": 2.9258926902702243e-05, + "loss": 0.0178, + "num_input_tokens_seen": 19221696, + "step": 91075 + }, + { + "epoch": 10.01980198019802, + "grad_norm": 0.6980162858963013, + "learning_rate": 2.925656189859066e-05, + "loss": 0.0477, + "num_input_tokens_seen": 19222816, + "step": 91080 + }, + { + "epoch": 10.020352035203521, + "grad_norm": 0.06383543461561203, + "learning_rate": 2.9254196855248395e-05, + "loss": 0.0084, + "num_input_tokens_seen": 19223936, + "step": 91085 + }, + { + "epoch": 10.02090209020902, + "grad_norm": 0.2794668972492218, + "learning_rate": 2.925183177269723e-05, + "loss": 0.0219, + "num_input_tokens_seen": 19225056, + "step": 91090 + }, + { + "epoch": 10.021452145214521, + "grad_norm": 0.022120166569948196, + "learning_rate": 2.9249466650958974e-05, + "loss": 0.0091, + "num_input_tokens_seen": 19226048, + "step": 91095 + }, + { + "epoch": 10.022002200220022, + "grad_norm": 0.09547911584377289, + "learning_rate": 2.9247101490055424e-05, + "loss": 0.0105, + "num_input_tokens_seen": 19227072, + "step": 91100 + }, + { + "epoch": 10.022552255225522, + "grad_norm": 0.1950153410434723, + "learning_rate": 2.924473629000838e-05, + "loss": 0.0435, + "num_input_tokens_seen": 19228128, + "step": 91105 + }, + { + "epoch": 10.023102310231023, + "grad_norm": 0.04575164243578911, + "learning_rate": 2.9242371050839633e-05, + "loss": 0.0042, + "num_input_tokens_seen": 19229216, + "step": 91110 + }, + { + "epoch": 10.023652365236524, + "grad_norm": 0.2092873901128769, + "learning_rate": 2.9240005772570994e-05, + "loss": 0.0195, + "num_input_tokens_seen": 19230304, + "step": 91115 + }, + { + "epoch": 10.024202420242025, + "grad_norm": 0.01864769496023655, + "learning_rate": 2.9237640455224246e-05, + "loss": 0.0833, + "num_input_tokens_seen": 19231424, + "step": 91120 + }, + { + "epoch": 10.024752475247524, + "grad_norm": 0.21460168063640594, + "learning_rate": 2.923527509882121e-05, + "loss": 0.0079, + "num_input_tokens_seen": 19232544, + "step": 91125 + }, + { + "epoch": 10.025302530253025, + "grad_norm": 0.006038535851985216, + "learning_rate": 2.923290970338367e-05, + "loss": 0.0059, + "num_input_tokens_seen": 19233536, + "step": 91130 + }, + { + "epoch": 10.025852585258527, + "grad_norm": 0.01981787011027336, + "learning_rate": 2.9230544268933436e-05, + "loss": 0.0477, + "num_input_tokens_seen": 19234592, + "step": 91135 + }, + { + "epoch": 10.026402640264026, + "grad_norm": 0.009961850009858608, + "learning_rate": 2.92281787954923e-05, + "loss": 0.0273, + "num_input_tokens_seen": 19235680, + "step": 91140 + }, + { + "epoch": 10.026952695269527, + "grad_norm": 0.011250440962612629, + "learning_rate": 2.9225813283082076e-05, + "loss": 0.0397, + "num_input_tokens_seen": 19236704, + "step": 91145 + }, + { + "epoch": 10.027502750275028, + "grad_norm": 0.020391222089529037, + "learning_rate": 2.922344773172455e-05, + "loss": 0.0152, + "num_input_tokens_seen": 19237728, + "step": 91150 + }, + { + "epoch": 10.028052805280527, + "grad_norm": 0.5456109046936035, + "learning_rate": 2.922108214144154e-05, + "loss": 0.014, + "num_input_tokens_seen": 19238816, + "step": 91155 + }, + { + "epoch": 10.028602860286028, + "grad_norm": 0.10947984457015991, + "learning_rate": 2.9218716512254836e-05, + "loss": 0.054, + "num_input_tokens_seen": 19239968, + "step": 91160 + }, + { + "epoch": 10.02915291529153, + "grad_norm": 0.07628411799669266, + "learning_rate": 2.9216350844186253e-05, + "loss": 0.0047, + "num_input_tokens_seen": 19241024, + "step": 91165 + }, + { + "epoch": 10.029702970297029, + "grad_norm": 0.18723978102207184, + "learning_rate": 2.9213985137257583e-05, + "loss": 0.1065, + "num_input_tokens_seen": 19242080, + "step": 91170 + }, + { + "epoch": 10.03025302530253, + "grad_norm": 0.16264021396636963, + "learning_rate": 2.9211619391490634e-05, + "loss": 0.0057, + "num_input_tokens_seen": 19243072, + "step": 91175 + }, + { + "epoch": 10.030803080308031, + "grad_norm": 0.02554962784051895, + "learning_rate": 2.920925360690721e-05, + "loss": 0.009, + "num_input_tokens_seen": 19244128, + "step": 91180 + }, + { + "epoch": 10.031353135313532, + "grad_norm": 0.4029674530029297, + "learning_rate": 2.920688778352912e-05, + "loss": 0.0427, + "num_input_tokens_seen": 19245152, + "step": 91185 + }, + { + "epoch": 10.031903190319031, + "grad_norm": 0.04966045916080475, + "learning_rate": 2.920452192137816e-05, + "loss": 0.0074, + "num_input_tokens_seen": 19246240, + "step": 91190 + }, + { + "epoch": 10.032453245324533, + "grad_norm": 0.01214701309800148, + "learning_rate": 2.9202156020476133e-05, + "loss": 0.0056, + "num_input_tokens_seen": 19247360, + "step": 91195 + }, + { + "epoch": 10.033003300330034, + "grad_norm": 0.14485333859920502, + "learning_rate": 2.9199790080844864e-05, + "loss": 0.0868, + "num_input_tokens_seen": 19248416, + "step": 91200 + }, + { + "epoch": 10.033553355335533, + "grad_norm": 0.06485486775636673, + "learning_rate": 2.9197424102506133e-05, + "loss": 0.0581, + "num_input_tokens_seen": 19249504, + "step": 91205 + }, + { + "epoch": 10.034103410341034, + "grad_norm": 0.03555124253034592, + "learning_rate": 2.9195058085481765e-05, + "loss": 0.0076, + "num_input_tokens_seen": 19250592, + "step": 91210 + }, + { + "epoch": 10.034653465346535, + "grad_norm": 0.052756115794181824, + "learning_rate": 2.9192692029793554e-05, + "loss": 0.0506, + "num_input_tokens_seen": 19251616, + "step": 91215 + }, + { + "epoch": 10.035203520352034, + "grad_norm": 0.9077569246292114, + "learning_rate": 2.9190325935463315e-05, + "loss": 0.0256, + "num_input_tokens_seen": 19252736, + "step": 91220 + }, + { + "epoch": 10.035753575357536, + "grad_norm": 0.01958031952381134, + "learning_rate": 2.918795980251286e-05, + "loss": 0.0036, + "num_input_tokens_seen": 19253792, + "step": 91225 + }, + { + "epoch": 10.036303630363037, + "grad_norm": 0.20542825758457184, + "learning_rate": 2.918559363096398e-05, + "loss": 0.0073, + "num_input_tokens_seen": 19254880, + "step": 91230 + }, + { + "epoch": 10.036853685368538, + "grad_norm": 0.08188923448324203, + "learning_rate": 2.9183227420838493e-05, + "loss": 0.0063, + "num_input_tokens_seen": 19256000, + "step": 91235 + }, + { + "epoch": 10.037403740374037, + "grad_norm": 0.2258729338645935, + "learning_rate": 2.9180861172158207e-05, + "loss": 0.0545, + "num_input_tokens_seen": 19257088, + "step": 91240 + }, + { + "epoch": 10.037953795379538, + "grad_norm": 0.45821893215179443, + "learning_rate": 2.9178494884944928e-05, + "loss": 0.0761, + "num_input_tokens_seen": 19258112, + "step": 91245 + }, + { + "epoch": 10.03850385038504, + "grad_norm": 0.04551452398300171, + "learning_rate": 2.9176128559220473e-05, + "loss": 0.0019, + "num_input_tokens_seen": 19259232, + "step": 91250 + }, + { + "epoch": 10.039053905390539, + "grad_norm": 0.1826908141374588, + "learning_rate": 2.9173762195006644e-05, + "loss": 0.0192, + "num_input_tokens_seen": 19260320, + "step": 91255 + }, + { + "epoch": 10.03960396039604, + "grad_norm": 0.12880635261535645, + "learning_rate": 2.9171395792325247e-05, + "loss": 0.0093, + "num_input_tokens_seen": 19261376, + "step": 91260 + }, + { + "epoch": 10.04015401540154, + "grad_norm": 0.981127142906189, + "learning_rate": 2.9169029351198096e-05, + "loss": 0.0301, + "num_input_tokens_seen": 19262432, + "step": 91265 + }, + { + "epoch": 10.04070407040704, + "grad_norm": 0.016093727201223373, + "learning_rate": 2.9166662871647006e-05, + "loss": 0.0041, + "num_input_tokens_seen": 19263488, + "step": 91270 + }, + { + "epoch": 10.041254125412541, + "grad_norm": 2.262235164642334, + "learning_rate": 2.9164296353693786e-05, + "loss": 0.0223, + "num_input_tokens_seen": 19264576, + "step": 91275 + }, + { + "epoch": 10.041804180418042, + "grad_norm": 0.051061104983091354, + "learning_rate": 2.9161929797360234e-05, + "loss": 0.0209, + "num_input_tokens_seen": 19265600, + "step": 91280 + }, + { + "epoch": 10.042354235423542, + "grad_norm": 0.07766325771808624, + "learning_rate": 2.9159563202668184e-05, + "loss": 0.0096, + "num_input_tokens_seen": 19266592, + "step": 91285 + }, + { + "epoch": 10.042904290429043, + "grad_norm": 0.02739892713725567, + "learning_rate": 2.9157196569639427e-05, + "loss": 0.004, + "num_input_tokens_seen": 19267680, + "step": 91290 + }, + { + "epoch": 10.043454345434544, + "grad_norm": 0.027212075889110565, + "learning_rate": 2.9154829898295788e-05, + "loss": 0.0043, + "num_input_tokens_seen": 19268736, + "step": 91295 + }, + { + "epoch": 10.044004400440045, + "grad_norm": 3.5187158584594727, + "learning_rate": 2.915246318865908e-05, + "loss": 0.0958, + "num_input_tokens_seen": 19269760, + "step": 91300 + }, + { + "epoch": 10.044554455445544, + "grad_norm": 0.2868472635746002, + "learning_rate": 2.9150096440751107e-05, + "loss": 0.087, + "num_input_tokens_seen": 19270880, + "step": 91305 + }, + { + "epoch": 10.045104510451045, + "grad_norm": 0.2486443668603897, + "learning_rate": 2.914772965459368e-05, + "loss": 0.0409, + "num_input_tokens_seen": 19272000, + "step": 91310 + }, + { + "epoch": 10.045654565456546, + "grad_norm": 0.900639533996582, + "learning_rate": 2.9145362830208623e-05, + "loss": 0.0292, + "num_input_tokens_seen": 19273056, + "step": 91315 + }, + { + "epoch": 10.046204620462046, + "grad_norm": 0.008060241118073463, + "learning_rate": 2.914299596761775e-05, + "loss": 0.0025, + "num_input_tokens_seen": 19274080, + "step": 91320 + }, + { + "epoch": 10.046754675467547, + "grad_norm": 0.11125249415636063, + "learning_rate": 2.9140629066842873e-05, + "loss": 0.0104, + "num_input_tokens_seen": 19275104, + "step": 91325 + }, + { + "epoch": 10.047304730473048, + "grad_norm": 0.02897915244102478, + "learning_rate": 2.91382621279058e-05, + "loss": 0.0628, + "num_input_tokens_seen": 19276224, + "step": 91330 + }, + { + "epoch": 10.047854785478547, + "grad_norm": 0.07916098833084106, + "learning_rate": 2.913589515082835e-05, + "loss": 0.0808, + "num_input_tokens_seen": 19277280, + "step": 91335 + }, + { + "epoch": 10.048404840484048, + "grad_norm": 0.015062235295772552, + "learning_rate": 2.9133528135632338e-05, + "loss": 0.0028, + "num_input_tokens_seen": 19278272, + "step": 91340 + }, + { + "epoch": 10.04895489548955, + "grad_norm": 1.068925142288208, + "learning_rate": 2.9131161082339585e-05, + "loss": 0.0486, + "num_input_tokens_seen": 19279392, + "step": 91345 + }, + { + "epoch": 10.049504950495049, + "grad_norm": 0.03406081721186638, + "learning_rate": 2.9128793990971904e-05, + "loss": 0.0246, + "num_input_tokens_seen": 19280512, + "step": 91350 + }, + { + "epoch": 10.05005500550055, + "grad_norm": 0.04718182235956192, + "learning_rate": 2.9126426861551103e-05, + "loss": 0.0284, + "num_input_tokens_seen": 19281568, + "step": 91355 + }, + { + "epoch": 10.05060506050605, + "grad_norm": 0.3933793902397156, + "learning_rate": 2.912405969409901e-05, + "loss": 0.0481, + "num_input_tokens_seen": 19282688, + "step": 91360 + }, + { + "epoch": 10.051155115511552, + "grad_norm": 0.0671713650226593, + "learning_rate": 2.912169248863744e-05, + "loss": 0.0205, + "num_input_tokens_seen": 19283712, + "step": 91365 + }, + { + "epoch": 10.051705170517051, + "grad_norm": 0.7959022521972656, + "learning_rate": 2.9119325245188207e-05, + "loss": 0.0449, + "num_input_tokens_seen": 19284768, + "step": 91370 + }, + { + "epoch": 10.052255225522552, + "grad_norm": 0.09453344345092773, + "learning_rate": 2.9116957963773127e-05, + "loss": 0.0073, + "num_input_tokens_seen": 19285888, + "step": 91375 + }, + { + "epoch": 10.052805280528053, + "grad_norm": 0.037304654717445374, + "learning_rate": 2.911459064441402e-05, + "loss": 0.116, + "num_input_tokens_seen": 19286944, + "step": 91380 + }, + { + "epoch": 10.053355335533553, + "grad_norm": 0.026366757228970528, + "learning_rate": 2.9112223287132706e-05, + "loss": 0.015, + "num_input_tokens_seen": 19288032, + "step": 91385 + }, + { + "epoch": 10.053905390539054, + "grad_norm": 0.49416181445121765, + "learning_rate": 2.910985589195101e-05, + "loss": 0.0621, + "num_input_tokens_seen": 19289088, + "step": 91390 + }, + { + "epoch": 10.054455445544555, + "grad_norm": 0.028305264189839363, + "learning_rate": 2.9107488458890737e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19290176, + "step": 91395 + }, + { + "epoch": 10.055005500550054, + "grad_norm": 0.014926702715456486, + "learning_rate": 2.9105120987973715e-05, + "loss": 0.0438, + "num_input_tokens_seen": 19291296, + "step": 91400 + }, + { + "epoch": 10.055555555555555, + "grad_norm": 0.03716995194554329, + "learning_rate": 2.9102753479221768e-05, + "loss": 0.0083, + "num_input_tokens_seen": 19292256, + "step": 91405 + }, + { + "epoch": 10.056105610561056, + "grad_norm": 0.11739926785230637, + "learning_rate": 2.9100385932656698e-05, + "loss": 0.0111, + "num_input_tokens_seen": 19293280, + "step": 91410 + }, + { + "epoch": 10.056655665566556, + "grad_norm": 0.12915965914726257, + "learning_rate": 2.9098018348300355e-05, + "loss": 0.0134, + "num_input_tokens_seen": 19294368, + "step": 91415 + }, + { + "epoch": 10.057205720572057, + "grad_norm": 0.025607984513044357, + "learning_rate": 2.9095650726174532e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19295392, + "step": 91420 + }, + { + "epoch": 10.057755775577558, + "grad_norm": 0.01223526056855917, + "learning_rate": 2.9093283066301065e-05, + "loss": 0.0251, + "num_input_tokens_seen": 19296384, + "step": 91425 + }, + { + "epoch": 10.058305830583059, + "grad_norm": 0.1377772092819214, + "learning_rate": 2.909091536870177e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19297376, + "step": 91430 + }, + { + "epoch": 10.058855885588558, + "grad_norm": 0.23486213386058807, + "learning_rate": 2.908854763339847e-05, + "loss": 0.0081, + "num_input_tokens_seen": 19298400, + "step": 91435 + }, + { + "epoch": 10.05940594059406, + "grad_norm": 0.14705896377563477, + "learning_rate": 2.9086179860413e-05, + "loss": 0.0124, + "num_input_tokens_seen": 19299488, + "step": 91440 + }, + { + "epoch": 10.05995599559956, + "grad_norm": 0.028373248875141144, + "learning_rate": 2.9083812049767157e-05, + "loss": 0.1271, + "num_input_tokens_seen": 19300512, + "step": 91445 + }, + { + "epoch": 10.06050605060506, + "grad_norm": 0.7678778171539307, + "learning_rate": 2.9081444201482788e-05, + "loss": 0.0265, + "num_input_tokens_seen": 19301568, + "step": 91450 + }, + { + "epoch": 10.061056105610561, + "grad_norm": 0.007920991629362106, + "learning_rate": 2.9079076315581693e-05, + "loss": 0.0017, + "num_input_tokens_seen": 19302624, + "step": 91455 + }, + { + "epoch": 10.061606160616062, + "grad_norm": 0.04154923930764198, + "learning_rate": 2.907670839208572e-05, + "loss": 0.0238, + "num_input_tokens_seen": 19303680, + "step": 91460 + }, + { + "epoch": 10.062156215621561, + "grad_norm": 0.05330917239189148, + "learning_rate": 2.907434043101668e-05, + "loss": 0.0483, + "num_input_tokens_seen": 19304736, + "step": 91465 + }, + { + "epoch": 10.062706270627062, + "grad_norm": 0.19089962542057037, + "learning_rate": 2.90719724323964e-05, + "loss": 0.0037, + "num_input_tokens_seen": 19305824, + "step": 91470 + }, + { + "epoch": 10.063256325632564, + "grad_norm": 0.3664495348930359, + "learning_rate": 2.9069604396246703e-05, + "loss": 0.0049, + "num_input_tokens_seen": 19306912, + "step": 91475 + }, + { + "epoch": 10.063806380638065, + "grad_norm": 0.5906378030776978, + "learning_rate": 2.9067236322589415e-05, + "loss": 0.0269, + "num_input_tokens_seen": 19308032, + "step": 91480 + }, + { + "epoch": 10.064356435643564, + "grad_norm": 0.7598147988319397, + "learning_rate": 2.9064868211446362e-05, + "loss": 0.0579, + "num_input_tokens_seen": 19309088, + "step": 91485 + }, + { + "epoch": 10.064906490649065, + "grad_norm": 0.16810773313045502, + "learning_rate": 2.906250006283937e-05, + "loss": 0.0206, + "num_input_tokens_seen": 19310112, + "step": 91490 + }, + { + "epoch": 10.065456545654566, + "grad_norm": 0.011137991212308407, + "learning_rate": 2.906013187679026e-05, + "loss": 0.0123, + "num_input_tokens_seen": 19311136, + "step": 91495 + }, + { + "epoch": 10.066006600660065, + "grad_norm": 0.06852947175502777, + "learning_rate": 2.9057763653320863e-05, + "loss": 0.0133, + "num_input_tokens_seen": 19312192, + "step": 91500 + }, + { + "epoch": 10.066556655665567, + "grad_norm": 0.013457122258841991, + "learning_rate": 2.905539539245301e-05, + "loss": 0.0016, + "num_input_tokens_seen": 19313184, + "step": 91505 + }, + { + "epoch": 10.067106710671068, + "grad_norm": 0.37738585472106934, + "learning_rate": 2.9053027094208524e-05, + "loss": 0.0073, + "num_input_tokens_seen": 19314144, + "step": 91510 + }, + { + "epoch": 10.067656765676567, + "grad_norm": 0.02140462026000023, + "learning_rate": 2.9050658758609228e-05, + "loss": 0.0237, + "num_input_tokens_seen": 19315168, + "step": 91515 + }, + { + "epoch": 10.068206820682068, + "grad_norm": 0.051230914890766144, + "learning_rate": 2.9048290385676958e-05, + "loss": 0.0233, + "num_input_tokens_seen": 19316256, + "step": 91520 + }, + { + "epoch": 10.06875687568757, + "grad_norm": 0.022582797333598137, + "learning_rate": 2.9045921975433533e-05, + "loss": 0.0143, + "num_input_tokens_seen": 19317312, + "step": 91525 + }, + { + "epoch": 10.069306930693068, + "grad_norm": 1.7296767234802246, + "learning_rate": 2.9043553527900785e-05, + "loss": 0.0708, + "num_input_tokens_seen": 19318336, + "step": 91530 + }, + { + "epoch": 10.06985698569857, + "grad_norm": 0.006848475430160761, + "learning_rate": 2.9041185043100555e-05, + "loss": 0.0412, + "num_input_tokens_seen": 19319392, + "step": 91535 + }, + { + "epoch": 10.07040704070407, + "grad_norm": 3.5206029415130615, + "learning_rate": 2.9038816521054653e-05, + "loss": 0.039, + "num_input_tokens_seen": 19320480, + "step": 91540 + }, + { + "epoch": 10.070957095709572, + "grad_norm": 1.9070073366165161, + "learning_rate": 2.903644796178492e-05, + "loss": 0.0716, + "num_input_tokens_seen": 19321568, + "step": 91545 + }, + { + "epoch": 10.071507150715071, + "grad_norm": 0.007036980241537094, + "learning_rate": 2.9034079365313184e-05, + "loss": 0.0056, + "num_input_tokens_seen": 19322688, + "step": 91550 + }, + { + "epoch": 10.072057205720572, + "grad_norm": 0.006541818846017122, + "learning_rate": 2.9031710731661265e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19323840, + "step": 91555 + }, + { + "epoch": 10.072607260726073, + "grad_norm": 1.7770503759384155, + "learning_rate": 2.9029342060851018e-05, + "loss": 0.0983, + "num_input_tokens_seen": 19324992, + "step": 91560 + }, + { + "epoch": 10.073157315731573, + "grad_norm": 0.09074682742357254, + "learning_rate": 2.902697335290425e-05, + "loss": 0.0276, + "num_input_tokens_seen": 19325952, + "step": 91565 + }, + { + "epoch": 10.073707370737074, + "grad_norm": 0.0048417444340884686, + "learning_rate": 2.9024604607842797e-05, + "loss": 0.028, + "num_input_tokens_seen": 19327008, + "step": 91570 + }, + { + "epoch": 10.074257425742575, + "grad_norm": 0.07191278040409088, + "learning_rate": 2.9022235825688504e-05, + "loss": 0.0056, + "num_input_tokens_seen": 19328096, + "step": 91575 + }, + { + "epoch": 10.074807480748074, + "grad_norm": 0.8862614035606384, + "learning_rate": 2.9019867006463196e-05, + "loss": 0.0109, + "num_input_tokens_seen": 19329184, + "step": 91580 + }, + { + "epoch": 10.075357535753575, + "grad_norm": 2.299508571624756, + "learning_rate": 2.9017498150188688e-05, + "loss": 0.0479, + "num_input_tokens_seen": 19330336, + "step": 91585 + }, + { + "epoch": 10.075907590759076, + "grad_norm": 1.2195883989334106, + "learning_rate": 2.9015129256886836e-05, + "loss": 0.0255, + "num_input_tokens_seen": 19331456, + "step": 91590 + }, + { + "epoch": 10.076457645764576, + "grad_norm": 0.0050767757929861546, + "learning_rate": 2.9012760326579463e-05, + "loss": 0.0304, + "num_input_tokens_seen": 19332512, + "step": 91595 + }, + { + "epoch": 10.077007700770077, + "grad_norm": 0.01763514056801796, + "learning_rate": 2.9010391359288403e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19333536, + "step": 91600 + }, + { + "epoch": 10.077557755775578, + "grad_norm": 0.004568516276776791, + "learning_rate": 2.9008022355035497e-05, + "loss": 0.0057, + "num_input_tokens_seen": 19334560, + "step": 91605 + }, + { + "epoch": 10.078107810781079, + "grad_norm": 1.4276131391525269, + "learning_rate": 2.9005653313842567e-05, + "loss": 0.2684, + "num_input_tokens_seen": 19335648, + "step": 91610 + }, + { + "epoch": 10.078657865786578, + "grad_norm": 0.008633649908006191, + "learning_rate": 2.9003284235731448e-05, + "loss": 0.068, + "num_input_tokens_seen": 19336736, + "step": 91615 + }, + { + "epoch": 10.07920792079208, + "grad_norm": 0.03976316377520561, + "learning_rate": 2.900091512072398e-05, + "loss": 0.0298, + "num_input_tokens_seen": 19337792, + "step": 91620 + }, + { + "epoch": 10.07975797579758, + "grad_norm": 0.005996108055114746, + "learning_rate": 2.8998545968842004e-05, + "loss": 0.0219, + "num_input_tokens_seen": 19338816, + "step": 91625 + }, + { + "epoch": 10.08030803080308, + "grad_norm": 0.045058898627758026, + "learning_rate": 2.8996176780107342e-05, + "loss": 0.0248, + "num_input_tokens_seen": 19339872, + "step": 91630 + }, + { + "epoch": 10.08085808580858, + "grad_norm": 0.024848777800798416, + "learning_rate": 2.899380755454184e-05, + "loss": 0.1046, + "num_input_tokens_seen": 19340960, + "step": 91635 + }, + { + "epoch": 10.081408140814082, + "grad_norm": 0.0038113065529614687, + "learning_rate": 2.8991438292167334e-05, + "loss": 0.0015, + "num_input_tokens_seen": 19342080, + "step": 91640 + }, + { + "epoch": 10.081958195819581, + "grad_norm": 0.010408814996480942, + "learning_rate": 2.8989068993005648e-05, + "loss": 0.002, + "num_input_tokens_seen": 19343168, + "step": 91645 + }, + { + "epoch": 10.082508250825082, + "grad_norm": 0.0021540154702961445, + "learning_rate": 2.898669965707863e-05, + "loss": 0.0029, + "num_input_tokens_seen": 19344224, + "step": 91650 + }, + { + "epoch": 10.083058305830583, + "grad_norm": 0.8879221677780151, + "learning_rate": 2.8984330284408112e-05, + "loss": 0.1502, + "num_input_tokens_seen": 19345248, + "step": 91655 + }, + { + "epoch": 10.083608360836084, + "grad_norm": 0.029303211718797684, + "learning_rate": 2.898196087501594e-05, + "loss": 0.0014, + "num_input_tokens_seen": 19346240, + "step": 91660 + }, + { + "epoch": 10.084158415841584, + "grad_norm": 0.9361075758934021, + "learning_rate": 2.897959142892394e-05, + "loss": 0.0506, + "num_input_tokens_seen": 19347264, + "step": 91665 + }, + { + "epoch": 10.084708470847085, + "grad_norm": 0.03641883283853531, + "learning_rate": 2.8977221946153954e-05, + "loss": 0.0589, + "num_input_tokens_seen": 19348288, + "step": 91670 + }, + { + "epoch": 10.085258525852586, + "grad_norm": 0.08799729496240616, + "learning_rate": 2.8974852426727834e-05, + "loss": 0.0105, + "num_input_tokens_seen": 19349376, + "step": 91675 + }, + { + "epoch": 10.085808580858085, + "grad_norm": 0.010906455107033253, + "learning_rate": 2.8972482870667394e-05, + "loss": 0.0191, + "num_input_tokens_seen": 19350368, + "step": 91680 + }, + { + "epoch": 10.086358635863586, + "grad_norm": 0.1850094050168991, + "learning_rate": 2.8970113277994487e-05, + "loss": 0.0528, + "num_input_tokens_seen": 19351424, + "step": 91685 + }, + { + "epoch": 10.086908690869087, + "grad_norm": 0.32905375957489014, + "learning_rate": 2.8967743648730955e-05, + "loss": 0.1579, + "num_input_tokens_seen": 19352480, + "step": 91690 + }, + { + "epoch": 10.087458745874587, + "grad_norm": 0.026960041373968124, + "learning_rate": 2.896537398289863e-05, + "loss": 0.0152, + "num_input_tokens_seen": 19353472, + "step": 91695 + }, + { + "epoch": 10.088008800880088, + "grad_norm": 0.00670440960675478, + "learning_rate": 2.8963004280519363e-05, + "loss": 0.0568, + "num_input_tokens_seen": 19354528, + "step": 91700 + }, + { + "epoch": 10.088558855885589, + "grad_norm": 0.047784265130758286, + "learning_rate": 2.8960634541614982e-05, + "loss": 0.022, + "num_input_tokens_seen": 19355584, + "step": 91705 + }, + { + "epoch": 10.089108910891088, + "grad_norm": 0.06319468468427658, + "learning_rate": 2.8958264766207334e-05, + "loss": 0.0075, + "num_input_tokens_seen": 19356608, + "step": 91710 + }, + { + "epoch": 10.08965896589659, + "grad_norm": 0.11402027308940887, + "learning_rate": 2.895589495431825e-05, + "loss": 0.0043, + "num_input_tokens_seen": 19357664, + "step": 91715 + }, + { + "epoch": 10.09020902090209, + "grad_norm": 0.008871623314917088, + "learning_rate": 2.8953525105969593e-05, + "loss": 0.0063, + "num_input_tokens_seen": 19358752, + "step": 91720 + }, + { + "epoch": 10.090759075907592, + "grad_norm": 1.2681069374084473, + "learning_rate": 2.895115522118319e-05, + "loss": 0.1315, + "num_input_tokens_seen": 19359776, + "step": 91725 + }, + { + "epoch": 10.091309130913091, + "grad_norm": 0.015276098623871803, + "learning_rate": 2.894878529998088e-05, + "loss": 0.0157, + "num_input_tokens_seen": 19360800, + "step": 91730 + }, + { + "epoch": 10.091859185918592, + "grad_norm": 0.02752901241183281, + "learning_rate": 2.8946415342384513e-05, + "loss": 0.0147, + "num_input_tokens_seen": 19361856, + "step": 91735 + }, + { + "epoch": 10.092409240924093, + "grad_norm": 0.022182287648320198, + "learning_rate": 2.8944045348415933e-05, + "loss": 0.0389, + "num_input_tokens_seen": 19362912, + "step": 91740 + }, + { + "epoch": 10.092959295929592, + "grad_norm": 0.27701935172080994, + "learning_rate": 2.894167531809697e-05, + "loss": 0.0136, + "num_input_tokens_seen": 19364000, + "step": 91745 + }, + { + "epoch": 10.093509350935093, + "grad_norm": 0.00784379057586193, + "learning_rate": 2.893930525144949e-05, + "loss": 0.0026, + "num_input_tokens_seen": 19365152, + "step": 91750 + }, + { + "epoch": 10.094059405940595, + "grad_norm": 1.7758351564407349, + "learning_rate": 2.893693514849532e-05, + "loss": 0.0946, + "num_input_tokens_seen": 19366208, + "step": 91755 + }, + { + "epoch": 10.094609460946094, + "grad_norm": 0.03211282566189766, + "learning_rate": 2.8934565009256298e-05, + "loss": 0.0049, + "num_input_tokens_seen": 19367264, + "step": 91760 + }, + { + "epoch": 10.095159515951595, + "grad_norm": 0.013445681892335415, + "learning_rate": 2.8932194833754284e-05, + "loss": 0.0032, + "num_input_tokens_seen": 19368256, + "step": 91765 + }, + { + "epoch": 10.095709570957096, + "grad_norm": 0.0056440262123942375, + "learning_rate": 2.892982462201112e-05, + "loss": 0.0073, + "num_input_tokens_seen": 19369344, + "step": 91770 + }, + { + "epoch": 10.096259625962595, + "grad_norm": 0.02719884365797043, + "learning_rate": 2.8927454374048646e-05, + "loss": 0.0014, + "num_input_tokens_seen": 19370400, + "step": 91775 + }, + { + "epoch": 10.096809680968097, + "grad_norm": 0.14009930193424225, + "learning_rate": 2.8925084089888717e-05, + "loss": 0.0262, + "num_input_tokens_seen": 19371456, + "step": 91780 + }, + { + "epoch": 10.097359735973598, + "grad_norm": 0.03261568397283554, + "learning_rate": 2.8922713769553162e-05, + "loss": 0.0097, + "num_input_tokens_seen": 19372544, + "step": 91785 + }, + { + "epoch": 10.097909790979099, + "grad_norm": 0.4823055565357208, + "learning_rate": 2.8920343413063832e-05, + "loss": 0.0111, + "num_input_tokens_seen": 19373600, + "step": 91790 + }, + { + "epoch": 10.098459845984598, + "grad_norm": 0.07085848599672318, + "learning_rate": 2.8917973020442597e-05, + "loss": 0.0242, + "num_input_tokens_seen": 19374688, + "step": 91795 + }, + { + "epoch": 10.099009900990099, + "grad_norm": 0.01618797890841961, + "learning_rate": 2.8915602591711267e-05, + "loss": 0.119, + "num_input_tokens_seen": 19375712, + "step": 91800 + }, + { + "epoch": 10.0995599559956, + "grad_norm": 0.022727202624082565, + "learning_rate": 2.891323212689171e-05, + "loss": 0.0331, + "num_input_tokens_seen": 19376768, + "step": 91805 + }, + { + "epoch": 10.1001100110011, + "grad_norm": 0.1087423712015152, + "learning_rate": 2.8910861626005776e-05, + "loss": 0.0043, + "num_input_tokens_seen": 19377856, + "step": 91810 + }, + { + "epoch": 10.1006600660066, + "grad_norm": 0.05982222780585289, + "learning_rate": 2.89084910890753e-05, + "loss": 0.0034, + "num_input_tokens_seen": 19378944, + "step": 91815 + }, + { + "epoch": 10.101210121012102, + "grad_norm": 0.5687654614448547, + "learning_rate": 2.8906120516122144e-05, + "loss": 0.0096, + "num_input_tokens_seen": 19379968, + "step": 91820 + }, + { + "epoch": 10.101760176017601, + "grad_norm": 0.49313363432884216, + "learning_rate": 2.890374990716815e-05, + "loss": 0.0242, + "num_input_tokens_seen": 19381024, + "step": 91825 + }, + { + "epoch": 10.102310231023102, + "grad_norm": 0.00860243197530508, + "learning_rate": 2.8901379262235156e-05, + "loss": 0.0343, + "num_input_tokens_seen": 19382112, + "step": 91830 + }, + { + "epoch": 10.102860286028603, + "grad_norm": 0.9095429182052612, + "learning_rate": 2.8899008581345028e-05, + "loss": 0.0552, + "num_input_tokens_seen": 19383104, + "step": 91835 + }, + { + "epoch": 10.103410341034103, + "grad_norm": 0.02358306013047695, + "learning_rate": 2.889663786451961e-05, + "loss": 0.0691, + "num_input_tokens_seen": 19384160, + "step": 91840 + }, + { + "epoch": 10.103960396039604, + "grad_norm": 0.025373205542564392, + "learning_rate": 2.8894267111780748e-05, + "loss": 0.0208, + "num_input_tokens_seen": 19385184, + "step": 91845 + }, + { + "epoch": 10.104510451045105, + "grad_norm": 0.0866783857345581, + "learning_rate": 2.8891896323150302e-05, + "loss": 0.0196, + "num_input_tokens_seen": 19386272, + "step": 91850 + }, + { + "epoch": 10.105060506050606, + "grad_norm": 0.3092295825481415, + "learning_rate": 2.8889525498650105e-05, + "loss": 0.1011, + "num_input_tokens_seen": 19387360, + "step": 91855 + }, + { + "epoch": 10.105610561056105, + "grad_norm": 0.03702536225318909, + "learning_rate": 2.888715463830202e-05, + "loss": 0.0685, + "num_input_tokens_seen": 19388384, + "step": 91860 + }, + { + "epoch": 10.106160616061606, + "grad_norm": 0.03286624327301979, + "learning_rate": 2.888478374212791e-05, + "loss": 0.0079, + "num_input_tokens_seen": 19389440, + "step": 91865 + }, + { + "epoch": 10.106710671067107, + "grad_norm": 0.02352900803089142, + "learning_rate": 2.8882412810149595e-05, + "loss": 0.0014, + "num_input_tokens_seen": 19390432, + "step": 91870 + }, + { + "epoch": 10.107260726072607, + "grad_norm": 0.8797422647476196, + "learning_rate": 2.8880041842388944e-05, + "loss": 0.0095, + "num_input_tokens_seen": 19391424, + "step": 91875 + }, + { + "epoch": 10.107810781078108, + "grad_norm": 0.4153066873550415, + "learning_rate": 2.8877670838867815e-05, + "loss": 0.0726, + "num_input_tokens_seen": 19392480, + "step": 91880 + }, + { + "epoch": 10.108360836083609, + "grad_norm": 0.08975353091955185, + "learning_rate": 2.887529979960806e-05, + "loss": 0.0467, + "num_input_tokens_seen": 19393600, + "step": 91885 + }, + { + "epoch": 10.108910891089108, + "grad_norm": 0.08211875706911087, + "learning_rate": 2.8872928724631522e-05, + "loss": 0.021, + "num_input_tokens_seen": 19394624, + "step": 91890 + }, + { + "epoch": 10.10946094609461, + "grad_norm": 0.09604822844266891, + "learning_rate": 2.8870557613960055e-05, + "loss": 0.0165, + "num_input_tokens_seen": 19395680, + "step": 91895 + }, + { + "epoch": 10.11001100110011, + "grad_norm": 0.008234783075749874, + "learning_rate": 2.8868186467615517e-05, + "loss": 0.0436, + "num_input_tokens_seen": 19396768, + "step": 91900 + }, + { + "epoch": 10.110561056105611, + "grad_norm": 0.046048007905483246, + "learning_rate": 2.886581528561976e-05, + "loss": 0.0048, + "num_input_tokens_seen": 19397824, + "step": 91905 + }, + { + "epoch": 10.11111111111111, + "grad_norm": 1.37312650680542, + "learning_rate": 2.8863444067994637e-05, + "loss": 0.0762, + "num_input_tokens_seen": 19398944, + "step": 91910 + }, + { + "epoch": 10.111661166116612, + "grad_norm": 0.7924903035163879, + "learning_rate": 2.886107281476201e-05, + "loss": 0.0143, + "num_input_tokens_seen": 19400128, + "step": 91915 + }, + { + "epoch": 10.112211221122113, + "grad_norm": 0.02276560477912426, + "learning_rate": 2.885870152594372e-05, + "loss": 0.0183, + "num_input_tokens_seen": 19401184, + "step": 91920 + }, + { + "epoch": 10.112761276127612, + "grad_norm": 0.01654871739447117, + "learning_rate": 2.8856330201561636e-05, + "loss": 0.03, + "num_input_tokens_seen": 19402240, + "step": 91925 + }, + { + "epoch": 10.113311331133113, + "grad_norm": 1.941859483718872, + "learning_rate": 2.885395884163761e-05, + "loss": 0.0324, + "num_input_tokens_seen": 19403328, + "step": 91930 + }, + { + "epoch": 10.113861386138614, + "grad_norm": 0.09677032381296158, + "learning_rate": 2.8851587446193485e-05, + "loss": 0.0102, + "num_input_tokens_seen": 19404320, + "step": 91935 + }, + { + "epoch": 10.114411441144114, + "grad_norm": 0.5485609769821167, + "learning_rate": 2.8849216015251135e-05, + "loss": 0.0351, + "num_input_tokens_seen": 19405376, + "step": 91940 + }, + { + "epoch": 10.114961496149615, + "grad_norm": 0.03175222873687744, + "learning_rate": 2.8846844548832396e-05, + "loss": 0.0031, + "num_input_tokens_seen": 19406464, + "step": 91945 + }, + { + "epoch": 10.115511551155116, + "grad_norm": 0.9743783473968506, + "learning_rate": 2.884447304695915e-05, + "loss": 0.0613, + "num_input_tokens_seen": 19407552, + "step": 91950 + }, + { + "epoch": 10.116061606160615, + "grad_norm": 0.13064682483673096, + "learning_rate": 2.8842101509653235e-05, + "loss": 0.0172, + "num_input_tokens_seen": 19408640, + "step": 91955 + }, + { + "epoch": 10.116611661166116, + "grad_norm": 0.06204588711261749, + "learning_rate": 2.8839729936936516e-05, + "loss": 0.1094, + "num_input_tokens_seen": 19409728, + "step": 91960 + }, + { + "epoch": 10.117161716171617, + "grad_norm": 0.004543858580291271, + "learning_rate": 2.8837358328830843e-05, + "loss": 0.009, + "num_input_tokens_seen": 19410752, + "step": 91965 + }, + { + "epoch": 10.117711771177119, + "grad_norm": 0.10215948522090912, + "learning_rate": 2.883498668535809e-05, + "loss": 0.005, + "num_input_tokens_seen": 19411712, + "step": 91970 + }, + { + "epoch": 10.118261826182618, + "grad_norm": 0.39353805780410767, + "learning_rate": 2.8832615006540094e-05, + "loss": 0.0052, + "num_input_tokens_seen": 19412704, + "step": 91975 + }, + { + "epoch": 10.118811881188119, + "grad_norm": 0.3269704580307007, + "learning_rate": 2.883024329239873e-05, + "loss": 0.0306, + "num_input_tokens_seen": 19413760, + "step": 91980 + }, + { + "epoch": 10.11936193619362, + "grad_norm": 0.06130828708410263, + "learning_rate": 2.8827871542955854e-05, + "loss": 0.0136, + "num_input_tokens_seen": 19414784, + "step": 91985 + }, + { + "epoch": 10.11991199119912, + "grad_norm": 0.10612966120243073, + "learning_rate": 2.8825499758233314e-05, + "loss": 0.007, + "num_input_tokens_seen": 19415872, + "step": 91990 + }, + { + "epoch": 10.12046204620462, + "grad_norm": 0.01931948773562908, + "learning_rate": 2.8823127938252987e-05, + "loss": 0.0885, + "num_input_tokens_seen": 19416896, + "step": 91995 + }, + { + "epoch": 10.121012101210122, + "grad_norm": 0.06353303045034409, + "learning_rate": 2.882075608303672e-05, + "loss": 0.0268, + "num_input_tokens_seen": 19417984, + "step": 92000 + }, + { + "epoch": 10.12156215621562, + "grad_norm": 0.005820729769766331, + "learning_rate": 2.8818384192606373e-05, + "loss": 0.0902, + "num_input_tokens_seen": 19419072, + "step": 92005 + }, + { + "epoch": 10.122112211221122, + "grad_norm": 0.024565143510699272, + "learning_rate": 2.881601226698383e-05, + "loss": 0.0018, + "num_input_tokens_seen": 19420160, + "step": 92010 + }, + { + "epoch": 10.122662266226623, + "grad_norm": 0.013102679513394833, + "learning_rate": 2.8813640306190915e-05, + "loss": 0.0049, + "num_input_tokens_seen": 19421216, + "step": 92015 + }, + { + "epoch": 10.123212321232122, + "grad_norm": 0.05573813617229462, + "learning_rate": 2.8811268310249506e-05, + "loss": 0.0041, + "num_input_tokens_seen": 19422304, + "step": 92020 + }, + { + "epoch": 10.123762376237623, + "grad_norm": 0.14145033061504364, + "learning_rate": 2.8808896279181474e-05, + "loss": 0.0092, + "num_input_tokens_seen": 19423328, + "step": 92025 + }, + { + "epoch": 10.124312431243125, + "grad_norm": 0.3026132583618164, + "learning_rate": 2.8806524213008673e-05, + "loss": 0.0117, + "num_input_tokens_seen": 19424384, + "step": 92030 + }, + { + "epoch": 10.124862486248626, + "grad_norm": 0.010888230986893177, + "learning_rate": 2.8804152111752963e-05, + "loss": 0.2077, + "num_input_tokens_seen": 19425472, + "step": 92035 + }, + { + "epoch": 10.125412541254125, + "grad_norm": 0.007723958697170019, + "learning_rate": 2.8801779975436212e-05, + "loss": 0.0158, + "num_input_tokens_seen": 19426464, + "step": 92040 + }, + { + "epoch": 10.125962596259626, + "grad_norm": 0.05266951769590378, + "learning_rate": 2.8799407804080276e-05, + "loss": 0.0473, + "num_input_tokens_seen": 19427488, + "step": 92045 + }, + { + "epoch": 10.126512651265127, + "grad_norm": 0.004534166771918535, + "learning_rate": 2.879703559770701e-05, + "loss": 0.0024, + "num_input_tokens_seen": 19428480, + "step": 92050 + }, + { + "epoch": 10.127062706270626, + "grad_norm": 1.1973471641540527, + "learning_rate": 2.8794663356338307e-05, + "loss": 0.0377, + "num_input_tokens_seen": 19429472, + "step": 92055 + }, + { + "epoch": 10.127612761276128, + "grad_norm": 0.11089864373207092, + "learning_rate": 2.8792291079996008e-05, + "loss": 0.0763, + "num_input_tokens_seen": 19430528, + "step": 92060 + }, + { + "epoch": 10.128162816281629, + "grad_norm": 0.14072822034358978, + "learning_rate": 2.8789918768701972e-05, + "loss": 0.0084, + "num_input_tokens_seen": 19431552, + "step": 92065 + }, + { + "epoch": 10.128712871287128, + "grad_norm": 0.7688254117965698, + "learning_rate": 2.8787546422478083e-05, + "loss": 0.1693, + "num_input_tokens_seen": 19432576, + "step": 92070 + }, + { + "epoch": 10.129262926292629, + "grad_norm": 0.018741827458143234, + "learning_rate": 2.8785174041346198e-05, + "loss": 0.0041, + "num_input_tokens_seen": 19433632, + "step": 92075 + }, + { + "epoch": 10.12981298129813, + "grad_norm": 0.04933274909853935, + "learning_rate": 2.878280162532817e-05, + "loss": 0.0058, + "num_input_tokens_seen": 19434688, + "step": 92080 + }, + { + "epoch": 10.130363036303631, + "grad_norm": 0.0035178596153855324, + "learning_rate": 2.8780429174445882e-05, + "loss": 0.033, + "num_input_tokens_seen": 19435680, + "step": 92085 + }, + { + "epoch": 10.13091309130913, + "grad_norm": 0.021025987342000008, + "learning_rate": 2.8778056688721193e-05, + "loss": 0.0019, + "num_input_tokens_seen": 19436672, + "step": 92090 + }, + { + "epoch": 10.131463146314632, + "grad_norm": 0.009771092794835567, + "learning_rate": 2.8775684168175958e-05, + "loss": 0.1012, + "num_input_tokens_seen": 19437728, + "step": 92095 + }, + { + "epoch": 10.132013201320133, + "grad_norm": 0.03582003712654114, + "learning_rate": 2.8773311612832066e-05, + "loss": 0.1222, + "num_input_tokens_seen": 19438848, + "step": 92100 + }, + { + "epoch": 10.132563256325632, + "grad_norm": 0.01332350354641676, + "learning_rate": 2.877093902271137e-05, + "loss": 0.0028, + "num_input_tokens_seen": 19439840, + "step": 92105 + }, + { + "epoch": 10.133113311331133, + "grad_norm": 0.05638793855905533, + "learning_rate": 2.8768566397835734e-05, + "loss": 0.0094, + "num_input_tokens_seen": 19440960, + "step": 92110 + }, + { + "epoch": 10.133663366336634, + "grad_norm": 1.2588152885437012, + "learning_rate": 2.8766193738227032e-05, + "loss": 0.1045, + "num_input_tokens_seen": 19442016, + "step": 92115 + }, + { + "epoch": 10.134213421342134, + "grad_norm": 0.03218803554773331, + "learning_rate": 2.876382104390713e-05, + "loss": 0.017, + "num_input_tokens_seen": 19443104, + "step": 92120 + }, + { + "epoch": 10.134763476347635, + "grad_norm": 0.03676483407616615, + "learning_rate": 2.8761448314897888e-05, + "loss": 0.0149, + "num_input_tokens_seen": 19444160, + "step": 92125 + }, + { + "epoch": 10.135313531353136, + "grad_norm": 0.13718798756599426, + "learning_rate": 2.8759075551221194e-05, + "loss": 0.012, + "num_input_tokens_seen": 19445248, + "step": 92130 + }, + { + "epoch": 10.135863586358635, + "grad_norm": 0.13144119083881378, + "learning_rate": 2.8756702752898897e-05, + "loss": 0.0159, + "num_input_tokens_seen": 19446368, + "step": 92135 + }, + { + "epoch": 10.136413641364136, + "grad_norm": 0.008036124520003796, + "learning_rate": 2.8754329919952878e-05, + "loss": 0.0252, + "num_input_tokens_seen": 19447360, + "step": 92140 + }, + { + "epoch": 10.136963696369637, + "grad_norm": 0.08640023320913315, + "learning_rate": 2.8751957052404998e-05, + "loss": 0.0825, + "num_input_tokens_seen": 19448416, + "step": 92145 + }, + { + "epoch": 10.137513751375138, + "grad_norm": 0.6575244069099426, + "learning_rate": 2.8749584150277133e-05, + "loss": 0.0108, + "num_input_tokens_seen": 19449504, + "step": 92150 + }, + { + "epoch": 10.138063806380638, + "grad_norm": 0.5366931557655334, + "learning_rate": 2.8747211213591147e-05, + "loss": 0.0243, + "num_input_tokens_seen": 19450528, + "step": 92155 + }, + { + "epoch": 10.138613861386139, + "grad_norm": 1.0726792812347412, + "learning_rate": 2.8744838242368917e-05, + "loss": 0.0475, + "num_input_tokens_seen": 19451584, + "step": 92160 + }, + { + "epoch": 10.13916391639164, + "grad_norm": 0.6636982560157776, + "learning_rate": 2.87424652366323e-05, + "loss": 0.0261, + "num_input_tokens_seen": 19452640, + "step": 92165 + }, + { + "epoch": 10.13971397139714, + "grad_norm": 0.1459965854883194, + "learning_rate": 2.8740092196403184e-05, + "loss": 0.0086, + "num_input_tokens_seen": 19453760, + "step": 92170 + }, + { + "epoch": 10.14026402640264, + "grad_norm": 1.2718720436096191, + "learning_rate": 2.873771912170344e-05, + "loss": 0.0707, + "num_input_tokens_seen": 19454880, + "step": 92175 + }, + { + "epoch": 10.140814081408141, + "grad_norm": 0.022942662239074707, + "learning_rate": 2.873534601255492e-05, + "loss": 0.0265, + "num_input_tokens_seen": 19455968, + "step": 92180 + }, + { + "epoch": 10.14136413641364, + "grad_norm": 1.4241186380386353, + "learning_rate": 2.8732972868979514e-05, + "loss": 0.0385, + "num_input_tokens_seen": 19456992, + "step": 92185 + }, + { + "epoch": 10.141914191419142, + "grad_norm": 5.441954135894775, + "learning_rate": 2.8730599690999087e-05, + "loss": 0.2144, + "num_input_tokens_seen": 19457984, + "step": 92190 + }, + { + "epoch": 10.142464246424643, + "grad_norm": 0.024123935028910637, + "learning_rate": 2.872822647863551e-05, + "loss": 0.0319, + "num_input_tokens_seen": 19459040, + "step": 92195 + }, + { + "epoch": 10.143014301430142, + "grad_norm": 2.856271982192993, + "learning_rate": 2.8725853231910672e-05, + "loss": 0.0432, + "num_input_tokens_seen": 19460032, + "step": 92200 + }, + { + "epoch": 10.143564356435643, + "grad_norm": 0.12166314572095871, + "learning_rate": 2.8723479950846423e-05, + "loss": 0.0051, + "num_input_tokens_seen": 19461056, + "step": 92205 + }, + { + "epoch": 10.144114411441144, + "grad_norm": 0.010799840092658997, + "learning_rate": 2.872110663546464e-05, + "loss": 0.0562, + "num_input_tokens_seen": 19462112, + "step": 92210 + }, + { + "epoch": 10.144664466446645, + "grad_norm": 0.022190043702721596, + "learning_rate": 2.871873328578721e-05, + "loss": 0.0081, + "num_input_tokens_seen": 19463200, + "step": 92215 + }, + { + "epoch": 10.145214521452145, + "grad_norm": 0.11248622834682465, + "learning_rate": 2.8716359901835992e-05, + "loss": 0.0164, + "num_input_tokens_seen": 19464224, + "step": 92220 + }, + { + "epoch": 10.145764576457646, + "grad_norm": 0.07304811477661133, + "learning_rate": 2.8713986483632877e-05, + "loss": 0.0011, + "num_input_tokens_seen": 19465280, + "step": 92225 + }, + { + "epoch": 10.146314631463147, + "grad_norm": 0.012452645227313042, + "learning_rate": 2.871161303119973e-05, + "loss": 0.0013, + "num_input_tokens_seen": 19466368, + "step": 92230 + }, + { + "epoch": 10.146864686468646, + "grad_norm": 0.11443034559488297, + "learning_rate": 2.8709239544558427e-05, + "loss": 0.0244, + "num_input_tokens_seen": 19467456, + "step": 92235 + }, + { + "epoch": 10.147414741474147, + "grad_norm": 0.30004847049713135, + "learning_rate": 2.8706866023730832e-05, + "loss": 0.0297, + "num_input_tokens_seen": 19468512, + "step": 92240 + }, + { + "epoch": 10.147964796479648, + "grad_norm": 0.04495393857359886, + "learning_rate": 2.870449246873885e-05, + "loss": 0.0145, + "num_input_tokens_seen": 19469504, + "step": 92245 + }, + { + "epoch": 10.148514851485148, + "grad_norm": 0.010852478444576263, + "learning_rate": 2.8702118879604322e-05, + "loss": 0.0419, + "num_input_tokens_seen": 19470560, + "step": 92250 + }, + { + "epoch": 10.149064906490649, + "grad_norm": 0.01954936794936657, + "learning_rate": 2.8699745256349147e-05, + "loss": 0.0035, + "num_input_tokens_seen": 19471552, + "step": 92255 + }, + { + "epoch": 10.14961496149615, + "grad_norm": 0.008990366943180561, + "learning_rate": 2.86973715989952e-05, + "loss": 0.003, + "num_input_tokens_seen": 19472608, + "step": 92260 + }, + { + "epoch": 10.150165016501651, + "grad_norm": 7.12005090713501, + "learning_rate": 2.8694997907564348e-05, + "loss": 0.0209, + "num_input_tokens_seen": 19473632, + "step": 92265 + }, + { + "epoch": 10.15071507150715, + "grad_norm": 0.008680872619152069, + "learning_rate": 2.8692624182078476e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19474592, + "step": 92270 + }, + { + "epoch": 10.151265126512651, + "grad_norm": 0.04656315967440605, + "learning_rate": 2.8690250422559457e-05, + "loss": 0.0308, + "num_input_tokens_seen": 19475616, + "step": 92275 + }, + { + "epoch": 10.151815181518153, + "grad_norm": 0.0034792274236679077, + "learning_rate": 2.8687876629029165e-05, + "loss": 0.0809, + "num_input_tokens_seen": 19476672, + "step": 92280 + }, + { + "epoch": 10.152365236523652, + "grad_norm": 1.832500696182251, + "learning_rate": 2.868550280150949e-05, + "loss": 0.084, + "num_input_tokens_seen": 19477760, + "step": 92285 + }, + { + "epoch": 10.152915291529153, + "grad_norm": 0.012686556205153465, + "learning_rate": 2.86831289400223e-05, + "loss": 0.0013, + "num_input_tokens_seen": 19478816, + "step": 92290 + }, + { + "epoch": 10.153465346534654, + "grad_norm": 0.0034696145448833704, + "learning_rate": 2.8680755044589487e-05, + "loss": 0.0047, + "num_input_tokens_seen": 19479840, + "step": 92295 + }, + { + "epoch": 10.154015401540153, + "grad_norm": 0.37543803453445435, + "learning_rate": 2.8678381115232918e-05, + "loss": 0.0331, + "num_input_tokens_seen": 19480928, + "step": 92300 + }, + { + "epoch": 10.154565456545654, + "grad_norm": 0.16380798816680908, + "learning_rate": 2.8676007151974472e-05, + "loss": 0.0044, + "num_input_tokens_seen": 19481984, + "step": 92305 + }, + { + "epoch": 10.155115511551156, + "grad_norm": 1.027623176574707, + "learning_rate": 2.8673633154836027e-05, + "loss": 0.0466, + "num_input_tokens_seen": 19483072, + "step": 92310 + }, + { + "epoch": 10.155665566556655, + "grad_norm": 2.206294298171997, + "learning_rate": 2.8671259123839472e-05, + "loss": 0.3185, + "num_input_tokens_seen": 19484096, + "step": 92315 + }, + { + "epoch": 10.156215621562156, + "grad_norm": 0.017900273203849792, + "learning_rate": 2.866888505900669e-05, + "loss": 0.0048, + "num_input_tokens_seen": 19485216, + "step": 92320 + }, + { + "epoch": 10.156765676567657, + "grad_norm": 0.02083383873105049, + "learning_rate": 2.866651096035955e-05, + "loss": 0.0331, + "num_input_tokens_seen": 19486208, + "step": 92325 + }, + { + "epoch": 10.157315731573158, + "grad_norm": 0.6090267300605774, + "learning_rate": 2.8664136827919934e-05, + "loss": 0.1328, + "num_input_tokens_seen": 19487328, + "step": 92330 + }, + { + "epoch": 10.157865786578657, + "grad_norm": 0.16660141944885254, + "learning_rate": 2.8661762661709735e-05, + "loss": 0.08, + "num_input_tokens_seen": 19488416, + "step": 92335 + }, + { + "epoch": 10.158415841584159, + "grad_norm": 0.036546289920806885, + "learning_rate": 2.8659388461750815e-05, + "loss": 0.025, + "num_input_tokens_seen": 19489408, + "step": 92340 + }, + { + "epoch": 10.15896589658966, + "grad_norm": 0.035433899611234665, + "learning_rate": 2.8657014228065083e-05, + "loss": 0.0152, + "num_input_tokens_seen": 19490496, + "step": 92345 + }, + { + "epoch": 10.159515951595159, + "grad_norm": 3.0723893642425537, + "learning_rate": 2.8654639960674396e-05, + "loss": 0.094, + "num_input_tokens_seen": 19491648, + "step": 92350 + }, + { + "epoch": 10.16006600660066, + "grad_norm": 0.012876790948212147, + "learning_rate": 2.865226565960064e-05, + "loss": 0.0091, + "num_input_tokens_seen": 19492672, + "step": 92355 + }, + { + "epoch": 10.160616061606161, + "grad_norm": 0.016201738268136978, + "learning_rate": 2.8649891324865718e-05, + "loss": 0.0219, + "num_input_tokens_seen": 19493760, + "step": 92360 + }, + { + "epoch": 10.16116611661166, + "grad_norm": 0.011724975891411304, + "learning_rate": 2.8647516956491495e-05, + "loss": 0.0323, + "num_input_tokens_seen": 19494784, + "step": 92365 + }, + { + "epoch": 10.161716171617162, + "grad_norm": 0.024381263181567192, + "learning_rate": 2.8645142554499856e-05, + "loss": 0.0226, + "num_input_tokens_seen": 19495776, + "step": 92370 + }, + { + "epoch": 10.162266226622663, + "grad_norm": 0.03775171563029289, + "learning_rate": 2.8642768118912684e-05, + "loss": 0.0034, + "num_input_tokens_seen": 19496800, + "step": 92375 + }, + { + "epoch": 10.162816281628162, + "grad_norm": 0.03268897160887718, + "learning_rate": 2.8640393649751874e-05, + "loss": 0.0226, + "num_input_tokens_seen": 19497824, + "step": 92380 + }, + { + "epoch": 10.163366336633663, + "grad_norm": 0.006615966558456421, + "learning_rate": 2.863801914703929e-05, + "loss": 0.0089, + "num_input_tokens_seen": 19498912, + "step": 92385 + }, + { + "epoch": 10.163916391639164, + "grad_norm": 0.38863039016723633, + "learning_rate": 2.8635644610796842e-05, + "loss": 0.0148, + "num_input_tokens_seen": 19499968, + "step": 92390 + }, + { + "epoch": 10.164466446644665, + "grad_norm": 0.0126056969165802, + "learning_rate": 2.86332700410464e-05, + "loss": 0.0609, + "num_input_tokens_seen": 19500992, + "step": 92395 + }, + { + "epoch": 10.165016501650165, + "grad_norm": 0.009228155948221684, + "learning_rate": 2.8630895437809847e-05, + "loss": 0.0178, + "num_input_tokens_seen": 19502048, + "step": 92400 + }, + { + "epoch": 10.165566556655666, + "grad_norm": 0.06964867562055588, + "learning_rate": 2.862852080110907e-05, + "loss": 0.0236, + "num_input_tokens_seen": 19503040, + "step": 92405 + }, + { + "epoch": 10.166116611661167, + "grad_norm": 0.004963158629834652, + "learning_rate": 2.862614613096597e-05, + "loss": 0.0183, + "num_input_tokens_seen": 19504032, + "step": 92410 + }, + { + "epoch": 10.166666666666666, + "grad_norm": 0.0072031510062515736, + "learning_rate": 2.8623771427402413e-05, + "loss": 0.0221, + "num_input_tokens_seen": 19505088, + "step": 92415 + }, + { + "epoch": 10.167216721672167, + "grad_norm": 0.34217163920402527, + "learning_rate": 2.8621396690440295e-05, + "loss": 0.0526, + "num_input_tokens_seen": 19506112, + "step": 92420 + }, + { + "epoch": 10.167766776677668, + "grad_norm": 0.1314888298511505, + "learning_rate": 2.86190219201015e-05, + "loss": 0.0149, + "num_input_tokens_seen": 19507104, + "step": 92425 + }, + { + "epoch": 10.168316831683168, + "grad_norm": 1.6645575761795044, + "learning_rate": 2.8616647116407912e-05, + "loss": 0.1241, + "num_input_tokens_seen": 19508192, + "step": 92430 + }, + { + "epoch": 10.168866886688669, + "grad_norm": 0.058508723974227905, + "learning_rate": 2.8614272279381432e-05, + "loss": 0.0111, + "num_input_tokens_seen": 19509248, + "step": 92435 + }, + { + "epoch": 10.16941694169417, + "grad_norm": 0.03873329982161522, + "learning_rate": 2.8611897409043935e-05, + "loss": 0.0063, + "num_input_tokens_seen": 19510304, + "step": 92440 + }, + { + "epoch": 10.16996699669967, + "grad_norm": 0.5999678373336792, + "learning_rate": 2.8609522505417314e-05, + "loss": 0.008, + "num_input_tokens_seen": 19511360, + "step": 92445 + }, + { + "epoch": 10.17051705170517, + "grad_norm": 0.006284617353230715, + "learning_rate": 2.860714756852345e-05, + "loss": 0.1413, + "num_input_tokens_seen": 19512320, + "step": 92450 + }, + { + "epoch": 10.171067106710671, + "grad_norm": 0.016424031928181648, + "learning_rate": 2.8604772598384248e-05, + "loss": 0.002, + "num_input_tokens_seen": 19513312, + "step": 92455 + }, + { + "epoch": 10.171617161716172, + "grad_norm": 0.027216825634241104, + "learning_rate": 2.8602397595021578e-05, + "loss": 0.0038, + "num_input_tokens_seen": 19514304, + "step": 92460 + }, + { + "epoch": 10.172167216721672, + "grad_norm": 0.005916578695178032, + "learning_rate": 2.8600022558457344e-05, + "loss": 0.004, + "num_input_tokens_seen": 19515392, + "step": 92465 + }, + { + "epoch": 10.172717271727173, + "grad_norm": 0.02721344865858555, + "learning_rate": 2.8597647488713424e-05, + "loss": 0.0452, + "num_input_tokens_seen": 19516448, + "step": 92470 + }, + { + "epoch": 10.173267326732674, + "grad_norm": 0.063257597386837, + "learning_rate": 2.8595272385811717e-05, + "loss": 0.0178, + "num_input_tokens_seen": 19517504, + "step": 92475 + }, + { + "epoch": 10.173817381738173, + "grad_norm": 0.05164545774459839, + "learning_rate": 2.859289724977411e-05, + "loss": 0.0071, + "num_input_tokens_seen": 19518592, + "step": 92480 + }, + { + "epoch": 10.174367436743674, + "grad_norm": 0.00949556939303875, + "learning_rate": 2.8590522080622494e-05, + "loss": 0.0166, + "num_input_tokens_seen": 19519648, + "step": 92485 + }, + { + "epoch": 10.174917491749175, + "grad_norm": 0.010911557823419571, + "learning_rate": 2.8588146878378757e-05, + "loss": 0.0067, + "num_input_tokens_seen": 19520704, + "step": 92490 + }, + { + "epoch": 10.175467546754675, + "grad_norm": 0.04493588209152222, + "learning_rate": 2.8585771643064792e-05, + "loss": 0.0021, + "num_input_tokens_seen": 19521728, + "step": 92495 + }, + { + "epoch": 10.176017601760176, + "grad_norm": 0.23759996891021729, + "learning_rate": 2.8583396374702487e-05, + "loss": 0.0405, + "num_input_tokens_seen": 19522784, + "step": 92500 + }, + { + "epoch": 10.176567656765677, + "grad_norm": 0.005752300377935171, + "learning_rate": 2.858102107331374e-05, + "loss": 0.0125, + "num_input_tokens_seen": 19523872, + "step": 92505 + }, + { + "epoch": 10.177117711771178, + "grad_norm": 0.01605519838631153, + "learning_rate": 2.8578645738920444e-05, + "loss": 0.0936, + "num_input_tokens_seen": 19524960, + "step": 92510 + }, + { + "epoch": 10.177667766776677, + "grad_norm": 0.079420305788517, + "learning_rate": 2.8576270371544484e-05, + "loss": 0.0063, + "num_input_tokens_seen": 19526048, + "step": 92515 + }, + { + "epoch": 10.178217821782178, + "grad_norm": 0.012043542228639126, + "learning_rate": 2.8573894971207755e-05, + "loss": 0.062, + "num_input_tokens_seen": 19527104, + "step": 92520 + }, + { + "epoch": 10.17876787678768, + "grad_norm": 1.6251564025878906, + "learning_rate": 2.857151953793215e-05, + "loss": 0.1549, + "num_input_tokens_seen": 19528160, + "step": 92525 + }, + { + "epoch": 10.179317931793179, + "grad_norm": 0.04254487156867981, + "learning_rate": 2.856914407173956e-05, + "loss": 0.0678, + "num_input_tokens_seen": 19529152, + "step": 92530 + }, + { + "epoch": 10.17986798679868, + "grad_norm": 0.0169895701110363, + "learning_rate": 2.8566768572651896e-05, + "loss": 0.0026, + "num_input_tokens_seen": 19530176, + "step": 92535 + }, + { + "epoch": 10.180418041804181, + "grad_norm": 0.020875822752714157, + "learning_rate": 2.8564393040691027e-05, + "loss": 0.0816, + "num_input_tokens_seen": 19531232, + "step": 92540 + }, + { + "epoch": 10.18096809680968, + "grad_norm": 0.009696463122963905, + "learning_rate": 2.8562017475878856e-05, + "loss": 0.0047, + "num_input_tokens_seen": 19532288, + "step": 92545 + }, + { + "epoch": 10.181518151815181, + "grad_norm": 0.792359471321106, + "learning_rate": 2.855964187823728e-05, + "loss": 0.0222, + "num_input_tokens_seen": 19533280, + "step": 92550 + }, + { + "epoch": 10.182068206820682, + "grad_norm": 0.920609176158905, + "learning_rate": 2.85572662477882e-05, + "loss": 0.0845, + "num_input_tokens_seen": 19534400, + "step": 92555 + }, + { + "epoch": 10.182618261826182, + "grad_norm": 0.21486347913742065, + "learning_rate": 2.8554890584553496e-05, + "loss": 0.1129, + "num_input_tokens_seen": 19535424, + "step": 92560 + }, + { + "epoch": 10.183168316831683, + "grad_norm": 2.9161057472229004, + "learning_rate": 2.8552514888555078e-05, + "loss": 0.0528, + "num_input_tokens_seen": 19536448, + "step": 92565 + }, + { + "epoch": 10.183718371837184, + "grad_norm": 0.8215487003326416, + "learning_rate": 2.8550139159814832e-05, + "loss": 0.0241, + "num_input_tokens_seen": 19537536, + "step": 92570 + }, + { + "epoch": 10.184268426842685, + "grad_norm": 0.016686512157320976, + "learning_rate": 2.854776339835465e-05, + "loss": 0.0069, + "num_input_tokens_seen": 19538624, + "step": 92575 + }, + { + "epoch": 10.184818481848184, + "grad_norm": 1.2488019466400146, + "learning_rate": 2.854538760419645e-05, + "loss": 0.0847, + "num_input_tokens_seen": 19539680, + "step": 92580 + }, + { + "epoch": 10.185368536853685, + "grad_norm": 0.07149483263492584, + "learning_rate": 2.85430117773621e-05, + "loss": 0.0078, + "num_input_tokens_seen": 19540704, + "step": 92585 + }, + { + "epoch": 10.185918591859187, + "grad_norm": 0.11776528507471085, + "learning_rate": 2.8540635917873516e-05, + "loss": 0.0064, + "num_input_tokens_seen": 19541760, + "step": 92590 + }, + { + "epoch": 10.186468646864686, + "grad_norm": 0.4253326952457428, + "learning_rate": 2.8538260025752588e-05, + "loss": 0.0133, + "num_input_tokens_seen": 19542880, + "step": 92595 + }, + { + "epoch": 10.187018701870187, + "grad_norm": 0.01508129108697176, + "learning_rate": 2.853588410102122e-05, + "loss": 0.003, + "num_input_tokens_seen": 19543872, + "step": 92600 + }, + { + "epoch": 10.187568756875688, + "grad_norm": 1.4814996719360352, + "learning_rate": 2.85335081437013e-05, + "loss": 0.0933, + "num_input_tokens_seen": 19544928, + "step": 92605 + }, + { + "epoch": 10.188118811881187, + "grad_norm": 0.02467518486082554, + "learning_rate": 2.8531132153814732e-05, + "loss": 0.0382, + "num_input_tokens_seen": 19545920, + "step": 92610 + }, + { + "epoch": 10.188668866886688, + "grad_norm": 0.036826178431510925, + "learning_rate": 2.852875613138341e-05, + "loss": 0.0575, + "num_input_tokens_seen": 19546944, + "step": 92615 + }, + { + "epoch": 10.18921892189219, + "grad_norm": 0.09294209629297256, + "learning_rate": 2.8526380076429244e-05, + "loss": 0.0474, + "num_input_tokens_seen": 19548032, + "step": 92620 + }, + { + "epoch": 10.189768976897689, + "grad_norm": 0.011352640576660633, + "learning_rate": 2.852400398897412e-05, + "loss": 0.1688, + "num_input_tokens_seen": 19549120, + "step": 92625 + }, + { + "epoch": 10.19031903190319, + "grad_norm": 0.8909060955047607, + "learning_rate": 2.852162786903994e-05, + "loss": 0.0221, + "num_input_tokens_seen": 19550144, + "step": 92630 + }, + { + "epoch": 10.190869086908691, + "grad_norm": 0.0841132178902626, + "learning_rate": 2.851925171664861e-05, + "loss": 0.0143, + "num_input_tokens_seen": 19551232, + "step": 92635 + }, + { + "epoch": 10.191419141914192, + "grad_norm": 0.007457068655639887, + "learning_rate": 2.851687553182203e-05, + "loss": 0.0141, + "num_input_tokens_seen": 19552224, + "step": 92640 + }, + { + "epoch": 10.191969196919691, + "grad_norm": 0.01287812553346157, + "learning_rate": 2.8514499314582084e-05, + "loss": 0.0059, + "num_input_tokens_seen": 19553248, + "step": 92645 + }, + { + "epoch": 10.192519251925193, + "grad_norm": 0.05421096831560135, + "learning_rate": 2.8512123064950697e-05, + "loss": 0.0703, + "num_input_tokens_seen": 19554272, + "step": 92650 + }, + { + "epoch": 10.193069306930694, + "grad_norm": 0.02264845371246338, + "learning_rate": 2.8509746782949746e-05, + "loss": 0.0074, + "num_input_tokens_seen": 19555328, + "step": 92655 + }, + { + "epoch": 10.193619361936193, + "grad_norm": 1.2339370250701904, + "learning_rate": 2.8507370468601146e-05, + "loss": 0.0612, + "num_input_tokens_seen": 19556384, + "step": 92660 + }, + { + "epoch": 10.194169416941694, + "grad_norm": 1.6331627368927002, + "learning_rate": 2.8504994121926797e-05, + "loss": 0.0462, + "num_input_tokens_seen": 19557408, + "step": 92665 + }, + { + "epoch": 10.194719471947195, + "grad_norm": 0.0760553628206253, + "learning_rate": 2.85026177429486e-05, + "loss": 0.0222, + "num_input_tokens_seen": 19558432, + "step": 92670 + }, + { + "epoch": 10.195269526952695, + "grad_norm": 0.03990178182721138, + "learning_rate": 2.8500241331688454e-05, + "loss": 0.0279, + "num_input_tokens_seen": 19559552, + "step": 92675 + }, + { + "epoch": 10.195819581958196, + "grad_norm": 0.012404263950884342, + "learning_rate": 2.8497864888168264e-05, + "loss": 0.0486, + "num_input_tokens_seen": 19560672, + "step": 92680 + }, + { + "epoch": 10.196369636963697, + "grad_norm": 2.6528842449188232, + "learning_rate": 2.8495488412409933e-05, + "loss": 0.0695, + "num_input_tokens_seen": 19561792, + "step": 92685 + }, + { + "epoch": 10.196919691969198, + "grad_norm": 0.019225191324949265, + "learning_rate": 2.8493111904435354e-05, + "loss": 0.0105, + "num_input_tokens_seen": 19562848, + "step": 92690 + }, + { + "epoch": 10.197469746974697, + "grad_norm": 0.14851441979408264, + "learning_rate": 2.8490735364266452e-05, + "loss": 0.0171, + "num_input_tokens_seen": 19563936, + "step": 92695 + }, + { + "epoch": 10.198019801980198, + "grad_norm": 0.4102884829044342, + "learning_rate": 2.8488358791925107e-05, + "loss": 0.0771, + "num_input_tokens_seen": 19565024, + "step": 92700 + }, + { + "epoch": 10.1985698569857, + "grad_norm": 2.6432082653045654, + "learning_rate": 2.848598218743324e-05, + "loss": 0.0135, + "num_input_tokens_seen": 19566080, + "step": 92705 + }, + { + "epoch": 10.199119911991199, + "grad_norm": 0.2144952267408371, + "learning_rate": 2.848360555081275e-05, + "loss": 0.0113, + "num_input_tokens_seen": 19567136, + "step": 92710 + }, + { + "epoch": 10.1996699669967, + "grad_norm": 0.04834967479109764, + "learning_rate": 2.848122888208553e-05, + "loss": 0.0034, + "num_input_tokens_seen": 19568192, + "step": 92715 + }, + { + "epoch": 10.2002200220022, + "grad_norm": 0.04894554615020752, + "learning_rate": 2.8478852181273495e-05, + "loss": 0.0076, + "num_input_tokens_seen": 19569280, + "step": 92720 + }, + { + "epoch": 10.2007700770077, + "grad_norm": 4.432776927947998, + "learning_rate": 2.847647544839856e-05, + "loss": 0.0141, + "num_input_tokens_seen": 19570336, + "step": 92725 + }, + { + "epoch": 10.201320132013201, + "grad_norm": 0.16123715043067932, + "learning_rate": 2.84740986834826e-05, + "loss": 0.0222, + "num_input_tokens_seen": 19571328, + "step": 92730 + }, + { + "epoch": 10.201870187018702, + "grad_norm": 1.0760700702667236, + "learning_rate": 2.8471721886547554e-05, + "loss": 0.1933, + "num_input_tokens_seen": 19572352, + "step": 92735 + }, + { + "epoch": 10.202420242024202, + "grad_norm": 0.0053856451995670795, + "learning_rate": 2.8469345057615308e-05, + "loss": 0.0048, + "num_input_tokens_seen": 19573440, + "step": 92740 + }, + { + "epoch": 10.202970297029703, + "grad_norm": 1.0647058486938477, + "learning_rate": 2.846696819670778e-05, + "loss": 0.0114, + "num_input_tokens_seen": 19574432, + "step": 92745 + }, + { + "epoch": 10.203520352035204, + "grad_norm": 0.08513697981834412, + "learning_rate": 2.8464591303846867e-05, + "loss": 0.0096, + "num_input_tokens_seen": 19575488, + "step": 92750 + }, + { + "epoch": 10.204070407040705, + "grad_norm": 0.1967318058013916, + "learning_rate": 2.8462214379054476e-05, + "loss": 0.0329, + "num_input_tokens_seen": 19576512, + "step": 92755 + }, + { + "epoch": 10.204620462046204, + "grad_norm": 0.0781189426779747, + "learning_rate": 2.8459837422352507e-05, + "loss": 0.0096, + "num_input_tokens_seen": 19577600, + "step": 92760 + }, + { + "epoch": 10.205170517051705, + "grad_norm": 0.003003399120643735, + "learning_rate": 2.845746043376289e-05, + "loss": 0.0062, + "num_input_tokens_seen": 19578592, + "step": 92765 + }, + { + "epoch": 10.205720572057206, + "grad_norm": 0.1930662840604782, + "learning_rate": 2.845508341330752e-05, + "loss": 0.0082, + "num_input_tokens_seen": 19579680, + "step": 92770 + }, + { + "epoch": 10.206270627062706, + "grad_norm": 0.009853585623204708, + "learning_rate": 2.8452706361008298e-05, + "loss": 0.0036, + "num_input_tokens_seen": 19580704, + "step": 92775 + }, + { + "epoch": 10.206820682068207, + "grad_norm": 1.4975991249084473, + "learning_rate": 2.845032927688714e-05, + "loss": 0.053, + "num_input_tokens_seen": 19581696, + "step": 92780 + }, + { + "epoch": 10.207370737073708, + "grad_norm": 0.03588802367448807, + "learning_rate": 2.844795216096595e-05, + "loss": 0.0207, + "num_input_tokens_seen": 19582752, + "step": 92785 + }, + { + "epoch": 10.207920792079207, + "grad_norm": 0.009299758821725845, + "learning_rate": 2.844557501326664e-05, + "loss": 0.0286, + "num_input_tokens_seen": 19583840, + "step": 92790 + }, + { + "epoch": 10.208470847084708, + "grad_norm": 0.008417396806180477, + "learning_rate": 2.8443197833811126e-05, + "loss": 0.014, + "num_input_tokens_seen": 19584832, + "step": 92795 + }, + { + "epoch": 10.20902090209021, + "grad_norm": 0.0484846755862236, + "learning_rate": 2.8440820622621307e-05, + "loss": 0.1328, + "num_input_tokens_seen": 19585920, + "step": 92800 + }, + { + "epoch": 10.209570957095709, + "grad_norm": 0.18723954260349274, + "learning_rate": 2.8438443379719086e-05, + "loss": 0.0482, + "num_input_tokens_seen": 19586976, + "step": 92805 + }, + { + "epoch": 10.21012101210121, + "grad_norm": 0.515440046787262, + "learning_rate": 2.843606610512639e-05, + "loss": 0.0162, + "num_input_tokens_seen": 19588096, + "step": 92810 + }, + { + "epoch": 10.210671067106711, + "grad_norm": 0.2887592017650604, + "learning_rate": 2.843368879886512e-05, + "loss": 0.0053, + "num_input_tokens_seen": 19589088, + "step": 92815 + }, + { + "epoch": 10.211221122112212, + "grad_norm": 0.11878709495067596, + "learning_rate": 2.843131146095719e-05, + "loss": 0.0117, + "num_input_tokens_seen": 19590112, + "step": 92820 + }, + { + "epoch": 10.211771177117711, + "grad_norm": 0.028346562758088112, + "learning_rate": 2.842893409142451e-05, + "loss": 0.026, + "num_input_tokens_seen": 19591168, + "step": 92825 + }, + { + "epoch": 10.212321232123212, + "grad_norm": 0.025047317147254944, + "learning_rate": 2.8426556690288985e-05, + "loss": 0.0095, + "num_input_tokens_seen": 19592192, + "step": 92830 + }, + { + "epoch": 10.212871287128714, + "grad_norm": 0.016746507957577705, + "learning_rate": 2.8424179257572525e-05, + "loss": 0.0078, + "num_input_tokens_seen": 19593184, + "step": 92835 + }, + { + "epoch": 10.213421342134213, + "grad_norm": 0.4094189405441284, + "learning_rate": 2.8421801793297064e-05, + "loss": 0.0217, + "num_input_tokens_seen": 19594272, + "step": 92840 + }, + { + "epoch": 10.213971397139714, + "grad_norm": 0.009035796858370304, + "learning_rate": 2.8419424297484493e-05, + "loss": 0.0165, + "num_input_tokens_seen": 19595232, + "step": 92845 + }, + { + "epoch": 10.214521452145215, + "grad_norm": 0.00850194226950407, + "learning_rate": 2.841704677015672e-05, + "loss": 0.0375, + "num_input_tokens_seen": 19596256, + "step": 92850 + }, + { + "epoch": 10.215071507150714, + "grad_norm": 0.963430643081665, + "learning_rate": 2.8414669211335672e-05, + "loss": 0.138, + "num_input_tokens_seen": 19597280, + "step": 92855 + }, + { + "epoch": 10.215621562156215, + "grad_norm": 0.13693983852863312, + "learning_rate": 2.8412291621043257e-05, + "loss": 0.0975, + "num_input_tokens_seen": 19598336, + "step": 92860 + }, + { + "epoch": 10.216171617161717, + "grad_norm": 1.376335859298706, + "learning_rate": 2.840991399930139e-05, + "loss": 0.0994, + "num_input_tokens_seen": 19599360, + "step": 92865 + }, + { + "epoch": 10.216721672167218, + "grad_norm": 0.17837810516357422, + "learning_rate": 2.840753634613198e-05, + "loss": 0.0078, + "num_input_tokens_seen": 19600384, + "step": 92870 + }, + { + "epoch": 10.217271727172717, + "grad_norm": 0.9021526575088501, + "learning_rate": 2.8405158661556942e-05, + "loss": 0.0156, + "num_input_tokens_seen": 19601408, + "step": 92875 + }, + { + "epoch": 10.217821782178218, + "grad_norm": 1.5771375894546509, + "learning_rate": 2.8402780945598184e-05, + "loss": 0.1319, + "num_input_tokens_seen": 19602464, + "step": 92880 + }, + { + "epoch": 10.218371837183719, + "grad_norm": 0.02520057186484337, + "learning_rate": 2.8400403198277638e-05, + "loss": 0.0116, + "num_input_tokens_seen": 19603456, + "step": 92885 + }, + { + "epoch": 10.218921892189218, + "grad_norm": 1.3871575593948364, + "learning_rate": 2.83980254196172e-05, + "loss": 0.0344, + "num_input_tokens_seen": 19604576, + "step": 92890 + }, + { + "epoch": 10.21947194719472, + "grad_norm": 1.0964250564575195, + "learning_rate": 2.83956476096388e-05, + "loss": 0.0253, + "num_input_tokens_seen": 19605696, + "step": 92895 + }, + { + "epoch": 10.22002200220022, + "grad_norm": 0.09180907160043716, + "learning_rate": 2.839326976836434e-05, + "loss": 0.0037, + "num_input_tokens_seen": 19606816, + "step": 92900 + }, + { + "epoch": 10.22057205720572, + "grad_norm": 0.07884731143712997, + "learning_rate": 2.839089189581574e-05, + "loss": 0.0323, + "num_input_tokens_seen": 19607904, + "step": 92905 + }, + { + "epoch": 10.221122112211221, + "grad_norm": 0.04271445423364639, + "learning_rate": 2.8388513992014908e-05, + "loss": 0.021, + "num_input_tokens_seen": 19608928, + "step": 92910 + }, + { + "epoch": 10.221672167216722, + "grad_norm": 0.5306903719902039, + "learning_rate": 2.8386136056983787e-05, + "loss": 0.0048, + "num_input_tokens_seen": 19609920, + "step": 92915 + }, + { + "epoch": 10.222222222222221, + "grad_norm": 1.3412957191467285, + "learning_rate": 2.8383758090744262e-05, + "loss": 0.0489, + "num_input_tokens_seen": 19611008, + "step": 92920 + }, + { + "epoch": 10.222772277227723, + "grad_norm": 0.20492076873779297, + "learning_rate": 2.8381380093318265e-05, + "loss": 0.0162, + "num_input_tokens_seen": 19612064, + "step": 92925 + }, + { + "epoch": 10.223322332233224, + "grad_norm": 0.1198941022157669, + "learning_rate": 2.8379002064727713e-05, + "loss": 0.0463, + "num_input_tokens_seen": 19613120, + "step": 92930 + }, + { + "epoch": 10.223872387238725, + "grad_norm": 1.286637544631958, + "learning_rate": 2.8376624004994517e-05, + "loss": 0.0427, + "num_input_tokens_seen": 19614208, + "step": 92935 + }, + { + "epoch": 10.224422442244224, + "grad_norm": 0.005062439478933811, + "learning_rate": 2.8374245914140603e-05, + "loss": 0.0045, + "num_input_tokens_seen": 19615264, + "step": 92940 + }, + { + "epoch": 10.224972497249725, + "grad_norm": 0.008268107660114765, + "learning_rate": 2.8371867792187877e-05, + "loss": 0.022, + "num_input_tokens_seen": 19616320, + "step": 92945 + }, + { + "epoch": 10.225522552255226, + "grad_norm": 2.5148091316223145, + "learning_rate": 2.8369489639158265e-05, + "loss": 0.1167, + "num_input_tokens_seen": 19617408, + "step": 92950 + }, + { + "epoch": 10.226072607260726, + "grad_norm": 0.043284062296152115, + "learning_rate": 2.8367111455073687e-05, + "loss": 0.0796, + "num_input_tokens_seen": 19618528, + "step": 92955 + }, + { + "epoch": 10.226622662266227, + "grad_norm": 0.09986208379268646, + "learning_rate": 2.8364733239956065e-05, + "loss": 0.127, + "num_input_tokens_seen": 19619552, + "step": 92960 + }, + { + "epoch": 10.227172717271728, + "grad_norm": 0.002643629675731063, + "learning_rate": 2.8362354993827296e-05, + "loss": 0.1015, + "num_input_tokens_seen": 19620608, + "step": 92965 + }, + { + "epoch": 10.227722772277227, + "grad_norm": 0.023317014798521996, + "learning_rate": 2.8359976716709326e-05, + "loss": 0.0795, + "num_input_tokens_seen": 19621728, + "step": 92970 + }, + { + "epoch": 10.228272827282728, + "grad_norm": 0.1663651019334793, + "learning_rate": 2.8357598408624054e-05, + "loss": 0.0484, + "num_input_tokens_seen": 19622816, + "step": 92975 + }, + { + "epoch": 10.22882288228823, + "grad_norm": 0.4189523458480835, + "learning_rate": 2.8355220069593407e-05, + "loss": 0.0075, + "num_input_tokens_seen": 19623872, + "step": 92980 + }, + { + "epoch": 10.229372937293729, + "grad_norm": 0.3424599766731262, + "learning_rate": 2.835284169963932e-05, + "loss": 0.0153, + "num_input_tokens_seen": 19624960, + "step": 92985 + }, + { + "epoch": 10.22992299229923, + "grad_norm": 0.7217649817466736, + "learning_rate": 2.8350463298783697e-05, + "loss": 0.0468, + "num_input_tokens_seen": 19626016, + "step": 92990 + }, + { + "epoch": 10.23047304730473, + "grad_norm": 0.0646558403968811, + "learning_rate": 2.834808486704845e-05, + "loss": 0.1187, + "num_input_tokens_seen": 19627072, + "step": 92995 + }, + { + "epoch": 10.231023102310232, + "grad_norm": 0.11385338753461838, + "learning_rate": 2.834570640445552e-05, + "loss": 0.0634, + "num_input_tokens_seen": 19628096, + "step": 93000 + }, + { + "epoch": 10.231573157315731, + "grad_norm": 0.021544910967350006, + "learning_rate": 2.834332791102682e-05, + "loss": 0.0061, + "num_input_tokens_seen": 19629152, + "step": 93005 + }, + { + "epoch": 10.232123212321232, + "grad_norm": 0.03262756019830704, + "learning_rate": 2.8340949386784265e-05, + "loss": 0.0612, + "num_input_tokens_seen": 19630176, + "step": 93010 + }, + { + "epoch": 10.232673267326733, + "grad_norm": 0.43893706798553467, + "learning_rate": 2.8338570831749787e-05, + "loss": 0.0306, + "num_input_tokens_seen": 19631200, + "step": 93015 + }, + { + "epoch": 10.233223322332233, + "grad_norm": 1.4289318323135376, + "learning_rate": 2.8336192245945303e-05, + "loss": 0.0314, + "num_input_tokens_seen": 19632192, + "step": 93020 + }, + { + "epoch": 10.233773377337734, + "grad_norm": 0.045847244560718536, + "learning_rate": 2.8333813629392736e-05, + "loss": 0.0057, + "num_input_tokens_seen": 19633216, + "step": 93025 + }, + { + "epoch": 10.234323432343235, + "grad_norm": 0.02567732334136963, + "learning_rate": 2.8331434982114012e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19634272, + "step": 93030 + }, + { + "epoch": 10.234873487348734, + "grad_norm": 0.19170121848583221, + "learning_rate": 2.832905630413104e-05, + "loss": 0.0136, + "num_input_tokens_seen": 19635296, + "step": 93035 + }, + { + "epoch": 10.235423542354235, + "grad_norm": 1.002575397491455, + "learning_rate": 2.8326677595465766e-05, + "loss": 0.0422, + "num_input_tokens_seen": 19636320, + "step": 93040 + }, + { + "epoch": 10.235973597359736, + "grad_norm": 0.29456010460853577, + "learning_rate": 2.8324298856140097e-05, + "loss": 0.0425, + "num_input_tokens_seen": 19637408, + "step": 93045 + }, + { + "epoch": 10.236523652365236, + "grad_norm": 0.03757892921566963, + "learning_rate": 2.832192008617596e-05, + "loss": 0.0127, + "num_input_tokens_seen": 19638464, + "step": 93050 + }, + { + "epoch": 10.237073707370737, + "grad_norm": 0.006901318207383156, + "learning_rate": 2.8319541285595276e-05, + "loss": 0.0376, + "num_input_tokens_seen": 19639456, + "step": 93055 + }, + { + "epoch": 10.237623762376238, + "grad_norm": 0.024935001507401466, + "learning_rate": 2.8317162454419976e-05, + "loss": 0.0261, + "num_input_tokens_seen": 19640512, + "step": 93060 + }, + { + "epoch": 10.238173817381739, + "grad_norm": 0.04532242566347122, + "learning_rate": 2.8314783592671978e-05, + "loss": 0.0053, + "num_input_tokens_seen": 19641504, + "step": 93065 + }, + { + "epoch": 10.238723872387238, + "grad_norm": 1.1619021892547607, + "learning_rate": 2.8312404700373212e-05, + "loss": 0.0718, + "num_input_tokens_seen": 19642528, + "step": 93070 + }, + { + "epoch": 10.23927392739274, + "grad_norm": 0.031570516526699066, + "learning_rate": 2.83100257775456e-05, + "loss": 0.0061, + "num_input_tokens_seen": 19643584, + "step": 93075 + }, + { + "epoch": 10.23982398239824, + "grad_norm": 0.052342068403959274, + "learning_rate": 2.8307646824211077e-05, + "loss": 0.0126, + "num_input_tokens_seen": 19644672, + "step": 93080 + }, + { + "epoch": 10.24037403740374, + "grad_norm": 0.0872914120554924, + "learning_rate": 2.8305267840391553e-05, + "loss": 0.0473, + "num_input_tokens_seen": 19645728, + "step": 93085 + }, + { + "epoch": 10.24092409240924, + "grad_norm": 0.005059897433966398, + "learning_rate": 2.8302888826108964e-05, + "loss": 0.0042, + "num_input_tokens_seen": 19646848, + "step": 93090 + }, + { + "epoch": 10.241474147414742, + "grad_norm": 1.3037109375, + "learning_rate": 2.8300509781385226e-05, + "loss": 0.0453, + "num_input_tokens_seen": 19647904, + "step": 93095 + }, + { + "epoch": 10.242024202420241, + "grad_norm": 0.029055044054985046, + "learning_rate": 2.8298130706242283e-05, + "loss": 0.0832, + "num_input_tokens_seen": 19648960, + "step": 93100 + }, + { + "epoch": 10.242574257425742, + "grad_norm": 0.013466587290167809, + "learning_rate": 2.829575160070205e-05, + "loss": 0.0262, + "num_input_tokens_seen": 19649984, + "step": 93105 + }, + { + "epoch": 10.243124312431243, + "grad_norm": 0.0501975417137146, + "learning_rate": 2.829337246478645e-05, + "loss": 0.0056, + "num_input_tokens_seen": 19651072, + "step": 93110 + }, + { + "epoch": 10.243674367436745, + "grad_norm": 1.4066411256790161, + "learning_rate": 2.829099329851742e-05, + "loss": 0.0194, + "num_input_tokens_seen": 19652064, + "step": 93115 + }, + { + "epoch": 10.244224422442244, + "grad_norm": 0.08326911926269531, + "learning_rate": 2.828861410191689e-05, + "loss": 0.0182, + "num_input_tokens_seen": 19653088, + "step": 93120 + }, + { + "epoch": 10.244774477447745, + "grad_norm": 0.7961015701293945, + "learning_rate": 2.8286234875006773e-05, + "loss": 0.0518, + "num_input_tokens_seen": 19654112, + "step": 93125 + }, + { + "epoch": 10.245324532453246, + "grad_norm": 0.007968859747052193, + "learning_rate": 2.8283855617809012e-05, + "loss": 0.005, + "num_input_tokens_seen": 19655200, + "step": 93130 + }, + { + "epoch": 10.245874587458745, + "grad_norm": 0.05215837433934212, + "learning_rate": 2.8281476330345524e-05, + "loss": 0.0079, + "num_input_tokens_seen": 19656256, + "step": 93135 + }, + { + "epoch": 10.246424642464246, + "grad_norm": 0.27618181705474854, + "learning_rate": 2.8279097012638243e-05, + "loss": 0.0692, + "num_input_tokens_seen": 19657344, + "step": 93140 + }, + { + "epoch": 10.246974697469748, + "grad_norm": 0.2074025422334671, + "learning_rate": 2.82767176647091e-05, + "loss": 0.0089, + "num_input_tokens_seen": 19658336, + "step": 93145 + }, + { + "epoch": 10.247524752475247, + "grad_norm": 0.8328800797462463, + "learning_rate": 2.8274338286580034e-05, + "loss": 0.0334, + "num_input_tokens_seen": 19659456, + "step": 93150 + }, + { + "epoch": 10.248074807480748, + "grad_norm": 0.26506784558296204, + "learning_rate": 2.8271958878272947e-05, + "loss": 0.0481, + "num_input_tokens_seen": 19660576, + "step": 93155 + }, + { + "epoch": 10.248624862486249, + "grad_norm": 0.24063724279403687, + "learning_rate": 2.826957943980979e-05, + "loss": 0.1452, + "num_input_tokens_seen": 19661600, + "step": 93160 + }, + { + "epoch": 10.249174917491748, + "grad_norm": 0.015563338994979858, + "learning_rate": 2.8267199971212494e-05, + "loss": 0.1073, + "num_input_tokens_seen": 19662656, + "step": 93165 + }, + { + "epoch": 10.24972497249725, + "grad_norm": 0.0293335672467947, + "learning_rate": 2.8264820472502973e-05, + "loss": 0.0083, + "num_input_tokens_seen": 19663776, + "step": 93170 + }, + { + "epoch": 10.25027502750275, + "grad_norm": 0.4964020252227783, + "learning_rate": 2.8262440943703183e-05, + "loss": 0.0591, + "num_input_tokens_seen": 19664896, + "step": 93175 + }, + { + "epoch": 10.250825082508252, + "grad_norm": 0.01144536305218935, + "learning_rate": 2.826006138483503e-05, + "loss": 0.0046, + "num_input_tokens_seen": 19665952, + "step": 93180 + }, + { + "epoch": 10.251375137513751, + "grad_norm": 1.0648550987243652, + "learning_rate": 2.8257681795920455e-05, + "loss": 0.0509, + "num_input_tokens_seen": 19667008, + "step": 93185 + }, + { + "epoch": 10.251925192519252, + "grad_norm": 0.9396637082099915, + "learning_rate": 2.825530217698139e-05, + "loss": 0.1282, + "num_input_tokens_seen": 19668096, + "step": 93190 + }, + { + "epoch": 10.252475247524753, + "grad_norm": 0.050043199211359024, + "learning_rate": 2.8252922528039772e-05, + "loss": 0.0031, + "num_input_tokens_seen": 19669184, + "step": 93195 + }, + { + "epoch": 10.253025302530252, + "grad_norm": 0.22237347066402435, + "learning_rate": 2.8250542849117524e-05, + "loss": 0.0065, + "num_input_tokens_seen": 19670208, + "step": 93200 + }, + { + "epoch": 10.253575357535754, + "grad_norm": 0.16577351093292236, + "learning_rate": 2.8248163140236584e-05, + "loss": 0.0509, + "num_input_tokens_seen": 19671264, + "step": 93205 + }, + { + "epoch": 10.254125412541255, + "grad_norm": 0.017906783148646355, + "learning_rate": 2.8245783401418885e-05, + "loss": 0.0087, + "num_input_tokens_seen": 19672224, + "step": 93210 + }, + { + "epoch": 10.254675467546754, + "grad_norm": 0.2435055822134018, + "learning_rate": 2.824340363268635e-05, + "loss": 0.0057, + "num_input_tokens_seen": 19673248, + "step": 93215 + }, + { + "epoch": 10.255225522552255, + "grad_norm": 0.046608876436948776, + "learning_rate": 2.824102383406093e-05, + "loss": 0.0646, + "num_input_tokens_seen": 19674304, + "step": 93220 + }, + { + "epoch": 10.255775577557756, + "grad_norm": 0.0041937329806387424, + "learning_rate": 2.823864400556454e-05, + "loss": 0.0193, + "num_input_tokens_seen": 19675360, + "step": 93225 + }, + { + "epoch": 10.256325632563255, + "grad_norm": 0.04145012050867081, + "learning_rate": 2.8236264147219128e-05, + "loss": 0.0058, + "num_input_tokens_seen": 19676352, + "step": 93230 + }, + { + "epoch": 10.256875687568757, + "grad_norm": 0.005685816984623671, + "learning_rate": 2.8233884259046616e-05, + "loss": 0.0024, + "num_input_tokens_seen": 19677408, + "step": 93235 + }, + { + "epoch": 10.257425742574258, + "grad_norm": 0.01810380071401596, + "learning_rate": 2.8231504341068953e-05, + "loss": 0.0208, + "num_input_tokens_seen": 19678528, + "step": 93240 + }, + { + "epoch": 10.257975797579759, + "grad_norm": 0.09788825362920761, + "learning_rate": 2.822912439330806e-05, + "loss": 0.0059, + "num_input_tokens_seen": 19679520, + "step": 93245 + }, + { + "epoch": 10.258525852585258, + "grad_norm": 0.8495599031448364, + "learning_rate": 2.8226744415785877e-05, + "loss": 0.0791, + "num_input_tokens_seen": 19680480, + "step": 93250 + }, + { + "epoch": 10.25907590759076, + "grad_norm": 0.02475433424115181, + "learning_rate": 2.822436440852434e-05, + "loss": 0.002, + "num_input_tokens_seen": 19681600, + "step": 93255 + }, + { + "epoch": 10.25962596259626, + "grad_norm": 0.011469417251646519, + "learning_rate": 2.8221984371545386e-05, + "loss": 0.0252, + "num_input_tokens_seen": 19682624, + "step": 93260 + }, + { + "epoch": 10.26017601760176, + "grad_norm": 0.056908588856458664, + "learning_rate": 2.8219604304870943e-05, + "loss": 0.006, + "num_input_tokens_seen": 19683616, + "step": 93265 + }, + { + "epoch": 10.26072607260726, + "grad_norm": 0.6933746337890625, + "learning_rate": 2.8217224208522958e-05, + "loss": 0.0976, + "num_input_tokens_seen": 19684672, + "step": 93270 + }, + { + "epoch": 10.261276127612762, + "grad_norm": 0.5117526650428772, + "learning_rate": 2.8214844082523357e-05, + "loss": 0.0449, + "num_input_tokens_seen": 19685664, + "step": 93275 + }, + { + "epoch": 10.261826182618261, + "grad_norm": 0.006196647882461548, + "learning_rate": 2.8212463926894083e-05, + "loss": 0.0591, + "num_input_tokens_seen": 19686720, + "step": 93280 + }, + { + "epoch": 10.262376237623762, + "grad_norm": 2.0229265689849854, + "learning_rate": 2.821008374165706e-05, + "loss": 0.0554, + "num_input_tokens_seen": 19687776, + "step": 93285 + }, + { + "epoch": 10.262926292629263, + "grad_norm": 0.48300856351852417, + "learning_rate": 2.820770352683425e-05, + "loss": 0.0129, + "num_input_tokens_seen": 19688896, + "step": 93290 + }, + { + "epoch": 10.263476347634764, + "grad_norm": 0.013093127869069576, + "learning_rate": 2.8205323282447577e-05, + "loss": 0.0021, + "num_input_tokens_seen": 19689952, + "step": 93295 + }, + { + "epoch": 10.264026402640264, + "grad_norm": 0.3492598235607147, + "learning_rate": 2.820294300851896e-05, + "loss": 0.0106, + "num_input_tokens_seen": 19691040, + "step": 93300 + }, + { + "epoch": 10.264576457645765, + "grad_norm": 0.014198865741491318, + "learning_rate": 2.8200562705070373e-05, + "loss": 0.0139, + "num_input_tokens_seen": 19692064, + "step": 93305 + }, + { + "epoch": 10.265126512651266, + "grad_norm": 0.011691131629049778, + "learning_rate": 2.8198182372123727e-05, + "loss": 0.0417, + "num_input_tokens_seen": 19693152, + "step": 93310 + }, + { + "epoch": 10.265676567656765, + "grad_norm": 0.1768934279680252, + "learning_rate": 2.8195802009700967e-05, + "loss": 0.011, + "num_input_tokens_seen": 19694208, + "step": 93315 + }, + { + "epoch": 10.266226622662266, + "grad_norm": 0.8893777132034302, + "learning_rate": 2.8193421617824034e-05, + "loss": 0.0873, + "num_input_tokens_seen": 19695328, + "step": 93320 + }, + { + "epoch": 10.266776677667767, + "grad_norm": 0.6230555176734924, + "learning_rate": 2.8191041196514873e-05, + "loss": 0.0105, + "num_input_tokens_seen": 19696448, + "step": 93325 + }, + { + "epoch": 10.267326732673267, + "grad_norm": 0.044561367481946945, + "learning_rate": 2.8188660745795404e-05, + "loss": 0.049, + "num_input_tokens_seen": 19697472, + "step": 93330 + }, + { + "epoch": 10.267876787678768, + "grad_norm": 0.2507302761077881, + "learning_rate": 2.8186280265687588e-05, + "loss": 0.0139, + "num_input_tokens_seen": 19698496, + "step": 93335 + }, + { + "epoch": 10.268426842684269, + "grad_norm": 0.021924864500761032, + "learning_rate": 2.8183899756213357e-05, + "loss": 0.0605, + "num_input_tokens_seen": 19699520, + "step": 93340 + }, + { + "epoch": 10.268976897689768, + "grad_norm": 0.024520279839634895, + "learning_rate": 2.818151921739464e-05, + "loss": 0.0031, + "num_input_tokens_seen": 19700544, + "step": 93345 + }, + { + "epoch": 10.26952695269527, + "grad_norm": 0.00771832512691617, + "learning_rate": 2.81791386492534e-05, + "loss": 0.0087, + "num_input_tokens_seen": 19701568, + "step": 93350 + }, + { + "epoch": 10.27007700770077, + "grad_norm": 0.044308748096227646, + "learning_rate": 2.8176758051811558e-05, + "loss": 0.0072, + "num_input_tokens_seen": 19702688, + "step": 93355 + }, + { + "epoch": 10.270627062706271, + "grad_norm": 0.07383272796869278, + "learning_rate": 2.8174377425091053e-05, + "loss": 0.0056, + "num_input_tokens_seen": 19703776, + "step": 93360 + }, + { + "epoch": 10.27117711771177, + "grad_norm": 0.003911799751222134, + "learning_rate": 2.817199676911385e-05, + "loss": 0.0018, + "num_input_tokens_seen": 19704800, + "step": 93365 + }, + { + "epoch": 10.271727172717272, + "grad_norm": 0.004689258988946676, + "learning_rate": 2.8169616083901862e-05, + "loss": 0.0387, + "num_input_tokens_seen": 19705888, + "step": 93370 + }, + { + "epoch": 10.272277227722773, + "grad_norm": 0.10815701633691788, + "learning_rate": 2.816723536947705e-05, + "loss": 0.0317, + "num_input_tokens_seen": 19707008, + "step": 93375 + }, + { + "epoch": 10.272827282728272, + "grad_norm": 1.5792583227157593, + "learning_rate": 2.8164854625861355e-05, + "loss": 0.0173, + "num_input_tokens_seen": 19708064, + "step": 93380 + }, + { + "epoch": 10.273377337733773, + "grad_norm": 1.5849943161010742, + "learning_rate": 2.8162473853076703e-05, + "loss": 0.0358, + "num_input_tokens_seen": 19709120, + "step": 93385 + }, + { + "epoch": 10.273927392739274, + "grad_norm": 0.01753336377441883, + "learning_rate": 2.8160093051145055e-05, + "loss": 0.0417, + "num_input_tokens_seen": 19710208, + "step": 93390 + }, + { + "epoch": 10.274477447744774, + "grad_norm": 0.3130689859390259, + "learning_rate": 2.8157712220088338e-05, + "loss": 0.0061, + "num_input_tokens_seen": 19711264, + "step": 93395 + }, + { + "epoch": 10.275027502750275, + "grad_norm": 0.15007738769054413, + "learning_rate": 2.8155331359928503e-05, + "loss": 0.0057, + "num_input_tokens_seen": 19712352, + "step": 93400 + }, + { + "epoch": 10.275577557755776, + "grad_norm": 0.02343134768307209, + "learning_rate": 2.81529504706875e-05, + "loss": 0.0581, + "num_input_tokens_seen": 19713408, + "step": 93405 + }, + { + "epoch": 10.276127612761275, + "grad_norm": 0.09223689138889313, + "learning_rate": 2.8150569552387268e-05, + "loss": 0.0062, + "num_input_tokens_seen": 19714496, + "step": 93410 + }, + { + "epoch": 10.276677667766776, + "grad_norm": 1.295056700706482, + "learning_rate": 2.8148188605049742e-05, + "loss": 0.059, + "num_input_tokens_seen": 19715520, + "step": 93415 + }, + { + "epoch": 10.277227722772277, + "grad_norm": 0.016938773915171623, + "learning_rate": 2.814580762869687e-05, + "loss": 0.1325, + "num_input_tokens_seen": 19716544, + "step": 93420 + }, + { + "epoch": 10.277777777777779, + "grad_norm": 0.012445946224033833, + "learning_rate": 2.81434266233506e-05, + "loss": 0.0086, + "num_input_tokens_seen": 19717600, + "step": 93425 + }, + { + "epoch": 10.278327832783278, + "grad_norm": 0.06723905354738235, + "learning_rate": 2.814104558903287e-05, + "loss": 0.002, + "num_input_tokens_seen": 19718624, + "step": 93430 + }, + { + "epoch": 10.278877887788779, + "grad_norm": 0.08915850520133972, + "learning_rate": 2.8138664525765646e-05, + "loss": 0.0029, + "num_input_tokens_seen": 19719648, + "step": 93435 + }, + { + "epoch": 10.27942794279428, + "grad_norm": 0.12011271715164185, + "learning_rate": 2.8136283433570847e-05, + "loss": 0.056, + "num_input_tokens_seen": 19720768, + "step": 93440 + }, + { + "epoch": 10.27997799779978, + "grad_norm": 0.10857658088207245, + "learning_rate": 2.8133902312470428e-05, + "loss": 0.0073, + "num_input_tokens_seen": 19721856, + "step": 93445 + }, + { + "epoch": 10.28052805280528, + "grad_norm": 0.030493073165416718, + "learning_rate": 2.8131521162486334e-05, + "loss": 0.0041, + "num_input_tokens_seen": 19722848, + "step": 93450 + }, + { + "epoch": 10.281078107810782, + "grad_norm": 0.7951282262802124, + "learning_rate": 2.812913998364052e-05, + "loss": 0.1177, + "num_input_tokens_seen": 19723936, + "step": 93455 + }, + { + "epoch": 10.281628162816281, + "grad_norm": 0.879611611366272, + "learning_rate": 2.8126758775954914e-05, + "loss": 0.0217, + "num_input_tokens_seen": 19725024, + "step": 93460 + }, + { + "epoch": 10.282178217821782, + "grad_norm": 0.0510856956243515, + "learning_rate": 2.812437753945148e-05, + "loss": 0.0469, + "num_input_tokens_seen": 19726016, + "step": 93465 + }, + { + "epoch": 10.282728272827283, + "grad_norm": 0.052328769117593765, + "learning_rate": 2.8121996274152153e-05, + "loss": 0.0527, + "num_input_tokens_seen": 19727104, + "step": 93470 + }, + { + "epoch": 10.283278327832782, + "grad_norm": 0.31576213240623474, + "learning_rate": 2.8119614980078885e-05, + "loss": 0.0135, + "num_input_tokens_seen": 19728192, + "step": 93475 + }, + { + "epoch": 10.283828382838283, + "grad_norm": 0.06409715116024017, + "learning_rate": 2.8117233657253622e-05, + "loss": 0.0294, + "num_input_tokens_seen": 19729312, + "step": 93480 + }, + { + "epoch": 10.284378437843785, + "grad_norm": 0.5287166237831116, + "learning_rate": 2.811485230569832e-05, + "loss": 0.1542, + "num_input_tokens_seen": 19730336, + "step": 93485 + }, + { + "epoch": 10.284928492849286, + "grad_norm": 0.3983069062232971, + "learning_rate": 2.811247092543491e-05, + "loss": 0.0752, + "num_input_tokens_seen": 19731360, + "step": 93490 + }, + { + "epoch": 10.285478547854785, + "grad_norm": 0.03166719153523445, + "learning_rate": 2.811008951648535e-05, + "loss": 0.0098, + "num_input_tokens_seen": 19732384, + "step": 93495 + }, + { + "epoch": 10.286028602860286, + "grad_norm": 1.192903757095337, + "learning_rate": 2.8107708078871592e-05, + "loss": 0.0128, + "num_input_tokens_seen": 19733376, + "step": 93500 + }, + { + "epoch": 10.286578657865787, + "grad_norm": 1.8988381624221802, + "learning_rate": 2.8105326612615577e-05, + "loss": 0.0856, + "num_input_tokens_seen": 19734400, + "step": 93505 + }, + { + "epoch": 10.287128712871286, + "grad_norm": 0.0415552482008934, + "learning_rate": 2.810294511773926e-05, + "loss": 0.0923, + "num_input_tokens_seen": 19735424, + "step": 93510 + }, + { + "epoch": 10.287678767876788, + "grad_norm": 0.01589159294962883, + "learning_rate": 2.810056359426458e-05, + "loss": 0.0104, + "num_input_tokens_seen": 19736416, + "step": 93515 + }, + { + "epoch": 10.288228822882289, + "grad_norm": 1.5310362577438354, + "learning_rate": 2.8098182042213495e-05, + "loss": 0.1431, + "num_input_tokens_seen": 19737472, + "step": 93520 + }, + { + "epoch": 10.288778877887788, + "grad_norm": 0.09922580420970917, + "learning_rate": 2.8095800461607958e-05, + "loss": 0.1024, + "num_input_tokens_seen": 19738528, + "step": 93525 + }, + { + "epoch": 10.289328932893289, + "grad_norm": 0.027499733492732048, + "learning_rate": 2.8093418852469906e-05, + "loss": 0.0147, + "num_input_tokens_seen": 19739584, + "step": 93530 + }, + { + "epoch": 10.28987898789879, + "grad_norm": 0.18834733963012695, + "learning_rate": 2.80910372148213e-05, + "loss": 0.0551, + "num_input_tokens_seen": 19740608, + "step": 93535 + }, + { + "epoch": 10.290429042904291, + "grad_norm": 0.11290077865123749, + "learning_rate": 2.808865554868409e-05, + "loss": 0.0247, + "num_input_tokens_seen": 19741696, + "step": 93540 + }, + { + "epoch": 10.29097909790979, + "grad_norm": 0.015352165326476097, + "learning_rate": 2.808627385408022e-05, + "loss": 0.083, + "num_input_tokens_seen": 19742816, + "step": 93545 + }, + { + "epoch": 10.291529152915292, + "grad_norm": 0.06903927773237228, + "learning_rate": 2.808389213103165e-05, + "loss": 0.1096, + "num_input_tokens_seen": 19743840, + "step": 93550 + }, + { + "epoch": 10.292079207920793, + "grad_norm": 0.0270090289413929, + "learning_rate": 2.808151037956033e-05, + "loss": 0.0182, + "num_input_tokens_seen": 19744960, + "step": 93555 + }, + { + "epoch": 10.292629262926292, + "grad_norm": 6.204773902893066, + "learning_rate": 2.8079128599688193e-05, + "loss": 0.0615, + "num_input_tokens_seen": 19746048, + "step": 93560 + }, + { + "epoch": 10.293179317931793, + "grad_norm": 1.2090582847595215, + "learning_rate": 2.8076746791437216e-05, + "loss": 0.0481, + "num_input_tokens_seen": 19747136, + "step": 93565 + }, + { + "epoch": 10.293729372937294, + "grad_norm": 1.2005723714828491, + "learning_rate": 2.8074364954829337e-05, + "loss": 0.0102, + "num_input_tokens_seen": 19748192, + "step": 93570 + }, + { + "epoch": 10.294279427942794, + "grad_norm": 0.07133087515830994, + "learning_rate": 2.8071983089886505e-05, + "loss": 0.0046, + "num_input_tokens_seen": 19749184, + "step": 93575 + }, + { + "epoch": 10.294829482948295, + "grad_norm": 0.46030569076538086, + "learning_rate": 2.806960119663069e-05, + "loss": 0.0109, + "num_input_tokens_seen": 19750208, + "step": 93580 + }, + { + "epoch": 10.295379537953796, + "grad_norm": 0.8556620478630066, + "learning_rate": 2.806721927508383e-05, + "loss": 0.0414, + "num_input_tokens_seen": 19751296, + "step": 93585 + }, + { + "epoch": 10.295929592959295, + "grad_norm": 0.025110742077231407, + "learning_rate": 2.8064837325267872e-05, + "loss": 0.1333, + "num_input_tokens_seen": 19752288, + "step": 93590 + }, + { + "epoch": 10.296479647964796, + "grad_norm": 0.20374640822410583, + "learning_rate": 2.806245534720479e-05, + "loss": 0.008, + "num_input_tokens_seen": 19753344, + "step": 93595 + }, + { + "epoch": 10.297029702970297, + "grad_norm": 0.02040925994515419, + "learning_rate": 2.8060073340916533e-05, + "loss": 0.0068, + "num_input_tokens_seen": 19754400, + "step": 93600 + }, + { + "epoch": 10.297579757975798, + "grad_norm": 0.018691541627049446, + "learning_rate": 2.8057691306425034e-05, + "loss": 0.0707, + "num_input_tokens_seen": 19755456, + "step": 93605 + }, + { + "epoch": 10.298129812981298, + "grad_norm": 0.03041405789554119, + "learning_rate": 2.805530924375227e-05, + "loss": 0.0041, + "num_input_tokens_seen": 19756576, + "step": 93610 + }, + { + "epoch": 10.298679867986799, + "grad_norm": 0.23950797319412231, + "learning_rate": 2.805292715292018e-05, + "loss": 0.0385, + "num_input_tokens_seen": 19757600, + "step": 93615 + }, + { + "epoch": 10.2992299229923, + "grad_norm": 0.2743759751319885, + "learning_rate": 2.8050545033950725e-05, + "loss": 0.0123, + "num_input_tokens_seen": 19758656, + "step": 93620 + }, + { + "epoch": 10.2997799779978, + "grad_norm": 0.6486117243766785, + "learning_rate": 2.804816288686587e-05, + "loss": 0.0358, + "num_input_tokens_seen": 19759680, + "step": 93625 + }, + { + "epoch": 10.3003300330033, + "grad_norm": 0.017747117206454277, + "learning_rate": 2.804578071168756e-05, + "loss": 0.012, + "num_input_tokens_seen": 19760672, + "step": 93630 + }, + { + "epoch": 10.300880088008801, + "grad_norm": 0.013743911869823933, + "learning_rate": 2.804339850843774e-05, + "loss": 0.0068, + "num_input_tokens_seen": 19761728, + "step": 93635 + }, + { + "epoch": 10.3014301430143, + "grad_norm": 0.4503426253795624, + "learning_rate": 2.804101627713838e-05, + "loss": 0.0123, + "num_input_tokens_seen": 19762752, + "step": 93640 + }, + { + "epoch": 10.301980198019802, + "grad_norm": 0.1454533487558365, + "learning_rate": 2.803863401781144e-05, + "loss": 0.0076, + "num_input_tokens_seen": 19763840, + "step": 93645 + }, + { + "epoch": 10.302530253025303, + "grad_norm": 0.17075040936470032, + "learning_rate": 2.8036251730478856e-05, + "loss": 0.0268, + "num_input_tokens_seen": 19764960, + "step": 93650 + }, + { + "epoch": 10.303080308030804, + "grad_norm": 0.8553375601768494, + "learning_rate": 2.8033869415162606e-05, + "loss": 0.0424, + "num_input_tokens_seen": 19766016, + "step": 93655 + }, + { + "epoch": 10.303630363036303, + "grad_norm": 2.270756721496582, + "learning_rate": 2.803148707188464e-05, + "loss": 0.2851, + "num_input_tokens_seen": 19767072, + "step": 93660 + }, + { + "epoch": 10.304180418041804, + "grad_norm": 0.022883417084813118, + "learning_rate": 2.8029104700666897e-05, + "loss": 0.0235, + "num_input_tokens_seen": 19768064, + "step": 93665 + }, + { + "epoch": 10.304730473047305, + "grad_norm": 0.2967124581336975, + "learning_rate": 2.802672230153136e-05, + "loss": 0.0099, + "num_input_tokens_seen": 19769216, + "step": 93670 + }, + { + "epoch": 10.305280528052805, + "grad_norm": 0.027171123772859573, + "learning_rate": 2.8024339874499978e-05, + "loss": 0.0118, + "num_input_tokens_seen": 19770336, + "step": 93675 + }, + { + "epoch": 10.305830583058306, + "grad_norm": 0.02181127853691578, + "learning_rate": 2.8021957419594704e-05, + "loss": 0.0401, + "num_input_tokens_seen": 19771392, + "step": 93680 + }, + { + "epoch": 10.306380638063807, + "grad_norm": 0.01737302541732788, + "learning_rate": 2.8019574936837502e-05, + "loss": 0.0798, + "num_input_tokens_seen": 19772480, + "step": 93685 + }, + { + "epoch": 10.306930693069306, + "grad_norm": 1.271808385848999, + "learning_rate": 2.801719242625032e-05, + "loss": 0.0334, + "num_input_tokens_seen": 19773504, + "step": 93690 + }, + { + "epoch": 10.307480748074807, + "grad_norm": 0.22864988446235657, + "learning_rate": 2.8014809887855127e-05, + "loss": 0.0088, + "num_input_tokens_seen": 19774528, + "step": 93695 + }, + { + "epoch": 10.308030803080309, + "grad_norm": 0.01081882044672966, + "learning_rate": 2.8012427321673877e-05, + "loss": 0.041, + "num_input_tokens_seen": 19775616, + "step": 93700 + }, + { + "epoch": 10.308580858085808, + "grad_norm": 0.01619056798517704, + "learning_rate": 2.8010044727728524e-05, + "loss": 0.0917, + "num_input_tokens_seen": 19776640, + "step": 93705 + }, + { + "epoch": 10.309130913091309, + "grad_norm": 0.022247085347771645, + "learning_rate": 2.8007662106041043e-05, + "loss": 0.014, + "num_input_tokens_seen": 19777696, + "step": 93710 + }, + { + "epoch": 10.30968096809681, + "grad_norm": 1.1085715293884277, + "learning_rate": 2.8005279456633375e-05, + "loss": 0.1392, + "num_input_tokens_seen": 19778688, + "step": 93715 + }, + { + "epoch": 10.310231023102311, + "grad_norm": 0.03843165561556816, + "learning_rate": 2.80028967795275e-05, + "loss": 0.0079, + "num_input_tokens_seen": 19779744, + "step": 93720 + }, + { + "epoch": 10.31078107810781, + "grad_norm": 0.05874411389231682, + "learning_rate": 2.8000514074745354e-05, + "loss": 0.008, + "num_input_tokens_seen": 19780736, + "step": 93725 + }, + { + "epoch": 10.311331133113312, + "grad_norm": 0.845425009727478, + "learning_rate": 2.799813134230892e-05, + "loss": 0.0155, + "num_input_tokens_seen": 19781856, + "step": 93730 + }, + { + "epoch": 10.311881188118813, + "grad_norm": 1.1271302700042725, + "learning_rate": 2.799574858224014e-05, + "loss": 0.0835, + "num_input_tokens_seen": 19782880, + "step": 93735 + }, + { + "epoch": 10.312431243124312, + "grad_norm": 0.03735353797674179, + "learning_rate": 2.7993365794560984e-05, + "loss": 0.0372, + "num_input_tokens_seen": 19783904, + "step": 93740 + }, + { + "epoch": 10.312981298129813, + "grad_norm": 0.3142230212688446, + "learning_rate": 2.7990982979293416e-05, + "loss": 0.0077, + "num_input_tokens_seen": 19784960, + "step": 93745 + }, + { + "epoch": 10.313531353135314, + "grad_norm": 0.04295150190591812, + "learning_rate": 2.7988600136459388e-05, + "loss": 0.0745, + "num_input_tokens_seen": 19786016, + "step": 93750 + }, + { + "epoch": 10.314081408140813, + "grad_norm": 1.1721221208572388, + "learning_rate": 2.7986217266080872e-05, + "loss": 0.028, + "num_input_tokens_seen": 19787040, + "step": 93755 + }, + { + "epoch": 10.314631463146315, + "grad_norm": 1.4574906826019287, + "learning_rate": 2.7983834368179822e-05, + "loss": 0.1488, + "num_input_tokens_seen": 19788032, + "step": 93760 + }, + { + "epoch": 10.315181518151816, + "grad_norm": 0.007958452217280865, + "learning_rate": 2.7981451442778195e-05, + "loss": 0.0648, + "num_input_tokens_seen": 19789120, + "step": 93765 + }, + { + "epoch": 10.315731573157315, + "grad_norm": 0.03835690766572952, + "learning_rate": 2.7979068489897975e-05, + "loss": 0.0089, + "num_input_tokens_seen": 19790208, + "step": 93770 + }, + { + "epoch": 10.316281628162816, + "grad_norm": 0.11497201025485992, + "learning_rate": 2.7976685509561108e-05, + "loss": 0.0042, + "num_input_tokens_seen": 19791264, + "step": 93775 + }, + { + "epoch": 10.316831683168317, + "grad_norm": 0.1807623654603958, + "learning_rate": 2.7974302501789544e-05, + "loss": 0.1283, + "num_input_tokens_seen": 19792288, + "step": 93780 + }, + { + "epoch": 10.317381738173818, + "grad_norm": 2.669693946838379, + "learning_rate": 2.7971919466605274e-05, + "loss": 0.1552, + "num_input_tokens_seen": 19793312, + "step": 93785 + }, + { + "epoch": 10.317931793179318, + "grad_norm": 0.2284184992313385, + "learning_rate": 2.7969536404030246e-05, + "loss": 0.0154, + "num_input_tokens_seen": 19794400, + "step": 93790 + }, + { + "epoch": 10.318481848184819, + "grad_norm": 0.099344901740551, + "learning_rate": 2.7967153314086424e-05, + "loss": 0.021, + "num_input_tokens_seen": 19795424, + "step": 93795 + }, + { + "epoch": 10.31903190319032, + "grad_norm": 0.02354997955262661, + "learning_rate": 2.7964770196795774e-05, + "loss": 0.0066, + "num_input_tokens_seen": 19796480, + "step": 93800 + }, + { + "epoch": 10.319581958195819, + "grad_norm": 0.07561236619949341, + "learning_rate": 2.7962387052180268e-05, + "loss": 0.0135, + "num_input_tokens_seen": 19797504, + "step": 93805 + }, + { + "epoch": 10.32013201320132, + "grad_norm": 0.04245243966579437, + "learning_rate": 2.7960003880261848e-05, + "loss": 0.0205, + "num_input_tokens_seen": 19798528, + "step": 93810 + }, + { + "epoch": 10.320682068206821, + "grad_norm": 1.848387598991394, + "learning_rate": 2.7957620681062502e-05, + "loss": 0.0771, + "num_input_tokens_seen": 19799616, + "step": 93815 + }, + { + "epoch": 10.32123212321232, + "grad_norm": 0.030109591782093048, + "learning_rate": 2.7955237454604176e-05, + "loss": 0.0143, + "num_input_tokens_seen": 19800704, + "step": 93820 + }, + { + "epoch": 10.321782178217822, + "grad_norm": 0.08526153862476349, + "learning_rate": 2.7952854200908858e-05, + "loss": 0.0052, + "num_input_tokens_seen": 19801728, + "step": 93825 + }, + { + "epoch": 10.322332233223323, + "grad_norm": 0.07733456045389175, + "learning_rate": 2.795047091999849e-05, + "loss": 0.0276, + "num_input_tokens_seen": 19802848, + "step": 93830 + }, + { + "epoch": 10.322882288228822, + "grad_norm": 0.12546628713607788, + "learning_rate": 2.794808761189505e-05, + "loss": 0.0046, + "num_input_tokens_seen": 19803968, + "step": 93835 + }, + { + "epoch": 10.323432343234323, + "grad_norm": 0.030235789716243744, + "learning_rate": 2.7945704276620503e-05, + "loss": 0.0331, + "num_input_tokens_seen": 19804992, + "step": 93840 + }, + { + "epoch": 10.323982398239824, + "grad_norm": 0.04969048872590065, + "learning_rate": 2.7943320914196814e-05, + "loss": 0.0045, + "num_input_tokens_seen": 19806016, + "step": 93845 + }, + { + "epoch": 10.324532453245325, + "grad_norm": 0.14189140498638153, + "learning_rate": 2.794093752464594e-05, + "loss": 0.0025, + "num_input_tokens_seen": 19807136, + "step": 93850 + }, + { + "epoch": 10.325082508250825, + "grad_norm": 0.35404089093208313, + "learning_rate": 2.793855410798986e-05, + "loss": 0.016, + "num_input_tokens_seen": 19808224, + "step": 93855 + }, + { + "epoch": 10.325632563256326, + "grad_norm": 0.09365690499544144, + "learning_rate": 2.7936170664250545e-05, + "loss": 0.0163, + "num_input_tokens_seen": 19809248, + "step": 93860 + }, + { + "epoch": 10.326182618261827, + "grad_norm": 0.6037812829017639, + "learning_rate": 2.7933787193449946e-05, + "loss": 0.03, + "num_input_tokens_seen": 19810336, + "step": 93865 + }, + { + "epoch": 10.326732673267326, + "grad_norm": 0.0173848457634449, + "learning_rate": 2.7931403695610043e-05, + "loss": 0.0069, + "num_input_tokens_seen": 19811424, + "step": 93870 + }, + { + "epoch": 10.327282728272827, + "grad_norm": 1.06063973903656, + "learning_rate": 2.7929020170752797e-05, + "loss": 0.0431, + "num_input_tokens_seen": 19812480, + "step": 93875 + }, + { + "epoch": 10.327832783278328, + "grad_norm": 0.7151223421096802, + "learning_rate": 2.792663661890017e-05, + "loss": 0.0124, + "num_input_tokens_seen": 19813504, + "step": 93880 + }, + { + "epoch": 10.328382838283828, + "grad_norm": 0.07634580880403519, + "learning_rate": 2.792425304007415e-05, + "loss": 0.0074, + "num_input_tokens_seen": 19814560, + "step": 93885 + }, + { + "epoch": 10.328932893289329, + "grad_norm": 0.3476983308792114, + "learning_rate": 2.792186943429669e-05, + "loss": 0.0163, + "num_input_tokens_seen": 19815584, + "step": 93890 + }, + { + "epoch": 10.32948294829483, + "grad_norm": 0.010098806582391262, + "learning_rate": 2.791948580158975e-05, + "loss": 0.0043, + "num_input_tokens_seen": 19816640, + "step": 93895 + }, + { + "epoch": 10.33003300330033, + "grad_norm": 1.6086089611053467, + "learning_rate": 2.7917102141975322e-05, + "loss": 0.0813, + "num_input_tokens_seen": 19817760, + "step": 93900 + }, + { + "epoch": 10.33058305830583, + "grad_norm": 0.07018312811851501, + "learning_rate": 2.7914718455475357e-05, + "loss": 0.0413, + "num_input_tokens_seen": 19818720, + "step": 93905 + }, + { + "epoch": 10.331133113311331, + "grad_norm": 0.07291030138731003, + "learning_rate": 2.7912334742111833e-05, + "loss": 0.0505, + "num_input_tokens_seen": 19819744, + "step": 93910 + }, + { + "epoch": 10.331683168316832, + "grad_norm": 0.022038022056221962, + "learning_rate": 2.790995100190672e-05, + "loss": 0.0111, + "num_input_tokens_seen": 19820768, + "step": 93915 + }, + { + "epoch": 10.332233223322332, + "grad_norm": 0.15839792788028717, + "learning_rate": 2.7907567234881982e-05, + "loss": 0.008, + "num_input_tokens_seen": 19821824, + "step": 93920 + }, + { + "epoch": 10.332783278327833, + "grad_norm": 0.05617455765604973, + "learning_rate": 2.7905183441059592e-05, + "loss": 0.0112, + "num_input_tokens_seen": 19822848, + "step": 93925 + }, + { + "epoch": 10.333333333333334, + "grad_norm": 0.013716408051550388, + "learning_rate": 2.7902799620461518e-05, + "loss": 0.0249, + "num_input_tokens_seen": 19823872, + "step": 93930 + }, + { + "epoch": 10.333883388338833, + "grad_norm": 0.1357925534248352, + "learning_rate": 2.7900415773109745e-05, + "loss": 0.1211, + "num_input_tokens_seen": 19824896, + "step": 93935 + }, + { + "epoch": 10.334433443344334, + "grad_norm": 0.07331880182027817, + "learning_rate": 2.7898031899026216e-05, + "loss": 0.0065, + "num_input_tokens_seen": 19825984, + "step": 93940 + }, + { + "epoch": 10.334983498349835, + "grad_norm": 0.05486029386520386, + "learning_rate": 2.7895647998232922e-05, + "loss": 0.0122, + "num_input_tokens_seen": 19827104, + "step": 93945 + }, + { + "epoch": 10.335533553355335, + "grad_norm": 0.025333281606435776, + "learning_rate": 2.7893264070751833e-05, + "loss": 0.0053, + "num_input_tokens_seen": 19828160, + "step": 93950 + }, + { + "epoch": 10.336083608360836, + "grad_norm": 0.026340870186686516, + "learning_rate": 2.7890880116604913e-05, + "loss": 0.0295, + "num_input_tokens_seen": 19829152, + "step": 93955 + }, + { + "epoch": 10.336633663366337, + "grad_norm": 0.15008874237537384, + "learning_rate": 2.7888496135814145e-05, + "loss": 0.2028, + "num_input_tokens_seen": 19830176, + "step": 93960 + }, + { + "epoch": 10.337183718371838, + "grad_norm": 0.17594553530216217, + "learning_rate": 2.7886112128401494e-05, + "loss": 0.0176, + "num_input_tokens_seen": 19831264, + "step": 93965 + }, + { + "epoch": 10.337733773377337, + "grad_norm": 0.5293997526168823, + "learning_rate": 2.7883728094388922e-05, + "loss": 0.0433, + "num_input_tokens_seen": 19832288, + "step": 93970 + }, + { + "epoch": 10.338283828382838, + "grad_norm": 0.18227089941501617, + "learning_rate": 2.7881344033798418e-05, + "loss": 0.0692, + "num_input_tokens_seen": 19833344, + "step": 93975 + }, + { + "epoch": 10.33883388338834, + "grad_norm": 0.4906551241874695, + "learning_rate": 2.787895994665195e-05, + "loss": 0.0556, + "num_input_tokens_seen": 19834368, + "step": 93980 + }, + { + "epoch": 10.339383938393839, + "grad_norm": 0.39261189103126526, + "learning_rate": 2.787657583297149e-05, + "loss": 0.0096, + "num_input_tokens_seen": 19835392, + "step": 93985 + }, + { + "epoch": 10.33993399339934, + "grad_norm": 0.28096428513526917, + "learning_rate": 2.7874191692779012e-05, + "loss": 0.0144, + "num_input_tokens_seen": 19836352, + "step": 93990 + }, + { + "epoch": 10.340484048404841, + "grad_norm": 0.038305047899484634, + "learning_rate": 2.787180752609649e-05, + "loss": 0.0072, + "num_input_tokens_seen": 19837408, + "step": 93995 + }, + { + "epoch": 10.34103410341034, + "grad_norm": 0.014021366834640503, + "learning_rate": 2.7869423332945887e-05, + "loss": 0.0088, + "num_input_tokens_seen": 19838528, + "step": 94000 + }, + { + "epoch": 10.341584158415841, + "grad_norm": 0.9454944729804993, + "learning_rate": 2.7867039113349196e-05, + "loss": 0.0305, + "num_input_tokens_seen": 19839552, + "step": 94005 + }, + { + "epoch": 10.342134213421343, + "grad_norm": 1.9224859476089478, + "learning_rate": 2.786465486732837e-05, + "loss": 0.0747, + "num_input_tokens_seen": 19840576, + "step": 94010 + }, + { + "epoch": 10.342684268426842, + "grad_norm": 0.08450639247894287, + "learning_rate": 2.7862270594905405e-05, + "loss": 0.0075, + "num_input_tokens_seen": 19841600, + "step": 94015 + }, + { + "epoch": 10.343234323432343, + "grad_norm": 0.0932634100317955, + "learning_rate": 2.785988629610226e-05, + "loss": 0.1077, + "num_input_tokens_seen": 19842720, + "step": 94020 + }, + { + "epoch": 10.343784378437844, + "grad_norm": 0.020515426993370056, + "learning_rate": 2.785750197094092e-05, + "loss": 0.0401, + "num_input_tokens_seen": 19843744, + "step": 94025 + }, + { + "epoch": 10.344334433443345, + "grad_norm": 1.1621980667114258, + "learning_rate": 2.785511761944335e-05, + "loss": 0.0193, + "num_input_tokens_seen": 19844736, + "step": 94030 + }, + { + "epoch": 10.344884488448844, + "grad_norm": 2.0981712341308594, + "learning_rate": 2.7852733241631536e-05, + "loss": 0.0972, + "num_input_tokens_seen": 19845856, + "step": 94035 + }, + { + "epoch": 10.345434543454346, + "grad_norm": 0.007936950773000717, + "learning_rate": 2.785034883752744e-05, + "loss": 0.0065, + "num_input_tokens_seen": 19846912, + "step": 94040 + }, + { + "epoch": 10.345984598459847, + "grad_norm": 5.085831165313721, + "learning_rate": 2.7847964407153054e-05, + "loss": 0.1064, + "num_input_tokens_seen": 19847872, + "step": 94045 + }, + { + "epoch": 10.346534653465346, + "grad_norm": 0.030774595215916634, + "learning_rate": 2.7845579950530344e-05, + "loss": 0.0224, + "num_input_tokens_seen": 19848864, + "step": 94050 + }, + { + "epoch": 10.347084708470847, + "grad_norm": 1.0878090858459473, + "learning_rate": 2.7843195467681294e-05, + "loss": 0.1307, + "num_input_tokens_seen": 19849888, + "step": 94055 + }, + { + "epoch": 10.347634763476348, + "grad_norm": 0.0687950849533081, + "learning_rate": 2.784081095862787e-05, + "loss": 0.0063, + "num_input_tokens_seen": 19850880, + "step": 94060 + }, + { + "epoch": 10.348184818481847, + "grad_norm": 0.026535535231232643, + "learning_rate": 2.7838426423392057e-05, + "loss": 0.0018, + "num_input_tokens_seen": 19851936, + "step": 94065 + }, + { + "epoch": 10.348734873487349, + "grad_norm": 0.15735335648059845, + "learning_rate": 2.783604186199582e-05, + "loss": 0.0508, + "num_input_tokens_seen": 19852928, + "step": 94070 + }, + { + "epoch": 10.34928492849285, + "grad_norm": 0.06445451080799103, + "learning_rate": 2.7833657274461162e-05, + "loss": 0.0048, + "num_input_tokens_seen": 19853952, + "step": 94075 + }, + { + "epoch": 10.34983498349835, + "grad_norm": 0.4066076874732971, + "learning_rate": 2.7831272660810042e-05, + "loss": 0.0095, + "num_input_tokens_seen": 19855008, + "step": 94080 + }, + { + "epoch": 10.35038503850385, + "grad_norm": 0.02586486004292965, + "learning_rate": 2.782888802106443e-05, + "loss": 0.0049, + "num_input_tokens_seen": 19856128, + "step": 94085 + }, + { + "epoch": 10.350935093509351, + "grad_norm": 0.1727163940668106, + "learning_rate": 2.782650335524632e-05, + "loss": 0.0075, + "num_input_tokens_seen": 19857184, + "step": 94090 + }, + { + "epoch": 10.351485148514852, + "grad_norm": 0.2183961570262909, + "learning_rate": 2.7824118663377684e-05, + "loss": 0.0249, + "num_input_tokens_seen": 19858336, + "step": 94095 + }, + { + "epoch": 10.352035203520352, + "grad_norm": 0.019445935264229774, + "learning_rate": 2.7821733945480505e-05, + "loss": 0.0427, + "num_input_tokens_seen": 19859360, + "step": 94100 + }, + { + "epoch": 10.352585258525853, + "grad_norm": 0.12551386654376984, + "learning_rate": 2.7819349201576762e-05, + "loss": 0.0091, + "num_input_tokens_seen": 19860448, + "step": 94105 + }, + { + "epoch": 10.353135313531354, + "grad_norm": 0.8076804876327515, + "learning_rate": 2.781696443168842e-05, + "loss": 0.1946, + "num_input_tokens_seen": 19861440, + "step": 94110 + }, + { + "epoch": 10.353685368536853, + "grad_norm": 0.20776180922985077, + "learning_rate": 2.7814579635837468e-05, + "loss": 0.003, + "num_input_tokens_seen": 19862400, + "step": 94115 + }, + { + "epoch": 10.354235423542354, + "grad_norm": 0.1892806738615036, + "learning_rate": 2.7812194814045894e-05, + "loss": 0.0652, + "num_input_tokens_seen": 19863456, + "step": 94120 + }, + { + "epoch": 10.354785478547855, + "grad_norm": 0.03267526999115944, + "learning_rate": 2.7809809966335665e-05, + "loss": 0.0086, + "num_input_tokens_seen": 19864480, + "step": 94125 + }, + { + "epoch": 10.355335533553355, + "grad_norm": 0.06850779056549072, + "learning_rate": 2.7807425092728772e-05, + "loss": 0.0282, + "num_input_tokens_seen": 19865600, + "step": 94130 + }, + { + "epoch": 10.355885588558856, + "grad_norm": 1.398118019104004, + "learning_rate": 2.7805040193247184e-05, + "loss": 0.0131, + "num_input_tokens_seen": 19866624, + "step": 94135 + }, + { + "epoch": 10.356435643564357, + "grad_norm": 0.009473706595599651, + "learning_rate": 2.7802655267912887e-05, + "loss": 0.0441, + "num_input_tokens_seen": 19867648, + "step": 94140 + }, + { + "epoch": 10.356985698569858, + "grad_norm": 0.03253822773694992, + "learning_rate": 2.7800270316747855e-05, + "loss": 0.012, + "num_input_tokens_seen": 19868672, + "step": 94145 + }, + { + "epoch": 10.357535753575357, + "grad_norm": 2.7506439685821533, + "learning_rate": 2.779788533977409e-05, + "loss": 0.0913, + "num_input_tokens_seen": 19869728, + "step": 94150 + }, + { + "epoch": 10.358085808580858, + "grad_norm": 0.007482733111828566, + "learning_rate": 2.7795500337013547e-05, + "loss": 0.0019, + "num_input_tokens_seen": 19870752, + "step": 94155 + }, + { + "epoch": 10.35863586358636, + "grad_norm": 0.0319368913769722, + "learning_rate": 2.779311530848822e-05, + "loss": 0.0062, + "num_input_tokens_seen": 19871808, + "step": 94160 + }, + { + "epoch": 10.359185918591859, + "grad_norm": 0.028774432837963104, + "learning_rate": 2.7790730254220093e-05, + "loss": 0.1093, + "num_input_tokens_seen": 19872864, + "step": 94165 + }, + { + "epoch": 10.35973597359736, + "grad_norm": 0.042637214064598083, + "learning_rate": 2.7788345174231146e-05, + "loss": 0.001, + "num_input_tokens_seen": 19873984, + "step": 94170 + }, + { + "epoch": 10.36028602860286, + "grad_norm": 0.025443267077207565, + "learning_rate": 2.7785960068543355e-05, + "loss": 0.0143, + "num_input_tokens_seen": 19875008, + "step": 94175 + }, + { + "epoch": 10.36083608360836, + "grad_norm": 0.5691820383071899, + "learning_rate": 2.7783574937178708e-05, + "loss": 0.0119, + "num_input_tokens_seen": 19876064, + "step": 94180 + }, + { + "epoch": 10.361386138613861, + "grad_norm": 0.012775718234479427, + "learning_rate": 2.778118978015918e-05, + "loss": 0.0167, + "num_input_tokens_seen": 19877120, + "step": 94185 + }, + { + "epoch": 10.361936193619362, + "grad_norm": 0.5478711724281311, + "learning_rate": 2.777880459750677e-05, + "loss": 0.0099, + "num_input_tokens_seen": 19878208, + "step": 94190 + }, + { + "epoch": 10.362486248624862, + "grad_norm": 0.011856066063046455, + "learning_rate": 2.7776419389243453e-05, + "loss": 0.0782, + "num_input_tokens_seen": 19879232, + "step": 94195 + }, + { + "epoch": 10.363036303630363, + "grad_norm": 0.009646274149417877, + "learning_rate": 2.7774034155391198e-05, + "loss": 0.002, + "num_input_tokens_seen": 19880288, + "step": 94200 + }, + { + "epoch": 10.363586358635864, + "grad_norm": 0.2413327693939209, + "learning_rate": 2.7771648895972012e-05, + "loss": 0.0031, + "num_input_tokens_seen": 19881344, + "step": 94205 + }, + { + "epoch": 10.364136413641365, + "grad_norm": 0.026397835463285446, + "learning_rate": 2.7769263611007868e-05, + "loss": 0.0149, + "num_input_tokens_seen": 19882400, + "step": 94210 + }, + { + "epoch": 10.364686468646864, + "grad_norm": 0.045610226690769196, + "learning_rate": 2.7766878300520744e-05, + "loss": 0.0207, + "num_input_tokens_seen": 19883488, + "step": 94215 + }, + { + "epoch": 10.365236523652365, + "grad_norm": 1.9519354104995728, + "learning_rate": 2.776449296453264e-05, + "loss": 0.0258, + "num_input_tokens_seen": 19884544, + "step": 94220 + }, + { + "epoch": 10.365786578657866, + "grad_norm": 0.0070914919488132, + "learning_rate": 2.7762107603065523e-05, + "loss": 0.0185, + "num_input_tokens_seen": 19885536, + "step": 94225 + }, + { + "epoch": 10.366336633663366, + "grad_norm": 0.02785010263323784, + "learning_rate": 2.7759722216141383e-05, + "loss": 0.1346, + "num_input_tokens_seen": 19886592, + "step": 94230 + }, + { + "epoch": 10.366886688668867, + "grad_norm": 0.2415950894355774, + "learning_rate": 2.7757336803782213e-05, + "loss": 0.0069, + "num_input_tokens_seen": 19887712, + "step": 94235 + }, + { + "epoch": 10.367436743674368, + "grad_norm": 0.026054905727505684, + "learning_rate": 2.775495136600999e-05, + "loss": 0.0292, + "num_input_tokens_seen": 19888832, + "step": 94240 + }, + { + "epoch": 10.367986798679867, + "grad_norm": 0.01936265639960766, + "learning_rate": 2.7752565902846706e-05, + "loss": 0.0751, + "num_input_tokens_seen": 19889920, + "step": 94245 + }, + { + "epoch": 10.368536853685368, + "grad_norm": 0.14806526899337769, + "learning_rate": 2.775018041431434e-05, + "loss": 0.01, + "num_input_tokens_seen": 19890976, + "step": 94250 + }, + { + "epoch": 10.36908690869087, + "grad_norm": 0.0331617072224617, + "learning_rate": 2.7747794900434887e-05, + "loss": 0.0224, + "num_input_tokens_seen": 19892064, + "step": 94255 + }, + { + "epoch": 10.369636963696369, + "grad_norm": 0.8787487149238586, + "learning_rate": 2.774540936123031e-05, + "loss": 0.0511, + "num_input_tokens_seen": 19893056, + "step": 94260 + }, + { + "epoch": 10.37018701870187, + "grad_norm": 0.006912006065249443, + "learning_rate": 2.7743023796722627e-05, + "loss": 0.1662, + "num_input_tokens_seen": 19894240, + "step": 94265 + }, + { + "epoch": 10.370737073707371, + "grad_norm": 0.04645705595612526, + "learning_rate": 2.7740638206933816e-05, + "loss": 0.0394, + "num_input_tokens_seen": 19895264, + "step": 94270 + }, + { + "epoch": 10.371287128712872, + "grad_norm": 0.7407764792442322, + "learning_rate": 2.7738252591885845e-05, + "loss": 0.0198, + "num_input_tokens_seen": 19896288, + "step": 94275 + }, + { + "epoch": 10.371837183718371, + "grad_norm": 1.5835827589035034, + "learning_rate": 2.7735866951600713e-05, + "loss": 0.0728, + "num_input_tokens_seen": 19897344, + "step": 94280 + }, + { + "epoch": 10.372387238723872, + "grad_norm": 0.011882806196808815, + "learning_rate": 2.7733481286100417e-05, + "loss": 0.0213, + "num_input_tokens_seen": 19898432, + "step": 94285 + }, + { + "epoch": 10.372937293729374, + "grad_norm": 0.04851759970188141, + "learning_rate": 2.7731095595406932e-05, + "loss": 0.012, + "num_input_tokens_seen": 19899488, + "step": 94290 + }, + { + "epoch": 10.373487348734873, + "grad_norm": 0.057347167283296585, + "learning_rate": 2.7728709879542247e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19900480, + "step": 94295 + }, + { + "epoch": 10.374037403740374, + "grad_norm": 0.02171316184103489, + "learning_rate": 2.7726324138528347e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19901536, + "step": 94300 + }, + { + "epoch": 10.374587458745875, + "grad_norm": 0.018937965855002403, + "learning_rate": 2.772393837238724e-05, + "loss": 0.0164, + "num_input_tokens_seen": 19902560, + "step": 94305 + }, + { + "epoch": 10.375137513751374, + "grad_norm": 0.08910439908504486, + "learning_rate": 2.7721552581140887e-05, + "loss": 0.0044, + "num_input_tokens_seen": 19903616, + "step": 94310 + }, + { + "epoch": 10.375687568756875, + "grad_norm": 1.268546223640442, + "learning_rate": 2.77191667648113e-05, + "loss": 0.0334, + "num_input_tokens_seen": 19904672, + "step": 94315 + }, + { + "epoch": 10.376237623762377, + "grad_norm": 0.15822333097457886, + "learning_rate": 2.771678092342045e-05, + "loss": 0.0275, + "num_input_tokens_seen": 19905664, + "step": 94320 + }, + { + "epoch": 10.376787678767876, + "grad_norm": 0.1628379225730896, + "learning_rate": 2.7714395056990338e-05, + "loss": 0.007, + "num_input_tokens_seen": 19906688, + "step": 94325 + }, + { + "epoch": 10.377337733773377, + "grad_norm": 0.06573320925235748, + "learning_rate": 2.7712009165542946e-05, + "loss": 0.0058, + "num_input_tokens_seen": 19907776, + "step": 94330 + }, + { + "epoch": 10.377887788778878, + "grad_norm": 0.02778458781540394, + "learning_rate": 2.770962324910027e-05, + "loss": 0.0158, + "num_input_tokens_seen": 19908768, + "step": 94335 + }, + { + "epoch": 10.37843784378438, + "grad_norm": 0.06794299930334091, + "learning_rate": 2.7707237307684304e-05, + "loss": 0.0045, + "num_input_tokens_seen": 19909856, + "step": 94340 + }, + { + "epoch": 10.378987898789878, + "grad_norm": 0.6759900450706482, + "learning_rate": 2.770485134131701e-05, + "loss": 0.0223, + "num_input_tokens_seen": 19910944, + "step": 94345 + }, + { + "epoch": 10.37953795379538, + "grad_norm": 0.005434236954897642, + "learning_rate": 2.7702465350020416e-05, + "loss": 0.0519, + "num_input_tokens_seen": 19912096, + "step": 94350 + }, + { + "epoch": 10.38008800880088, + "grad_norm": 0.008182639256119728, + "learning_rate": 2.7700079333816493e-05, + "loss": 0.0454, + "num_input_tokens_seen": 19913120, + "step": 94355 + }, + { + "epoch": 10.38063806380638, + "grad_norm": 0.22819793224334717, + "learning_rate": 2.769769329272723e-05, + "loss": 0.0109, + "num_input_tokens_seen": 19914272, + "step": 94360 + }, + { + "epoch": 10.381188118811881, + "grad_norm": 0.08238178491592407, + "learning_rate": 2.769530722677463e-05, + "loss": 0.1466, + "num_input_tokens_seen": 19915296, + "step": 94365 + }, + { + "epoch": 10.381738173817382, + "grad_norm": 0.5365412831306458, + "learning_rate": 2.769292113598067e-05, + "loss": 0.0188, + "num_input_tokens_seen": 19916320, + "step": 94370 + }, + { + "epoch": 10.382288228822881, + "grad_norm": 0.03352831304073334, + "learning_rate": 2.769053502036735e-05, + "loss": 0.0079, + "num_input_tokens_seen": 19917376, + "step": 94375 + }, + { + "epoch": 10.382838283828383, + "grad_norm": 0.09827784448862076, + "learning_rate": 2.7688148879956655e-05, + "loss": 0.0046, + "num_input_tokens_seen": 19918432, + "step": 94380 + }, + { + "epoch": 10.383388338833884, + "grad_norm": 0.013679148629307747, + "learning_rate": 2.768576271477059e-05, + "loss": 0.0065, + "num_input_tokens_seen": 19919488, + "step": 94385 + }, + { + "epoch": 10.383938393839385, + "grad_norm": 0.04577748477458954, + "learning_rate": 2.7683376524831132e-05, + "loss": 0.0026, + "num_input_tokens_seen": 19920480, + "step": 94390 + }, + { + "epoch": 10.384488448844884, + "grad_norm": 0.09359602630138397, + "learning_rate": 2.7680990310160288e-05, + "loss": 0.0412, + "num_input_tokens_seen": 19921568, + "step": 94395 + }, + { + "epoch": 10.385038503850385, + "grad_norm": 0.10493113845586777, + "learning_rate": 2.7678604070780035e-05, + "loss": 0.0092, + "num_input_tokens_seen": 19922560, + "step": 94400 + }, + { + "epoch": 10.385588558855886, + "grad_norm": 0.03273259848356247, + "learning_rate": 2.7676217806712374e-05, + "loss": 0.0353, + "num_input_tokens_seen": 19923616, + "step": 94405 + }, + { + "epoch": 10.386138613861386, + "grad_norm": 0.006153748836368322, + "learning_rate": 2.7673831517979303e-05, + "loss": 0.0297, + "num_input_tokens_seen": 19924672, + "step": 94410 + }, + { + "epoch": 10.386688668866887, + "grad_norm": 0.024199992418289185, + "learning_rate": 2.7671445204602808e-05, + "loss": 0.0966, + "num_input_tokens_seen": 19925760, + "step": 94415 + }, + { + "epoch": 10.387238723872388, + "grad_norm": 0.03171893209218979, + "learning_rate": 2.766905886660488e-05, + "loss": 0.0278, + "num_input_tokens_seen": 19926880, + "step": 94420 + }, + { + "epoch": 10.387788778877887, + "grad_norm": 0.018760574981570244, + "learning_rate": 2.7666672504007525e-05, + "loss": 0.0339, + "num_input_tokens_seen": 19927968, + "step": 94425 + }, + { + "epoch": 10.388338833883388, + "grad_norm": 0.3304884135723114, + "learning_rate": 2.7664286116832722e-05, + "loss": 0.0202, + "num_input_tokens_seen": 19929024, + "step": 94430 + }, + { + "epoch": 10.38888888888889, + "grad_norm": 0.03897814080119133, + "learning_rate": 2.7661899705102478e-05, + "loss": 0.0021, + "num_input_tokens_seen": 19930112, + "step": 94435 + }, + { + "epoch": 10.389438943894389, + "grad_norm": 0.051661137491464615, + "learning_rate": 2.7659513268838778e-05, + "loss": 0.0225, + "num_input_tokens_seen": 19931168, + "step": 94440 + }, + { + "epoch": 10.38998899889989, + "grad_norm": 0.032448068261146545, + "learning_rate": 2.7657126808063622e-05, + "loss": 0.0781, + "num_input_tokens_seen": 19932160, + "step": 94445 + }, + { + "epoch": 10.39053905390539, + "grad_norm": 0.38055160641670227, + "learning_rate": 2.7654740322799e-05, + "loss": 0.1063, + "num_input_tokens_seen": 19933248, + "step": 94450 + }, + { + "epoch": 10.391089108910892, + "grad_norm": 0.23929303884506226, + "learning_rate": 2.765235381306691e-05, + "loss": 0.0046, + "num_input_tokens_seen": 19934336, + "step": 94455 + }, + { + "epoch": 10.391639163916391, + "grad_norm": 0.02522284910082817, + "learning_rate": 2.7649967278889356e-05, + "loss": 0.004, + "num_input_tokens_seen": 19935392, + "step": 94460 + }, + { + "epoch": 10.392189218921892, + "grad_norm": 0.005608540028333664, + "learning_rate": 2.764758072028832e-05, + "loss": 0.0055, + "num_input_tokens_seen": 19936480, + "step": 94465 + }, + { + "epoch": 10.392739273927393, + "grad_norm": 0.01943860575556755, + "learning_rate": 2.76451941372858e-05, + "loss": 0.0086, + "num_input_tokens_seen": 19937536, + "step": 94470 + }, + { + "epoch": 10.393289328932893, + "grad_norm": 0.020050542429089546, + "learning_rate": 2.7642807529903796e-05, + "loss": 0.0304, + "num_input_tokens_seen": 19938624, + "step": 94475 + }, + { + "epoch": 10.393839383938394, + "grad_norm": 0.30230528116226196, + "learning_rate": 2.764042089816431e-05, + "loss": 0.0123, + "num_input_tokens_seen": 19939712, + "step": 94480 + }, + { + "epoch": 10.394389438943895, + "grad_norm": 0.007887358777225018, + "learning_rate": 2.7638034242089326e-05, + "loss": 0.0068, + "num_input_tokens_seen": 19940736, + "step": 94485 + }, + { + "epoch": 10.394939493949394, + "grad_norm": 0.014536170288920403, + "learning_rate": 2.7635647561700838e-05, + "loss": 0.0423, + "num_input_tokens_seen": 19941856, + "step": 94490 + }, + { + "epoch": 10.395489548954895, + "grad_norm": 1.3964849710464478, + "learning_rate": 2.7633260857020864e-05, + "loss": 0.0275, + "num_input_tokens_seen": 19942848, + "step": 94495 + }, + { + "epoch": 10.396039603960396, + "grad_norm": 0.020835455507040024, + "learning_rate": 2.7630874128071384e-05, + "loss": 0.0382, + "num_input_tokens_seen": 19943872, + "step": 94500 + }, + { + "epoch": 10.396589658965897, + "grad_norm": 0.09032221883535385, + "learning_rate": 2.76284873748744e-05, + "loss": 0.0109, + "num_input_tokens_seen": 19944928, + "step": 94505 + }, + { + "epoch": 10.397139713971397, + "grad_norm": 0.01628519594669342, + "learning_rate": 2.762610059745191e-05, + "loss": 0.019, + "num_input_tokens_seen": 19946016, + "step": 94510 + }, + { + "epoch": 10.397689768976898, + "grad_norm": 0.11640626192092896, + "learning_rate": 2.7623713795825913e-05, + "loss": 0.0676, + "num_input_tokens_seen": 19947072, + "step": 94515 + }, + { + "epoch": 10.398239823982399, + "grad_norm": 2.075160503387451, + "learning_rate": 2.762132697001839e-05, + "loss": 0.113, + "num_input_tokens_seen": 19948128, + "step": 94520 + }, + { + "epoch": 10.398789878987898, + "grad_norm": 0.03861212357878685, + "learning_rate": 2.761894012005137e-05, + "loss": 0.0071, + "num_input_tokens_seen": 19949184, + "step": 94525 + }, + { + "epoch": 10.3993399339934, + "grad_norm": 0.01722167804837227, + "learning_rate": 2.7616553245946835e-05, + "loss": 0.002, + "num_input_tokens_seen": 19950240, + "step": 94530 + }, + { + "epoch": 10.3998899889989, + "grad_norm": 0.13692675530910492, + "learning_rate": 2.761416634772678e-05, + "loss": 0.0411, + "num_input_tokens_seen": 19951296, + "step": 94535 + }, + { + "epoch": 10.4004400440044, + "grad_norm": 0.013596214354038239, + "learning_rate": 2.761177942541321e-05, + "loss": 0.0217, + "num_input_tokens_seen": 19952320, + "step": 94540 + }, + { + "epoch": 10.400990099009901, + "grad_norm": 0.3622656464576721, + "learning_rate": 2.7609392479028118e-05, + "loss": 0.0098, + "num_input_tokens_seen": 19953376, + "step": 94545 + }, + { + "epoch": 10.401540154015402, + "grad_norm": 0.07424186915159225, + "learning_rate": 2.7607005508593502e-05, + "loss": 0.0186, + "num_input_tokens_seen": 19954400, + "step": 94550 + }, + { + "epoch": 10.402090209020901, + "grad_norm": 0.010777970775961876, + "learning_rate": 2.7604618514131387e-05, + "loss": 0.0837, + "num_input_tokens_seen": 19955360, + "step": 94555 + }, + { + "epoch": 10.402640264026402, + "grad_norm": 0.0771465077996254, + "learning_rate": 2.7602231495663738e-05, + "loss": 0.0022, + "num_input_tokens_seen": 19956416, + "step": 94560 + }, + { + "epoch": 10.403190319031903, + "grad_norm": 0.021473364904522896, + "learning_rate": 2.759984445321257e-05, + "loss": 0.0061, + "num_input_tokens_seen": 19957536, + "step": 94565 + }, + { + "epoch": 10.403740374037405, + "grad_norm": 0.20444989204406738, + "learning_rate": 2.759745738679989e-05, + "loss": 0.0561, + "num_input_tokens_seen": 19958560, + "step": 94570 + }, + { + "epoch": 10.404290429042904, + "grad_norm": 0.026165978983044624, + "learning_rate": 2.759507029644769e-05, + "loss": 0.0032, + "num_input_tokens_seen": 19959616, + "step": 94575 + }, + { + "epoch": 10.404840484048405, + "grad_norm": 0.04161772504448891, + "learning_rate": 2.759268318217797e-05, + "loss": 0.0224, + "num_input_tokens_seen": 19960736, + "step": 94580 + }, + { + "epoch": 10.405390539053906, + "grad_norm": 0.024700447916984558, + "learning_rate": 2.7590296044012738e-05, + "loss": 0.0185, + "num_input_tokens_seen": 19961728, + "step": 94585 + }, + { + "epoch": 10.405940594059405, + "grad_norm": 0.28374183177948, + "learning_rate": 2.7587908881973984e-05, + "loss": 0.0153, + "num_input_tokens_seen": 19962848, + "step": 94590 + }, + { + "epoch": 10.406490649064907, + "grad_norm": 0.12476596981287003, + "learning_rate": 2.758552169608371e-05, + "loss": 0.0388, + "num_input_tokens_seen": 19963872, + "step": 94595 + }, + { + "epoch": 10.407040704070408, + "grad_norm": 0.006141783203929663, + "learning_rate": 2.7583134486363936e-05, + "loss": 0.0027, + "num_input_tokens_seen": 19964896, + "step": 94600 + }, + { + "epoch": 10.407590759075907, + "grad_norm": 0.033347081393003464, + "learning_rate": 2.7580747252836648e-05, + "loss": 0.0024, + "num_input_tokens_seen": 19965920, + "step": 94605 + }, + { + "epoch": 10.408140814081408, + "grad_norm": 0.019492322579026222, + "learning_rate": 2.7578359995523845e-05, + "loss": 0.0193, + "num_input_tokens_seen": 19966912, + "step": 94610 + }, + { + "epoch": 10.408690869086909, + "grad_norm": 1.466143012046814, + "learning_rate": 2.757597271444754e-05, + "loss": 0.017, + "num_input_tokens_seen": 19968000, + "step": 94615 + }, + { + "epoch": 10.409240924092408, + "grad_norm": 0.0022943541407585144, + "learning_rate": 2.7573585409629733e-05, + "loss": 0.0032, + "num_input_tokens_seen": 19969120, + "step": 94620 + }, + { + "epoch": 10.40979097909791, + "grad_norm": 2.8451900482177734, + "learning_rate": 2.757119808109242e-05, + "loss": 0.1222, + "num_input_tokens_seen": 19970176, + "step": 94625 + }, + { + "epoch": 10.41034103410341, + "grad_norm": 1.560675859451294, + "learning_rate": 2.7568810728857607e-05, + "loss": 0.1849, + "num_input_tokens_seen": 19971200, + "step": 94630 + }, + { + "epoch": 10.410891089108912, + "grad_norm": 0.09769041836261749, + "learning_rate": 2.7566423352947303e-05, + "loss": 0.0077, + "num_input_tokens_seen": 19972224, + "step": 94635 + }, + { + "epoch": 10.411441144114411, + "grad_norm": 0.01042951736599207, + "learning_rate": 2.75640359533835e-05, + "loss": 0.1118, + "num_input_tokens_seen": 19973280, + "step": 94640 + }, + { + "epoch": 10.411991199119912, + "grad_norm": 0.453663170337677, + "learning_rate": 2.756164853018821e-05, + "loss": 0.033, + "num_input_tokens_seen": 19974336, + "step": 94645 + }, + { + "epoch": 10.412541254125413, + "grad_norm": 0.05564645677804947, + "learning_rate": 2.7559261083383437e-05, + "loss": 0.0208, + "num_input_tokens_seen": 19975488, + "step": 94650 + }, + { + "epoch": 10.413091309130913, + "grad_norm": 0.034623220562934875, + "learning_rate": 2.7556873612991184e-05, + "loss": 0.0504, + "num_input_tokens_seen": 19976448, + "step": 94655 + }, + { + "epoch": 10.413641364136414, + "grad_norm": 0.010623902082443237, + "learning_rate": 2.7554486119033452e-05, + "loss": 0.0219, + "num_input_tokens_seen": 19977504, + "step": 94660 + }, + { + "epoch": 10.414191419141915, + "grad_norm": 0.014290069229900837, + "learning_rate": 2.7552098601532246e-05, + "loss": 0.0318, + "num_input_tokens_seen": 19978560, + "step": 94665 + }, + { + "epoch": 10.414741474147414, + "grad_norm": 0.09904955327510834, + "learning_rate": 2.7549711060509576e-05, + "loss": 0.0519, + "num_input_tokens_seen": 19979584, + "step": 94670 + }, + { + "epoch": 10.415291529152915, + "grad_norm": 1.4728339910507202, + "learning_rate": 2.7547323495987443e-05, + "loss": 0.0981, + "num_input_tokens_seen": 19980640, + "step": 94675 + }, + { + "epoch": 10.415841584158416, + "grad_norm": 0.009651965461671352, + "learning_rate": 2.754493590798784e-05, + "loss": 0.0738, + "num_input_tokens_seen": 19981664, + "step": 94680 + }, + { + "epoch": 10.416391639163916, + "grad_norm": 0.02830927073955536, + "learning_rate": 2.7542548296532795e-05, + "loss": 0.0193, + "num_input_tokens_seen": 19982688, + "step": 94685 + }, + { + "epoch": 10.416941694169417, + "grad_norm": 0.022505948320031166, + "learning_rate": 2.7540160661644298e-05, + "loss": 0.0249, + "num_input_tokens_seen": 19983776, + "step": 94690 + }, + { + "epoch": 10.417491749174918, + "grad_norm": 0.9460898637771606, + "learning_rate": 2.753777300334436e-05, + "loss": 0.057, + "num_input_tokens_seen": 19984832, + "step": 94695 + }, + { + "epoch": 10.418041804180419, + "grad_norm": 0.5994930863380432, + "learning_rate": 2.7535385321654988e-05, + "loss": 0.104, + "num_input_tokens_seen": 19985984, + "step": 94700 + }, + { + "epoch": 10.418591859185918, + "grad_norm": 3.0593576431274414, + "learning_rate": 2.7532997616598183e-05, + "loss": 0.0443, + "num_input_tokens_seen": 19987040, + "step": 94705 + }, + { + "epoch": 10.41914191419142, + "grad_norm": 0.012266956269741058, + "learning_rate": 2.753060988819595e-05, + "loss": 0.0813, + "num_input_tokens_seen": 19988096, + "step": 94710 + }, + { + "epoch": 10.41969196919692, + "grad_norm": 0.18436259031295776, + "learning_rate": 2.752822213647031e-05, + "loss": 0.1178, + "num_input_tokens_seen": 19989088, + "step": 94715 + }, + { + "epoch": 10.42024202420242, + "grad_norm": 0.4327843487262726, + "learning_rate": 2.752583436144326e-05, + "loss": 0.0106, + "num_input_tokens_seen": 19990176, + "step": 94720 + }, + { + "epoch": 10.42079207920792, + "grad_norm": 0.5922825932502747, + "learning_rate": 2.75234465631368e-05, + "loss": 0.0175, + "num_input_tokens_seen": 19991200, + "step": 94725 + }, + { + "epoch": 10.421342134213422, + "grad_norm": 0.8038088083267212, + "learning_rate": 2.7521058741572946e-05, + "loss": 0.0238, + "num_input_tokens_seen": 19992256, + "step": 94730 + }, + { + "epoch": 10.421892189218921, + "grad_norm": 0.046512238681316376, + "learning_rate": 2.7518670896773703e-05, + "loss": 0.0112, + "num_input_tokens_seen": 19993344, + "step": 94735 + }, + { + "epoch": 10.422442244224422, + "grad_norm": 0.018877575173974037, + "learning_rate": 2.7516283028761074e-05, + "loss": 0.0489, + "num_input_tokens_seen": 19994432, + "step": 94740 + }, + { + "epoch": 10.422992299229923, + "grad_norm": 0.02576311305165291, + "learning_rate": 2.7513895137557084e-05, + "loss": 0.0043, + "num_input_tokens_seen": 19995488, + "step": 94745 + }, + { + "epoch": 10.423542354235423, + "grad_norm": 0.04655192047357559, + "learning_rate": 2.7511507223183724e-05, + "loss": 0.0135, + "num_input_tokens_seen": 19996576, + "step": 94750 + }, + { + "epoch": 10.424092409240924, + "grad_norm": 0.157950296998024, + "learning_rate": 2.7509119285663e-05, + "loss": 0.0441, + "num_input_tokens_seen": 19997696, + "step": 94755 + }, + { + "epoch": 10.424642464246425, + "grad_norm": 0.16989251971244812, + "learning_rate": 2.7506731325016933e-05, + "loss": 0.0398, + "num_input_tokens_seen": 19998784, + "step": 94760 + }, + { + "epoch": 10.425192519251926, + "grad_norm": 1.4379559755325317, + "learning_rate": 2.7504343341267526e-05, + "loss": 0.0705, + "num_input_tokens_seen": 19999872, + "step": 94765 + }, + { + "epoch": 10.425742574257425, + "grad_norm": 0.32246947288513184, + "learning_rate": 2.7501955334436792e-05, + "loss": 0.0272, + "num_input_tokens_seen": 20000896, + "step": 94770 + }, + { + "epoch": 10.426292629262926, + "grad_norm": 0.21448560059070587, + "learning_rate": 2.7499567304546732e-05, + "loss": 0.0087, + "num_input_tokens_seen": 20001952, + "step": 94775 + }, + { + "epoch": 10.426842684268427, + "grad_norm": 0.571647047996521, + "learning_rate": 2.749717925161936e-05, + "loss": 0.0396, + "num_input_tokens_seen": 20002976, + "step": 94780 + }, + { + "epoch": 10.427392739273927, + "grad_norm": 0.1304621547460556, + "learning_rate": 2.7494791175676675e-05, + "loss": 0.0102, + "num_input_tokens_seen": 20004000, + "step": 94785 + }, + { + "epoch": 10.427942794279428, + "grad_norm": 0.013037791475653648, + "learning_rate": 2.7492403076740714e-05, + "loss": 0.0054, + "num_input_tokens_seen": 20005024, + "step": 94790 + }, + { + "epoch": 10.428492849284929, + "grad_norm": 0.14083987474441528, + "learning_rate": 2.749001495483346e-05, + "loss": 0.005, + "num_input_tokens_seen": 20006144, + "step": 94795 + }, + { + "epoch": 10.429042904290428, + "grad_norm": 0.00383512070402503, + "learning_rate": 2.7487626809976935e-05, + "loss": 0.0035, + "num_input_tokens_seen": 20007168, + "step": 94800 + }, + { + "epoch": 10.42959295929593, + "grad_norm": 0.00747194979339838, + "learning_rate": 2.748523864219315e-05, + "loss": 0.0079, + "num_input_tokens_seen": 20008288, + "step": 94805 + }, + { + "epoch": 10.43014301430143, + "grad_norm": 0.0250760018825531, + "learning_rate": 2.748285045150411e-05, + "loss": 0.0009, + "num_input_tokens_seen": 20009344, + "step": 94810 + }, + { + "epoch": 10.430693069306932, + "grad_norm": 0.20898482203483582, + "learning_rate": 2.748046223793183e-05, + "loss": 0.0062, + "num_input_tokens_seen": 20010464, + "step": 94815 + }, + { + "epoch": 10.43124312431243, + "grad_norm": 1.5584704875946045, + "learning_rate": 2.7478074001498323e-05, + "loss": 0.0737, + "num_input_tokens_seen": 20011488, + "step": 94820 + }, + { + "epoch": 10.431793179317932, + "grad_norm": 0.33837854862213135, + "learning_rate": 2.7475685742225587e-05, + "loss": 0.1218, + "num_input_tokens_seen": 20012480, + "step": 94825 + }, + { + "epoch": 10.432343234323433, + "grad_norm": 0.018832193687558174, + "learning_rate": 2.747329746013565e-05, + "loss": 0.0035, + "num_input_tokens_seen": 20013536, + "step": 94830 + }, + { + "epoch": 10.432893289328932, + "grad_norm": 0.05340046063065529, + "learning_rate": 2.7470909155250517e-05, + "loss": 0.0142, + "num_input_tokens_seen": 20014560, + "step": 94835 + }, + { + "epoch": 10.433443344334433, + "grad_norm": 0.01891358196735382, + "learning_rate": 2.7468520827592197e-05, + "loss": 0.003, + "num_input_tokens_seen": 20015616, + "step": 94840 + }, + { + "epoch": 10.433993399339935, + "grad_norm": 0.017450133338570595, + "learning_rate": 2.746613247718271e-05, + "loss": 0.0069, + "num_input_tokens_seen": 20016704, + "step": 94845 + }, + { + "epoch": 10.434543454345434, + "grad_norm": 0.03893667832016945, + "learning_rate": 2.7463744104044065e-05, + "loss": 0.0638, + "num_input_tokens_seen": 20017728, + "step": 94850 + }, + { + "epoch": 10.435093509350935, + "grad_norm": 0.04122605547308922, + "learning_rate": 2.746135570819826e-05, + "loss": 0.0065, + "num_input_tokens_seen": 20018752, + "step": 94855 + }, + { + "epoch": 10.435643564356436, + "grad_norm": 0.01565363258123398, + "learning_rate": 2.7458967289667337e-05, + "loss": 0.0225, + "num_input_tokens_seen": 20019808, + "step": 94860 + }, + { + "epoch": 10.436193619361935, + "grad_norm": 0.06032455340027809, + "learning_rate": 2.7456578848473287e-05, + "loss": 0.0294, + "num_input_tokens_seen": 20020832, + "step": 94865 + }, + { + "epoch": 10.436743674367436, + "grad_norm": 0.4300000071525574, + "learning_rate": 2.7454190384638124e-05, + "loss": 0.0644, + "num_input_tokens_seen": 20021920, + "step": 94870 + }, + { + "epoch": 10.437293729372938, + "grad_norm": 0.03442692756652832, + "learning_rate": 2.7451801898183864e-05, + "loss": 0.0077, + "num_input_tokens_seen": 20022944, + "step": 94875 + }, + { + "epoch": 10.437843784378439, + "grad_norm": 0.0075462921522557735, + "learning_rate": 2.7449413389132534e-05, + "loss": 0.0193, + "num_input_tokens_seen": 20024032, + "step": 94880 + }, + { + "epoch": 10.438393839383938, + "grad_norm": 0.29339689016342163, + "learning_rate": 2.7447024857506126e-05, + "loss": 0.0123, + "num_input_tokens_seen": 20025120, + "step": 94885 + }, + { + "epoch": 10.438943894389439, + "grad_norm": 0.045136746019124985, + "learning_rate": 2.744463630332667e-05, + "loss": 0.003, + "num_input_tokens_seen": 20026144, + "step": 94890 + }, + { + "epoch": 10.43949394939494, + "grad_norm": 0.2128869891166687, + "learning_rate": 2.744224772661617e-05, + "loss": 0.0215, + "num_input_tokens_seen": 20027168, + "step": 94895 + }, + { + "epoch": 10.44004400440044, + "grad_norm": 1.111656904220581, + "learning_rate": 2.7439859127396638e-05, + "loss": 0.0475, + "num_input_tokens_seen": 20028192, + "step": 94900 + }, + { + "epoch": 10.44059405940594, + "grad_norm": 0.09409447014331818, + "learning_rate": 2.7437470505690105e-05, + "loss": 0.0465, + "num_input_tokens_seen": 20029216, + "step": 94905 + }, + { + "epoch": 10.441144114411442, + "grad_norm": 0.08817337453365326, + "learning_rate": 2.7435081861518575e-05, + "loss": 0.0149, + "num_input_tokens_seen": 20030272, + "step": 94910 + }, + { + "epoch": 10.441694169416941, + "grad_norm": 0.03895813599228859, + "learning_rate": 2.7432693194904063e-05, + "loss": 0.019, + "num_input_tokens_seen": 20031360, + "step": 94915 + }, + { + "epoch": 10.442244224422442, + "grad_norm": 0.03295261412858963, + "learning_rate": 2.7430304505868586e-05, + "loss": 0.0148, + "num_input_tokens_seen": 20032416, + "step": 94920 + }, + { + "epoch": 10.442794279427943, + "grad_norm": 1.9731481075286865, + "learning_rate": 2.7427915794434163e-05, + "loss": 0.1587, + "num_input_tokens_seen": 20033472, + "step": 94925 + }, + { + "epoch": 10.443344334433444, + "grad_norm": 0.019282134249806404, + "learning_rate": 2.7425527060622793e-05, + "loss": 0.0416, + "num_input_tokens_seen": 20034528, + "step": 94930 + }, + { + "epoch": 10.443894389438944, + "grad_norm": 0.07806188613176346, + "learning_rate": 2.7423138304456515e-05, + "loss": 0.0059, + "num_input_tokens_seen": 20035616, + "step": 94935 + }, + { + "epoch": 10.444444444444445, + "grad_norm": 0.25875648856163025, + "learning_rate": 2.7420749525957323e-05, + "loss": 0.1204, + "num_input_tokens_seen": 20036672, + "step": 94940 + }, + { + "epoch": 10.444994499449946, + "grad_norm": 0.025208203122019768, + "learning_rate": 2.7418360725147253e-05, + "loss": 0.0117, + "num_input_tokens_seen": 20037728, + "step": 94945 + }, + { + "epoch": 10.445544554455445, + "grad_norm": 0.0355687141418457, + "learning_rate": 2.7415971902048314e-05, + "loss": 0.0136, + "num_input_tokens_seen": 20038784, + "step": 94950 + }, + { + "epoch": 10.446094609460946, + "grad_norm": 0.7361263036727905, + "learning_rate": 2.7413583056682518e-05, + "loss": 0.036, + "num_input_tokens_seen": 20039904, + "step": 94955 + }, + { + "epoch": 10.446644664466447, + "grad_norm": 0.7595062851905823, + "learning_rate": 2.741119418907189e-05, + "loss": 0.0805, + "num_input_tokens_seen": 20040992, + "step": 94960 + }, + { + "epoch": 10.447194719471947, + "grad_norm": 0.3533191978931427, + "learning_rate": 2.7408805299238432e-05, + "loss": 0.0154, + "num_input_tokens_seen": 20041984, + "step": 94965 + }, + { + "epoch": 10.447744774477448, + "grad_norm": 0.11748848855495453, + "learning_rate": 2.740641638720417e-05, + "loss": 0.038, + "num_input_tokens_seen": 20043008, + "step": 94970 + }, + { + "epoch": 10.448294829482949, + "grad_norm": 0.05143863335251808, + "learning_rate": 2.740402745299113e-05, + "loss": 0.018, + "num_input_tokens_seen": 20044096, + "step": 94975 + }, + { + "epoch": 10.448844884488448, + "grad_norm": 1.254052996635437, + "learning_rate": 2.740163849662133e-05, + "loss": 0.052, + "num_input_tokens_seen": 20045120, + "step": 94980 + }, + { + "epoch": 10.44939493949395, + "grad_norm": 0.1770510971546173, + "learning_rate": 2.739924951811677e-05, + "loss": 0.0452, + "num_input_tokens_seen": 20046208, + "step": 94985 + }, + { + "epoch": 10.44994499449945, + "grad_norm": 0.2602376639842987, + "learning_rate": 2.7396860517499478e-05, + "loss": 0.0133, + "num_input_tokens_seen": 20047296, + "step": 94990 + }, + { + "epoch": 10.450495049504951, + "grad_norm": 0.020170345902442932, + "learning_rate": 2.7394471494791478e-05, + "loss": 0.018, + "num_input_tokens_seen": 20048416, + "step": 94995 + }, + { + "epoch": 10.45104510451045, + "grad_norm": 0.9563611745834351, + "learning_rate": 2.7392082450014777e-05, + "loss": 0.0305, + "num_input_tokens_seen": 20049408, + "step": 95000 + }, + { + "epoch": 10.451595159515952, + "grad_norm": 0.0936700850725174, + "learning_rate": 2.738969338319141e-05, + "loss": 0.0256, + "num_input_tokens_seen": 20050496, + "step": 95005 + }, + { + "epoch": 10.452145214521453, + "grad_norm": 0.0032147341407835484, + "learning_rate": 2.738730429434338e-05, + "loss": 0.1642, + "num_input_tokens_seen": 20051584, + "step": 95010 + }, + { + "epoch": 10.452695269526952, + "grad_norm": 1.274771809577942, + "learning_rate": 2.7384915183492704e-05, + "loss": 0.0314, + "num_input_tokens_seen": 20052608, + "step": 95015 + }, + { + "epoch": 10.453245324532453, + "grad_norm": 1.1445766687393188, + "learning_rate": 2.7382526050661415e-05, + "loss": 0.234, + "num_input_tokens_seen": 20053632, + "step": 95020 + }, + { + "epoch": 10.453795379537954, + "grad_norm": 0.011596580035984516, + "learning_rate": 2.7380136895871533e-05, + "loss": 0.006, + "num_input_tokens_seen": 20054688, + "step": 95025 + }, + { + "epoch": 10.454345434543454, + "grad_norm": 0.9864192008972168, + "learning_rate": 2.7377747719145062e-05, + "loss": 0.0655, + "num_input_tokens_seen": 20055712, + "step": 95030 + }, + { + "epoch": 10.454895489548955, + "grad_norm": 0.06869862228631973, + "learning_rate": 2.737535852050404e-05, + "loss": 0.0057, + "num_input_tokens_seen": 20056736, + "step": 95035 + }, + { + "epoch": 10.455445544554456, + "grad_norm": 0.018748195841908455, + "learning_rate": 2.7372969299970476e-05, + "loss": 0.0481, + "num_input_tokens_seen": 20057760, + "step": 95040 + }, + { + "epoch": 10.455995599559955, + "grad_norm": 0.9371908903121948, + "learning_rate": 2.7370580057566386e-05, + "loss": 0.085, + "num_input_tokens_seen": 20058816, + "step": 95045 + }, + { + "epoch": 10.456545654565456, + "grad_norm": 0.038504112511873245, + "learning_rate": 2.736819079331381e-05, + "loss": 0.0401, + "num_input_tokens_seen": 20059776, + "step": 95050 + }, + { + "epoch": 10.457095709570957, + "grad_norm": 0.06771346926689148, + "learning_rate": 2.7365801507234744e-05, + "loss": 0.0049, + "num_input_tokens_seen": 20060832, + "step": 95055 + }, + { + "epoch": 10.457645764576458, + "grad_norm": 0.01288518961519003, + "learning_rate": 2.736341219935122e-05, + "loss": 0.0274, + "num_input_tokens_seen": 20061920, + "step": 95060 + }, + { + "epoch": 10.458195819581958, + "grad_norm": 0.03525754064321518, + "learning_rate": 2.7361022869685266e-05, + "loss": 0.0501, + "num_input_tokens_seen": 20062976, + "step": 95065 + }, + { + "epoch": 10.458745874587459, + "grad_norm": 0.031827058643102646, + "learning_rate": 2.7358633518258897e-05, + "loss": 0.0049, + "num_input_tokens_seen": 20064096, + "step": 95070 + }, + { + "epoch": 10.45929592959296, + "grad_norm": 1.343194842338562, + "learning_rate": 2.7356244145094133e-05, + "loss": 0.1722, + "num_input_tokens_seen": 20065216, + "step": 95075 + }, + { + "epoch": 10.45984598459846, + "grad_norm": 1.5468086004257202, + "learning_rate": 2.7353854750213e-05, + "loss": 0.0356, + "num_input_tokens_seen": 20066272, + "step": 95080 + }, + { + "epoch": 10.46039603960396, + "grad_norm": 0.07457666844129562, + "learning_rate": 2.7351465333637504e-05, + "loss": 0.0059, + "num_input_tokens_seen": 20067296, + "step": 95085 + }, + { + "epoch": 10.460946094609461, + "grad_norm": 0.07973606884479523, + "learning_rate": 2.7349075895389693e-05, + "loss": 0.0371, + "num_input_tokens_seen": 20068352, + "step": 95090 + }, + { + "epoch": 10.46149614961496, + "grad_norm": 0.0741063728928566, + "learning_rate": 2.7346686435491577e-05, + "loss": 0.1364, + "num_input_tokens_seen": 20069440, + "step": 95095 + }, + { + "epoch": 10.462046204620462, + "grad_norm": 0.03414173424243927, + "learning_rate": 2.7344296953965176e-05, + "loss": 0.0075, + "num_input_tokens_seen": 20070528, + "step": 95100 + }, + { + "epoch": 10.462596259625963, + "grad_norm": 0.052109770476818085, + "learning_rate": 2.7341907450832516e-05, + "loss": 0.0232, + "num_input_tokens_seen": 20071648, + "step": 95105 + }, + { + "epoch": 10.463146314631462, + "grad_norm": 0.041590020060539246, + "learning_rate": 2.7339517926115614e-05, + "loss": 0.0079, + "num_input_tokens_seen": 20072672, + "step": 95110 + }, + { + "epoch": 10.463696369636963, + "grad_norm": 0.9705245494842529, + "learning_rate": 2.7337128379836495e-05, + "loss": 0.0357, + "num_input_tokens_seen": 20073760, + "step": 95115 + }, + { + "epoch": 10.464246424642464, + "grad_norm": 0.8842089176177979, + "learning_rate": 2.7334738812017196e-05, + "loss": 0.0506, + "num_input_tokens_seen": 20074720, + "step": 95120 + }, + { + "epoch": 10.464796479647966, + "grad_norm": 0.35856154561042786, + "learning_rate": 2.733234922267973e-05, + "loss": 0.021, + "num_input_tokens_seen": 20075744, + "step": 95125 + }, + { + "epoch": 10.465346534653465, + "grad_norm": 0.05993420258164406, + "learning_rate": 2.7329959611846113e-05, + "loss": 0.0047, + "num_input_tokens_seen": 20076800, + "step": 95130 + }, + { + "epoch": 10.465896589658966, + "grad_norm": 0.11335677653551102, + "learning_rate": 2.732756997953838e-05, + "loss": 0.022, + "num_input_tokens_seen": 20077920, + "step": 95135 + }, + { + "epoch": 10.466446644664467, + "grad_norm": 0.10534351319074631, + "learning_rate": 2.7325180325778547e-05, + "loss": 0.0097, + "num_input_tokens_seen": 20078944, + "step": 95140 + }, + { + "epoch": 10.466996699669966, + "grad_norm": 4.225527763366699, + "learning_rate": 2.7322790650588647e-05, + "loss": 0.0971, + "num_input_tokens_seen": 20079904, + "step": 95145 + }, + { + "epoch": 10.467546754675467, + "grad_norm": 0.037406712770462036, + "learning_rate": 2.7320400953990698e-05, + "loss": 0.0368, + "num_input_tokens_seen": 20081024, + "step": 95150 + }, + { + "epoch": 10.468096809680969, + "grad_norm": 0.12249352037906647, + "learning_rate": 2.731801123600673e-05, + "loss": 0.014, + "num_input_tokens_seen": 20082080, + "step": 95155 + }, + { + "epoch": 10.468646864686468, + "grad_norm": 0.017846401780843735, + "learning_rate": 2.7315621496658762e-05, + "loss": 0.0085, + "num_input_tokens_seen": 20083104, + "step": 95160 + }, + { + "epoch": 10.469196919691969, + "grad_norm": 1.202060341835022, + "learning_rate": 2.731323173596882e-05, + "loss": 0.0338, + "num_input_tokens_seen": 20084128, + "step": 95165 + }, + { + "epoch": 10.46974697469747, + "grad_norm": 1.286974310874939, + "learning_rate": 2.7310841953958942e-05, + "loss": 0.0252, + "num_input_tokens_seen": 20085120, + "step": 95170 + }, + { + "epoch": 10.47029702970297, + "grad_norm": 0.05529743805527687, + "learning_rate": 2.7308452150651125e-05, + "loss": 0.0052, + "num_input_tokens_seen": 20086176, + "step": 95175 + }, + { + "epoch": 10.47084708470847, + "grad_norm": 0.16377626359462738, + "learning_rate": 2.730606232606743e-05, + "loss": 0.1104, + "num_input_tokens_seen": 20087264, + "step": 95180 + }, + { + "epoch": 10.471397139713972, + "grad_norm": 0.07674645632505417, + "learning_rate": 2.7303672480229854e-05, + "loss": 0.0023, + "num_input_tokens_seen": 20088352, + "step": 95185 + }, + { + "epoch": 10.471947194719473, + "grad_norm": 1.5773240327835083, + "learning_rate": 2.7301282613160428e-05, + "loss": 0.1656, + "num_input_tokens_seen": 20089408, + "step": 95190 + }, + { + "epoch": 10.472497249724972, + "grad_norm": 0.8779524564743042, + "learning_rate": 2.7298892724881196e-05, + "loss": 0.0275, + "num_input_tokens_seen": 20090464, + "step": 95195 + }, + { + "epoch": 10.473047304730473, + "grad_norm": 0.08832249790430069, + "learning_rate": 2.7296502815414177e-05, + "loss": 0.1494, + "num_input_tokens_seen": 20091488, + "step": 95200 + }, + { + "epoch": 10.473597359735974, + "grad_norm": 0.009296133182942867, + "learning_rate": 2.729411288478138e-05, + "loss": 0.0308, + "num_input_tokens_seen": 20092576, + "step": 95205 + }, + { + "epoch": 10.474147414741473, + "grad_norm": 0.2908305823802948, + "learning_rate": 2.7291722933004855e-05, + "loss": 0.0075, + "num_input_tokens_seen": 20093696, + "step": 95210 + }, + { + "epoch": 10.474697469746975, + "grad_norm": 2.796391248703003, + "learning_rate": 2.728933296010661e-05, + "loss": 0.1232, + "num_input_tokens_seen": 20094784, + "step": 95215 + }, + { + "epoch": 10.475247524752476, + "grad_norm": 0.019475474953651428, + "learning_rate": 2.7286942966108693e-05, + "loss": 0.0748, + "num_input_tokens_seen": 20095808, + "step": 95220 + }, + { + "epoch": 10.475797579757975, + "grad_norm": 0.042443130165338516, + "learning_rate": 2.7284552951033114e-05, + "loss": 0.0019, + "num_input_tokens_seen": 20096864, + "step": 95225 + }, + { + "epoch": 10.476347634763476, + "grad_norm": 0.011476002633571625, + "learning_rate": 2.728216291490191e-05, + "loss": 0.0226, + "num_input_tokens_seen": 20097952, + "step": 95230 + }, + { + "epoch": 10.476897689768977, + "grad_norm": 0.5432221293449402, + "learning_rate": 2.7279772857737095e-05, + "loss": 0.0084, + "num_input_tokens_seen": 20099008, + "step": 95235 + }, + { + "epoch": 10.477447744774478, + "grad_norm": 2.4279043674468994, + "learning_rate": 2.7277382779560723e-05, + "loss": 0.056, + "num_input_tokens_seen": 20100160, + "step": 95240 + }, + { + "epoch": 10.477997799779978, + "grad_norm": 0.02696026675403118, + "learning_rate": 2.7274992680394794e-05, + "loss": 0.0031, + "num_input_tokens_seen": 20101152, + "step": 95245 + }, + { + "epoch": 10.478547854785479, + "grad_norm": 0.020424658432602882, + "learning_rate": 2.7272602560261357e-05, + "loss": 0.0108, + "num_input_tokens_seen": 20102272, + "step": 95250 + }, + { + "epoch": 10.47909790979098, + "grad_norm": 0.024906162172555923, + "learning_rate": 2.7270212419182428e-05, + "loss": 0.0087, + "num_input_tokens_seen": 20103328, + "step": 95255 + }, + { + "epoch": 10.479647964796479, + "grad_norm": 0.01186822447925806, + "learning_rate": 2.7267822257180043e-05, + "loss": 0.0034, + "num_input_tokens_seen": 20104352, + "step": 95260 + }, + { + "epoch": 10.48019801980198, + "grad_norm": 0.6848531365394592, + "learning_rate": 2.726543207427623e-05, + "loss": 0.0771, + "num_input_tokens_seen": 20105440, + "step": 95265 + }, + { + "epoch": 10.480748074807481, + "grad_norm": 0.020821932703256607, + "learning_rate": 2.7263041870493017e-05, + "loss": 0.009, + "num_input_tokens_seen": 20106528, + "step": 95270 + }, + { + "epoch": 10.48129812981298, + "grad_norm": 0.012970567680895329, + "learning_rate": 2.7260651645852425e-05, + "loss": 0.0036, + "num_input_tokens_seen": 20107552, + "step": 95275 + }, + { + "epoch": 10.481848184818482, + "grad_norm": 0.6603866219520569, + "learning_rate": 2.7258261400376495e-05, + "loss": 0.0148, + "num_input_tokens_seen": 20108544, + "step": 95280 + }, + { + "epoch": 10.482398239823983, + "grad_norm": 0.018243223428726196, + "learning_rate": 2.725587113408726e-05, + "loss": 0.0258, + "num_input_tokens_seen": 20109632, + "step": 95285 + }, + { + "epoch": 10.482948294829482, + "grad_norm": 0.029475897550582886, + "learning_rate": 2.7253480847006734e-05, + "loss": 0.0083, + "num_input_tokens_seen": 20110688, + "step": 95290 + }, + { + "epoch": 10.483498349834983, + "grad_norm": 0.014085309579968452, + "learning_rate": 2.7251090539156966e-05, + "loss": 0.0015, + "num_input_tokens_seen": 20111680, + "step": 95295 + }, + { + "epoch": 10.484048404840484, + "grad_norm": 0.05522716045379639, + "learning_rate": 2.724870021055997e-05, + "loss": 0.0214, + "num_input_tokens_seen": 20112704, + "step": 95300 + }, + { + "epoch": 10.484598459845985, + "grad_norm": 3.253063440322876, + "learning_rate": 2.724630986123778e-05, + "loss": 0.0401, + "num_input_tokens_seen": 20113760, + "step": 95305 + }, + { + "epoch": 10.485148514851485, + "grad_norm": 0.14280861616134644, + "learning_rate": 2.7243919491212438e-05, + "loss": 0.0104, + "num_input_tokens_seen": 20114848, + "step": 95310 + }, + { + "epoch": 10.485698569856986, + "grad_norm": 0.25719285011291504, + "learning_rate": 2.7241529100505968e-05, + "loss": 0.0492, + "num_input_tokens_seen": 20115968, + "step": 95315 + }, + { + "epoch": 10.486248624862487, + "grad_norm": 0.08080155402421951, + "learning_rate": 2.723913868914039e-05, + "loss": 0.0075, + "num_input_tokens_seen": 20116992, + "step": 95320 + }, + { + "epoch": 10.486798679867986, + "grad_norm": 0.017361551523208618, + "learning_rate": 2.723674825713775e-05, + "loss": 0.0165, + "num_input_tokens_seen": 20117984, + "step": 95325 + }, + { + "epoch": 10.487348734873487, + "grad_norm": 0.6682896018028259, + "learning_rate": 2.7234357804520076e-05, + "loss": 0.1089, + "num_input_tokens_seen": 20119008, + "step": 95330 + }, + { + "epoch": 10.487898789878988, + "grad_norm": 0.955312967300415, + "learning_rate": 2.7231967331309398e-05, + "loss": 0.0172, + "num_input_tokens_seen": 20120128, + "step": 95335 + }, + { + "epoch": 10.488448844884488, + "grad_norm": 0.018788572400808334, + "learning_rate": 2.722957683752775e-05, + "loss": 0.0368, + "num_input_tokens_seen": 20121152, + "step": 95340 + }, + { + "epoch": 10.488998899889989, + "grad_norm": 0.5656779408454895, + "learning_rate": 2.7227186323197162e-05, + "loss": 0.0122, + "num_input_tokens_seen": 20122144, + "step": 95345 + }, + { + "epoch": 10.48954895489549, + "grad_norm": 0.007343297824263573, + "learning_rate": 2.7224795788339656e-05, + "loss": 0.1345, + "num_input_tokens_seen": 20123264, + "step": 95350 + }, + { + "epoch": 10.490099009900991, + "grad_norm": 0.05765408277511597, + "learning_rate": 2.7222405232977282e-05, + "loss": 0.0819, + "num_input_tokens_seen": 20124352, + "step": 95355 + }, + { + "epoch": 10.49064906490649, + "grad_norm": 1.42741060256958, + "learning_rate": 2.722001465713207e-05, + "loss": 0.2329, + "num_input_tokens_seen": 20125472, + "step": 95360 + }, + { + "epoch": 10.491199119911991, + "grad_norm": 0.08184589445590973, + "learning_rate": 2.7217624060826047e-05, + "loss": 0.0162, + "num_input_tokens_seen": 20126528, + "step": 95365 + }, + { + "epoch": 10.491749174917492, + "grad_norm": 0.7824800610542297, + "learning_rate": 2.7215233444081245e-05, + "loss": 0.0295, + "num_input_tokens_seen": 20127552, + "step": 95370 + }, + { + "epoch": 10.492299229922992, + "grad_norm": 0.022646160796284676, + "learning_rate": 2.7212842806919696e-05, + "loss": 0.0018, + "num_input_tokens_seen": 20128608, + "step": 95375 + }, + { + "epoch": 10.492849284928493, + "grad_norm": 0.091728076338768, + "learning_rate": 2.7210452149363437e-05, + "loss": 0.0287, + "num_input_tokens_seen": 20129664, + "step": 95380 + }, + { + "epoch": 10.493399339933994, + "grad_norm": 0.024222340434789658, + "learning_rate": 2.720806147143451e-05, + "loss": 0.0154, + "num_input_tokens_seen": 20130688, + "step": 95385 + }, + { + "epoch": 10.493949394939493, + "grad_norm": 0.2180386483669281, + "learning_rate": 2.7205670773154927e-05, + "loss": 0.0625, + "num_input_tokens_seen": 20131744, + "step": 95390 + }, + { + "epoch": 10.494499449944994, + "grad_norm": 0.012555086985230446, + "learning_rate": 2.7203280054546743e-05, + "loss": 0.0036, + "num_input_tokens_seen": 20132800, + "step": 95395 + }, + { + "epoch": 10.495049504950495, + "grad_norm": 0.13567806780338287, + "learning_rate": 2.7200889315631983e-05, + "loss": 0.0645, + "num_input_tokens_seen": 20133760, + "step": 95400 + }, + { + "epoch": 10.495599559955995, + "grad_norm": 0.11827662587165833, + "learning_rate": 2.7198498556432682e-05, + "loss": 0.1046, + "num_input_tokens_seen": 20134816, + "step": 95405 + }, + { + "epoch": 10.496149614961496, + "grad_norm": 1.1439470052719116, + "learning_rate": 2.7196107776970876e-05, + "loss": 0.1555, + "num_input_tokens_seen": 20135776, + "step": 95410 + }, + { + "epoch": 10.496699669966997, + "grad_norm": 0.08054804801940918, + "learning_rate": 2.7193716977268597e-05, + "loss": 0.0132, + "num_input_tokens_seen": 20136768, + "step": 95415 + }, + { + "epoch": 10.497249724972498, + "grad_norm": 0.10983584076166153, + "learning_rate": 2.719132615734788e-05, + "loss": 0.0962, + "num_input_tokens_seen": 20137792, + "step": 95420 + }, + { + "epoch": 10.497799779977997, + "grad_norm": 0.029838457703590393, + "learning_rate": 2.718893531723076e-05, + "loss": 0.0189, + "num_input_tokens_seen": 20138848, + "step": 95425 + }, + { + "epoch": 10.498349834983498, + "grad_norm": 0.011214339174330235, + "learning_rate": 2.7186544456939284e-05, + "loss": 0.0059, + "num_input_tokens_seen": 20139904, + "step": 95430 + }, + { + "epoch": 10.498899889989, + "grad_norm": 0.015985026955604553, + "learning_rate": 2.7184153576495463e-05, + "loss": 0.0052, + "num_input_tokens_seen": 20140896, + "step": 95435 + }, + { + "epoch": 10.499449944994499, + "grad_norm": 0.242525115609169, + "learning_rate": 2.7181762675921352e-05, + "loss": 0.0111, + "num_input_tokens_seen": 20141920, + "step": 95440 + }, + { + "epoch": 10.5, + "grad_norm": 0.01563331112265587, + "learning_rate": 2.7179371755238987e-05, + "loss": 0.0042, + "num_input_tokens_seen": 20143040, + "step": 95445 + }, + { + "epoch": 10.500550055005501, + "grad_norm": 1.7726014852523804, + "learning_rate": 2.7176980814470388e-05, + "loss": 0.0561, + "num_input_tokens_seen": 20144096, + "step": 95450 + }, + { + "epoch": 10.501100110011, + "grad_norm": 0.12857955694198608, + "learning_rate": 2.717458985363761e-05, + "loss": 0.0051, + "num_input_tokens_seen": 20145152, + "step": 95455 + }, + { + "epoch": 10.501650165016502, + "grad_norm": 0.39866313338279724, + "learning_rate": 2.7172198872762676e-05, + "loss": 0.057, + "num_input_tokens_seen": 20146176, + "step": 95460 + }, + { + "epoch": 10.502200220022003, + "grad_norm": 0.3271321654319763, + "learning_rate": 2.7169807871867626e-05, + "loss": 0.0212, + "num_input_tokens_seen": 20147232, + "step": 95465 + }, + { + "epoch": 10.502750275027502, + "grad_norm": 0.8764806985855103, + "learning_rate": 2.7167416850974504e-05, + "loss": 0.104, + "num_input_tokens_seen": 20148320, + "step": 95470 + }, + { + "epoch": 10.503300330033003, + "grad_norm": 0.038990434259176254, + "learning_rate": 2.7165025810105336e-05, + "loss": 0.0093, + "num_input_tokens_seen": 20149376, + "step": 95475 + }, + { + "epoch": 10.503850385038504, + "grad_norm": 0.01885991357266903, + "learning_rate": 2.7162634749282167e-05, + "loss": 0.1944, + "num_input_tokens_seen": 20150496, + "step": 95480 + }, + { + "epoch": 10.504400440044005, + "grad_norm": 0.09450671821832657, + "learning_rate": 2.716024366852703e-05, + "loss": 0.0428, + "num_input_tokens_seen": 20151552, + "step": 95485 + }, + { + "epoch": 10.504950495049505, + "grad_norm": 0.18964257836341858, + "learning_rate": 2.7157852567861962e-05, + "loss": 0.0038, + "num_input_tokens_seen": 20152608, + "step": 95490 + }, + { + "epoch": 10.505500550055006, + "grad_norm": 0.058808259665966034, + "learning_rate": 2.7155461447309e-05, + "loss": 0.0095, + "num_input_tokens_seen": 20153664, + "step": 95495 + }, + { + "epoch": 10.506050605060507, + "grad_norm": 0.171784907579422, + "learning_rate": 2.7153070306890188e-05, + "loss": 0.0538, + "num_input_tokens_seen": 20154656, + "step": 95500 + }, + { + "epoch": 10.506600660066006, + "grad_norm": 0.37152859568595886, + "learning_rate": 2.7150679146627566e-05, + "loss": 0.0404, + "num_input_tokens_seen": 20155744, + "step": 95505 + }, + { + "epoch": 10.507150715071507, + "grad_norm": 0.3811678886413574, + "learning_rate": 2.7148287966543157e-05, + "loss": 0.0127, + "num_input_tokens_seen": 20156800, + "step": 95510 + }, + { + "epoch": 10.507700770077008, + "grad_norm": 0.2522178590297699, + "learning_rate": 2.7145896766659014e-05, + "loss": 0.0187, + "num_input_tokens_seen": 20157888, + "step": 95515 + }, + { + "epoch": 10.508250825082508, + "grad_norm": 0.01433108001947403, + "learning_rate": 2.7143505546997166e-05, + "loss": 0.0087, + "num_input_tokens_seen": 20158880, + "step": 95520 + }, + { + "epoch": 10.508800880088009, + "grad_norm": 0.045600634068250656, + "learning_rate": 2.7141114307579664e-05, + "loss": 0.0089, + "num_input_tokens_seen": 20160000, + "step": 95525 + }, + { + "epoch": 10.50935093509351, + "grad_norm": 0.038763903081417084, + "learning_rate": 2.7138723048428534e-05, + "loss": 0.0184, + "num_input_tokens_seen": 20161088, + "step": 95530 + }, + { + "epoch": 10.509900990099009, + "grad_norm": 0.008810739032924175, + "learning_rate": 2.7136331769565816e-05, + "loss": 0.0019, + "num_input_tokens_seen": 20162144, + "step": 95535 + }, + { + "epoch": 10.51045104510451, + "grad_norm": 0.057726334780454636, + "learning_rate": 2.7133940471013552e-05, + "loss": 0.018, + "num_input_tokens_seen": 20163232, + "step": 95540 + }, + { + "epoch": 10.511001100110011, + "grad_norm": 2.292527914047241, + "learning_rate": 2.713154915279379e-05, + "loss": 0.1108, + "num_input_tokens_seen": 20164320, + "step": 95545 + }, + { + "epoch": 10.511551155115512, + "grad_norm": 0.029319193214178085, + "learning_rate": 2.712915781492856e-05, + "loss": 0.1001, + "num_input_tokens_seen": 20165408, + "step": 95550 + }, + { + "epoch": 10.512101210121012, + "grad_norm": 1.7140369415283203, + "learning_rate": 2.7126766457439902e-05, + "loss": 0.048, + "num_input_tokens_seen": 20166432, + "step": 95555 + }, + { + "epoch": 10.512651265126513, + "grad_norm": 0.09925566613674164, + "learning_rate": 2.7124375080349863e-05, + "loss": 0.1062, + "num_input_tokens_seen": 20167552, + "step": 95560 + }, + { + "epoch": 10.513201320132014, + "grad_norm": 1.32500422000885, + "learning_rate": 2.7121983683680474e-05, + "loss": 0.0336, + "num_input_tokens_seen": 20168576, + "step": 95565 + }, + { + "epoch": 10.513751375137513, + "grad_norm": 0.03156435862183571, + "learning_rate": 2.7119592267453775e-05, + "loss": 0.0272, + "num_input_tokens_seen": 20169696, + "step": 95570 + }, + { + "epoch": 10.514301430143014, + "grad_norm": 0.005876564420759678, + "learning_rate": 2.711720083169182e-05, + "loss": 0.0205, + "num_input_tokens_seen": 20170752, + "step": 95575 + }, + { + "epoch": 10.514851485148515, + "grad_norm": 0.6980698704719543, + "learning_rate": 2.7114809376416632e-05, + "loss": 0.0206, + "num_input_tokens_seen": 20171872, + "step": 95580 + }, + { + "epoch": 10.515401540154015, + "grad_norm": 5.050609588623047, + "learning_rate": 2.7112417901650267e-05, + "loss": 0.0211, + "num_input_tokens_seen": 20172960, + "step": 95585 + }, + { + "epoch": 10.515951595159516, + "grad_norm": 0.08774527162313461, + "learning_rate": 2.711002640741476e-05, + "loss": 0.0053, + "num_input_tokens_seen": 20174016, + "step": 95590 + }, + { + "epoch": 10.516501650165017, + "grad_norm": 1.4215924739837646, + "learning_rate": 2.7107634893732155e-05, + "loss": 0.0125, + "num_input_tokens_seen": 20175072, + "step": 95595 + }, + { + "epoch": 10.517051705170516, + "grad_norm": 1.4434891939163208, + "learning_rate": 2.7105243360624484e-05, + "loss": 0.0678, + "num_input_tokens_seen": 20176192, + "step": 95600 + }, + { + "epoch": 10.517601760176017, + "grad_norm": 0.04513878747820854, + "learning_rate": 2.71028518081138e-05, + "loss": 0.0281, + "num_input_tokens_seen": 20177184, + "step": 95605 + }, + { + "epoch": 10.518151815181518, + "grad_norm": 0.050868332386016846, + "learning_rate": 2.710046023622213e-05, + "loss": 0.0242, + "num_input_tokens_seen": 20178272, + "step": 95610 + }, + { + "epoch": 10.51870187018702, + "grad_norm": 0.020376743748784065, + "learning_rate": 2.7098068644971537e-05, + "loss": 0.0052, + "num_input_tokens_seen": 20179264, + "step": 95615 + }, + { + "epoch": 10.519251925192519, + "grad_norm": 0.03086468018591404, + "learning_rate": 2.7095677034384048e-05, + "loss": 0.0071, + "num_input_tokens_seen": 20180352, + "step": 95620 + }, + { + "epoch": 10.51980198019802, + "grad_norm": 0.9571753144264221, + "learning_rate": 2.709328540448171e-05, + "loss": 0.0174, + "num_input_tokens_seen": 20181344, + "step": 95625 + }, + { + "epoch": 10.520352035203521, + "grad_norm": 0.42627856135368347, + "learning_rate": 2.7090893755286566e-05, + "loss": 0.0038, + "num_input_tokens_seen": 20182400, + "step": 95630 + }, + { + "epoch": 10.52090209020902, + "grad_norm": 0.49474263191223145, + "learning_rate": 2.7088502086820656e-05, + "loss": 0.0083, + "num_input_tokens_seen": 20183360, + "step": 95635 + }, + { + "epoch": 10.521452145214521, + "grad_norm": 0.13920240104198456, + "learning_rate": 2.708611039910602e-05, + "loss": 0.0511, + "num_input_tokens_seen": 20184480, + "step": 95640 + }, + { + "epoch": 10.522002200220022, + "grad_norm": 0.13919198513031006, + "learning_rate": 2.7083718692164718e-05, + "loss": 0.0226, + "num_input_tokens_seen": 20185536, + "step": 95645 + }, + { + "epoch": 10.522552255225522, + "grad_norm": 0.20924252271652222, + "learning_rate": 2.708132696601877e-05, + "loss": 0.0109, + "num_input_tokens_seen": 20186656, + "step": 95650 + }, + { + "epoch": 10.523102310231023, + "grad_norm": 0.021692020818591118, + "learning_rate": 2.7078935220690228e-05, + "loss": 0.0022, + "num_input_tokens_seen": 20187680, + "step": 95655 + }, + { + "epoch": 10.523652365236524, + "grad_norm": 0.7369973659515381, + "learning_rate": 2.7076543456201142e-05, + "loss": 0.0168, + "num_input_tokens_seen": 20188672, + "step": 95660 + }, + { + "epoch": 10.524202420242025, + "grad_norm": 1.433346152305603, + "learning_rate": 2.7074151672573554e-05, + "loss": 0.0156, + "num_input_tokens_seen": 20189728, + "step": 95665 + }, + { + "epoch": 10.524752475247524, + "grad_norm": 0.46590396761894226, + "learning_rate": 2.7071759869829506e-05, + "loss": 0.0318, + "num_input_tokens_seen": 20190816, + "step": 95670 + }, + { + "epoch": 10.525302530253025, + "grad_norm": 0.007792499847710133, + "learning_rate": 2.7069368047991038e-05, + "loss": 0.0799, + "num_input_tokens_seen": 20191936, + "step": 95675 + }, + { + "epoch": 10.525852585258527, + "grad_norm": 0.023831188678741455, + "learning_rate": 2.7066976207080197e-05, + "loss": 0.0569, + "num_input_tokens_seen": 20192992, + "step": 95680 + }, + { + "epoch": 10.526402640264026, + "grad_norm": 2.0508134365081787, + "learning_rate": 2.7064584347119026e-05, + "loss": 0.0226, + "num_input_tokens_seen": 20194016, + "step": 95685 + }, + { + "epoch": 10.526952695269527, + "grad_norm": 0.14532819390296936, + "learning_rate": 2.706219246812958e-05, + "loss": 0.0039, + "num_input_tokens_seen": 20195072, + "step": 95690 + }, + { + "epoch": 10.527502750275028, + "grad_norm": 0.009915652684867382, + "learning_rate": 2.7059800570133887e-05, + "loss": 0.0035, + "num_input_tokens_seen": 20196064, + "step": 95695 + }, + { + "epoch": 10.528052805280527, + "grad_norm": 0.5741561651229858, + "learning_rate": 2.7057408653154005e-05, + "loss": 0.015, + "num_input_tokens_seen": 20197120, + "step": 95700 + }, + { + "epoch": 10.528602860286028, + "grad_norm": 0.011785753071308136, + "learning_rate": 2.705501671721198e-05, + "loss": 0.0093, + "num_input_tokens_seen": 20198176, + "step": 95705 + }, + { + "epoch": 10.52915291529153, + "grad_norm": 0.009926374070346355, + "learning_rate": 2.7052624762329844e-05, + "loss": 0.0052, + "num_input_tokens_seen": 20199232, + "step": 95710 + }, + { + "epoch": 10.52970297029703, + "grad_norm": 0.00586568471044302, + "learning_rate": 2.7050232788529643e-05, + "loss": 0.0718, + "num_input_tokens_seen": 20200320, + "step": 95715 + }, + { + "epoch": 10.53025302530253, + "grad_norm": 0.2865599989891052, + "learning_rate": 2.7047840795833453e-05, + "loss": 0.021, + "num_input_tokens_seen": 20201344, + "step": 95720 + }, + { + "epoch": 10.530803080308031, + "grad_norm": 0.03212910518050194, + "learning_rate": 2.7045448784263278e-05, + "loss": 0.0702, + "num_input_tokens_seen": 20202432, + "step": 95725 + }, + { + "epoch": 10.531353135313532, + "grad_norm": 0.004776779096573591, + "learning_rate": 2.7043056753841188e-05, + "loss": 0.0107, + "num_input_tokens_seen": 20203520, + "step": 95730 + }, + { + "epoch": 10.531903190319031, + "grad_norm": 0.16390973329544067, + "learning_rate": 2.7040664704589224e-05, + "loss": 0.0766, + "num_input_tokens_seen": 20204608, + "step": 95735 + }, + { + "epoch": 10.532453245324533, + "grad_norm": 1.3022984266281128, + "learning_rate": 2.7038272636529432e-05, + "loss": 0.0228, + "num_input_tokens_seen": 20205632, + "step": 95740 + }, + { + "epoch": 10.533003300330034, + "grad_norm": 0.026169095188379288, + "learning_rate": 2.7035880549683862e-05, + "loss": 0.0542, + "num_input_tokens_seen": 20206656, + "step": 95745 + }, + { + "epoch": 10.533553355335533, + "grad_norm": 1.6608786582946777, + "learning_rate": 2.703348844407456e-05, + "loss": 0.0201, + "num_input_tokens_seen": 20207712, + "step": 95750 + }, + { + "epoch": 10.534103410341034, + "grad_norm": 0.019945500418543816, + "learning_rate": 2.7031096319723565e-05, + "loss": 0.0014, + "num_input_tokens_seen": 20208704, + "step": 95755 + }, + { + "epoch": 10.534653465346535, + "grad_norm": 0.12091377377510071, + "learning_rate": 2.702870417665293e-05, + "loss": 0.0102, + "num_input_tokens_seen": 20209792, + "step": 95760 + }, + { + "epoch": 10.535203520352034, + "grad_norm": 0.2972275912761688, + "learning_rate": 2.7026312014884715e-05, + "loss": 0.0698, + "num_input_tokens_seen": 20210816, + "step": 95765 + }, + { + "epoch": 10.535753575357536, + "grad_norm": 0.6389543414115906, + "learning_rate": 2.7023919834440938e-05, + "loss": 0.0126, + "num_input_tokens_seen": 20211936, + "step": 95770 + }, + { + "epoch": 10.536303630363037, + "grad_norm": 0.010481179691851139, + "learning_rate": 2.702152763534367e-05, + "loss": 0.0056, + "num_input_tokens_seen": 20213024, + "step": 95775 + }, + { + "epoch": 10.536853685368538, + "grad_norm": 3.1210103034973145, + "learning_rate": 2.701913541761495e-05, + "loss": 0.0384, + "num_input_tokens_seen": 20214112, + "step": 95780 + }, + { + "epoch": 10.537403740374037, + "grad_norm": 0.01566345803439617, + "learning_rate": 2.701674318127682e-05, + "loss": 0.1237, + "num_input_tokens_seen": 20215168, + "step": 95785 + }, + { + "epoch": 10.537953795379538, + "grad_norm": 0.04124535247683525, + "learning_rate": 2.7014350926351352e-05, + "loss": 0.0021, + "num_input_tokens_seen": 20216224, + "step": 95790 + }, + { + "epoch": 10.53850385038504, + "grad_norm": 0.02696903795003891, + "learning_rate": 2.7011958652860568e-05, + "loss": 0.0643, + "num_input_tokens_seen": 20217312, + "step": 95795 + }, + { + "epoch": 10.539053905390539, + "grad_norm": 0.09426023066043854, + "learning_rate": 2.700956636082653e-05, + "loss": 0.0066, + "num_input_tokens_seen": 20218336, + "step": 95800 + }, + { + "epoch": 10.53960396039604, + "grad_norm": 0.013252890668809414, + "learning_rate": 2.7007174050271277e-05, + "loss": 0.0294, + "num_input_tokens_seen": 20219392, + "step": 95805 + }, + { + "epoch": 10.54015401540154, + "grad_norm": 0.035853952169418335, + "learning_rate": 2.700478172121687e-05, + "loss": 0.0382, + "num_input_tokens_seen": 20220448, + "step": 95810 + }, + { + "epoch": 10.54070407040704, + "grad_norm": 0.2115682065486908, + "learning_rate": 2.7002389373685357e-05, + "loss": 0.0351, + "num_input_tokens_seen": 20221472, + "step": 95815 + }, + { + "epoch": 10.541254125412541, + "grad_norm": 0.07932950556278229, + "learning_rate": 2.6999997007698775e-05, + "loss": 0.0561, + "num_input_tokens_seen": 20222528, + "step": 95820 + }, + { + "epoch": 10.541804180418042, + "grad_norm": 0.027178404852747917, + "learning_rate": 2.6997604623279184e-05, + "loss": 0.0401, + "num_input_tokens_seen": 20223552, + "step": 95825 + }, + { + "epoch": 10.542354235423542, + "grad_norm": 0.1069711446762085, + "learning_rate": 2.6995212220448617e-05, + "loss": 0.0135, + "num_input_tokens_seen": 20224544, + "step": 95830 + }, + { + "epoch": 10.542904290429043, + "grad_norm": 3.8409647941589355, + "learning_rate": 2.699281979922915e-05, + "loss": 0.143, + "num_input_tokens_seen": 20225600, + "step": 95835 + }, + { + "epoch": 10.543454345434544, + "grad_norm": 0.008292517624795437, + "learning_rate": 2.6990427359642822e-05, + "loss": 0.042, + "num_input_tokens_seen": 20226656, + "step": 95840 + }, + { + "epoch": 10.544004400440045, + "grad_norm": 0.05448915809392929, + "learning_rate": 2.6988034901711668e-05, + "loss": 0.0031, + "num_input_tokens_seen": 20227744, + "step": 95845 + }, + { + "epoch": 10.544554455445544, + "grad_norm": 0.2570318877696991, + "learning_rate": 2.6985642425457757e-05, + "loss": 0.0074, + "num_input_tokens_seen": 20228800, + "step": 95850 + }, + { + "epoch": 10.545104510451045, + "grad_norm": 1.1289081573486328, + "learning_rate": 2.698324993090313e-05, + "loss": 0.0056, + "num_input_tokens_seen": 20229856, + "step": 95855 + }, + { + "epoch": 10.545654565456546, + "grad_norm": 0.099274642765522, + "learning_rate": 2.6980857418069845e-05, + "loss": 0.0075, + "num_input_tokens_seen": 20230880, + "step": 95860 + }, + { + "epoch": 10.546204620462046, + "grad_norm": 0.0013531376607716084, + "learning_rate": 2.6978464886979943e-05, + "loss": 0.0197, + "num_input_tokens_seen": 20231936, + "step": 95865 + }, + { + "epoch": 10.546754675467547, + "grad_norm": 0.5550223588943481, + "learning_rate": 2.697607233765548e-05, + "loss": 0.0109, + "num_input_tokens_seen": 20233056, + "step": 95870 + }, + { + "epoch": 10.547304730473048, + "grad_norm": 0.003627970116212964, + "learning_rate": 2.6973679770118505e-05, + "loss": 0.0475, + "num_input_tokens_seen": 20234112, + "step": 95875 + }, + { + "epoch": 10.547854785478547, + "grad_norm": 1.7383848428726196, + "learning_rate": 2.697128718439107e-05, + "loss": 0.0923, + "num_input_tokens_seen": 20235232, + "step": 95880 + }, + { + "epoch": 10.548404840484048, + "grad_norm": 0.1469276249408722, + "learning_rate": 2.696889458049523e-05, + "loss": 0.0227, + "num_input_tokens_seen": 20236224, + "step": 95885 + }, + { + "epoch": 10.54895489548955, + "grad_norm": 0.030929364264011383, + "learning_rate": 2.6966501958453033e-05, + "loss": 0.0117, + "num_input_tokens_seen": 20237312, + "step": 95890 + }, + { + "epoch": 10.549504950495049, + "grad_norm": 0.011554358527064323, + "learning_rate": 2.6964109318286534e-05, + "loss": 0.0168, + "num_input_tokens_seen": 20238432, + "step": 95895 + }, + { + "epoch": 10.55005500550055, + "grad_norm": 0.00974667351692915, + "learning_rate": 2.6961716660017773e-05, + "loss": 0.0035, + "num_input_tokens_seen": 20239488, + "step": 95900 + }, + { + "epoch": 10.55060506050605, + "grad_norm": 0.2621001899242401, + "learning_rate": 2.6959323983668815e-05, + "loss": 0.0031, + "num_input_tokens_seen": 20240576, + "step": 95905 + }, + { + "epoch": 10.551155115511552, + "grad_norm": 0.1834576576948166, + "learning_rate": 2.6956931289261715e-05, + "loss": 0.0076, + "num_input_tokens_seen": 20241696, + "step": 95910 + }, + { + "epoch": 10.551705170517051, + "grad_norm": 0.005685308016836643, + "learning_rate": 2.6954538576818512e-05, + "loss": 0.1429, + "num_input_tokens_seen": 20242752, + "step": 95915 + }, + { + "epoch": 10.552255225522552, + "grad_norm": 1.1802188158035278, + "learning_rate": 2.6952145846361266e-05, + "loss": 0.0193, + "num_input_tokens_seen": 20243776, + "step": 95920 + }, + { + "epoch": 10.552805280528053, + "grad_norm": 0.01852373033761978, + "learning_rate": 2.6949753097912027e-05, + "loss": 0.0026, + "num_input_tokens_seen": 20244800, + "step": 95925 + }, + { + "epoch": 10.553355335533553, + "grad_norm": 0.5661510229110718, + "learning_rate": 2.6947360331492855e-05, + "loss": 0.0091, + "num_input_tokens_seen": 20245824, + "step": 95930 + }, + { + "epoch": 10.553905390539054, + "grad_norm": 0.004904552362859249, + "learning_rate": 2.6944967547125794e-05, + "loss": 0.0401, + "num_input_tokens_seen": 20246816, + "step": 95935 + }, + { + "epoch": 10.554455445544555, + "grad_norm": 1.101449966430664, + "learning_rate": 2.69425747448329e-05, + "loss": 0.079, + "num_input_tokens_seen": 20247904, + "step": 95940 + }, + { + "epoch": 10.555005500550054, + "grad_norm": 0.03721184283494949, + "learning_rate": 2.694018192463622e-05, + "loss": 0.0112, + "num_input_tokens_seen": 20248928, + "step": 95945 + }, + { + "epoch": 10.555555555555555, + "grad_norm": 0.03662886098027229, + "learning_rate": 2.6937789086557817e-05, + "loss": 0.0163, + "num_input_tokens_seen": 20250016, + "step": 95950 + }, + { + "epoch": 10.556105610561056, + "grad_norm": 0.03133774921298027, + "learning_rate": 2.6935396230619752e-05, + "loss": 0.0085, + "num_input_tokens_seen": 20251040, + "step": 95955 + }, + { + "epoch": 10.556655665566556, + "grad_norm": 0.004530050326138735, + "learning_rate": 2.693300335684406e-05, + "loss": 0.0046, + "num_input_tokens_seen": 20252128, + "step": 95960 + }, + { + "epoch": 10.557205720572057, + "grad_norm": 0.01551052276045084, + "learning_rate": 2.6930610465252802e-05, + "loss": 0.0061, + "num_input_tokens_seen": 20253120, + "step": 95965 + }, + { + "epoch": 10.557755775577558, + "grad_norm": 0.8054324388504028, + "learning_rate": 2.6928217555868042e-05, + "loss": 0.0086, + "num_input_tokens_seen": 20254144, + "step": 95970 + }, + { + "epoch": 10.558305830583059, + "grad_norm": 0.04444167762994766, + "learning_rate": 2.6925824628711816e-05, + "loss": 0.0026, + "num_input_tokens_seen": 20255168, + "step": 95975 + }, + { + "epoch": 10.558855885588558, + "grad_norm": 1.5549290180206299, + "learning_rate": 2.69234316838062e-05, + "loss": 0.0093, + "num_input_tokens_seen": 20256192, + "step": 95980 + }, + { + "epoch": 10.55940594059406, + "grad_norm": 0.02870912291109562, + "learning_rate": 2.6921038721173232e-05, + "loss": 0.0075, + "num_input_tokens_seen": 20257216, + "step": 95985 + }, + { + "epoch": 10.55995599559956, + "grad_norm": 0.014626648277044296, + "learning_rate": 2.6918645740834965e-05, + "loss": 0.0016, + "num_input_tokens_seen": 20258304, + "step": 95990 + }, + { + "epoch": 10.56050605060506, + "grad_norm": 0.7466152310371399, + "learning_rate": 2.691625274281347e-05, + "loss": 0.0385, + "num_input_tokens_seen": 20259296, + "step": 95995 + }, + { + "epoch": 10.561056105610561, + "grad_norm": 0.4943726062774658, + "learning_rate": 2.6913859727130786e-05, + "loss": 0.0089, + "num_input_tokens_seen": 20260320, + "step": 96000 + }, + { + "epoch": 10.561606160616062, + "grad_norm": 0.0466628335416317, + "learning_rate": 2.6911466693808978e-05, + "loss": 0.0025, + "num_input_tokens_seen": 20261440, + "step": 96005 + }, + { + "epoch": 10.562156215621561, + "grad_norm": 0.008488364517688751, + "learning_rate": 2.69090736428701e-05, + "loss": 0.0017, + "num_input_tokens_seen": 20262464, + "step": 96010 + }, + { + "epoch": 10.562706270627062, + "grad_norm": 0.03436907008290291, + "learning_rate": 2.690668057433621e-05, + "loss": 0.0019, + "num_input_tokens_seen": 20263552, + "step": 96015 + }, + { + "epoch": 10.563256325632564, + "grad_norm": 0.01500493660569191, + "learning_rate": 2.690428748822935e-05, + "loss": 0.003, + "num_input_tokens_seen": 20264608, + "step": 96020 + }, + { + "epoch": 10.563806380638063, + "grad_norm": 2.2030296325683594, + "learning_rate": 2.6901894384571597e-05, + "loss": 0.1902, + "num_input_tokens_seen": 20265632, + "step": 96025 + }, + { + "epoch": 10.564356435643564, + "grad_norm": 0.04986492171883583, + "learning_rate": 2.6899501263384985e-05, + "loss": 0.1202, + "num_input_tokens_seen": 20266688, + "step": 96030 + }, + { + "epoch": 10.564906490649065, + "grad_norm": 0.00845902319997549, + "learning_rate": 2.689710812469159e-05, + "loss": 0.0366, + "num_input_tokens_seen": 20267808, + "step": 96035 + }, + { + "epoch": 10.565456545654566, + "grad_norm": 0.033259909600019455, + "learning_rate": 2.6894714968513457e-05, + "loss": 0.0121, + "num_input_tokens_seen": 20268928, + "step": 96040 + }, + { + "epoch": 10.566006600660065, + "grad_norm": 0.16270558536052704, + "learning_rate": 2.689232179487265e-05, + "loss": 0.0055, + "num_input_tokens_seen": 20270016, + "step": 96045 + }, + { + "epoch": 10.566556655665567, + "grad_norm": 1.707519292831421, + "learning_rate": 2.6889928603791216e-05, + "loss": 0.09, + "num_input_tokens_seen": 20271040, + "step": 96050 + }, + { + "epoch": 10.567106710671068, + "grad_norm": 0.03672223910689354, + "learning_rate": 2.6887535395291214e-05, + "loss": 0.0257, + "num_input_tokens_seen": 20272032, + "step": 96055 + }, + { + "epoch": 10.567656765676567, + "grad_norm": 0.045280251652002335, + "learning_rate": 2.6885142169394707e-05, + "loss": 0.0037, + "num_input_tokens_seen": 20273056, + "step": 96060 + }, + { + "epoch": 10.568206820682068, + "grad_norm": 0.03732091933488846, + "learning_rate": 2.6882748926123748e-05, + "loss": 0.0123, + "num_input_tokens_seen": 20274048, + "step": 96065 + }, + { + "epoch": 10.56875687568757, + "grad_norm": 0.005423851311206818, + "learning_rate": 2.6880355665500395e-05, + "loss": 0.0694, + "num_input_tokens_seen": 20275168, + "step": 96070 + }, + { + "epoch": 10.569306930693068, + "grad_norm": 0.6549699902534485, + "learning_rate": 2.6877962387546713e-05, + "loss": 0.0617, + "num_input_tokens_seen": 20276224, + "step": 96075 + }, + { + "epoch": 10.56985698569857, + "grad_norm": 0.028867457062005997, + "learning_rate": 2.6875569092284747e-05, + "loss": 0.0829, + "num_input_tokens_seen": 20277312, + "step": 96080 + }, + { + "epoch": 10.57040704070407, + "grad_norm": 2.198397636413574, + "learning_rate": 2.687317577973656e-05, + "loss": 0.0815, + "num_input_tokens_seen": 20278400, + "step": 96085 + }, + { + "epoch": 10.570957095709572, + "grad_norm": 0.02752184122800827, + "learning_rate": 2.6870782449924207e-05, + "loss": 0.0063, + "num_input_tokens_seen": 20279392, + "step": 96090 + }, + { + "epoch": 10.571507150715071, + "grad_norm": 5.369253635406494, + "learning_rate": 2.6868389102869758e-05, + "loss": 0.234, + "num_input_tokens_seen": 20280416, + "step": 96095 + }, + { + "epoch": 10.572057205720572, + "grad_norm": 0.1709415763616562, + "learning_rate": 2.6865995738595267e-05, + "loss": 0.0032, + "num_input_tokens_seen": 20281504, + "step": 96100 + }, + { + "epoch": 10.572607260726073, + "grad_norm": 0.007451239041984081, + "learning_rate": 2.6863602357122775e-05, + "loss": 0.0047, + "num_input_tokens_seen": 20282560, + "step": 96105 + }, + { + "epoch": 10.573157315731573, + "grad_norm": 0.02342643029987812, + "learning_rate": 2.686120895847436e-05, + "loss": 0.004, + "num_input_tokens_seen": 20283648, + "step": 96110 + }, + { + "epoch": 10.573707370737074, + "grad_norm": 0.07913766801357269, + "learning_rate": 2.6858815542672077e-05, + "loss": 0.0017, + "num_input_tokens_seen": 20284704, + "step": 96115 + }, + { + "epoch": 10.574257425742575, + "grad_norm": 0.013936171308159828, + "learning_rate": 2.685642210973799e-05, + "loss": 0.0049, + "num_input_tokens_seen": 20285792, + "step": 96120 + }, + { + "epoch": 10.574807480748074, + "grad_norm": 0.2774604558944702, + "learning_rate": 2.6854028659694142e-05, + "loss": 0.0264, + "num_input_tokens_seen": 20286880, + "step": 96125 + }, + { + "epoch": 10.575357535753575, + "grad_norm": 0.04217385873198509, + "learning_rate": 2.68516351925626e-05, + "loss": 0.0551, + "num_input_tokens_seen": 20287936, + "step": 96130 + }, + { + "epoch": 10.575907590759076, + "grad_norm": 0.011337440460920334, + "learning_rate": 2.6849241708365423e-05, + "loss": 0.0054, + "num_input_tokens_seen": 20289024, + "step": 96135 + }, + { + "epoch": 10.576457645764577, + "grad_norm": 0.055087532848119736, + "learning_rate": 2.6846848207124682e-05, + "loss": 0.1158, + "num_input_tokens_seen": 20290016, + "step": 96140 + }, + { + "epoch": 10.577007700770077, + "grad_norm": 0.02056301012635231, + "learning_rate": 2.684445468886242e-05, + "loss": 0.0083, + "num_input_tokens_seen": 20291104, + "step": 96145 + }, + { + "epoch": 10.577557755775578, + "grad_norm": 0.08445147424936295, + "learning_rate": 2.6842061153600712e-05, + "loss": 0.0713, + "num_input_tokens_seen": 20292160, + "step": 96150 + }, + { + "epoch": 10.578107810781079, + "grad_norm": 0.0359918437898159, + "learning_rate": 2.6839667601361608e-05, + "loss": 0.005, + "num_input_tokens_seen": 20293216, + "step": 96155 + }, + { + "epoch": 10.578657865786578, + "grad_norm": 0.019641509279608727, + "learning_rate": 2.6837274032167164e-05, + "loss": 0.0031, + "num_input_tokens_seen": 20294240, + "step": 96160 + }, + { + "epoch": 10.57920792079208, + "grad_norm": 0.03248327597975731, + "learning_rate": 2.6834880446039444e-05, + "loss": 0.0051, + "num_input_tokens_seen": 20295296, + "step": 96165 + }, + { + "epoch": 10.57975797579758, + "grad_norm": 0.08414963632822037, + "learning_rate": 2.6832486843000528e-05, + "loss": 0.0083, + "num_input_tokens_seen": 20296320, + "step": 96170 + }, + { + "epoch": 10.58030803080308, + "grad_norm": 0.007967826910316944, + "learning_rate": 2.6830093223072446e-05, + "loss": 0.0047, + "num_input_tokens_seen": 20297376, + "step": 96175 + }, + { + "epoch": 10.58085808580858, + "grad_norm": 0.050575967878103256, + "learning_rate": 2.6827699586277278e-05, + "loss": 0.0412, + "num_input_tokens_seen": 20298432, + "step": 96180 + }, + { + "epoch": 10.581408140814082, + "grad_norm": 0.09775734692811966, + "learning_rate": 2.6825305932637078e-05, + "loss": 0.0247, + "num_input_tokens_seen": 20299488, + "step": 96185 + }, + { + "epoch": 10.581958195819581, + "grad_norm": 0.10747279226779938, + "learning_rate": 2.6822912262173912e-05, + "loss": 0.0074, + "num_input_tokens_seen": 20300576, + "step": 96190 + }, + { + "epoch": 10.582508250825082, + "grad_norm": 0.13154155015945435, + "learning_rate": 2.6820518574909838e-05, + "loss": 0.0022, + "num_input_tokens_seen": 20301632, + "step": 96195 + }, + { + "epoch": 10.583058305830583, + "grad_norm": 3.119840621948242, + "learning_rate": 2.681812487086691e-05, + "loss": 0.0585, + "num_input_tokens_seen": 20302656, + "step": 96200 + }, + { + "epoch": 10.583608360836084, + "grad_norm": 0.0319250263273716, + "learning_rate": 2.6815731150067204e-05, + "loss": 0.0074, + "num_input_tokens_seen": 20303712, + "step": 96205 + }, + { + "epoch": 10.584158415841584, + "grad_norm": 0.01972520723938942, + "learning_rate": 2.6813337412532775e-05, + "loss": 0.0009, + "num_input_tokens_seen": 20304704, + "step": 96210 + }, + { + "epoch": 10.584708470847085, + "grad_norm": 2.975860118865967, + "learning_rate": 2.681094365828569e-05, + "loss": 0.0727, + "num_input_tokens_seen": 20305728, + "step": 96215 + }, + { + "epoch": 10.585258525852586, + "grad_norm": 0.03840724751353264, + "learning_rate": 2.6808549887348e-05, + "loss": 0.0028, + "num_input_tokens_seen": 20306720, + "step": 96220 + }, + { + "epoch": 10.585808580858085, + "grad_norm": 2.067070960998535, + "learning_rate": 2.6806156099741776e-05, + "loss": 0.2429, + "num_input_tokens_seen": 20307808, + "step": 96225 + }, + { + "epoch": 10.586358635863586, + "grad_norm": 0.29171860218048096, + "learning_rate": 2.6803762295489075e-05, + "loss": 0.01, + "num_input_tokens_seen": 20308896, + "step": 96230 + }, + { + "epoch": 10.586908690869087, + "grad_norm": 0.010882616974413395, + "learning_rate": 2.680136847461196e-05, + "loss": 0.024, + "num_input_tokens_seen": 20309952, + "step": 96235 + }, + { + "epoch": 10.587458745874587, + "grad_norm": 0.024560129269957542, + "learning_rate": 2.6798974637132505e-05, + "loss": 0.0092, + "num_input_tokens_seen": 20310944, + "step": 96240 + }, + { + "epoch": 10.588008800880088, + "grad_norm": 0.012928021140396595, + "learning_rate": 2.679658078307276e-05, + "loss": 0.0031, + "num_input_tokens_seen": 20312000, + "step": 96245 + }, + { + "epoch": 10.588558855885589, + "grad_norm": 0.07251473516225815, + "learning_rate": 2.6794186912454787e-05, + "loss": 0.0096, + "num_input_tokens_seen": 20313056, + "step": 96250 + }, + { + "epoch": 10.589108910891088, + "grad_norm": 0.7315092086791992, + "learning_rate": 2.679179302530066e-05, + "loss": 0.0141, + "num_input_tokens_seen": 20314208, + "step": 96255 + }, + { + "epoch": 10.58965896589659, + "grad_norm": 0.015670398250222206, + "learning_rate": 2.6789399121632436e-05, + "loss": 0.0036, + "num_input_tokens_seen": 20315296, + "step": 96260 + }, + { + "epoch": 10.59020902090209, + "grad_norm": 0.01933716982603073, + "learning_rate": 2.678700520147218e-05, + "loss": 0.0134, + "num_input_tokens_seen": 20316320, + "step": 96265 + }, + { + "epoch": 10.590759075907592, + "grad_norm": 0.0682494044303894, + "learning_rate": 2.6784611264841952e-05, + "loss": 0.0015, + "num_input_tokens_seen": 20317344, + "step": 96270 + }, + { + "epoch": 10.591309130913091, + "grad_norm": 0.007507672533392906, + "learning_rate": 2.6782217311763824e-05, + "loss": 0.0064, + "num_input_tokens_seen": 20318336, + "step": 96275 + }, + { + "epoch": 10.591859185918592, + "grad_norm": 1.4499751329421997, + "learning_rate": 2.677982334225984e-05, + "loss": 0.1163, + "num_input_tokens_seen": 20319328, + "step": 96280 + }, + { + "epoch": 10.592409240924093, + "grad_norm": 0.028198260813951492, + "learning_rate": 2.6777429356352092e-05, + "loss": 0.0021, + "num_input_tokens_seen": 20320384, + "step": 96285 + }, + { + "epoch": 10.592959295929592, + "grad_norm": 0.013860420323908329, + "learning_rate": 2.6775035354062628e-05, + "loss": 0.0024, + "num_input_tokens_seen": 20321504, + "step": 96290 + }, + { + "epoch": 10.593509350935093, + "grad_norm": 0.0055585806258022785, + "learning_rate": 2.677264133541351e-05, + "loss": 0.0139, + "num_input_tokens_seen": 20322592, + "step": 96295 + }, + { + "epoch": 10.594059405940595, + "grad_norm": 0.07098833471536636, + "learning_rate": 2.677024730042681e-05, + "loss": 0.0033, + "num_input_tokens_seen": 20323648, + "step": 96300 + }, + { + "epoch": 10.594609460946094, + "grad_norm": 0.0162068922072649, + "learning_rate": 2.676785324912459e-05, + "loss": 0.0024, + "num_input_tokens_seen": 20324704, + "step": 96305 + }, + { + "epoch": 10.595159515951595, + "grad_norm": 0.183035746216774, + "learning_rate": 2.676545918152892e-05, + "loss": 0.008, + "num_input_tokens_seen": 20325760, + "step": 96310 + }, + { + "epoch": 10.595709570957096, + "grad_norm": 0.588839054107666, + "learning_rate": 2.6763065097661854e-05, + "loss": 0.0076, + "num_input_tokens_seen": 20326752, + "step": 96315 + }, + { + "epoch": 10.596259625962595, + "grad_norm": 0.08704076707363129, + "learning_rate": 2.6760670997545468e-05, + "loss": 0.0038, + "num_input_tokens_seen": 20327840, + "step": 96320 + }, + { + "epoch": 10.596809680968097, + "grad_norm": 0.9249750971794128, + "learning_rate": 2.675827688120181e-05, + "loss": 0.111, + "num_input_tokens_seen": 20328864, + "step": 96325 + }, + { + "epoch": 10.597359735973598, + "grad_norm": 1.246404767036438, + "learning_rate": 2.6755882748652967e-05, + "loss": 0.1177, + "num_input_tokens_seen": 20329920, + "step": 96330 + }, + { + "epoch": 10.597909790979099, + "grad_norm": 0.013127594254910946, + "learning_rate": 2.6753488599920996e-05, + "loss": 0.0401, + "num_input_tokens_seen": 20330976, + "step": 96335 + }, + { + "epoch": 10.598459845984598, + "grad_norm": 0.0719771757721901, + "learning_rate": 2.6751094435027962e-05, + "loss": 0.0027, + "num_input_tokens_seen": 20332064, + "step": 96340 + }, + { + "epoch": 10.599009900990099, + "grad_norm": 0.134942889213562, + "learning_rate": 2.6748700253995928e-05, + "loss": 0.007, + "num_input_tokens_seen": 20333120, + "step": 96345 + }, + { + "epoch": 10.5995599559956, + "grad_norm": 0.028208767995238304, + "learning_rate": 2.6746306056846964e-05, + "loss": 0.1, + "num_input_tokens_seen": 20334176, + "step": 96350 + }, + { + "epoch": 10.6001100110011, + "grad_norm": 1.3239539861679077, + "learning_rate": 2.674391184360313e-05, + "loss": 0.037, + "num_input_tokens_seen": 20335200, + "step": 96355 + }, + { + "epoch": 10.6006600660066, + "grad_norm": 0.023691929876804352, + "learning_rate": 2.674151761428651e-05, + "loss": 0.0644, + "num_input_tokens_seen": 20336256, + "step": 96360 + }, + { + "epoch": 10.601210121012102, + "grad_norm": 0.9158712029457092, + "learning_rate": 2.6739123368919145e-05, + "loss": 0.0788, + "num_input_tokens_seen": 20337312, + "step": 96365 + }, + { + "epoch": 10.601760176017601, + "grad_norm": 0.048889171332120895, + "learning_rate": 2.673672910752312e-05, + "loss": 0.0765, + "num_input_tokens_seen": 20338400, + "step": 96370 + }, + { + "epoch": 10.602310231023102, + "grad_norm": 0.340175598859787, + "learning_rate": 2.67343348301205e-05, + "loss": 0.0246, + "num_input_tokens_seen": 20339552, + "step": 96375 + }, + { + "epoch": 10.602860286028603, + "grad_norm": 1.6498517990112305, + "learning_rate": 2.673194053673334e-05, + "loss": 0.0845, + "num_input_tokens_seen": 20340608, + "step": 96380 + }, + { + "epoch": 10.603410341034103, + "grad_norm": 0.0742601677775383, + "learning_rate": 2.6729546227383723e-05, + "loss": 0.0469, + "num_input_tokens_seen": 20341664, + "step": 96385 + }, + { + "epoch": 10.603960396039604, + "grad_norm": 0.01195578183978796, + "learning_rate": 2.6727151902093707e-05, + "loss": 0.008, + "num_input_tokens_seen": 20342720, + "step": 96390 + }, + { + "epoch": 10.604510451045105, + "grad_norm": 0.03375551104545593, + "learning_rate": 2.672475756088535e-05, + "loss": 0.1724, + "num_input_tokens_seen": 20343712, + "step": 96395 + }, + { + "epoch": 10.605060506050606, + "grad_norm": 0.03792525455355644, + "learning_rate": 2.6722363203780744e-05, + "loss": 0.0338, + "num_input_tokens_seen": 20344768, + "step": 96400 + }, + { + "epoch": 10.605610561056105, + "grad_norm": 0.0035986043512821198, + "learning_rate": 2.671996883080194e-05, + "loss": 0.0911, + "num_input_tokens_seen": 20345824, + "step": 96405 + }, + { + "epoch": 10.606160616061606, + "grad_norm": 0.04612121731042862, + "learning_rate": 2.6717574441971e-05, + "loss": 0.0403, + "num_input_tokens_seen": 20346912, + "step": 96410 + }, + { + "epoch": 10.606710671067107, + "grad_norm": 0.13845527172088623, + "learning_rate": 2.6715180037310006e-05, + "loss": 0.007, + "num_input_tokens_seen": 20348000, + "step": 96415 + }, + { + "epoch": 10.607260726072607, + "grad_norm": 0.043494705110788345, + "learning_rate": 2.6712785616841026e-05, + "loss": 0.0135, + "num_input_tokens_seen": 20349120, + "step": 96420 + }, + { + "epoch": 10.607810781078108, + "grad_norm": 1.783471941947937, + "learning_rate": 2.671039118058611e-05, + "loss": 0.1043, + "num_input_tokens_seen": 20350272, + "step": 96425 + }, + { + "epoch": 10.608360836083609, + "grad_norm": 0.027829425409436226, + "learning_rate": 2.670799672856735e-05, + "loss": 0.0362, + "num_input_tokens_seen": 20351296, + "step": 96430 + }, + { + "epoch": 10.608910891089108, + "grad_norm": 0.012390457093715668, + "learning_rate": 2.6705602260806807e-05, + "loss": 0.0719, + "num_input_tokens_seen": 20352416, + "step": 96435 + }, + { + "epoch": 10.60946094609461, + "grad_norm": 0.07623951882123947, + "learning_rate": 2.6703207777326535e-05, + "loss": 0.0042, + "num_input_tokens_seen": 20353536, + "step": 96440 + }, + { + "epoch": 10.61001100110011, + "grad_norm": 0.03449136018753052, + "learning_rate": 2.6700813278148617e-05, + "loss": 0.0049, + "num_input_tokens_seen": 20354656, + "step": 96445 + }, + { + "epoch": 10.61056105610561, + "grad_norm": 3.1777029037475586, + "learning_rate": 2.6698418763295125e-05, + "loss": 0.0404, + "num_input_tokens_seen": 20355744, + "step": 96450 + }, + { + "epoch": 10.61111111111111, + "grad_norm": 0.014435694552958012, + "learning_rate": 2.669602423278812e-05, + "loss": 0.032, + "num_input_tokens_seen": 20356800, + "step": 96455 + }, + { + "epoch": 10.611661166116612, + "grad_norm": 3.370807647705078, + "learning_rate": 2.669362968664967e-05, + "loss": 0.023, + "num_input_tokens_seen": 20357824, + "step": 96460 + }, + { + "epoch": 10.612211221122113, + "grad_norm": 0.009103793650865555, + "learning_rate": 2.6691235124901854e-05, + "loss": 0.0099, + "num_input_tokens_seen": 20358816, + "step": 96465 + }, + { + "epoch": 10.612761276127612, + "grad_norm": 0.1395365446805954, + "learning_rate": 2.6688840547566723e-05, + "loss": 0.0421, + "num_input_tokens_seen": 20359872, + "step": 96470 + }, + { + "epoch": 10.613311331133113, + "grad_norm": 0.023357832804322243, + "learning_rate": 2.6686445954666368e-05, + "loss": 0.0067, + "num_input_tokens_seen": 20360928, + "step": 96475 + }, + { + "epoch": 10.613861386138614, + "grad_norm": 0.018374286592006683, + "learning_rate": 2.668405134622285e-05, + "loss": 0.0385, + "num_input_tokens_seen": 20362080, + "step": 96480 + }, + { + "epoch": 10.614411441144114, + "grad_norm": 0.1669798046350479, + "learning_rate": 2.668165672225824e-05, + "loss": 0.0082, + "num_input_tokens_seen": 20363104, + "step": 96485 + }, + { + "epoch": 10.614961496149615, + "grad_norm": 0.09363231062889099, + "learning_rate": 2.6679262082794604e-05, + "loss": 0.006, + "num_input_tokens_seen": 20364128, + "step": 96490 + }, + { + "epoch": 10.615511551155116, + "grad_norm": 0.01360940933227539, + "learning_rate": 2.667686742785402e-05, + "loss": 0.0677, + "num_input_tokens_seen": 20365152, + "step": 96495 + }, + { + "epoch": 10.616061606160617, + "grad_norm": 0.12882880866527557, + "learning_rate": 2.6674472757458546e-05, + "loss": 0.0771, + "num_input_tokens_seen": 20366208, + "step": 96500 + }, + { + "epoch": 10.616611661166116, + "grad_norm": 1.1125004291534424, + "learning_rate": 2.6672078071630268e-05, + "loss": 0.1397, + "num_input_tokens_seen": 20367264, + "step": 96505 + }, + { + "epoch": 10.617161716171617, + "grad_norm": 0.13549117743968964, + "learning_rate": 2.666968337039124e-05, + "loss": 0.0588, + "num_input_tokens_seen": 20368320, + "step": 96510 + }, + { + "epoch": 10.617711771177119, + "grad_norm": 0.0482778325676918, + "learning_rate": 2.6667288653763544e-05, + "loss": 0.0439, + "num_input_tokens_seen": 20369408, + "step": 96515 + }, + { + "epoch": 10.618261826182618, + "grad_norm": 1.7372424602508545, + "learning_rate": 2.6664893921769253e-05, + "loss": 0.0856, + "num_input_tokens_seen": 20370496, + "step": 96520 + }, + { + "epoch": 10.618811881188119, + "grad_norm": 0.817634642124176, + "learning_rate": 2.666249917443043e-05, + "loss": 0.059, + "num_input_tokens_seen": 20371584, + "step": 96525 + }, + { + "epoch": 10.61936193619362, + "grad_norm": 0.13592877984046936, + "learning_rate": 2.6660104411769153e-05, + "loss": 0.0201, + "num_input_tokens_seen": 20372640, + "step": 96530 + }, + { + "epoch": 10.61991199119912, + "grad_norm": 0.24366873502731323, + "learning_rate": 2.6657709633807492e-05, + "loss": 0.0823, + "num_input_tokens_seen": 20373696, + "step": 96535 + }, + { + "epoch": 10.62046204620462, + "grad_norm": 0.04302978515625, + "learning_rate": 2.6655314840567508e-05, + "loss": 0.0199, + "num_input_tokens_seen": 20374656, + "step": 96540 + }, + { + "epoch": 10.621012101210122, + "grad_norm": 0.07996221631765366, + "learning_rate": 2.6652920032071288e-05, + "loss": 0.0267, + "num_input_tokens_seen": 20375712, + "step": 96545 + }, + { + "epoch": 10.62156215621562, + "grad_norm": 0.021530449390411377, + "learning_rate": 2.66505252083409e-05, + "loss": 0.0068, + "num_input_tokens_seen": 20376832, + "step": 96550 + }, + { + "epoch": 10.622112211221122, + "grad_norm": 0.014952141791582108, + "learning_rate": 2.6648130369398406e-05, + "loss": 0.0046, + "num_input_tokens_seen": 20377920, + "step": 96555 + }, + { + "epoch": 10.622662266226623, + "grad_norm": 0.24985773861408234, + "learning_rate": 2.664573551526589e-05, + "loss": 0.0165, + "num_input_tokens_seen": 20378976, + "step": 96560 + }, + { + "epoch": 10.623212321232124, + "grad_norm": 0.08533833920955658, + "learning_rate": 2.6643340645965414e-05, + "loss": 0.0284, + "num_input_tokens_seen": 20380064, + "step": 96565 + }, + { + "epoch": 10.623762376237623, + "grad_norm": 0.08870698511600494, + "learning_rate": 2.6640945761519053e-05, + "loss": 0.0115, + "num_input_tokens_seen": 20381120, + "step": 96570 + }, + { + "epoch": 10.624312431243125, + "grad_norm": 0.5983294248580933, + "learning_rate": 2.6638550861948895e-05, + "loss": 0.0061, + "num_input_tokens_seen": 20382208, + "step": 96575 + }, + { + "epoch": 10.624862486248626, + "grad_norm": 0.013281687162816525, + "learning_rate": 2.6636155947276997e-05, + "loss": 0.1312, + "num_input_tokens_seen": 20383296, + "step": 96580 + }, + { + "epoch": 10.625412541254125, + "grad_norm": 0.06833023577928543, + "learning_rate": 2.6633761017525426e-05, + "loss": 0.0088, + "num_input_tokens_seen": 20384352, + "step": 96585 + }, + { + "epoch": 10.625962596259626, + "grad_norm": 0.027308478951454163, + "learning_rate": 2.6631366072716275e-05, + "loss": 0.1002, + "num_input_tokens_seen": 20385472, + "step": 96590 + }, + { + "epoch": 10.626512651265127, + "grad_norm": 0.049337238073349, + "learning_rate": 2.6628971112871605e-05, + "loss": 0.0035, + "num_input_tokens_seen": 20386496, + "step": 96595 + }, + { + "epoch": 10.627062706270626, + "grad_norm": 0.020478229969739914, + "learning_rate": 2.6626576138013476e-05, + "loss": 0.0055, + "num_input_tokens_seen": 20387552, + "step": 96600 + }, + { + "epoch": 10.627612761276128, + "grad_norm": 0.008620054461061954, + "learning_rate": 2.6624181148163985e-05, + "loss": 0.019, + "num_input_tokens_seen": 20388608, + "step": 96605 + }, + { + "epoch": 10.628162816281629, + "grad_norm": 0.9452158808708191, + "learning_rate": 2.6621786143345196e-05, + "loss": 0.0482, + "num_input_tokens_seen": 20389664, + "step": 96610 + }, + { + "epoch": 10.628712871287128, + "grad_norm": 2.332467555999756, + "learning_rate": 2.661939112357918e-05, + "loss": 0.1465, + "num_input_tokens_seen": 20390688, + "step": 96615 + }, + { + "epoch": 10.629262926292629, + "grad_norm": 0.012403865344822407, + "learning_rate": 2.661699608888802e-05, + "loss": 0.0352, + "num_input_tokens_seen": 20391776, + "step": 96620 + }, + { + "epoch": 10.62981298129813, + "grad_norm": 0.7514627575874329, + "learning_rate": 2.6614601039293774e-05, + "loss": 0.0821, + "num_input_tokens_seen": 20392832, + "step": 96625 + }, + { + "epoch": 10.630363036303631, + "grad_norm": 0.06679404526948929, + "learning_rate": 2.6612205974818526e-05, + "loss": 0.0232, + "num_input_tokens_seen": 20393824, + "step": 96630 + }, + { + "epoch": 10.63091309130913, + "grad_norm": 0.024611946195364, + "learning_rate": 2.6609810895484348e-05, + "loss": 0.0174, + "num_input_tokens_seen": 20394944, + "step": 96635 + }, + { + "epoch": 10.631463146314632, + "grad_norm": 0.0536518320441246, + "learning_rate": 2.6607415801313324e-05, + "loss": 0.0048, + "num_input_tokens_seen": 20395936, + "step": 96640 + }, + { + "epoch": 10.632013201320133, + "grad_norm": 0.018614836037158966, + "learning_rate": 2.6605020692327514e-05, + "loss": 0.018, + "num_input_tokens_seen": 20396928, + "step": 96645 + }, + { + "epoch": 10.632563256325632, + "grad_norm": 0.13950002193450928, + "learning_rate": 2.6602625568549e-05, + "loss": 0.0062, + "num_input_tokens_seen": 20398016, + "step": 96650 + }, + { + "epoch": 10.633113311331133, + "grad_norm": 0.019895905628800392, + "learning_rate": 2.6600230429999852e-05, + "loss": 0.0062, + "num_input_tokens_seen": 20399072, + "step": 96655 + }, + { + "epoch": 10.633663366336634, + "grad_norm": 0.031392499804496765, + "learning_rate": 2.6597835276702147e-05, + "loss": 0.07, + "num_input_tokens_seen": 20400192, + "step": 96660 + }, + { + "epoch": 10.634213421342134, + "grad_norm": 0.07014445215463638, + "learning_rate": 2.6595440108677965e-05, + "loss": 0.0044, + "num_input_tokens_seen": 20401216, + "step": 96665 + }, + { + "epoch": 10.634763476347635, + "grad_norm": 0.08482455462217331, + "learning_rate": 2.659304492594938e-05, + "loss": 0.0421, + "num_input_tokens_seen": 20402304, + "step": 96670 + }, + { + "epoch": 10.635313531353136, + "grad_norm": 0.010000879876315594, + "learning_rate": 2.659064972853846e-05, + "loss": 0.0075, + "num_input_tokens_seen": 20403360, + "step": 96675 + }, + { + "epoch": 10.635863586358635, + "grad_norm": 0.399598628282547, + "learning_rate": 2.6588254516467282e-05, + "loss": 0.0342, + "num_input_tokens_seen": 20404384, + "step": 96680 + }, + { + "epoch": 10.636413641364136, + "grad_norm": 0.010864851996302605, + "learning_rate": 2.6585859289757927e-05, + "loss": 0.0069, + "num_input_tokens_seen": 20405440, + "step": 96685 + }, + { + "epoch": 10.636963696369637, + "grad_norm": 0.009452049620449543, + "learning_rate": 2.6583464048432472e-05, + "loss": 0.0296, + "num_input_tokens_seen": 20406496, + "step": 96690 + }, + { + "epoch": 10.637513751375138, + "grad_norm": 0.11104554682970047, + "learning_rate": 2.6581068792512985e-05, + "loss": 0.0334, + "num_input_tokens_seen": 20407520, + "step": 96695 + }, + { + "epoch": 10.638063806380638, + "grad_norm": 0.04693186655640602, + "learning_rate": 2.657867352202154e-05, + "loss": 0.0039, + "num_input_tokens_seen": 20408576, + "step": 96700 + }, + { + "epoch": 10.638613861386139, + "grad_norm": 1.3629059791564941, + "learning_rate": 2.657627823698023e-05, + "loss": 0.0693, + "num_input_tokens_seen": 20409632, + "step": 96705 + }, + { + "epoch": 10.63916391639164, + "grad_norm": 2.256355047225952, + "learning_rate": 2.657388293741111e-05, + "loss": 0.1643, + "num_input_tokens_seen": 20410688, + "step": 96710 + }, + { + "epoch": 10.63971397139714, + "grad_norm": 0.011655961163341999, + "learning_rate": 2.657148762333627e-05, + "loss": 0.0014, + "num_input_tokens_seen": 20411712, + "step": 96715 + }, + { + "epoch": 10.64026402640264, + "grad_norm": 1.8735839128494263, + "learning_rate": 2.6569092294777787e-05, + "loss": 0.1058, + "num_input_tokens_seen": 20412704, + "step": 96720 + }, + { + "epoch": 10.640814081408141, + "grad_norm": 0.08759909123182297, + "learning_rate": 2.656669695175773e-05, + "loss": 0.0092, + "num_input_tokens_seen": 20413760, + "step": 96725 + }, + { + "epoch": 10.64136413641364, + "grad_norm": 0.011549700982868671, + "learning_rate": 2.6564301594298175e-05, + "loss": 0.002, + "num_input_tokens_seen": 20414816, + "step": 96730 + }, + { + "epoch": 10.641914191419142, + "grad_norm": 0.01690676249563694, + "learning_rate": 2.6561906222421206e-05, + "loss": 0.0536, + "num_input_tokens_seen": 20415808, + "step": 96735 + }, + { + "epoch": 10.642464246424643, + "grad_norm": 0.03765746206045151, + "learning_rate": 2.65595108361489e-05, + "loss": 0.0983, + "num_input_tokens_seen": 20416832, + "step": 96740 + }, + { + "epoch": 10.643014301430142, + "grad_norm": 0.020820343866944313, + "learning_rate": 2.6557115435503326e-05, + "loss": 0.0024, + "num_input_tokens_seen": 20417888, + "step": 96745 + }, + { + "epoch": 10.643564356435643, + "grad_norm": 1.1750680208206177, + "learning_rate": 2.655472002050657e-05, + "loss": 0.0551, + "num_input_tokens_seen": 20418976, + "step": 96750 + }, + { + "epoch": 10.644114411441144, + "grad_norm": 0.013480202294886112, + "learning_rate": 2.6552324591180698e-05, + "loss": 0.013, + "num_input_tokens_seen": 20419968, + "step": 96755 + }, + { + "epoch": 10.644664466446645, + "grad_norm": 0.16450046002864838, + "learning_rate": 2.6549929147547797e-05, + "loss": 0.0626, + "num_input_tokens_seen": 20421024, + "step": 96760 + }, + { + "epoch": 10.645214521452145, + "grad_norm": 1.835857629776001, + "learning_rate": 2.6547533689629956e-05, + "loss": 0.0573, + "num_input_tokens_seen": 20422144, + "step": 96765 + }, + { + "epoch": 10.645764576457646, + "grad_norm": 0.06966818124055862, + "learning_rate": 2.654513821744923e-05, + "loss": 0.0173, + "num_input_tokens_seen": 20423264, + "step": 96770 + }, + { + "epoch": 10.646314631463147, + "grad_norm": 0.12858475744724274, + "learning_rate": 2.6542742731027703e-05, + "loss": 0.0234, + "num_input_tokens_seen": 20424288, + "step": 96775 + }, + { + "epoch": 10.646864686468646, + "grad_norm": 0.06818688660860062, + "learning_rate": 2.654034723038746e-05, + "loss": 0.0115, + "num_input_tokens_seen": 20425280, + "step": 96780 + }, + { + "epoch": 10.647414741474147, + "grad_norm": 0.10967305302619934, + "learning_rate": 2.6537951715550576e-05, + "loss": 0.0157, + "num_input_tokens_seen": 20426368, + "step": 96785 + }, + { + "epoch": 10.647964796479648, + "grad_norm": 0.23425735533237457, + "learning_rate": 2.653555618653913e-05, + "loss": 0.0253, + "num_input_tokens_seen": 20427360, + "step": 96790 + }, + { + "epoch": 10.648514851485148, + "grad_norm": 0.24888506531715393, + "learning_rate": 2.6533160643375197e-05, + "loss": 0.0386, + "num_input_tokens_seen": 20428416, + "step": 96795 + }, + { + "epoch": 10.649064906490649, + "grad_norm": 0.5604177117347717, + "learning_rate": 2.653076508608086e-05, + "loss": 0.0102, + "num_input_tokens_seen": 20429440, + "step": 96800 + }, + { + "epoch": 10.64961496149615, + "grad_norm": 2.2565078735351562, + "learning_rate": 2.652836951467819e-05, + "loss": 0.0833, + "num_input_tokens_seen": 20430496, + "step": 96805 + }, + { + "epoch": 10.65016501650165, + "grad_norm": 0.018618106842041016, + "learning_rate": 2.6525973929189284e-05, + "loss": 0.0117, + "num_input_tokens_seen": 20431552, + "step": 96810 + }, + { + "epoch": 10.65071507150715, + "grad_norm": 0.750449538230896, + "learning_rate": 2.6523578329636195e-05, + "loss": 0.0187, + "num_input_tokens_seen": 20432640, + "step": 96815 + }, + { + "epoch": 10.651265126512651, + "grad_norm": 0.01939321495592594, + "learning_rate": 2.6521182716041027e-05, + "loss": 0.0265, + "num_input_tokens_seen": 20433728, + "step": 96820 + }, + { + "epoch": 10.651815181518153, + "grad_norm": 0.07368889451026917, + "learning_rate": 2.651878708842584e-05, + "loss": 0.0332, + "num_input_tokens_seen": 20434784, + "step": 96825 + }, + { + "epoch": 10.652365236523652, + "grad_norm": 0.12535254657268524, + "learning_rate": 2.6516391446812727e-05, + "loss": 0.0052, + "num_input_tokens_seen": 20435776, + "step": 96830 + }, + { + "epoch": 10.652915291529153, + "grad_norm": 0.16018365323543549, + "learning_rate": 2.6513995791223756e-05, + "loss": 0.0034, + "num_input_tokens_seen": 20436800, + "step": 96835 + }, + { + "epoch": 10.653465346534654, + "grad_norm": 0.020517194643616676, + "learning_rate": 2.6511600121681018e-05, + "loss": 0.0304, + "num_input_tokens_seen": 20437952, + "step": 96840 + }, + { + "epoch": 10.654015401540153, + "grad_norm": 0.8993836641311646, + "learning_rate": 2.6509204438206574e-05, + "loss": 0.0331, + "num_input_tokens_seen": 20438976, + "step": 96845 + }, + { + "epoch": 10.654565456545654, + "grad_norm": 0.11024025082588196, + "learning_rate": 2.650680874082253e-05, + "loss": 0.0265, + "num_input_tokens_seen": 20440032, + "step": 96850 + }, + { + "epoch": 10.655115511551156, + "grad_norm": 0.049349695444107056, + "learning_rate": 2.6504413029550952e-05, + "loss": 0.0804, + "num_input_tokens_seen": 20441056, + "step": 96855 + }, + { + "epoch": 10.655665566556655, + "grad_norm": 0.1815689355134964, + "learning_rate": 2.650201730441392e-05, + "loss": 0.079, + "num_input_tokens_seen": 20442144, + "step": 96860 + }, + { + "epoch": 10.656215621562156, + "grad_norm": 0.3255634903907776, + "learning_rate": 2.6499621565433514e-05, + "loss": 0.0079, + "num_input_tokens_seen": 20443168, + "step": 96865 + }, + { + "epoch": 10.656765676567657, + "grad_norm": 0.026264145970344543, + "learning_rate": 2.6497225812631815e-05, + "loss": 0.1053, + "num_input_tokens_seen": 20444288, + "step": 96870 + }, + { + "epoch": 10.657315731573158, + "grad_norm": 0.03049355372786522, + "learning_rate": 2.64948300460309e-05, + "loss": 0.1484, + "num_input_tokens_seen": 20445344, + "step": 96875 + }, + { + "epoch": 10.657865786578657, + "grad_norm": 0.5609598755836487, + "learning_rate": 2.6492434265652867e-05, + "loss": 0.0636, + "num_input_tokens_seen": 20446464, + "step": 96880 + }, + { + "epoch": 10.658415841584159, + "grad_norm": 0.45132890343666077, + "learning_rate": 2.6490038471519775e-05, + "loss": 0.1203, + "num_input_tokens_seen": 20447488, + "step": 96885 + }, + { + "epoch": 10.65896589658966, + "grad_norm": 0.10101396590471268, + "learning_rate": 2.6487642663653704e-05, + "loss": 0.0225, + "num_input_tokens_seen": 20448544, + "step": 96890 + }, + { + "epoch": 10.659515951595159, + "grad_norm": 0.031169598922133446, + "learning_rate": 2.6485246842076755e-05, + "loss": 0.0386, + "num_input_tokens_seen": 20449568, + "step": 96895 + }, + { + "epoch": 10.66006600660066, + "grad_norm": 0.2607157230377197, + "learning_rate": 2.6482851006810998e-05, + "loss": 0.0573, + "num_input_tokens_seen": 20450688, + "step": 96900 + }, + { + "epoch": 10.660616061606161, + "grad_norm": 0.008538465946912766, + "learning_rate": 2.6480455157878514e-05, + "loss": 0.0061, + "num_input_tokens_seen": 20451776, + "step": 96905 + }, + { + "epoch": 10.66116611661166, + "grad_norm": 0.15659594535827637, + "learning_rate": 2.647805929530139e-05, + "loss": 0.0155, + "num_input_tokens_seen": 20452800, + "step": 96910 + }, + { + "epoch": 10.661716171617162, + "grad_norm": 0.08235037326812744, + "learning_rate": 2.6475663419101697e-05, + "loss": 0.0438, + "num_input_tokens_seen": 20453824, + "step": 96915 + }, + { + "epoch": 10.662266226622663, + "grad_norm": 0.036600932478904724, + "learning_rate": 2.6473267529301516e-05, + "loss": 0.0168, + "num_input_tokens_seen": 20454912, + "step": 96920 + }, + { + "epoch": 10.662816281628164, + "grad_norm": 0.036260977387428284, + "learning_rate": 2.6470871625922943e-05, + "loss": 0.0093, + "num_input_tokens_seen": 20455968, + "step": 96925 + }, + { + "epoch": 10.663366336633663, + "grad_norm": 0.018893899396061897, + "learning_rate": 2.6468475708988055e-05, + "loss": 0.0183, + "num_input_tokens_seen": 20456992, + "step": 96930 + }, + { + "epoch": 10.663916391639164, + "grad_norm": 0.4521074593067169, + "learning_rate": 2.6466079778518925e-05, + "loss": 0.0086, + "num_input_tokens_seen": 20458048, + "step": 96935 + }, + { + "epoch": 10.664466446644665, + "grad_norm": 0.0105300173163414, + "learning_rate": 2.6463683834537645e-05, + "loss": 0.0043, + "num_input_tokens_seen": 20459040, + "step": 96940 + }, + { + "epoch": 10.665016501650165, + "grad_norm": 0.04153478890657425, + "learning_rate": 2.6461287877066294e-05, + "loss": 0.066, + "num_input_tokens_seen": 20460160, + "step": 96945 + }, + { + "epoch": 10.665566556655666, + "grad_norm": 1.1353763341903687, + "learning_rate": 2.6458891906126947e-05, + "loss": 0.0795, + "num_input_tokens_seen": 20461184, + "step": 96950 + }, + { + "epoch": 10.666116611661167, + "grad_norm": 0.06946399062871933, + "learning_rate": 2.6456495921741703e-05, + "loss": 0.0082, + "num_input_tokens_seen": 20462272, + "step": 96955 + }, + { + "epoch": 10.666666666666666, + "grad_norm": 0.06791174411773682, + "learning_rate": 2.6454099923932628e-05, + "loss": 0.0049, + "num_input_tokens_seen": 20463328, + "step": 96960 + }, + { + "epoch": 10.667216721672167, + "grad_norm": 0.04829086735844612, + "learning_rate": 2.6451703912721814e-05, + "loss": 0.0498, + "num_input_tokens_seen": 20464352, + "step": 96965 + }, + { + "epoch": 10.667766776677668, + "grad_norm": 0.31251102685928345, + "learning_rate": 2.6449307888131342e-05, + "loss": 0.1136, + "num_input_tokens_seen": 20465440, + "step": 96970 + }, + { + "epoch": 10.668316831683168, + "grad_norm": 0.0525946281850338, + "learning_rate": 2.6446911850183292e-05, + "loss": 0.0061, + "num_input_tokens_seen": 20466432, + "step": 96975 + }, + { + "epoch": 10.668866886688669, + "grad_norm": 0.021843960508704185, + "learning_rate": 2.6444515798899756e-05, + "loss": 0.0784, + "num_input_tokens_seen": 20467520, + "step": 96980 + }, + { + "epoch": 10.66941694169417, + "grad_norm": 0.49809738993644714, + "learning_rate": 2.6442119734302807e-05, + "loss": 0.0276, + "num_input_tokens_seen": 20468608, + "step": 96985 + }, + { + "epoch": 10.66996699669967, + "grad_norm": 0.0864052101969719, + "learning_rate": 2.643972365641453e-05, + "loss": 0.0153, + "num_input_tokens_seen": 20469632, + "step": 96990 + }, + { + "epoch": 10.67051705170517, + "grad_norm": 0.014960482716560364, + "learning_rate": 2.6437327565257008e-05, + "loss": 0.0254, + "num_input_tokens_seen": 20470656, + "step": 96995 + }, + { + "epoch": 10.671067106710671, + "grad_norm": 0.018249673768877983, + "learning_rate": 2.6434931460852342e-05, + "loss": 0.0019, + "num_input_tokens_seen": 20471680, + "step": 97000 + }, + { + "epoch": 10.671617161716172, + "grad_norm": 0.015204839408397675, + "learning_rate": 2.6432535343222586e-05, + "loss": 0.0073, + "num_input_tokens_seen": 20472736, + "step": 97005 + }, + { + "epoch": 10.672167216721672, + "grad_norm": 0.13205517828464508, + "learning_rate": 2.643013921238985e-05, + "loss": 0.1032, + "num_input_tokens_seen": 20473824, + "step": 97010 + }, + { + "epoch": 10.672717271727173, + "grad_norm": 0.09667181223630905, + "learning_rate": 2.6427743068376198e-05, + "loss": 0.0043, + "num_input_tokens_seen": 20474880, + "step": 97015 + }, + { + "epoch": 10.673267326732674, + "grad_norm": 0.025622928515076637, + "learning_rate": 2.6425346911203725e-05, + "loss": 0.0396, + "num_input_tokens_seen": 20475936, + "step": 97020 + }, + { + "epoch": 10.673817381738173, + "grad_norm": 0.16797228157520294, + "learning_rate": 2.642295074089452e-05, + "loss": 0.0087, + "num_input_tokens_seen": 20476992, + "step": 97025 + }, + { + "epoch": 10.674367436743674, + "grad_norm": 0.019644206389784813, + "learning_rate": 2.642055455747066e-05, + "loss": 0.0043, + "num_input_tokens_seen": 20478048, + "step": 97030 + }, + { + "epoch": 10.674917491749175, + "grad_norm": 0.39510640501976013, + "learning_rate": 2.641815836095422e-05, + "loss": 0.0426, + "num_input_tokens_seen": 20479104, + "step": 97035 + }, + { + "epoch": 10.675467546754675, + "grad_norm": 0.08891035616397858, + "learning_rate": 2.6415762151367307e-05, + "loss": 0.0095, + "num_input_tokens_seen": 20480192, + "step": 97040 + }, + { + "epoch": 10.676017601760176, + "grad_norm": 0.010212937369942665, + "learning_rate": 2.6413365928731987e-05, + "loss": 0.0048, + "num_input_tokens_seen": 20481280, + "step": 97045 + }, + { + "epoch": 10.676567656765677, + "grad_norm": 0.06026912480592728, + "learning_rate": 2.6410969693070354e-05, + "loss": 0.0044, + "num_input_tokens_seen": 20482336, + "step": 97050 + }, + { + "epoch": 10.677117711771178, + "grad_norm": 0.017841938883066177, + "learning_rate": 2.640857344440449e-05, + "loss": 0.0015, + "num_input_tokens_seen": 20483424, + "step": 97055 + }, + { + "epoch": 10.677667766776677, + "grad_norm": 1.6251674890518188, + "learning_rate": 2.640617718275648e-05, + "loss": 0.1502, + "num_input_tokens_seen": 20484480, + "step": 97060 + }, + { + "epoch": 10.678217821782178, + "grad_norm": 0.11823073774576187, + "learning_rate": 2.6403780908148405e-05, + "loss": 0.031, + "num_input_tokens_seen": 20485536, + "step": 97065 + }, + { + "epoch": 10.67876787678768, + "grad_norm": 0.07398498803377151, + "learning_rate": 2.6401384620602366e-05, + "loss": 0.0028, + "num_input_tokens_seen": 20486624, + "step": 97070 + }, + { + "epoch": 10.679317931793179, + "grad_norm": 0.00890378188341856, + "learning_rate": 2.6398988320140432e-05, + "loss": 0.0111, + "num_input_tokens_seen": 20487680, + "step": 97075 + }, + { + "epoch": 10.67986798679868, + "grad_norm": 0.02921922318637371, + "learning_rate": 2.639659200678469e-05, + "loss": 0.0043, + "num_input_tokens_seen": 20488704, + "step": 97080 + }, + { + "epoch": 10.680418041804181, + "grad_norm": 0.30101484060287476, + "learning_rate": 2.6394195680557233e-05, + "loss": 0.1152, + "num_input_tokens_seen": 20489792, + "step": 97085 + }, + { + "epoch": 10.68096809680968, + "grad_norm": 0.2699750065803528, + "learning_rate": 2.6391799341480144e-05, + "loss": 0.0163, + "num_input_tokens_seen": 20490848, + "step": 97090 + }, + { + "epoch": 10.681518151815181, + "grad_norm": 0.008960740640759468, + "learning_rate": 2.6389402989575508e-05, + "loss": 0.0137, + "num_input_tokens_seen": 20491936, + "step": 97095 + }, + { + "epoch": 10.682068206820682, + "grad_norm": 0.014718297868967056, + "learning_rate": 2.6387006624865407e-05, + "loss": 0.0016, + "num_input_tokens_seen": 20493056, + "step": 97100 + }, + { + "epoch": 10.682618261826182, + "grad_norm": 0.12468933314085007, + "learning_rate": 2.638461024737194e-05, + "loss": 0.0217, + "num_input_tokens_seen": 20494112, + "step": 97105 + }, + { + "epoch": 10.683168316831683, + "grad_norm": 0.036385711282491684, + "learning_rate": 2.638221385711717e-05, + "loss": 0.0683, + "num_input_tokens_seen": 20495136, + "step": 97110 + }, + { + "epoch": 10.683718371837184, + "grad_norm": 0.2468547821044922, + "learning_rate": 2.6379817454123214e-05, + "loss": 0.0066, + "num_input_tokens_seen": 20496224, + "step": 97115 + }, + { + "epoch": 10.684268426842685, + "grad_norm": 4.404792308807373, + "learning_rate": 2.6377421038412138e-05, + "loss": 0.0117, + "num_input_tokens_seen": 20497376, + "step": 97120 + }, + { + "epoch": 10.684818481848184, + "grad_norm": 0.01154788862913847, + "learning_rate": 2.6375024610006033e-05, + "loss": 0.0026, + "num_input_tokens_seen": 20498400, + "step": 97125 + }, + { + "epoch": 10.685368536853685, + "grad_norm": 0.08054407685995102, + "learning_rate": 2.637262816892698e-05, + "loss": 0.0036, + "num_input_tokens_seen": 20499488, + "step": 97130 + }, + { + "epoch": 10.685918591859187, + "grad_norm": 2.966122627258301, + "learning_rate": 2.637023171519708e-05, + "loss": 0.0577, + "num_input_tokens_seen": 20500608, + "step": 97135 + }, + { + "epoch": 10.686468646864686, + "grad_norm": 0.011333858594298363, + "learning_rate": 2.6367835248838403e-05, + "loss": 0.034, + "num_input_tokens_seen": 20501600, + "step": 97140 + }, + { + "epoch": 10.687018701870187, + "grad_norm": 0.010441792197525501, + "learning_rate": 2.6365438769873058e-05, + "loss": 0.0006, + "num_input_tokens_seen": 20502656, + "step": 97145 + }, + { + "epoch": 10.687568756875688, + "grad_norm": 0.01379869319498539, + "learning_rate": 2.6363042278323107e-05, + "loss": 0.0319, + "num_input_tokens_seen": 20503680, + "step": 97150 + }, + { + "epoch": 10.688118811881187, + "grad_norm": 0.09698297083377838, + "learning_rate": 2.636064577421065e-05, + "loss": 0.0058, + "num_input_tokens_seen": 20504768, + "step": 97155 + }, + { + "epoch": 10.688668866886688, + "grad_norm": 0.023339591920375824, + "learning_rate": 2.635824925755778e-05, + "loss": 0.0116, + "num_input_tokens_seen": 20505824, + "step": 97160 + }, + { + "epoch": 10.68921892189219, + "grad_norm": 0.022818146273493767, + "learning_rate": 2.6355852728386577e-05, + "loss": 0.0675, + "num_input_tokens_seen": 20506880, + "step": 97165 + }, + { + "epoch": 10.689768976897689, + "grad_norm": 0.6885613799095154, + "learning_rate": 2.6353456186719127e-05, + "loss": 0.0213, + "num_input_tokens_seen": 20508000, + "step": 97170 + }, + { + "epoch": 10.69031903190319, + "grad_norm": 1.0750659704208374, + "learning_rate": 2.635105963257753e-05, + "loss": 0.0416, + "num_input_tokens_seen": 20508992, + "step": 97175 + }, + { + "epoch": 10.690869086908691, + "grad_norm": 0.0050660064443945885, + "learning_rate": 2.6348663065983846e-05, + "loss": 0.0603, + "num_input_tokens_seen": 20510144, + "step": 97180 + }, + { + "epoch": 10.691419141914192, + "grad_norm": 0.033056844025850296, + "learning_rate": 2.6346266486960196e-05, + "loss": 0.0414, + "num_input_tokens_seen": 20511168, + "step": 97185 + }, + { + "epoch": 10.691969196919691, + "grad_norm": 0.562874436378479, + "learning_rate": 2.6343869895528655e-05, + "loss": 0.0059, + "num_input_tokens_seen": 20512192, + "step": 97190 + }, + { + "epoch": 10.692519251925193, + "grad_norm": 0.006658255588263273, + "learning_rate": 2.6341473291711304e-05, + "loss": 0.0097, + "num_input_tokens_seen": 20513184, + "step": 97195 + }, + { + "epoch": 10.693069306930694, + "grad_norm": 0.40969035029411316, + "learning_rate": 2.633907667553024e-05, + "loss": 0.0061, + "num_input_tokens_seen": 20514240, + "step": 97200 + }, + { + "epoch": 10.693619361936193, + "grad_norm": 0.028655190020799637, + "learning_rate": 2.633668004700755e-05, + "loss": 0.104, + "num_input_tokens_seen": 20515360, + "step": 97205 + }, + { + "epoch": 10.694169416941694, + "grad_norm": 0.009146546013653278, + "learning_rate": 2.6334283406165312e-05, + "loss": 0.003, + "num_input_tokens_seen": 20516512, + "step": 97210 + }, + { + "epoch": 10.694719471947195, + "grad_norm": 0.686696469783783, + "learning_rate": 2.6331886753025635e-05, + "loss": 0.1126, + "num_input_tokens_seen": 20517568, + "step": 97215 + }, + { + "epoch": 10.695269526952695, + "grad_norm": 0.016875913366675377, + "learning_rate": 2.63294900876106e-05, + "loss": 0.0294, + "num_input_tokens_seen": 20518560, + "step": 97220 + }, + { + "epoch": 10.695819581958196, + "grad_norm": 0.203578382730484, + "learning_rate": 2.6327093409942278e-05, + "loss": 0.0626, + "num_input_tokens_seen": 20519616, + "step": 97225 + }, + { + "epoch": 10.696369636963697, + "grad_norm": 0.030606040731072426, + "learning_rate": 2.6324696720042784e-05, + "loss": 0.1427, + "num_input_tokens_seen": 20520640, + "step": 97230 + }, + { + "epoch": 10.696919691969196, + "grad_norm": 0.021899832412600517, + "learning_rate": 2.6322300017934192e-05, + "loss": 0.013, + "num_input_tokens_seen": 20521696, + "step": 97235 + }, + { + "epoch": 10.697469746974697, + "grad_norm": 0.01732317917048931, + "learning_rate": 2.6319903303638598e-05, + "loss": 0.0853, + "num_input_tokens_seen": 20522688, + "step": 97240 + }, + { + "epoch": 10.698019801980198, + "grad_norm": 0.020440561696887016, + "learning_rate": 2.6317506577178086e-05, + "loss": 0.043, + "num_input_tokens_seen": 20523744, + "step": 97245 + }, + { + "epoch": 10.6985698569857, + "grad_norm": 0.29862186312675476, + "learning_rate": 2.6315109838574754e-05, + "loss": 0.0139, + "num_input_tokens_seen": 20524832, + "step": 97250 + }, + { + "epoch": 10.699119911991199, + "grad_norm": 0.010045159608125687, + "learning_rate": 2.631271308785067e-05, + "loss": 0.0114, + "num_input_tokens_seen": 20525824, + "step": 97255 + }, + { + "epoch": 10.6996699669967, + "grad_norm": 0.24295860528945923, + "learning_rate": 2.6310316325027956e-05, + "loss": 0.005, + "num_input_tokens_seen": 20526912, + "step": 97260 + }, + { + "epoch": 10.7002200220022, + "grad_norm": 2.2084426879882812, + "learning_rate": 2.630791955012868e-05, + "loss": 0.0563, + "num_input_tokens_seen": 20528000, + "step": 97265 + }, + { + "epoch": 10.7007700770077, + "grad_norm": 0.0056627667509019375, + "learning_rate": 2.6305522763174935e-05, + "loss": 0.0067, + "num_input_tokens_seen": 20529088, + "step": 97270 + }, + { + "epoch": 10.701320132013201, + "grad_norm": 0.0385672003030777, + "learning_rate": 2.6303125964188812e-05, + "loss": 0.0041, + "num_input_tokens_seen": 20530144, + "step": 97275 + }, + { + "epoch": 10.701870187018702, + "grad_norm": 0.42099812626838684, + "learning_rate": 2.630072915319241e-05, + "loss": 0.0407, + "num_input_tokens_seen": 20531104, + "step": 97280 + }, + { + "epoch": 10.702420242024202, + "grad_norm": 0.9309707880020142, + "learning_rate": 2.6298332330207805e-05, + "loss": 0.067, + "num_input_tokens_seen": 20532256, + "step": 97285 + }, + { + "epoch": 10.702970297029703, + "grad_norm": 0.08335351943969727, + "learning_rate": 2.6295935495257097e-05, + "loss": 0.0279, + "num_input_tokens_seen": 20533280, + "step": 97290 + }, + { + "epoch": 10.703520352035204, + "grad_norm": 0.002615799428895116, + "learning_rate": 2.6293538648362364e-05, + "loss": 0.0047, + "num_input_tokens_seen": 20534240, + "step": 97295 + }, + { + "epoch": 10.704070407040705, + "grad_norm": 0.09149880707263947, + "learning_rate": 2.6291141789545714e-05, + "loss": 0.0721, + "num_input_tokens_seen": 20535328, + "step": 97300 + }, + { + "epoch": 10.704620462046204, + "grad_norm": 1.7519328594207764, + "learning_rate": 2.6288744918829233e-05, + "loss": 0.0689, + "num_input_tokens_seen": 20536416, + "step": 97305 + }, + { + "epoch": 10.705170517051705, + "grad_norm": 0.02191893942654133, + "learning_rate": 2.6286348036235003e-05, + "loss": 0.0201, + "num_input_tokens_seen": 20537408, + "step": 97310 + }, + { + "epoch": 10.705720572057206, + "grad_norm": 1.1372519731521606, + "learning_rate": 2.6283951141785123e-05, + "loss": 0.0152, + "num_input_tokens_seen": 20538432, + "step": 97315 + }, + { + "epoch": 10.706270627062706, + "grad_norm": 0.06584387272596359, + "learning_rate": 2.6281554235501682e-05, + "loss": 0.0026, + "num_input_tokens_seen": 20539456, + "step": 97320 + }, + { + "epoch": 10.706820682068207, + "grad_norm": 0.0073882052674889565, + "learning_rate": 2.6279157317406766e-05, + "loss": 0.0329, + "num_input_tokens_seen": 20540480, + "step": 97325 + }, + { + "epoch": 10.707370737073708, + "grad_norm": 0.016140788793563843, + "learning_rate": 2.6276760387522475e-05, + "loss": 0.1091, + "num_input_tokens_seen": 20541600, + "step": 97330 + }, + { + "epoch": 10.707920792079207, + "grad_norm": 0.040852852165699005, + "learning_rate": 2.62743634458709e-05, + "loss": 0.0165, + "num_input_tokens_seen": 20542656, + "step": 97335 + }, + { + "epoch": 10.708470847084708, + "grad_norm": 0.1597818285226822, + "learning_rate": 2.627196649247412e-05, + "loss": 0.0543, + "num_input_tokens_seen": 20543680, + "step": 97340 + }, + { + "epoch": 10.70902090209021, + "grad_norm": 2.1431121826171875, + "learning_rate": 2.6269569527354238e-05, + "loss": 0.143, + "num_input_tokens_seen": 20544800, + "step": 97345 + }, + { + "epoch": 10.70957095709571, + "grad_norm": 0.05957894027233124, + "learning_rate": 2.626717255053334e-05, + "loss": 0.0035, + "num_input_tokens_seen": 20545792, + "step": 97350 + }, + { + "epoch": 10.71012101210121, + "grad_norm": 0.027055993676185608, + "learning_rate": 2.626477556203352e-05, + "loss": 0.0062, + "num_input_tokens_seen": 20546848, + "step": 97355 + }, + { + "epoch": 10.710671067106711, + "grad_norm": 0.00846403744071722, + "learning_rate": 2.6262378561876887e-05, + "loss": 0.0022, + "num_input_tokens_seen": 20548000, + "step": 97360 + }, + { + "epoch": 10.711221122112212, + "grad_norm": 0.05210829898715019, + "learning_rate": 2.6259981550085504e-05, + "loss": 0.0015, + "num_input_tokens_seen": 20549088, + "step": 97365 + }, + { + "epoch": 10.711771177117711, + "grad_norm": 0.8220826983451843, + "learning_rate": 2.6257584526681472e-05, + "loss": 0.1067, + "num_input_tokens_seen": 20550112, + "step": 97370 + }, + { + "epoch": 10.712321232123212, + "grad_norm": 0.05486242473125458, + "learning_rate": 2.6255187491686888e-05, + "loss": 0.0023, + "num_input_tokens_seen": 20551232, + "step": 97375 + }, + { + "epoch": 10.712871287128714, + "grad_norm": 0.044740621000528336, + "learning_rate": 2.6252790445123854e-05, + "loss": 0.0066, + "num_input_tokens_seen": 20552288, + "step": 97380 + }, + { + "epoch": 10.713421342134213, + "grad_norm": 0.09260555356740952, + "learning_rate": 2.625039338701444e-05, + "loss": 0.0911, + "num_input_tokens_seen": 20553376, + "step": 97385 + }, + { + "epoch": 10.713971397139714, + "grad_norm": 0.0044217766262590885, + "learning_rate": 2.624799631738075e-05, + "loss": 0.0633, + "num_input_tokens_seen": 20554464, + "step": 97390 + }, + { + "epoch": 10.714521452145215, + "grad_norm": 0.05511474609375, + "learning_rate": 2.6245599236244884e-05, + "loss": 0.0155, + "num_input_tokens_seen": 20555456, + "step": 97395 + }, + { + "epoch": 10.715071507150714, + "grad_norm": 0.012758439406752586, + "learning_rate": 2.624320214362892e-05, + "loss": 0.0634, + "num_input_tokens_seen": 20556448, + "step": 97400 + }, + { + "epoch": 10.715621562156215, + "grad_norm": 0.013566426932811737, + "learning_rate": 2.6240805039554973e-05, + "loss": 0.0022, + "num_input_tokens_seen": 20557536, + "step": 97405 + }, + { + "epoch": 10.716171617161717, + "grad_norm": 0.009693910367786884, + "learning_rate": 2.623840792404511e-05, + "loss": 0.0305, + "num_input_tokens_seen": 20558528, + "step": 97410 + }, + { + "epoch": 10.716721672167218, + "grad_norm": 0.04278216138482094, + "learning_rate": 2.623601079712143e-05, + "loss": 0.0862, + "num_input_tokens_seen": 20559520, + "step": 97415 + }, + { + "epoch": 10.717271727172717, + "grad_norm": 2.0273935794830322, + "learning_rate": 2.623361365880604e-05, + "loss": 0.1358, + "num_input_tokens_seen": 20560576, + "step": 97420 + }, + { + "epoch": 10.717821782178218, + "grad_norm": 0.015541721135377884, + "learning_rate": 2.623121650912102e-05, + "loss": 0.0394, + "num_input_tokens_seen": 20561664, + "step": 97425 + }, + { + "epoch": 10.718371837183719, + "grad_norm": 0.658406674861908, + "learning_rate": 2.6228819348088475e-05, + "loss": 0.0591, + "num_input_tokens_seen": 20562720, + "step": 97430 + }, + { + "epoch": 10.718921892189218, + "grad_norm": 0.09367735683917999, + "learning_rate": 2.6226422175730487e-05, + "loss": 0.0322, + "num_input_tokens_seen": 20563776, + "step": 97435 + }, + { + "epoch": 10.71947194719472, + "grad_norm": 0.007767377886921167, + "learning_rate": 2.6224024992069156e-05, + "loss": 0.0041, + "num_input_tokens_seen": 20564832, + "step": 97440 + }, + { + "epoch": 10.72002200220022, + "grad_norm": 1.115161657333374, + "learning_rate": 2.622162779712657e-05, + "loss": 0.027, + "num_input_tokens_seen": 20565888, + "step": 97445 + }, + { + "epoch": 10.72057205720572, + "grad_norm": 0.028623387217521667, + "learning_rate": 2.621923059092483e-05, + "loss": 0.0659, + "num_input_tokens_seen": 20567008, + "step": 97450 + }, + { + "epoch": 10.721122112211221, + "grad_norm": 0.6197002530097961, + "learning_rate": 2.621683337348603e-05, + "loss": 0.0198, + "num_input_tokens_seen": 20568096, + "step": 97455 + }, + { + "epoch": 10.721672167216722, + "grad_norm": 1.0763931274414062, + "learning_rate": 2.6214436144832256e-05, + "loss": 0.053, + "num_input_tokens_seen": 20569120, + "step": 97460 + }, + { + "epoch": 10.722222222222221, + "grad_norm": 0.41918739676475525, + "learning_rate": 2.621203890498561e-05, + "loss": 0.0054, + "num_input_tokens_seen": 20570112, + "step": 97465 + }, + { + "epoch": 10.722772277227723, + "grad_norm": 0.08976416289806366, + "learning_rate": 2.620964165396818e-05, + "loss": 0.008, + "num_input_tokens_seen": 20571200, + "step": 97470 + }, + { + "epoch": 10.723322332233224, + "grad_norm": 0.1319599449634552, + "learning_rate": 2.6207244391802065e-05, + "loss": 0.1194, + "num_input_tokens_seen": 20572288, + "step": 97475 + }, + { + "epoch": 10.723872387238725, + "grad_norm": 0.07034587115049362, + "learning_rate": 2.6204847118509358e-05, + "loss": 0.0056, + "num_input_tokens_seen": 20573248, + "step": 97480 + }, + { + "epoch": 10.724422442244224, + "grad_norm": 2.34700870513916, + "learning_rate": 2.6202449834112147e-05, + "loss": 0.2115, + "num_input_tokens_seen": 20574368, + "step": 97485 + }, + { + "epoch": 10.724972497249725, + "grad_norm": 0.9438545107841492, + "learning_rate": 2.6200052538632543e-05, + "loss": 0.0586, + "num_input_tokens_seen": 20575392, + "step": 97490 + }, + { + "epoch": 10.725522552255226, + "grad_norm": 0.03671731799840927, + "learning_rate": 2.6197655232092623e-05, + "loss": 0.0118, + "num_input_tokens_seen": 20576384, + "step": 97495 + }, + { + "epoch": 10.726072607260726, + "grad_norm": 0.0934702679514885, + "learning_rate": 2.6195257914514492e-05, + "loss": 0.0091, + "num_input_tokens_seen": 20577440, + "step": 97500 + }, + { + "epoch": 10.726622662266227, + "grad_norm": 0.10765256732702255, + "learning_rate": 2.6192860585920244e-05, + "loss": 0.0112, + "num_input_tokens_seen": 20578496, + "step": 97505 + }, + { + "epoch": 10.727172717271728, + "grad_norm": 0.07436259090900421, + "learning_rate": 2.619046324633197e-05, + "loss": 0.0058, + "num_input_tokens_seen": 20579584, + "step": 97510 + }, + { + "epoch": 10.727722772277227, + "grad_norm": 0.6058571338653564, + "learning_rate": 2.6188065895771764e-05, + "loss": 0.0773, + "num_input_tokens_seen": 20580672, + "step": 97515 + }, + { + "epoch": 10.728272827282728, + "grad_norm": 0.008448238484561443, + "learning_rate": 2.618566853426173e-05, + "loss": 0.0065, + "num_input_tokens_seen": 20581728, + "step": 97520 + }, + { + "epoch": 10.72882288228823, + "grad_norm": 0.16238254308700562, + "learning_rate": 2.6183271161823962e-05, + "loss": 0.0105, + "num_input_tokens_seen": 20582784, + "step": 97525 + }, + { + "epoch": 10.729372937293729, + "grad_norm": 1.2453161478042603, + "learning_rate": 2.6180873778480543e-05, + "loss": 0.0341, + "num_input_tokens_seen": 20583872, + "step": 97530 + }, + { + "epoch": 10.72992299229923, + "grad_norm": 0.6002253890037537, + "learning_rate": 2.617847638425358e-05, + "loss": 0.0368, + "num_input_tokens_seen": 20584992, + "step": 97535 + }, + { + "epoch": 10.73047304730473, + "grad_norm": 0.007185432128608227, + "learning_rate": 2.617607897916517e-05, + "loss": 0.0058, + "num_input_tokens_seen": 20586048, + "step": 97540 + }, + { + "epoch": 10.731023102310232, + "grad_norm": 0.0608820840716362, + "learning_rate": 2.6173681563237395e-05, + "loss": 0.0134, + "num_input_tokens_seen": 20587072, + "step": 97545 + }, + { + "epoch": 10.731573157315731, + "grad_norm": 0.09777811914682388, + "learning_rate": 2.6171284136492374e-05, + "loss": 0.0028, + "num_input_tokens_seen": 20588192, + "step": 97550 + }, + { + "epoch": 10.732123212321232, + "grad_norm": 0.04208431392908096, + "learning_rate": 2.6168886698952182e-05, + "loss": 0.0162, + "num_input_tokens_seen": 20589216, + "step": 97555 + }, + { + "epoch": 10.732673267326733, + "grad_norm": 0.020480142906308174, + "learning_rate": 2.6166489250638913e-05, + "loss": 0.0019, + "num_input_tokens_seen": 20590336, + "step": 97560 + }, + { + "epoch": 10.733223322332233, + "grad_norm": 0.2825081944465637, + "learning_rate": 2.6164091791574686e-05, + "loss": 0.1016, + "num_input_tokens_seen": 20591360, + "step": 97565 + }, + { + "epoch": 10.733773377337734, + "grad_norm": 0.03310520574450493, + "learning_rate": 2.6161694321781577e-05, + "loss": 0.0025, + "num_input_tokens_seen": 20592416, + "step": 97570 + }, + { + "epoch": 10.734323432343235, + "grad_norm": 0.21260935068130493, + "learning_rate": 2.615929684128169e-05, + "loss": 0.0228, + "num_input_tokens_seen": 20593408, + "step": 97575 + }, + { + "epoch": 10.734873487348734, + "grad_norm": 0.016953425481915474, + "learning_rate": 2.615689935009712e-05, + "loss": 0.0106, + "num_input_tokens_seen": 20594464, + "step": 97580 + }, + { + "epoch": 10.735423542354235, + "grad_norm": 0.009721118956804276, + "learning_rate": 2.615450184824997e-05, + "loss": 0.0526, + "num_input_tokens_seen": 20595488, + "step": 97585 + }, + { + "epoch": 10.735973597359736, + "grad_norm": 0.09097536653280258, + "learning_rate": 2.6152104335762316e-05, + "loss": 0.0051, + "num_input_tokens_seen": 20596576, + "step": 97590 + }, + { + "epoch": 10.736523652365236, + "grad_norm": 0.6077711582183838, + "learning_rate": 2.6149706812656282e-05, + "loss": 0.0269, + "num_input_tokens_seen": 20597664, + "step": 97595 + }, + { + "epoch": 10.737073707370737, + "grad_norm": 1.0678315162658691, + "learning_rate": 2.6147309278953947e-05, + "loss": 0.149, + "num_input_tokens_seen": 20598720, + "step": 97600 + }, + { + "epoch": 10.737623762376238, + "grad_norm": 0.035795047879219055, + "learning_rate": 2.6144911734677413e-05, + "loss": 0.0043, + "num_input_tokens_seen": 20599744, + "step": 97605 + }, + { + "epoch": 10.738173817381739, + "grad_norm": 1.6926863193511963, + "learning_rate": 2.6142514179848775e-05, + "loss": 0.0662, + "num_input_tokens_seen": 20600768, + "step": 97610 + }, + { + "epoch": 10.738723872387238, + "grad_norm": 0.047191817313432693, + "learning_rate": 2.614011661449014e-05, + "loss": 0.0159, + "num_input_tokens_seen": 20601856, + "step": 97615 + }, + { + "epoch": 10.73927392739274, + "grad_norm": 0.012504637241363525, + "learning_rate": 2.6137719038623592e-05, + "loss": 0.0016, + "num_input_tokens_seen": 20602912, + "step": 97620 + }, + { + "epoch": 10.73982398239824, + "grad_norm": 0.034973952919244766, + "learning_rate": 2.6135321452271234e-05, + "loss": 0.0309, + "num_input_tokens_seen": 20603904, + "step": 97625 + }, + { + "epoch": 10.74037403740374, + "grad_norm": 1.4671367406845093, + "learning_rate": 2.6132923855455153e-05, + "loss": 0.101, + "num_input_tokens_seen": 20604928, + "step": 97630 + }, + { + "epoch": 10.74092409240924, + "grad_norm": 0.021811923012137413, + "learning_rate": 2.6130526248197467e-05, + "loss": 0.0024, + "num_input_tokens_seen": 20605984, + "step": 97635 + }, + { + "epoch": 10.741474147414742, + "grad_norm": 0.011637702584266663, + "learning_rate": 2.6128128630520265e-05, + "loss": 0.0628, + "num_input_tokens_seen": 20607040, + "step": 97640 + }, + { + "epoch": 10.742024202420241, + "grad_norm": 0.020166607573628426, + "learning_rate": 2.6125731002445643e-05, + "loss": 0.0569, + "num_input_tokens_seen": 20608064, + "step": 97645 + }, + { + "epoch": 10.742574257425742, + "grad_norm": 0.2180938571691513, + "learning_rate": 2.6123333363995693e-05, + "loss": 0.1105, + "num_input_tokens_seen": 20609088, + "step": 97650 + }, + { + "epoch": 10.743124312431243, + "grad_norm": 0.019935887306928635, + "learning_rate": 2.6120935715192525e-05, + "loss": 0.0039, + "num_input_tokens_seen": 20610144, + "step": 97655 + }, + { + "epoch": 10.743674367436743, + "grad_norm": 0.12032272666692734, + "learning_rate": 2.6118538056058217e-05, + "loss": 0.0045, + "num_input_tokens_seen": 20611168, + "step": 97660 + }, + { + "epoch": 10.744224422442244, + "grad_norm": 0.16095398366451263, + "learning_rate": 2.6116140386614895e-05, + "loss": 0.0042, + "num_input_tokens_seen": 20612288, + "step": 97665 + }, + { + "epoch": 10.744774477447745, + "grad_norm": 0.2568682134151459, + "learning_rate": 2.6113742706884643e-05, + "loss": 0.0152, + "num_input_tokens_seen": 20613312, + "step": 97670 + }, + { + "epoch": 10.745324532453246, + "grad_norm": 0.5470669269561768, + "learning_rate": 2.6111345016889544e-05, + "loss": 0.0105, + "num_input_tokens_seen": 20614368, + "step": 97675 + }, + { + "epoch": 10.745874587458745, + "grad_norm": 0.22061488032341003, + "learning_rate": 2.6108947316651723e-05, + "loss": 0.0046, + "num_input_tokens_seen": 20615424, + "step": 97680 + }, + { + "epoch": 10.746424642464246, + "grad_norm": 0.08699267357587814, + "learning_rate": 2.6106549606193266e-05, + "loss": 0.0428, + "num_input_tokens_seen": 20616480, + "step": 97685 + }, + { + "epoch": 10.746974697469748, + "grad_norm": 0.008619468659162521, + "learning_rate": 2.6104151885536266e-05, + "loss": 0.0319, + "num_input_tokens_seen": 20617568, + "step": 97690 + }, + { + "epoch": 10.747524752475247, + "grad_norm": 0.21320149302482605, + "learning_rate": 2.6101754154702835e-05, + "loss": 0.0246, + "num_input_tokens_seen": 20618656, + "step": 97695 + }, + { + "epoch": 10.748074807480748, + "grad_norm": 0.011295215226709843, + "learning_rate": 2.6099356413715065e-05, + "loss": 0.02, + "num_input_tokens_seen": 20619648, + "step": 97700 + }, + { + "epoch": 10.748624862486249, + "grad_norm": 0.017679523676633835, + "learning_rate": 2.6096958662595046e-05, + "loss": 0.117, + "num_input_tokens_seen": 20620704, + "step": 97705 + }, + { + "epoch": 10.749174917491748, + "grad_norm": 0.030223222449421883, + "learning_rate": 2.609456090136489e-05, + "loss": 0.0456, + "num_input_tokens_seen": 20621792, + "step": 97710 + }, + { + "epoch": 10.74972497249725, + "grad_norm": 0.9244951009750366, + "learning_rate": 2.6092163130046692e-05, + "loss": 0.0325, + "num_input_tokens_seen": 20622816, + "step": 97715 + }, + { + "epoch": 10.75027502750275, + "grad_norm": 1.2920031547546387, + "learning_rate": 2.608976534866255e-05, + "loss": 0.0715, + "num_input_tokens_seen": 20623936, + "step": 97720 + }, + { + "epoch": 10.750825082508252, + "grad_norm": 0.050066374242305756, + "learning_rate": 2.6087367557234567e-05, + "loss": 0.0556, + "num_input_tokens_seen": 20624896, + "step": 97725 + }, + { + "epoch": 10.751375137513751, + "grad_norm": 0.6481472253799438, + "learning_rate": 2.6084969755784833e-05, + "loss": 0.0367, + "num_input_tokens_seen": 20625888, + "step": 97730 + }, + { + "epoch": 10.751925192519252, + "grad_norm": 0.8017972707748413, + "learning_rate": 2.6082571944335455e-05, + "loss": 0.0376, + "num_input_tokens_seen": 20626880, + "step": 97735 + }, + { + "epoch": 10.752475247524753, + "grad_norm": 0.17692185938358307, + "learning_rate": 2.6080174122908537e-05, + "loss": 0.0088, + "num_input_tokens_seen": 20628000, + "step": 97740 + }, + { + "epoch": 10.753025302530252, + "grad_norm": 0.09214196354150772, + "learning_rate": 2.6077776291526163e-05, + "loss": 0.0711, + "num_input_tokens_seen": 20629056, + "step": 97745 + }, + { + "epoch": 10.753575357535754, + "grad_norm": 0.23294946551322937, + "learning_rate": 2.6075378450210447e-05, + "loss": 0.0188, + "num_input_tokens_seen": 20630048, + "step": 97750 + }, + { + "epoch": 10.754125412541255, + "grad_norm": 0.4707620143890381, + "learning_rate": 2.6072980598983487e-05, + "loss": 0.0373, + "num_input_tokens_seen": 20631168, + "step": 97755 + }, + { + "epoch": 10.754675467546754, + "grad_norm": 0.19842852652072906, + "learning_rate": 2.6070582737867376e-05, + "loss": 0.01, + "num_input_tokens_seen": 20632224, + "step": 97760 + }, + { + "epoch": 10.755225522552255, + "grad_norm": 0.009374580346047878, + "learning_rate": 2.606818486688422e-05, + "loss": 0.0039, + "num_input_tokens_seen": 20633280, + "step": 97765 + }, + { + "epoch": 10.755775577557756, + "grad_norm": 0.018599558621644974, + "learning_rate": 2.606578698605611e-05, + "loss": 0.0047, + "num_input_tokens_seen": 20634368, + "step": 97770 + }, + { + "epoch": 10.756325632563257, + "grad_norm": 0.011374604888260365, + "learning_rate": 2.6063389095405155e-05, + "loss": 0.0069, + "num_input_tokens_seen": 20635488, + "step": 97775 + }, + { + "epoch": 10.756875687568757, + "grad_norm": 0.06941384077072144, + "learning_rate": 2.606099119495346e-05, + "loss": 0.0145, + "num_input_tokens_seen": 20636608, + "step": 97780 + }, + { + "epoch": 10.757425742574258, + "grad_norm": 2.9613683223724365, + "learning_rate": 2.6058593284723114e-05, + "loss": 0.0406, + "num_input_tokens_seen": 20637632, + "step": 97785 + }, + { + "epoch": 10.757975797579759, + "grad_norm": 0.6871759295463562, + "learning_rate": 2.605619536473622e-05, + "loss": 0.0949, + "num_input_tokens_seen": 20638656, + "step": 97790 + }, + { + "epoch": 10.758525852585258, + "grad_norm": 0.19764834642410278, + "learning_rate": 2.605379743501488e-05, + "loss": 0.0185, + "num_input_tokens_seen": 20639680, + "step": 97795 + }, + { + "epoch": 10.75907590759076, + "grad_norm": 0.21606872975826263, + "learning_rate": 2.6051399495581196e-05, + "loss": 0.1192, + "num_input_tokens_seen": 20640736, + "step": 97800 + }, + { + "epoch": 10.75962596259626, + "grad_norm": 0.9389128684997559, + "learning_rate": 2.6049001546457265e-05, + "loss": 0.1083, + "num_input_tokens_seen": 20641856, + "step": 97805 + }, + { + "epoch": 10.76017601760176, + "grad_norm": 0.005672059021890163, + "learning_rate": 2.6046603587665198e-05, + "loss": 0.025, + "num_input_tokens_seen": 20642944, + "step": 97810 + }, + { + "epoch": 10.76072607260726, + "grad_norm": 0.006918081548064947, + "learning_rate": 2.6044205619227086e-05, + "loss": 0.0588, + "num_input_tokens_seen": 20644000, + "step": 97815 + }, + { + "epoch": 10.761276127612762, + "grad_norm": 0.17930062115192413, + "learning_rate": 2.6041807641165023e-05, + "loss": 0.0069, + "num_input_tokens_seen": 20645024, + "step": 97820 + }, + { + "epoch": 10.761826182618261, + "grad_norm": 0.014944612979888916, + "learning_rate": 2.6039409653501123e-05, + "loss": 0.0067, + "num_input_tokens_seen": 20646016, + "step": 97825 + }, + { + "epoch": 10.762376237623762, + "grad_norm": 0.0035720409359782934, + "learning_rate": 2.6037011656257487e-05, + "loss": 0.0021, + "num_input_tokens_seen": 20647008, + "step": 97830 + }, + { + "epoch": 10.762926292629263, + "grad_norm": 0.029497487470507622, + "learning_rate": 2.603461364945621e-05, + "loss": 0.0122, + "num_input_tokens_seen": 20648064, + "step": 97835 + }, + { + "epoch": 10.763476347634764, + "grad_norm": 0.0792309120297432, + "learning_rate": 2.6032215633119393e-05, + "loss": 0.0167, + "num_input_tokens_seen": 20649152, + "step": 97840 + }, + { + "epoch": 10.764026402640264, + "grad_norm": 0.2038002759218216, + "learning_rate": 2.6029817607269147e-05, + "loss": 0.0113, + "num_input_tokens_seen": 20650240, + "step": 97845 + }, + { + "epoch": 10.764576457645765, + "grad_norm": 0.014623304829001427, + "learning_rate": 2.6027419571927552e-05, + "loss": 0.0214, + "num_input_tokens_seen": 20651296, + "step": 97850 + }, + { + "epoch": 10.765126512651266, + "grad_norm": 0.00969963613897562, + "learning_rate": 2.6025021527116745e-05, + "loss": 0.0015, + "num_input_tokens_seen": 20652448, + "step": 97855 + }, + { + "epoch": 10.765676567656765, + "grad_norm": 0.0060070576146245, + "learning_rate": 2.6022623472858792e-05, + "loss": 0.0096, + "num_input_tokens_seen": 20653472, + "step": 97860 + }, + { + "epoch": 10.766226622662266, + "grad_norm": 0.009866670705378056, + "learning_rate": 2.6020225409175806e-05, + "loss": 0.1186, + "num_input_tokens_seen": 20654560, + "step": 97865 + }, + { + "epoch": 10.766776677667767, + "grad_norm": 0.01525522768497467, + "learning_rate": 2.60178273360899e-05, + "loss": 0.0088, + "num_input_tokens_seen": 20655648, + "step": 97870 + }, + { + "epoch": 10.767326732673267, + "grad_norm": 0.07305491715669632, + "learning_rate": 2.6015429253623163e-05, + "loss": 0.0081, + "num_input_tokens_seen": 20656768, + "step": 97875 + }, + { + "epoch": 10.767876787678768, + "grad_norm": 0.015054479241371155, + "learning_rate": 2.6013031161797707e-05, + "loss": 0.017, + "num_input_tokens_seen": 20657792, + "step": 97880 + }, + { + "epoch": 10.768426842684269, + "grad_norm": 0.019307082518935204, + "learning_rate": 2.6010633060635624e-05, + "loss": 0.0386, + "num_input_tokens_seen": 20658816, + "step": 97885 + }, + { + "epoch": 10.768976897689768, + "grad_norm": 0.49912258982658386, + "learning_rate": 2.6008234950159028e-05, + "loss": 0.0194, + "num_input_tokens_seen": 20659904, + "step": 97890 + }, + { + "epoch": 10.76952695269527, + "grad_norm": 0.6793057322502136, + "learning_rate": 2.6005836830390003e-05, + "loss": 0.0269, + "num_input_tokens_seen": 20660960, + "step": 97895 + }, + { + "epoch": 10.77007700770077, + "grad_norm": 0.15071143209934235, + "learning_rate": 2.600343870135067e-05, + "loss": 0.111, + "num_input_tokens_seen": 20662080, + "step": 97900 + }, + { + "epoch": 10.770627062706271, + "grad_norm": 0.006025332026183605, + "learning_rate": 2.6001040563063122e-05, + "loss": 0.0033, + "num_input_tokens_seen": 20663232, + "step": 97905 + }, + { + "epoch": 10.77117711771177, + "grad_norm": 1.5883067846298218, + "learning_rate": 2.599864241554946e-05, + "loss": 0.0759, + "num_input_tokens_seen": 20664256, + "step": 97910 + }, + { + "epoch": 10.771727172717272, + "grad_norm": 0.01013023592531681, + "learning_rate": 2.59962442588318e-05, + "loss": 0.0578, + "num_input_tokens_seen": 20665280, + "step": 97915 + }, + { + "epoch": 10.772277227722773, + "grad_norm": 0.07956419140100479, + "learning_rate": 2.5993846092932227e-05, + "loss": 0.0197, + "num_input_tokens_seen": 20666304, + "step": 97920 + }, + { + "epoch": 10.772827282728272, + "grad_norm": 2.1352880001068115, + "learning_rate": 2.5991447917872852e-05, + "loss": 0.0734, + "num_input_tokens_seen": 20667328, + "step": 97925 + }, + { + "epoch": 10.773377337733773, + "grad_norm": 2.8716580867767334, + "learning_rate": 2.5989049733675787e-05, + "loss": 0.1554, + "num_input_tokens_seen": 20668416, + "step": 97930 + }, + { + "epoch": 10.773927392739274, + "grad_norm": 0.05899006128311157, + "learning_rate": 2.598665154036311e-05, + "loss": 0.0251, + "num_input_tokens_seen": 20669472, + "step": 97935 + }, + { + "epoch": 10.774477447744774, + "grad_norm": 0.07710527628660202, + "learning_rate": 2.5984253337956943e-05, + "loss": 0.1601, + "num_input_tokens_seen": 20670560, + "step": 97940 + }, + { + "epoch": 10.775027502750275, + "grad_norm": 0.010804865509271622, + "learning_rate": 2.5981855126479393e-05, + "loss": 0.1048, + "num_input_tokens_seen": 20671680, + "step": 97945 + }, + { + "epoch": 10.775577557755776, + "grad_norm": 0.07631482928991318, + "learning_rate": 2.597945690595255e-05, + "loss": 0.007, + "num_input_tokens_seen": 20672704, + "step": 97950 + }, + { + "epoch": 10.776127612761275, + "grad_norm": 0.09393609315156937, + "learning_rate": 2.5977058676398523e-05, + "loss": 0.003, + "num_input_tokens_seen": 20673728, + "step": 97955 + }, + { + "epoch": 10.776677667766776, + "grad_norm": 0.018626445904374123, + "learning_rate": 2.5974660437839416e-05, + "loss": 0.0094, + "num_input_tokens_seen": 20674880, + "step": 97960 + }, + { + "epoch": 10.777227722772277, + "grad_norm": 0.15381449460983276, + "learning_rate": 2.597226219029733e-05, + "loss": 0.0425, + "num_input_tokens_seen": 20675936, + "step": 97965 + }, + { + "epoch": 10.777777777777779, + "grad_norm": 0.042178601026535034, + "learning_rate": 2.596986393379437e-05, + "loss": 0.0169, + "num_input_tokens_seen": 20677024, + "step": 97970 + }, + { + "epoch": 10.778327832783278, + "grad_norm": 0.06298775970935822, + "learning_rate": 2.5967465668352647e-05, + "loss": 0.1104, + "num_input_tokens_seen": 20678048, + "step": 97975 + }, + { + "epoch": 10.778877887788779, + "grad_norm": 0.02472447231411934, + "learning_rate": 2.5965067393994248e-05, + "loss": 0.0258, + "num_input_tokens_seen": 20679136, + "step": 97980 + }, + { + "epoch": 10.77942794279428, + "grad_norm": 0.06508802622556686, + "learning_rate": 2.5962669110741287e-05, + "loss": 0.0088, + "num_input_tokens_seen": 20680160, + "step": 97985 + }, + { + "epoch": 10.77997799779978, + "grad_norm": 0.03575325757265091, + "learning_rate": 2.5960270818615867e-05, + "loss": 0.0232, + "num_input_tokens_seen": 20681184, + "step": 97990 + }, + { + "epoch": 10.78052805280528, + "grad_norm": 0.3139481246471405, + "learning_rate": 2.595787251764009e-05, + "loss": 0.008, + "num_input_tokens_seen": 20682304, + "step": 97995 + }, + { + "epoch": 10.781078107810782, + "grad_norm": 0.010341082699596882, + "learning_rate": 2.5955474207836072e-05, + "loss": 0.0092, + "num_input_tokens_seen": 20683360, + "step": 98000 + }, + { + "epoch": 10.781628162816281, + "grad_norm": 0.4555876851081848, + "learning_rate": 2.5953075889225897e-05, + "loss": 0.0162, + "num_input_tokens_seen": 20684416, + "step": 98005 + }, + { + "epoch": 10.782178217821782, + "grad_norm": 0.5369663834571838, + "learning_rate": 2.5950677561831678e-05, + "loss": 0.0135, + "num_input_tokens_seen": 20685472, + "step": 98010 + }, + { + "epoch": 10.782728272827283, + "grad_norm": 0.055620256811380386, + "learning_rate": 2.5948279225675526e-05, + "loss": 0.0068, + "num_input_tokens_seen": 20686528, + "step": 98015 + }, + { + "epoch": 10.783278327832782, + "grad_norm": 0.11737733334302902, + "learning_rate": 2.5945880880779533e-05, + "loss": 0.0186, + "num_input_tokens_seen": 20687584, + "step": 98020 + }, + { + "epoch": 10.783828382838283, + "grad_norm": 0.009471205994486809, + "learning_rate": 2.594348252716582e-05, + "loss": 0.0214, + "num_input_tokens_seen": 20688640, + "step": 98025 + }, + { + "epoch": 10.784378437843785, + "grad_norm": 0.008267899975180626, + "learning_rate": 2.5941084164856472e-05, + "loss": 0.0118, + "num_input_tokens_seen": 20689632, + "step": 98030 + }, + { + "epoch": 10.784928492849286, + "grad_norm": 0.017569517716765404, + "learning_rate": 2.593868579387361e-05, + "loss": 0.0497, + "num_input_tokens_seen": 20690688, + "step": 98035 + }, + { + "epoch": 10.785478547854785, + "grad_norm": 0.04597518965601921, + "learning_rate": 2.593628741423932e-05, + "loss": 0.0112, + "num_input_tokens_seen": 20691712, + "step": 98040 + }, + { + "epoch": 10.786028602860286, + "grad_norm": 0.008756413124501705, + "learning_rate": 2.5933889025975728e-05, + "loss": 0.0979, + "num_input_tokens_seen": 20692768, + "step": 98045 + }, + { + "epoch": 10.786578657865787, + "grad_norm": 0.012438913807272911, + "learning_rate": 2.5931490629104922e-05, + "loss": 0.0063, + "num_input_tokens_seen": 20693824, + "step": 98050 + }, + { + "epoch": 10.787128712871286, + "grad_norm": 0.06676594913005829, + "learning_rate": 2.592909222364902e-05, + "loss": 0.0335, + "num_input_tokens_seen": 20694880, + "step": 98055 + }, + { + "epoch": 10.787678767876788, + "grad_norm": 2.34183406829834, + "learning_rate": 2.592669380963012e-05, + "loss": 0.1123, + "num_input_tokens_seen": 20695936, + "step": 98060 + }, + { + "epoch": 10.788228822882289, + "grad_norm": 4.6791558265686035, + "learning_rate": 2.5924295387070325e-05, + "loss": 0.0204, + "num_input_tokens_seen": 20696960, + "step": 98065 + }, + { + "epoch": 10.788778877887788, + "grad_norm": 0.027126524597406387, + "learning_rate": 2.5921896955991748e-05, + "loss": 0.0106, + "num_input_tokens_seen": 20697952, + "step": 98070 + }, + { + "epoch": 10.789328932893289, + "grad_norm": 0.04827950894832611, + "learning_rate": 2.5919498516416485e-05, + "loss": 0.0538, + "num_input_tokens_seen": 20699008, + "step": 98075 + }, + { + "epoch": 10.78987898789879, + "grad_norm": 0.022051820531487465, + "learning_rate": 2.591710006836664e-05, + "loss": 0.0118, + "num_input_tokens_seen": 20700032, + "step": 98080 + }, + { + "epoch": 10.79042904290429, + "grad_norm": 0.06787826865911484, + "learning_rate": 2.5914701611864333e-05, + "loss": 0.0049, + "num_input_tokens_seen": 20701056, + "step": 98085 + }, + { + "epoch": 10.79097909790979, + "grad_norm": 0.06591121107339859, + "learning_rate": 2.5912303146931655e-05, + "loss": 0.0125, + "num_input_tokens_seen": 20702144, + "step": 98090 + }, + { + "epoch": 10.791529152915292, + "grad_norm": 0.005073595326393843, + "learning_rate": 2.590990467359072e-05, + "loss": 0.0034, + "num_input_tokens_seen": 20703200, + "step": 98095 + }, + { + "epoch": 10.792079207920793, + "grad_norm": 0.043740998953580856, + "learning_rate": 2.5907506191863627e-05, + "loss": 0.0436, + "num_input_tokens_seen": 20704224, + "step": 98100 + }, + { + "epoch": 10.792629262926292, + "grad_norm": 0.09130402654409409, + "learning_rate": 2.5905107701772492e-05, + "loss": 0.0041, + "num_input_tokens_seen": 20705280, + "step": 98105 + }, + { + "epoch": 10.793179317931793, + "grad_norm": 0.025276869535446167, + "learning_rate": 2.5902709203339403e-05, + "loss": 0.0042, + "num_input_tokens_seen": 20706368, + "step": 98110 + }, + { + "epoch": 10.793729372937294, + "grad_norm": 0.36926254630088806, + "learning_rate": 2.5900310696586483e-05, + "loss": 0.0812, + "num_input_tokens_seen": 20707392, + "step": 98115 + }, + { + "epoch": 10.794279427942794, + "grad_norm": 0.011076978407800198, + "learning_rate": 2.589791218153583e-05, + "loss": 0.0098, + "num_input_tokens_seen": 20708480, + "step": 98120 + }, + { + "epoch": 10.794829482948295, + "grad_norm": 0.27643030881881714, + "learning_rate": 2.5895513658209552e-05, + "loss": 0.0523, + "num_input_tokens_seen": 20709472, + "step": 98125 + }, + { + "epoch": 10.795379537953796, + "grad_norm": 0.038870424032211304, + "learning_rate": 2.589311512662975e-05, + "loss": 0.0574, + "num_input_tokens_seen": 20710496, + "step": 98130 + }, + { + "epoch": 10.795929592959295, + "grad_norm": 0.007085281889885664, + "learning_rate": 2.589071658681854e-05, + "loss": 0.0024, + "num_input_tokens_seen": 20711520, + "step": 98135 + }, + { + "epoch": 10.796479647964796, + "grad_norm": 3.648867607116699, + "learning_rate": 2.5888318038798026e-05, + "loss": 0.0678, + "num_input_tokens_seen": 20712576, + "step": 98140 + }, + { + "epoch": 10.797029702970297, + "grad_norm": 0.017689086496829987, + "learning_rate": 2.5885919482590304e-05, + "loss": 0.1302, + "num_input_tokens_seen": 20713600, + "step": 98145 + }, + { + "epoch": 10.797579757975798, + "grad_norm": 0.6753984689712524, + "learning_rate": 2.588352091821749e-05, + "loss": 0.0275, + "num_input_tokens_seen": 20714688, + "step": 98150 + }, + { + "epoch": 10.798129812981298, + "grad_norm": 1.1189912557601929, + "learning_rate": 2.588112234570168e-05, + "loss": 0.0786, + "num_input_tokens_seen": 20715680, + "step": 98155 + }, + { + "epoch": 10.798679867986799, + "grad_norm": 0.014943662099540234, + "learning_rate": 2.5878723765065e-05, + "loss": 0.0045, + "num_input_tokens_seen": 20716736, + "step": 98160 + }, + { + "epoch": 10.7992299229923, + "grad_norm": 0.05515924096107483, + "learning_rate": 2.5876325176329542e-05, + "loss": 0.0045, + "num_input_tokens_seen": 20717824, + "step": 98165 + }, + { + "epoch": 10.7997799779978, + "grad_norm": 1.4767833948135376, + "learning_rate": 2.587392657951741e-05, + "loss": 0.153, + "num_input_tokens_seen": 20718848, + "step": 98170 + }, + { + "epoch": 10.8003300330033, + "grad_norm": 1.829053521156311, + "learning_rate": 2.5871527974650717e-05, + "loss": 0.1106, + "num_input_tokens_seen": 20719936, + "step": 98175 + }, + { + "epoch": 10.800880088008801, + "grad_norm": 1.4270002841949463, + "learning_rate": 2.5869129361751572e-05, + "loss": 0.0621, + "num_input_tokens_seen": 20721056, + "step": 98180 + }, + { + "epoch": 10.8014301430143, + "grad_norm": 0.3961760997772217, + "learning_rate": 2.586673074084207e-05, + "loss": 0.0103, + "num_input_tokens_seen": 20722080, + "step": 98185 + }, + { + "epoch": 10.801980198019802, + "grad_norm": 0.013512630015611649, + "learning_rate": 2.5864332111944345e-05, + "loss": 0.0185, + "num_input_tokens_seen": 20723104, + "step": 98190 + }, + { + "epoch": 10.802530253025303, + "grad_norm": 3.931656837463379, + "learning_rate": 2.5861933475080476e-05, + "loss": 0.1216, + "num_input_tokens_seen": 20724192, + "step": 98195 + }, + { + "epoch": 10.803080308030804, + "grad_norm": 0.30917879939079285, + "learning_rate": 2.585953483027257e-05, + "loss": 0.0529, + "num_input_tokens_seen": 20725280, + "step": 98200 + }, + { + "epoch": 10.803630363036303, + "grad_norm": 0.008885517716407776, + "learning_rate": 2.5857136177542756e-05, + "loss": 0.0635, + "num_input_tokens_seen": 20726400, + "step": 98205 + }, + { + "epoch": 10.804180418041804, + "grad_norm": 0.026647266000509262, + "learning_rate": 2.5854737516913125e-05, + "loss": 0.0109, + "num_input_tokens_seen": 20727456, + "step": 98210 + }, + { + "epoch": 10.804730473047305, + "grad_norm": 0.07626882940530777, + "learning_rate": 2.5852338848405788e-05, + "loss": 0.1249, + "num_input_tokens_seen": 20728576, + "step": 98215 + }, + { + "epoch": 10.805280528052805, + "grad_norm": 0.8734776973724365, + "learning_rate": 2.5849940172042858e-05, + "loss": 0.0278, + "num_input_tokens_seen": 20729664, + "step": 98220 + }, + { + "epoch": 10.805830583058306, + "grad_norm": 0.021840929985046387, + "learning_rate": 2.5847541487846432e-05, + "loss": 0.0042, + "num_input_tokens_seen": 20730688, + "step": 98225 + }, + { + "epoch": 10.806380638063807, + "grad_norm": 1.5331026315689087, + "learning_rate": 2.5845142795838617e-05, + "loss": 0.1016, + "num_input_tokens_seen": 20731776, + "step": 98230 + }, + { + "epoch": 10.806930693069306, + "grad_norm": 0.1982942670583725, + "learning_rate": 2.5842744096041545e-05, + "loss": 0.0591, + "num_input_tokens_seen": 20732832, + "step": 98235 + }, + { + "epoch": 10.807480748074807, + "grad_norm": 0.566849410533905, + "learning_rate": 2.5840345388477282e-05, + "loss": 0.0269, + "num_input_tokens_seen": 20733856, + "step": 98240 + }, + { + "epoch": 10.808030803080309, + "grad_norm": 0.935139000415802, + "learning_rate": 2.5837946673167974e-05, + "loss": 0.0183, + "num_input_tokens_seen": 20734848, + "step": 98245 + }, + { + "epoch": 10.808580858085808, + "grad_norm": 0.04563964158296585, + "learning_rate": 2.5835547950135714e-05, + "loss": 0.0022, + "num_input_tokens_seen": 20735936, + "step": 98250 + }, + { + "epoch": 10.809130913091309, + "grad_norm": 0.441342294216156, + "learning_rate": 2.5833149219402603e-05, + "loss": 0.0072, + "num_input_tokens_seen": 20736992, + "step": 98255 + }, + { + "epoch": 10.80968096809681, + "grad_norm": 0.12923666834831238, + "learning_rate": 2.5830750480990756e-05, + "loss": 0.0875, + "num_input_tokens_seen": 20738080, + "step": 98260 + }, + { + "epoch": 10.810231023102311, + "grad_norm": 0.05656015872955322, + "learning_rate": 2.5828351734922286e-05, + "loss": 0.003, + "num_input_tokens_seen": 20739168, + "step": 98265 + }, + { + "epoch": 10.81078107810781, + "grad_norm": 0.5634557604789734, + "learning_rate": 2.582595298121929e-05, + "loss": 0.0065, + "num_input_tokens_seen": 20740224, + "step": 98270 + }, + { + "epoch": 10.811331133113312, + "grad_norm": 1.7162871360778809, + "learning_rate": 2.5823554219903877e-05, + "loss": 0.0211, + "num_input_tokens_seen": 20741312, + "step": 98275 + }, + { + "epoch": 10.811881188118813, + "grad_norm": 1.2186007499694824, + "learning_rate": 2.582115545099817e-05, + "loss": 0.0265, + "num_input_tokens_seen": 20742368, + "step": 98280 + }, + { + "epoch": 10.812431243124312, + "grad_norm": 0.15452493727207184, + "learning_rate": 2.5818756674524263e-05, + "loss": 0.0125, + "num_input_tokens_seen": 20743392, + "step": 98285 + }, + { + "epoch": 10.812981298129813, + "grad_norm": 0.05211225152015686, + "learning_rate": 2.5816357890504272e-05, + "loss": 0.0091, + "num_input_tokens_seen": 20744512, + "step": 98290 + }, + { + "epoch": 10.813531353135314, + "grad_norm": 0.9199549555778503, + "learning_rate": 2.58139590989603e-05, + "loss": 0.0162, + "num_input_tokens_seen": 20745600, + "step": 98295 + }, + { + "epoch": 10.814081408140813, + "grad_norm": 3.53641939163208, + "learning_rate": 2.581156029991445e-05, + "loss": 0.0812, + "num_input_tokens_seen": 20746688, + "step": 98300 + }, + { + "epoch": 10.814631463146315, + "grad_norm": 0.03972864896059036, + "learning_rate": 2.5809161493388845e-05, + "loss": 0.0025, + "num_input_tokens_seen": 20747776, + "step": 98305 + }, + { + "epoch": 10.815181518151816, + "grad_norm": 1.6957625150680542, + "learning_rate": 2.5806762679405592e-05, + "loss": 0.0349, + "num_input_tokens_seen": 20748864, + "step": 98310 + }, + { + "epoch": 10.815731573157315, + "grad_norm": 0.02426299639046192, + "learning_rate": 2.5804363857986786e-05, + "loss": 0.0285, + "num_input_tokens_seen": 20749888, + "step": 98315 + }, + { + "epoch": 10.816281628162816, + "grad_norm": 0.23529240489006042, + "learning_rate": 2.580196502915455e-05, + "loss": 0.0085, + "num_input_tokens_seen": 20750976, + "step": 98320 + }, + { + "epoch": 10.816831683168317, + "grad_norm": 0.01803809031844139, + "learning_rate": 2.5799566192930985e-05, + "loss": 0.014, + "num_input_tokens_seen": 20752064, + "step": 98325 + }, + { + "epoch": 10.817381738173818, + "grad_norm": 1.1191771030426025, + "learning_rate": 2.5797167349338204e-05, + "loss": 0.1587, + "num_input_tokens_seen": 20753152, + "step": 98330 + }, + { + "epoch": 10.817931793179318, + "grad_norm": 0.3223310112953186, + "learning_rate": 2.5794768498398308e-05, + "loss": 0.0605, + "num_input_tokens_seen": 20754176, + "step": 98335 + }, + { + "epoch": 10.818481848184819, + "grad_norm": 0.009058537892997265, + "learning_rate": 2.579236964013342e-05, + "loss": 0.069, + "num_input_tokens_seen": 20755200, + "step": 98340 + }, + { + "epoch": 10.81903190319032, + "grad_norm": 0.258619099855423, + "learning_rate": 2.578997077456563e-05, + "loss": 0.0162, + "num_input_tokens_seen": 20756288, + "step": 98345 + }, + { + "epoch": 10.819581958195819, + "grad_norm": 0.6372324228286743, + "learning_rate": 2.5787571901717067e-05, + "loss": 0.0863, + "num_input_tokens_seen": 20757344, + "step": 98350 + }, + { + "epoch": 10.82013201320132, + "grad_norm": 0.007406021934002638, + "learning_rate": 2.578517302160983e-05, + "loss": 0.0034, + "num_input_tokens_seen": 20758432, + "step": 98355 + }, + { + "epoch": 10.820682068206821, + "grad_norm": 0.6530166268348694, + "learning_rate": 2.578277413426603e-05, + "loss": 0.0885, + "num_input_tokens_seen": 20759488, + "step": 98360 + }, + { + "epoch": 10.82123212321232, + "grad_norm": 0.0808178186416626, + "learning_rate": 2.5780375239707776e-05, + "loss": 0.0017, + "num_input_tokens_seen": 20760576, + "step": 98365 + }, + { + "epoch": 10.821782178217822, + "grad_norm": 0.01592717319726944, + "learning_rate": 2.577797633795718e-05, + "loss": 0.0458, + "num_input_tokens_seen": 20761632, + "step": 98370 + }, + { + "epoch": 10.822332233223323, + "grad_norm": 0.6336926817893982, + "learning_rate": 2.5775577429036345e-05, + "loss": 0.0141, + "num_input_tokens_seen": 20762688, + "step": 98375 + }, + { + "epoch": 10.822882288228822, + "grad_norm": 0.07473183423280716, + "learning_rate": 2.5773178512967394e-05, + "loss": 0.0251, + "num_input_tokens_seen": 20763648, + "step": 98380 + }, + { + "epoch": 10.823432343234323, + "grad_norm": 0.012966446578502655, + "learning_rate": 2.577077958977242e-05, + "loss": 0.03, + "num_input_tokens_seen": 20764736, + "step": 98385 + }, + { + "epoch": 10.823982398239824, + "grad_norm": 0.44378167390823364, + "learning_rate": 2.576838065947354e-05, + "loss": 0.0247, + "num_input_tokens_seen": 20765728, + "step": 98390 + }, + { + "epoch": 10.824532453245325, + "grad_norm": 0.047416456043720245, + "learning_rate": 2.576598172209287e-05, + "loss": 0.033, + "num_input_tokens_seen": 20766688, + "step": 98395 + }, + { + "epoch": 10.825082508250825, + "grad_norm": 0.1798778921365738, + "learning_rate": 2.5763582777652505e-05, + "loss": 0.0401, + "num_input_tokens_seen": 20767776, + "step": 98400 + }, + { + "epoch": 10.825632563256326, + "grad_norm": 0.04909062758088112, + "learning_rate": 2.5761183826174575e-05, + "loss": 0.1067, + "num_input_tokens_seen": 20768832, + "step": 98405 + }, + { + "epoch": 10.826182618261827, + "grad_norm": 0.06699655205011368, + "learning_rate": 2.5758784867681175e-05, + "loss": 0.0319, + "num_input_tokens_seen": 20769888, + "step": 98410 + }, + { + "epoch": 10.826732673267326, + "grad_norm": 0.14747101068496704, + "learning_rate": 2.5756385902194408e-05, + "loss": 0.0055, + "num_input_tokens_seen": 20770944, + "step": 98415 + }, + { + "epoch": 10.827282728272827, + "grad_norm": 0.015888947993516922, + "learning_rate": 2.5753986929736407e-05, + "loss": 0.0088, + "num_input_tokens_seen": 20772032, + "step": 98420 + }, + { + "epoch": 10.827832783278328, + "grad_norm": 0.01340493280440569, + "learning_rate": 2.575158795032927e-05, + "loss": 0.0201, + "num_input_tokens_seen": 20773088, + "step": 98425 + }, + { + "epoch": 10.828382838283828, + "grad_norm": 0.1911136656999588, + "learning_rate": 2.5749188963995103e-05, + "loss": 0.0768, + "num_input_tokens_seen": 20774176, + "step": 98430 + }, + { + "epoch": 10.828932893289329, + "grad_norm": 0.015160066075623035, + "learning_rate": 2.5746789970756023e-05, + "loss": 0.0039, + "num_input_tokens_seen": 20775232, + "step": 98435 + }, + { + "epoch": 10.82948294829483, + "grad_norm": 0.28603824973106384, + "learning_rate": 2.5744390970634136e-05, + "loss": 0.0051, + "num_input_tokens_seen": 20776256, + "step": 98440 + }, + { + "epoch": 10.83003300330033, + "grad_norm": 0.033552076667547226, + "learning_rate": 2.5741991963651553e-05, + "loss": 0.0678, + "num_input_tokens_seen": 20777344, + "step": 98445 + }, + { + "epoch": 10.83058305830583, + "grad_norm": 0.12800729274749756, + "learning_rate": 2.57395929498304e-05, + "loss": 0.052, + "num_input_tokens_seen": 20778432, + "step": 98450 + }, + { + "epoch": 10.831133113311331, + "grad_norm": 0.9109835624694824, + "learning_rate": 2.573719392919276e-05, + "loss": 0.0408, + "num_input_tokens_seen": 20779456, + "step": 98455 + }, + { + "epoch": 10.831683168316832, + "grad_norm": 0.06725466996431351, + "learning_rate": 2.5734794901760756e-05, + "loss": 0.0406, + "num_input_tokens_seen": 20780512, + "step": 98460 + }, + { + "epoch": 10.832233223322332, + "grad_norm": 0.013020727783441544, + "learning_rate": 2.573239586755651e-05, + "loss": 0.0032, + "num_input_tokens_seen": 20781568, + "step": 98465 + }, + { + "epoch": 10.832783278327833, + "grad_norm": 0.33031630516052246, + "learning_rate": 2.5729996826602114e-05, + "loss": 0.0064, + "num_input_tokens_seen": 20782560, + "step": 98470 + }, + { + "epoch": 10.833333333333334, + "grad_norm": 0.08110267668962479, + "learning_rate": 2.5727597778919693e-05, + "loss": 0.0248, + "num_input_tokens_seen": 20783648, + "step": 98475 + }, + { + "epoch": 10.833883388338833, + "grad_norm": 0.16068077087402344, + "learning_rate": 2.5725198724531346e-05, + "loss": 0.0099, + "num_input_tokens_seen": 20784736, + "step": 98480 + }, + { + "epoch": 10.834433443344334, + "grad_norm": 0.057532425969839096, + "learning_rate": 2.5722799663459197e-05, + "loss": 0.005, + "num_input_tokens_seen": 20785728, + "step": 98485 + }, + { + "epoch": 10.834983498349835, + "grad_norm": 0.2415921688079834, + "learning_rate": 2.5720400595725348e-05, + "loss": 0.0515, + "num_input_tokens_seen": 20786880, + "step": 98490 + }, + { + "epoch": 10.835533553355335, + "grad_norm": 0.09156524389982224, + "learning_rate": 2.571800152135191e-05, + "loss": 0.015, + "num_input_tokens_seen": 20787936, + "step": 98495 + }, + { + "epoch": 10.836083608360836, + "grad_norm": 0.02350478060543537, + "learning_rate": 2.5715602440361e-05, + "loss": 0.0814, + "num_input_tokens_seen": 20788992, + "step": 98500 + }, + { + "epoch": 10.836633663366337, + "grad_norm": 0.09764846414327621, + "learning_rate": 2.5713203352774723e-05, + "loss": 0.0835, + "num_input_tokens_seen": 20790048, + "step": 98505 + }, + { + "epoch": 10.837183718371836, + "grad_norm": 0.19104675948619843, + "learning_rate": 2.5710804258615197e-05, + "loss": 0.0173, + "num_input_tokens_seen": 20791072, + "step": 98510 + }, + { + "epoch": 10.837733773377337, + "grad_norm": 0.06962364912033081, + "learning_rate": 2.5708405157904525e-05, + "loss": 0.0056, + "num_input_tokens_seen": 20792128, + "step": 98515 + }, + { + "epoch": 10.838283828382838, + "grad_norm": 0.21274764835834503, + "learning_rate": 2.5706006050664826e-05, + "loss": 0.0118, + "num_input_tokens_seen": 20793216, + "step": 98520 + }, + { + "epoch": 10.83883388338834, + "grad_norm": 0.3764517605304718, + "learning_rate": 2.5703606936918205e-05, + "loss": 0.0431, + "num_input_tokens_seen": 20794272, + "step": 98525 + }, + { + "epoch": 10.839383938393839, + "grad_norm": 0.15859320759773254, + "learning_rate": 2.5701207816686772e-05, + "loss": 0.0191, + "num_input_tokens_seen": 20795360, + "step": 98530 + }, + { + "epoch": 10.83993399339934, + "grad_norm": 0.021144751459360123, + "learning_rate": 2.5698808689992653e-05, + "loss": 0.0019, + "num_input_tokens_seen": 20796448, + "step": 98535 + }, + { + "epoch": 10.840484048404841, + "grad_norm": 0.004897498060017824, + "learning_rate": 2.5696409556857947e-05, + "loss": 0.004, + "num_input_tokens_seen": 20797536, + "step": 98540 + }, + { + "epoch": 10.84103410341034, + "grad_norm": 0.00942232832312584, + "learning_rate": 2.5694010417304763e-05, + "loss": 0.005, + "num_input_tokens_seen": 20798592, + "step": 98545 + }, + { + "epoch": 10.841584158415841, + "grad_norm": 0.020532073453068733, + "learning_rate": 2.5691611271355228e-05, + "loss": 0.0042, + "num_input_tokens_seen": 20799648, + "step": 98550 + }, + { + "epoch": 10.842134213421343, + "grad_norm": 0.019116543233394623, + "learning_rate": 2.5689212119031437e-05, + "loss": 0.0426, + "num_input_tokens_seen": 20800736, + "step": 98555 + }, + { + "epoch": 10.842684268426842, + "grad_norm": 0.13261160254478455, + "learning_rate": 2.5686812960355506e-05, + "loss": 0.0839, + "num_input_tokens_seen": 20801792, + "step": 98560 + }, + { + "epoch": 10.843234323432343, + "grad_norm": 0.9929161667823792, + "learning_rate": 2.5684413795349548e-05, + "loss": 0.0416, + "num_input_tokens_seen": 20802880, + "step": 98565 + }, + { + "epoch": 10.843784378437844, + "grad_norm": 1.009606957435608, + "learning_rate": 2.568201462403569e-05, + "loss": 0.0751, + "num_input_tokens_seen": 20803936, + "step": 98570 + }, + { + "epoch": 10.844334433443345, + "grad_norm": 0.4992409944534302, + "learning_rate": 2.5679615446436018e-05, + "loss": 0.0887, + "num_input_tokens_seen": 20804928, + "step": 98575 + }, + { + "epoch": 10.844884488448844, + "grad_norm": 0.7547467350959778, + "learning_rate": 2.5677216262572657e-05, + "loss": 0.0321, + "num_input_tokens_seen": 20806016, + "step": 98580 + }, + { + "epoch": 10.845434543454346, + "grad_norm": 0.21625787019729614, + "learning_rate": 2.567481707246772e-05, + "loss": 0.0139, + "num_input_tokens_seen": 20807040, + "step": 98585 + }, + { + "epoch": 10.845984598459847, + "grad_norm": 0.008040688931941986, + "learning_rate": 2.5672417876143316e-05, + "loss": 0.0749, + "num_input_tokens_seen": 20808064, + "step": 98590 + }, + { + "epoch": 10.846534653465346, + "grad_norm": 0.021753067150712013, + "learning_rate": 2.567001867362157e-05, + "loss": 0.0177, + "num_input_tokens_seen": 20809152, + "step": 98595 + }, + { + "epoch": 10.847084708470847, + "grad_norm": 0.7149361968040466, + "learning_rate": 2.566761946492458e-05, + "loss": 0.0381, + "num_input_tokens_seen": 20810208, + "step": 98600 + }, + { + "epoch": 10.847634763476348, + "grad_norm": 2.2288169860839844, + "learning_rate": 2.566522025007445e-05, + "loss": 0.0949, + "num_input_tokens_seen": 20811264, + "step": 98605 + }, + { + "epoch": 10.848184818481847, + "grad_norm": 0.17657703161239624, + "learning_rate": 2.5662821029093313e-05, + "loss": 0.0127, + "num_input_tokens_seen": 20812288, + "step": 98610 + }, + { + "epoch": 10.848734873487349, + "grad_norm": 0.026835685595870018, + "learning_rate": 2.5660421802003276e-05, + "loss": 0.0044, + "num_input_tokens_seen": 20813344, + "step": 98615 + }, + { + "epoch": 10.84928492849285, + "grad_norm": 0.011521503329277039, + "learning_rate": 2.565802256882644e-05, + "loss": 0.0028, + "num_input_tokens_seen": 20814336, + "step": 98620 + }, + { + "epoch": 10.84983498349835, + "grad_norm": 1.322069525718689, + "learning_rate": 2.5655623329584933e-05, + "loss": 0.0819, + "num_input_tokens_seen": 20815328, + "step": 98625 + }, + { + "epoch": 10.85038503850385, + "grad_norm": 0.005416270811110735, + "learning_rate": 2.565322408430086e-05, + "loss": 0.1029, + "num_input_tokens_seen": 20816416, + "step": 98630 + }, + { + "epoch": 10.850935093509351, + "grad_norm": 0.05682147294282913, + "learning_rate": 2.5650824832996325e-05, + "loss": 0.0139, + "num_input_tokens_seen": 20817504, + "step": 98635 + }, + { + "epoch": 10.851485148514852, + "grad_norm": 0.031050320714712143, + "learning_rate": 2.564842557569346e-05, + "loss": 0.0073, + "num_input_tokens_seen": 20818496, + "step": 98640 + }, + { + "epoch": 10.852035203520352, + "grad_norm": 0.0034471882972866297, + "learning_rate": 2.5646026312414363e-05, + "loss": 0.0036, + "num_input_tokens_seen": 20819488, + "step": 98645 + }, + { + "epoch": 10.852585258525853, + "grad_norm": 0.27947333455085754, + "learning_rate": 2.5643627043181147e-05, + "loss": 0.0255, + "num_input_tokens_seen": 20820544, + "step": 98650 + }, + { + "epoch": 10.853135313531354, + "grad_norm": 0.03487987816333771, + "learning_rate": 2.5641227768015935e-05, + "loss": 0.083, + "num_input_tokens_seen": 20821632, + "step": 98655 + }, + { + "epoch": 10.853685368536853, + "grad_norm": 0.914910614490509, + "learning_rate": 2.5638828486940836e-05, + "loss": 0.0544, + "num_input_tokens_seen": 20822688, + "step": 98660 + }, + { + "epoch": 10.854235423542354, + "grad_norm": 0.05726753547787666, + "learning_rate": 2.5636429199977957e-05, + "loss": 0.0259, + "num_input_tokens_seen": 20823776, + "step": 98665 + }, + { + "epoch": 10.854785478547855, + "grad_norm": 0.090841144323349, + "learning_rate": 2.563402990714942e-05, + "loss": 0.0492, + "num_input_tokens_seen": 20824896, + "step": 98670 + }, + { + "epoch": 10.855335533553355, + "grad_norm": 0.11953217536211014, + "learning_rate": 2.563163060847733e-05, + "loss": 0.0291, + "num_input_tokens_seen": 20825984, + "step": 98675 + }, + { + "epoch": 10.855885588558856, + "grad_norm": 0.012545456178486347, + "learning_rate": 2.5629231303983804e-05, + "loss": 0.0514, + "num_input_tokens_seen": 20827072, + "step": 98680 + }, + { + "epoch": 10.856435643564357, + "grad_norm": 0.006126862484961748, + "learning_rate": 2.5626831993690954e-05, + "loss": 0.0024, + "num_input_tokens_seen": 20828064, + "step": 98685 + }, + { + "epoch": 10.856985698569858, + "grad_norm": 0.22201049327850342, + "learning_rate": 2.5624432677620895e-05, + "loss": 0.0469, + "num_input_tokens_seen": 20829152, + "step": 98690 + }, + { + "epoch": 10.857535753575357, + "grad_norm": 0.12068886309862137, + "learning_rate": 2.5622033355795742e-05, + "loss": 0.0076, + "num_input_tokens_seen": 20830304, + "step": 98695 + }, + { + "epoch": 10.858085808580858, + "grad_norm": 0.06212092936038971, + "learning_rate": 2.5619634028237606e-05, + "loss": 0.0079, + "num_input_tokens_seen": 20831296, + "step": 98700 + }, + { + "epoch": 10.85863586358636, + "grad_norm": 0.02603032998740673, + "learning_rate": 2.56172346949686e-05, + "loss": 0.0469, + "num_input_tokens_seen": 20832320, + "step": 98705 + }, + { + "epoch": 10.859185918591859, + "grad_norm": 0.014868345111608505, + "learning_rate": 2.5614835356010834e-05, + "loss": 0.0117, + "num_input_tokens_seen": 20833408, + "step": 98710 + }, + { + "epoch": 10.85973597359736, + "grad_norm": 1.9593530893325806, + "learning_rate": 2.561243601138643e-05, + "loss": 0.044, + "num_input_tokens_seen": 20834496, + "step": 98715 + }, + { + "epoch": 10.86028602860286, + "grad_norm": 0.242027148604393, + "learning_rate": 2.561003666111749e-05, + "loss": 0.0069, + "num_input_tokens_seen": 20835584, + "step": 98720 + }, + { + "epoch": 10.86083608360836, + "grad_norm": 1.7482856512069702, + "learning_rate": 2.5607637305226144e-05, + "loss": 0.0317, + "num_input_tokens_seen": 20836672, + "step": 98725 + }, + { + "epoch": 10.861386138613861, + "grad_norm": 0.1149650514125824, + "learning_rate": 2.5605237943734495e-05, + "loss": 0.0085, + "num_input_tokens_seen": 20837696, + "step": 98730 + }, + { + "epoch": 10.861936193619362, + "grad_norm": 0.006931331939995289, + "learning_rate": 2.5602838576664658e-05, + "loss": 0.0021, + "num_input_tokens_seen": 20838752, + "step": 98735 + }, + { + "epoch": 10.862486248624862, + "grad_norm": 0.00995652750134468, + "learning_rate": 2.5600439204038744e-05, + "loss": 0.0073, + "num_input_tokens_seen": 20839744, + "step": 98740 + }, + { + "epoch": 10.863036303630363, + "grad_norm": 0.084487684071064, + "learning_rate": 2.5598039825878873e-05, + "loss": 0.0658, + "num_input_tokens_seen": 20840768, + "step": 98745 + }, + { + "epoch": 10.863586358635864, + "grad_norm": 0.606807291507721, + "learning_rate": 2.5595640442207152e-05, + "loss": 0.0491, + "num_input_tokens_seen": 20841856, + "step": 98750 + }, + { + "epoch": 10.864136413641365, + "grad_norm": 0.03628092259168625, + "learning_rate": 2.5593241053045702e-05, + "loss": 0.003, + "num_input_tokens_seen": 20842912, + "step": 98755 + }, + { + "epoch": 10.864686468646864, + "grad_norm": 2.3545613288879395, + "learning_rate": 2.5590841658416638e-05, + "loss": 0.0491, + "num_input_tokens_seen": 20843968, + "step": 98760 + }, + { + "epoch": 10.865236523652365, + "grad_norm": 0.061729807406663895, + "learning_rate": 2.558844225834206e-05, + "loss": 0.0536, + "num_input_tokens_seen": 20845024, + "step": 98765 + }, + { + "epoch": 10.865786578657866, + "grad_norm": 1.1951682567596436, + "learning_rate": 2.55860428528441e-05, + "loss": 0.0123, + "num_input_tokens_seen": 20846080, + "step": 98770 + }, + { + "epoch": 10.866336633663366, + "grad_norm": 0.527289628982544, + "learning_rate": 2.558364344194486e-05, + "loss": 0.006, + "num_input_tokens_seen": 20847168, + "step": 98775 + }, + { + "epoch": 10.866886688668867, + "grad_norm": 0.01590881124138832, + "learning_rate": 2.5581244025666456e-05, + "loss": 0.0395, + "num_input_tokens_seen": 20848224, + "step": 98780 + }, + { + "epoch": 10.867436743674368, + "grad_norm": 1.2102584838867188, + "learning_rate": 2.5578844604031016e-05, + "loss": 0.0298, + "num_input_tokens_seen": 20849248, + "step": 98785 + }, + { + "epoch": 10.867986798679867, + "grad_norm": 0.03197941184043884, + "learning_rate": 2.5576445177060638e-05, + "loss": 0.0356, + "num_input_tokens_seen": 20850368, + "step": 98790 + }, + { + "epoch": 10.868536853685368, + "grad_norm": 0.05528254434466362, + "learning_rate": 2.557404574477743e-05, + "loss": 0.0687, + "num_input_tokens_seen": 20851424, + "step": 98795 + }, + { + "epoch": 10.86908690869087, + "grad_norm": 0.032160647213459015, + "learning_rate": 2.557164630720353e-05, + "loss": 0.0015, + "num_input_tokens_seen": 20852480, + "step": 98800 + }, + { + "epoch": 10.869636963696369, + "grad_norm": 0.8277379274368286, + "learning_rate": 2.5569246864361034e-05, + "loss": 0.0175, + "num_input_tokens_seen": 20853504, + "step": 98805 + }, + { + "epoch": 10.87018701870187, + "grad_norm": 0.01685892418026924, + "learning_rate": 2.556684741627207e-05, + "loss": 0.003, + "num_input_tokens_seen": 20854528, + "step": 98810 + }, + { + "epoch": 10.870737073707371, + "grad_norm": 0.01831248588860035, + "learning_rate": 2.5564447962958744e-05, + "loss": 0.0015, + "num_input_tokens_seen": 20855552, + "step": 98815 + }, + { + "epoch": 10.871287128712872, + "grad_norm": 0.03322621062397957, + "learning_rate": 2.5562048504443165e-05, + "loss": 0.0223, + "num_input_tokens_seen": 20856672, + "step": 98820 + }, + { + "epoch": 10.871837183718371, + "grad_norm": 0.01725294068455696, + "learning_rate": 2.5559649040747458e-05, + "loss": 0.0038, + "num_input_tokens_seen": 20857728, + "step": 98825 + }, + { + "epoch": 10.872387238723872, + "grad_norm": 1.3362152576446533, + "learning_rate": 2.5557249571893736e-05, + "loss": 0.0629, + "num_input_tokens_seen": 20858848, + "step": 98830 + }, + { + "epoch": 10.872937293729374, + "grad_norm": 0.0215210672467947, + "learning_rate": 2.5554850097904108e-05, + "loss": 0.0132, + "num_input_tokens_seen": 20859808, + "step": 98835 + }, + { + "epoch": 10.873487348734873, + "grad_norm": 0.01758131943643093, + "learning_rate": 2.55524506188007e-05, + "loss": 0.0077, + "num_input_tokens_seen": 20860864, + "step": 98840 + }, + { + "epoch": 10.874037403740374, + "grad_norm": 0.024607211351394653, + "learning_rate": 2.5550051134605618e-05, + "loss": 0.002, + "num_input_tokens_seen": 20861920, + "step": 98845 + }, + { + "epoch": 10.874587458745875, + "grad_norm": 1.4833613634109497, + "learning_rate": 2.5547651645340974e-05, + "loss": 0.0804, + "num_input_tokens_seen": 20862976, + "step": 98850 + }, + { + "epoch": 10.875137513751374, + "grad_norm": 0.010008459910750389, + "learning_rate": 2.554525215102889e-05, + "loss": 0.0451, + "num_input_tokens_seen": 20864000, + "step": 98855 + }, + { + "epoch": 10.875687568756875, + "grad_norm": 0.15934346616268158, + "learning_rate": 2.5542852651691484e-05, + "loss": 0.029, + "num_input_tokens_seen": 20865024, + "step": 98860 + }, + { + "epoch": 10.876237623762377, + "grad_norm": 0.07719647139310837, + "learning_rate": 2.554045314735085e-05, + "loss": 0.0015, + "num_input_tokens_seen": 20866016, + "step": 98865 + }, + { + "epoch": 10.876787678767876, + "grad_norm": 0.008803224191069603, + "learning_rate": 2.5538053638029136e-05, + "loss": 0.0167, + "num_input_tokens_seen": 20867008, + "step": 98870 + }, + { + "epoch": 10.877337733773377, + "grad_norm": 0.11178231984376907, + "learning_rate": 2.553565412374843e-05, + "loss": 0.0058, + "num_input_tokens_seen": 20868064, + "step": 98875 + }, + { + "epoch": 10.877887788778878, + "grad_norm": 0.05601118132472038, + "learning_rate": 2.553325460453086e-05, + "loss": 0.0152, + "num_input_tokens_seen": 20869216, + "step": 98880 + }, + { + "epoch": 10.87843784378438, + "grad_norm": 0.018895039334893227, + "learning_rate": 2.553085508039854e-05, + "loss": 0.0055, + "num_input_tokens_seen": 20870336, + "step": 98885 + }, + { + "epoch": 10.878987898789878, + "grad_norm": 0.16676390171051025, + "learning_rate": 2.552845555137358e-05, + "loss": 0.0491, + "num_input_tokens_seen": 20871360, + "step": 98890 + }, + { + "epoch": 10.87953795379538, + "grad_norm": 0.011159751564264297, + "learning_rate": 2.55260560174781e-05, + "loss": 0.0019, + "num_input_tokens_seen": 20872384, + "step": 98895 + }, + { + "epoch": 10.88008800880088, + "grad_norm": 0.03130064532160759, + "learning_rate": 2.5523656478734213e-05, + "loss": 0.0149, + "num_input_tokens_seen": 20873408, + "step": 98900 + }, + { + "epoch": 10.88063806380638, + "grad_norm": 0.3161200284957886, + "learning_rate": 2.5521256935164044e-05, + "loss": 0.008, + "num_input_tokens_seen": 20874464, + "step": 98905 + }, + { + "epoch": 10.881188118811881, + "grad_norm": 1.435802936553955, + "learning_rate": 2.5518857386789684e-05, + "loss": 0.0216, + "num_input_tokens_seen": 20875456, + "step": 98910 + }, + { + "epoch": 10.881738173817382, + "grad_norm": 0.012713773176074028, + "learning_rate": 2.5516457833633277e-05, + "loss": 0.0598, + "num_input_tokens_seen": 20876512, + "step": 98915 + }, + { + "epoch": 10.882288228822881, + "grad_norm": 0.035084232687950134, + "learning_rate": 2.5514058275716924e-05, + "loss": 0.0397, + "num_input_tokens_seen": 20877568, + "step": 98920 + }, + { + "epoch": 10.882838283828383, + "grad_norm": 0.018808025866746902, + "learning_rate": 2.551165871306274e-05, + "loss": 0.0162, + "num_input_tokens_seen": 20878624, + "step": 98925 + }, + { + "epoch": 10.883388338833884, + "grad_norm": 0.04198968783020973, + "learning_rate": 2.5509259145692843e-05, + "loss": 0.0361, + "num_input_tokens_seen": 20879648, + "step": 98930 + }, + { + "epoch": 10.883938393839383, + "grad_norm": 0.007832457311451435, + "learning_rate": 2.5506859573629356e-05, + "loss": 0.001, + "num_input_tokens_seen": 20880704, + "step": 98935 + }, + { + "epoch": 10.884488448844884, + "grad_norm": 0.9921509623527527, + "learning_rate": 2.5504459996894374e-05, + "loss": 0.0875, + "num_input_tokens_seen": 20881696, + "step": 98940 + }, + { + "epoch": 10.885038503850385, + "grad_norm": 0.00762570695951581, + "learning_rate": 2.5502060415510032e-05, + "loss": 0.0056, + "num_input_tokens_seen": 20882720, + "step": 98945 + }, + { + "epoch": 10.885588558855886, + "grad_norm": 0.7492311596870422, + "learning_rate": 2.549966082949845e-05, + "loss": 0.0822, + "num_input_tokens_seen": 20883744, + "step": 98950 + }, + { + "epoch": 10.886138613861386, + "grad_norm": 0.25867465138435364, + "learning_rate": 2.549726123888172e-05, + "loss": 0.0543, + "num_input_tokens_seen": 20884800, + "step": 98955 + }, + { + "epoch": 10.886688668866887, + "grad_norm": 0.01611722633242607, + "learning_rate": 2.549486164368198e-05, + "loss": 0.0345, + "num_input_tokens_seen": 20885888, + "step": 98960 + }, + { + "epoch": 10.887238723872388, + "grad_norm": 0.08844291418790817, + "learning_rate": 2.5492462043921335e-05, + "loss": 0.0053, + "num_input_tokens_seen": 20886944, + "step": 98965 + }, + { + "epoch": 10.887788778877887, + "grad_norm": 0.08579365909099579, + "learning_rate": 2.5490062439621898e-05, + "loss": 0.0132, + "num_input_tokens_seen": 20887968, + "step": 98970 + }, + { + "epoch": 10.888338833883388, + "grad_norm": 0.05580797418951988, + "learning_rate": 2.5487662830805804e-05, + "loss": 0.0541, + "num_input_tokens_seen": 20889056, + "step": 98975 + }, + { + "epoch": 10.88888888888889, + "grad_norm": 1.3413658142089844, + "learning_rate": 2.5485263217495147e-05, + "loss": 0.0259, + "num_input_tokens_seen": 20890048, + "step": 98980 + }, + { + "epoch": 10.88943894389439, + "grad_norm": 0.13047541677951813, + "learning_rate": 2.5482863599712047e-05, + "loss": 0.0976, + "num_input_tokens_seen": 20891104, + "step": 98985 + }, + { + "epoch": 10.88998899889989, + "grad_norm": 0.11207371205091476, + "learning_rate": 2.548046397747863e-05, + "loss": 0.0123, + "num_input_tokens_seen": 20892096, + "step": 98990 + }, + { + "epoch": 10.89053905390539, + "grad_norm": 0.03605537489056587, + "learning_rate": 2.547806435081701e-05, + "loss": 0.0081, + "num_input_tokens_seen": 20893184, + "step": 98995 + }, + { + "epoch": 10.891089108910892, + "grad_norm": 0.004170229658484459, + "learning_rate": 2.54756647197493e-05, + "loss": 0.0699, + "num_input_tokens_seen": 20894304, + "step": 99000 + }, + { + "epoch": 10.891639163916391, + "grad_norm": 0.06663151830434799, + "learning_rate": 2.5473265084297617e-05, + "loss": 0.0072, + "num_input_tokens_seen": 20895392, + "step": 99005 + }, + { + "epoch": 10.892189218921892, + "grad_norm": 0.008599597029387951, + "learning_rate": 2.547086544448407e-05, + "loss": 0.0058, + "num_input_tokens_seen": 20896448, + "step": 99010 + }, + { + "epoch": 10.892739273927393, + "grad_norm": 0.13046172261238098, + "learning_rate": 2.5468465800330782e-05, + "loss": 0.0772, + "num_input_tokens_seen": 20897504, + "step": 99015 + }, + { + "epoch": 10.893289328932893, + "grad_norm": 0.6575040221214294, + "learning_rate": 2.546606615185988e-05, + "loss": 0.0206, + "num_input_tokens_seen": 20898560, + "step": 99020 + }, + { + "epoch": 10.893839383938394, + "grad_norm": 0.006632419768720865, + "learning_rate": 2.5463666499093465e-05, + "loss": 0.0076, + "num_input_tokens_seen": 20899648, + "step": 99025 + }, + { + "epoch": 10.894389438943895, + "grad_norm": 0.07507479935884476, + "learning_rate": 2.5461266842053654e-05, + "loss": 0.0654, + "num_input_tokens_seen": 20900736, + "step": 99030 + }, + { + "epoch": 10.894939493949394, + "grad_norm": 0.004858235828578472, + "learning_rate": 2.5458867180762575e-05, + "loss": 0.0017, + "num_input_tokens_seen": 20901792, + "step": 99035 + }, + { + "epoch": 10.895489548954895, + "grad_norm": 0.02384124882519245, + "learning_rate": 2.545646751524233e-05, + "loss": 0.001, + "num_input_tokens_seen": 20902816, + "step": 99040 + }, + { + "epoch": 10.896039603960396, + "grad_norm": 0.3176831603050232, + "learning_rate": 2.545406784551505e-05, + "loss": 0.0362, + "num_input_tokens_seen": 20903904, + "step": 99045 + }, + { + "epoch": 10.896589658965897, + "grad_norm": 0.11680849641561508, + "learning_rate": 2.545166817160284e-05, + "loss": 0.1118, + "num_input_tokens_seen": 20904896, + "step": 99050 + }, + { + "epoch": 10.897139713971397, + "grad_norm": 2.541217803955078, + "learning_rate": 2.5449268493527815e-05, + "loss": 0.2475, + "num_input_tokens_seen": 20905952, + "step": 99055 + }, + { + "epoch": 10.897689768976898, + "grad_norm": 0.018976278603076935, + "learning_rate": 2.544686881131211e-05, + "loss": 0.0144, + "num_input_tokens_seen": 20906944, + "step": 99060 + }, + { + "epoch": 10.898239823982399, + "grad_norm": 0.020530199632048607, + "learning_rate": 2.5444469124977823e-05, + "loss": 0.1547, + "num_input_tokens_seen": 20908000, + "step": 99065 + }, + { + "epoch": 10.898789878987898, + "grad_norm": 0.49745211005210876, + "learning_rate": 2.544206943454708e-05, + "loss": 0.0723, + "num_input_tokens_seen": 20908992, + "step": 99070 + }, + { + "epoch": 10.8993399339934, + "grad_norm": 0.162349134683609, + "learning_rate": 2.5439669740041995e-05, + "loss": 0.1522, + "num_input_tokens_seen": 20910080, + "step": 99075 + }, + { + "epoch": 10.8998899889989, + "grad_norm": 1.3449240922927856, + "learning_rate": 2.5437270041484685e-05, + "loss": 0.199, + "num_input_tokens_seen": 20911104, + "step": 99080 + }, + { + "epoch": 10.9004400440044, + "grad_norm": 0.5211751461029053, + "learning_rate": 2.543487033889726e-05, + "loss": 0.0169, + "num_input_tokens_seen": 20912128, + "step": 99085 + }, + { + "epoch": 10.900990099009901, + "grad_norm": 0.5197975039482117, + "learning_rate": 2.5432470632301854e-05, + "loss": 0.0112, + "num_input_tokens_seen": 20913184, + "step": 99090 + }, + { + "epoch": 10.901540154015402, + "grad_norm": 0.13400395214557648, + "learning_rate": 2.5430070921720572e-05, + "loss": 0.0113, + "num_input_tokens_seen": 20914272, + "step": 99095 + }, + { + "epoch": 10.902090209020901, + "grad_norm": 0.008304065093398094, + "learning_rate": 2.5427671207175525e-05, + "loss": 0.0072, + "num_input_tokens_seen": 20915296, + "step": 99100 + }, + { + "epoch": 10.902640264026402, + "grad_norm": 0.92955482006073, + "learning_rate": 2.5425271488688845e-05, + "loss": 0.0598, + "num_input_tokens_seen": 20916320, + "step": 99105 + }, + { + "epoch": 10.903190319031903, + "grad_norm": 0.3447665870189667, + "learning_rate": 2.542287176628264e-05, + "loss": 0.0181, + "num_input_tokens_seen": 20917376, + "step": 99110 + }, + { + "epoch": 10.903740374037405, + "grad_norm": 0.024564115330576897, + "learning_rate": 2.5420472039979032e-05, + "loss": 0.0018, + "num_input_tokens_seen": 20918400, + "step": 99115 + }, + { + "epoch": 10.904290429042904, + "grad_norm": 0.012802042067050934, + "learning_rate": 2.541807230980013e-05, + "loss": 0.016, + "num_input_tokens_seen": 20919488, + "step": 99120 + }, + { + "epoch": 10.904840484048405, + "grad_norm": 0.03769766539335251, + "learning_rate": 2.5415672575768055e-05, + "loss": 0.0123, + "num_input_tokens_seen": 20920544, + "step": 99125 + }, + { + "epoch": 10.905390539053906, + "grad_norm": 0.3634762465953827, + "learning_rate": 2.5413272837904927e-05, + "loss": 0.007, + "num_input_tokens_seen": 20921600, + "step": 99130 + }, + { + "epoch": 10.905940594059405, + "grad_norm": 0.018438901752233505, + "learning_rate": 2.5410873096232864e-05, + "loss": 0.0459, + "num_input_tokens_seen": 20922688, + "step": 99135 + }, + { + "epoch": 10.906490649064907, + "grad_norm": 0.016044126823544502, + "learning_rate": 2.5408473350773977e-05, + "loss": 0.0011, + "num_input_tokens_seen": 20923744, + "step": 99140 + }, + { + "epoch": 10.907040704070408, + "grad_norm": 2.0665230751037598, + "learning_rate": 2.540607360155039e-05, + "loss": 0.0238, + "num_input_tokens_seen": 20924768, + "step": 99145 + }, + { + "epoch": 10.907590759075907, + "grad_norm": 0.011067820712924004, + "learning_rate": 2.540367384858422e-05, + "loss": 0.006, + "num_input_tokens_seen": 20925824, + "step": 99150 + }, + { + "epoch": 10.908140814081408, + "grad_norm": 0.06147463619709015, + "learning_rate": 2.5401274091897575e-05, + "loss": 0.0797, + "num_input_tokens_seen": 20926848, + "step": 99155 + }, + { + "epoch": 10.908690869086909, + "grad_norm": 0.01918545737862587, + "learning_rate": 2.539887433151258e-05, + "loss": 0.0027, + "num_input_tokens_seen": 20927904, + "step": 99160 + }, + { + "epoch": 10.909240924092408, + "grad_norm": 1.373171091079712, + "learning_rate": 2.539647456745136e-05, + "loss": 0.0791, + "num_input_tokens_seen": 20928960, + "step": 99165 + }, + { + "epoch": 10.90979097909791, + "grad_norm": 1.33022141456604, + "learning_rate": 2.5394074799736017e-05, + "loss": 0.0294, + "num_input_tokens_seen": 20930080, + "step": 99170 + }, + { + "epoch": 10.91034103410341, + "grad_norm": 0.1053592786192894, + "learning_rate": 2.539167502838868e-05, + "loss": 0.0239, + "num_input_tokens_seen": 20931136, + "step": 99175 + }, + { + "epoch": 10.910891089108912, + "grad_norm": 0.024079592898488045, + "learning_rate": 2.5389275253431454e-05, + "loss": 0.0082, + "num_input_tokens_seen": 20932160, + "step": 99180 + }, + { + "epoch": 10.911441144114411, + "grad_norm": 1.105118751525879, + "learning_rate": 2.5386875474886475e-05, + "loss": 0.0259, + "num_input_tokens_seen": 20933184, + "step": 99185 + }, + { + "epoch": 10.911991199119912, + "grad_norm": 0.13893252611160278, + "learning_rate": 2.5384475692775846e-05, + "loss": 0.04, + "num_input_tokens_seen": 20934240, + "step": 99190 + }, + { + "epoch": 10.912541254125413, + "grad_norm": 0.23327429592609406, + "learning_rate": 2.538207590712169e-05, + "loss": 0.0131, + "num_input_tokens_seen": 20935328, + "step": 99195 + }, + { + "epoch": 10.913091309130913, + "grad_norm": 0.014929704368114471, + "learning_rate": 2.5379676117946117e-05, + "loss": 0.0088, + "num_input_tokens_seen": 20936448, + "step": 99200 + }, + { + "epoch": 10.913641364136414, + "grad_norm": 0.04725992679595947, + "learning_rate": 2.537727632527126e-05, + "loss": 0.0555, + "num_input_tokens_seen": 20937504, + "step": 99205 + }, + { + "epoch": 10.914191419141915, + "grad_norm": 0.0441073440015316, + "learning_rate": 2.537487652911923e-05, + "loss": 0.0093, + "num_input_tokens_seen": 20938592, + "step": 99210 + }, + { + "epoch": 10.914741474147414, + "grad_norm": 0.08642619848251343, + "learning_rate": 2.5372476729512136e-05, + "loss": 0.0175, + "num_input_tokens_seen": 20939680, + "step": 99215 + }, + { + "epoch": 10.915291529152915, + "grad_norm": 0.023054257035255432, + "learning_rate": 2.5370076926472102e-05, + "loss": 0.0505, + "num_input_tokens_seen": 20940704, + "step": 99220 + }, + { + "epoch": 10.915841584158416, + "grad_norm": 0.10355124622583389, + "learning_rate": 2.536767712002125e-05, + "loss": 0.1136, + "num_input_tokens_seen": 20941760, + "step": 99225 + }, + { + "epoch": 10.916391639163916, + "grad_norm": 0.01579436846077442, + "learning_rate": 2.536527731018169e-05, + "loss": 0.0019, + "num_input_tokens_seen": 20942752, + "step": 99230 + }, + { + "epoch": 10.916941694169417, + "grad_norm": 0.850608766078949, + "learning_rate": 2.5362877496975553e-05, + "loss": 0.0712, + "num_input_tokens_seen": 20943776, + "step": 99235 + }, + { + "epoch": 10.917491749174918, + "grad_norm": 0.026614969596266747, + "learning_rate": 2.5360477680424944e-05, + "loss": 0.0024, + "num_input_tokens_seen": 20944832, + "step": 99240 + }, + { + "epoch": 10.918041804180419, + "grad_norm": 0.00596572132781148, + "learning_rate": 2.5358077860551984e-05, + "loss": 0.0065, + "num_input_tokens_seen": 20945824, + "step": 99245 + }, + { + "epoch": 10.918591859185918, + "grad_norm": 0.34490320086479187, + "learning_rate": 2.535567803737879e-05, + "loss": 0.0582, + "num_input_tokens_seen": 20946912, + "step": 99250 + }, + { + "epoch": 10.91914191419142, + "grad_norm": 0.0188581682741642, + "learning_rate": 2.5353278210927484e-05, + "loss": 0.0043, + "num_input_tokens_seen": 20947936, + "step": 99255 + }, + { + "epoch": 10.91969196919692, + "grad_norm": 0.07550982385873795, + "learning_rate": 2.5350878381220183e-05, + "loss": 0.018, + "num_input_tokens_seen": 20948928, + "step": 99260 + }, + { + "epoch": 10.92024202420242, + "grad_norm": 2.3820722103118896, + "learning_rate": 2.534847854827901e-05, + "loss": 0.069, + "num_input_tokens_seen": 20949984, + "step": 99265 + }, + { + "epoch": 10.92079207920792, + "grad_norm": 0.139739528298378, + "learning_rate": 2.5346078712126072e-05, + "loss": 0.0965, + "num_input_tokens_seen": 20951008, + "step": 99270 + }, + { + "epoch": 10.921342134213422, + "grad_norm": 0.020817793905735016, + "learning_rate": 2.5343678872783483e-05, + "loss": 0.0029, + "num_input_tokens_seen": 20952096, + "step": 99275 + }, + { + "epoch": 10.921892189218921, + "grad_norm": 0.03667067363858223, + "learning_rate": 2.5341279030273384e-05, + "loss": 0.0022, + "num_input_tokens_seen": 20953088, + "step": 99280 + }, + { + "epoch": 10.922442244224422, + "grad_norm": 0.07417856901884079, + "learning_rate": 2.533887918461788e-05, + "loss": 0.1041, + "num_input_tokens_seen": 20954144, + "step": 99285 + }, + { + "epoch": 10.922992299229923, + "grad_norm": 0.01814429648220539, + "learning_rate": 2.533647933583908e-05, + "loss": 0.0052, + "num_input_tokens_seen": 20955264, + "step": 99290 + }, + { + "epoch": 10.923542354235423, + "grad_norm": 0.17278587818145752, + "learning_rate": 2.5334079483959116e-05, + "loss": 0.0047, + "num_input_tokens_seen": 20956320, + "step": 99295 + }, + { + "epoch": 10.924092409240924, + "grad_norm": 0.061807770282030106, + "learning_rate": 2.5331679629000103e-05, + "loss": 0.0044, + "num_input_tokens_seen": 20957408, + "step": 99300 + }, + { + "epoch": 10.924642464246425, + "grad_norm": 0.04553522169589996, + "learning_rate": 2.532927977098416e-05, + "loss": 0.1039, + "num_input_tokens_seen": 20958432, + "step": 99305 + }, + { + "epoch": 10.925192519251926, + "grad_norm": 0.029501305893063545, + "learning_rate": 2.5326879909933403e-05, + "loss": 0.0072, + "num_input_tokens_seen": 20959520, + "step": 99310 + }, + { + "epoch": 10.925742574257425, + "grad_norm": 0.029789239168167114, + "learning_rate": 2.532448004586994e-05, + "loss": 0.0036, + "num_input_tokens_seen": 20960608, + "step": 99315 + }, + { + "epoch": 10.926292629262926, + "grad_norm": 0.07133320719003677, + "learning_rate": 2.532208017881591e-05, + "loss": 0.0665, + "num_input_tokens_seen": 20961664, + "step": 99320 + }, + { + "epoch": 10.926842684268427, + "grad_norm": 0.029617639258503914, + "learning_rate": 2.5319680308793426e-05, + "loss": 0.0099, + "num_input_tokens_seen": 20962688, + "step": 99325 + }, + { + "epoch": 10.927392739273927, + "grad_norm": 2.523090124130249, + "learning_rate": 2.5317280435824592e-05, + "loss": 0.2046, + "num_input_tokens_seen": 20963808, + "step": 99330 + }, + { + "epoch": 10.927942794279428, + "grad_norm": 0.037200018763542175, + "learning_rate": 2.5314880559931542e-05, + "loss": 0.0018, + "num_input_tokens_seen": 20964896, + "step": 99335 + }, + { + "epoch": 10.928492849284929, + "grad_norm": 2.3662493228912354, + "learning_rate": 2.5312480681136392e-05, + "loss": 0.1302, + "num_input_tokens_seen": 20965952, + "step": 99340 + }, + { + "epoch": 10.929042904290428, + "grad_norm": 0.19280670583248138, + "learning_rate": 2.5310080799461245e-05, + "loss": 0.0052, + "num_input_tokens_seen": 20967008, + "step": 99345 + }, + { + "epoch": 10.92959295929593, + "grad_norm": 0.9757252931594849, + "learning_rate": 2.530768091492824e-05, + "loss": 0.0517, + "num_input_tokens_seen": 20968064, + "step": 99350 + }, + { + "epoch": 10.93014301430143, + "grad_norm": 2.539937973022461, + "learning_rate": 2.5305281027559495e-05, + "loss": 0.0164, + "num_input_tokens_seen": 20969152, + "step": 99355 + }, + { + "epoch": 10.930693069306932, + "grad_norm": 0.011045240797102451, + "learning_rate": 2.5302881137377112e-05, + "loss": 0.0109, + "num_input_tokens_seen": 20970176, + "step": 99360 + }, + { + "epoch": 10.93124312431243, + "grad_norm": 0.019173989072442055, + "learning_rate": 2.530048124440322e-05, + "loss": 0.0622, + "num_input_tokens_seen": 20971200, + "step": 99365 + }, + { + "epoch": 10.931793179317932, + "grad_norm": 0.11321108788251877, + "learning_rate": 2.5298081348659936e-05, + "loss": 0.0058, + "num_input_tokens_seen": 20972224, + "step": 99370 + }, + { + "epoch": 10.932343234323433, + "grad_norm": 0.026974281296133995, + "learning_rate": 2.529568145016938e-05, + "loss": 0.0024, + "num_input_tokens_seen": 20973216, + "step": 99375 + }, + { + "epoch": 10.932893289328932, + "grad_norm": 0.1923810988664627, + "learning_rate": 2.5293281548953673e-05, + "loss": 0.0418, + "num_input_tokens_seen": 20974304, + "step": 99380 + }, + { + "epoch": 10.933443344334433, + "grad_norm": 2.12998628616333, + "learning_rate": 2.5290881645034932e-05, + "loss": 0.0478, + "num_input_tokens_seen": 20975328, + "step": 99385 + }, + { + "epoch": 10.933993399339935, + "grad_norm": 0.020778298377990723, + "learning_rate": 2.5288481738435265e-05, + "loss": 0.0149, + "num_input_tokens_seen": 20976352, + "step": 99390 + }, + { + "epoch": 10.934543454345434, + "grad_norm": 0.013913596980273724, + "learning_rate": 2.5286081829176805e-05, + "loss": 0.0193, + "num_input_tokens_seen": 20977472, + "step": 99395 + }, + { + "epoch": 10.935093509350935, + "grad_norm": 0.08921093493700027, + "learning_rate": 2.5283681917281677e-05, + "loss": 0.0164, + "num_input_tokens_seen": 20978496, + "step": 99400 + }, + { + "epoch": 10.935643564356436, + "grad_norm": 0.007510468363761902, + "learning_rate": 2.528128200277197e-05, + "loss": 0.0081, + "num_input_tokens_seen": 20979552, + "step": 99405 + }, + { + "epoch": 10.936193619361937, + "grad_norm": 0.005643836688250303, + "learning_rate": 2.527888208566983e-05, + "loss": 0.1379, + "num_input_tokens_seen": 20980576, + "step": 99410 + }, + { + "epoch": 10.936743674367436, + "grad_norm": 0.014585472643375397, + "learning_rate": 2.5276482165997372e-05, + "loss": 0.0582, + "num_input_tokens_seen": 20981664, + "step": 99415 + }, + { + "epoch": 10.937293729372938, + "grad_norm": 0.07813385128974915, + "learning_rate": 2.5274082243776702e-05, + "loss": 0.0562, + "num_input_tokens_seen": 20982752, + "step": 99420 + }, + { + "epoch": 10.937843784378439, + "grad_norm": 0.0431382954120636, + "learning_rate": 2.527168231902996e-05, + "loss": 0.0309, + "num_input_tokens_seen": 20983712, + "step": 99425 + }, + { + "epoch": 10.938393839383938, + "grad_norm": 0.04951354116201401, + "learning_rate": 2.5269282391779243e-05, + "loss": 0.0058, + "num_input_tokens_seen": 20984768, + "step": 99430 + }, + { + "epoch": 10.938943894389439, + "grad_norm": 0.23634284734725952, + "learning_rate": 2.5266882462046675e-05, + "loss": 0.0167, + "num_input_tokens_seen": 20985792, + "step": 99435 + }, + { + "epoch": 10.93949394939494, + "grad_norm": 0.20609962940216064, + "learning_rate": 2.526448252985439e-05, + "loss": 0.0061, + "num_input_tokens_seen": 20986784, + "step": 99440 + }, + { + "epoch": 10.94004400440044, + "grad_norm": 1.7832274436950684, + "learning_rate": 2.526208259522449e-05, + "loss": 0.0275, + "num_input_tokens_seen": 20987872, + "step": 99445 + }, + { + "epoch": 10.94059405940594, + "grad_norm": 0.08619704097509384, + "learning_rate": 2.5259682658179107e-05, + "loss": 0.0132, + "num_input_tokens_seen": 20988928, + "step": 99450 + }, + { + "epoch": 10.941144114411442, + "grad_norm": 0.04777940735220909, + "learning_rate": 2.5257282718740348e-05, + "loss": 0.0431, + "num_input_tokens_seen": 20989952, + "step": 99455 + }, + { + "epoch": 10.941694169416941, + "grad_norm": 3.079954147338867, + "learning_rate": 2.5254882776930337e-05, + "loss": 0.0141, + "num_input_tokens_seen": 20990944, + "step": 99460 + }, + { + "epoch": 10.942244224422442, + "grad_norm": 0.02360200695693493, + "learning_rate": 2.5252482832771196e-05, + "loss": 0.0613, + "num_input_tokens_seen": 20992064, + "step": 99465 + }, + { + "epoch": 10.942794279427943, + "grad_norm": 0.07108745723962784, + "learning_rate": 2.525008288628504e-05, + "loss": 0.0096, + "num_input_tokens_seen": 20993088, + "step": 99470 + }, + { + "epoch": 10.943344334433444, + "grad_norm": 1.2735953330993652, + "learning_rate": 2.5247682937493995e-05, + "loss": 0.0482, + "num_input_tokens_seen": 20994208, + "step": 99475 + }, + { + "epoch": 10.943894389438944, + "grad_norm": 0.062122225761413574, + "learning_rate": 2.5245282986420172e-05, + "loss": 0.0405, + "num_input_tokens_seen": 20995296, + "step": 99480 + }, + { + "epoch": 10.944444444444445, + "grad_norm": 0.0577467679977417, + "learning_rate": 2.5242883033085692e-05, + "loss": 0.0253, + "num_input_tokens_seen": 20996384, + "step": 99485 + }, + { + "epoch": 10.944994499449946, + "grad_norm": 0.03634076565504074, + "learning_rate": 2.5240483077512676e-05, + "loss": 0.1299, + "num_input_tokens_seen": 20997440, + "step": 99490 + }, + { + "epoch": 10.945544554455445, + "grad_norm": 0.04376833885908127, + "learning_rate": 2.5238083119723243e-05, + "loss": 0.003, + "num_input_tokens_seen": 20998592, + "step": 99495 + }, + { + "epoch": 10.946094609460946, + "grad_norm": 1.1395066976547241, + "learning_rate": 2.5235683159739514e-05, + "loss": 0.0556, + "num_input_tokens_seen": 20999584, + "step": 99500 + }, + { + "epoch": 10.946644664466447, + "grad_norm": 0.022058486938476562, + "learning_rate": 2.5233283197583603e-05, + "loss": 0.0076, + "num_input_tokens_seen": 21000608, + "step": 99505 + }, + { + "epoch": 10.947194719471947, + "grad_norm": 1.7783342599868774, + "learning_rate": 2.5230883233277637e-05, + "loss": 0.0475, + "num_input_tokens_seen": 21001696, + "step": 99510 + }, + { + "epoch": 10.947744774477448, + "grad_norm": 0.3375149667263031, + "learning_rate": 2.5228483266843728e-05, + "loss": 0.0223, + "num_input_tokens_seen": 21002816, + "step": 99515 + }, + { + "epoch": 10.948294829482949, + "grad_norm": 0.6363317370414734, + "learning_rate": 2.5226083298304e-05, + "loss": 0.0279, + "num_input_tokens_seen": 21003872, + "step": 99520 + }, + { + "epoch": 10.948844884488448, + "grad_norm": 0.018884671851992607, + "learning_rate": 2.5223683327680568e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21004896, + "step": 99525 + }, + { + "epoch": 10.94939493949395, + "grad_norm": 0.14851514995098114, + "learning_rate": 2.5221283354995556e-05, + "loss": 0.1239, + "num_input_tokens_seen": 21005952, + "step": 99530 + }, + { + "epoch": 10.94994499449945, + "grad_norm": 0.13650816679000854, + "learning_rate": 2.521888338027108e-05, + "loss": 0.0354, + "num_input_tokens_seen": 21007008, + "step": 99535 + }, + { + "epoch": 10.950495049504951, + "grad_norm": 0.4560195803642273, + "learning_rate": 2.5216483403529263e-05, + "loss": 0.01, + "num_input_tokens_seen": 21008032, + "step": 99540 + }, + { + "epoch": 10.95104510451045, + "grad_norm": 1.5607675313949585, + "learning_rate": 2.5214083424792227e-05, + "loss": 0.0782, + "num_input_tokens_seen": 21009056, + "step": 99545 + }, + { + "epoch": 10.951595159515952, + "grad_norm": 3.203988552093506, + "learning_rate": 2.521168344408208e-05, + "loss": 0.0682, + "num_input_tokens_seen": 21010048, + "step": 99550 + }, + { + "epoch": 10.952145214521453, + "grad_norm": 0.01125631295144558, + "learning_rate": 2.5209283461420947e-05, + "loss": 0.0016, + "num_input_tokens_seen": 21011072, + "step": 99555 + }, + { + "epoch": 10.952695269526952, + "grad_norm": 0.037776682525873184, + "learning_rate": 2.520688347683095e-05, + "loss": 0.0141, + "num_input_tokens_seen": 21012096, + "step": 99560 + }, + { + "epoch": 10.953245324532453, + "grad_norm": 0.12621469795703888, + "learning_rate": 2.5204483490334206e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21013184, + "step": 99565 + }, + { + "epoch": 10.953795379537954, + "grad_norm": 0.49561989307403564, + "learning_rate": 2.5202083501952843e-05, + "loss": 0.0142, + "num_input_tokens_seen": 21014208, + "step": 99570 + }, + { + "epoch": 10.954345434543454, + "grad_norm": 0.2700421214103699, + "learning_rate": 2.519968351170897e-05, + "loss": 0.0094, + "num_input_tokens_seen": 21015264, + "step": 99575 + }, + { + "epoch": 10.954895489548955, + "grad_norm": 0.00909085851162672, + "learning_rate": 2.5197283519624705e-05, + "loss": 0.0122, + "num_input_tokens_seen": 21016288, + "step": 99580 + }, + { + "epoch": 10.955445544554456, + "grad_norm": 1.4996998310089111, + "learning_rate": 2.5194883525722178e-05, + "loss": 0.0211, + "num_input_tokens_seen": 21017280, + "step": 99585 + }, + { + "epoch": 10.955995599559955, + "grad_norm": 0.20049414038658142, + "learning_rate": 2.5192483530023504e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21018272, + "step": 99590 + }, + { + "epoch": 10.956545654565456, + "grad_norm": 0.23280830681324005, + "learning_rate": 2.51900835325508e-05, + "loss": 0.0575, + "num_input_tokens_seen": 21019232, + "step": 99595 + }, + { + "epoch": 10.957095709570957, + "grad_norm": 0.09962508082389832, + "learning_rate": 2.5187683533326184e-05, + "loss": 0.0061, + "num_input_tokens_seen": 21020288, + "step": 99600 + }, + { + "epoch": 10.957645764576458, + "grad_norm": 0.08647511899471283, + "learning_rate": 2.5185283532371782e-05, + "loss": 0.0128, + "num_input_tokens_seen": 21021344, + "step": 99605 + }, + { + "epoch": 10.958195819581958, + "grad_norm": 1.8589370250701904, + "learning_rate": 2.5182883529709705e-05, + "loss": 0.1033, + "num_input_tokens_seen": 21022400, + "step": 99610 + }, + { + "epoch": 10.958745874587459, + "grad_norm": 0.027058416977524757, + "learning_rate": 2.5180483525362093e-05, + "loss": 0.095, + "num_input_tokens_seen": 21023424, + "step": 99615 + }, + { + "epoch": 10.95929592959296, + "grad_norm": 0.9312030673027039, + "learning_rate": 2.5178083519351037e-05, + "loss": 0.0105, + "num_input_tokens_seen": 21024480, + "step": 99620 + }, + { + "epoch": 10.95984598459846, + "grad_norm": 0.044464610517024994, + "learning_rate": 2.5175683511698672e-05, + "loss": 0.0071, + "num_input_tokens_seen": 21025568, + "step": 99625 + }, + { + "epoch": 10.96039603960396, + "grad_norm": 0.1824573129415512, + "learning_rate": 2.517328350242712e-05, + "loss": 0.0054, + "num_input_tokens_seen": 21026656, + "step": 99630 + }, + { + "epoch": 10.960946094609461, + "grad_norm": 0.014890510588884354, + "learning_rate": 2.51708834915585e-05, + "loss": 0.0112, + "num_input_tokens_seen": 21027680, + "step": 99635 + }, + { + "epoch": 10.96149614961496, + "grad_norm": 0.058613959699869156, + "learning_rate": 2.5168483479114925e-05, + "loss": 0.0023, + "num_input_tokens_seen": 21028768, + "step": 99640 + }, + { + "epoch": 10.962046204620462, + "grad_norm": 0.11947010457515717, + "learning_rate": 2.5166083465118517e-05, + "loss": 0.0885, + "num_input_tokens_seen": 21029792, + "step": 99645 + }, + { + "epoch": 10.962596259625963, + "grad_norm": 0.1751352995634079, + "learning_rate": 2.5163683449591398e-05, + "loss": 0.0035, + "num_input_tokens_seen": 21030848, + "step": 99650 + }, + { + "epoch": 10.963146314631462, + "grad_norm": 0.004799343645572662, + "learning_rate": 2.5161283432555687e-05, + "loss": 0.0497, + "num_input_tokens_seen": 21031904, + "step": 99655 + }, + { + "epoch": 10.963696369636963, + "grad_norm": 0.017879247665405273, + "learning_rate": 2.5158883414033507e-05, + "loss": 0.0925, + "num_input_tokens_seen": 21032896, + "step": 99660 + }, + { + "epoch": 10.964246424642464, + "grad_norm": 1.097191333770752, + "learning_rate": 2.515648339404697e-05, + "loss": 0.0164, + "num_input_tokens_seen": 21033952, + "step": 99665 + }, + { + "epoch": 10.964796479647966, + "grad_norm": 0.6034514904022217, + "learning_rate": 2.515408337261821e-05, + "loss": 0.0078, + "num_input_tokens_seen": 21034976, + "step": 99670 + }, + { + "epoch": 10.965346534653465, + "grad_norm": 0.3035563826560974, + "learning_rate": 2.515168334976933e-05, + "loss": 0.0688, + "num_input_tokens_seen": 21036000, + "step": 99675 + }, + { + "epoch": 10.965896589658966, + "grad_norm": 2.134803533554077, + "learning_rate": 2.5149283325522456e-05, + "loss": 0.081, + "num_input_tokens_seen": 21037024, + "step": 99680 + }, + { + "epoch": 10.966446644664467, + "grad_norm": 0.030555661767721176, + "learning_rate": 2.5146883299899714e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21038144, + "step": 99685 + }, + { + "epoch": 10.966996699669966, + "grad_norm": 0.005008812993764877, + "learning_rate": 2.514448327292322e-05, + "loss": 0.0077, + "num_input_tokens_seen": 21039168, + "step": 99690 + }, + { + "epoch": 10.967546754675467, + "grad_norm": 0.33736133575439453, + "learning_rate": 2.5142083244615083e-05, + "loss": 0.0191, + "num_input_tokens_seen": 21040224, + "step": 99695 + }, + { + "epoch": 10.968096809680969, + "grad_norm": 2.821857452392578, + "learning_rate": 2.513968321499744e-05, + "loss": 0.0343, + "num_input_tokens_seen": 21041248, + "step": 99700 + }, + { + "epoch": 10.968646864686468, + "grad_norm": 0.8529667258262634, + "learning_rate": 2.5137283184092404e-05, + "loss": 0.029, + "num_input_tokens_seen": 21042400, + "step": 99705 + }, + { + "epoch": 10.969196919691969, + "grad_norm": 0.03419654816389084, + "learning_rate": 2.5134883151922094e-05, + "loss": 0.0738, + "num_input_tokens_seen": 21043424, + "step": 99710 + }, + { + "epoch": 10.96974697469747, + "grad_norm": 0.017614087089896202, + "learning_rate": 2.513248311850863e-05, + "loss": 0.1163, + "num_input_tokens_seen": 21044512, + "step": 99715 + }, + { + "epoch": 10.97029702970297, + "grad_norm": 0.041527777910232544, + "learning_rate": 2.513008308387414e-05, + "loss": 0.0023, + "num_input_tokens_seen": 21045536, + "step": 99720 + }, + { + "epoch": 10.97084708470847, + "grad_norm": 0.7620528340339661, + "learning_rate": 2.5127683048040724e-05, + "loss": 0.0081, + "num_input_tokens_seen": 21046656, + "step": 99725 + }, + { + "epoch": 10.971397139713972, + "grad_norm": 0.010644149035215378, + "learning_rate": 2.512528301103052e-05, + "loss": 0.0024, + "num_input_tokens_seen": 21047712, + "step": 99730 + }, + { + "epoch": 10.971947194719473, + "grad_norm": 2.0601048469543457, + "learning_rate": 2.5122882972865652e-05, + "loss": 0.1048, + "num_input_tokens_seen": 21048800, + "step": 99735 + }, + { + "epoch": 10.972497249724972, + "grad_norm": 0.012216583825647831, + "learning_rate": 2.5120482933568217e-05, + "loss": 0.0407, + "num_input_tokens_seen": 21049824, + "step": 99740 + }, + { + "epoch": 10.973047304730473, + "grad_norm": 0.9265585541725159, + "learning_rate": 2.5118082893160356e-05, + "loss": 0.0219, + "num_input_tokens_seen": 21050848, + "step": 99745 + }, + { + "epoch": 10.973597359735974, + "grad_norm": 0.07917571812868118, + "learning_rate": 2.511568285166418e-05, + "loss": 0.0828, + "num_input_tokens_seen": 21051872, + "step": 99750 + }, + { + "epoch": 10.974147414741473, + "grad_norm": 0.46037617325782776, + "learning_rate": 2.5113282809101805e-05, + "loss": 0.0189, + "num_input_tokens_seen": 21052960, + "step": 99755 + }, + { + "epoch": 10.974697469746975, + "grad_norm": 0.016546133905649185, + "learning_rate": 2.5110882765495368e-05, + "loss": 0.0353, + "num_input_tokens_seen": 21054048, + "step": 99760 + }, + { + "epoch": 10.975247524752476, + "grad_norm": 0.015105748549103737, + "learning_rate": 2.5108482720866972e-05, + "loss": 0.0202, + "num_input_tokens_seen": 21055136, + "step": 99765 + }, + { + "epoch": 10.975797579757975, + "grad_norm": 0.992378830909729, + "learning_rate": 2.510608267523874e-05, + "loss": 0.0683, + "num_input_tokens_seen": 21056224, + "step": 99770 + }, + { + "epoch": 10.976347634763476, + "grad_norm": 0.026945868507027626, + "learning_rate": 2.5103682628632795e-05, + "loss": 0.0039, + "num_input_tokens_seen": 21057248, + "step": 99775 + }, + { + "epoch": 10.976897689768977, + "grad_norm": 0.015314068645238876, + "learning_rate": 2.510128258107126e-05, + "loss": 0.0035, + "num_input_tokens_seen": 21058368, + "step": 99780 + }, + { + "epoch": 10.977447744774478, + "grad_norm": 0.07873202115297318, + "learning_rate": 2.509888253257625e-05, + "loss": 0.0141, + "num_input_tokens_seen": 21059360, + "step": 99785 + }, + { + "epoch": 10.977997799779978, + "grad_norm": 0.0052884542383253574, + "learning_rate": 2.509648248316989e-05, + "loss": 0.003, + "num_input_tokens_seen": 21060448, + "step": 99790 + }, + { + "epoch": 10.978547854785479, + "grad_norm": 0.02172895148396492, + "learning_rate": 2.50940824328743e-05, + "loss": 0.004, + "num_input_tokens_seen": 21061504, + "step": 99795 + }, + { + "epoch": 10.97909790979098, + "grad_norm": 2.5357413291931152, + "learning_rate": 2.509168238171159e-05, + "loss": 0.0478, + "num_input_tokens_seen": 21062560, + "step": 99800 + }, + { + "epoch": 10.979647964796479, + "grad_norm": 0.1758536696434021, + "learning_rate": 2.5089282329703896e-05, + "loss": 0.0349, + "num_input_tokens_seen": 21063680, + "step": 99805 + }, + { + "epoch": 10.98019801980198, + "grad_norm": 0.5978518128395081, + "learning_rate": 2.5086882276873318e-05, + "loss": 0.0057, + "num_input_tokens_seen": 21064736, + "step": 99810 + }, + { + "epoch": 10.980748074807481, + "grad_norm": 0.028771713376045227, + "learning_rate": 2.5084482223241995e-05, + "loss": 0.0047, + "num_input_tokens_seen": 21065824, + "step": 99815 + }, + { + "epoch": 10.98129812981298, + "grad_norm": 5.236015319824219, + "learning_rate": 2.508208216883204e-05, + "loss": 0.0204, + "num_input_tokens_seen": 21066816, + "step": 99820 + }, + { + "epoch": 10.981848184818482, + "grad_norm": 0.13850151002407074, + "learning_rate": 2.507968211366557e-05, + "loss": 0.0627, + "num_input_tokens_seen": 21067840, + "step": 99825 + }, + { + "epoch": 10.982398239823983, + "grad_norm": 0.3056543171405792, + "learning_rate": 2.507728205776471e-05, + "loss": 0.0066, + "num_input_tokens_seen": 21068928, + "step": 99830 + }, + { + "epoch": 10.982948294829484, + "grad_norm": 0.009244889952242374, + "learning_rate": 2.5074882001151586e-05, + "loss": 0.006, + "num_input_tokens_seen": 21069952, + "step": 99835 + }, + { + "epoch": 10.983498349834983, + "grad_norm": 0.08338257670402527, + "learning_rate": 2.507248194384829e-05, + "loss": 0.0141, + "num_input_tokens_seen": 21070976, + "step": 99840 + }, + { + "epoch": 10.984048404840484, + "grad_norm": 1.5182793140411377, + "learning_rate": 2.5070081885876977e-05, + "loss": 0.0438, + "num_input_tokens_seen": 21072096, + "step": 99845 + }, + { + "epoch": 10.984598459845985, + "grad_norm": 0.003984531853348017, + "learning_rate": 2.5067681827259754e-05, + "loss": 0.0108, + "num_input_tokens_seen": 21073184, + "step": 99850 + }, + { + "epoch": 10.985148514851485, + "grad_norm": 0.41900795698165894, + "learning_rate": 2.506528176801874e-05, + "loss": 0.0216, + "num_input_tokens_seen": 21074208, + "step": 99855 + }, + { + "epoch": 10.985698569856986, + "grad_norm": 0.06381724029779434, + "learning_rate": 2.5062881708176056e-05, + "loss": 0.0344, + "num_input_tokens_seen": 21075264, + "step": 99860 + }, + { + "epoch": 10.986248624862487, + "grad_norm": 0.38426831364631653, + "learning_rate": 2.506048164775382e-05, + "loss": 0.0582, + "num_input_tokens_seen": 21076352, + "step": 99865 + }, + { + "epoch": 10.986798679867986, + "grad_norm": 0.07168857753276825, + "learning_rate": 2.5058081586774147e-05, + "loss": 0.0243, + "num_input_tokens_seen": 21077408, + "step": 99870 + }, + { + "epoch": 10.987348734873487, + "grad_norm": 0.0018984898924827576, + "learning_rate": 2.5055681525259174e-05, + "loss": 0.0041, + "num_input_tokens_seen": 21078464, + "step": 99875 + }, + { + "epoch": 10.987898789878988, + "grad_norm": 0.558840811252594, + "learning_rate": 2.5053281463231003e-05, + "loss": 0.0521, + "num_input_tokens_seen": 21079584, + "step": 99880 + }, + { + "epoch": 10.988448844884488, + "grad_norm": 1.7946207523345947, + "learning_rate": 2.505088140071176e-05, + "loss": 0.0607, + "num_input_tokens_seen": 21080672, + "step": 99885 + }, + { + "epoch": 10.988998899889989, + "grad_norm": 0.11093195527791977, + "learning_rate": 2.504848133772358e-05, + "loss": 0.1301, + "num_input_tokens_seen": 21081824, + "step": 99890 + }, + { + "epoch": 10.98954895489549, + "grad_norm": 0.0075473361648619175, + "learning_rate": 2.5046081274288562e-05, + "loss": 0.0074, + "num_input_tokens_seen": 21082848, + "step": 99895 + }, + { + "epoch": 10.990099009900991, + "grad_norm": 0.011464381590485573, + "learning_rate": 2.504368121042884e-05, + "loss": 0.0758, + "num_input_tokens_seen": 21083904, + "step": 99900 + }, + { + "epoch": 10.99064906490649, + "grad_norm": 0.04768671095371246, + "learning_rate": 2.5041281146166524e-05, + "loss": 0.0048, + "num_input_tokens_seen": 21084928, + "step": 99905 + }, + { + "epoch": 10.991199119911991, + "grad_norm": 0.42803487181663513, + "learning_rate": 2.503888108152374e-05, + "loss": 0.0565, + "num_input_tokens_seen": 21086016, + "step": 99910 + }, + { + "epoch": 10.991749174917492, + "grad_norm": 0.12302716821432114, + "learning_rate": 2.5036481016522605e-05, + "loss": 0.0286, + "num_input_tokens_seen": 21087040, + "step": 99915 + }, + { + "epoch": 10.992299229922992, + "grad_norm": 0.029856812208890915, + "learning_rate": 2.5034080951185247e-05, + "loss": 0.0041, + "num_input_tokens_seen": 21088064, + "step": 99920 + }, + { + "epoch": 10.992849284928493, + "grad_norm": 0.6050194501876831, + "learning_rate": 2.503168088553378e-05, + "loss": 0.0075, + "num_input_tokens_seen": 21089152, + "step": 99925 + }, + { + "epoch": 10.993399339933994, + "grad_norm": 0.013201837427914143, + "learning_rate": 2.502928081959033e-05, + "loss": 0.0158, + "num_input_tokens_seen": 21090240, + "step": 99930 + }, + { + "epoch": 10.993949394939493, + "grad_norm": 0.01250533852726221, + "learning_rate": 2.5026880753377007e-05, + "loss": 0.0682, + "num_input_tokens_seen": 21091296, + "step": 99935 + }, + { + "epoch": 10.994499449944994, + "grad_norm": 0.030696751549839973, + "learning_rate": 2.502448068691594e-05, + "loss": 0.0355, + "num_input_tokens_seen": 21092416, + "step": 99940 + }, + { + "epoch": 10.995049504950495, + "grad_norm": 0.024835681542754173, + "learning_rate": 2.5022080620229238e-05, + "loss": 0.009, + "num_input_tokens_seen": 21093472, + "step": 99945 + }, + { + "epoch": 10.995599559955995, + "grad_norm": 0.015276067890226841, + "learning_rate": 2.501968055333904e-05, + "loss": 0.003, + "num_input_tokens_seen": 21094432, + "step": 99950 + }, + { + "epoch": 10.996149614961496, + "grad_norm": 0.024125879630446434, + "learning_rate": 2.5017280486267453e-05, + "loss": 0.003, + "num_input_tokens_seen": 21095552, + "step": 99955 + }, + { + "epoch": 10.996699669966997, + "grad_norm": 1.2250703573226929, + "learning_rate": 2.5014880419036596e-05, + "loss": 0.0114, + "num_input_tokens_seen": 21096576, + "step": 99960 + }, + { + "epoch": 10.997249724972498, + "grad_norm": 0.017117168754339218, + "learning_rate": 2.5012480351668598e-05, + "loss": 0.0012, + "num_input_tokens_seen": 21097696, + "step": 99965 + }, + { + "epoch": 10.997799779977997, + "grad_norm": 0.007533451076596975, + "learning_rate": 2.5010080284185576e-05, + "loss": 0.0016, + "num_input_tokens_seen": 21098784, + "step": 99970 + }, + { + "epoch": 10.998349834983498, + "grad_norm": 0.033840686082839966, + "learning_rate": 2.5007680216609648e-05, + "loss": 0.0035, + "num_input_tokens_seen": 21099840, + "step": 99975 + }, + { + "epoch": 10.998899889989, + "grad_norm": 0.024028094485402107, + "learning_rate": 2.5005280148962936e-05, + "loss": 0.0158, + "num_input_tokens_seen": 21100864, + "step": 99980 + }, + { + "epoch": 10.999449944994499, + "grad_norm": 0.561249315738678, + "learning_rate": 2.500288008126755e-05, + "loss": 0.0078, + "num_input_tokens_seen": 21101952, + "step": 99985 + }, + { + "epoch": 11.0, + "grad_norm": 0.003457447048276663, + "learning_rate": 2.5000480013545623e-05, + "loss": 0.0033, + "num_input_tokens_seen": 21102912, + "step": 99990 + }, + { + "epoch": 11.0, + "eval_loss": 0.068524569272995, + "eval_runtime": 36.9746, + "eval_samples_per_second": 109.264, + "eval_steps_per_second": 27.316, + "num_input_tokens_seen": 21102912, + "step": 99990 + }, + { + "epoch": 11.000550055005501, + "grad_norm": 1.205944299697876, + "learning_rate": 2.499807994581928e-05, + "loss": 0.0948, + "num_input_tokens_seen": 21103936, + "step": 99995 + }, + { + "epoch": 11.001100110011, + "grad_norm": 0.4407407343387604, + "learning_rate": 2.4995679878110625e-05, + "loss": 0.0196, + "num_input_tokens_seen": 21105024, + "step": 100000 + }, + { + "epoch": 11.001650165016502, + "grad_norm": 0.007101535797119141, + "learning_rate": 2.4993279810441787e-05, + "loss": 0.0036, + "num_input_tokens_seen": 21106016, + "step": 100005 + }, + { + "epoch": 11.002200220022003, + "grad_norm": 0.7360825538635254, + "learning_rate": 2.4990879742834895e-05, + "loss": 0.0193, + "num_input_tokens_seen": 21107136, + "step": 100010 + }, + { + "epoch": 11.002750275027502, + "grad_norm": 0.02573988027870655, + "learning_rate": 2.4988479675312053e-05, + "loss": 0.0019, + "num_input_tokens_seen": 21108192, + "step": 100015 + }, + { + "epoch": 11.003300330033003, + "grad_norm": 0.013613750226795673, + "learning_rate": 2.4986079607895384e-05, + "loss": 0.0471, + "num_input_tokens_seen": 21109248, + "step": 100020 + }, + { + "epoch": 11.003850385038504, + "grad_norm": 2.5374507904052734, + "learning_rate": 2.4983679540607024e-05, + "loss": 0.0908, + "num_input_tokens_seen": 21110368, + "step": 100025 + }, + { + "epoch": 11.004400440044005, + "grad_norm": 0.07534302771091461, + "learning_rate": 2.4981279473469076e-05, + "loss": 0.0608, + "num_input_tokens_seen": 21111392, + "step": 100030 + }, + { + "epoch": 11.004950495049505, + "grad_norm": 0.9538525938987732, + "learning_rate": 2.497887940650366e-05, + "loss": 0.0836, + "num_input_tokens_seen": 21112512, + "step": 100035 + }, + { + "epoch": 11.005500550055006, + "grad_norm": 0.06350959837436676, + "learning_rate": 2.4976479339732914e-05, + "loss": 0.0231, + "num_input_tokens_seen": 21113632, + "step": 100040 + }, + { + "epoch": 11.006050605060507, + "grad_norm": 0.003821704536676407, + "learning_rate": 2.4974079273178928e-05, + "loss": 0.0012, + "num_input_tokens_seen": 21114720, + "step": 100045 + }, + { + "epoch": 11.006600660066006, + "grad_norm": 0.055063262581825256, + "learning_rate": 2.4971679206863863e-05, + "loss": 0.0088, + "num_input_tokens_seen": 21115776, + "step": 100050 + }, + { + "epoch": 11.007150715071507, + "grad_norm": 0.4859248399734497, + "learning_rate": 2.4969279140809806e-05, + "loss": 0.0633, + "num_input_tokens_seen": 21116864, + "step": 100055 + }, + { + "epoch": 11.007700770077008, + "grad_norm": 0.7162113785743713, + "learning_rate": 2.496687907503889e-05, + "loss": 0.0883, + "num_input_tokens_seen": 21117952, + "step": 100060 + }, + { + "epoch": 11.008250825082508, + "grad_norm": 0.21229322254657745, + "learning_rate": 2.4964479009573235e-05, + "loss": 0.01, + "num_input_tokens_seen": 21119008, + "step": 100065 + }, + { + "epoch": 11.008800880088009, + "grad_norm": 0.7782031893730164, + "learning_rate": 2.4962078944434956e-05, + "loss": 0.0135, + "num_input_tokens_seen": 21120032, + "step": 100070 + }, + { + "epoch": 11.00935093509351, + "grad_norm": 0.012247909791767597, + "learning_rate": 2.495967887964618e-05, + "loss": 0.0016, + "num_input_tokens_seen": 21121120, + "step": 100075 + }, + { + "epoch": 11.009900990099009, + "grad_norm": 0.17482948303222656, + "learning_rate": 2.495727881522903e-05, + "loss": 0.1091, + "num_input_tokens_seen": 21122176, + "step": 100080 + }, + { + "epoch": 11.01045104510451, + "grad_norm": 0.7945292592048645, + "learning_rate": 2.495487875120561e-05, + "loss": 0.057, + "num_input_tokens_seen": 21123168, + "step": 100085 + }, + { + "epoch": 11.011001100110011, + "grad_norm": 0.006155026610940695, + "learning_rate": 2.4952478687598055e-05, + "loss": 0.0116, + "num_input_tokens_seen": 21124192, + "step": 100090 + }, + { + "epoch": 11.011551155115512, + "grad_norm": 0.02160901390016079, + "learning_rate": 2.4950078624428477e-05, + "loss": 0.0126, + "num_input_tokens_seen": 21125280, + "step": 100095 + }, + { + "epoch": 11.012101210121012, + "grad_norm": 0.008085163310170174, + "learning_rate": 2.4947678561719008e-05, + "loss": 0.0021, + "num_input_tokens_seen": 21126336, + "step": 100100 + }, + { + "epoch": 11.012651265126513, + "grad_norm": 0.013133867643773556, + "learning_rate": 2.494527849949176e-05, + "loss": 0.0054, + "num_input_tokens_seen": 21127360, + "step": 100105 + }, + { + "epoch": 11.013201320132014, + "grad_norm": 0.030883703380823135, + "learning_rate": 2.4942878437768848e-05, + "loss": 0.0047, + "num_input_tokens_seen": 21128480, + "step": 100110 + }, + { + "epoch": 11.013751375137513, + "grad_norm": 0.05974820256233215, + "learning_rate": 2.4940478376572403e-05, + "loss": 0.0142, + "num_input_tokens_seen": 21129664, + "step": 100115 + }, + { + "epoch": 11.014301430143014, + "grad_norm": 0.013851677998900414, + "learning_rate": 2.4938078315924533e-05, + "loss": 0.0081, + "num_input_tokens_seen": 21130720, + "step": 100120 + }, + { + "epoch": 11.014851485148515, + "grad_norm": 2.032238721847534, + "learning_rate": 2.493567825584738e-05, + "loss": 0.0968, + "num_input_tokens_seen": 21131712, + "step": 100125 + }, + { + "epoch": 11.015401540154015, + "grad_norm": 0.11493542045354843, + "learning_rate": 2.493327819636304e-05, + "loss": 0.0123, + "num_input_tokens_seen": 21132736, + "step": 100130 + }, + { + "epoch": 11.015951595159516, + "grad_norm": 0.006831480655819178, + "learning_rate": 2.493087813749364e-05, + "loss": 0.0398, + "num_input_tokens_seen": 21133824, + "step": 100135 + }, + { + "epoch": 11.016501650165017, + "grad_norm": 1.6880038976669312, + "learning_rate": 2.4928478079261315e-05, + "loss": 0.0228, + "num_input_tokens_seen": 21134848, + "step": 100140 + }, + { + "epoch": 11.017051705170518, + "grad_norm": 2.4685823917388916, + "learning_rate": 2.4926078021688158e-05, + "loss": 0.0506, + "num_input_tokens_seen": 21135936, + "step": 100145 + }, + { + "epoch": 11.017601760176017, + "grad_norm": 0.007379329297691584, + "learning_rate": 2.4923677964796314e-05, + "loss": 0.0167, + "num_input_tokens_seen": 21137024, + "step": 100150 + }, + { + "epoch": 11.018151815181518, + "grad_norm": 1.9057493209838867, + "learning_rate": 2.4921277908607894e-05, + "loss": 0.0324, + "num_input_tokens_seen": 21138080, + "step": 100155 + }, + { + "epoch": 11.01870187018702, + "grad_norm": 0.05210435763001442, + "learning_rate": 2.4918877853145005e-05, + "loss": 0.002, + "num_input_tokens_seen": 21139136, + "step": 100160 + }, + { + "epoch": 11.019251925192519, + "grad_norm": 0.0030061937868595123, + "learning_rate": 2.4916477798429795e-05, + "loss": 0.0164, + "num_input_tokens_seen": 21140192, + "step": 100165 + }, + { + "epoch": 11.01980198019802, + "grad_norm": 0.019027262926101685, + "learning_rate": 2.491407774448436e-05, + "loss": 0.0848, + "num_input_tokens_seen": 21141184, + "step": 100170 + }, + { + "epoch": 11.020352035203521, + "grad_norm": 0.6371374726295471, + "learning_rate": 2.4911677691330834e-05, + "loss": 0.0103, + "num_input_tokens_seen": 21142208, + "step": 100175 + }, + { + "epoch": 11.02090209020902, + "grad_norm": 0.14101052284240723, + "learning_rate": 2.4909277638991338e-05, + "loss": 0.0139, + "num_input_tokens_seen": 21143232, + "step": 100180 + }, + { + "epoch": 11.021452145214521, + "grad_norm": 0.013082217425107956, + "learning_rate": 2.4906877587487975e-05, + "loss": 0.0049, + "num_input_tokens_seen": 21144288, + "step": 100185 + }, + { + "epoch": 11.022002200220022, + "grad_norm": 0.014265242964029312, + "learning_rate": 2.490447753684288e-05, + "loss": 0.0825, + "num_input_tokens_seen": 21145280, + "step": 100190 + }, + { + "epoch": 11.022552255225522, + "grad_norm": 0.007770895026624203, + "learning_rate": 2.490207748707817e-05, + "loss": 0.0479, + "num_input_tokens_seen": 21146272, + "step": 100195 + }, + { + "epoch": 11.023102310231023, + "grad_norm": 0.019001085311174393, + "learning_rate": 2.489967743821597e-05, + "loss": 0.2457, + "num_input_tokens_seen": 21147360, + "step": 100200 + }, + { + "epoch": 11.023652365236524, + "grad_norm": 0.005709749646484852, + "learning_rate": 2.4897277390278393e-05, + "loss": 0.0009, + "num_input_tokens_seen": 21148384, + "step": 100205 + }, + { + "epoch": 11.024202420242025, + "grad_norm": 0.0065873111598193645, + "learning_rate": 2.4894877343287554e-05, + "loss": 0.0053, + "num_input_tokens_seen": 21149408, + "step": 100210 + }, + { + "epoch": 11.024752475247524, + "grad_norm": 0.005769388750195503, + "learning_rate": 2.489247729726559e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21150528, + "step": 100215 + }, + { + "epoch": 11.025302530253025, + "grad_norm": 0.7372487187385559, + "learning_rate": 2.4890077252234605e-05, + "loss": 0.0115, + "num_input_tokens_seen": 21151552, + "step": 100220 + }, + { + "epoch": 11.025852585258527, + "grad_norm": 0.481285035610199, + "learning_rate": 2.4887677208216724e-05, + "loss": 0.0178, + "num_input_tokens_seen": 21152576, + "step": 100225 + }, + { + "epoch": 11.026402640264026, + "grad_norm": 1.2957042455673218, + "learning_rate": 2.488527716523407e-05, + "loss": 0.1176, + "num_input_tokens_seen": 21153600, + "step": 100230 + }, + { + "epoch": 11.026952695269527, + "grad_norm": 2.5247719287872314, + "learning_rate": 2.4882877123308755e-05, + "loss": 0.0385, + "num_input_tokens_seen": 21154688, + "step": 100235 + }, + { + "epoch": 11.027502750275028, + "grad_norm": 0.6086121797561646, + "learning_rate": 2.4880477082462924e-05, + "loss": 0.012, + "num_input_tokens_seen": 21155744, + "step": 100240 + }, + { + "epoch": 11.028052805280527, + "grad_norm": 2.1585114002227783, + "learning_rate": 2.4878077042718665e-05, + "loss": 0.1621, + "num_input_tokens_seen": 21156800, + "step": 100245 + }, + { + "epoch": 11.028602860286028, + "grad_norm": 0.046655673533678055, + "learning_rate": 2.4875677004098108e-05, + "loss": 0.0055, + "num_input_tokens_seen": 21157856, + "step": 100250 + }, + { + "epoch": 11.02915291529153, + "grad_norm": 0.055925168097019196, + "learning_rate": 2.4873276966623387e-05, + "loss": 0.0443, + "num_input_tokens_seen": 21158816, + "step": 100255 + }, + { + "epoch": 11.029702970297029, + "grad_norm": 0.4181758165359497, + "learning_rate": 2.48708769303166e-05, + "loss": 0.0183, + "num_input_tokens_seen": 21159872, + "step": 100260 + }, + { + "epoch": 11.03025302530253, + "grad_norm": 0.09733225405216217, + "learning_rate": 2.4868476895199884e-05, + "loss": 0.011, + "num_input_tokens_seen": 21160832, + "step": 100265 + }, + { + "epoch": 11.030803080308031, + "grad_norm": 1.3032405376434326, + "learning_rate": 2.4866076861295363e-05, + "loss": 0.016, + "num_input_tokens_seen": 21161952, + "step": 100270 + }, + { + "epoch": 11.031353135313532, + "grad_norm": 0.19916751980781555, + "learning_rate": 2.486367682862513e-05, + "loss": 0.005, + "num_input_tokens_seen": 21163040, + "step": 100275 + }, + { + "epoch": 11.031903190319031, + "grad_norm": 0.02765110693871975, + "learning_rate": 2.486127679721133e-05, + "loss": 0.028, + "num_input_tokens_seen": 21164128, + "step": 100280 + }, + { + "epoch": 11.032453245324533, + "grad_norm": 0.07777222990989685, + "learning_rate": 2.4858876767076074e-05, + "loss": 0.1056, + "num_input_tokens_seen": 21165184, + "step": 100285 + }, + { + "epoch": 11.033003300330034, + "grad_norm": 0.05300289765000343, + "learning_rate": 2.485647673824148e-05, + "loss": 0.0245, + "num_input_tokens_seen": 21166304, + "step": 100290 + }, + { + "epoch": 11.033553355335533, + "grad_norm": 0.04535328224301338, + "learning_rate": 2.4854076710729684e-05, + "loss": 0.0333, + "num_input_tokens_seen": 21167296, + "step": 100295 + }, + { + "epoch": 11.034103410341034, + "grad_norm": 0.46726614236831665, + "learning_rate": 2.485167668456278e-05, + "loss": 0.0446, + "num_input_tokens_seen": 21168416, + "step": 100300 + }, + { + "epoch": 11.034653465346535, + "grad_norm": 0.10991103947162628, + "learning_rate": 2.4849276659762904e-05, + "loss": 0.0353, + "num_input_tokens_seen": 21169408, + "step": 100305 + }, + { + "epoch": 11.035203520352034, + "grad_norm": 0.10913726687431335, + "learning_rate": 2.484687663635217e-05, + "loss": 0.0359, + "num_input_tokens_seen": 21170496, + "step": 100310 + }, + { + "epoch": 11.035753575357536, + "grad_norm": 0.04569767415523529, + "learning_rate": 2.484447661435271e-05, + "loss": 0.0062, + "num_input_tokens_seen": 21171648, + "step": 100315 + }, + { + "epoch": 11.036303630363037, + "grad_norm": 0.02215113677084446, + "learning_rate": 2.4842076593786632e-05, + "loss": 0.0243, + "num_input_tokens_seen": 21172800, + "step": 100320 + }, + { + "epoch": 11.036853685368538, + "grad_norm": 0.1201903447508812, + "learning_rate": 2.483967657467605e-05, + "loss": 0.0028, + "num_input_tokens_seen": 21173856, + "step": 100325 + }, + { + "epoch": 11.037403740374037, + "grad_norm": 0.1071600690484047, + "learning_rate": 2.48372765570431e-05, + "loss": 0.0022, + "num_input_tokens_seen": 21174848, + "step": 100330 + }, + { + "epoch": 11.037953795379538, + "grad_norm": 0.15550637245178223, + "learning_rate": 2.4834876540909886e-05, + "loss": 0.0061, + "num_input_tokens_seen": 21176000, + "step": 100335 + }, + { + "epoch": 11.03850385038504, + "grad_norm": 0.008556629531085491, + "learning_rate": 2.4832476526298546e-05, + "loss": 0.0303, + "num_input_tokens_seen": 21177120, + "step": 100340 + }, + { + "epoch": 11.039053905390539, + "grad_norm": 0.13939981162548065, + "learning_rate": 2.4830076513231184e-05, + "loss": 0.0211, + "num_input_tokens_seen": 21178208, + "step": 100345 + }, + { + "epoch": 11.03960396039604, + "grad_norm": 0.027368832379579544, + "learning_rate": 2.4827676501729918e-05, + "loss": 0.0036, + "num_input_tokens_seen": 21179232, + "step": 100350 + }, + { + "epoch": 11.04015401540154, + "grad_norm": 0.0164421945810318, + "learning_rate": 2.482527649181689e-05, + "loss": 0.0034, + "num_input_tokens_seen": 21180288, + "step": 100355 + }, + { + "epoch": 11.04070407040704, + "grad_norm": 0.007764286827296019, + "learning_rate": 2.4822876483514193e-05, + "loss": 0.0274, + "num_input_tokens_seen": 21181312, + "step": 100360 + }, + { + "epoch": 11.041254125412541, + "grad_norm": 0.09286992996931076, + "learning_rate": 2.4820476476843962e-05, + "loss": 0.0055, + "num_input_tokens_seen": 21182432, + "step": 100365 + }, + { + "epoch": 11.041804180418042, + "grad_norm": 0.20015043020248413, + "learning_rate": 2.4818076471828322e-05, + "loss": 0.005, + "num_input_tokens_seen": 21183456, + "step": 100370 + }, + { + "epoch": 11.042354235423542, + "grad_norm": 0.06452686339616776, + "learning_rate": 2.481567646848937e-05, + "loss": 0.0104, + "num_input_tokens_seen": 21184544, + "step": 100375 + }, + { + "epoch": 11.042904290429043, + "grad_norm": 0.19385218620300293, + "learning_rate": 2.4813276466849245e-05, + "loss": 0.0063, + "num_input_tokens_seen": 21185664, + "step": 100380 + }, + { + "epoch": 11.043454345434544, + "grad_norm": 0.011316763237118721, + "learning_rate": 2.4810876466930057e-05, + "loss": 0.0979, + "num_input_tokens_seen": 21186720, + "step": 100385 + }, + { + "epoch": 11.044004400440045, + "grad_norm": 0.010249330662190914, + "learning_rate": 2.480847646875394e-05, + "loss": 0.0036, + "num_input_tokens_seen": 21187776, + "step": 100390 + }, + { + "epoch": 11.044554455445544, + "grad_norm": 0.07925592362880707, + "learning_rate": 2.4806076472342997e-05, + "loss": 0.0027, + "num_input_tokens_seen": 21188832, + "step": 100395 + }, + { + "epoch": 11.045104510451045, + "grad_norm": 3.1961700916290283, + "learning_rate": 2.4803676477719354e-05, + "loss": 0.142, + "num_input_tokens_seen": 21189856, + "step": 100400 + }, + { + "epoch": 11.045654565456546, + "grad_norm": 1.2830986976623535, + "learning_rate": 2.4801276484905132e-05, + "loss": 0.1376, + "num_input_tokens_seen": 21190816, + "step": 100405 + }, + { + "epoch": 11.046204620462046, + "grad_norm": 0.005606233142316341, + "learning_rate": 2.4798876493922454e-05, + "loss": 0.0478, + "num_input_tokens_seen": 21191840, + "step": 100410 + }, + { + "epoch": 11.046754675467547, + "grad_norm": 0.08359140902757645, + "learning_rate": 2.4796476504793425e-05, + "loss": 0.0182, + "num_input_tokens_seen": 21192832, + "step": 100415 + }, + { + "epoch": 11.047304730473048, + "grad_norm": 1.0344231128692627, + "learning_rate": 2.4794076517540178e-05, + "loss": 0.0469, + "num_input_tokens_seen": 21193856, + "step": 100420 + }, + { + "epoch": 11.047854785478547, + "grad_norm": 2.406031608581543, + "learning_rate": 2.4791676532184827e-05, + "loss": 0.0267, + "num_input_tokens_seen": 21194912, + "step": 100425 + }, + { + "epoch": 11.048404840484048, + "grad_norm": 0.010438201017677784, + "learning_rate": 2.4789276548749503e-05, + "loss": 0.0639, + "num_input_tokens_seen": 21195936, + "step": 100430 + }, + { + "epoch": 11.04895489548955, + "grad_norm": 0.00853409618139267, + "learning_rate": 2.478687656725631e-05, + "loss": 0.137, + "num_input_tokens_seen": 21196960, + "step": 100435 + }, + { + "epoch": 11.049504950495049, + "grad_norm": 1.451829195022583, + "learning_rate": 2.4784476587727367e-05, + "loss": 0.1276, + "num_input_tokens_seen": 21198048, + "step": 100440 + }, + { + "epoch": 11.05005500550055, + "grad_norm": 0.024183155968785286, + "learning_rate": 2.478207661018481e-05, + "loss": 0.0098, + "num_input_tokens_seen": 21199104, + "step": 100445 + }, + { + "epoch": 11.05060506050605, + "grad_norm": 0.04092894494533539, + "learning_rate": 2.4779676634650742e-05, + "loss": 0.0485, + "num_input_tokens_seen": 21200192, + "step": 100450 + }, + { + "epoch": 11.051155115511552, + "grad_norm": 0.8786206841468811, + "learning_rate": 2.4777276661147293e-05, + "loss": 0.1227, + "num_input_tokens_seen": 21201312, + "step": 100455 + }, + { + "epoch": 11.051705170517051, + "grad_norm": 0.053348228335380554, + "learning_rate": 2.477487668969658e-05, + "loss": 0.0586, + "num_input_tokens_seen": 21202400, + "step": 100460 + }, + { + "epoch": 11.052255225522552, + "grad_norm": 0.06791827827692032, + "learning_rate": 2.4772476720320706e-05, + "loss": 0.0016, + "num_input_tokens_seen": 21203456, + "step": 100465 + }, + { + "epoch": 11.052805280528053, + "grad_norm": 0.032016362994909286, + "learning_rate": 2.477007675304182e-05, + "loss": 0.0056, + "num_input_tokens_seen": 21204512, + "step": 100470 + }, + { + "epoch": 11.053355335533553, + "grad_norm": 0.05229088291525841, + "learning_rate": 2.476767678788202e-05, + "loss": 0.0579, + "num_input_tokens_seen": 21205536, + "step": 100475 + }, + { + "epoch": 11.053905390539054, + "grad_norm": 0.2918733060359955, + "learning_rate": 2.476527682486343e-05, + "loss": 0.0091, + "num_input_tokens_seen": 21206656, + "step": 100480 + }, + { + "epoch": 11.054455445544555, + "grad_norm": 0.09139952808618546, + "learning_rate": 2.4762876864008183e-05, + "loss": 0.0077, + "num_input_tokens_seen": 21207776, + "step": 100485 + }, + { + "epoch": 11.055005500550054, + "grad_norm": 0.1720348745584488, + "learning_rate": 2.476047690533837e-05, + "loss": 0.0048, + "num_input_tokens_seen": 21208832, + "step": 100490 + }, + { + "epoch": 11.055555555555555, + "grad_norm": 0.0038095531053841114, + "learning_rate": 2.4758076948876137e-05, + "loss": 0.0164, + "num_input_tokens_seen": 21209920, + "step": 100495 + }, + { + "epoch": 11.056105610561056, + "grad_norm": 0.01700643077492714, + "learning_rate": 2.4755676994643585e-05, + "loss": 0.0516, + "num_input_tokens_seen": 21210976, + "step": 100500 + }, + { + "epoch": 11.056655665566556, + "grad_norm": 0.15767517685890198, + "learning_rate": 2.475327704266285e-05, + "loss": 0.0117, + "num_input_tokens_seen": 21212064, + "step": 100505 + }, + { + "epoch": 11.057205720572057, + "grad_norm": 1.3459694385528564, + "learning_rate": 2.4750877092956036e-05, + "loss": 0.0297, + "num_input_tokens_seen": 21213088, + "step": 100510 + }, + { + "epoch": 11.057755775577558, + "grad_norm": 0.006013383623212576, + "learning_rate": 2.4748477145545264e-05, + "loss": 0.0009, + "num_input_tokens_seen": 21214144, + "step": 100515 + }, + { + "epoch": 11.058305830583059, + "grad_norm": 0.13216571509838104, + "learning_rate": 2.4746077200452672e-05, + "loss": 0.0189, + "num_input_tokens_seen": 21215168, + "step": 100520 + }, + { + "epoch": 11.058855885588558, + "grad_norm": 0.005543303210288286, + "learning_rate": 2.4743677257700347e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21216256, + "step": 100525 + }, + { + "epoch": 11.05940594059406, + "grad_norm": 2.040134906768799, + "learning_rate": 2.4741277317310438e-05, + "loss": 0.1397, + "num_input_tokens_seen": 21217280, + "step": 100530 + }, + { + "epoch": 11.05995599559956, + "grad_norm": 0.03263535350561142, + "learning_rate": 2.473887737930505e-05, + "loss": 0.0052, + "num_input_tokens_seen": 21218336, + "step": 100535 + }, + { + "epoch": 11.06050605060506, + "grad_norm": 0.22644631564617157, + "learning_rate": 2.4736477443706297e-05, + "loss": 0.0209, + "num_input_tokens_seen": 21219424, + "step": 100540 + }, + { + "epoch": 11.061056105610561, + "grad_norm": 0.03265867382287979, + "learning_rate": 2.4734077510536317e-05, + "loss": 0.0084, + "num_input_tokens_seen": 21220448, + "step": 100545 + }, + { + "epoch": 11.061606160616062, + "grad_norm": 0.030761517584323883, + "learning_rate": 2.4731677579817203e-05, + "loss": 0.0882, + "num_input_tokens_seen": 21221472, + "step": 100550 + }, + { + "epoch": 11.062156215621561, + "grad_norm": 0.008013509213924408, + "learning_rate": 2.4729277651571093e-05, + "loss": 0.0255, + "num_input_tokens_seen": 21222432, + "step": 100555 + }, + { + "epoch": 11.062706270627062, + "grad_norm": 0.10748276859521866, + "learning_rate": 2.4726877725820112e-05, + "loss": 0.0131, + "num_input_tokens_seen": 21223488, + "step": 100560 + }, + { + "epoch": 11.063256325632564, + "grad_norm": 0.045975856482982635, + "learning_rate": 2.4724477802586354e-05, + "loss": 0.0514, + "num_input_tokens_seen": 21224544, + "step": 100565 + }, + { + "epoch": 11.063806380638065, + "grad_norm": 0.07285954058170319, + "learning_rate": 2.472207788189196e-05, + "loss": 0.0064, + "num_input_tokens_seen": 21225568, + "step": 100570 + }, + { + "epoch": 11.064356435643564, + "grad_norm": 0.0077080572955310345, + "learning_rate": 2.471967796375903e-05, + "loss": 0.0068, + "num_input_tokens_seen": 21226624, + "step": 100575 + }, + { + "epoch": 11.064906490649065, + "grad_norm": 0.9835706353187561, + "learning_rate": 2.4717278048209708e-05, + "loss": 0.0327, + "num_input_tokens_seen": 21227680, + "step": 100580 + }, + { + "epoch": 11.065456545654566, + "grad_norm": 0.04521699622273445, + "learning_rate": 2.4714878135266092e-05, + "loss": 0.0193, + "num_input_tokens_seen": 21228672, + "step": 100585 + }, + { + "epoch": 11.066006600660065, + "grad_norm": 0.013359090313315392, + "learning_rate": 2.47124782249503e-05, + "loss": 0.0022, + "num_input_tokens_seen": 21229728, + "step": 100590 + }, + { + "epoch": 11.066556655665567, + "grad_norm": 0.06284332275390625, + "learning_rate": 2.471007831728447e-05, + "loss": 0.0084, + "num_input_tokens_seen": 21230784, + "step": 100595 + }, + { + "epoch": 11.067106710671068, + "grad_norm": 0.0735001415014267, + "learning_rate": 2.470767841229071e-05, + "loss": 0.0935, + "num_input_tokens_seen": 21231840, + "step": 100600 + }, + { + "epoch": 11.067656765676567, + "grad_norm": 0.03486587107181549, + "learning_rate": 2.470527850999113e-05, + "loss": 0.1413, + "num_input_tokens_seen": 21232960, + "step": 100605 + }, + { + "epoch": 11.068206820682068, + "grad_norm": 0.09766238182783127, + "learning_rate": 2.4702878610407856e-05, + "loss": 0.0408, + "num_input_tokens_seen": 21234016, + "step": 100610 + }, + { + "epoch": 11.06875687568757, + "grad_norm": 0.03939232602715492, + "learning_rate": 2.4700478713563006e-05, + "loss": 0.1463, + "num_input_tokens_seen": 21235104, + "step": 100615 + }, + { + "epoch": 11.069306930693068, + "grad_norm": 0.6761857271194458, + "learning_rate": 2.4698078819478708e-05, + "loss": 0.01, + "num_input_tokens_seen": 21236128, + "step": 100620 + }, + { + "epoch": 11.06985698569857, + "grad_norm": 0.016310371458530426, + "learning_rate": 2.4695678928177072e-05, + "loss": 0.0248, + "num_input_tokens_seen": 21237152, + "step": 100625 + }, + { + "epoch": 11.07040704070407, + "grad_norm": 3.2333192825317383, + "learning_rate": 2.469327903968021e-05, + "loss": 0.0728, + "num_input_tokens_seen": 21238304, + "step": 100630 + }, + { + "epoch": 11.070957095709572, + "grad_norm": 0.24433079361915588, + "learning_rate": 2.4690879154010258e-05, + "loss": 0.1603, + "num_input_tokens_seen": 21239328, + "step": 100635 + }, + { + "epoch": 11.071507150715071, + "grad_norm": 0.09562308341264725, + "learning_rate": 2.4688479271189313e-05, + "loss": 0.0066, + "num_input_tokens_seen": 21240448, + "step": 100640 + }, + { + "epoch": 11.072057205720572, + "grad_norm": 1.5607831478118896, + "learning_rate": 2.468607939123951e-05, + "loss": 0.0457, + "num_input_tokens_seen": 21241472, + "step": 100645 + }, + { + "epoch": 11.072607260726073, + "grad_norm": 1.0460786819458008, + "learning_rate": 2.4683679514182963e-05, + "loss": 0.0842, + "num_input_tokens_seen": 21242592, + "step": 100650 + }, + { + "epoch": 11.073157315731573, + "grad_norm": 0.021183297038078308, + "learning_rate": 2.4681279640041788e-05, + "loss": 0.0044, + "num_input_tokens_seen": 21243680, + "step": 100655 + }, + { + "epoch": 11.073707370737074, + "grad_norm": 0.0020573940128087997, + "learning_rate": 2.4678879768838116e-05, + "loss": 0.0134, + "num_input_tokens_seen": 21244736, + "step": 100660 + }, + { + "epoch": 11.074257425742575, + "grad_norm": 1.3452938795089722, + "learning_rate": 2.467647990059404e-05, + "loss": 0.089, + "num_input_tokens_seen": 21245824, + "step": 100665 + }, + { + "epoch": 11.074807480748074, + "grad_norm": 1.2184902429580688, + "learning_rate": 2.46740800353317e-05, + "loss": 0.116, + "num_input_tokens_seen": 21246880, + "step": 100670 + }, + { + "epoch": 11.075357535753575, + "grad_norm": 2.1715855598449707, + "learning_rate": 2.4671680173073217e-05, + "loss": 0.0841, + "num_input_tokens_seen": 21247904, + "step": 100675 + }, + { + "epoch": 11.075907590759076, + "grad_norm": 0.22175022959709167, + "learning_rate": 2.4669280313840686e-05, + "loss": 0.0285, + "num_input_tokens_seen": 21248896, + "step": 100680 + }, + { + "epoch": 11.076457645764576, + "grad_norm": 1.5898096561431885, + "learning_rate": 2.4666880457656246e-05, + "loss": 0.1058, + "num_input_tokens_seen": 21250016, + "step": 100685 + }, + { + "epoch": 11.077007700770077, + "grad_norm": 0.023027125746011734, + "learning_rate": 2.4664480604542003e-05, + "loss": 0.0095, + "num_input_tokens_seen": 21251072, + "step": 100690 + }, + { + "epoch": 11.077557755775578, + "grad_norm": 0.028422528877854347, + "learning_rate": 2.466208075452009e-05, + "loss": 0.0533, + "num_input_tokens_seen": 21252160, + "step": 100695 + }, + { + "epoch": 11.078107810781079, + "grad_norm": 0.23499347269535065, + "learning_rate": 2.4659680907612617e-05, + "loss": 0.023, + "num_input_tokens_seen": 21253216, + "step": 100700 + }, + { + "epoch": 11.078657865786578, + "grad_norm": 0.29845380783081055, + "learning_rate": 2.4657281063841693e-05, + "loss": 0.0762, + "num_input_tokens_seen": 21254304, + "step": 100705 + }, + { + "epoch": 11.07920792079208, + "grad_norm": 0.04476982727646828, + "learning_rate": 2.465488122322945e-05, + "loss": 0.0283, + "num_input_tokens_seen": 21255328, + "step": 100710 + }, + { + "epoch": 11.07975797579758, + "grad_norm": 0.013673323206603527, + "learning_rate": 2.4652481385797994e-05, + "loss": 0.0836, + "num_input_tokens_seen": 21256384, + "step": 100715 + }, + { + "epoch": 11.08030803080308, + "grad_norm": 0.07445547729730606, + "learning_rate": 2.465008155156947e-05, + "loss": 0.0463, + "num_input_tokens_seen": 21257376, + "step": 100720 + }, + { + "epoch": 11.08085808580858, + "grad_norm": 0.015835251659154892, + "learning_rate": 2.464768172056596e-05, + "loss": 0.1349, + "num_input_tokens_seen": 21258400, + "step": 100725 + }, + { + "epoch": 11.081408140814082, + "grad_norm": 0.43839231133461, + "learning_rate": 2.46452818928096e-05, + "loss": 0.0167, + "num_input_tokens_seen": 21259424, + "step": 100730 + }, + { + "epoch": 11.081958195819581, + "grad_norm": 0.13486771285533905, + "learning_rate": 2.4642882068322516e-05, + "loss": 0.029, + "num_input_tokens_seen": 21260448, + "step": 100735 + }, + { + "epoch": 11.082508250825082, + "grad_norm": 0.5892423391342163, + "learning_rate": 2.4640482247126802e-05, + "loss": 0.1267, + "num_input_tokens_seen": 21261536, + "step": 100740 + }, + { + "epoch": 11.083058305830583, + "grad_norm": 0.042878419160842896, + "learning_rate": 2.46380824292446e-05, + "loss": 0.0345, + "num_input_tokens_seen": 21262592, + "step": 100745 + }, + { + "epoch": 11.083608360836084, + "grad_norm": 0.07588062435388565, + "learning_rate": 2.4635682614698023e-05, + "loss": 0.0103, + "num_input_tokens_seen": 21263744, + "step": 100750 + }, + { + "epoch": 11.084158415841584, + "grad_norm": 0.026398872956633568, + "learning_rate": 2.4633282803509173e-05, + "loss": 0.0117, + "num_input_tokens_seen": 21264768, + "step": 100755 + }, + { + "epoch": 11.084708470847085, + "grad_norm": 0.07341349124908447, + "learning_rate": 2.4630882995700185e-05, + "loss": 0.0081, + "num_input_tokens_seen": 21265792, + "step": 100760 + }, + { + "epoch": 11.085258525852586, + "grad_norm": 0.08409693837165833, + "learning_rate": 2.4628483191293165e-05, + "loss": 0.064, + "num_input_tokens_seen": 21266816, + "step": 100765 + }, + { + "epoch": 11.085808580858085, + "grad_norm": 0.03690857067704201, + "learning_rate": 2.4626083390310246e-05, + "loss": 0.037, + "num_input_tokens_seen": 21267904, + "step": 100770 + }, + { + "epoch": 11.086358635863586, + "grad_norm": 2.747450351715088, + "learning_rate": 2.4623683592773538e-05, + "loss": 0.1046, + "num_input_tokens_seen": 21268928, + "step": 100775 + }, + { + "epoch": 11.086908690869087, + "grad_norm": 0.012600621208548546, + "learning_rate": 2.4621283798705148e-05, + "loss": 0.0157, + "num_input_tokens_seen": 21270048, + "step": 100780 + }, + { + "epoch": 11.087458745874587, + "grad_norm": 0.023080511018633842, + "learning_rate": 2.4618884008127212e-05, + "loss": 0.0031, + "num_input_tokens_seen": 21271072, + "step": 100785 + }, + { + "epoch": 11.088008800880088, + "grad_norm": 0.08122440427541733, + "learning_rate": 2.4616484221061844e-05, + "loss": 0.0064, + "num_input_tokens_seen": 21272064, + "step": 100790 + }, + { + "epoch": 11.088558855885589, + "grad_norm": 0.9665572047233582, + "learning_rate": 2.4614084437531144e-05, + "loss": 0.0151, + "num_input_tokens_seen": 21273152, + "step": 100795 + }, + { + "epoch": 11.089108910891088, + "grad_norm": 1.883876919746399, + "learning_rate": 2.461168465755725e-05, + "loss": 0.0701, + "num_input_tokens_seen": 21274208, + "step": 100800 + }, + { + "epoch": 11.08965896589659, + "grad_norm": 0.28998008370399475, + "learning_rate": 2.4609284881162264e-05, + "loss": 0.0863, + "num_input_tokens_seen": 21275328, + "step": 100805 + }, + { + "epoch": 11.09020902090209, + "grad_norm": 0.04851751774549484, + "learning_rate": 2.4606885108368323e-05, + "loss": 0.0866, + "num_input_tokens_seen": 21276320, + "step": 100810 + }, + { + "epoch": 11.090759075907592, + "grad_norm": 0.669171929359436, + "learning_rate": 2.460448533919753e-05, + "loss": 0.0445, + "num_input_tokens_seen": 21277376, + "step": 100815 + }, + { + "epoch": 11.091309130913091, + "grad_norm": 0.7972713112831116, + "learning_rate": 2.4602085573672e-05, + "loss": 0.0172, + "num_input_tokens_seen": 21278464, + "step": 100820 + }, + { + "epoch": 11.091859185918592, + "grad_norm": 0.47510069608688354, + "learning_rate": 2.4599685811813866e-05, + "loss": 0.024, + "num_input_tokens_seen": 21279552, + "step": 100825 + }, + { + "epoch": 11.092409240924093, + "grad_norm": 0.10469979792833328, + "learning_rate": 2.4597286053645223e-05, + "loss": 0.007, + "num_input_tokens_seen": 21280608, + "step": 100830 + }, + { + "epoch": 11.092959295929592, + "grad_norm": 1.1756013631820679, + "learning_rate": 2.4594886299188214e-05, + "loss": 0.0522, + "num_input_tokens_seen": 21281632, + "step": 100835 + }, + { + "epoch": 11.093509350935093, + "grad_norm": 0.08872464299201965, + "learning_rate": 2.4592486548464937e-05, + "loss": 0.0683, + "num_input_tokens_seen": 21282784, + "step": 100840 + }, + { + "epoch": 11.094059405940595, + "grad_norm": 0.2491617053747177, + "learning_rate": 2.4590086801497515e-05, + "loss": 0.0069, + "num_input_tokens_seen": 21283776, + "step": 100845 + }, + { + "epoch": 11.094609460946094, + "grad_norm": 0.46948131918907166, + "learning_rate": 2.4587687058308074e-05, + "loss": 0.0245, + "num_input_tokens_seen": 21284832, + "step": 100850 + }, + { + "epoch": 11.095159515951595, + "grad_norm": 1.2588878870010376, + "learning_rate": 2.4585287318918714e-05, + "loss": 0.0309, + "num_input_tokens_seen": 21285888, + "step": 100855 + }, + { + "epoch": 11.095709570957096, + "grad_norm": 0.1111726462841034, + "learning_rate": 2.4582887583351563e-05, + "loss": 0.0539, + "num_input_tokens_seen": 21287008, + "step": 100860 + }, + { + "epoch": 11.096259625962595, + "grad_norm": 0.32853230834007263, + "learning_rate": 2.458048785162875e-05, + "loss": 0.0068, + "num_input_tokens_seen": 21288064, + "step": 100865 + }, + { + "epoch": 11.096809680968097, + "grad_norm": 0.5608848929405212, + "learning_rate": 2.4578088123772362e-05, + "loss": 0.0161, + "num_input_tokens_seen": 21289120, + "step": 100870 + }, + { + "epoch": 11.097359735973598, + "grad_norm": 0.012846898287534714, + "learning_rate": 2.4575688399804538e-05, + "loss": 0.0052, + "num_input_tokens_seen": 21290208, + "step": 100875 + }, + { + "epoch": 11.097909790979099, + "grad_norm": 0.38377365469932556, + "learning_rate": 2.4573288679747387e-05, + "loss": 0.0088, + "num_input_tokens_seen": 21291232, + "step": 100880 + }, + { + "epoch": 11.098459845984598, + "grad_norm": 0.032421648502349854, + "learning_rate": 2.4570888963623044e-05, + "loss": 0.0254, + "num_input_tokens_seen": 21292320, + "step": 100885 + }, + { + "epoch": 11.099009900990099, + "grad_norm": 0.30615076422691345, + "learning_rate": 2.4568489251453603e-05, + "loss": 0.0809, + "num_input_tokens_seen": 21293344, + "step": 100890 + }, + { + "epoch": 11.0995599559956, + "grad_norm": 0.48200204968452454, + "learning_rate": 2.4566089543261184e-05, + "loss": 0.009, + "num_input_tokens_seen": 21294400, + "step": 100895 + }, + { + "epoch": 11.1001100110011, + "grad_norm": 2.190300226211548, + "learning_rate": 2.4563689839067913e-05, + "loss": 0.0402, + "num_input_tokens_seen": 21295520, + "step": 100900 + }, + { + "epoch": 11.1006600660066, + "grad_norm": 1.2339223623275757, + "learning_rate": 2.45612901388959e-05, + "loss": 0.0784, + "num_input_tokens_seen": 21296608, + "step": 100905 + }, + { + "epoch": 11.101210121012102, + "grad_norm": 0.016931433230638504, + "learning_rate": 2.455889044276728e-05, + "loss": 0.0473, + "num_input_tokens_seen": 21297632, + "step": 100910 + }, + { + "epoch": 11.101760176017601, + "grad_norm": 0.010401486419141293, + "learning_rate": 2.4556490750704146e-05, + "loss": 0.0673, + "num_input_tokens_seen": 21298624, + "step": 100915 + }, + { + "epoch": 11.102310231023102, + "grad_norm": 1.1082638502120972, + "learning_rate": 2.4554091062728615e-05, + "loss": 0.0459, + "num_input_tokens_seen": 21299744, + "step": 100920 + }, + { + "epoch": 11.102860286028603, + "grad_norm": 0.03867685794830322, + "learning_rate": 2.4551691378862827e-05, + "loss": 0.0344, + "num_input_tokens_seen": 21300800, + "step": 100925 + }, + { + "epoch": 11.103410341034103, + "grad_norm": 0.0178147591650486, + "learning_rate": 2.4549291699128873e-05, + "loss": 0.002, + "num_input_tokens_seen": 21301824, + "step": 100930 + }, + { + "epoch": 11.103960396039604, + "grad_norm": 0.19469690322875977, + "learning_rate": 2.454689202354889e-05, + "loss": 0.0043, + "num_input_tokens_seen": 21302848, + "step": 100935 + }, + { + "epoch": 11.104510451045105, + "grad_norm": 0.01768490858376026, + "learning_rate": 2.4544492352144986e-05, + "loss": 0.0131, + "num_input_tokens_seen": 21303936, + "step": 100940 + }, + { + "epoch": 11.105060506050606, + "grad_norm": 0.4369436502456665, + "learning_rate": 2.4542092684939262e-05, + "loss": 0.0593, + "num_input_tokens_seen": 21304960, + "step": 100945 + }, + { + "epoch": 11.105610561056105, + "grad_norm": 0.013596984557807446, + "learning_rate": 2.4539693021953873e-05, + "loss": 0.0311, + "num_input_tokens_seen": 21306016, + "step": 100950 + }, + { + "epoch": 11.106160616061606, + "grad_norm": 0.03429847210645676, + "learning_rate": 2.4537293363210896e-05, + "loss": 0.0505, + "num_input_tokens_seen": 21307072, + "step": 100955 + }, + { + "epoch": 11.106710671067107, + "grad_norm": 0.04856012761592865, + "learning_rate": 2.453489370873247e-05, + "loss": 0.0241, + "num_input_tokens_seen": 21308160, + "step": 100960 + }, + { + "epoch": 11.107260726072607, + "grad_norm": 0.9366456866264343, + "learning_rate": 2.453249405854071e-05, + "loss": 0.0605, + "num_input_tokens_seen": 21309184, + "step": 100965 + }, + { + "epoch": 11.107810781078108, + "grad_norm": 0.03650360926985741, + "learning_rate": 2.4530094412657716e-05, + "loss": 0.0022, + "num_input_tokens_seen": 21310208, + "step": 100970 + }, + { + "epoch": 11.108360836083609, + "grad_norm": 0.0068641286343336105, + "learning_rate": 2.4527694771105625e-05, + "loss": 0.0675, + "num_input_tokens_seen": 21311232, + "step": 100975 + }, + { + "epoch": 11.108910891089108, + "grad_norm": 0.16636653244495392, + "learning_rate": 2.4525295133906548e-05, + "loss": 0.0513, + "num_input_tokens_seen": 21312256, + "step": 100980 + }, + { + "epoch": 11.10946094609461, + "grad_norm": 0.2567789852619171, + "learning_rate": 2.452289550108259e-05, + "loss": 0.0066, + "num_input_tokens_seen": 21313280, + "step": 100985 + }, + { + "epoch": 11.11001100110011, + "grad_norm": 0.9502329230308533, + "learning_rate": 2.4520495872655877e-05, + "loss": 0.0188, + "num_input_tokens_seen": 21314304, + "step": 100990 + }, + { + "epoch": 11.110561056105611, + "grad_norm": 0.31306418776512146, + "learning_rate": 2.451809624864852e-05, + "loss": 0.0244, + "num_input_tokens_seen": 21315360, + "step": 100995 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 2.702334403991699, + "learning_rate": 2.4515696629082648e-05, + "loss": 0.0246, + "num_input_tokens_seen": 21316448, + "step": 101000 + }, + { + "epoch": 11.111661166116612, + "grad_norm": 0.027089182287454605, + "learning_rate": 2.4513297013980363e-05, + "loss": 0.1024, + "num_input_tokens_seen": 21317536, + "step": 101005 + }, + { + "epoch": 11.112211221122113, + "grad_norm": 0.07432713359594345, + "learning_rate": 2.4510897403363782e-05, + "loss": 0.0044, + "num_input_tokens_seen": 21318560, + "step": 101010 + }, + { + "epoch": 11.112761276127612, + "grad_norm": 0.20782147347927094, + "learning_rate": 2.450849779725503e-05, + "loss": 0.0103, + "num_input_tokens_seen": 21319552, + "step": 101015 + }, + { + "epoch": 11.113311331133113, + "grad_norm": 0.08697531372308731, + "learning_rate": 2.4506098195676212e-05, + "loss": 0.0049, + "num_input_tokens_seen": 21320640, + "step": 101020 + }, + { + "epoch": 11.113861386138614, + "grad_norm": 0.2134087234735489, + "learning_rate": 2.450369859864946e-05, + "loss": 0.014, + "num_input_tokens_seen": 21321632, + "step": 101025 + }, + { + "epoch": 11.114411441144114, + "grad_norm": 0.008221158757805824, + "learning_rate": 2.4501299006196872e-05, + "loss": 0.0182, + "num_input_tokens_seen": 21322720, + "step": 101030 + }, + { + "epoch": 11.114961496149615, + "grad_norm": 0.31253933906555176, + "learning_rate": 2.449889941834057e-05, + "loss": 0.0572, + "num_input_tokens_seen": 21323744, + "step": 101035 + }, + { + "epoch": 11.115511551155116, + "grad_norm": 1.735394835472107, + "learning_rate": 2.4496499835102684e-05, + "loss": 0.0419, + "num_input_tokens_seen": 21324768, + "step": 101040 + }, + { + "epoch": 11.116061606160615, + "grad_norm": 0.03248084709048271, + "learning_rate": 2.4494100256505303e-05, + "loss": 0.0052, + "num_input_tokens_seen": 21325856, + "step": 101045 + }, + { + "epoch": 11.116611661166116, + "grad_norm": 0.04840483143925667, + "learning_rate": 2.4491700682570565e-05, + "loss": 0.0758, + "num_input_tokens_seen": 21326880, + "step": 101050 + }, + { + "epoch": 11.117161716171617, + "grad_norm": 1.18292236328125, + "learning_rate": 2.4489301113320583e-05, + "loss": 0.0905, + "num_input_tokens_seen": 21327904, + "step": 101055 + }, + { + "epoch": 11.117711771177119, + "grad_norm": 0.1123601496219635, + "learning_rate": 2.4486901548777455e-05, + "loss": 0.015, + "num_input_tokens_seen": 21328960, + "step": 101060 + }, + { + "epoch": 11.118261826182618, + "grad_norm": 0.024687113240361214, + "learning_rate": 2.4484501988963314e-05, + "loss": 0.0059, + "num_input_tokens_seen": 21330016, + "step": 101065 + }, + { + "epoch": 11.118811881188119, + "grad_norm": 2.0056354999542236, + "learning_rate": 2.448210243390027e-05, + "loss": 0.0495, + "num_input_tokens_seen": 21331008, + "step": 101070 + }, + { + "epoch": 11.11936193619362, + "grad_norm": 3.006446361541748, + "learning_rate": 2.4479702883610443e-05, + "loss": 0.0623, + "num_input_tokens_seen": 21332128, + "step": 101075 + }, + { + "epoch": 11.11991199119912, + "grad_norm": 0.7503377795219421, + "learning_rate": 2.4477303338115947e-05, + "loss": 0.0759, + "num_input_tokens_seen": 21333152, + "step": 101080 + }, + { + "epoch": 11.12046204620462, + "grad_norm": 0.06104062870144844, + "learning_rate": 2.4474903797438887e-05, + "loss": 0.0176, + "num_input_tokens_seen": 21334144, + "step": 101085 + }, + { + "epoch": 11.121012101210122, + "grad_norm": 0.03250656649470329, + "learning_rate": 2.4472504261601393e-05, + "loss": 0.0034, + "num_input_tokens_seen": 21335200, + "step": 101090 + }, + { + "epoch": 11.12156215621562, + "grad_norm": 0.03753367438912392, + "learning_rate": 2.4470104730625564e-05, + "loss": 0.019, + "num_input_tokens_seen": 21336256, + "step": 101095 + }, + { + "epoch": 11.122112211221122, + "grad_norm": 0.13826347887516022, + "learning_rate": 2.446770520453354e-05, + "loss": 0.0031, + "num_input_tokens_seen": 21337344, + "step": 101100 + }, + { + "epoch": 11.122662266226623, + "grad_norm": 0.6907459497451782, + "learning_rate": 2.4465305683347416e-05, + "loss": 0.0193, + "num_input_tokens_seen": 21338368, + "step": 101105 + }, + { + "epoch": 11.123212321232122, + "grad_norm": 0.8086946606636047, + "learning_rate": 2.446290616708931e-05, + "loss": 0.0282, + "num_input_tokens_seen": 21339424, + "step": 101110 + }, + { + "epoch": 11.123762376237623, + "grad_norm": 0.155099019408226, + "learning_rate": 2.4460506655781348e-05, + "loss": 0.1004, + "num_input_tokens_seen": 21340512, + "step": 101115 + }, + { + "epoch": 11.124312431243125, + "grad_norm": 0.06565189361572266, + "learning_rate": 2.445810714944563e-05, + "loss": 0.0033, + "num_input_tokens_seen": 21341568, + "step": 101120 + }, + { + "epoch": 11.124862486248626, + "grad_norm": 0.9069449305534363, + "learning_rate": 2.4455707648104277e-05, + "loss": 0.0391, + "num_input_tokens_seen": 21342560, + "step": 101125 + }, + { + "epoch": 11.125412541254125, + "grad_norm": 0.010425828397274017, + "learning_rate": 2.4453308151779407e-05, + "loss": 0.0204, + "num_input_tokens_seen": 21343616, + "step": 101130 + }, + { + "epoch": 11.125962596259626, + "grad_norm": 0.011470424942672253, + "learning_rate": 2.4450908660493133e-05, + "loss": 0.021, + "num_input_tokens_seen": 21344672, + "step": 101135 + }, + { + "epoch": 11.126512651265127, + "grad_norm": 0.023942269384860992, + "learning_rate": 2.444850917426758e-05, + "loss": 0.0499, + "num_input_tokens_seen": 21345792, + "step": 101140 + }, + { + "epoch": 11.127062706270626, + "grad_norm": 0.09986662864685059, + "learning_rate": 2.444610969312484e-05, + "loss": 0.0056, + "num_input_tokens_seen": 21346912, + "step": 101145 + }, + { + "epoch": 11.127612761276128, + "grad_norm": 0.15956461429595947, + "learning_rate": 2.4443710217087045e-05, + "loss": 0.0157, + "num_input_tokens_seen": 21348064, + "step": 101150 + }, + { + "epoch": 11.128162816281629, + "grad_norm": 0.022564757615327835, + "learning_rate": 2.4441310746176317e-05, + "loss": 0.003, + "num_input_tokens_seen": 21349152, + "step": 101155 + }, + { + "epoch": 11.128712871287128, + "grad_norm": 0.03489161282777786, + "learning_rate": 2.4438911280414744e-05, + "loss": 0.0215, + "num_input_tokens_seen": 21350176, + "step": 101160 + }, + { + "epoch": 11.129262926292629, + "grad_norm": 0.02734333835542202, + "learning_rate": 2.4436511819824466e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21351264, + "step": 101165 + }, + { + "epoch": 11.12981298129813, + "grad_norm": 1.612973928451538, + "learning_rate": 2.4434112364427587e-05, + "loss": 0.0962, + "num_input_tokens_seen": 21352352, + "step": 101170 + }, + { + "epoch": 11.130363036303631, + "grad_norm": 0.710676372051239, + "learning_rate": 2.443171291424622e-05, + "loss": 0.0078, + "num_input_tokens_seen": 21353472, + "step": 101175 + }, + { + "epoch": 11.13091309130913, + "grad_norm": 0.022904375568032265, + "learning_rate": 2.4429313469302483e-05, + "loss": 0.0028, + "num_input_tokens_seen": 21354496, + "step": 101180 + }, + { + "epoch": 11.131463146314632, + "grad_norm": 0.13026301562786102, + "learning_rate": 2.4426914029618485e-05, + "loss": 0.0037, + "num_input_tokens_seen": 21355488, + "step": 101185 + }, + { + "epoch": 11.132013201320133, + "grad_norm": 0.010306186974048615, + "learning_rate": 2.4424514595216352e-05, + "loss": 0.0849, + "num_input_tokens_seen": 21356512, + "step": 101190 + }, + { + "epoch": 11.132563256325632, + "grad_norm": 0.004541358444839716, + "learning_rate": 2.4422115166118196e-05, + "loss": 0.0112, + "num_input_tokens_seen": 21357568, + "step": 101195 + }, + { + "epoch": 11.133113311331133, + "grad_norm": 0.01831413060426712, + "learning_rate": 2.4419715742346116e-05, + "loss": 0.0063, + "num_input_tokens_seen": 21358720, + "step": 101200 + }, + { + "epoch": 11.133663366336634, + "grad_norm": 0.08947838097810745, + "learning_rate": 2.4417316323922246e-05, + "loss": 0.0126, + "num_input_tokens_seen": 21359744, + "step": 101205 + }, + { + "epoch": 11.134213421342134, + "grad_norm": 0.052926693111658096, + "learning_rate": 2.4414916910868683e-05, + "loss": 0.1063, + "num_input_tokens_seen": 21360736, + "step": 101210 + }, + { + "epoch": 11.134763476347635, + "grad_norm": 1.1414897441864014, + "learning_rate": 2.4412517503207563e-05, + "loss": 0.0485, + "num_input_tokens_seen": 21361728, + "step": 101215 + }, + { + "epoch": 11.135313531353136, + "grad_norm": 0.02547605335712433, + "learning_rate": 2.4410118100960982e-05, + "loss": 0.0323, + "num_input_tokens_seen": 21362816, + "step": 101220 + }, + { + "epoch": 11.135863586358635, + "grad_norm": 1.5022152662277222, + "learning_rate": 2.440771870415105e-05, + "loss": 0.0522, + "num_input_tokens_seen": 21363904, + "step": 101225 + }, + { + "epoch": 11.136413641364136, + "grad_norm": 0.02394939213991165, + "learning_rate": 2.4405319312799906e-05, + "loss": 0.0018, + "num_input_tokens_seen": 21364896, + "step": 101230 + }, + { + "epoch": 11.136963696369637, + "grad_norm": 0.07263470441102982, + "learning_rate": 2.4402919926929637e-05, + "loss": 0.0629, + "num_input_tokens_seen": 21365920, + "step": 101235 + }, + { + "epoch": 11.137513751375138, + "grad_norm": 1.9910563230514526, + "learning_rate": 2.440052054656237e-05, + "loss": 0.0725, + "num_input_tokens_seen": 21367008, + "step": 101240 + }, + { + "epoch": 11.138063806380638, + "grad_norm": 0.021946223452687263, + "learning_rate": 2.439812117172023e-05, + "loss": 0.0929, + "num_input_tokens_seen": 21368032, + "step": 101245 + }, + { + "epoch": 11.138613861386139, + "grad_norm": 0.1363438367843628, + "learning_rate": 2.4395721802425297e-05, + "loss": 0.0058, + "num_input_tokens_seen": 21369088, + "step": 101250 + }, + { + "epoch": 11.13916391639164, + "grad_norm": 0.043865252286195755, + "learning_rate": 2.439332243869972e-05, + "loss": 0.0141, + "num_input_tokens_seen": 21370144, + "step": 101255 + }, + { + "epoch": 11.13971397139714, + "grad_norm": 0.004254547879099846, + "learning_rate": 2.4390923080565593e-05, + "loss": 0.0056, + "num_input_tokens_seen": 21371232, + "step": 101260 + }, + { + "epoch": 11.14026402640264, + "grad_norm": 2.5912513732910156, + "learning_rate": 2.4388523728045044e-05, + "loss": 0.0951, + "num_input_tokens_seen": 21372320, + "step": 101265 + }, + { + "epoch": 11.140814081408141, + "grad_norm": 0.08363603055477142, + "learning_rate": 2.4386124381160177e-05, + "loss": 0.0504, + "num_input_tokens_seen": 21373376, + "step": 101270 + }, + { + "epoch": 11.14136413641364, + "grad_norm": 0.007756928913295269, + "learning_rate": 2.43837250399331e-05, + "loss": 0.0165, + "num_input_tokens_seen": 21374400, + "step": 101275 + }, + { + "epoch": 11.141914191419142, + "grad_norm": 0.18005163967609406, + "learning_rate": 2.438132570438594e-05, + "loss": 0.0266, + "num_input_tokens_seen": 21375424, + "step": 101280 + }, + { + "epoch": 11.142464246424643, + "grad_norm": 0.21951961517333984, + "learning_rate": 2.4378926374540794e-05, + "loss": 0.0294, + "num_input_tokens_seen": 21376512, + "step": 101285 + }, + { + "epoch": 11.143014301430142, + "grad_norm": 0.3895135223865509, + "learning_rate": 2.4376527050419802e-05, + "loss": 0.0243, + "num_input_tokens_seen": 21377568, + "step": 101290 + }, + { + "epoch": 11.143564356435643, + "grad_norm": 0.1783972680568695, + "learning_rate": 2.4374127732045048e-05, + "loss": 0.0122, + "num_input_tokens_seen": 21378688, + "step": 101295 + }, + { + "epoch": 11.144114411441144, + "grad_norm": 0.13917632400989532, + "learning_rate": 2.437172841943866e-05, + "loss": 0.0152, + "num_input_tokens_seen": 21379744, + "step": 101300 + }, + { + "epoch": 11.144664466446645, + "grad_norm": 1.4838441610336304, + "learning_rate": 2.4369329112622758e-05, + "loss": 0.1228, + "num_input_tokens_seen": 21380832, + "step": 101305 + }, + { + "epoch": 11.145214521452145, + "grad_norm": 0.010332928039133549, + "learning_rate": 2.436692981161943e-05, + "loss": 0.037, + "num_input_tokens_seen": 21381888, + "step": 101310 + }, + { + "epoch": 11.145764576457646, + "grad_norm": 0.031821358948946, + "learning_rate": 2.4364530516450824e-05, + "loss": 0.0411, + "num_input_tokens_seen": 21382976, + "step": 101315 + }, + { + "epoch": 11.146314631463147, + "grad_norm": 0.06672707945108414, + "learning_rate": 2.436213122713903e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21384032, + "step": 101320 + }, + { + "epoch": 11.146864686468646, + "grad_norm": 0.022776011377573013, + "learning_rate": 2.4359731943706157e-05, + "loss": 0.0401, + "num_input_tokens_seen": 21385088, + "step": 101325 + }, + { + "epoch": 11.147414741474147, + "grad_norm": 0.05613785982131958, + "learning_rate": 2.4357332666174343e-05, + "loss": 0.0027, + "num_input_tokens_seen": 21386176, + "step": 101330 + }, + { + "epoch": 11.147964796479648, + "grad_norm": 0.024624692276120186, + "learning_rate": 2.435493339456567e-05, + "loss": 0.0137, + "num_input_tokens_seen": 21387200, + "step": 101335 + }, + { + "epoch": 11.148514851485148, + "grad_norm": 0.2755573093891144, + "learning_rate": 2.4352534128902278e-05, + "loss": 0.0417, + "num_input_tokens_seen": 21388288, + "step": 101340 + }, + { + "epoch": 11.149064906490649, + "grad_norm": 0.262762188911438, + "learning_rate": 2.435013486920627e-05, + "loss": 0.0109, + "num_input_tokens_seen": 21389312, + "step": 101345 + }, + { + "epoch": 11.14961496149615, + "grad_norm": 0.07067784667015076, + "learning_rate": 2.4347735615499743e-05, + "loss": 0.0815, + "num_input_tokens_seen": 21390400, + "step": 101350 + }, + { + "epoch": 11.150165016501651, + "grad_norm": 0.027835503220558167, + "learning_rate": 2.4345336367804837e-05, + "loss": 0.0021, + "num_input_tokens_seen": 21391456, + "step": 101355 + }, + { + "epoch": 11.15071507150715, + "grad_norm": 0.11548065394163132, + "learning_rate": 2.434293712614364e-05, + "loss": 0.0058, + "num_input_tokens_seen": 21392576, + "step": 101360 + }, + { + "epoch": 11.151265126512651, + "grad_norm": 0.11692839860916138, + "learning_rate": 2.434053789053829e-05, + "loss": 0.064, + "num_input_tokens_seen": 21393696, + "step": 101365 + }, + { + "epoch": 11.151815181518153, + "grad_norm": 0.11205731332302094, + "learning_rate": 2.433813866101088e-05, + "loss": 0.065, + "num_input_tokens_seen": 21394784, + "step": 101370 + }, + { + "epoch": 11.152365236523652, + "grad_norm": 0.009394240565598011, + "learning_rate": 2.433573943758352e-05, + "loss": 0.0482, + "num_input_tokens_seen": 21395808, + "step": 101375 + }, + { + "epoch": 11.152915291529153, + "grad_norm": 0.10034102946519852, + "learning_rate": 2.433334022027834e-05, + "loss": 0.0026, + "num_input_tokens_seen": 21396896, + "step": 101380 + }, + { + "epoch": 11.153465346534654, + "grad_norm": 0.16131183505058289, + "learning_rate": 2.4330941009117446e-05, + "loss": 0.0118, + "num_input_tokens_seen": 21398016, + "step": 101385 + }, + { + "epoch": 11.154015401540153, + "grad_norm": 0.04343712329864502, + "learning_rate": 2.432854180412294e-05, + "loss": 0.0041, + "num_input_tokens_seen": 21399040, + "step": 101390 + }, + { + "epoch": 11.154565456545654, + "grad_norm": 0.11966592818498611, + "learning_rate": 2.4326142605316946e-05, + "loss": 0.0079, + "num_input_tokens_seen": 21400160, + "step": 101395 + }, + { + "epoch": 11.155115511551156, + "grad_norm": 0.07112367451190948, + "learning_rate": 2.4323743412721564e-05, + "loss": 0.0233, + "num_input_tokens_seen": 21401248, + "step": 101400 + }, + { + "epoch": 11.155665566556655, + "grad_norm": 0.07458911091089249, + "learning_rate": 2.432134422635893e-05, + "loss": 0.0271, + "num_input_tokens_seen": 21402304, + "step": 101405 + }, + { + "epoch": 11.156215621562156, + "grad_norm": 1.6757824420928955, + "learning_rate": 2.431894504625113e-05, + "loss": 0.0375, + "num_input_tokens_seen": 21403360, + "step": 101410 + }, + { + "epoch": 11.156765676567657, + "grad_norm": 0.021915419027209282, + "learning_rate": 2.4316545872420286e-05, + "loss": 0.0035, + "num_input_tokens_seen": 21404416, + "step": 101415 + }, + { + "epoch": 11.157315731573158, + "grad_norm": 0.07632149755954742, + "learning_rate": 2.431414670488852e-05, + "loss": 0.0905, + "num_input_tokens_seen": 21405472, + "step": 101420 + }, + { + "epoch": 11.157865786578657, + "grad_norm": 0.05777754634618759, + "learning_rate": 2.4311747543677924e-05, + "loss": 0.0105, + "num_input_tokens_seen": 21406528, + "step": 101425 + }, + { + "epoch": 11.158415841584159, + "grad_norm": 0.04468883201479912, + "learning_rate": 2.4309348388810626e-05, + "loss": 0.0018, + "num_input_tokens_seen": 21407648, + "step": 101430 + }, + { + "epoch": 11.15896589658966, + "grad_norm": 2.407219409942627, + "learning_rate": 2.4306949240308733e-05, + "loss": 0.0185, + "num_input_tokens_seen": 21408736, + "step": 101435 + }, + { + "epoch": 11.159515951595159, + "grad_norm": 0.11016905307769775, + "learning_rate": 2.430455009819435e-05, + "loss": 0.0217, + "num_input_tokens_seen": 21409792, + "step": 101440 + }, + { + "epoch": 11.16006600660066, + "grad_norm": 0.023963311687111855, + "learning_rate": 2.4302150962489605e-05, + "loss": 0.0153, + "num_input_tokens_seen": 21410816, + "step": 101445 + }, + { + "epoch": 11.160616061606161, + "grad_norm": 0.023640600964426994, + "learning_rate": 2.429975183321659e-05, + "loss": 0.0161, + "num_input_tokens_seen": 21411968, + "step": 101450 + }, + { + "epoch": 11.16116611661166, + "grad_norm": 0.007867448031902313, + "learning_rate": 2.429735271039743e-05, + "loss": 0.1013, + "num_input_tokens_seen": 21413024, + "step": 101455 + }, + { + "epoch": 11.161716171617162, + "grad_norm": 0.11521504819393158, + "learning_rate": 2.4294953594054243e-05, + "loss": 0.0047, + "num_input_tokens_seen": 21414048, + "step": 101460 + }, + { + "epoch": 11.162266226622663, + "grad_norm": 0.015795212239027023, + "learning_rate": 2.4292554484209114e-05, + "loss": 0.0241, + "num_input_tokens_seen": 21415072, + "step": 101465 + }, + { + "epoch": 11.162816281628162, + "grad_norm": 0.009758726693689823, + "learning_rate": 2.429015538088418e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21416096, + "step": 101470 + }, + { + "epoch": 11.163366336633663, + "grad_norm": 0.008916888386011124, + "learning_rate": 2.4287756284101536e-05, + "loss": 0.0025, + "num_input_tokens_seen": 21417152, + "step": 101475 + }, + { + "epoch": 11.163916391639164, + "grad_norm": 1.3821016550064087, + "learning_rate": 2.4285357193883313e-05, + "loss": 0.0243, + "num_input_tokens_seen": 21418272, + "step": 101480 + }, + { + "epoch": 11.164466446644665, + "grad_norm": 0.32146236300468445, + "learning_rate": 2.4282958110251604e-05, + "loss": 0.0079, + "num_input_tokens_seen": 21419328, + "step": 101485 + }, + { + "epoch": 11.165016501650165, + "grad_norm": 0.03468668460845947, + "learning_rate": 2.428055903322852e-05, + "loss": 0.046, + "num_input_tokens_seen": 21420384, + "step": 101490 + }, + { + "epoch": 11.165566556655666, + "grad_norm": 0.035376619547605515, + "learning_rate": 2.4278159962836183e-05, + "loss": 0.0561, + "num_input_tokens_seen": 21421376, + "step": 101495 + }, + { + "epoch": 11.166116611661167, + "grad_norm": 0.10344677418470383, + "learning_rate": 2.4275760899096694e-05, + "loss": 0.0119, + "num_input_tokens_seen": 21422432, + "step": 101500 + }, + { + "epoch": 11.166666666666666, + "grad_norm": 0.00801663938909769, + "learning_rate": 2.427336184203218e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21423424, + "step": 101505 + }, + { + "epoch": 11.167216721672167, + "grad_norm": 1.224390983581543, + "learning_rate": 2.4270962791664736e-05, + "loss": 0.0635, + "num_input_tokens_seen": 21424416, + "step": 101510 + }, + { + "epoch": 11.167766776677668, + "grad_norm": 0.26542744040489197, + "learning_rate": 2.4268563748016475e-05, + "loss": 0.0057, + "num_input_tokens_seen": 21425440, + "step": 101515 + }, + { + "epoch": 11.168316831683168, + "grad_norm": 0.05167941376566887, + "learning_rate": 2.426616471110952e-05, + "loss": 0.0035, + "num_input_tokens_seen": 21426592, + "step": 101520 + }, + { + "epoch": 11.168866886688669, + "grad_norm": 1.6060044765472412, + "learning_rate": 2.4263765680965962e-05, + "loss": 0.0834, + "num_input_tokens_seen": 21427616, + "step": 101525 + }, + { + "epoch": 11.16941694169417, + "grad_norm": 0.023570319637656212, + "learning_rate": 2.4261366657607925e-05, + "loss": 0.0367, + "num_input_tokens_seen": 21428672, + "step": 101530 + }, + { + "epoch": 11.16996699669967, + "grad_norm": 1.503383755683899, + "learning_rate": 2.4258967641057525e-05, + "loss": 0.0477, + "num_input_tokens_seen": 21429728, + "step": 101535 + }, + { + "epoch": 11.17051705170517, + "grad_norm": 0.076504647731781, + "learning_rate": 2.4256568631336854e-05, + "loss": 0.0045, + "num_input_tokens_seen": 21430784, + "step": 101540 + }, + { + "epoch": 11.171067106710671, + "grad_norm": 0.1607198715209961, + "learning_rate": 2.425416962846804e-05, + "loss": 0.0037, + "num_input_tokens_seen": 21431808, + "step": 101545 + }, + { + "epoch": 11.171617161716172, + "grad_norm": 0.1778116524219513, + "learning_rate": 2.4251770632473178e-05, + "loss": 0.062, + "num_input_tokens_seen": 21432832, + "step": 101550 + }, + { + "epoch": 11.172167216721672, + "grad_norm": 0.0779481828212738, + "learning_rate": 2.4249371643374398e-05, + "loss": 0.0408, + "num_input_tokens_seen": 21433888, + "step": 101555 + }, + { + "epoch": 11.172717271727173, + "grad_norm": 0.045016806572675705, + "learning_rate": 2.4246972661193797e-05, + "loss": 0.0267, + "num_input_tokens_seen": 21434944, + "step": 101560 + }, + { + "epoch": 11.173267326732674, + "grad_norm": 0.04216568544507027, + "learning_rate": 2.4244573685953482e-05, + "loss": 0.003, + "num_input_tokens_seen": 21435968, + "step": 101565 + }, + { + "epoch": 11.173817381738173, + "grad_norm": 0.05698816478252411, + "learning_rate": 2.4242174717675572e-05, + "loss": 0.0229, + "num_input_tokens_seen": 21436992, + "step": 101570 + }, + { + "epoch": 11.174367436743674, + "grad_norm": 0.2465057522058487, + "learning_rate": 2.4239775756382183e-05, + "loss": 0.0038, + "num_input_tokens_seen": 21438016, + "step": 101575 + }, + { + "epoch": 11.174917491749175, + "grad_norm": 0.04205527901649475, + "learning_rate": 2.42373768020954e-05, + "loss": 0.0066, + "num_input_tokens_seen": 21439104, + "step": 101580 + }, + { + "epoch": 11.175467546754675, + "grad_norm": 0.015906814485788345, + "learning_rate": 2.423497785483736e-05, + "loss": 0.0285, + "num_input_tokens_seen": 21440128, + "step": 101585 + }, + { + "epoch": 11.176017601760176, + "grad_norm": 3.3196194171905518, + "learning_rate": 2.4232578914630153e-05, + "loss": 0.0584, + "num_input_tokens_seen": 21441184, + "step": 101590 + }, + { + "epoch": 11.176567656765677, + "grad_norm": 0.12355639040470123, + "learning_rate": 2.423017998149591e-05, + "loss": 0.0303, + "num_input_tokens_seen": 21442240, + "step": 101595 + }, + { + "epoch": 11.177117711771178, + "grad_norm": 0.005616602022200823, + "learning_rate": 2.422778105545672e-05, + "loss": 0.0125, + "num_input_tokens_seen": 21443264, + "step": 101600 + }, + { + "epoch": 11.177667766776677, + "grad_norm": 0.00667272973805666, + "learning_rate": 2.4225382136534702e-05, + "loss": 0.0045, + "num_input_tokens_seen": 21444288, + "step": 101605 + }, + { + "epoch": 11.178217821782178, + "grad_norm": 0.041873764246702194, + "learning_rate": 2.422298322475197e-05, + "loss": 0.0351, + "num_input_tokens_seen": 21445280, + "step": 101610 + }, + { + "epoch": 11.17876787678768, + "grad_norm": 0.7011580467224121, + "learning_rate": 2.422058432013062e-05, + "loss": 0.007, + "num_input_tokens_seen": 21446336, + "step": 101615 + }, + { + "epoch": 11.179317931793179, + "grad_norm": 0.0033018849790096283, + "learning_rate": 2.421818542269278e-05, + "loss": 0.077, + "num_input_tokens_seen": 21447328, + "step": 101620 + }, + { + "epoch": 11.17986798679868, + "grad_norm": 0.2463209182024002, + "learning_rate": 2.4215786532460548e-05, + "loss": 0.0031, + "num_input_tokens_seen": 21448416, + "step": 101625 + }, + { + "epoch": 11.180418041804181, + "grad_norm": 0.05283786356449127, + "learning_rate": 2.4213387649456026e-05, + "loss": 0.0106, + "num_input_tokens_seen": 21449472, + "step": 101630 + }, + { + "epoch": 11.18096809680968, + "grad_norm": 0.3173618018627167, + "learning_rate": 2.4210988773701347e-05, + "loss": 0.0272, + "num_input_tokens_seen": 21450496, + "step": 101635 + }, + { + "epoch": 11.181518151815181, + "grad_norm": 0.00903624389320612, + "learning_rate": 2.4208589905218592e-05, + "loss": 0.0145, + "num_input_tokens_seen": 21451584, + "step": 101640 + }, + { + "epoch": 11.182068206820682, + "grad_norm": 0.00389256258495152, + "learning_rate": 2.420619104402989e-05, + "loss": 0.0041, + "num_input_tokens_seen": 21452608, + "step": 101645 + }, + { + "epoch": 11.182618261826182, + "grad_norm": 0.10210132598876953, + "learning_rate": 2.4203792190157353e-05, + "loss": 0.0127, + "num_input_tokens_seen": 21453664, + "step": 101650 + }, + { + "epoch": 11.183168316831683, + "grad_norm": 0.03731164708733559, + "learning_rate": 2.4201393343623068e-05, + "loss": 0.003, + "num_input_tokens_seen": 21454688, + "step": 101655 + }, + { + "epoch": 11.183718371837184, + "grad_norm": 0.2320261299610138, + "learning_rate": 2.4198994504449164e-05, + "loss": 0.0419, + "num_input_tokens_seen": 21455808, + "step": 101660 + }, + { + "epoch": 11.184268426842685, + "grad_norm": 0.024384137243032455, + "learning_rate": 2.4196595672657735e-05, + "loss": 0.0017, + "num_input_tokens_seen": 21456800, + "step": 101665 + }, + { + "epoch": 11.184818481848184, + "grad_norm": 2.069286346435547, + "learning_rate": 2.419419684827091e-05, + "loss": 0.0342, + "num_input_tokens_seen": 21457888, + "step": 101670 + }, + { + "epoch": 11.185368536853685, + "grad_norm": 0.02665070630609989, + "learning_rate": 2.4191798031310782e-05, + "loss": 0.0054, + "num_input_tokens_seen": 21458912, + "step": 101675 + }, + { + "epoch": 11.185918591859187, + "grad_norm": 0.004186798818409443, + "learning_rate": 2.4189399221799456e-05, + "loss": 0.0383, + "num_input_tokens_seen": 21459936, + "step": 101680 + }, + { + "epoch": 11.186468646864686, + "grad_norm": 0.9936365485191345, + "learning_rate": 2.418700041975906e-05, + "loss": 0.0126, + "num_input_tokens_seen": 21460992, + "step": 101685 + }, + { + "epoch": 11.187018701870187, + "grad_norm": 2.447589874267578, + "learning_rate": 2.418460162521168e-05, + "loss": 0.0593, + "num_input_tokens_seen": 21462080, + "step": 101690 + }, + { + "epoch": 11.187568756875688, + "grad_norm": 0.0052239662036299706, + "learning_rate": 2.4182202838179446e-05, + "loss": 0.0397, + "num_input_tokens_seen": 21463072, + "step": 101695 + }, + { + "epoch": 11.188118811881187, + "grad_norm": 0.012693134136497974, + "learning_rate": 2.4179804058684455e-05, + "loss": 0.0014, + "num_input_tokens_seen": 21464160, + "step": 101700 + }, + { + "epoch": 11.188668866886688, + "grad_norm": 0.06309643387794495, + "learning_rate": 2.4177405286748805e-05, + "loss": 0.0101, + "num_input_tokens_seen": 21465184, + "step": 101705 + }, + { + "epoch": 11.18921892189219, + "grad_norm": 0.033558327704668045, + "learning_rate": 2.4175006522394632e-05, + "loss": 0.0044, + "num_input_tokens_seen": 21466208, + "step": 101710 + }, + { + "epoch": 11.189768976897689, + "grad_norm": 0.009512772783637047, + "learning_rate": 2.4172607765644013e-05, + "loss": 0.1214, + "num_input_tokens_seen": 21467264, + "step": 101715 + }, + { + "epoch": 11.19031903190319, + "grad_norm": 0.07028709352016449, + "learning_rate": 2.417020901651908e-05, + "loss": 0.0043, + "num_input_tokens_seen": 21468320, + "step": 101720 + }, + { + "epoch": 11.190869086908691, + "grad_norm": 0.0063172392547130585, + "learning_rate": 2.4167810275041935e-05, + "loss": 0.0026, + "num_input_tokens_seen": 21469376, + "step": 101725 + }, + { + "epoch": 11.191419141914192, + "grad_norm": 0.8628512024879456, + "learning_rate": 2.416541154123467e-05, + "loss": 0.0121, + "num_input_tokens_seen": 21470336, + "step": 101730 + }, + { + "epoch": 11.191969196919691, + "grad_norm": 0.03202672302722931, + "learning_rate": 2.416301281511942e-05, + "loss": 0.0342, + "num_input_tokens_seen": 21471392, + "step": 101735 + }, + { + "epoch": 11.192519251925193, + "grad_norm": 0.006378052290529013, + "learning_rate": 2.416061409671827e-05, + "loss": 0.002, + "num_input_tokens_seen": 21472416, + "step": 101740 + }, + { + "epoch": 11.193069306930694, + "grad_norm": 0.0023777117021381855, + "learning_rate": 2.4158215386053344e-05, + "loss": 0.0017, + "num_input_tokens_seen": 21473472, + "step": 101745 + }, + { + "epoch": 11.193619361936193, + "grad_norm": 4.446186065673828, + "learning_rate": 2.4155816683146746e-05, + "loss": 0.1817, + "num_input_tokens_seen": 21474464, + "step": 101750 + }, + { + "epoch": 11.194169416941694, + "grad_norm": 0.019513491541147232, + "learning_rate": 2.415341798802057e-05, + "loss": 0.0866, + "num_input_tokens_seen": 21475520, + "step": 101755 + }, + { + "epoch": 11.194719471947195, + "grad_norm": 0.28365394473075867, + "learning_rate": 2.415101930069694e-05, + "loss": 0.0373, + "num_input_tokens_seen": 21476576, + "step": 101760 + }, + { + "epoch": 11.195269526952695, + "grad_norm": 0.011106368154287338, + "learning_rate": 2.414862062119796e-05, + "loss": 0.0161, + "num_input_tokens_seen": 21477536, + "step": 101765 + }, + { + "epoch": 11.195819581958196, + "grad_norm": 0.3096323311328888, + "learning_rate": 2.4146221949545726e-05, + "loss": 0.011, + "num_input_tokens_seen": 21478528, + "step": 101770 + }, + { + "epoch": 11.196369636963697, + "grad_norm": 0.06495574116706848, + "learning_rate": 2.4143823285762364e-05, + "loss": 0.0015, + "num_input_tokens_seen": 21479584, + "step": 101775 + }, + { + "epoch": 11.196919691969198, + "grad_norm": 0.1285373866558075, + "learning_rate": 2.4141424629869964e-05, + "loss": 0.002, + "num_input_tokens_seen": 21480608, + "step": 101780 + }, + { + "epoch": 11.197469746974697, + "grad_norm": 0.10515663772821426, + "learning_rate": 2.413902598189065e-05, + "loss": 0.0052, + "num_input_tokens_seen": 21481632, + "step": 101785 + }, + { + "epoch": 11.198019801980198, + "grad_norm": 0.38812440633773804, + "learning_rate": 2.4136627341846518e-05, + "loss": 0.2565, + "num_input_tokens_seen": 21482624, + "step": 101790 + }, + { + "epoch": 11.1985698569857, + "grad_norm": 0.013512615114450455, + "learning_rate": 2.413422870975967e-05, + "loss": 0.0015, + "num_input_tokens_seen": 21483648, + "step": 101795 + }, + { + "epoch": 11.199119911991199, + "grad_norm": 0.01838984712958336, + "learning_rate": 2.4131830085652226e-05, + "loss": 0.1099, + "num_input_tokens_seen": 21484704, + "step": 101800 + }, + { + "epoch": 11.1996699669967, + "grad_norm": 0.14585217833518982, + "learning_rate": 2.4129431469546285e-05, + "loss": 0.0833, + "num_input_tokens_seen": 21485728, + "step": 101805 + }, + { + "epoch": 11.2002200220022, + "grad_norm": 0.20730847120285034, + "learning_rate": 2.4127032861463965e-05, + "loss": 0.0091, + "num_input_tokens_seen": 21486784, + "step": 101810 + }, + { + "epoch": 11.2007700770077, + "grad_norm": 0.1334831565618515, + "learning_rate": 2.412463426142736e-05, + "loss": 0.0062, + "num_input_tokens_seen": 21487872, + "step": 101815 + }, + { + "epoch": 11.201320132013201, + "grad_norm": 0.0379902608692646, + "learning_rate": 2.4122235669458575e-05, + "loss": 0.0021, + "num_input_tokens_seen": 21488832, + "step": 101820 + }, + { + "epoch": 11.201870187018702, + "grad_norm": 0.07224854826927185, + "learning_rate": 2.411983708557973e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21489952, + "step": 101825 + }, + { + "epoch": 11.202420242024202, + "grad_norm": 0.03756307438015938, + "learning_rate": 2.4117438509812917e-05, + "loss": 0.0106, + "num_input_tokens_seen": 21491008, + "step": 101830 + }, + { + "epoch": 11.202970297029703, + "grad_norm": 0.10182719677686691, + "learning_rate": 2.4115039942180255e-05, + "loss": 0.0732, + "num_input_tokens_seen": 21492064, + "step": 101835 + }, + { + "epoch": 11.203520352035204, + "grad_norm": 0.02084210328757763, + "learning_rate": 2.411264138270385e-05, + "loss": 0.0129, + "num_input_tokens_seen": 21493152, + "step": 101840 + }, + { + "epoch": 11.204070407040705, + "grad_norm": 0.3502587378025055, + "learning_rate": 2.4110242831405792e-05, + "loss": 0.0074, + "num_input_tokens_seen": 21494272, + "step": 101845 + }, + { + "epoch": 11.204620462046204, + "grad_norm": 0.5685160756111145, + "learning_rate": 2.4107844288308206e-05, + "loss": 0.0136, + "num_input_tokens_seen": 21495392, + "step": 101850 + }, + { + "epoch": 11.205170517051705, + "grad_norm": 1.769742727279663, + "learning_rate": 2.4105445753433185e-05, + "loss": 0.1621, + "num_input_tokens_seen": 21496352, + "step": 101855 + }, + { + "epoch": 11.205720572057206, + "grad_norm": 0.020659189671278, + "learning_rate": 2.4103047226802848e-05, + "loss": 0.0025, + "num_input_tokens_seen": 21497376, + "step": 101860 + }, + { + "epoch": 11.206270627062706, + "grad_norm": 0.0193594079464674, + "learning_rate": 2.4100648708439298e-05, + "loss": 0.0031, + "num_input_tokens_seen": 21498464, + "step": 101865 + }, + { + "epoch": 11.206820682068207, + "grad_norm": 0.06293128430843353, + "learning_rate": 2.4098250198364626e-05, + "loss": 0.0365, + "num_input_tokens_seen": 21499520, + "step": 101870 + }, + { + "epoch": 11.207370737073708, + "grad_norm": 0.3947128653526306, + "learning_rate": 2.4095851696600953e-05, + "loss": 0.0642, + "num_input_tokens_seen": 21500608, + "step": 101875 + }, + { + "epoch": 11.207920792079207, + "grad_norm": 0.015537512488663197, + "learning_rate": 2.409345320317038e-05, + "loss": 0.0008, + "num_input_tokens_seen": 21501696, + "step": 101880 + }, + { + "epoch": 11.208470847084708, + "grad_norm": 0.005967292934656143, + "learning_rate": 2.4091054718095022e-05, + "loss": 0.1922, + "num_input_tokens_seen": 21502752, + "step": 101885 + }, + { + "epoch": 11.20902090209021, + "grad_norm": 0.12137505412101746, + "learning_rate": 2.408865624139697e-05, + "loss": 0.0047, + "num_input_tokens_seen": 21503840, + "step": 101890 + }, + { + "epoch": 11.209570957095709, + "grad_norm": 0.03104243054986, + "learning_rate": 2.4086257773098334e-05, + "loss": 0.1462, + "num_input_tokens_seen": 21504832, + "step": 101895 + }, + { + "epoch": 11.21012101210121, + "grad_norm": 0.04912741109728813, + "learning_rate": 2.408385931322123e-05, + "loss": 0.0652, + "num_input_tokens_seen": 21505888, + "step": 101900 + }, + { + "epoch": 11.210671067106711, + "grad_norm": 0.08854672312736511, + "learning_rate": 2.4081460861787747e-05, + "loss": 0.0028, + "num_input_tokens_seen": 21506912, + "step": 101905 + }, + { + "epoch": 11.211221122112212, + "grad_norm": 0.025116249918937683, + "learning_rate": 2.4079062418820002e-05, + "loss": 0.0114, + "num_input_tokens_seen": 21508032, + "step": 101910 + }, + { + "epoch": 11.211771177117711, + "grad_norm": 0.8225861191749573, + "learning_rate": 2.40766639843401e-05, + "loss": 0.0064, + "num_input_tokens_seen": 21509088, + "step": 101915 + }, + { + "epoch": 11.212321232123212, + "grad_norm": 0.6136654019355774, + "learning_rate": 2.4074265558370128e-05, + "loss": 0.0944, + "num_input_tokens_seen": 21510144, + "step": 101920 + }, + { + "epoch": 11.212871287128714, + "grad_norm": 0.016987884417176247, + "learning_rate": 2.4071867140932224e-05, + "loss": 0.045, + "num_input_tokens_seen": 21511168, + "step": 101925 + }, + { + "epoch": 11.213421342134213, + "grad_norm": 0.322270929813385, + "learning_rate": 2.4069468732048468e-05, + "loss": 0.0766, + "num_input_tokens_seen": 21512320, + "step": 101930 + }, + { + "epoch": 11.213971397139714, + "grad_norm": 0.9711005687713623, + "learning_rate": 2.4067070331740972e-05, + "loss": 0.0225, + "num_input_tokens_seen": 21513440, + "step": 101935 + }, + { + "epoch": 11.214521452145215, + "grad_norm": 0.15757547318935394, + "learning_rate": 2.4064671940031846e-05, + "loss": 0.0055, + "num_input_tokens_seen": 21514528, + "step": 101940 + }, + { + "epoch": 11.215071507150714, + "grad_norm": 0.1667044311761856, + "learning_rate": 2.406227355694318e-05, + "loss": 0.1312, + "num_input_tokens_seen": 21515584, + "step": 101945 + }, + { + "epoch": 11.215621562156215, + "grad_norm": 0.39623403549194336, + "learning_rate": 2.4059875182497095e-05, + "loss": 0.0121, + "num_input_tokens_seen": 21516672, + "step": 101950 + }, + { + "epoch": 11.216171617161717, + "grad_norm": 0.009658848866820335, + "learning_rate": 2.4057476816715696e-05, + "loss": 0.0035, + "num_input_tokens_seen": 21517728, + "step": 101955 + }, + { + "epoch": 11.216721672167218, + "grad_norm": 1.375274419784546, + "learning_rate": 2.405507845962107e-05, + "loss": 0.0168, + "num_input_tokens_seen": 21518784, + "step": 101960 + }, + { + "epoch": 11.217271727172717, + "grad_norm": 1.0537946224212646, + "learning_rate": 2.4052680111235338e-05, + "loss": 0.1147, + "num_input_tokens_seen": 21519840, + "step": 101965 + }, + { + "epoch": 11.217821782178218, + "grad_norm": 1.2482659816741943, + "learning_rate": 2.405028177158059e-05, + "loss": 0.0803, + "num_input_tokens_seen": 21520832, + "step": 101970 + }, + { + "epoch": 11.218371837183719, + "grad_norm": 0.03965750336647034, + "learning_rate": 2.4047883440678948e-05, + "loss": 0.0053, + "num_input_tokens_seen": 21521952, + "step": 101975 + }, + { + "epoch": 11.218921892189218, + "grad_norm": 0.04307697340846062, + "learning_rate": 2.404548511855251e-05, + "loss": 0.0446, + "num_input_tokens_seen": 21523008, + "step": 101980 + }, + { + "epoch": 11.21947194719472, + "grad_norm": 0.7614293098449707, + "learning_rate": 2.404308680522337e-05, + "loss": 0.0084, + "num_input_tokens_seen": 21524064, + "step": 101985 + }, + { + "epoch": 11.22002200220022, + "grad_norm": 0.040246978402137756, + "learning_rate": 2.4040688500713642e-05, + "loss": 0.0153, + "num_input_tokens_seen": 21525184, + "step": 101990 + }, + { + "epoch": 11.22057205720572, + "grad_norm": 0.007425480056554079, + "learning_rate": 2.4038290205045425e-05, + "loss": 0.0136, + "num_input_tokens_seen": 21526240, + "step": 101995 + }, + { + "epoch": 11.221122112211221, + "grad_norm": 0.11154361069202423, + "learning_rate": 2.4035891918240835e-05, + "loss": 0.1146, + "num_input_tokens_seen": 21527328, + "step": 102000 + }, + { + "epoch": 11.221672167216722, + "grad_norm": 0.05154334008693695, + "learning_rate": 2.403349364032196e-05, + "loss": 0.0021, + "num_input_tokens_seen": 21528384, + "step": 102005 + }, + { + "epoch": 11.222222222222221, + "grad_norm": 0.036439839750528336, + "learning_rate": 2.4031095371310906e-05, + "loss": 0.0123, + "num_input_tokens_seen": 21529408, + "step": 102010 + }, + { + "epoch": 11.222772277227723, + "grad_norm": 0.13122552633285522, + "learning_rate": 2.4028697111229796e-05, + "loss": 0.0117, + "num_input_tokens_seen": 21530496, + "step": 102015 + }, + { + "epoch": 11.223322332233224, + "grad_norm": 0.03660675883293152, + "learning_rate": 2.4026298860100702e-05, + "loss": 0.037, + "num_input_tokens_seen": 21531552, + "step": 102020 + }, + { + "epoch": 11.223872387238725, + "grad_norm": 0.01072074193507433, + "learning_rate": 2.4023900617945753e-05, + "loss": 0.05, + "num_input_tokens_seen": 21532640, + "step": 102025 + }, + { + "epoch": 11.224422442244224, + "grad_norm": 0.13062842190265656, + "learning_rate": 2.4021502384787047e-05, + "loss": 0.0387, + "num_input_tokens_seen": 21533728, + "step": 102030 + }, + { + "epoch": 11.224972497249725, + "grad_norm": 1.6382232904434204, + "learning_rate": 2.401910416064667e-05, + "loss": 0.1616, + "num_input_tokens_seen": 21534752, + "step": 102035 + }, + { + "epoch": 11.225522552255226, + "grad_norm": 0.10250327736139297, + "learning_rate": 2.4016705945546752e-05, + "loss": 0.0018, + "num_input_tokens_seen": 21535840, + "step": 102040 + }, + { + "epoch": 11.226072607260726, + "grad_norm": 0.0656147226691246, + "learning_rate": 2.4014307739509376e-05, + "loss": 0.0018, + "num_input_tokens_seen": 21536832, + "step": 102045 + }, + { + "epoch": 11.226622662266227, + "grad_norm": 0.06246262043714523, + "learning_rate": 2.4011909542556655e-05, + "loss": 0.0973, + "num_input_tokens_seen": 21537888, + "step": 102050 + }, + { + "epoch": 11.227172717271728, + "grad_norm": 0.020611796528100967, + "learning_rate": 2.4009511354710698e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21538976, + "step": 102055 + }, + { + "epoch": 11.227722772277227, + "grad_norm": 0.16818450391292572, + "learning_rate": 2.4007113175993587e-05, + "loss": 0.0547, + "num_input_tokens_seen": 21540064, + "step": 102060 + }, + { + "epoch": 11.228272827282728, + "grad_norm": 0.030171198770403862, + "learning_rate": 2.4004715006427444e-05, + "loss": 0.0701, + "num_input_tokens_seen": 21541152, + "step": 102065 + }, + { + "epoch": 11.22882288228823, + "grad_norm": 0.7561671733856201, + "learning_rate": 2.400231684603436e-05, + "loss": 0.0843, + "num_input_tokens_seen": 21542240, + "step": 102070 + }, + { + "epoch": 11.229372937293729, + "grad_norm": 0.31421083211898804, + "learning_rate": 2.399991869483645e-05, + "loss": 0.008, + "num_input_tokens_seen": 21543296, + "step": 102075 + }, + { + "epoch": 11.22992299229923, + "grad_norm": 0.01476006768643856, + "learning_rate": 2.399752055285581e-05, + "loss": 0.0054, + "num_input_tokens_seen": 21544288, + "step": 102080 + }, + { + "epoch": 11.23047304730473, + "grad_norm": 1.1890416145324707, + "learning_rate": 2.3995122420114532e-05, + "loss": 0.0094, + "num_input_tokens_seen": 21545280, + "step": 102085 + }, + { + "epoch": 11.231023102310232, + "grad_norm": 0.028444338589906693, + "learning_rate": 2.399272429663474e-05, + "loss": 0.0095, + "num_input_tokens_seen": 21546336, + "step": 102090 + }, + { + "epoch": 11.231573157315731, + "grad_norm": 0.026803404092788696, + "learning_rate": 2.399032618243851e-05, + "loss": 0.0122, + "num_input_tokens_seen": 21547360, + "step": 102095 + }, + { + "epoch": 11.232123212321232, + "grad_norm": 0.02069985494017601, + "learning_rate": 2.3987928077547975e-05, + "loss": 0.0109, + "num_input_tokens_seen": 21548416, + "step": 102100 + }, + { + "epoch": 11.232673267326733, + "grad_norm": 0.09623821079730988, + "learning_rate": 2.3985529981985217e-05, + "loss": 0.0093, + "num_input_tokens_seen": 21549504, + "step": 102105 + }, + { + "epoch": 11.233223322332233, + "grad_norm": 0.022936591878533363, + "learning_rate": 2.3983131895772333e-05, + "loss": 0.0046, + "num_input_tokens_seen": 21550560, + "step": 102110 + }, + { + "epoch": 11.233773377337734, + "grad_norm": 0.029942693188786507, + "learning_rate": 2.398073381893145e-05, + "loss": 0.0009, + "num_input_tokens_seen": 21551648, + "step": 102115 + }, + { + "epoch": 11.234323432343235, + "grad_norm": 0.9143533706665039, + "learning_rate": 2.397833575148464e-05, + "loss": 0.052, + "num_input_tokens_seen": 21552768, + "step": 102120 + }, + { + "epoch": 11.234873487348734, + "grad_norm": 0.021457543596625328, + "learning_rate": 2.3975937693454025e-05, + "loss": 0.0044, + "num_input_tokens_seen": 21553888, + "step": 102125 + }, + { + "epoch": 11.235423542354235, + "grad_norm": 0.6954705119132996, + "learning_rate": 2.3973539644861707e-05, + "loss": 0.0127, + "num_input_tokens_seen": 21554880, + "step": 102130 + }, + { + "epoch": 11.235973597359736, + "grad_norm": 0.07398252189159393, + "learning_rate": 2.3971141605729775e-05, + "loss": 0.0042, + "num_input_tokens_seen": 21555936, + "step": 102135 + }, + { + "epoch": 11.236523652365236, + "grad_norm": 0.012961046770215034, + "learning_rate": 2.3968743576080335e-05, + "loss": 0.0011, + "num_input_tokens_seen": 21556992, + "step": 102140 + }, + { + "epoch": 11.237073707370737, + "grad_norm": 0.03960118070244789, + "learning_rate": 2.39663455559355e-05, + "loss": 0.0043, + "num_input_tokens_seen": 21558080, + "step": 102145 + }, + { + "epoch": 11.237623762376238, + "grad_norm": 1.0503900051116943, + "learning_rate": 2.396394754531735e-05, + "loss": 0.0268, + "num_input_tokens_seen": 21559104, + "step": 102150 + }, + { + "epoch": 11.238173817381739, + "grad_norm": 1.1872553825378418, + "learning_rate": 2.3961549544248005e-05, + "loss": 0.0122, + "num_input_tokens_seen": 21560096, + "step": 102155 + }, + { + "epoch": 11.238723872387238, + "grad_norm": 0.014062144793570042, + "learning_rate": 2.3959151552749553e-05, + "loss": 0.0061, + "num_input_tokens_seen": 21561152, + "step": 102160 + }, + { + "epoch": 11.23927392739274, + "grad_norm": 1.581251859664917, + "learning_rate": 2.3956753570844108e-05, + "loss": 0.0737, + "num_input_tokens_seen": 21562240, + "step": 102165 + }, + { + "epoch": 11.23982398239824, + "grad_norm": 0.014954343438148499, + "learning_rate": 2.395435559855377e-05, + "loss": 0.0174, + "num_input_tokens_seen": 21563296, + "step": 102170 + }, + { + "epoch": 11.24037403740374, + "grad_norm": 0.37512677907943726, + "learning_rate": 2.3951957635900623e-05, + "loss": 0.011, + "num_input_tokens_seen": 21564288, + "step": 102175 + }, + { + "epoch": 11.24092409240924, + "grad_norm": 0.31067708134651184, + "learning_rate": 2.3949559682906785e-05, + "loss": 0.0081, + "num_input_tokens_seen": 21565312, + "step": 102180 + }, + { + "epoch": 11.241474147414742, + "grad_norm": 0.10316066443920135, + "learning_rate": 2.394716173959435e-05, + "loss": 0.0094, + "num_input_tokens_seen": 21566368, + "step": 102185 + }, + { + "epoch": 11.242024202420241, + "grad_norm": 0.012418567202985287, + "learning_rate": 2.3944763805985426e-05, + "loss": 0.0056, + "num_input_tokens_seen": 21567456, + "step": 102190 + }, + { + "epoch": 11.242574257425742, + "grad_norm": 0.01256357878446579, + "learning_rate": 2.3942365882102104e-05, + "loss": 0.0022, + "num_input_tokens_seen": 21568480, + "step": 102195 + }, + { + "epoch": 11.243124312431243, + "grad_norm": 0.10745158046483994, + "learning_rate": 2.393996796796648e-05, + "loss": 0.0049, + "num_input_tokens_seen": 21569472, + "step": 102200 + }, + { + "epoch": 11.243674367436745, + "grad_norm": 0.03516131266951561, + "learning_rate": 2.393757006360068e-05, + "loss": 0.0418, + "num_input_tokens_seen": 21570592, + "step": 102205 + }, + { + "epoch": 11.244224422442244, + "grad_norm": 0.02927681803703308, + "learning_rate": 2.393517216902677e-05, + "loss": 0.01, + "num_input_tokens_seen": 21571616, + "step": 102210 + }, + { + "epoch": 11.244774477447745, + "grad_norm": 0.011776388622820377, + "learning_rate": 2.3932774284266874e-05, + "loss": 0.0039, + "num_input_tokens_seen": 21572672, + "step": 102215 + }, + { + "epoch": 11.245324532453246, + "grad_norm": 0.03482244536280632, + "learning_rate": 2.3930376409343086e-05, + "loss": 0.0196, + "num_input_tokens_seen": 21573664, + "step": 102220 + }, + { + "epoch": 11.245874587458745, + "grad_norm": 0.02992936410009861, + "learning_rate": 2.39279785442775e-05, + "loss": 0.1755, + "num_input_tokens_seen": 21574720, + "step": 102225 + }, + { + "epoch": 11.246424642464246, + "grad_norm": 0.020671522244811058, + "learning_rate": 2.392558068909223e-05, + "loss": 0.0021, + "num_input_tokens_seen": 21575808, + "step": 102230 + }, + { + "epoch": 11.246974697469748, + "grad_norm": 0.10797102004289627, + "learning_rate": 2.3923182843809362e-05, + "loss": 0.0049, + "num_input_tokens_seen": 21576896, + "step": 102235 + }, + { + "epoch": 11.247524752475247, + "grad_norm": 0.11765769124031067, + "learning_rate": 2.3920785008451e-05, + "loss": 0.0342, + "num_input_tokens_seen": 21577952, + "step": 102240 + }, + { + "epoch": 11.248074807480748, + "grad_norm": 0.06309673190116882, + "learning_rate": 2.3918387183039255e-05, + "loss": 0.0026, + "num_input_tokens_seen": 21579008, + "step": 102245 + }, + { + "epoch": 11.248624862486249, + "grad_norm": 0.002882968867197633, + "learning_rate": 2.39159893675962e-05, + "loss": 0.0049, + "num_input_tokens_seen": 21580064, + "step": 102250 + }, + { + "epoch": 11.249174917491748, + "grad_norm": 0.047451265156269073, + "learning_rate": 2.3913591562143962e-05, + "loss": 0.1067, + "num_input_tokens_seen": 21581088, + "step": 102255 + }, + { + "epoch": 11.24972497249725, + "grad_norm": 0.02979354001581669, + "learning_rate": 2.391119376670462e-05, + "loss": 0.032, + "num_input_tokens_seen": 21582144, + "step": 102260 + }, + { + "epoch": 11.25027502750275, + "grad_norm": 0.11271710693836212, + "learning_rate": 2.3908795981300297e-05, + "loss": 0.0521, + "num_input_tokens_seen": 21583200, + "step": 102265 + }, + { + "epoch": 11.250825082508252, + "grad_norm": 0.05328061059117317, + "learning_rate": 2.390639820595307e-05, + "loss": 0.0101, + "num_input_tokens_seen": 21584256, + "step": 102270 + }, + { + "epoch": 11.251375137513751, + "grad_norm": 1.309892177581787, + "learning_rate": 2.390400044068504e-05, + "loss": 0.0496, + "num_input_tokens_seen": 21585312, + "step": 102275 + }, + { + "epoch": 11.251925192519252, + "grad_norm": 0.12582510709762573, + "learning_rate": 2.3901602685518317e-05, + "loss": 0.006, + "num_input_tokens_seen": 21586400, + "step": 102280 + }, + { + "epoch": 11.252475247524753, + "grad_norm": 0.11960765719413757, + "learning_rate": 2.3899204940474994e-05, + "loss": 0.0025, + "num_input_tokens_seen": 21587456, + "step": 102285 + }, + { + "epoch": 11.253025302530252, + "grad_norm": 0.43732669949531555, + "learning_rate": 2.3896807205577177e-05, + "loss": 0.0078, + "num_input_tokens_seen": 21588544, + "step": 102290 + }, + { + "epoch": 11.253575357535754, + "grad_norm": 0.11326460540294647, + "learning_rate": 2.3894409480846954e-05, + "loss": 0.0082, + "num_input_tokens_seen": 21589600, + "step": 102295 + }, + { + "epoch": 11.254125412541255, + "grad_norm": 0.0027907006442546844, + "learning_rate": 2.3892011766306422e-05, + "loss": 0.003, + "num_input_tokens_seen": 21590720, + "step": 102300 + }, + { + "epoch": 11.254675467546754, + "grad_norm": 1.7843964099884033, + "learning_rate": 2.38896140619777e-05, + "loss": 0.0261, + "num_input_tokens_seen": 21591776, + "step": 102305 + }, + { + "epoch": 11.255225522552255, + "grad_norm": 1.4828517436981201, + "learning_rate": 2.3887216367882863e-05, + "loss": 0.0706, + "num_input_tokens_seen": 21592832, + "step": 102310 + }, + { + "epoch": 11.255775577557756, + "grad_norm": 0.01918579451739788, + "learning_rate": 2.388481868404402e-05, + "loss": 0.0011, + "num_input_tokens_seen": 21593952, + "step": 102315 + }, + { + "epoch": 11.256325632563255, + "grad_norm": 0.23802903294563293, + "learning_rate": 2.3882421010483274e-05, + "loss": 0.0093, + "num_input_tokens_seen": 21595072, + "step": 102320 + }, + { + "epoch": 11.256875687568757, + "grad_norm": 0.008431646041572094, + "learning_rate": 2.3880023347222706e-05, + "loss": 0.036, + "num_input_tokens_seen": 21596160, + "step": 102325 + }, + { + "epoch": 11.257425742574258, + "grad_norm": 0.08752138167619705, + "learning_rate": 2.3877625694284435e-05, + "loss": 0.0013, + "num_input_tokens_seen": 21597216, + "step": 102330 + }, + { + "epoch": 11.257975797579759, + "grad_norm": 0.08269595354795456, + "learning_rate": 2.387522805169055e-05, + "loss": 0.0057, + "num_input_tokens_seen": 21598272, + "step": 102335 + }, + { + "epoch": 11.258525852585258, + "grad_norm": 0.017999546602368355, + "learning_rate": 2.387283041946313e-05, + "loss": 0.0007, + "num_input_tokens_seen": 21599360, + "step": 102340 + }, + { + "epoch": 11.25907590759076, + "grad_norm": 0.043941840529441833, + "learning_rate": 2.3870432797624315e-05, + "loss": 0.0022, + "num_input_tokens_seen": 21600416, + "step": 102345 + }, + { + "epoch": 11.25962596259626, + "grad_norm": 0.004205808043479919, + "learning_rate": 2.3868035186196163e-05, + "loss": 0.0116, + "num_input_tokens_seen": 21601408, + "step": 102350 + }, + { + "epoch": 11.26017601760176, + "grad_norm": 0.0029227794148027897, + "learning_rate": 2.3865637585200794e-05, + "loss": 0.0951, + "num_input_tokens_seen": 21602464, + "step": 102355 + }, + { + "epoch": 11.26072607260726, + "grad_norm": 0.014957691542804241, + "learning_rate": 2.3863239994660306e-05, + "loss": 0.0027, + "num_input_tokens_seen": 21603488, + "step": 102360 + }, + { + "epoch": 11.261276127612762, + "grad_norm": 0.00995259452611208, + "learning_rate": 2.3860842414596775e-05, + "loss": 0.0036, + "num_input_tokens_seen": 21604544, + "step": 102365 + }, + { + "epoch": 11.261826182618261, + "grad_norm": 0.5175904631614685, + "learning_rate": 2.3858444845032322e-05, + "loss": 0.0083, + "num_input_tokens_seen": 21605632, + "step": 102370 + }, + { + "epoch": 11.262376237623762, + "grad_norm": 0.3185720145702362, + "learning_rate": 2.3856047285989028e-05, + "loss": 0.0119, + "num_input_tokens_seen": 21606624, + "step": 102375 + }, + { + "epoch": 11.262926292629263, + "grad_norm": 0.011424344033002853, + "learning_rate": 2.3853649737489007e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21607680, + "step": 102380 + }, + { + "epoch": 11.263476347634764, + "grad_norm": 1.7863560914993286, + "learning_rate": 2.3851252199554344e-05, + "loss": 0.0755, + "num_input_tokens_seen": 21608704, + "step": 102385 + }, + { + "epoch": 11.264026402640264, + "grad_norm": 0.012616724707186222, + "learning_rate": 2.384885467220713e-05, + "loss": 0.1124, + "num_input_tokens_seen": 21609792, + "step": 102390 + }, + { + "epoch": 11.264576457645765, + "grad_norm": 1.4841326475143433, + "learning_rate": 2.384645715546948e-05, + "loss": 0.0455, + "num_input_tokens_seen": 21610816, + "step": 102395 + }, + { + "epoch": 11.265126512651266, + "grad_norm": 0.008617301471531391, + "learning_rate": 2.384405964936347e-05, + "loss": 0.0014, + "num_input_tokens_seen": 21611904, + "step": 102400 + }, + { + "epoch": 11.265676567656765, + "grad_norm": 0.03549350053071976, + "learning_rate": 2.384166215391122e-05, + "loss": 0.0242, + "num_input_tokens_seen": 21612960, + "step": 102405 + }, + { + "epoch": 11.266226622662266, + "grad_norm": 0.20958714187145233, + "learning_rate": 2.3839264669134807e-05, + "loss": 0.0043, + "num_input_tokens_seen": 21613952, + "step": 102410 + }, + { + "epoch": 11.266776677667767, + "grad_norm": 1.4165135622024536, + "learning_rate": 2.3836867195056335e-05, + "loss": 0.0665, + "num_input_tokens_seen": 21614976, + "step": 102415 + }, + { + "epoch": 11.267326732673267, + "grad_norm": 0.1314622312784195, + "learning_rate": 2.3834469731697908e-05, + "loss": 0.1268, + "num_input_tokens_seen": 21615968, + "step": 102420 + }, + { + "epoch": 11.267876787678768, + "grad_norm": 0.012033705599606037, + "learning_rate": 2.38320722790816e-05, + "loss": 0.1991, + "num_input_tokens_seen": 21617056, + "step": 102425 + }, + { + "epoch": 11.268426842684269, + "grad_norm": 0.15483391284942627, + "learning_rate": 2.382967483722953e-05, + "loss": 0.0363, + "num_input_tokens_seen": 21618144, + "step": 102430 + }, + { + "epoch": 11.268976897689768, + "grad_norm": 0.25241297483444214, + "learning_rate": 2.382727740616379e-05, + "loss": 0.0085, + "num_input_tokens_seen": 21619136, + "step": 102435 + }, + { + "epoch": 11.26952695269527, + "grad_norm": 0.06431432068347931, + "learning_rate": 2.382487998590646e-05, + "loss": 0.0213, + "num_input_tokens_seen": 21620192, + "step": 102440 + }, + { + "epoch": 11.27007700770077, + "grad_norm": 0.09793446213006973, + "learning_rate": 2.382248257647965e-05, + "loss": 0.002, + "num_input_tokens_seen": 21621280, + "step": 102445 + }, + { + "epoch": 11.270627062706271, + "grad_norm": 1.924953579902649, + "learning_rate": 2.3820085177905455e-05, + "loss": 0.08, + "num_input_tokens_seen": 21622304, + "step": 102450 + }, + { + "epoch": 11.27117711771177, + "grad_norm": 0.011721715331077576, + "learning_rate": 2.3817687790205976e-05, + "loss": 0.0036, + "num_input_tokens_seen": 21623360, + "step": 102455 + }, + { + "epoch": 11.271727172717272, + "grad_norm": 0.049392618238925934, + "learning_rate": 2.3815290413403295e-05, + "loss": 0.0144, + "num_input_tokens_seen": 21624384, + "step": 102460 + }, + { + "epoch": 11.272277227722773, + "grad_norm": 0.03870410844683647, + "learning_rate": 2.381289304751951e-05, + "loss": 0.0039, + "num_input_tokens_seen": 21625408, + "step": 102465 + }, + { + "epoch": 11.272827282728272, + "grad_norm": 0.09475761651992798, + "learning_rate": 2.3810495692576725e-05, + "loss": 0.0275, + "num_input_tokens_seen": 21626528, + "step": 102470 + }, + { + "epoch": 11.273377337733773, + "grad_norm": 0.020482804626226425, + "learning_rate": 2.380809834859702e-05, + "loss": 0.008, + "num_input_tokens_seen": 21627552, + "step": 102475 + }, + { + "epoch": 11.273927392739274, + "grad_norm": 0.03746416047215462, + "learning_rate": 2.3805701015602517e-05, + "loss": 0.0016, + "num_input_tokens_seen": 21628576, + "step": 102480 + }, + { + "epoch": 11.274477447744774, + "grad_norm": 1.5691285133361816, + "learning_rate": 2.3803303693615288e-05, + "loss": 0.0654, + "num_input_tokens_seen": 21629600, + "step": 102485 + }, + { + "epoch": 11.275027502750275, + "grad_norm": 0.04344860091805458, + "learning_rate": 2.3800906382657428e-05, + "loss": 0.0079, + "num_input_tokens_seen": 21630656, + "step": 102490 + }, + { + "epoch": 11.275577557755776, + "grad_norm": 0.020582664757966995, + "learning_rate": 2.3798509082751054e-05, + "loss": 0.0019, + "num_input_tokens_seen": 21631712, + "step": 102495 + }, + { + "epoch": 11.276127612761275, + "grad_norm": 0.3483966290950775, + "learning_rate": 2.379611179391823e-05, + "loss": 0.0062, + "num_input_tokens_seen": 21632736, + "step": 102500 + }, + { + "epoch": 11.276677667766776, + "grad_norm": 0.029959386214613914, + "learning_rate": 2.379371451618107e-05, + "loss": 0.0073, + "num_input_tokens_seen": 21633824, + "step": 102505 + }, + { + "epoch": 11.277227722772277, + "grad_norm": 0.047708481550216675, + "learning_rate": 2.3791317249561673e-05, + "loss": 0.0088, + "num_input_tokens_seen": 21634912, + "step": 102510 + }, + { + "epoch": 11.277777777777779, + "grad_norm": 0.004692847840487957, + "learning_rate": 2.378891999408211e-05, + "loss": 0.0037, + "num_input_tokens_seen": 21635968, + "step": 102515 + }, + { + "epoch": 11.278327832783278, + "grad_norm": 0.014193840324878693, + "learning_rate": 2.3786522749764502e-05, + "loss": 0.0081, + "num_input_tokens_seen": 21637024, + "step": 102520 + }, + { + "epoch": 11.278877887788779, + "grad_norm": 1.6803785562515259, + "learning_rate": 2.3784125516630927e-05, + "loss": 0.0211, + "num_input_tokens_seen": 21638112, + "step": 102525 + }, + { + "epoch": 11.27942794279428, + "grad_norm": 0.2208956927061081, + "learning_rate": 2.378172829470348e-05, + "loss": 0.0047, + "num_input_tokens_seen": 21639168, + "step": 102530 + }, + { + "epoch": 11.27997799779978, + "grad_norm": 0.10523922741413116, + "learning_rate": 2.377933108400427e-05, + "loss": 0.0401, + "num_input_tokens_seen": 21640192, + "step": 102535 + }, + { + "epoch": 11.28052805280528, + "grad_norm": 0.2421211153268814, + "learning_rate": 2.3776933884555368e-05, + "loss": 0.0722, + "num_input_tokens_seen": 21641184, + "step": 102540 + }, + { + "epoch": 11.281078107810782, + "grad_norm": 0.004143417812883854, + "learning_rate": 2.3774536696378887e-05, + "loss": 0.0019, + "num_input_tokens_seen": 21642240, + "step": 102545 + }, + { + "epoch": 11.281628162816281, + "grad_norm": 0.9197034239768982, + "learning_rate": 2.3772139519496917e-05, + "loss": 0.052, + "num_input_tokens_seen": 21643296, + "step": 102550 + }, + { + "epoch": 11.282178217821782, + "grad_norm": 2.4179840087890625, + "learning_rate": 2.3769742353931537e-05, + "loss": 0.0918, + "num_input_tokens_seen": 21644352, + "step": 102555 + }, + { + "epoch": 11.282728272827283, + "grad_norm": 0.058603353798389435, + "learning_rate": 2.3767345199704858e-05, + "loss": 0.0019, + "num_input_tokens_seen": 21645408, + "step": 102560 + }, + { + "epoch": 11.283278327832782, + "grad_norm": 0.0620080828666687, + "learning_rate": 2.3764948056838958e-05, + "loss": 0.0561, + "num_input_tokens_seen": 21646528, + "step": 102565 + }, + { + "epoch": 11.283828382838283, + "grad_norm": 0.010606848634779453, + "learning_rate": 2.3762550925355954e-05, + "loss": 0.0532, + "num_input_tokens_seen": 21647584, + "step": 102570 + }, + { + "epoch": 11.284378437843785, + "grad_norm": 0.03143291547894478, + "learning_rate": 2.3760153805277914e-05, + "loss": 0.0319, + "num_input_tokens_seen": 21648640, + "step": 102575 + }, + { + "epoch": 11.284928492849286, + "grad_norm": 0.0198556799441576, + "learning_rate": 2.375775669662694e-05, + "loss": 0.003, + "num_input_tokens_seen": 21649728, + "step": 102580 + }, + { + "epoch": 11.285478547854785, + "grad_norm": 0.025721674785017967, + "learning_rate": 2.375535959942513e-05, + "loss": 0.0061, + "num_input_tokens_seen": 21650752, + "step": 102585 + }, + { + "epoch": 11.286028602860286, + "grad_norm": 2.008861541748047, + "learning_rate": 2.375296251369457e-05, + "loss": 0.061, + "num_input_tokens_seen": 21651776, + "step": 102590 + }, + { + "epoch": 11.286578657865787, + "grad_norm": 0.02193126268684864, + "learning_rate": 2.3750565439457366e-05, + "loss": 0.0172, + "num_input_tokens_seen": 21652832, + "step": 102595 + }, + { + "epoch": 11.287128712871286, + "grad_norm": 0.04138047248125076, + "learning_rate": 2.3748168376735593e-05, + "loss": 0.0056, + "num_input_tokens_seen": 21653920, + "step": 102600 + }, + { + "epoch": 11.287678767876788, + "grad_norm": 0.023409221321344376, + "learning_rate": 2.374577132555135e-05, + "loss": 0.0235, + "num_input_tokens_seen": 21655104, + "step": 102605 + }, + { + "epoch": 11.288228822882289, + "grad_norm": 0.016271043568849564, + "learning_rate": 2.3743374285926738e-05, + "loss": 0.005, + "num_input_tokens_seen": 21656128, + "step": 102610 + }, + { + "epoch": 11.288778877887788, + "grad_norm": 0.010734576731920242, + "learning_rate": 2.3740977257883835e-05, + "loss": 0.0947, + "num_input_tokens_seen": 21657184, + "step": 102615 + }, + { + "epoch": 11.289328932893289, + "grad_norm": 0.020076682791113853, + "learning_rate": 2.3738580241444745e-05, + "loss": 0.0082, + "num_input_tokens_seen": 21658208, + "step": 102620 + }, + { + "epoch": 11.28987898789879, + "grad_norm": 0.01907912828028202, + "learning_rate": 2.3736183236631558e-05, + "loss": 0.0739, + "num_input_tokens_seen": 21659328, + "step": 102625 + }, + { + "epoch": 11.290429042904291, + "grad_norm": 0.016107460483908653, + "learning_rate": 2.3733786243466354e-05, + "loss": 0.0081, + "num_input_tokens_seen": 21660416, + "step": 102630 + }, + { + "epoch": 11.29097909790979, + "grad_norm": 0.02650385908782482, + "learning_rate": 2.373138926197124e-05, + "loss": 0.0283, + "num_input_tokens_seen": 21661472, + "step": 102635 + }, + { + "epoch": 11.291529152915292, + "grad_norm": 0.2056630253791809, + "learning_rate": 2.3728992292168302e-05, + "loss": 0.0464, + "num_input_tokens_seen": 21662592, + "step": 102640 + }, + { + "epoch": 11.292079207920793, + "grad_norm": 0.8074004054069519, + "learning_rate": 2.372659533407963e-05, + "loss": 0.0243, + "num_input_tokens_seen": 21663584, + "step": 102645 + }, + { + "epoch": 11.292629262926292, + "grad_norm": 0.024282651022076607, + "learning_rate": 2.372419838772733e-05, + "loss": 0.0146, + "num_input_tokens_seen": 21664640, + "step": 102650 + }, + { + "epoch": 11.293179317931793, + "grad_norm": 0.11436320841312408, + "learning_rate": 2.372180145313346e-05, + "loss": 0.007, + "num_input_tokens_seen": 21665664, + "step": 102655 + }, + { + "epoch": 11.293729372937294, + "grad_norm": 0.21305906772613525, + "learning_rate": 2.371940453032015e-05, + "loss": 0.0262, + "num_input_tokens_seen": 21666784, + "step": 102660 + }, + { + "epoch": 11.294279427942794, + "grad_norm": 2.7709591388702393, + "learning_rate": 2.3717007619309463e-05, + "loss": 0.0412, + "num_input_tokens_seen": 21667808, + "step": 102665 + }, + { + "epoch": 11.294829482948295, + "grad_norm": 0.06071880832314491, + "learning_rate": 2.371461072012351e-05, + "loss": 0.014, + "num_input_tokens_seen": 21668832, + "step": 102670 + }, + { + "epoch": 11.295379537953796, + "grad_norm": 0.8038432002067566, + "learning_rate": 2.3712213832784374e-05, + "loss": 0.0607, + "num_input_tokens_seen": 21669856, + "step": 102675 + }, + { + "epoch": 11.295929592959295, + "grad_norm": 0.03555351495742798, + "learning_rate": 2.3709816957314135e-05, + "loss": 0.0015, + "num_input_tokens_seen": 21670848, + "step": 102680 + }, + { + "epoch": 11.296479647964796, + "grad_norm": 1.405415415763855, + "learning_rate": 2.3707420093734906e-05, + "loss": 0.0207, + "num_input_tokens_seen": 21671872, + "step": 102685 + }, + { + "epoch": 11.297029702970297, + "grad_norm": 0.017364708706736565, + "learning_rate": 2.3705023242068756e-05, + "loss": 0.0044, + "num_input_tokens_seen": 21672960, + "step": 102690 + }, + { + "epoch": 11.297579757975798, + "grad_norm": 0.05141562968492508, + "learning_rate": 2.3702626402337787e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21674048, + "step": 102695 + }, + { + "epoch": 11.298129812981298, + "grad_norm": 0.0066948519088327885, + "learning_rate": 2.3700229574564098e-05, + "loss": 0.0628, + "num_input_tokens_seen": 21675136, + "step": 102700 + }, + { + "epoch": 11.298679867986799, + "grad_norm": 0.10288870334625244, + "learning_rate": 2.369783275876975e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21676192, + "step": 102705 + }, + { + "epoch": 11.2992299229923, + "grad_norm": 1.5237271785736084, + "learning_rate": 2.369543595497687e-05, + "loss": 0.0066, + "num_input_tokens_seen": 21677280, + "step": 102710 + }, + { + "epoch": 11.2997799779978, + "grad_norm": 0.00206250068731606, + "learning_rate": 2.369303916320753e-05, + "loss": 0.0098, + "num_input_tokens_seen": 21678368, + "step": 102715 + }, + { + "epoch": 11.3003300330033, + "grad_norm": 0.12209667265415192, + "learning_rate": 2.3690642383483807e-05, + "loss": 0.0146, + "num_input_tokens_seen": 21679456, + "step": 102720 + }, + { + "epoch": 11.300880088008801, + "grad_norm": 0.10936641693115234, + "learning_rate": 2.368824561582782e-05, + "loss": 0.0197, + "num_input_tokens_seen": 21680512, + "step": 102725 + }, + { + "epoch": 11.3014301430143, + "grad_norm": 1.491557240486145, + "learning_rate": 2.3685848860261634e-05, + "loss": 0.048, + "num_input_tokens_seen": 21681568, + "step": 102730 + }, + { + "epoch": 11.301980198019802, + "grad_norm": 0.16863307356834412, + "learning_rate": 2.368345211680735e-05, + "loss": 0.0066, + "num_input_tokens_seen": 21682656, + "step": 102735 + }, + { + "epoch": 11.302530253025303, + "grad_norm": 0.21992090344429016, + "learning_rate": 2.3681055385487066e-05, + "loss": 0.0111, + "num_input_tokens_seen": 21683712, + "step": 102740 + }, + { + "epoch": 11.303080308030804, + "grad_norm": 0.012823386117815971, + "learning_rate": 2.3678658666322848e-05, + "loss": 0.0521, + "num_input_tokens_seen": 21684864, + "step": 102745 + }, + { + "epoch": 11.303630363036303, + "grad_norm": 0.029954781755805016, + "learning_rate": 2.3676261959336803e-05, + "loss": 0.1775, + "num_input_tokens_seen": 21685952, + "step": 102750 + }, + { + "epoch": 11.304180418041804, + "grad_norm": 0.030460139736533165, + "learning_rate": 2.3673865264551013e-05, + "loss": 0.0025, + "num_input_tokens_seen": 21686976, + "step": 102755 + }, + { + "epoch": 11.304730473047305, + "grad_norm": 0.9038428068161011, + "learning_rate": 2.3671468581987577e-05, + "loss": 0.0498, + "num_input_tokens_seen": 21688064, + "step": 102760 + }, + { + "epoch": 11.305280528052805, + "grad_norm": 0.14808320999145508, + "learning_rate": 2.3669071911668582e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21689120, + "step": 102765 + }, + { + "epoch": 11.305830583058306, + "grad_norm": 2.9187958240509033, + "learning_rate": 2.36666752536161e-05, + "loss": 0.0491, + "num_input_tokens_seen": 21690240, + "step": 102770 + }, + { + "epoch": 11.306380638063807, + "grad_norm": 0.010962839238345623, + "learning_rate": 2.3664278607852242e-05, + "loss": 0.094, + "num_input_tokens_seen": 21691296, + "step": 102775 + }, + { + "epoch": 11.306930693069306, + "grad_norm": 0.05930709093809128, + "learning_rate": 2.3661881974399082e-05, + "loss": 0.0188, + "num_input_tokens_seen": 21692320, + "step": 102780 + }, + { + "epoch": 11.307480748074807, + "grad_norm": 1.2182449102401733, + "learning_rate": 2.3659485353278718e-05, + "loss": 0.025, + "num_input_tokens_seen": 21693376, + "step": 102785 + }, + { + "epoch": 11.308030803080309, + "grad_norm": 0.5066784620285034, + "learning_rate": 2.3657088744513234e-05, + "loss": 0.1447, + "num_input_tokens_seen": 21694432, + "step": 102790 + }, + { + "epoch": 11.308580858085808, + "grad_norm": 0.024264203384518623, + "learning_rate": 2.3654692148124714e-05, + "loss": 0.0022, + "num_input_tokens_seen": 21695488, + "step": 102795 + }, + { + "epoch": 11.309130913091309, + "grad_norm": 0.009310578927397728, + "learning_rate": 2.3652295564135256e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21696576, + "step": 102800 + }, + { + "epoch": 11.30968096809681, + "grad_norm": 0.024532107636332512, + "learning_rate": 2.3649898992566936e-05, + "loss": 0.0206, + "num_input_tokens_seen": 21697664, + "step": 102805 + }, + { + "epoch": 11.310231023102311, + "grad_norm": 0.5810040831565857, + "learning_rate": 2.3647502433441856e-05, + "loss": 0.0133, + "num_input_tokens_seen": 21698752, + "step": 102810 + }, + { + "epoch": 11.31078107810781, + "grad_norm": 0.06668919324874878, + "learning_rate": 2.36451058867821e-05, + "loss": 0.0018, + "num_input_tokens_seen": 21699744, + "step": 102815 + }, + { + "epoch": 11.311331133113312, + "grad_norm": 4.259121894836426, + "learning_rate": 2.3642709352609737e-05, + "loss": 0.0949, + "num_input_tokens_seen": 21700704, + "step": 102820 + }, + { + "epoch": 11.311881188118813, + "grad_norm": 0.009847529232501984, + "learning_rate": 2.364031283094689e-05, + "loss": 0.0077, + "num_input_tokens_seen": 21701728, + "step": 102825 + }, + { + "epoch": 11.312431243124312, + "grad_norm": 0.0027961290907114744, + "learning_rate": 2.3637916321815613e-05, + "loss": 0.0028, + "num_input_tokens_seen": 21702880, + "step": 102830 + }, + { + "epoch": 11.312981298129813, + "grad_norm": 3.9801721572875977, + "learning_rate": 2.3635519825238013e-05, + "loss": 0.0949, + "num_input_tokens_seen": 21703968, + "step": 102835 + }, + { + "epoch": 11.313531353135314, + "grad_norm": 2.599700450897217, + "learning_rate": 2.363312334123618e-05, + "loss": 0.0217, + "num_input_tokens_seen": 21705024, + "step": 102840 + }, + { + "epoch": 11.314081408140813, + "grad_norm": 0.020246554166078568, + "learning_rate": 2.363072686983218e-05, + "loss": 0.0061, + "num_input_tokens_seen": 21706112, + "step": 102845 + }, + { + "epoch": 11.314631463146315, + "grad_norm": 0.008753837086260319, + "learning_rate": 2.362833041104812e-05, + "loss": 0.0359, + "num_input_tokens_seen": 21707168, + "step": 102850 + }, + { + "epoch": 11.315181518151816, + "grad_norm": 1.9110454320907593, + "learning_rate": 2.3625933964906073e-05, + "loss": 0.1794, + "num_input_tokens_seen": 21708256, + "step": 102855 + }, + { + "epoch": 11.315731573157315, + "grad_norm": 0.01918848231434822, + "learning_rate": 2.3623537531428145e-05, + "loss": 0.0977, + "num_input_tokens_seen": 21709312, + "step": 102860 + }, + { + "epoch": 11.316281628162816, + "grad_norm": 1.7793365716934204, + "learning_rate": 2.3621141110636406e-05, + "loss": 0.021, + "num_input_tokens_seen": 21710336, + "step": 102865 + }, + { + "epoch": 11.316831683168317, + "grad_norm": 0.04935193434357643, + "learning_rate": 2.3618744702552943e-05, + "loss": 0.0814, + "num_input_tokens_seen": 21711360, + "step": 102870 + }, + { + "epoch": 11.317381738173818, + "grad_norm": 1.5530415773391724, + "learning_rate": 2.361634830719986e-05, + "loss": 0.1986, + "num_input_tokens_seen": 21712416, + "step": 102875 + }, + { + "epoch": 11.317931793179318, + "grad_norm": 0.016878638416528702, + "learning_rate": 2.361395192459921e-05, + "loss": 0.0259, + "num_input_tokens_seen": 21713504, + "step": 102880 + }, + { + "epoch": 11.318481848184819, + "grad_norm": 0.18303793668746948, + "learning_rate": 2.3611555554773122e-05, + "loss": 0.0031, + "num_input_tokens_seen": 21714528, + "step": 102885 + }, + { + "epoch": 11.31903190319032, + "grad_norm": 0.02136271446943283, + "learning_rate": 2.360915919774365e-05, + "loss": 0.0104, + "num_input_tokens_seen": 21715584, + "step": 102890 + }, + { + "epoch": 11.319581958195819, + "grad_norm": 0.006477795075625181, + "learning_rate": 2.3606762853532883e-05, + "loss": 0.0037, + "num_input_tokens_seen": 21716672, + "step": 102895 + }, + { + "epoch": 11.32013201320132, + "grad_norm": 0.5142863392829895, + "learning_rate": 2.360436652216293e-05, + "loss": 0.0167, + "num_input_tokens_seen": 21717792, + "step": 102900 + }, + { + "epoch": 11.320682068206821, + "grad_norm": 0.008167154155671597, + "learning_rate": 2.3601970203655855e-05, + "loss": 0.0016, + "num_input_tokens_seen": 21718848, + "step": 102905 + }, + { + "epoch": 11.32123212321232, + "grad_norm": 0.8275127410888672, + "learning_rate": 2.3599573898033746e-05, + "loss": 0.0374, + "num_input_tokens_seen": 21719872, + "step": 102910 + }, + { + "epoch": 11.321782178217822, + "grad_norm": 0.014572721906006336, + "learning_rate": 2.3597177605318698e-05, + "loss": 0.0316, + "num_input_tokens_seen": 21720928, + "step": 102915 + }, + { + "epoch": 11.322332233223323, + "grad_norm": 0.10570729523897171, + "learning_rate": 2.3594781325532784e-05, + "loss": 0.026, + "num_input_tokens_seen": 21722016, + "step": 102920 + }, + { + "epoch": 11.322882288228822, + "grad_norm": 0.019963588565587997, + "learning_rate": 2.35923850586981e-05, + "loss": 0.0023, + "num_input_tokens_seen": 21723072, + "step": 102925 + }, + { + "epoch": 11.323432343234323, + "grad_norm": 0.05051998049020767, + "learning_rate": 2.3589988804836734e-05, + "loss": 0.0011, + "num_input_tokens_seen": 21724160, + "step": 102930 + }, + { + "epoch": 11.323982398239824, + "grad_norm": 0.051557887345552444, + "learning_rate": 2.358759256397075e-05, + "loss": 0.1957, + "num_input_tokens_seen": 21725184, + "step": 102935 + }, + { + "epoch": 11.324532453245325, + "grad_norm": 2.705932855606079, + "learning_rate": 2.3585196336122255e-05, + "loss": 0.1712, + "num_input_tokens_seen": 21726240, + "step": 102940 + }, + { + "epoch": 11.325082508250825, + "grad_norm": 0.03699856251478195, + "learning_rate": 2.3582800121313326e-05, + "loss": 0.0013, + "num_input_tokens_seen": 21727200, + "step": 102945 + }, + { + "epoch": 11.325632563256326, + "grad_norm": 0.08440495282411575, + "learning_rate": 2.3580403919566046e-05, + "loss": 0.0096, + "num_input_tokens_seen": 21728192, + "step": 102950 + }, + { + "epoch": 11.326182618261827, + "grad_norm": 0.03958327695727348, + "learning_rate": 2.3578007730902514e-05, + "loss": 0.0019, + "num_input_tokens_seen": 21729216, + "step": 102955 + }, + { + "epoch": 11.326732673267326, + "grad_norm": 0.15896083414554596, + "learning_rate": 2.3575611555344787e-05, + "loss": 0.0036, + "num_input_tokens_seen": 21730240, + "step": 102960 + }, + { + "epoch": 11.327282728272827, + "grad_norm": 2.815225839614868, + "learning_rate": 2.357321539291497e-05, + "loss": 0.094, + "num_input_tokens_seen": 21731296, + "step": 102965 + }, + { + "epoch": 11.327832783278328, + "grad_norm": 0.04341066628694534, + "learning_rate": 2.357081924363514e-05, + "loss": 0.0633, + "num_input_tokens_seen": 21732288, + "step": 102970 + }, + { + "epoch": 11.328382838283828, + "grad_norm": 1.4588631391525269, + "learning_rate": 2.3568423107527393e-05, + "loss": 0.0163, + "num_input_tokens_seen": 21733312, + "step": 102975 + }, + { + "epoch": 11.328932893289329, + "grad_norm": 0.00894833542406559, + "learning_rate": 2.3566026984613797e-05, + "loss": 0.0092, + "num_input_tokens_seen": 21734304, + "step": 102980 + }, + { + "epoch": 11.32948294829483, + "grad_norm": 0.017393808811903, + "learning_rate": 2.3563630874916436e-05, + "loss": 0.0337, + "num_input_tokens_seen": 21735392, + "step": 102985 + }, + { + "epoch": 11.33003300330033, + "grad_norm": 0.008232545107603073, + "learning_rate": 2.3561234778457414e-05, + "loss": 0.0026, + "num_input_tokens_seen": 21736448, + "step": 102990 + }, + { + "epoch": 11.33058305830583, + "grad_norm": 0.014238876290619373, + "learning_rate": 2.3558838695258788e-05, + "loss": 0.1019, + "num_input_tokens_seen": 21737472, + "step": 102995 + }, + { + "epoch": 11.331133113311331, + "grad_norm": 0.13961079716682434, + "learning_rate": 2.3556442625342658e-05, + "loss": 0.1037, + "num_input_tokens_seen": 21738496, + "step": 103000 + }, + { + "epoch": 11.331683168316832, + "grad_norm": 0.22626087069511414, + "learning_rate": 2.3554046568731105e-05, + "loss": 0.0038, + "num_input_tokens_seen": 21739520, + "step": 103005 + }, + { + "epoch": 11.332233223322332, + "grad_norm": 0.01179434172809124, + "learning_rate": 2.3551650525446206e-05, + "loss": 0.0473, + "num_input_tokens_seen": 21740544, + "step": 103010 + }, + { + "epoch": 11.332783278327833, + "grad_norm": 0.036753296852111816, + "learning_rate": 2.354925449551006e-05, + "loss": 0.1001, + "num_input_tokens_seen": 21741600, + "step": 103015 + }, + { + "epoch": 11.333333333333334, + "grad_norm": 0.15820327401161194, + "learning_rate": 2.354685847894473e-05, + "loss": 0.0601, + "num_input_tokens_seen": 21742656, + "step": 103020 + }, + { + "epoch": 11.333883388338833, + "grad_norm": 0.48633483052253723, + "learning_rate": 2.354446247577231e-05, + "loss": 0.0696, + "num_input_tokens_seen": 21743744, + "step": 103025 + }, + { + "epoch": 11.334433443344334, + "grad_norm": 0.01598537154495716, + "learning_rate": 2.3542066486014888e-05, + "loss": 0.0105, + "num_input_tokens_seen": 21744736, + "step": 103030 + }, + { + "epoch": 11.334983498349835, + "grad_norm": 0.6902298331260681, + "learning_rate": 2.353967050969453e-05, + "loss": 0.0681, + "num_input_tokens_seen": 21745728, + "step": 103035 + }, + { + "epoch": 11.335533553355335, + "grad_norm": 1.2021408081054688, + "learning_rate": 2.353727454683333e-05, + "loss": 0.0442, + "num_input_tokens_seen": 21746784, + "step": 103040 + }, + { + "epoch": 11.336083608360836, + "grad_norm": 0.06235876679420471, + "learning_rate": 2.3534878597453365e-05, + "loss": 0.0491, + "num_input_tokens_seen": 21747840, + "step": 103045 + }, + { + "epoch": 11.336633663366337, + "grad_norm": 0.05186881870031357, + "learning_rate": 2.3532482661576733e-05, + "loss": 0.002, + "num_input_tokens_seen": 21748800, + "step": 103050 + }, + { + "epoch": 11.337183718371838, + "grad_norm": 0.011524797417223454, + "learning_rate": 2.3530086739225496e-05, + "loss": 0.0492, + "num_input_tokens_seen": 21749888, + "step": 103055 + }, + { + "epoch": 11.337733773377337, + "grad_norm": 0.016377365216612816, + "learning_rate": 2.352769083042174e-05, + "loss": 0.031, + "num_input_tokens_seen": 21751040, + "step": 103060 + }, + { + "epoch": 11.338283828382838, + "grad_norm": 0.5926912426948547, + "learning_rate": 2.352529493518756e-05, + "loss": 0.0084, + "num_input_tokens_seen": 21752064, + "step": 103065 + }, + { + "epoch": 11.33883388338834, + "grad_norm": 0.2595479488372803, + "learning_rate": 2.352289905354502e-05, + "loss": 0.0092, + "num_input_tokens_seen": 21753120, + "step": 103070 + }, + { + "epoch": 11.339383938393839, + "grad_norm": 0.09127770364284515, + "learning_rate": 2.3520503185516222e-05, + "loss": 0.1113, + "num_input_tokens_seen": 21754080, + "step": 103075 + }, + { + "epoch": 11.33993399339934, + "grad_norm": 0.3156004548072815, + "learning_rate": 2.3518107331123228e-05, + "loss": 0.0813, + "num_input_tokens_seen": 21755104, + "step": 103080 + }, + { + "epoch": 11.340484048404841, + "grad_norm": 0.06636641174554825, + "learning_rate": 2.3515711490388125e-05, + "loss": 0.0072, + "num_input_tokens_seen": 21756192, + "step": 103085 + }, + { + "epoch": 11.34103410341034, + "grad_norm": 0.011566031724214554, + "learning_rate": 2.351331566333301e-05, + "loss": 0.0119, + "num_input_tokens_seen": 21757216, + "step": 103090 + }, + { + "epoch": 11.341584158415841, + "grad_norm": 0.011315643787384033, + "learning_rate": 2.351091984997994e-05, + "loss": 0.0226, + "num_input_tokens_seen": 21758272, + "step": 103095 + }, + { + "epoch": 11.342134213421343, + "grad_norm": 0.12029213458299637, + "learning_rate": 2.3508524050351012e-05, + "loss": 0.0631, + "num_input_tokens_seen": 21759392, + "step": 103100 + }, + { + "epoch": 11.342684268426842, + "grad_norm": 0.7253268957138062, + "learning_rate": 2.3506128264468306e-05, + "loss": 0.0202, + "num_input_tokens_seen": 21760416, + "step": 103105 + }, + { + "epoch": 11.343234323432343, + "grad_norm": 0.051295969635248184, + "learning_rate": 2.3503732492353887e-05, + "loss": 0.0026, + "num_input_tokens_seen": 21761472, + "step": 103110 + }, + { + "epoch": 11.343784378437844, + "grad_norm": 0.09853937476873398, + "learning_rate": 2.3501336734029855e-05, + "loss": 0.0043, + "num_input_tokens_seen": 21762528, + "step": 103115 + }, + { + "epoch": 11.344334433443345, + "grad_norm": 1.2323919534683228, + "learning_rate": 2.3498940989518288e-05, + "loss": 0.0294, + "num_input_tokens_seen": 21763648, + "step": 103120 + }, + { + "epoch": 11.344884488448844, + "grad_norm": 0.03523518890142441, + "learning_rate": 2.3496545258841245e-05, + "loss": 0.0021, + "num_input_tokens_seen": 21764768, + "step": 103125 + }, + { + "epoch": 11.345434543454346, + "grad_norm": 4.075846195220947, + "learning_rate": 2.349414954202084e-05, + "loss": 0.0321, + "num_input_tokens_seen": 21765824, + "step": 103130 + }, + { + "epoch": 11.345984598459847, + "grad_norm": 0.06481122970581055, + "learning_rate": 2.3491753839079124e-05, + "loss": 0.0047, + "num_input_tokens_seen": 21766848, + "step": 103135 + }, + { + "epoch": 11.346534653465346, + "grad_norm": 0.2855651080608368, + "learning_rate": 2.3489358150038196e-05, + "loss": 0.0182, + "num_input_tokens_seen": 21767872, + "step": 103140 + }, + { + "epoch": 11.347084708470847, + "grad_norm": 0.21418903768062592, + "learning_rate": 2.3486962474920135e-05, + "loss": 0.0041, + "num_input_tokens_seen": 21768960, + "step": 103145 + }, + { + "epoch": 11.347634763476348, + "grad_norm": 0.10058620572090149, + "learning_rate": 2.3484566813747e-05, + "loss": 0.0103, + "num_input_tokens_seen": 21770080, + "step": 103150 + }, + { + "epoch": 11.348184818481847, + "grad_norm": 0.05307411402463913, + "learning_rate": 2.34821711665409e-05, + "loss": 0.0046, + "num_input_tokens_seen": 21771104, + "step": 103155 + }, + { + "epoch": 11.348734873487349, + "grad_norm": 1.6669201850891113, + "learning_rate": 2.3479775533323887e-05, + "loss": 0.0252, + "num_input_tokens_seen": 21772192, + "step": 103160 + }, + { + "epoch": 11.34928492849285, + "grad_norm": 0.9188798069953918, + "learning_rate": 2.3477379914118064e-05, + "loss": 0.0303, + "num_input_tokens_seen": 21773280, + "step": 103165 + }, + { + "epoch": 11.34983498349835, + "grad_norm": 0.02304792031645775, + "learning_rate": 2.3474984308945497e-05, + "loss": 0.0195, + "num_input_tokens_seen": 21774336, + "step": 103170 + }, + { + "epoch": 11.35038503850385, + "grad_norm": 0.043260931968688965, + "learning_rate": 2.3472588717828263e-05, + "loss": 0.0015, + "num_input_tokens_seen": 21775360, + "step": 103175 + }, + { + "epoch": 11.350935093509351, + "grad_norm": 3.6983487606048584, + "learning_rate": 2.3470193140788455e-05, + "loss": 0.1209, + "num_input_tokens_seen": 21776448, + "step": 103180 + }, + { + "epoch": 11.351485148514852, + "grad_norm": 0.06864892691373825, + "learning_rate": 2.3467797577848124e-05, + "loss": 0.0188, + "num_input_tokens_seen": 21777536, + "step": 103185 + }, + { + "epoch": 11.352035203520352, + "grad_norm": 0.3275740444660187, + "learning_rate": 2.3465402029029387e-05, + "loss": 0.0051, + "num_input_tokens_seen": 21778528, + "step": 103190 + }, + { + "epoch": 11.352585258525853, + "grad_norm": 0.015347618609666824, + "learning_rate": 2.34630064943543e-05, + "loss": 0.0457, + "num_input_tokens_seen": 21779552, + "step": 103195 + }, + { + "epoch": 11.353135313531354, + "grad_norm": 0.0043120249174535275, + "learning_rate": 2.3460610973844934e-05, + "loss": 0.0028, + "num_input_tokens_seen": 21780544, + "step": 103200 + }, + { + "epoch": 11.353685368536853, + "grad_norm": 0.023597687482833862, + "learning_rate": 2.3458215467523392e-05, + "loss": 0.0942, + "num_input_tokens_seen": 21781536, + "step": 103205 + }, + { + "epoch": 11.354235423542354, + "grad_norm": 0.2214713990688324, + "learning_rate": 2.3455819975411726e-05, + "loss": 0.004, + "num_input_tokens_seen": 21782560, + "step": 103210 + }, + { + "epoch": 11.354785478547855, + "grad_norm": 0.9999486804008484, + "learning_rate": 2.3453424497532032e-05, + "loss": 0.0319, + "num_input_tokens_seen": 21783616, + "step": 103215 + }, + { + "epoch": 11.355335533553355, + "grad_norm": 0.008636715821921825, + "learning_rate": 2.345102903390639e-05, + "loss": 0.0019, + "num_input_tokens_seen": 21784704, + "step": 103220 + }, + { + "epoch": 11.355885588558856, + "grad_norm": 0.2763420045375824, + "learning_rate": 2.3448633584556856e-05, + "loss": 0.0323, + "num_input_tokens_seen": 21785824, + "step": 103225 + }, + { + "epoch": 11.356435643564357, + "grad_norm": 0.012774339877068996, + "learning_rate": 2.344623814950553e-05, + "loss": 0.0023, + "num_input_tokens_seen": 21786848, + "step": 103230 + }, + { + "epoch": 11.356985698569858, + "grad_norm": 0.5763223767280579, + "learning_rate": 2.3443842728774472e-05, + "loss": 0.0138, + "num_input_tokens_seen": 21788000, + "step": 103235 + }, + { + "epoch": 11.357535753575357, + "grad_norm": 2.686319589614868, + "learning_rate": 2.3441447322385785e-05, + "loss": 0.1183, + "num_input_tokens_seen": 21789024, + "step": 103240 + }, + { + "epoch": 11.358085808580858, + "grad_norm": 0.07161567360162735, + "learning_rate": 2.343905193036152e-05, + "loss": 0.071, + "num_input_tokens_seen": 21790048, + "step": 103245 + }, + { + "epoch": 11.35863586358636, + "grad_norm": 1.7780133485794067, + "learning_rate": 2.3436656552723762e-05, + "loss": 0.0578, + "num_input_tokens_seen": 21791072, + "step": 103250 + }, + { + "epoch": 11.359185918591859, + "grad_norm": 0.7520585060119629, + "learning_rate": 2.3434261189494593e-05, + "loss": 0.0069, + "num_input_tokens_seen": 21792160, + "step": 103255 + }, + { + "epoch": 11.35973597359736, + "grad_norm": 2.5535924434661865, + "learning_rate": 2.3431865840696086e-05, + "loss": 0.0196, + "num_input_tokens_seen": 21793216, + "step": 103260 + }, + { + "epoch": 11.36028602860286, + "grad_norm": 1.0180960893630981, + "learning_rate": 2.3429470506350327e-05, + "loss": 0.016, + "num_input_tokens_seen": 21794176, + "step": 103265 + }, + { + "epoch": 11.36083608360836, + "grad_norm": 1.266756296157837, + "learning_rate": 2.3427075186479382e-05, + "loss": 0.063, + "num_input_tokens_seen": 21795296, + "step": 103270 + }, + { + "epoch": 11.361386138613861, + "grad_norm": 0.01602022349834442, + "learning_rate": 2.342467988110532e-05, + "loss": 0.0156, + "num_input_tokens_seen": 21796320, + "step": 103275 + }, + { + "epoch": 11.361936193619362, + "grad_norm": 0.008125453256070614, + "learning_rate": 2.3422284590250244e-05, + "loss": 0.0015, + "num_input_tokens_seen": 21797376, + "step": 103280 + }, + { + "epoch": 11.362486248624862, + "grad_norm": 0.011282296851277351, + "learning_rate": 2.34198893139362e-05, + "loss": 0.0623, + "num_input_tokens_seen": 21798432, + "step": 103285 + }, + { + "epoch": 11.363036303630363, + "grad_norm": 2.187601327896118, + "learning_rate": 2.341749405218528e-05, + "loss": 0.0658, + "num_input_tokens_seen": 21799520, + "step": 103290 + }, + { + "epoch": 11.363586358635864, + "grad_norm": 0.055250681936740875, + "learning_rate": 2.341509880501957e-05, + "loss": 0.0026, + "num_input_tokens_seen": 21800576, + "step": 103295 + }, + { + "epoch": 11.364136413641365, + "grad_norm": 0.33064594864845276, + "learning_rate": 2.3412703572461114e-05, + "loss": 0.1179, + "num_input_tokens_seen": 21801600, + "step": 103300 + }, + { + "epoch": 11.364686468646864, + "grad_norm": 0.006164306774735451, + "learning_rate": 2.3410308354532024e-05, + "loss": 0.0011, + "num_input_tokens_seen": 21802624, + "step": 103305 + }, + { + "epoch": 11.365236523652365, + "grad_norm": 0.0050236680544912815, + "learning_rate": 2.3407913151254354e-05, + "loss": 0.0044, + "num_input_tokens_seen": 21803648, + "step": 103310 + }, + { + "epoch": 11.365786578657866, + "grad_norm": 0.03855860233306885, + "learning_rate": 2.3405517962650178e-05, + "loss": 0.0379, + "num_input_tokens_seen": 21804704, + "step": 103315 + }, + { + "epoch": 11.366336633663366, + "grad_norm": 0.09110481292009354, + "learning_rate": 2.340312278874159e-05, + "loss": 0.0077, + "num_input_tokens_seen": 21805728, + "step": 103320 + }, + { + "epoch": 11.366886688668867, + "grad_norm": 0.017694557085633278, + "learning_rate": 2.3400727629550643e-05, + "loss": 0.0071, + "num_input_tokens_seen": 21806784, + "step": 103325 + }, + { + "epoch": 11.367436743674368, + "grad_norm": 0.09838727861642838, + "learning_rate": 2.3398332485099428e-05, + "loss": 0.004, + "num_input_tokens_seen": 21807904, + "step": 103330 + }, + { + "epoch": 11.367986798679867, + "grad_norm": 0.009210693649947643, + "learning_rate": 2.3395937355410016e-05, + "loss": 0.0012, + "num_input_tokens_seen": 21808992, + "step": 103335 + }, + { + "epoch": 11.368536853685368, + "grad_norm": 0.03573215380311012, + "learning_rate": 2.3393542240504472e-05, + "loss": 0.0111, + "num_input_tokens_seen": 21809984, + "step": 103340 + }, + { + "epoch": 11.36908690869087, + "grad_norm": 0.7617582082748413, + "learning_rate": 2.339114714040488e-05, + "loss": 0.0133, + "num_input_tokens_seen": 21811040, + "step": 103345 + }, + { + "epoch": 11.369636963696369, + "grad_norm": 0.3285391628742218, + "learning_rate": 2.338875205513331e-05, + "loss": 0.0036, + "num_input_tokens_seen": 21812128, + "step": 103350 + }, + { + "epoch": 11.37018701870187, + "grad_norm": 0.07964076101779938, + "learning_rate": 2.338635698471185e-05, + "loss": 0.0428, + "num_input_tokens_seen": 21813216, + "step": 103355 + }, + { + "epoch": 11.370737073707371, + "grad_norm": 0.1563349962234497, + "learning_rate": 2.3383961929162558e-05, + "loss": 0.0119, + "num_input_tokens_seen": 21814240, + "step": 103360 + }, + { + "epoch": 11.371287128712872, + "grad_norm": 0.011877964250743389, + "learning_rate": 2.338156688850751e-05, + "loss": 0.0993, + "num_input_tokens_seen": 21815296, + "step": 103365 + }, + { + "epoch": 11.371837183718371, + "grad_norm": 0.025406308472156525, + "learning_rate": 2.3379171862768785e-05, + "loss": 0.0358, + "num_input_tokens_seen": 21816352, + "step": 103370 + }, + { + "epoch": 11.372387238723872, + "grad_norm": 0.008250374346971512, + "learning_rate": 2.3376776851968453e-05, + "loss": 0.0017, + "num_input_tokens_seen": 21817472, + "step": 103375 + }, + { + "epoch": 11.372937293729374, + "grad_norm": 0.015754610300064087, + "learning_rate": 2.3374381856128596e-05, + "loss": 0.0045, + "num_input_tokens_seen": 21818432, + "step": 103380 + }, + { + "epoch": 11.373487348734873, + "grad_norm": 0.030677499249577522, + "learning_rate": 2.337198687527128e-05, + "loss": 0.0068, + "num_input_tokens_seen": 21819488, + "step": 103385 + }, + { + "epoch": 11.374037403740374, + "grad_norm": 1.5513190031051636, + "learning_rate": 2.3369591909418573e-05, + "loss": 0.033, + "num_input_tokens_seen": 21820480, + "step": 103390 + }, + { + "epoch": 11.374587458745875, + "grad_norm": 1.788036823272705, + "learning_rate": 2.3367196958592565e-05, + "loss": 0.1002, + "num_input_tokens_seen": 21821536, + "step": 103395 + }, + { + "epoch": 11.375137513751374, + "grad_norm": 0.07103219628334045, + "learning_rate": 2.336480202281531e-05, + "loss": 0.0028, + "num_input_tokens_seen": 21822560, + "step": 103400 + }, + { + "epoch": 11.375687568756875, + "grad_norm": 0.020104842260479927, + "learning_rate": 2.3362407102108898e-05, + "loss": 0.0684, + "num_input_tokens_seen": 21823584, + "step": 103405 + }, + { + "epoch": 11.376237623762377, + "grad_norm": 0.02162649668753147, + "learning_rate": 2.3360012196495397e-05, + "loss": 0.0202, + "num_input_tokens_seen": 21824640, + "step": 103410 + }, + { + "epoch": 11.376787678767876, + "grad_norm": 0.033606503158807755, + "learning_rate": 2.3357617305996864e-05, + "loss": 0.1039, + "num_input_tokens_seen": 21825696, + "step": 103415 + }, + { + "epoch": 11.377337733773377, + "grad_norm": 0.21067489683628082, + "learning_rate": 2.335522243063539e-05, + "loss": 0.0076, + "num_input_tokens_seen": 21826784, + "step": 103420 + }, + { + "epoch": 11.377887788778878, + "grad_norm": 0.21638309955596924, + "learning_rate": 2.3352827570433036e-05, + "loss": 0.0314, + "num_input_tokens_seen": 21827744, + "step": 103425 + }, + { + "epoch": 11.37843784378438, + "grad_norm": 0.023437820374965668, + "learning_rate": 2.3350432725411888e-05, + "loss": 0.0017, + "num_input_tokens_seen": 21828800, + "step": 103430 + }, + { + "epoch": 11.378987898789878, + "grad_norm": 0.01833382248878479, + "learning_rate": 2.3348037895594012e-05, + "loss": 0.0081, + "num_input_tokens_seen": 21829824, + "step": 103435 + }, + { + "epoch": 11.37953795379538, + "grad_norm": 0.17608322203159332, + "learning_rate": 2.334564308100147e-05, + "loss": 0.01, + "num_input_tokens_seen": 21830944, + "step": 103440 + }, + { + "epoch": 11.38008800880088, + "grad_norm": 0.08966917544603348, + "learning_rate": 2.3343248281656346e-05, + "loss": 0.0438, + "num_input_tokens_seen": 21832064, + "step": 103445 + }, + { + "epoch": 11.38063806380638, + "grad_norm": 1.4627954959869385, + "learning_rate": 2.33408534975807e-05, + "loss": 0.0848, + "num_input_tokens_seen": 21833120, + "step": 103450 + }, + { + "epoch": 11.381188118811881, + "grad_norm": 0.10850898921489716, + "learning_rate": 2.3338458728796626e-05, + "loss": 0.0049, + "num_input_tokens_seen": 21834112, + "step": 103455 + }, + { + "epoch": 11.381738173817382, + "grad_norm": 0.07088375091552734, + "learning_rate": 2.333606397532617e-05, + "loss": 0.129, + "num_input_tokens_seen": 21835104, + "step": 103460 + }, + { + "epoch": 11.382288228822881, + "grad_norm": 0.024864112958312035, + "learning_rate": 2.3333669237191416e-05, + "loss": 0.0011, + "num_input_tokens_seen": 21836192, + "step": 103465 + }, + { + "epoch": 11.382838283828383, + "grad_norm": 0.012315606698393822, + "learning_rate": 2.333127451441444e-05, + "loss": 0.0993, + "num_input_tokens_seen": 21837216, + "step": 103470 + }, + { + "epoch": 11.383388338833884, + "grad_norm": 1.0549829006195068, + "learning_rate": 2.3328879807017297e-05, + "loss": 0.0715, + "num_input_tokens_seen": 21838240, + "step": 103475 + }, + { + "epoch": 11.383938393839385, + "grad_norm": 0.014293328858911991, + "learning_rate": 2.3326485115022067e-05, + "loss": 0.0041, + "num_input_tokens_seen": 21839264, + "step": 103480 + }, + { + "epoch": 11.384488448844884, + "grad_norm": 0.05747874826192856, + "learning_rate": 2.3324090438450832e-05, + "loss": 0.0022, + "num_input_tokens_seen": 21840320, + "step": 103485 + }, + { + "epoch": 11.385038503850385, + "grad_norm": 0.038550153374671936, + "learning_rate": 2.332169577732563e-05, + "loss": 0.0026, + "num_input_tokens_seen": 21841344, + "step": 103490 + }, + { + "epoch": 11.385588558855886, + "grad_norm": 0.016639117151498795, + "learning_rate": 2.3319301131668576e-05, + "loss": 0.0329, + "num_input_tokens_seen": 21842464, + "step": 103495 + }, + { + "epoch": 11.386138613861386, + "grad_norm": 0.011377126909792423, + "learning_rate": 2.3316906501501706e-05, + "loss": 0.0043, + "num_input_tokens_seen": 21843520, + "step": 103500 + }, + { + "epoch": 11.386688668866887, + "grad_norm": 0.012605970725417137, + "learning_rate": 2.3314511886847096e-05, + "loss": 0.0071, + "num_input_tokens_seen": 21844544, + "step": 103505 + }, + { + "epoch": 11.387238723872388, + "grad_norm": 0.026110297068953514, + "learning_rate": 2.3312117287726833e-05, + "loss": 0.0022, + "num_input_tokens_seen": 21845568, + "step": 103510 + }, + { + "epoch": 11.387788778877887, + "grad_norm": 0.05477702245116234, + "learning_rate": 2.3309722704162963e-05, + "loss": 0.0369, + "num_input_tokens_seen": 21846624, + "step": 103515 + }, + { + "epoch": 11.388338833883388, + "grad_norm": 0.06716704368591309, + "learning_rate": 2.3307328136177575e-05, + "loss": 0.0272, + "num_input_tokens_seen": 21847680, + "step": 103520 + }, + { + "epoch": 11.38888888888889, + "grad_norm": 0.27547648549079895, + "learning_rate": 2.3304933583792737e-05, + "loss": 0.0088, + "num_input_tokens_seen": 21848672, + "step": 103525 + }, + { + "epoch": 11.389438943894389, + "grad_norm": 0.3502192199230194, + "learning_rate": 2.33025390470305e-05, + "loss": 0.0138, + "num_input_tokens_seen": 21849760, + "step": 103530 + }, + { + "epoch": 11.38998899889989, + "grad_norm": 0.05016893893480301, + "learning_rate": 2.3300144525912953e-05, + "loss": 0.0036, + "num_input_tokens_seen": 21850784, + "step": 103535 + }, + { + "epoch": 11.39053905390539, + "grad_norm": 0.052260108292102814, + "learning_rate": 2.3297750020462153e-05, + "loss": 0.0935, + "num_input_tokens_seen": 21851840, + "step": 103540 + }, + { + "epoch": 11.391089108910892, + "grad_norm": 0.27410459518432617, + "learning_rate": 2.3295355530700176e-05, + "loss": 0.0052, + "num_input_tokens_seen": 21852864, + "step": 103545 + }, + { + "epoch": 11.391639163916391, + "grad_norm": 1.5311061143875122, + "learning_rate": 2.3292961056649098e-05, + "loss": 0.0163, + "num_input_tokens_seen": 21853888, + "step": 103550 + }, + { + "epoch": 11.392189218921892, + "grad_norm": 0.037666112184524536, + "learning_rate": 2.329056659833097e-05, + "loss": 0.0121, + "num_input_tokens_seen": 21854944, + "step": 103555 + }, + { + "epoch": 11.392739273927393, + "grad_norm": 0.02537870965898037, + "learning_rate": 2.328817215576787e-05, + "loss": 0.0858, + "num_input_tokens_seen": 21856032, + "step": 103560 + }, + { + "epoch": 11.393289328932893, + "grad_norm": 0.9182928800582886, + "learning_rate": 2.3285777728981864e-05, + "loss": 0.0134, + "num_input_tokens_seen": 21857024, + "step": 103565 + }, + { + "epoch": 11.393839383938394, + "grad_norm": 0.025513246655464172, + "learning_rate": 2.3283383317995032e-05, + "loss": 0.0057, + "num_input_tokens_seen": 21858080, + "step": 103570 + }, + { + "epoch": 11.394389438943895, + "grad_norm": 0.0070592062547802925, + "learning_rate": 2.3280988922829426e-05, + "loss": 0.003, + "num_input_tokens_seen": 21859136, + "step": 103575 + }, + { + "epoch": 11.394939493949394, + "grad_norm": 0.2999928295612335, + "learning_rate": 2.3278594543507115e-05, + "loss": 0.1272, + "num_input_tokens_seen": 21860192, + "step": 103580 + }, + { + "epoch": 11.395489548954895, + "grad_norm": 0.23134855926036835, + "learning_rate": 2.3276200180050184e-05, + "loss": 0.0185, + "num_input_tokens_seen": 21861312, + "step": 103585 + }, + { + "epoch": 11.396039603960396, + "grad_norm": 0.009667892009019852, + "learning_rate": 2.327380583248068e-05, + "loss": 0.0021, + "num_input_tokens_seen": 21862336, + "step": 103590 + }, + { + "epoch": 11.396589658965897, + "grad_norm": 0.04290822148323059, + "learning_rate": 2.3271411500820677e-05, + "loss": 0.0027, + "num_input_tokens_seen": 21863328, + "step": 103595 + }, + { + "epoch": 11.397139713971397, + "grad_norm": 1.8464221954345703, + "learning_rate": 2.3269017185092257e-05, + "loss": 0.0379, + "num_input_tokens_seen": 21864384, + "step": 103600 + }, + { + "epoch": 11.397689768976898, + "grad_norm": 1.5379395484924316, + "learning_rate": 2.3266622885317456e-05, + "loss": 0.0693, + "num_input_tokens_seen": 21865440, + "step": 103605 + }, + { + "epoch": 11.398239823982399, + "grad_norm": 0.8859720230102539, + "learning_rate": 2.3264228601518377e-05, + "loss": 0.0366, + "num_input_tokens_seen": 21866496, + "step": 103610 + }, + { + "epoch": 11.398789878987898, + "grad_norm": 1.1098941564559937, + "learning_rate": 2.326183433371706e-05, + "loss": 0.0864, + "num_input_tokens_seen": 21867552, + "step": 103615 + }, + { + "epoch": 11.3993399339934, + "grad_norm": 0.04694022238254547, + "learning_rate": 2.3259440081935586e-05, + "loss": 0.0052, + "num_input_tokens_seen": 21868544, + "step": 103620 + }, + { + "epoch": 11.3998899889989, + "grad_norm": 0.04480663686990738, + "learning_rate": 2.3257045846196025e-05, + "loss": 0.0012, + "num_input_tokens_seen": 21869568, + "step": 103625 + }, + { + "epoch": 11.4004400440044, + "grad_norm": 0.007130542770028114, + "learning_rate": 2.3254651626520424e-05, + "loss": 0.0078, + "num_input_tokens_seen": 21870656, + "step": 103630 + }, + { + "epoch": 11.400990099009901, + "grad_norm": 0.41739141941070557, + "learning_rate": 2.325225742293087e-05, + "loss": 0.0073, + "num_input_tokens_seen": 21871680, + "step": 103635 + }, + { + "epoch": 11.401540154015402, + "grad_norm": 0.04346293956041336, + "learning_rate": 2.324986323544941e-05, + "loss": 0.0056, + "num_input_tokens_seen": 21872736, + "step": 103640 + }, + { + "epoch": 11.402090209020901, + "grad_norm": 0.009313976392149925, + "learning_rate": 2.3247469064098133e-05, + "loss": 0.0211, + "num_input_tokens_seen": 21873792, + "step": 103645 + }, + { + "epoch": 11.402640264026402, + "grad_norm": 1.0461417436599731, + "learning_rate": 2.3245074908899086e-05, + "loss": 0.1041, + "num_input_tokens_seen": 21874880, + "step": 103650 + }, + { + "epoch": 11.403190319031903, + "grad_norm": 1.5107533931732178, + "learning_rate": 2.324268076987434e-05, + "loss": 0.0467, + "num_input_tokens_seen": 21875936, + "step": 103655 + }, + { + "epoch": 11.403740374037405, + "grad_norm": 0.025326654314994812, + "learning_rate": 2.3240286647045968e-05, + "loss": 0.0076, + "num_input_tokens_seen": 21876960, + "step": 103660 + }, + { + "epoch": 11.404290429042904, + "grad_norm": 0.002075592987239361, + "learning_rate": 2.3237892540436017e-05, + "loss": 0.007, + "num_input_tokens_seen": 21877952, + "step": 103665 + }, + { + "epoch": 11.404840484048405, + "grad_norm": 0.023758720606565475, + "learning_rate": 2.323549845006658e-05, + "loss": 0.1345, + "num_input_tokens_seen": 21878976, + "step": 103670 + }, + { + "epoch": 11.405390539053906, + "grad_norm": 0.29663464426994324, + "learning_rate": 2.3233104375959704e-05, + "loss": 0.0058, + "num_input_tokens_seen": 21880064, + "step": 103675 + }, + { + "epoch": 11.405940594059405, + "grad_norm": 0.0721149668097496, + "learning_rate": 2.323071031813745e-05, + "loss": 0.0084, + "num_input_tokens_seen": 21881120, + "step": 103680 + }, + { + "epoch": 11.406490649064907, + "grad_norm": 1.8497556447982788, + "learning_rate": 2.3228316276621902e-05, + "loss": 0.029, + "num_input_tokens_seen": 21882176, + "step": 103685 + }, + { + "epoch": 11.407040704070408, + "grad_norm": 0.5655815005302429, + "learning_rate": 2.3225922251435107e-05, + "loss": 0.0164, + "num_input_tokens_seen": 21883200, + "step": 103690 + }, + { + "epoch": 11.407590759075907, + "grad_norm": 0.007309107109904289, + "learning_rate": 2.3223528242599125e-05, + "loss": 0.0875, + "num_input_tokens_seen": 21884288, + "step": 103695 + }, + { + "epoch": 11.408140814081408, + "grad_norm": 0.048002585768699646, + "learning_rate": 2.322113425013605e-05, + "loss": 0.0431, + "num_input_tokens_seen": 21885376, + "step": 103700 + }, + { + "epoch": 11.408690869086909, + "grad_norm": 0.06309124827384949, + "learning_rate": 2.321874027406791e-05, + "loss": 0.0079, + "num_input_tokens_seen": 21886432, + "step": 103705 + }, + { + "epoch": 11.409240924092408, + "grad_norm": 0.017528628930449486, + "learning_rate": 2.3216346314416793e-05, + "loss": 0.0704, + "num_input_tokens_seen": 21887488, + "step": 103710 + }, + { + "epoch": 11.40979097909791, + "grad_norm": 0.012690586969256401, + "learning_rate": 2.3213952371204763e-05, + "loss": 0.0023, + "num_input_tokens_seen": 21888512, + "step": 103715 + }, + { + "epoch": 11.41034103410341, + "grad_norm": 0.5328789353370667, + "learning_rate": 2.3211558444453863e-05, + "loss": 0.0389, + "num_input_tokens_seen": 21889600, + "step": 103720 + }, + { + "epoch": 11.410891089108912, + "grad_norm": 1.8686238527297974, + "learning_rate": 2.320916453418618e-05, + "loss": 0.0838, + "num_input_tokens_seen": 21890720, + "step": 103725 + }, + { + "epoch": 11.411441144114411, + "grad_norm": 0.024780910462141037, + "learning_rate": 2.3206770640423757e-05, + "loss": 0.0077, + "num_input_tokens_seen": 21891744, + "step": 103730 + }, + { + "epoch": 11.411991199119912, + "grad_norm": 0.22669313848018646, + "learning_rate": 2.3204376763188678e-05, + "loss": 0.0443, + "num_input_tokens_seen": 21892800, + "step": 103735 + }, + { + "epoch": 11.412541254125413, + "grad_norm": 0.08769150823354721, + "learning_rate": 2.3201982902503007e-05, + "loss": 0.1114, + "num_input_tokens_seen": 21893824, + "step": 103740 + }, + { + "epoch": 11.413091309130913, + "grad_norm": 0.4249950051307678, + "learning_rate": 2.3199589058388777e-05, + "loss": 0.0039, + "num_input_tokens_seen": 21894848, + "step": 103745 + }, + { + "epoch": 11.413641364136414, + "grad_norm": 0.14688412845134735, + "learning_rate": 2.319719523086808e-05, + "loss": 0.023, + "num_input_tokens_seen": 21895904, + "step": 103750 + }, + { + "epoch": 11.414191419141915, + "grad_norm": 0.18540871143341064, + "learning_rate": 2.3194801419962963e-05, + "loss": 0.0321, + "num_input_tokens_seen": 21896960, + "step": 103755 + }, + { + "epoch": 11.414741474147414, + "grad_norm": 0.40462198853492737, + "learning_rate": 2.3192407625695507e-05, + "loss": 0.0707, + "num_input_tokens_seen": 21897984, + "step": 103760 + }, + { + "epoch": 11.415291529152915, + "grad_norm": 0.06297723203897476, + "learning_rate": 2.319001384808776e-05, + "loss": 0.0369, + "num_input_tokens_seen": 21898976, + "step": 103765 + }, + { + "epoch": 11.415841584158416, + "grad_norm": 0.45232275128364563, + "learning_rate": 2.3187620087161777e-05, + "loss": 0.0084, + "num_input_tokens_seen": 21900000, + "step": 103770 + }, + { + "epoch": 11.416391639163916, + "grad_norm": 0.011485440656542778, + "learning_rate": 2.3185226342939643e-05, + "loss": 0.1455, + "num_input_tokens_seen": 21901024, + "step": 103775 + }, + { + "epoch": 11.416941694169417, + "grad_norm": 0.012731513939797878, + "learning_rate": 2.3182832615443394e-05, + "loss": 0.0101, + "num_input_tokens_seen": 21902080, + "step": 103780 + }, + { + "epoch": 11.417491749174918, + "grad_norm": 1.6083385944366455, + "learning_rate": 2.318043890469511e-05, + "loss": 0.1182, + "num_input_tokens_seen": 21903104, + "step": 103785 + }, + { + "epoch": 11.418041804180419, + "grad_norm": 0.02718537673354149, + "learning_rate": 2.3178045210716854e-05, + "loss": 0.1003, + "num_input_tokens_seen": 21904128, + "step": 103790 + }, + { + "epoch": 11.418591859185918, + "grad_norm": 0.16254454851150513, + "learning_rate": 2.317565153353067e-05, + "loss": 0.0295, + "num_input_tokens_seen": 21905216, + "step": 103795 + }, + { + "epoch": 11.41914191419142, + "grad_norm": 0.014756284654140472, + "learning_rate": 2.317325787315864e-05, + "loss": 0.2181, + "num_input_tokens_seen": 21906304, + "step": 103800 + }, + { + "epoch": 11.41969196919692, + "grad_norm": 0.06725102663040161, + "learning_rate": 2.317086422962281e-05, + "loss": 0.0038, + "num_input_tokens_seen": 21907328, + "step": 103805 + }, + { + "epoch": 11.42024202420242, + "grad_norm": 0.2693770229816437, + "learning_rate": 2.3168470602945246e-05, + "loss": 0.0226, + "num_input_tokens_seen": 21908384, + "step": 103810 + }, + { + "epoch": 11.42079207920792, + "grad_norm": 0.030588053166866302, + "learning_rate": 2.316607699314802e-05, + "loss": 0.024, + "num_input_tokens_seen": 21909376, + "step": 103815 + }, + { + "epoch": 11.421342134213422, + "grad_norm": 0.03964224457740784, + "learning_rate": 2.3163683400253168e-05, + "loss": 0.0047, + "num_input_tokens_seen": 21910400, + "step": 103820 + }, + { + "epoch": 11.421892189218921, + "grad_norm": 0.6704297065734863, + "learning_rate": 2.3161289824282772e-05, + "loss": 0.0156, + "num_input_tokens_seen": 21911584, + "step": 103825 + }, + { + "epoch": 11.422442244224422, + "grad_norm": 0.050605155527591705, + "learning_rate": 2.315889626525888e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21912640, + "step": 103830 + }, + { + "epoch": 11.422992299229923, + "grad_norm": 0.023827912285923958, + "learning_rate": 2.315650272320357e-05, + "loss": 0.0581, + "num_input_tokens_seen": 21913696, + "step": 103835 + }, + { + "epoch": 11.423542354235423, + "grad_norm": 0.5248551964759827, + "learning_rate": 2.3154109198138883e-05, + "loss": 0.0629, + "num_input_tokens_seen": 21914656, + "step": 103840 + }, + { + "epoch": 11.424092409240924, + "grad_norm": 0.04658633470535278, + "learning_rate": 2.315171569008688e-05, + "loss": 0.0024, + "num_input_tokens_seen": 21915744, + "step": 103845 + }, + { + "epoch": 11.424642464246425, + "grad_norm": 3.91977858543396, + "learning_rate": 2.314932219906963e-05, + "loss": 0.0348, + "num_input_tokens_seen": 21916832, + "step": 103850 + }, + { + "epoch": 11.425192519251926, + "grad_norm": 0.007122877519577742, + "learning_rate": 2.314692872510919e-05, + "loss": 0.0338, + "num_input_tokens_seen": 21917888, + "step": 103855 + }, + { + "epoch": 11.425742574257425, + "grad_norm": 0.13479427993297577, + "learning_rate": 2.3144535268227624e-05, + "loss": 0.0879, + "num_input_tokens_seen": 21918912, + "step": 103860 + }, + { + "epoch": 11.426292629262926, + "grad_norm": 0.009379466995596886, + "learning_rate": 2.3142141828446983e-05, + "loss": 0.0068, + "num_input_tokens_seen": 21919936, + "step": 103865 + }, + { + "epoch": 11.426842684268427, + "grad_norm": 0.11145273596048355, + "learning_rate": 2.313974840578932e-05, + "loss": 0.0587, + "num_input_tokens_seen": 21920960, + "step": 103870 + }, + { + "epoch": 11.427392739273927, + "grad_norm": 0.036411285400390625, + "learning_rate": 2.313735500027672e-05, + "loss": 0.0724, + "num_input_tokens_seen": 21922048, + "step": 103875 + }, + { + "epoch": 11.427942794279428, + "grad_norm": 0.06517951935529709, + "learning_rate": 2.3134961611931215e-05, + "loss": 0.1032, + "num_input_tokens_seen": 21923040, + "step": 103880 + }, + { + "epoch": 11.428492849284929, + "grad_norm": 0.07051605731248856, + "learning_rate": 2.313256824077487e-05, + "loss": 0.0023, + "num_input_tokens_seen": 21924096, + "step": 103885 + }, + { + "epoch": 11.429042904290428, + "grad_norm": 2.371323347091675, + "learning_rate": 2.313017488682976e-05, + "loss": 0.1168, + "num_input_tokens_seen": 21925088, + "step": 103890 + }, + { + "epoch": 11.42959295929593, + "grad_norm": 1.5032060146331787, + "learning_rate": 2.3127781550117918e-05, + "loss": 0.0308, + "num_input_tokens_seen": 21926208, + "step": 103895 + }, + { + "epoch": 11.43014301430143, + "grad_norm": 0.14056508243083954, + "learning_rate": 2.3125388230661425e-05, + "loss": 0.0218, + "num_input_tokens_seen": 21927232, + "step": 103900 + }, + { + "epoch": 11.430693069306932, + "grad_norm": 0.09896405786275864, + "learning_rate": 2.312299492848233e-05, + "loss": 0.005, + "num_input_tokens_seen": 21928224, + "step": 103905 + }, + { + "epoch": 11.43124312431243, + "grad_norm": 0.05281433090567589, + "learning_rate": 2.3120601643602674e-05, + "loss": 0.0032, + "num_input_tokens_seen": 21929248, + "step": 103910 + }, + { + "epoch": 11.431793179317932, + "grad_norm": 0.27482396364212036, + "learning_rate": 2.3118208376044546e-05, + "loss": 0.0275, + "num_input_tokens_seen": 21930304, + "step": 103915 + }, + { + "epoch": 11.432343234323433, + "grad_norm": 0.003684702794998884, + "learning_rate": 2.311581512582998e-05, + "loss": 0.0135, + "num_input_tokens_seen": 21931360, + "step": 103920 + }, + { + "epoch": 11.432893289328932, + "grad_norm": 0.009383336640894413, + "learning_rate": 2.3113421892981048e-05, + "loss": 0.0644, + "num_input_tokens_seen": 21932384, + "step": 103925 + }, + { + "epoch": 11.433443344334433, + "grad_norm": 0.01290801540017128, + "learning_rate": 2.3111028677519804e-05, + "loss": 0.0521, + "num_input_tokens_seen": 21933376, + "step": 103930 + }, + { + "epoch": 11.433993399339935, + "grad_norm": 1.3259254693984985, + "learning_rate": 2.3108635479468292e-05, + "loss": 0.04, + "num_input_tokens_seen": 21934400, + "step": 103935 + }, + { + "epoch": 11.434543454345434, + "grad_norm": 0.2967776358127594, + "learning_rate": 2.3106242298848586e-05, + "loss": 0.0227, + "num_input_tokens_seen": 21935424, + "step": 103940 + }, + { + "epoch": 11.435093509350935, + "grad_norm": 0.4693842828273773, + "learning_rate": 2.3103849135682733e-05, + "loss": 0.0516, + "num_input_tokens_seen": 21936480, + "step": 103945 + }, + { + "epoch": 11.435643564356436, + "grad_norm": 0.016522174701094627, + "learning_rate": 2.31014559899928e-05, + "loss": 0.14, + "num_input_tokens_seen": 21937600, + "step": 103950 + }, + { + "epoch": 11.436193619361935, + "grad_norm": 0.04089214652776718, + "learning_rate": 2.309906286180083e-05, + "loss": 0.0163, + "num_input_tokens_seen": 21938656, + "step": 103955 + }, + { + "epoch": 11.436743674367436, + "grad_norm": 0.06380819529294968, + "learning_rate": 2.309666975112888e-05, + "loss": 0.033, + "num_input_tokens_seen": 21939680, + "step": 103960 + }, + { + "epoch": 11.437293729372938, + "grad_norm": 0.04433063045144081, + "learning_rate": 2.3094276657999027e-05, + "loss": 0.0813, + "num_input_tokens_seen": 21940768, + "step": 103965 + }, + { + "epoch": 11.437843784378439, + "grad_norm": 0.12745420634746552, + "learning_rate": 2.3091883582433287e-05, + "loss": 0.0482, + "num_input_tokens_seen": 21941824, + "step": 103970 + }, + { + "epoch": 11.438393839383938, + "grad_norm": 0.46932539343833923, + "learning_rate": 2.3089490524453764e-05, + "loss": 0.0105, + "num_input_tokens_seen": 21942880, + "step": 103975 + }, + { + "epoch": 11.438943894389439, + "grad_norm": 0.08700378239154816, + "learning_rate": 2.308709748408248e-05, + "loss": 0.085, + "num_input_tokens_seen": 21943904, + "step": 103980 + }, + { + "epoch": 11.43949394939494, + "grad_norm": 0.3362138271331787, + "learning_rate": 2.30847044613415e-05, + "loss": 0.0048, + "num_input_tokens_seen": 21944992, + "step": 103985 + }, + { + "epoch": 11.44004400440044, + "grad_norm": 1.507585048675537, + "learning_rate": 2.3082311456252885e-05, + "loss": 0.0835, + "num_input_tokens_seen": 21945984, + "step": 103990 + }, + { + "epoch": 11.44059405940594, + "grad_norm": 0.06614343076944351, + "learning_rate": 2.307991846883868e-05, + "loss": 0.0298, + "num_input_tokens_seen": 21947104, + "step": 103995 + }, + { + "epoch": 11.441144114411442, + "grad_norm": 0.08354902267456055, + "learning_rate": 2.3077525499120946e-05, + "loss": 0.0129, + "num_input_tokens_seen": 21948192, + "step": 104000 + }, + { + "epoch": 11.441694169416941, + "grad_norm": 0.3142718970775604, + "learning_rate": 2.3075132547121748e-05, + "loss": 0.0207, + "num_input_tokens_seen": 21949248, + "step": 104005 + }, + { + "epoch": 11.442244224422442, + "grad_norm": 1.0602083206176758, + "learning_rate": 2.3072739612863113e-05, + "loss": 0.0354, + "num_input_tokens_seen": 21950304, + "step": 104010 + }, + { + "epoch": 11.442794279427943, + "grad_norm": 0.343570739030838, + "learning_rate": 2.3070346696367122e-05, + "loss": 0.011, + "num_input_tokens_seen": 21951296, + "step": 104015 + }, + { + "epoch": 11.443344334433444, + "grad_norm": 0.01524325180798769, + "learning_rate": 2.306795379765581e-05, + "loss": 0.0033, + "num_input_tokens_seen": 21952288, + "step": 104020 + }, + { + "epoch": 11.443894389438944, + "grad_norm": 0.023253576830029488, + "learning_rate": 2.3065560916751252e-05, + "loss": 0.0381, + "num_input_tokens_seen": 21953312, + "step": 104025 + }, + { + "epoch": 11.444444444444445, + "grad_norm": 1.1575464010238647, + "learning_rate": 2.3063168053675487e-05, + "loss": 0.0167, + "num_input_tokens_seen": 21954336, + "step": 104030 + }, + { + "epoch": 11.444994499449946, + "grad_norm": 2.170546770095825, + "learning_rate": 2.306077520845057e-05, + "loss": 0.0666, + "num_input_tokens_seen": 21955360, + "step": 104035 + }, + { + "epoch": 11.445544554455445, + "grad_norm": 0.008592450059950352, + "learning_rate": 2.3058382381098558e-05, + "loss": 0.0563, + "num_input_tokens_seen": 21956448, + "step": 104040 + }, + { + "epoch": 11.446094609460946, + "grad_norm": 0.26273277401924133, + "learning_rate": 2.30559895716415e-05, + "loss": 0.0082, + "num_input_tokens_seen": 21957504, + "step": 104045 + }, + { + "epoch": 11.446644664466447, + "grad_norm": 0.02342056855559349, + "learning_rate": 2.3053596780101466e-05, + "loss": 0.0132, + "num_input_tokens_seen": 21958624, + "step": 104050 + }, + { + "epoch": 11.447194719471947, + "grad_norm": 1.9593042135238647, + "learning_rate": 2.305120400650049e-05, + "loss": 0.0264, + "num_input_tokens_seen": 21959712, + "step": 104055 + }, + { + "epoch": 11.447744774477448, + "grad_norm": 1.2182849645614624, + "learning_rate": 2.3048811250860624e-05, + "loss": 0.0266, + "num_input_tokens_seen": 21960768, + "step": 104060 + }, + { + "epoch": 11.448294829482949, + "grad_norm": 0.04653862491250038, + "learning_rate": 2.3046418513203943e-05, + "loss": 0.0063, + "num_input_tokens_seen": 21961824, + "step": 104065 + }, + { + "epoch": 11.448844884488448, + "grad_norm": 0.0390639528632164, + "learning_rate": 2.3044025793552476e-05, + "loss": 0.008, + "num_input_tokens_seen": 21962880, + "step": 104070 + }, + { + "epoch": 11.44939493949395, + "grad_norm": 0.0318489596247673, + "learning_rate": 2.3041633091928283e-05, + "loss": 0.0098, + "num_input_tokens_seen": 21963936, + "step": 104075 + }, + { + "epoch": 11.44994499449945, + "grad_norm": 0.0681481584906578, + "learning_rate": 2.3039240408353426e-05, + "loss": 0.0056, + "num_input_tokens_seen": 21965024, + "step": 104080 + }, + { + "epoch": 11.450495049504951, + "grad_norm": 0.07904037088155746, + "learning_rate": 2.3036847742849936e-05, + "loss": 0.0029, + "num_input_tokens_seen": 21966080, + "step": 104085 + }, + { + "epoch": 11.45104510451045, + "grad_norm": 1.4250489473342896, + "learning_rate": 2.3034455095439893e-05, + "loss": 0.0424, + "num_input_tokens_seen": 21967136, + "step": 104090 + }, + { + "epoch": 11.451595159515952, + "grad_norm": 0.6863270401954651, + "learning_rate": 2.3032062466145326e-05, + "loss": 0.0108, + "num_input_tokens_seen": 21968224, + "step": 104095 + }, + { + "epoch": 11.452145214521453, + "grad_norm": 0.02091236039996147, + "learning_rate": 2.3029669854988294e-05, + "loss": 0.0042, + "num_input_tokens_seen": 21969312, + "step": 104100 + }, + { + "epoch": 11.452695269526952, + "grad_norm": 0.04839290305972099, + "learning_rate": 2.3027277261990858e-05, + "loss": 0.0072, + "num_input_tokens_seen": 21970432, + "step": 104105 + }, + { + "epoch": 11.453245324532453, + "grad_norm": 0.4968491792678833, + "learning_rate": 2.3024884687175048e-05, + "loss": 0.0067, + "num_input_tokens_seen": 21971520, + "step": 104110 + }, + { + "epoch": 11.453795379537954, + "grad_norm": 0.03327880799770355, + "learning_rate": 2.3022492130562936e-05, + "loss": 0.0692, + "num_input_tokens_seen": 21972608, + "step": 104115 + }, + { + "epoch": 11.454345434543454, + "grad_norm": 0.016963813453912735, + "learning_rate": 2.3020099592176574e-05, + "loss": 0.0262, + "num_input_tokens_seen": 21973600, + "step": 104120 + }, + { + "epoch": 11.454895489548955, + "grad_norm": 0.09061665087938309, + "learning_rate": 2.3017707072037988e-05, + "loss": 0.003, + "num_input_tokens_seen": 21974720, + "step": 104125 + }, + { + "epoch": 11.455445544554456, + "grad_norm": 0.19023007154464722, + "learning_rate": 2.301531457016925e-05, + "loss": 0.0735, + "num_input_tokens_seen": 21975744, + "step": 104130 + }, + { + "epoch": 11.455995599559955, + "grad_norm": 1.7484675645828247, + "learning_rate": 2.3012922086592403e-05, + "loss": 0.1154, + "num_input_tokens_seen": 21976800, + "step": 104135 + }, + { + "epoch": 11.456545654565456, + "grad_norm": 1.7809948921203613, + "learning_rate": 2.3010529621329508e-05, + "loss": 0.0795, + "num_input_tokens_seen": 21977824, + "step": 104140 + }, + { + "epoch": 11.457095709570957, + "grad_norm": 0.0124391233548522, + "learning_rate": 2.3008137174402602e-05, + "loss": 0.0127, + "num_input_tokens_seen": 21978880, + "step": 104145 + }, + { + "epoch": 11.457645764576458, + "grad_norm": 0.01741817034780979, + "learning_rate": 2.3005744745833736e-05, + "loss": 0.0168, + "num_input_tokens_seen": 21979904, + "step": 104150 + }, + { + "epoch": 11.458195819581958, + "grad_norm": 0.015629587695002556, + "learning_rate": 2.3003352335644967e-05, + "loss": 0.0057, + "num_input_tokens_seen": 21980992, + "step": 104155 + }, + { + "epoch": 11.458745874587459, + "grad_norm": 0.21075904369354248, + "learning_rate": 2.3000959943858337e-05, + "loss": 0.0093, + "num_input_tokens_seen": 21982016, + "step": 104160 + }, + { + "epoch": 11.45929592959296, + "grad_norm": 0.008827665820717812, + "learning_rate": 2.2998567570495914e-05, + "loss": 0.002, + "num_input_tokens_seen": 21983104, + "step": 104165 + }, + { + "epoch": 11.45984598459846, + "grad_norm": 0.1753794401884079, + "learning_rate": 2.2996175215579724e-05, + "loss": 0.0219, + "num_input_tokens_seen": 21984192, + "step": 104170 + }, + { + "epoch": 11.46039603960396, + "grad_norm": 0.08690396696329117, + "learning_rate": 2.299378287913182e-05, + "loss": 0.0244, + "num_input_tokens_seen": 21985248, + "step": 104175 + }, + { + "epoch": 11.460946094609461, + "grad_norm": 0.00446726568043232, + "learning_rate": 2.2991390561174267e-05, + "loss": 0.0147, + "num_input_tokens_seen": 21986272, + "step": 104180 + }, + { + "epoch": 11.46149614961496, + "grad_norm": 0.17800144851207733, + "learning_rate": 2.2988998261729095e-05, + "loss": 0.0165, + "num_input_tokens_seen": 21987328, + "step": 104185 + }, + { + "epoch": 11.462046204620462, + "grad_norm": 0.9569380283355713, + "learning_rate": 2.2986605980818364e-05, + "loss": 0.0348, + "num_input_tokens_seen": 21988448, + "step": 104190 + }, + { + "epoch": 11.462596259625963, + "grad_norm": 0.7239871025085449, + "learning_rate": 2.2984213718464125e-05, + "loss": 0.0515, + "num_input_tokens_seen": 21989504, + "step": 104195 + }, + { + "epoch": 11.463146314631462, + "grad_norm": 0.014549651183187962, + "learning_rate": 2.2981821474688416e-05, + "loss": 0.0034, + "num_input_tokens_seen": 21990560, + "step": 104200 + }, + { + "epoch": 11.463696369636963, + "grad_norm": 0.18248242139816284, + "learning_rate": 2.297942924951329e-05, + "loss": 0.0632, + "num_input_tokens_seen": 21991648, + "step": 104205 + }, + { + "epoch": 11.464246424642464, + "grad_norm": 0.01775137335062027, + "learning_rate": 2.2977037042960788e-05, + "loss": 0.0675, + "num_input_tokens_seen": 21992704, + "step": 104210 + }, + { + "epoch": 11.464796479647966, + "grad_norm": 0.09775964170694351, + "learning_rate": 2.2974644855052973e-05, + "loss": 0.0094, + "num_input_tokens_seen": 21993696, + "step": 104215 + }, + { + "epoch": 11.465346534653465, + "grad_norm": 0.0077656167559325695, + "learning_rate": 2.297225268581189e-05, + "loss": 0.0101, + "num_input_tokens_seen": 21994752, + "step": 104220 + }, + { + "epoch": 11.465896589658966, + "grad_norm": 0.03951152414083481, + "learning_rate": 2.296986053525957e-05, + "loss": 0.0414, + "num_input_tokens_seen": 21995744, + "step": 104225 + }, + { + "epoch": 11.466446644664467, + "grad_norm": 0.012950844131410122, + "learning_rate": 2.2967468403418075e-05, + "loss": 0.0393, + "num_input_tokens_seen": 21996704, + "step": 104230 + }, + { + "epoch": 11.466996699669966, + "grad_norm": 0.10143646597862244, + "learning_rate": 2.2965076290309446e-05, + "loss": 0.0053, + "num_input_tokens_seen": 21997824, + "step": 104235 + }, + { + "epoch": 11.467546754675467, + "grad_norm": 0.11689909547567368, + "learning_rate": 2.2962684195955742e-05, + "loss": 0.003, + "num_input_tokens_seen": 21998848, + "step": 104240 + }, + { + "epoch": 11.468096809680969, + "grad_norm": 0.031028831377625465, + "learning_rate": 2.2960292120378993e-05, + "loss": 0.0007, + "num_input_tokens_seen": 21999936, + "step": 104245 + }, + { + "epoch": 11.468646864686468, + "grad_norm": 0.044066816568374634, + "learning_rate": 2.2957900063601247e-05, + "loss": 0.0455, + "num_input_tokens_seen": 22000992, + "step": 104250 + }, + { + "epoch": 11.469196919691969, + "grad_norm": 1.0095336437225342, + "learning_rate": 2.295550802564457e-05, + "loss": 0.0259, + "num_input_tokens_seen": 22002080, + "step": 104255 + }, + { + "epoch": 11.46974697469747, + "grad_norm": 0.026166971772909164, + "learning_rate": 2.295311600653099e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22003040, + "step": 104260 + }, + { + "epoch": 11.47029702970297, + "grad_norm": 0.06944375485181808, + "learning_rate": 2.2950724006282548e-05, + "loss": 0.0989, + "num_input_tokens_seen": 22004160, + "step": 104265 + }, + { + "epoch": 11.47084708470847, + "grad_norm": 1.0017201900482178, + "learning_rate": 2.294833202492131e-05, + "loss": 0.1181, + "num_input_tokens_seen": 22005280, + "step": 104270 + }, + { + "epoch": 11.471397139713972, + "grad_norm": 0.045203596353530884, + "learning_rate": 2.2945940062469303e-05, + "loss": 0.0029, + "num_input_tokens_seen": 22006336, + "step": 104275 + }, + { + "epoch": 11.471947194719473, + "grad_norm": 1.1201478242874146, + "learning_rate": 2.294354811894859e-05, + "loss": 0.0213, + "num_input_tokens_seen": 22007392, + "step": 104280 + }, + { + "epoch": 11.472497249724972, + "grad_norm": 0.04383470490574837, + "learning_rate": 2.2941156194381203e-05, + "loss": 0.0289, + "num_input_tokens_seen": 22008448, + "step": 104285 + }, + { + "epoch": 11.473047304730473, + "grad_norm": 0.0975998267531395, + "learning_rate": 2.2938764288789185e-05, + "loss": 0.0043, + "num_input_tokens_seen": 22009472, + "step": 104290 + }, + { + "epoch": 11.473597359735974, + "grad_norm": 0.044132690876722336, + "learning_rate": 2.29363724021946e-05, + "loss": 0.0043, + "num_input_tokens_seen": 22010592, + "step": 104295 + }, + { + "epoch": 11.474147414741473, + "grad_norm": 0.053798191249370575, + "learning_rate": 2.293398053461947e-05, + "loss": 0.1341, + "num_input_tokens_seen": 22011648, + "step": 104300 + }, + { + "epoch": 11.474697469746975, + "grad_norm": 0.016086392104625702, + "learning_rate": 2.293158868608585e-05, + "loss": 0.0131, + "num_input_tokens_seen": 22012672, + "step": 104305 + }, + { + "epoch": 11.475247524752476, + "grad_norm": 0.019135886803269386, + "learning_rate": 2.2929196856615797e-05, + "loss": 0.0023, + "num_input_tokens_seen": 22013792, + "step": 104310 + }, + { + "epoch": 11.475797579757975, + "grad_norm": 0.0192145723849535, + "learning_rate": 2.2926805046231332e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22014816, + "step": 104315 + }, + { + "epoch": 11.476347634763476, + "grad_norm": 0.15469969809055328, + "learning_rate": 2.292441325495451e-05, + "loss": 0.0087, + "num_input_tokens_seen": 22015872, + "step": 104320 + }, + { + "epoch": 11.476897689768977, + "grad_norm": 0.011005425825715065, + "learning_rate": 2.2922021482807373e-05, + "loss": 0.0042, + "num_input_tokens_seen": 22017024, + "step": 104325 + }, + { + "epoch": 11.477447744774478, + "grad_norm": 0.044701557606458664, + "learning_rate": 2.2919629729811972e-05, + "loss": 0.0126, + "num_input_tokens_seen": 22018048, + "step": 104330 + }, + { + "epoch": 11.477997799779978, + "grad_norm": 0.005148052703589201, + "learning_rate": 2.291723799599035e-05, + "loss": 0.0496, + "num_input_tokens_seen": 22019104, + "step": 104335 + }, + { + "epoch": 11.478547854785479, + "grad_norm": 0.03527131676673889, + "learning_rate": 2.2914846281364537e-05, + "loss": 0.0046, + "num_input_tokens_seen": 22020096, + "step": 104340 + }, + { + "epoch": 11.47909790979098, + "grad_norm": 0.04516253620386124, + "learning_rate": 2.2912454585956592e-05, + "loss": 0.0063, + "num_input_tokens_seen": 22021184, + "step": 104345 + }, + { + "epoch": 11.479647964796479, + "grad_norm": 0.03908573091030121, + "learning_rate": 2.2910062909788543e-05, + "loss": 0.0083, + "num_input_tokens_seen": 22022208, + "step": 104350 + }, + { + "epoch": 11.48019801980198, + "grad_norm": 0.004363891202956438, + "learning_rate": 2.290767125288245e-05, + "loss": 0.0051, + "num_input_tokens_seen": 22023328, + "step": 104355 + }, + { + "epoch": 11.480748074807481, + "grad_norm": 0.0854022353887558, + "learning_rate": 2.2905279615260347e-05, + "loss": 0.0321, + "num_input_tokens_seen": 22024352, + "step": 104360 + }, + { + "epoch": 11.48129812981298, + "grad_norm": 1.2715643644332886, + "learning_rate": 2.290288799694427e-05, + "loss": 0.0423, + "num_input_tokens_seen": 22025472, + "step": 104365 + }, + { + "epoch": 11.481848184818482, + "grad_norm": 1.337970495223999, + "learning_rate": 2.2900496397956276e-05, + "loss": 0.0668, + "num_input_tokens_seen": 22026496, + "step": 104370 + }, + { + "epoch": 11.482398239823983, + "grad_norm": 0.018458319827914238, + "learning_rate": 2.2898104818318393e-05, + "loss": 0.0412, + "num_input_tokens_seen": 22027552, + "step": 104375 + }, + { + "epoch": 11.482948294829482, + "grad_norm": 0.16015474498271942, + "learning_rate": 2.289571325805267e-05, + "loss": 0.0027, + "num_input_tokens_seen": 22028640, + "step": 104380 + }, + { + "epoch": 11.483498349834983, + "grad_norm": 0.010791952721774578, + "learning_rate": 2.2893321717181154e-05, + "loss": 0.0129, + "num_input_tokens_seen": 22029696, + "step": 104385 + }, + { + "epoch": 11.484048404840484, + "grad_norm": 0.11016286909580231, + "learning_rate": 2.2890930195725865e-05, + "loss": 0.0551, + "num_input_tokens_seen": 22030720, + "step": 104390 + }, + { + "epoch": 11.484598459845985, + "grad_norm": 0.01457616314291954, + "learning_rate": 2.288853869370888e-05, + "loss": 0.0638, + "num_input_tokens_seen": 22031840, + "step": 104395 + }, + { + "epoch": 11.485148514851485, + "grad_norm": 0.006512230262160301, + "learning_rate": 2.2886147211152205e-05, + "loss": 0.0061, + "num_input_tokens_seen": 22032896, + "step": 104400 + }, + { + "epoch": 11.485698569856986, + "grad_norm": 0.017726833000779152, + "learning_rate": 2.2883755748077906e-05, + "loss": 0.0805, + "num_input_tokens_seen": 22033888, + "step": 104405 + }, + { + "epoch": 11.486248624862487, + "grad_norm": 0.01577608846127987, + "learning_rate": 2.2881364304508018e-05, + "loss": 0.0981, + "num_input_tokens_seen": 22034944, + "step": 104410 + }, + { + "epoch": 11.486798679867986, + "grad_norm": 0.03582724183797836, + "learning_rate": 2.2878972880464568e-05, + "loss": 0.1976, + "num_input_tokens_seen": 22035968, + "step": 104415 + }, + { + "epoch": 11.487348734873487, + "grad_norm": 0.3173559904098511, + "learning_rate": 2.2876581475969618e-05, + "loss": 0.0108, + "num_input_tokens_seen": 22037024, + "step": 104420 + }, + { + "epoch": 11.487898789878988, + "grad_norm": 0.020529944449663162, + "learning_rate": 2.2874190091045184e-05, + "loss": 0.0016, + "num_input_tokens_seen": 22038112, + "step": 104425 + }, + { + "epoch": 11.488448844884488, + "grad_norm": 0.017200445756316185, + "learning_rate": 2.2871798725713334e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22039168, + "step": 104430 + }, + { + "epoch": 11.488998899889989, + "grad_norm": 0.013766477815806866, + "learning_rate": 2.2869407379996088e-05, + "loss": 0.0093, + "num_input_tokens_seen": 22040256, + "step": 104435 + }, + { + "epoch": 11.48954895489549, + "grad_norm": 0.02323431335389614, + "learning_rate": 2.2867016053915487e-05, + "loss": 0.0063, + "num_input_tokens_seen": 22041344, + "step": 104440 + }, + { + "epoch": 11.490099009900991, + "grad_norm": 0.018757222220301628, + "learning_rate": 2.2864624747493586e-05, + "loss": 0.0376, + "num_input_tokens_seen": 22042464, + "step": 104445 + }, + { + "epoch": 11.49064906490649, + "grad_norm": 0.014427818357944489, + "learning_rate": 2.286223346075241e-05, + "loss": 0.008, + "num_input_tokens_seen": 22043616, + "step": 104450 + }, + { + "epoch": 11.491199119911991, + "grad_norm": 0.07387623935937881, + "learning_rate": 2.2859842193713995e-05, + "loss": 0.0515, + "num_input_tokens_seen": 22044704, + "step": 104455 + }, + { + "epoch": 11.491749174917492, + "grad_norm": 0.014090987853705883, + "learning_rate": 2.2857450946400397e-05, + "loss": 0.0036, + "num_input_tokens_seen": 22045728, + "step": 104460 + }, + { + "epoch": 11.492299229922992, + "grad_norm": 0.007939362898468971, + "learning_rate": 2.2855059718833635e-05, + "loss": 0.0025, + "num_input_tokens_seen": 22046816, + "step": 104465 + }, + { + "epoch": 11.492849284928493, + "grad_norm": 0.10186143964529037, + "learning_rate": 2.285266851103577e-05, + "loss": 0.0113, + "num_input_tokens_seen": 22047840, + "step": 104470 + }, + { + "epoch": 11.493399339933994, + "grad_norm": 0.0124321598559618, + "learning_rate": 2.2850277323028826e-05, + "loss": 0.0179, + "num_input_tokens_seen": 22048864, + "step": 104475 + }, + { + "epoch": 11.493949394939493, + "grad_norm": 0.12029173970222473, + "learning_rate": 2.2847886154834838e-05, + "loss": 0.1221, + "num_input_tokens_seen": 22050016, + "step": 104480 + }, + { + "epoch": 11.494499449944994, + "grad_norm": 0.005716945510357618, + "learning_rate": 2.284549500647586e-05, + "loss": 0.0416, + "num_input_tokens_seen": 22051040, + "step": 104485 + }, + { + "epoch": 11.495049504950495, + "grad_norm": 0.033291228115558624, + "learning_rate": 2.2843103877973912e-05, + "loss": 0.0021, + "num_input_tokens_seen": 22052096, + "step": 104490 + }, + { + "epoch": 11.495599559955995, + "grad_norm": 0.2628013491630554, + "learning_rate": 2.2840712769351047e-05, + "loss": 0.0056, + "num_input_tokens_seen": 22053184, + "step": 104495 + }, + { + "epoch": 11.496149614961496, + "grad_norm": 0.2615402340888977, + "learning_rate": 2.2838321680629298e-05, + "loss": 0.0853, + "num_input_tokens_seen": 22054208, + "step": 104500 + }, + { + "epoch": 11.496699669966997, + "grad_norm": 0.031497176736593246, + "learning_rate": 2.283593061183069e-05, + "loss": 0.0046, + "num_input_tokens_seen": 22055264, + "step": 104505 + }, + { + "epoch": 11.497249724972498, + "grad_norm": 0.028625480830669403, + "learning_rate": 2.283353956297728e-05, + "loss": 0.0184, + "num_input_tokens_seen": 22056352, + "step": 104510 + }, + { + "epoch": 11.497799779977997, + "grad_norm": 0.0396861657500267, + "learning_rate": 2.283114853409109e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22057376, + "step": 104515 + }, + { + "epoch": 11.498349834983498, + "grad_norm": 0.0626201406121254, + "learning_rate": 2.2828757525194166e-05, + "loss": 0.1057, + "num_input_tokens_seen": 22058432, + "step": 104520 + }, + { + "epoch": 11.498899889989, + "grad_norm": 0.04161481186747551, + "learning_rate": 2.2826366536308552e-05, + "loss": 0.0114, + "num_input_tokens_seen": 22059488, + "step": 104525 + }, + { + "epoch": 11.499449944994499, + "grad_norm": 4.902318477630615, + "learning_rate": 2.282397556745626e-05, + "loss": 0.1094, + "num_input_tokens_seen": 22060608, + "step": 104530 + }, + { + "epoch": 11.5, + "grad_norm": 0.008788547478616238, + "learning_rate": 2.2821584618659347e-05, + "loss": 0.0038, + "num_input_tokens_seen": 22061664, + "step": 104535 + }, + { + "epoch": 11.500550055005501, + "grad_norm": 0.016934260725975037, + "learning_rate": 2.281919368993984e-05, + "loss": 0.0034, + "num_input_tokens_seen": 22062784, + "step": 104540 + }, + { + "epoch": 11.501100110011, + "grad_norm": 0.7162258625030518, + "learning_rate": 2.2816802781319786e-05, + "loss": 0.1914, + "num_input_tokens_seen": 22063840, + "step": 104545 + }, + { + "epoch": 11.501650165016502, + "grad_norm": 0.9782224893569946, + "learning_rate": 2.2814411892821207e-05, + "loss": 0.0249, + "num_input_tokens_seen": 22064896, + "step": 104550 + }, + { + "epoch": 11.502200220022003, + "grad_norm": 0.10443726927042007, + "learning_rate": 2.2812021024466142e-05, + "loss": 0.0325, + "num_input_tokens_seen": 22066016, + "step": 104555 + }, + { + "epoch": 11.502750275027502, + "grad_norm": 0.012039613910019398, + "learning_rate": 2.280963017627664e-05, + "loss": 0.0025, + "num_input_tokens_seen": 22067136, + "step": 104560 + }, + { + "epoch": 11.503300330033003, + "grad_norm": 0.06744366884231567, + "learning_rate": 2.2807239348274717e-05, + "loss": 0.005, + "num_input_tokens_seen": 22068192, + "step": 104565 + }, + { + "epoch": 11.503850385038504, + "grad_norm": 0.014345260336995125, + "learning_rate": 2.2804848540482417e-05, + "loss": 0.0025, + "num_input_tokens_seen": 22069216, + "step": 104570 + }, + { + "epoch": 11.504400440044005, + "grad_norm": 0.2769431173801422, + "learning_rate": 2.280245775292178e-05, + "loss": 0.0982, + "num_input_tokens_seen": 22070272, + "step": 104575 + }, + { + "epoch": 11.504950495049505, + "grad_norm": 0.1222522184252739, + "learning_rate": 2.2800066985614828e-05, + "loss": 0.0088, + "num_input_tokens_seen": 22071392, + "step": 104580 + }, + { + "epoch": 11.505500550055006, + "grad_norm": 0.04643511399626732, + "learning_rate": 2.2797676238583612e-05, + "loss": 0.0029, + "num_input_tokens_seen": 22072384, + "step": 104585 + }, + { + "epoch": 11.506050605060507, + "grad_norm": 0.009107260964810848, + "learning_rate": 2.279528551185015e-05, + "loss": 0.0073, + "num_input_tokens_seen": 22073472, + "step": 104590 + }, + { + "epoch": 11.506600660066006, + "grad_norm": 0.04948091134428978, + "learning_rate": 2.2792894805436484e-05, + "loss": 0.0024, + "num_input_tokens_seen": 22074560, + "step": 104595 + }, + { + "epoch": 11.507150715071507, + "grad_norm": 0.01077858917415142, + "learning_rate": 2.2790504119364654e-05, + "loss": 0.0293, + "num_input_tokens_seen": 22075616, + "step": 104600 + }, + { + "epoch": 11.507700770077008, + "grad_norm": 0.0015770617173984647, + "learning_rate": 2.278811345365668e-05, + "loss": 0.0201, + "num_input_tokens_seen": 22076768, + "step": 104605 + }, + { + "epoch": 11.508250825082508, + "grad_norm": 0.8328509330749512, + "learning_rate": 2.2785722808334605e-05, + "loss": 0.0087, + "num_input_tokens_seen": 22077856, + "step": 104610 + }, + { + "epoch": 11.508800880088009, + "grad_norm": 0.0033071364741772413, + "learning_rate": 2.2783332183420458e-05, + "loss": 0.0046, + "num_input_tokens_seen": 22078944, + "step": 104615 + }, + { + "epoch": 11.50935093509351, + "grad_norm": 0.004537597764283419, + "learning_rate": 2.2780941578936282e-05, + "loss": 0.0435, + "num_input_tokens_seen": 22080000, + "step": 104620 + }, + { + "epoch": 11.509900990099009, + "grad_norm": 0.009978830814361572, + "learning_rate": 2.27785509949041e-05, + "loss": 0.003, + "num_input_tokens_seen": 22081088, + "step": 104625 + }, + { + "epoch": 11.51045104510451, + "grad_norm": 0.06533807516098022, + "learning_rate": 2.2776160431345943e-05, + "loss": 0.0057, + "num_input_tokens_seen": 22082176, + "step": 104630 + }, + { + "epoch": 11.511001100110011, + "grad_norm": 0.2975831925868988, + "learning_rate": 2.277376988828385e-05, + "loss": 0.0027, + "num_input_tokens_seen": 22083200, + "step": 104635 + }, + { + "epoch": 11.511551155115512, + "grad_norm": 0.05128275230526924, + "learning_rate": 2.2771379365739847e-05, + "loss": 0.0016, + "num_input_tokens_seen": 22084256, + "step": 104640 + }, + { + "epoch": 11.512101210121012, + "grad_norm": 0.017738621681928635, + "learning_rate": 2.2768988863735985e-05, + "loss": 0.0052, + "num_input_tokens_seen": 22085248, + "step": 104645 + }, + { + "epoch": 11.512651265126513, + "grad_norm": 0.503472626209259, + "learning_rate": 2.2766598382294275e-05, + "loss": 0.0142, + "num_input_tokens_seen": 22086336, + "step": 104650 + }, + { + "epoch": 11.513201320132014, + "grad_norm": 0.007313588168472052, + "learning_rate": 2.276420792143675e-05, + "loss": 0.0111, + "num_input_tokens_seen": 22087392, + "step": 104655 + }, + { + "epoch": 11.513751375137513, + "grad_norm": 0.003728442592546344, + "learning_rate": 2.2761817481185458e-05, + "loss": 0.0129, + "num_input_tokens_seen": 22088416, + "step": 104660 + }, + { + "epoch": 11.514301430143014, + "grad_norm": 0.01810678467154503, + "learning_rate": 2.2759427061562417e-05, + "loss": 0.007, + "num_input_tokens_seen": 22089440, + "step": 104665 + }, + { + "epoch": 11.514851485148515, + "grad_norm": 0.03625702112913132, + "learning_rate": 2.2757036662589654e-05, + "loss": 0.0334, + "num_input_tokens_seen": 22090464, + "step": 104670 + }, + { + "epoch": 11.515401540154015, + "grad_norm": 2.2101049423217773, + "learning_rate": 2.2754646284289217e-05, + "loss": 0.1416, + "num_input_tokens_seen": 22091488, + "step": 104675 + }, + { + "epoch": 11.515951595159516, + "grad_norm": 0.006574549246579409, + "learning_rate": 2.275225592668312e-05, + "loss": 0.0121, + "num_input_tokens_seen": 22092576, + "step": 104680 + }, + { + "epoch": 11.516501650165017, + "grad_norm": 0.017924241721630096, + "learning_rate": 2.2749865589793407e-05, + "loss": 0.0094, + "num_input_tokens_seen": 22093600, + "step": 104685 + }, + { + "epoch": 11.517051705170516, + "grad_norm": 0.00629831338301301, + "learning_rate": 2.274747527364211e-05, + "loss": 0.1317, + "num_input_tokens_seen": 22094688, + "step": 104690 + }, + { + "epoch": 11.517601760176017, + "grad_norm": 0.003777350764721632, + "learning_rate": 2.2745084978251232e-05, + "loss": 0.0007, + "num_input_tokens_seen": 22095712, + "step": 104695 + }, + { + "epoch": 11.518151815181518, + "grad_norm": 0.008334029465913773, + "learning_rate": 2.274269470364284e-05, + "loss": 0.0024, + "num_input_tokens_seen": 22096736, + "step": 104700 + }, + { + "epoch": 11.51870187018702, + "grad_norm": 0.03282761946320534, + "learning_rate": 2.2740304449838938e-05, + "loss": 0.004, + "num_input_tokens_seen": 22097696, + "step": 104705 + }, + { + "epoch": 11.519251925192519, + "grad_norm": 0.0652027577161789, + "learning_rate": 2.2737914216861576e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22098720, + "step": 104710 + }, + { + "epoch": 11.51980198019802, + "grad_norm": 0.06827056407928467, + "learning_rate": 2.2735524004732775e-05, + "loss": 0.0194, + "num_input_tokens_seen": 22099872, + "step": 104715 + }, + { + "epoch": 11.520352035203521, + "grad_norm": 0.06059621274471283, + "learning_rate": 2.273313381347455e-05, + "loss": 0.0466, + "num_input_tokens_seen": 22100864, + "step": 104720 + }, + { + "epoch": 11.52090209020902, + "grad_norm": 0.7153461575508118, + "learning_rate": 2.2730743643108952e-05, + "loss": 0.0329, + "num_input_tokens_seen": 22101920, + "step": 104725 + }, + { + "epoch": 11.521452145214521, + "grad_norm": 2.124302864074707, + "learning_rate": 2.272835349365799e-05, + "loss": 0.16, + "num_input_tokens_seen": 22103008, + "step": 104730 + }, + { + "epoch": 11.522002200220022, + "grad_norm": 0.0434693768620491, + "learning_rate": 2.272596336514372e-05, + "loss": 0.0369, + "num_input_tokens_seen": 22104032, + "step": 104735 + }, + { + "epoch": 11.522552255225522, + "grad_norm": 0.004113936331123114, + "learning_rate": 2.2723573257588147e-05, + "loss": 0.0032, + "num_input_tokens_seen": 22105024, + "step": 104740 + }, + { + "epoch": 11.523102310231023, + "grad_norm": 0.027201922610402107, + "learning_rate": 2.27211831710133e-05, + "loss": 0.0923, + "num_input_tokens_seen": 22106112, + "step": 104745 + }, + { + "epoch": 11.523652365236524, + "grad_norm": 3.8762056827545166, + "learning_rate": 2.2718793105441227e-05, + "loss": 0.1191, + "num_input_tokens_seen": 22107168, + "step": 104750 + }, + { + "epoch": 11.524202420242025, + "grad_norm": 0.1164214164018631, + "learning_rate": 2.2716403060893927e-05, + "loss": 0.0048, + "num_input_tokens_seen": 22108192, + "step": 104755 + }, + { + "epoch": 11.524752475247524, + "grad_norm": 0.15447431802749634, + "learning_rate": 2.271401303739346e-05, + "loss": 0.012, + "num_input_tokens_seen": 22109312, + "step": 104760 + }, + { + "epoch": 11.525302530253025, + "grad_norm": 0.02150763012468815, + "learning_rate": 2.271162303496183e-05, + "loss": 0.0015, + "num_input_tokens_seen": 22110368, + "step": 104765 + }, + { + "epoch": 11.525852585258527, + "grad_norm": 0.008245549164712429, + "learning_rate": 2.2709233053621075e-05, + "loss": 0.1023, + "num_input_tokens_seen": 22111328, + "step": 104770 + }, + { + "epoch": 11.526402640264026, + "grad_norm": 1.3705329895019531, + "learning_rate": 2.2706843093393224e-05, + "loss": 0.0412, + "num_input_tokens_seen": 22112384, + "step": 104775 + }, + { + "epoch": 11.526952695269527, + "grad_norm": 0.09471653401851654, + "learning_rate": 2.270445315430029e-05, + "loss": 0.0468, + "num_input_tokens_seen": 22113344, + "step": 104780 + }, + { + "epoch": 11.527502750275028, + "grad_norm": 0.785334050655365, + "learning_rate": 2.2702063236364316e-05, + "loss": 0.0306, + "num_input_tokens_seen": 22114368, + "step": 104785 + }, + { + "epoch": 11.528052805280527, + "grad_norm": 0.045933254063129425, + "learning_rate": 2.2699673339607328e-05, + "loss": 0.057, + "num_input_tokens_seen": 22115520, + "step": 104790 + }, + { + "epoch": 11.528602860286028, + "grad_norm": 1.9228111505508423, + "learning_rate": 2.2697283464051334e-05, + "loss": 0.1989, + "num_input_tokens_seen": 22116576, + "step": 104795 + }, + { + "epoch": 11.52915291529153, + "grad_norm": 0.06329164654016495, + "learning_rate": 2.2694893609718377e-05, + "loss": 0.0072, + "num_input_tokens_seen": 22117632, + "step": 104800 + }, + { + "epoch": 11.52970297029703, + "grad_norm": 0.009301532991230488, + "learning_rate": 2.269250377663048e-05, + "loss": 0.0025, + "num_input_tokens_seen": 22118688, + "step": 104805 + }, + { + "epoch": 11.53025302530253, + "grad_norm": 0.007733938284218311, + "learning_rate": 2.2690113964809677e-05, + "loss": 0.0248, + "num_input_tokens_seen": 22119744, + "step": 104810 + }, + { + "epoch": 11.530803080308031, + "grad_norm": 0.029164627194404602, + "learning_rate": 2.268772417427798e-05, + "loss": 0.0509, + "num_input_tokens_seen": 22120768, + "step": 104815 + }, + { + "epoch": 11.531353135313532, + "grad_norm": 0.03815784677863121, + "learning_rate": 2.268533440505741e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22121824, + "step": 104820 + }, + { + "epoch": 11.531903190319031, + "grad_norm": 0.24495528638362885, + "learning_rate": 2.2682944657170014e-05, + "loss": 0.005, + "num_input_tokens_seen": 22122944, + "step": 104825 + }, + { + "epoch": 11.532453245324533, + "grad_norm": 0.15702275931835175, + "learning_rate": 2.2680554930637798e-05, + "loss": 0.0149, + "num_input_tokens_seen": 22123968, + "step": 104830 + }, + { + "epoch": 11.533003300330034, + "grad_norm": 1.5198220014572144, + "learning_rate": 2.2678165225482803e-05, + "loss": 0.0105, + "num_input_tokens_seen": 22125088, + "step": 104835 + }, + { + "epoch": 11.533553355335533, + "grad_norm": 0.06068412587046623, + "learning_rate": 2.2675775541727042e-05, + "loss": 0.0092, + "num_input_tokens_seen": 22126176, + "step": 104840 + }, + { + "epoch": 11.534103410341034, + "grad_norm": 0.02711765468120575, + "learning_rate": 2.2673385879392538e-05, + "loss": 0.0028, + "num_input_tokens_seen": 22127200, + "step": 104845 + }, + { + "epoch": 11.534653465346535, + "grad_norm": 0.04650790989398956, + "learning_rate": 2.2670996238501326e-05, + "loss": 0.009, + "num_input_tokens_seen": 22128256, + "step": 104850 + }, + { + "epoch": 11.535203520352034, + "grad_norm": 0.04492982104420662, + "learning_rate": 2.2668606619075424e-05, + "loss": 0.0526, + "num_input_tokens_seen": 22129344, + "step": 104855 + }, + { + "epoch": 11.535753575357536, + "grad_norm": 0.09519223123788834, + "learning_rate": 2.2666217021136846e-05, + "loss": 0.0165, + "num_input_tokens_seen": 22130336, + "step": 104860 + }, + { + "epoch": 11.536303630363037, + "grad_norm": 0.004152802284806967, + "learning_rate": 2.2663827444707637e-05, + "loss": 0.005, + "num_input_tokens_seen": 22131360, + "step": 104865 + }, + { + "epoch": 11.536853685368538, + "grad_norm": 0.012812595814466476, + "learning_rate": 2.2661437889809803e-05, + "loss": 0.0378, + "num_input_tokens_seen": 22132384, + "step": 104870 + }, + { + "epoch": 11.537403740374037, + "grad_norm": 0.03088020719587803, + "learning_rate": 2.2659048356465375e-05, + "loss": 0.0042, + "num_input_tokens_seen": 22133504, + "step": 104875 + }, + { + "epoch": 11.537953795379538, + "grad_norm": 0.016658322885632515, + "learning_rate": 2.2656658844696378e-05, + "loss": 0.0024, + "num_input_tokens_seen": 22134496, + "step": 104880 + }, + { + "epoch": 11.53850385038504, + "grad_norm": 1.5826977491378784, + "learning_rate": 2.2654269354524823e-05, + "loss": 0.1567, + "num_input_tokens_seen": 22135552, + "step": 104885 + }, + { + "epoch": 11.539053905390539, + "grad_norm": 0.03579457476735115, + "learning_rate": 2.2651879885972758e-05, + "loss": 0.0699, + "num_input_tokens_seen": 22136544, + "step": 104890 + }, + { + "epoch": 11.53960396039604, + "grad_norm": 0.07068092375993729, + "learning_rate": 2.2649490439062174e-05, + "loss": 0.0219, + "num_input_tokens_seen": 22137568, + "step": 104895 + }, + { + "epoch": 11.54015401540154, + "grad_norm": 0.012563781812787056, + "learning_rate": 2.2647101013815116e-05, + "loss": 0.002, + "num_input_tokens_seen": 22138624, + "step": 104900 + }, + { + "epoch": 11.54070407040704, + "grad_norm": 0.05703308805823326, + "learning_rate": 2.26447116102536e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22139680, + "step": 104905 + }, + { + "epoch": 11.541254125412541, + "grad_norm": 0.01434366125613451, + "learning_rate": 2.2642322228399644e-05, + "loss": 0.0348, + "num_input_tokens_seen": 22140736, + "step": 104910 + }, + { + "epoch": 11.541804180418042, + "grad_norm": 1.4582140445709229, + "learning_rate": 2.2639932868275274e-05, + "loss": 0.0449, + "num_input_tokens_seen": 22141792, + "step": 104915 + }, + { + "epoch": 11.542354235423542, + "grad_norm": 0.007573124021291733, + "learning_rate": 2.2637543529902503e-05, + "loss": 0.0218, + "num_input_tokens_seen": 22142848, + "step": 104920 + }, + { + "epoch": 11.542904290429043, + "grad_norm": 0.328748494386673, + "learning_rate": 2.263515421330337e-05, + "loss": 0.014, + "num_input_tokens_seen": 22143872, + "step": 104925 + }, + { + "epoch": 11.543454345434544, + "grad_norm": 1.0456904172897339, + "learning_rate": 2.2632764918499885e-05, + "loss": 0.069, + "num_input_tokens_seen": 22144864, + "step": 104930 + }, + { + "epoch": 11.544004400440045, + "grad_norm": 0.3830778896808624, + "learning_rate": 2.2630375645514062e-05, + "loss": 0.0219, + "num_input_tokens_seen": 22145920, + "step": 104935 + }, + { + "epoch": 11.544554455445544, + "grad_norm": 0.03381367400288582, + "learning_rate": 2.2627986394367938e-05, + "loss": 0.0857, + "num_input_tokens_seen": 22146976, + "step": 104940 + }, + { + "epoch": 11.545104510451045, + "grad_norm": 0.18720318377017975, + "learning_rate": 2.2625597165083514e-05, + "loss": 0.0701, + "num_input_tokens_seen": 22147968, + "step": 104945 + }, + { + "epoch": 11.545654565456546, + "grad_norm": 0.02319221943616867, + "learning_rate": 2.2623207957682836e-05, + "loss": 0.0189, + "num_input_tokens_seen": 22149088, + "step": 104950 + }, + { + "epoch": 11.546204620462046, + "grad_norm": 0.1522791087627411, + "learning_rate": 2.2620818772187904e-05, + "loss": 0.0215, + "num_input_tokens_seen": 22150048, + "step": 104955 + }, + { + "epoch": 11.546754675467547, + "grad_norm": 0.009465381503105164, + "learning_rate": 2.2618429608620738e-05, + "loss": 0.0095, + "num_input_tokens_seen": 22151072, + "step": 104960 + }, + { + "epoch": 11.547304730473048, + "grad_norm": 0.005717698484659195, + "learning_rate": 2.2616040467003373e-05, + "loss": 0.0008, + "num_input_tokens_seen": 22152096, + "step": 104965 + }, + { + "epoch": 11.547854785478547, + "grad_norm": 0.05486684665083885, + "learning_rate": 2.2613651347357813e-05, + "loss": 0.0036, + "num_input_tokens_seen": 22153152, + "step": 104970 + }, + { + "epoch": 11.548404840484048, + "grad_norm": 0.05054735764861107, + "learning_rate": 2.2611262249706083e-05, + "loss": 0.0311, + "num_input_tokens_seen": 22154208, + "step": 104975 + }, + { + "epoch": 11.54895489548955, + "grad_norm": 0.02500978298485279, + "learning_rate": 2.260887317407021e-05, + "loss": 0.0226, + "num_input_tokens_seen": 22155232, + "step": 104980 + }, + { + "epoch": 11.549504950495049, + "grad_norm": 0.5110983848571777, + "learning_rate": 2.2606484120472196e-05, + "loss": 0.0262, + "num_input_tokens_seen": 22156352, + "step": 104985 + }, + { + "epoch": 11.55005500550055, + "grad_norm": 0.05136268958449364, + "learning_rate": 2.2604095088934074e-05, + "loss": 0.0062, + "num_input_tokens_seen": 22157408, + "step": 104990 + }, + { + "epoch": 11.55060506050605, + "grad_norm": 0.009270544163882732, + "learning_rate": 2.260170607947785e-05, + "loss": 0.0058, + "num_input_tokens_seen": 22158432, + "step": 104995 + }, + { + "epoch": 11.551155115511552, + "grad_norm": 0.013980313204228878, + "learning_rate": 2.2599317092125562e-05, + "loss": 0.0098, + "num_input_tokens_seen": 22159552, + "step": 105000 + }, + { + "epoch": 11.551705170517051, + "grad_norm": 0.053568169474601746, + "learning_rate": 2.2596928126899217e-05, + "loss": 0.0402, + "num_input_tokens_seen": 22160640, + "step": 105005 + }, + { + "epoch": 11.552255225522552, + "grad_norm": 0.013396987691521645, + "learning_rate": 2.2594539183820822e-05, + "loss": 0.0412, + "num_input_tokens_seen": 22161728, + "step": 105010 + }, + { + "epoch": 11.552805280528053, + "grad_norm": 0.04134920611977577, + "learning_rate": 2.2592150262912413e-05, + "loss": 0.0344, + "num_input_tokens_seen": 22162784, + "step": 105015 + }, + { + "epoch": 11.553355335533553, + "grad_norm": 0.011716458946466446, + "learning_rate": 2.258976136419599e-05, + "loss": 0.1362, + "num_input_tokens_seen": 22163872, + "step": 105020 + }, + { + "epoch": 11.553905390539054, + "grad_norm": 0.48600494861602783, + "learning_rate": 2.2587372487693595e-05, + "loss": 0.0126, + "num_input_tokens_seen": 22164864, + "step": 105025 + }, + { + "epoch": 11.554455445544555, + "grad_norm": 1.5368329286575317, + "learning_rate": 2.258498363342722e-05, + "loss": 0.013, + "num_input_tokens_seen": 22165888, + "step": 105030 + }, + { + "epoch": 11.555005500550054, + "grad_norm": 0.010210815817117691, + "learning_rate": 2.258259480141889e-05, + "loss": 0.0255, + "num_input_tokens_seen": 22166944, + "step": 105035 + }, + { + "epoch": 11.555555555555555, + "grad_norm": 1.2004283666610718, + "learning_rate": 2.2580205991690638e-05, + "loss": 0.0173, + "num_input_tokens_seen": 22168064, + "step": 105040 + }, + { + "epoch": 11.556105610561056, + "grad_norm": 1.416906476020813, + "learning_rate": 2.2577817204264454e-05, + "loss": 0.1748, + "num_input_tokens_seen": 22169120, + "step": 105045 + }, + { + "epoch": 11.556655665566556, + "grad_norm": 0.009368110448122025, + "learning_rate": 2.2575428439162362e-05, + "loss": 0.0106, + "num_input_tokens_seen": 22170208, + "step": 105050 + }, + { + "epoch": 11.557205720572057, + "grad_norm": 0.008644678629934788, + "learning_rate": 2.25730396964064e-05, + "loss": 0.0023, + "num_input_tokens_seen": 22171168, + "step": 105055 + }, + { + "epoch": 11.557755775577558, + "grad_norm": 0.014431841671466827, + "learning_rate": 2.2570650976018542e-05, + "loss": 0.0041, + "num_input_tokens_seen": 22172224, + "step": 105060 + }, + { + "epoch": 11.558305830583059, + "grad_norm": 0.0852673351764679, + "learning_rate": 2.256826227802085e-05, + "loss": 0.0318, + "num_input_tokens_seen": 22173280, + "step": 105065 + }, + { + "epoch": 11.558855885588558, + "grad_norm": 0.022289616987109184, + "learning_rate": 2.2565873602435307e-05, + "loss": 0.0281, + "num_input_tokens_seen": 22174336, + "step": 105070 + }, + { + "epoch": 11.55940594059406, + "grad_norm": 0.26855695247650146, + "learning_rate": 2.256348494928394e-05, + "loss": 0.0406, + "num_input_tokens_seen": 22175456, + "step": 105075 + }, + { + "epoch": 11.55995599559956, + "grad_norm": 0.0060813166201114655, + "learning_rate": 2.256109631858877e-05, + "loss": 0.0309, + "num_input_tokens_seen": 22176512, + "step": 105080 + }, + { + "epoch": 11.56050605060506, + "grad_norm": 0.005320973694324493, + "learning_rate": 2.2558707710371795e-05, + "loss": 0.004, + "num_input_tokens_seen": 22177536, + "step": 105085 + }, + { + "epoch": 11.561056105610561, + "grad_norm": 0.4060281813144684, + "learning_rate": 2.2556319124655045e-05, + "loss": 0.0051, + "num_input_tokens_seen": 22178528, + "step": 105090 + }, + { + "epoch": 11.561606160616062, + "grad_norm": 0.0513509064912796, + "learning_rate": 2.2553930561460536e-05, + "loss": 0.147, + "num_input_tokens_seen": 22179584, + "step": 105095 + }, + { + "epoch": 11.562156215621561, + "grad_norm": 0.018393052741885185, + "learning_rate": 2.2551542020810262e-05, + "loss": 0.0044, + "num_input_tokens_seen": 22180608, + "step": 105100 + }, + { + "epoch": 11.562706270627062, + "grad_norm": 0.03344148024916649, + "learning_rate": 2.2549153502726257e-05, + "loss": 0.1558, + "num_input_tokens_seen": 22181632, + "step": 105105 + }, + { + "epoch": 11.563256325632564, + "grad_norm": 0.06927742809057236, + "learning_rate": 2.254676500723052e-05, + "loss": 0.0065, + "num_input_tokens_seen": 22182688, + "step": 105110 + }, + { + "epoch": 11.563806380638063, + "grad_norm": 0.028990954160690308, + "learning_rate": 2.254437653434508e-05, + "loss": 0.0024, + "num_input_tokens_seen": 22183744, + "step": 105115 + }, + { + "epoch": 11.564356435643564, + "grad_norm": 0.04243846610188484, + "learning_rate": 2.2541988084091953e-05, + "loss": 0.0254, + "num_input_tokens_seen": 22184768, + "step": 105120 + }, + { + "epoch": 11.564906490649065, + "grad_norm": 0.019264737144112587, + "learning_rate": 2.2539599656493127e-05, + "loss": 0.0039, + "num_input_tokens_seen": 22185888, + "step": 105125 + }, + { + "epoch": 11.565456545654566, + "grad_norm": 0.12287202477455139, + "learning_rate": 2.2537211251570637e-05, + "loss": 0.0042, + "num_input_tokens_seen": 22186944, + "step": 105130 + }, + { + "epoch": 11.566006600660065, + "grad_norm": 0.022729115560650826, + "learning_rate": 2.2534822869346482e-05, + "loss": 0.0013, + "num_input_tokens_seen": 22188000, + "step": 105135 + }, + { + "epoch": 11.566556655665567, + "grad_norm": 0.03882947191596031, + "learning_rate": 2.253243450984269e-05, + "loss": 0.0916, + "num_input_tokens_seen": 22189120, + "step": 105140 + }, + { + "epoch": 11.567106710671068, + "grad_norm": 0.1572123020887375, + "learning_rate": 2.2530046173081268e-05, + "loss": 0.0117, + "num_input_tokens_seen": 22190176, + "step": 105145 + }, + { + "epoch": 11.567656765676567, + "grad_norm": 0.042322900146245956, + "learning_rate": 2.2527657859084214e-05, + "loss": 0.012, + "num_input_tokens_seen": 22191200, + "step": 105150 + }, + { + "epoch": 11.568206820682068, + "grad_norm": 0.8122115135192871, + "learning_rate": 2.2525269567873562e-05, + "loss": 0.0246, + "num_input_tokens_seen": 22192256, + "step": 105155 + }, + { + "epoch": 11.56875687568757, + "grad_norm": 0.03739160671830177, + "learning_rate": 2.2522881299471306e-05, + "loss": 0.0145, + "num_input_tokens_seen": 22193248, + "step": 105160 + }, + { + "epoch": 11.569306930693068, + "grad_norm": 0.006595369894057512, + "learning_rate": 2.2520493053899468e-05, + "loss": 0.0081, + "num_input_tokens_seen": 22194272, + "step": 105165 + }, + { + "epoch": 11.56985698569857, + "grad_norm": 1.6988595724105835, + "learning_rate": 2.251810483118006e-05, + "loss": 0.1029, + "num_input_tokens_seen": 22195328, + "step": 105170 + }, + { + "epoch": 11.57040704070407, + "grad_norm": 0.013815921731293201, + "learning_rate": 2.251571663133507e-05, + "loss": 0.0017, + "num_input_tokens_seen": 22196352, + "step": 105175 + }, + { + "epoch": 11.570957095709572, + "grad_norm": 0.008899775333702564, + "learning_rate": 2.251332845438655e-05, + "loss": 0.2051, + "num_input_tokens_seen": 22197408, + "step": 105180 + }, + { + "epoch": 11.571507150715071, + "grad_norm": 0.4808858633041382, + "learning_rate": 2.251094030035647e-05, + "loss": 0.0086, + "num_input_tokens_seen": 22198464, + "step": 105185 + }, + { + "epoch": 11.572057205720572, + "grad_norm": 0.3598402738571167, + "learning_rate": 2.2508552169266868e-05, + "loss": 0.0241, + "num_input_tokens_seen": 22199520, + "step": 105190 + }, + { + "epoch": 11.572607260726073, + "grad_norm": 2.678339719772339, + "learning_rate": 2.2506164061139753e-05, + "loss": 0.0208, + "num_input_tokens_seen": 22200608, + "step": 105195 + }, + { + "epoch": 11.573157315731573, + "grad_norm": 0.17516222596168518, + "learning_rate": 2.2503775975997116e-05, + "loss": 0.008, + "num_input_tokens_seen": 22201632, + "step": 105200 + }, + { + "epoch": 11.573707370737074, + "grad_norm": 0.019143374636769295, + "learning_rate": 2.250138791386098e-05, + "loss": 0.0933, + "num_input_tokens_seen": 22202656, + "step": 105205 + }, + { + "epoch": 11.574257425742575, + "grad_norm": 0.057737309485673904, + "learning_rate": 2.249899987475335e-05, + "loss": 0.0723, + "num_input_tokens_seen": 22203744, + "step": 105210 + }, + { + "epoch": 11.574807480748074, + "grad_norm": 1.4047104120254517, + "learning_rate": 2.249661185869625e-05, + "loss": 0.0957, + "num_input_tokens_seen": 22204864, + "step": 105215 + }, + { + "epoch": 11.575357535753575, + "grad_norm": 1.662953495979309, + "learning_rate": 2.249422386571167e-05, + "loss": 0.1087, + "num_input_tokens_seen": 22205888, + "step": 105220 + }, + { + "epoch": 11.575907590759076, + "grad_norm": 2.057265281677246, + "learning_rate": 2.249183589582162e-05, + "loss": 0.0925, + "num_input_tokens_seen": 22207008, + "step": 105225 + }, + { + "epoch": 11.576457645764577, + "grad_norm": 0.054121725261211395, + "learning_rate": 2.2489447949048125e-05, + "loss": 0.0027, + "num_input_tokens_seen": 22208032, + "step": 105230 + }, + { + "epoch": 11.577007700770077, + "grad_norm": 0.03183133155107498, + "learning_rate": 2.248706002541318e-05, + "loss": 0.0541, + "num_input_tokens_seen": 22209056, + "step": 105235 + }, + { + "epoch": 11.577557755775578, + "grad_norm": 0.010805089958012104, + "learning_rate": 2.248467212493879e-05, + "loss": 0.0543, + "num_input_tokens_seen": 22210144, + "step": 105240 + }, + { + "epoch": 11.578107810781079, + "grad_norm": 1.366800308227539, + "learning_rate": 2.2482284247646978e-05, + "loss": 0.131, + "num_input_tokens_seen": 22211232, + "step": 105245 + }, + { + "epoch": 11.578657865786578, + "grad_norm": 0.15059374272823334, + "learning_rate": 2.2479896393559738e-05, + "loss": 0.0067, + "num_input_tokens_seen": 22212224, + "step": 105250 + }, + { + "epoch": 11.57920792079208, + "grad_norm": 0.11470203846693039, + "learning_rate": 2.2477508562699093e-05, + "loss": 0.0149, + "num_input_tokens_seen": 22213216, + "step": 105255 + }, + { + "epoch": 11.57975797579758, + "grad_norm": 0.26712921261787415, + "learning_rate": 2.2475120755087035e-05, + "loss": 0.0061, + "num_input_tokens_seen": 22214304, + "step": 105260 + }, + { + "epoch": 11.58030803080308, + "grad_norm": 0.023331165313720703, + "learning_rate": 2.2472732970745575e-05, + "loss": 0.0013, + "num_input_tokens_seen": 22215360, + "step": 105265 + }, + { + "epoch": 11.58085808580858, + "grad_norm": 0.02314220368862152, + "learning_rate": 2.2470345209696732e-05, + "loss": 0.0047, + "num_input_tokens_seen": 22216512, + "step": 105270 + }, + { + "epoch": 11.581408140814082, + "grad_norm": 1.563311219215393, + "learning_rate": 2.2467957471962494e-05, + "loss": 0.0652, + "num_input_tokens_seen": 22217536, + "step": 105275 + }, + { + "epoch": 11.581958195819581, + "grad_norm": 2.422715187072754, + "learning_rate": 2.246556975756488e-05, + "loss": 0.0764, + "num_input_tokens_seen": 22218560, + "step": 105280 + }, + { + "epoch": 11.582508250825082, + "grad_norm": 0.0520993173122406, + "learning_rate": 2.24631820665259e-05, + "loss": 0.006, + "num_input_tokens_seen": 22219648, + "step": 105285 + }, + { + "epoch": 11.583058305830583, + "grad_norm": 0.007219718303531408, + "learning_rate": 2.2460794398867544e-05, + "loss": 0.0112, + "num_input_tokens_seen": 22220672, + "step": 105290 + }, + { + "epoch": 11.583608360836084, + "grad_norm": 0.3764920234680176, + "learning_rate": 2.245840675461183e-05, + "loss": 0.0563, + "num_input_tokens_seen": 22221856, + "step": 105295 + }, + { + "epoch": 11.584158415841584, + "grad_norm": 0.005718632601201534, + "learning_rate": 2.2456019133780758e-05, + "loss": 0.0031, + "num_input_tokens_seen": 22222880, + "step": 105300 + }, + { + "epoch": 11.584708470847085, + "grad_norm": 0.045656342059373856, + "learning_rate": 2.2453631536396342e-05, + "loss": 0.0253, + "num_input_tokens_seen": 22224000, + "step": 105305 + }, + { + "epoch": 11.585258525852586, + "grad_norm": 0.04794956371188164, + "learning_rate": 2.2451243962480587e-05, + "loss": 0.0081, + "num_input_tokens_seen": 22225088, + "step": 105310 + }, + { + "epoch": 11.585808580858085, + "grad_norm": 0.8526447415351868, + "learning_rate": 2.244885641205548e-05, + "loss": 0.1352, + "num_input_tokens_seen": 22226112, + "step": 105315 + }, + { + "epoch": 11.586358635863586, + "grad_norm": 0.1655264049768448, + "learning_rate": 2.2446468885143052e-05, + "loss": 0.0058, + "num_input_tokens_seen": 22227168, + "step": 105320 + }, + { + "epoch": 11.586908690869087, + "grad_norm": 0.139997199177742, + "learning_rate": 2.2444081381765283e-05, + "loss": 0.0372, + "num_input_tokens_seen": 22228256, + "step": 105325 + }, + { + "epoch": 11.587458745874587, + "grad_norm": 0.022584520280361176, + "learning_rate": 2.2441693901944202e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22229344, + "step": 105330 + }, + { + "epoch": 11.588008800880088, + "grad_norm": 0.02898215875029564, + "learning_rate": 2.2439306445701798e-05, + "loss": 0.0735, + "num_input_tokens_seen": 22230304, + "step": 105335 + }, + { + "epoch": 11.588558855885589, + "grad_norm": 0.17765894532203674, + "learning_rate": 2.243691901306007e-05, + "loss": 0.0195, + "num_input_tokens_seen": 22231360, + "step": 105340 + }, + { + "epoch": 11.589108910891088, + "grad_norm": 1.315474510192871, + "learning_rate": 2.243453160404104e-05, + "loss": 0.0606, + "num_input_tokens_seen": 22232416, + "step": 105345 + }, + { + "epoch": 11.58965896589659, + "grad_norm": 3.0454695224761963, + "learning_rate": 2.243214421866669e-05, + "loss": 0.0497, + "num_input_tokens_seen": 22233504, + "step": 105350 + }, + { + "epoch": 11.59020902090209, + "grad_norm": 0.06775029003620148, + "learning_rate": 2.242975685695904e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22234592, + "step": 105355 + }, + { + "epoch": 11.590759075907592, + "grad_norm": 0.018763601779937744, + "learning_rate": 2.2427369518940092e-05, + "loss": 0.064, + "num_input_tokens_seen": 22235648, + "step": 105360 + }, + { + "epoch": 11.591309130913091, + "grad_norm": 0.13902835547924042, + "learning_rate": 2.2424982204631834e-05, + "loss": 0.0553, + "num_input_tokens_seen": 22236736, + "step": 105365 + }, + { + "epoch": 11.591859185918592, + "grad_norm": 0.6566509008407593, + "learning_rate": 2.2422594914056294e-05, + "loss": 0.0337, + "num_input_tokens_seen": 22237856, + "step": 105370 + }, + { + "epoch": 11.592409240924093, + "grad_norm": 0.8644158244132996, + "learning_rate": 2.2420207647235448e-05, + "loss": 0.0329, + "num_input_tokens_seen": 22238976, + "step": 105375 + }, + { + "epoch": 11.592959295929592, + "grad_norm": 0.9138811826705933, + "learning_rate": 2.2417820404191314e-05, + "loss": 0.0195, + "num_input_tokens_seen": 22240000, + "step": 105380 + }, + { + "epoch": 11.593509350935093, + "grad_norm": 0.06989933550357819, + "learning_rate": 2.2415433184945896e-05, + "loss": 0.0396, + "num_input_tokens_seen": 22241024, + "step": 105385 + }, + { + "epoch": 11.594059405940595, + "grad_norm": 0.5571756958961487, + "learning_rate": 2.2413045989521182e-05, + "loss": 0.051, + "num_input_tokens_seen": 22242112, + "step": 105390 + }, + { + "epoch": 11.594609460946094, + "grad_norm": 0.045083776116371155, + "learning_rate": 2.2410658817939185e-05, + "loss": 0.1649, + "num_input_tokens_seen": 22243136, + "step": 105395 + }, + { + "epoch": 11.595159515951595, + "grad_norm": 0.0065956455655395985, + "learning_rate": 2.2408271670221895e-05, + "loss": 0.0072, + "num_input_tokens_seen": 22244192, + "step": 105400 + }, + { + "epoch": 11.595709570957096, + "grad_norm": 0.013022289611399174, + "learning_rate": 2.2405884546391337e-05, + "loss": 0.0044, + "num_input_tokens_seen": 22245248, + "step": 105405 + }, + { + "epoch": 11.596259625962595, + "grad_norm": 0.029368797317147255, + "learning_rate": 2.2403497446469486e-05, + "loss": 0.0039, + "num_input_tokens_seen": 22246336, + "step": 105410 + }, + { + "epoch": 11.596809680968097, + "grad_norm": 0.0285930372774601, + "learning_rate": 2.2401110370478352e-05, + "loss": 0.0056, + "num_input_tokens_seen": 22247328, + "step": 105415 + }, + { + "epoch": 11.597359735973598, + "grad_norm": 0.03173408657312393, + "learning_rate": 2.2398723318439942e-05, + "loss": 0.011, + "num_input_tokens_seen": 22248416, + "step": 105420 + }, + { + "epoch": 11.597909790979099, + "grad_norm": 0.05768664926290512, + "learning_rate": 2.2396336290376253e-05, + "loss": 0.0087, + "num_input_tokens_seen": 22249600, + "step": 105425 + }, + { + "epoch": 11.598459845984598, + "grad_norm": 0.0210641548037529, + "learning_rate": 2.2393949286309276e-05, + "loss": 0.0078, + "num_input_tokens_seen": 22250624, + "step": 105430 + }, + { + "epoch": 11.599009900990099, + "grad_norm": 1.550532341003418, + "learning_rate": 2.2391562306261022e-05, + "loss": 0.0675, + "num_input_tokens_seen": 22251648, + "step": 105435 + }, + { + "epoch": 11.5995599559956, + "grad_norm": 0.07675035297870636, + "learning_rate": 2.238917535025348e-05, + "loss": 0.0654, + "num_input_tokens_seen": 22252800, + "step": 105440 + }, + { + "epoch": 11.6001100110011, + "grad_norm": 0.03257696330547333, + "learning_rate": 2.238678841830867e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22253856, + "step": 105445 + }, + { + "epoch": 11.6006600660066, + "grad_norm": 0.8640432357788086, + "learning_rate": 2.2384401510448568e-05, + "loss": 0.0411, + "num_input_tokens_seen": 22254944, + "step": 105450 + }, + { + "epoch": 11.601210121012102, + "grad_norm": 0.14421842992305756, + "learning_rate": 2.2382014626695177e-05, + "loss": 0.003, + "num_input_tokens_seen": 22256032, + "step": 105455 + }, + { + "epoch": 11.601760176017601, + "grad_norm": 0.015038266777992249, + "learning_rate": 2.2379627767070512e-05, + "loss": 0.0179, + "num_input_tokens_seen": 22257024, + "step": 105460 + }, + { + "epoch": 11.602310231023102, + "grad_norm": 0.2733815014362335, + "learning_rate": 2.2377240931596552e-05, + "loss": 0.0086, + "num_input_tokens_seen": 22258080, + "step": 105465 + }, + { + "epoch": 11.602860286028603, + "grad_norm": 0.10359737277030945, + "learning_rate": 2.2374854120295306e-05, + "loss": 0.0644, + "num_input_tokens_seen": 22259136, + "step": 105470 + }, + { + "epoch": 11.603410341034103, + "grad_norm": 1.5257936716079712, + "learning_rate": 2.2372467333188778e-05, + "loss": 0.0376, + "num_input_tokens_seen": 22260224, + "step": 105475 + }, + { + "epoch": 11.603960396039604, + "grad_norm": 0.036861907690763474, + "learning_rate": 2.237008057029894e-05, + "loss": 0.0027, + "num_input_tokens_seen": 22261248, + "step": 105480 + }, + { + "epoch": 11.604510451045105, + "grad_norm": 0.15563955903053284, + "learning_rate": 2.2367693831647827e-05, + "loss": 0.0464, + "num_input_tokens_seen": 22262208, + "step": 105485 + }, + { + "epoch": 11.605060506050606, + "grad_norm": 0.009307273663580418, + "learning_rate": 2.2365307117257403e-05, + "loss": 0.0201, + "num_input_tokens_seen": 22263296, + "step": 105490 + }, + { + "epoch": 11.605610561056105, + "grad_norm": 0.03935739025473595, + "learning_rate": 2.2362920427149686e-05, + "loss": 0.0017, + "num_input_tokens_seen": 22264320, + "step": 105495 + }, + { + "epoch": 11.606160616061606, + "grad_norm": 0.1959841251373291, + "learning_rate": 2.2360533761346668e-05, + "loss": 0.011, + "num_input_tokens_seen": 22265344, + "step": 105500 + }, + { + "epoch": 11.606710671067107, + "grad_norm": 0.02756638079881668, + "learning_rate": 2.2358147119870338e-05, + "loss": 0.012, + "num_input_tokens_seen": 22266400, + "step": 105505 + }, + { + "epoch": 11.607260726072607, + "grad_norm": 0.07892326265573502, + "learning_rate": 2.2355760502742702e-05, + "loss": 0.0021, + "num_input_tokens_seen": 22267488, + "step": 105510 + }, + { + "epoch": 11.607810781078108, + "grad_norm": 0.09791676700115204, + "learning_rate": 2.2353373909985745e-05, + "loss": 0.0062, + "num_input_tokens_seen": 22268544, + "step": 105515 + }, + { + "epoch": 11.608360836083609, + "grad_norm": 0.016270680353045464, + "learning_rate": 2.235098734162148e-05, + "loss": 0.0398, + "num_input_tokens_seen": 22269632, + "step": 105520 + }, + { + "epoch": 11.608910891089108, + "grad_norm": 0.2726239264011383, + "learning_rate": 2.2348600797671894e-05, + "loss": 0.0142, + "num_input_tokens_seen": 22270688, + "step": 105525 + }, + { + "epoch": 11.60946094609461, + "grad_norm": 0.39053162932395935, + "learning_rate": 2.2346214278158974e-05, + "loss": 0.0129, + "num_input_tokens_seen": 22271744, + "step": 105530 + }, + { + "epoch": 11.61001100110011, + "grad_norm": 1.0885331630706787, + "learning_rate": 2.2343827783104733e-05, + "loss": 0.0716, + "num_input_tokens_seen": 22272800, + "step": 105535 + }, + { + "epoch": 11.61056105610561, + "grad_norm": 0.7773166298866272, + "learning_rate": 2.234144131253114e-05, + "loss": 0.0125, + "num_input_tokens_seen": 22273856, + "step": 105540 + }, + { + "epoch": 11.61111111111111, + "grad_norm": 0.06453020870685577, + "learning_rate": 2.2339054866460226e-05, + "loss": 0.0035, + "num_input_tokens_seen": 22274944, + "step": 105545 + }, + { + "epoch": 11.611661166116612, + "grad_norm": 0.018229259178042412, + "learning_rate": 2.233666844491396e-05, + "loss": 0.0669, + "num_input_tokens_seen": 22276000, + "step": 105550 + }, + { + "epoch": 11.612211221122113, + "grad_norm": 0.13695858418941498, + "learning_rate": 2.2334282047914338e-05, + "loss": 0.0588, + "num_input_tokens_seen": 22277088, + "step": 105555 + }, + { + "epoch": 11.612761276127612, + "grad_norm": 2.484349012374878, + "learning_rate": 2.233189567548337e-05, + "loss": 0.0146, + "num_input_tokens_seen": 22278144, + "step": 105560 + }, + { + "epoch": 11.613311331133113, + "grad_norm": 0.027229098603129387, + "learning_rate": 2.2329509327643025e-05, + "loss": 0.0029, + "num_input_tokens_seen": 22279264, + "step": 105565 + }, + { + "epoch": 11.613861386138614, + "grad_norm": 0.0042962911538779736, + "learning_rate": 2.232712300441532e-05, + "loss": 0.0025, + "num_input_tokens_seen": 22280384, + "step": 105570 + }, + { + "epoch": 11.614411441144114, + "grad_norm": 0.020515134558081627, + "learning_rate": 2.2324736705822243e-05, + "loss": 0.0012, + "num_input_tokens_seen": 22281472, + "step": 105575 + }, + { + "epoch": 11.614961496149615, + "grad_norm": 0.07269971072673798, + "learning_rate": 2.2322350431885774e-05, + "loss": 0.0682, + "num_input_tokens_seen": 22282528, + "step": 105580 + }, + { + "epoch": 11.615511551155116, + "grad_norm": 0.09452694654464722, + "learning_rate": 2.231996418262792e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22283584, + "step": 105585 + }, + { + "epoch": 11.616061606160617, + "grad_norm": 0.013660503551363945, + "learning_rate": 2.231757795807067e-05, + "loss": 0.0323, + "num_input_tokens_seen": 22284640, + "step": 105590 + }, + { + "epoch": 11.616611661166116, + "grad_norm": 0.1802876591682434, + "learning_rate": 2.2315191758236025e-05, + "loss": 0.0059, + "num_input_tokens_seen": 22285632, + "step": 105595 + }, + { + "epoch": 11.617161716171617, + "grad_norm": 0.7689918279647827, + "learning_rate": 2.2312805583145963e-05, + "loss": 0.0193, + "num_input_tokens_seen": 22286720, + "step": 105600 + }, + { + "epoch": 11.617711771177119, + "grad_norm": 1.2943240404129028, + "learning_rate": 2.2310419432822477e-05, + "loss": 0.0831, + "num_input_tokens_seen": 22287808, + "step": 105605 + }, + { + "epoch": 11.618261826182618, + "grad_norm": 0.007348780054599047, + "learning_rate": 2.2308033307287572e-05, + "loss": 0.0011, + "num_input_tokens_seen": 22288832, + "step": 105610 + }, + { + "epoch": 11.618811881188119, + "grad_norm": 0.017129847779870033, + "learning_rate": 2.2305647206563235e-05, + "loss": 0.0243, + "num_input_tokens_seen": 22289984, + "step": 105615 + }, + { + "epoch": 11.61936193619362, + "grad_norm": 0.0170644149184227, + "learning_rate": 2.2303261130671445e-05, + "loss": 0.0576, + "num_input_tokens_seen": 22291072, + "step": 105620 + }, + { + "epoch": 11.61991199119912, + "grad_norm": 0.12466125935316086, + "learning_rate": 2.230087507963421e-05, + "loss": 0.0208, + "num_input_tokens_seen": 22292064, + "step": 105625 + }, + { + "epoch": 11.62046204620462, + "grad_norm": 0.07318686693906784, + "learning_rate": 2.2298489053473505e-05, + "loss": 0.0399, + "num_input_tokens_seen": 22293088, + "step": 105630 + }, + { + "epoch": 11.621012101210122, + "grad_norm": 0.038050685077905655, + "learning_rate": 2.2296103052211344e-05, + "loss": 0.0132, + "num_input_tokens_seen": 22294176, + "step": 105635 + }, + { + "epoch": 11.62156215621562, + "grad_norm": 0.020645223557949066, + "learning_rate": 2.22937170758697e-05, + "loss": 0.0043, + "num_input_tokens_seen": 22295264, + "step": 105640 + }, + { + "epoch": 11.622112211221122, + "grad_norm": 0.0054540447890758514, + "learning_rate": 2.229133112447056e-05, + "loss": 0.001, + "num_input_tokens_seen": 22296320, + "step": 105645 + }, + { + "epoch": 11.622662266226623, + "grad_norm": 0.09086287021636963, + "learning_rate": 2.228894519803593e-05, + "loss": 0.0494, + "num_input_tokens_seen": 22297376, + "step": 105650 + }, + { + "epoch": 11.623212321232124, + "grad_norm": 0.3974948823451996, + "learning_rate": 2.228655929658778e-05, + "loss": 0.0084, + "num_input_tokens_seen": 22298432, + "step": 105655 + }, + { + "epoch": 11.623762376237623, + "grad_norm": 1.3924280405044556, + "learning_rate": 2.228417342014812e-05, + "loss": 0.0392, + "num_input_tokens_seen": 22299552, + "step": 105660 + }, + { + "epoch": 11.624312431243125, + "grad_norm": 0.14726155996322632, + "learning_rate": 2.2281787568738926e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22300672, + "step": 105665 + }, + { + "epoch": 11.624862486248626, + "grad_norm": 0.060176633298397064, + "learning_rate": 2.227940174238219e-05, + "loss": 0.0184, + "num_input_tokens_seen": 22301792, + "step": 105670 + }, + { + "epoch": 11.625412541254125, + "grad_norm": 0.17724816501140594, + "learning_rate": 2.227701594109991e-05, + "loss": 0.0054, + "num_input_tokens_seen": 22302784, + "step": 105675 + }, + { + "epoch": 11.625962596259626, + "grad_norm": 0.013345202431082726, + "learning_rate": 2.2274630164914063e-05, + "loss": 0.1194, + "num_input_tokens_seen": 22303840, + "step": 105680 + }, + { + "epoch": 11.626512651265127, + "grad_norm": 0.015645409002900124, + "learning_rate": 2.2272244413846644e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22304896, + "step": 105685 + }, + { + "epoch": 11.627062706270626, + "grad_norm": 0.011502128094434738, + "learning_rate": 2.226985868791964e-05, + "loss": 0.1329, + "num_input_tokens_seen": 22305920, + "step": 105690 + }, + { + "epoch": 11.627612761276128, + "grad_norm": 0.11207189410924911, + "learning_rate": 2.2267472987155033e-05, + "loss": 0.0207, + "num_input_tokens_seen": 22307040, + "step": 105695 + }, + { + "epoch": 11.628162816281629, + "grad_norm": 0.13679692149162292, + "learning_rate": 2.226508731157482e-05, + "loss": 0.0521, + "num_input_tokens_seen": 22308096, + "step": 105700 + }, + { + "epoch": 11.628712871287128, + "grad_norm": 0.0540783554315567, + "learning_rate": 2.2262701661200982e-05, + "loss": 0.0399, + "num_input_tokens_seen": 22309184, + "step": 105705 + }, + { + "epoch": 11.629262926292629, + "grad_norm": 0.010708668269217014, + "learning_rate": 2.2260316036055517e-05, + "loss": 0.0175, + "num_input_tokens_seen": 22310176, + "step": 105710 + }, + { + "epoch": 11.62981298129813, + "grad_norm": 0.03339744731783867, + "learning_rate": 2.22579304361604e-05, + "loss": 0.0095, + "num_input_tokens_seen": 22311200, + "step": 105715 + }, + { + "epoch": 11.630363036303631, + "grad_norm": 0.012260722927749157, + "learning_rate": 2.2255544861537615e-05, + "loss": 0.0008, + "num_input_tokens_seen": 22312256, + "step": 105720 + }, + { + "epoch": 11.63091309130913, + "grad_norm": 0.010349931195378304, + "learning_rate": 2.2253159312209165e-05, + "loss": 0.0893, + "num_input_tokens_seen": 22313312, + "step": 105725 + }, + { + "epoch": 11.631463146314632, + "grad_norm": 0.0131881944835186, + "learning_rate": 2.2250773788197022e-05, + "loss": 0.0328, + "num_input_tokens_seen": 22314368, + "step": 105730 + }, + { + "epoch": 11.632013201320133, + "grad_norm": 0.4610837399959564, + "learning_rate": 2.2248388289523186e-05, + "loss": 0.0969, + "num_input_tokens_seen": 22315392, + "step": 105735 + }, + { + "epoch": 11.632563256325632, + "grad_norm": 0.018985547125339508, + "learning_rate": 2.224600281620963e-05, + "loss": 0.0045, + "num_input_tokens_seen": 22316480, + "step": 105740 + }, + { + "epoch": 11.633113311331133, + "grad_norm": 0.060101862996816635, + "learning_rate": 2.224361736827834e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22317536, + "step": 105745 + }, + { + "epoch": 11.633663366336634, + "grad_norm": 0.40877777338027954, + "learning_rate": 2.2241231945751318e-05, + "loss": 0.1265, + "num_input_tokens_seen": 22318624, + "step": 105750 + }, + { + "epoch": 11.634213421342134, + "grad_norm": 0.006575595587491989, + "learning_rate": 2.223884654865053e-05, + "loss": 0.1091, + "num_input_tokens_seen": 22319744, + "step": 105755 + }, + { + "epoch": 11.634763476347635, + "grad_norm": 0.03744541108608246, + "learning_rate": 2.2236461176997963e-05, + "loss": 0.1331, + "num_input_tokens_seen": 22320800, + "step": 105760 + }, + { + "epoch": 11.635313531353136, + "grad_norm": 0.07965563237667084, + "learning_rate": 2.223407583081562e-05, + "loss": 0.0054, + "num_input_tokens_seen": 22321920, + "step": 105765 + }, + { + "epoch": 11.635863586358635, + "grad_norm": 0.7698432803153992, + "learning_rate": 2.2231690510125462e-05, + "loss": 0.0156, + "num_input_tokens_seen": 22323040, + "step": 105770 + }, + { + "epoch": 11.636413641364136, + "grad_norm": 0.005743572022765875, + "learning_rate": 2.222930521494949e-05, + "loss": 0.0031, + "num_input_tokens_seen": 22324096, + "step": 105775 + }, + { + "epoch": 11.636963696369637, + "grad_norm": 0.05559111759066582, + "learning_rate": 2.2226919945309675e-05, + "loss": 0.0288, + "num_input_tokens_seen": 22325120, + "step": 105780 + }, + { + "epoch": 11.637513751375138, + "grad_norm": 1.704603672027588, + "learning_rate": 2.2224534701228013e-05, + "loss": 0.0469, + "num_input_tokens_seen": 22326144, + "step": 105785 + }, + { + "epoch": 11.638063806380638, + "grad_norm": 0.006837280001491308, + "learning_rate": 2.2222149482726488e-05, + "loss": 0.1111, + "num_input_tokens_seen": 22327264, + "step": 105790 + }, + { + "epoch": 11.638613861386139, + "grad_norm": 0.12013321369886398, + "learning_rate": 2.221976428982707e-05, + "loss": 0.011, + "num_input_tokens_seen": 22328384, + "step": 105795 + }, + { + "epoch": 11.63916391639164, + "grad_norm": 0.011585836298763752, + "learning_rate": 2.2217379122551753e-05, + "loss": 0.0031, + "num_input_tokens_seen": 22329408, + "step": 105800 + }, + { + "epoch": 11.63971397139714, + "grad_norm": 0.372477263212204, + "learning_rate": 2.2214993980922524e-05, + "loss": 0.0112, + "num_input_tokens_seen": 22330496, + "step": 105805 + }, + { + "epoch": 11.64026402640264, + "grad_norm": 0.06381388753652573, + "learning_rate": 2.2212608864961348e-05, + "loss": 0.0144, + "num_input_tokens_seen": 22331520, + "step": 105810 + }, + { + "epoch": 11.640814081408141, + "grad_norm": 0.9866716265678406, + "learning_rate": 2.2210223774690224e-05, + "loss": 0.014, + "num_input_tokens_seen": 22332576, + "step": 105815 + }, + { + "epoch": 11.64136413641364, + "grad_norm": 1.0445362329483032, + "learning_rate": 2.2207838710131127e-05, + "loss": 0.0232, + "num_input_tokens_seen": 22333600, + "step": 105820 + }, + { + "epoch": 11.641914191419142, + "grad_norm": 0.00966158788651228, + "learning_rate": 2.220545367130605e-05, + "loss": 0.018, + "num_input_tokens_seen": 22334624, + "step": 105825 + }, + { + "epoch": 11.642464246424643, + "grad_norm": 0.01816384866833687, + "learning_rate": 2.2203068658236957e-05, + "loss": 0.0365, + "num_input_tokens_seen": 22335648, + "step": 105830 + }, + { + "epoch": 11.643014301430142, + "grad_norm": 0.016922159120440483, + "learning_rate": 2.2200683670945835e-05, + "loss": 0.0123, + "num_input_tokens_seen": 22336704, + "step": 105835 + }, + { + "epoch": 11.643564356435643, + "grad_norm": 0.007538670673966408, + "learning_rate": 2.219829870945468e-05, + "loss": 0.1457, + "num_input_tokens_seen": 22337760, + "step": 105840 + }, + { + "epoch": 11.644114411441144, + "grad_norm": 1.6178213357925415, + "learning_rate": 2.2195913773785442e-05, + "loss": 0.0446, + "num_input_tokens_seen": 22338848, + "step": 105845 + }, + { + "epoch": 11.644664466446645, + "grad_norm": 0.3327811360359192, + "learning_rate": 2.219352886396014e-05, + "loss": 0.0151, + "num_input_tokens_seen": 22339872, + "step": 105850 + }, + { + "epoch": 11.645214521452145, + "grad_norm": 0.09807991236448288, + "learning_rate": 2.2191143980000727e-05, + "loss": 0.0684, + "num_input_tokens_seen": 22341056, + "step": 105855 + }, + { + "epoch": 11.645764576457646, + "grad_norm": 2.1256206035614014, + "learning_rate": 2.218875912192919e-05, + "loss": 0.0805, + "num_input_tokens_seen": 22342080, + "step": 105860 + }, + { + "epoch": 11.646314631463147, + "grad_norm": 1.978646993637085, + "learning_rate": 2.2186374289767522e-05, + "loss": 0.1051, + "num_input_tokens_seen": 22343104, + "step": 105865 + }, + { + "epoch": 11.646864686468646, + "grad_norm": 0.027410142123699188, + "learning_rate": 2.2183989483537682e-05, + "loss": 0.047, + "num_input_tokens_seen": 22344192, + "step": 105870 + }, + { + "epoch": 11.647414741474147, + "grad_norm": 0.03312457725405693, + "learning_rate": 2.2181604703261663e-05, + "loss": 0.1015, + "num_input_tokens_seen": 22345248, + "step": 105875 + }, + { + "epoch": 11.647964796479648, + "grad_norm": 0.09659577161073685, + "learning_rate": 2.2179219948961448e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22346336, + "step": 105880 + }, + { + "epoch": 11.648514851485148, + "grad_norm": 0.031991541385650635, + "learning_rate": 2.2176835220658998e-05, + "loss": 0.1406, + "num_input_tokens_seen": 22347360, + "step": 105885 + }, + { + "epoch": 11.649064906490649, + "grad_norm": 0.27240562438964844, + "learning_rate": 2.2174450518376306e-05, + "loss": 0.0079, + "num_input_tokens_seen": 22348480, + "step": 105890 + }, + { + "epoch": 11.64961496149615, + "grad_norm": 0.041240841150283813, + "learning_rate": 2.2172065842135343e-05, + "loss": 0.0815, + "num_input_tokens_seen": 22349472, + "step": 105895 + }, + { + "epoch": 11.65016501650165, + "grad_norm": 1.7083775997161865, + "learning_rate": 2.2169681191958107e-05, + "loss": 0.1747, + "num_input_tokens_seen": 22350560, + "step": 105900 + }, + { + "epoch": 11.65071507150715, + "grad_norm": 0.02019125036895275, + "learning_rate": 2.2167296567866554e-05, + "loss": 0.0054, + "num_input_tokens_seen": 22351648, + "step": 105905 + }, + { + "epoch": 11.651265126512651, + "grad_norm": 1.1402455568313599, + "learning_rate": 2.216491196988266e-05, + "loss": 0.0397, + "num_input_tokens_seen": 22352704, + "step": 105910 + }, + { + "epoch": 11.651815181518153, + "grad_norm": 0.38926905393600464, + "learning_rate": 2.216252739802842e-05, + "loss": 0.011, + "num_input_tokens_seen": 22353792, + "step": 105915 + }, + { + "epoch": 11.652365236523652, + "grad_norm": 2.2537693977355957, + "learning_rate": 2.21601428523258e-05, + "loss": 0.0808, + "num_input_tokens_seen": 22354880, + "step": 105920 + }, + { + "epoch": 11.652915291529153, + "grad_norm": 0.060359887778759, + "learning_rate": 2.215775833279679e-05, + "loss": 0.0213, + "num_input_tokens_seen": 22356000, + "step": 105925 + }, + { + "epoch": 11.653465346534654, + "grad_norm": 0.24936307966709137, + "learning_rate": 2.2155373839463353e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22356992, + "step": 105930 + }, + { + "epoch": 11.654015401540153, + "grad_norm": 0.608851969242096, + "learning_rate": 2.2152989372347463e-05, + "loss": 0.0469, + "num_input_tokens_seen": 22358048, + "step": 105935 + }, + { + "epoch": 11.654565456545654, + "grad_norm": 0.6345911622047424, + "learning_rate": 2.2150604931471116e-05, + "loss": 0.0514, + "num_input_tokens_seen": 22359072, + "step": 105940 + }, + { + "epoch": 11.655115511551156, + "grad_norm": 0.012485027313232422, + "learning_rate": 2.2148220516856266e-05, + "loss": 0.0055, + "num_input_tokens_seen": 22360128, + "step": 105945 + }, + { + "epoch": 11.655665566556655, + "grad_norm": 1.002397894859314, + "learning_rate": 2.2145836128524902e-05, + "loss": 0.0198, + "num_input_tokens_seen": 22361120, + "step": 105950 + }, + { + "epoch": 11.656215621562156, + "grad_norm": 0.02741285227239132, + "learning_rate": 2.2143451766499004e-05, + "loss": 0.0093, + "num_input_tokens_seen": 22362144, + "step": 105955 + }, + { + "epoch": 11.656765676567657, + "grad_norm": 0.21469931304454803, + "learning_rate": 2.2141067430800524e-05, + "loss": 0.039, + "num_input_tokens_seen": 22363168, + "step": 105960 + }, + { + "epoch": 11.657315731573158, + "grad_norm": 0.022888395935297012, + "learning_rate": 2.213868312145147e-05, + "loss": 0.0188, + "num_input_tokens_seen": 22364192, + "step": 105965 + }, + { + "epoch": 11.657865786578657, + "grad_norm": 1.3858747482299805, + "learning_rate": 2.213629883847379e-05, + "loss": 0.0592, + "num_input_tokens_seen": 22365248, + "step": 105970 + }, + { + "epoch": 11.658415841584159, + "grad_norm": 0.023900894448161125, + "learning_rate": 2.2133914581889476e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22366272, + "step": 105975 + }, + { + "epoch": 11.65896589658966, + "grad_norm": 0.9643996357917786, + "learning_rate": 2.2131530351720496e-05, + "loss": 0.0103, + "num_input_tokens_seen": 22367296, + "step": 105980 + }, + { + "epoch": 11.659515951595159, + "grad_norm": 0.012725730426609516, + "learning_rate": 2.212914614798882e-05, + "loss": 0.0122, + "num_input_tokens_seen": 22368352, + "step": 105985 + }, + { + "epoch": 11.66006600660066, + "grad_norm": 0.06490147113800049, + "learning_rate": 2.2126761970716427e-05, + "loss": 0.0043, + "num_input_tokens_seen": 22369440, + "step": 105990 + }, + { + "epoch": 11.660616061606161, + "grad_norm": 0.011033112183213234, + "learning_rate": 2.21243778199253e-05, + "loss": 0.0052, + "num_input_tokens_seen": 22370560, + "step": 105995 + }, + { + "epoch": 11.66116611661166, + "grad_norm": 1.0090174674987793, + "learning_rate": 2.212199369563739e-05, + "loss": 0.0555, + "num_input_tokens_seen": 22371552, + "step": 106000 + }, + { + "epoch": 11.661716171617162, + "grad_norm": 0.024144228547811508, + "learning_rate": 2.2119609597874688e-05, + "loss": 0.0332, + "num_input_tokens_seen": 22372544, + "step": 106005 + }, + { + "epoch": 11.662266226622663, + "grad_norm": 1.8581657409667969, + "learning_rate": 2.2117225526659158e-05, + "loss": 0.0297, + "num_input_tokens_seen": 22373568, + "step": 106010 + }, + { + "epoch": 11.662816281628164, + "grad_norm": 0.03714704141020775, + "learning_rate": 2.2114841482012785e-05, + "loss": 0.013, + "num_input_tokens_seen": 22374560, + "step": 106015 + }, + { + "epoch": 11.663366336633663, + "grad_norm": 0.01721959561109543, + "learning_rate": 2.211245746395753e-05, + "loss": 0.0054, + "num_input_tokens_seen": 22375648, + "step": 106020 + }, + { + "epoch": 11.663916391639164, + "grad_norm": 1.0509928464889526, + "learning_rate": 2.2110073472515364e-05, + "loss": 0.0617, + "num_input_tokens_seen": 22376704, + "step": 106025 + }, + { + "epoch": 11.664466446644665, + "grad_norm": 0.006374879740178585, + "learning_rate": 2.210768950770827e-05, + "loss": 0.0119, + "num_input_tokens_seen": 22377760, + "step": 106030 + }, + { + "epoch": 11.665016501650165, + "grad_norm": 0.22657160460948944, + "learning_rate": 2.210530556955821e-05, + "loss": 0.0825, + "num_input_tokens_seen": 22378816, + "step": 106035 + }, + { + "epoch": 11.665566556655666, + "grad_norm": 4.037931442260742, + "learning_rate": 2.2102921658087167e-05, + "loss": 0.1389, + "num_input_tokens_seen": 22379872, + "step": 106040 + }, + { + "epoch": 11.666116611661167, + "grad_norm": 0.03371201455593109, + "learning_rate": 2.2100537773317097e-05, + "loss": 0.0101, + "num_input_tokens_seen": 22380992, + "step": 106045 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 0.0070188213139772415, + "learning_rate": 2.209815391526998e-05, + "loss": 0.0812, + "num_input_tokens_seen": 22382112, + "step": 106050 + }, + { + "epoch": 11.667216721672167, + "grad_norm": 0.08233713358640671, + "learning_rate": 2.2095770083967793e-05, + "loss": 0.0032, + "num_input_tokens_seen": 22383200, + "step": 106055 + }, + { + "epoch": 11.667766776677668, + "grad_norm": 2.8837811946868896, + "learning_rate": 2.209338627943249e-05, + "loss": 0.0427, + "num_input_tokens_seen": 22384224, + "step": 106060 + }, + { + "epoch": 11.668316831683168, + "grad_norm": 0.16220667958259583, + "learning_rate": 2.2091002501686056e-05, + "loss": 0.0577, + "num_input_tokens_seen": 22385216, + "step": 106065 + }, + { + "epoch": 11.668866886688669, + "grad_norm": 0.025536932051181793, + "learning_rate": 2.208861875075046e-05, + "loss": 0.0045, + "num_input_tokens_seen": 22386272, + "step": 106070 + }, + { + "epoch": 11.66941694169417, + "grad_norm": 0.06879972666501999, + "learning_rate": 2.208623502664766e-05, + "loss": 0.0536, + "num_input_tokens_seen": 22387328, + "step": 106075 + }, + { + "epoch": 11.66996699669967, + "grad_norm": 1.7252341508865356, + "learning_rate": 2.2083851329399637e-05, + "loss": 0.0304, + "num_input_tokens_seen": 22388352, + "step": 106080 + }, + { + "epoch": 11.67051705170517, + "grad_norm": 0.01070419792085886, + "learning_rate": 2.2081467659028355e-05, + "loss": 0.0029, + "num_input_tokens_seen": 22389408, + "step": 106085 + }, + { + "epoch": 11.671067106710671, + "grad_norm": 0.11497781425714493, + "learning_rate": 2.2079084015555787e-05, + "loss": 0.0641, + "num_input_tokens_seen": 22390496, + "step": 106090 + }, + { + "epoch": 11.671617161716172, + "grad_norm": 0.08101753145456314, + "learning_rate": 2.2076700399003904e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22391552, + "step": 106095 + }, + { + "epoch": 11.672167216721672, + "grad_norm": 0.050423990935087204, + "learning_rate": 2.2074316809394662e-05, + "loss": 0.0139, + "num_input_tokens_seen": 22392704, + "step": 106100 + }, + { + "epoch": 11.672717271727173, + "grad_norm": 0.03390742093324661, + "learning_rate": 2.2071933246750047e-05, + "loss": 0.0042, + "num_input_tokens_seen": 22393760, + "step": 106105 + }, + { + "epoch": 11.673267326732674, + "grad_norm": 0.07575593143701553, + "learning_rate": 2.206954971109201e-05, + "loss": 0.0083, + "num_input_tokens_seen": 22394816, + "step": 106110 + }, + { + "epoch": 11.673817381738173, + "grad_norm": 1.559325098991394, + "learning_rate": 2.2067166202442536e-05, + "loss": 0.1024, + "num_input_tokens_seen": 22395872, + "step": 106115 + }, + { + "epoch": 11.674367436743674, + "grad_norm": 0.08886231482028961, + "learning_rate": 2.2064782720823578e-05, + "loss": 0.0329, + "num_input_tokens_seen": 22396896, + "step": 106120 + }, + { + "epoch": 11.674917491749175, + "grad_norm": 1.171515703201294, + "learning_rate": 2.2062399266257107e-05, + "loss": 0.0555, + "num_input_tokens_seen": 22397920, + "step": 106125 + }, + { + "epoch": 11.675467546754675, + "grad_norm": 0.017368188127875328, + "learning_rate": 2.2060015838765103e-05, + "loss": 0.0125, + "num_input_tokens_seen": 22398976, + "step": 106130 + }, + { + "epoch": 11.676017601760176, + "grad_norm": 1.884047508239746, + "learning_rate": 2.205763243836951e-05, + "loss": 0.0373, + "num_input_tokens_seen": 22399968, + "step": 106135 + }, + { + "epoch": 11.676567656765677, + "grad_norm": 0.28031986951828003, + "learning_rate": 2.2055249065092313e-05, + "loss": 0.0322, + "num_input_tokens_seen": 22400992, + "step": 106140 + }, + { + "epoch": 11.677117711771178, + "grad_norm": 0.03423655778169632, + "learning_rate": 2.205286571895547e-05, + "loss": 0.0029, + "num_input_tokens_seen": 22401952, + "step": 106145 + }, + { + "epoch": 11.677667766776677, + "grad_norm": 0.04262280464172363, + "learning_rate": 2.2050482399980944e-05, + "loss": 0.0072, + "num_input_tokens_seen": 22403040, + "step": 106150 + }, + { + "epoch": 11.678217821782178, + "grad_norm": 0.06541844457387924, + "learning_rate": 2.2048099108190723e-05, + "loss": 0.0115, + "num_input_tokens_seen": 22404032, + "step": 106155 + }, + { + "epoch": 11.67876787678768, + "grad_norm": 0.014813579618930817, + "learning_rate": 2.204571584360674e-05, + "loss": 0.0631, + "num_input_tokens_seen": 22405120, + "step": 106160 + }, + { + "epoch": 11.679317931793179, + "grad_norm": 1.2201396226882935, + "learning_rate": 2.204333260625098e-05, + "loss": 0.015, + "num_input_tokens_seen": 22406112, + "step": 106165 + }, + { + "epoch": 11.67986798679868, + "grad_norm": 0.3149120509624481, + "learning_rate": 2.2040949396145415e-05, + "loss": 0.005, + "num_input_tokens_seen": 22407232, + "step": 106170 + }, + { + "epoch": 11.680418041804181, + "grad_norm": 0.03962676599621773, + "learning_rate": 2.203856621331199e-05, + "loss": 0.0024, + "num_input_tokens_seen": 22408352, + "step": 106175 + }, + { + "epoch": 11.68096809680968, + "grad_norm": 1.0438686609268188, + "learning_rate": 2.203618305777268e-05, + "loss": 0.0263, + "num_input_tokens_seen": 22409504, + "step": 106180 + }, + { + "epoch": 11.681518151815181, + "grad_norm": 0.028255684301257133, + "learning_rate": 2.2033799929549457e-05, + "loss": 0.1022, + "num_input_tokens_seen": 22410496, + "step": 106185 + }, + { + "epoch": 11.682068206820682, + "grad_norm": 0.01769609935581684, + "learning_rate": 2.2031416828664263e-05, + "loss": 0.067, + "num_input_tokens_seen": 22411552, + "step": 106190 + }, + { + "epoch": 11.682618261826182, + "grad_norm": 0.33214184641838074, + "learning_rate": 2.2029033755139084e-05, + "loss": 0.0038, + "num_input_tokens_seen": 22412544, + "step": 106195 + }, + { + "epoch": 11.683168316831683, + "grad_norm": 1.159110426902771, + "learning_rate": 2.2026650708995868e-05, + "loss": 0.0259, + "num_input_tokens_seen": 22413632, + "step": 106200 + }, + { + "epoch": 11.683718371837184, + "grad_norm": 0.029851103201508522, + "learning_rate": 2.2024267690256593e-05, + "loss": 0.0023, + "num_input_tokens_seen": 22414688, + "step": 106205 + }, + { + "epoch": 11.684268426842685, + "grad_norm": 0.02315324917435646, + "learning_rate": 2.202188469894322e-05, + "loss": 0.0074, + "num_input_tokens_seen": 22415680, + "step": 106210 + }, + { + "epoch": 11.684818481848184, + "grad_norm": 0.0285036601126194, + "learning_rate": 2.201950173507769e-05, + "loss": 0.0755, + "num_input_tokens_seen": 22416832, + "step": 106215 + }, + { + "epoch": 11.685368536853685, + "grad_norm": 0.009810741059482098, + "learning_rate": 2.2017118798681995e-05, + "loss": 0.0072, + "num_input_tokens_seen": 22417888, + "step": 106220 + }, + { + "epoch": 11.685918591859187, + "grad_norm": 0.14103741943836212, + "learning_rate": 2.2014735889778076e-05, + "loss": 0.0124, + "num_input_tokens_seen": 22418912, + "step": 106225 + }, + { + "epoch": 11.686468646864686, + "grad_norm": 1.050238847732544, + "learning_rate": 2.2012353008387914e-05, + "loss": 0.206, + "num_input_tokens_seen": 22420000, + "step": 106230 + }, + { + "epoch": 11.687018701870187, + "grad_norm": 1.6201831102371216, + "learning_rate": 2.200997015453346e-05, + "loss": 0.0703, + "num_input_tokens_seen": 22420992, + "step": 106235 + }, + { + "epoch": 11.687568756875688, + "grad_norm": 1.557862401008606, + "learning_rate": 2.2007587328236665e-05, + "loss": 0.0218, + "num_input_tokens_seen": 22422016, + "step": 106240 + }, + { + "epoch": 11.688118811881187, + "grad_norm": 0.026622094213962555, + "learning_rate": 2.2005204529519515e-05, + "loss": 0.0043, + "num_input_tokens_seen": 22423168, + "step": 106245 + }, + { + "epoch": 11.688668866886688, + "grad_norm": 0.8392817378044128, + "learning_rate": 2.2002821758403946e-05, + "loss": 0.0063, + "num_input_tokens_seen": 22424224, + "step": 106250 + }, + { + "epoch": 11.68921892189219, + "grad_norm": 0.06034097075462341, + "learning_rate": 2.2000439014911937e-05, + "loss": 0.0695, + "num_input_tokens_seen": 22425248, + "step": 106255 + }, + { + "epoch": 11.689768976897689, + "grad_norm": 0.06884279102087021, + "learning_rate": 2.199805629906545e-05, + "loss": 0.0234, + "num_input_tokens_seen": 22426368, + "step": 106260 + }, + { + "epoch": 11.69031903190319, + "grad_norm": 0.016445057466626167, + "learning_rate": 2.199567361088642e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22427424, + "step": 106265 + }, + { + "epoch": 11.690869086908691, + "grad_norm": 0.11971135437488556, + "learning_rate": 2.199329095039684e-05, + "loss": 0.0047, + "num_input_tokens_seen": 22428544, + "step": 106270 + }, + { + "epoch": 11.691419141914192, + "grad_norm": 1.2490661144256592, + "learning_rate": 2.199090831761864e-05, + "loss": 0.0482, + "num_input_tokens_seen": 22429600, + "step": 106275 + }, + { + "epoch": 11.691969196919691, + "grad_norm": 0.014986167661845684, + "learning_rate": 2.198852571257381e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22430688, + "step": 106280 + }, + { + "epoch": 11.692519251925193, + "grad_norm": 4.241835594177246, + "learning_rate": 2.1986143135284288e-05, + "loss": 0.0511, + "num_input_tokens_seen": 22431680, + "step": 106285 + }, + { + "epoch": 11.693069306930694, + "grad_norm": 0.8000313639640808, + "learning_rate": 2.198376058577203e-05, + "loss": 0.0106, + "num_input_tokens_seen": 22432800, + "step": 106290 + }, + { + "epoch": 11.693619361936193, + "grad_norm": 0.5132673978805542, + "learning_rate": 2.1981378064059014e-05, + "loss": 0.0513, + "num_input_tokens_seen": 22433920, + "step": 106295 + }, + { + "epoch": 11.694169416941694, + "grad_norm": 0.009939917363226414, + "learning_rate": 2.1978995570167178e-05, + "loss": 0.0724, + "num_input_tokens_seen": 22434976, + "step": 106300 + }, + { + "epoch": 11.694719471947195, + "grad_norm": 2.3497061729431152, + "learning_rate": 2.1976613104118504e-05, + "loss": 0.1169, + "num_input_tokens_seen": 22436032, + "step": 106305 + }, + { + "epoch": 11.695269526952695, + "grad_norm": 0.06478314101696014, + "learning_rate": 2.1974230665934927e-05, + "loss": 0.0021, + "num_input_tokens_seen": 22437088, + "step": 106310 + }, + { + "epoch": 11.695819581958196, + "grad_norm": 1.438308835029602, + "learning_rate": 2.197184825563841e-05, + "loss": 0.066, + "num_input_tokens_seen": 22438080, + "step": 106315 + }, + { + "epoch": 11.696369636963697, + "grad_norm": 0.4991128444671631, + "learning_rate": 2.196946587325093e-05, + "loss": 0.0509, + "num_input_tokens_seen": 22439200, + "step": 106320 + }, + { + "epoch": 11.696919691969196, + "grad_norm": 0.011090448126196861, + "learning_rate": 2.1967083518794407e-05, + "loss": 0.052, + "num_input_tokens_seen": 22440256, + "step": 106325 + }, + { + "epoch": 11.697469746974697, + "grad_norm": 0.7368274331092834, + "learning_rate": 2.1964701192290837e-05, + "loss": 0.113, + "num_input_tokens_seen": 22441312, + "step": 106330 + }, + { + "epoch": 11.698019801980198, + "grad_norm": 0.04322768375277519, + "learning_rate": 2.1962318893762153e-05, + "loss": 0.0115, + "num_input_tokens_seen": 22442368, + "step": 106335 + }, + { + "epoch": 11.6985698569857, + "grad_norm": 0.19195544719696045, + "learning_rate": 2.1959936623230317e-05, + "loss": 0.0278, + "num_input_tokens_seen": 22443360, + "step": 106340 + }, + { + "epoch": 11.699119911991199, + "grad_norm": 0.0656866729259491, + "learning_rate": 2.1957554380717297e-05, + "loss": 0.0363, + "num_input_tokens_seen": 22444384, + "step": 106345 + }, + { + "epoch": 11.6996699669967, + "grad_norm": 0.015501201152801514, + "learning_rate": 2.1955172166245025e-05, + "loss": 0.0214, + "num_input_tokens_seen": 22445408, + "step": 106350 + }, + { + "epoch": 11.7002200220022, + "grad_norm": 0.03663984313607216, + "learning_rate": 2.1952789979835476e-05, + "loss": 0.0124, + "num_input_tokens_seen": 22446432, + "step": 106355 + }, + { + "epoch": 11.7007700770077, + "grad_norm": 2.7736103534698486, + "learning_rate": 2.1950407821510606e-05, + "loss": 0.0855, + "num_input_tokens_seen": 22447552, + "step": 106360 + }, + { + "epoch": 11.701320132013201, + "grad_norm": 0.13847976922988892, + "learning_rate": 2.1948025691292352e-05, + "loss": 0.0375, + "num_input_tokens_seen": 22448544, + "step": 106365 + }, + { + "epoch": 11.701870187018702, + "grad_norm": 0.15236856043338776, + "learning_rate": 2.1945643589202684e-05, + "loss": 0.0134, + "num_input_tokens_seen": 22449600, + "step": 106370 + }, + { + "epoch": 11.702420242024202, + "grad_norm": 0.0162202175706625, + "learning_rate": 2.194326151526355e-05, + "loss": 0.0058, + "num_input_tokens_seen": 22450720, + "step": 106375 + }, + { + "epoch": 11.702970297029703, + "grad_norm": 0.05133536085486412, + "learning_rate": 2.194087946949692e-05, + "loss": 0.0044, + "num_input_tokens_seen": 22451808, + "step": 106380 + }, + { + "epoch": 11.703520352035204, + "grad_norm": 0.009540077298879623, + "learning_rate": 2.1938497451924727e-05, + "loss": 0.0062, + "num_input_tokens_seen": 22452928, + "step": 106385 + }, + { + "epoch": 11.704070407040705, + "grad_norm": 0.24276740849018097, + "learning_rate": 2.193611546256893e-05, + "loss": 0.0052, + "num_input_tokens_seen": 22454016, + "step": 106390 + }, + { + "epoch": 11.704620462046204, + "grad_norm": 0.026281259953975677, + "learning_rate": 2.1933733501451494e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22455008, + "step": 106395 + }, + { + "epoch": 11.705170517051705, + "grad_norm": 0.4066401720046997, + "learning_rate": 2.193135156859437e-05, + "loss": 0.0118, + "num_input_tokens_seen": 22456064, + "step": 106400 + }, + { + "epoch": 11.705720572057206, + "grad_norm": 0.02623266540467739, + "learning_rate": 2.1928969664019493e-05, + "loss": 0.0015, + "num_input_tokens_seen": 22457088, + "step": 106405 + }, + { + "epoch": 11.706270627062706, + "grad_norm": 0.40829604864120483, + "learning_rate": 2.192658778774884e-05, + "loss": 0.0075, + "num_input_tokens_seen": 22458112, + "step": 106410 + }, + { + "epoch": 11.706820682068207, + "grad_norm": 0.10020418465137482, + "learning_rate": 2.1924205939804342e-05, + "loss": 0.0604, + "num_input_tokens_seen": 22459200, + "step": 106415 + }, + { + "epoch": 11.707370737073708, + "grad_norm": 0.3931785523891449, + "learning_rate": 2.1921824120207975e-05, + "loss": 0.0073, + "num_input_tokens_seen": 22460192, + "step": 106420 + }, + { + "epoch": 11.707920792079207, + "grad_norm": 0.42209774255752563, + "learning_rate": 2.191944232898167e-05, + "loss": 0.0633, + "num_input_tokens_seen": 22461280, + "step": 106425 + }, + { + "epoch": 11.708470847084708, + "grad_norm": 0.013433914631605148, + "learning_rate": 2.1917060566147384e-05, + "loss": 0.0143, + "num_input_tokens_seen": 22462304, + "step": 106430 + }, + { + "epoch": 11.70902090209021, + "grad_norm": 0.012202566489577293, + "learning_rate": 2.191467883172708e-05, + "loss": 0.1109, + "num_input_tokens_seen": 22463328, + "step": 106435 + }, + { + "epoch": 11.70957095709571, + "grad_norm": 0.005142157431691885, + "learning_rate": 2.1912297125742694e-05, + "loss": 0.0015, + "num_input_tokens_seen": 22464352, + "step": 106440 + }, + { + "epoch": 11.71012101210121, + "grad_norm": 0.004171546548604965, + "learning_rate": 2.1909915448216184e-05, + "loss": 0.0173, + "num_input_tokens_seen": 22465376, + "step": 106445 + }, + { + "epoch": 11.710671067106711, + "grad_norm": 1.2104333639144897, + "learning_rate": 2.19075337991695e-05, + "loss": 0.171, + "num_input_tokens_seen": 22466400, + "step": 106450 + }, + { + "epoch": 11.711221122112212, + "grad_norm": 0.05507673695683479, + "learning_rate": 2.1905152178624595e-05, + "loss": 0.0064, + "num_input_tokens_seen": 22467584, + "step": 106455 + }, + { + "epoch": 11.711771177117711, + "grad_norm": 1.6103888750076294, + "learning_rate": 2.1902770586603422e-05, + "loss": 0.0738, + "num_input_tokens_seen": 22468640, + "step": 106460 + }, + { + "epoch": 11.712321232123212, + "grad_norm": 0.9229716658592224, + "learning_rate": 2.1900389023127917e-05, + "loss": 0.0105, + "num_input_tokens_seen": 22469760, + "step": 106465 + }, + { + "epoch": 11.712871287128714, + "grad_norm": 0.32894107699394226, + "learning_rate": 2.1898007488220045e-05, + "loss": 0.0139, + "num_input_tokens_seen": 22470816, + "step": 106470 + }, + { + "epoch": 11.713421342134213, + "grad_norm": 0.06733023375272751, + "learning_rate": 2.1895625981901754e-05, + "loss": 0.0097, + "num_input_tokens_seen": 22471872, + "step": 106475 + }, + { + "epoch": 11.713971397139714, + "grad_norm": 0.03239334374666214, + "learning_rate": 2.189324450419498e-05, + "loss": 0.0045, + "num_input_tokens_seen": 22472928, + "step": 106480 + }, + { + "epoch": 11.714521452145215, + "grad_norm": 5.615249156951904, + "learning_rate": 2.1890863055121685e-05, + "loss": 0.041, + "num_input_tokens_seen": 22474016, + "step": 106485 + }, + { + "epoch": 11.715071507150714, + "grad_norm": 0.008808156475424767, + "learning_rate": 2.1888481634703807e-05, + "loss": 0.0051, + "num_input_tokens_seen": 22475072, + "step": 106490 + }, + { + "epoch": 11.715621562156215, + "grad_norm": 0.043537694960832596, + "learning_rate": 2.188610024296331e-05, + "loss": 0.0064, + "num_input_tokens_seen": 22476064, + "step": 106495 + }, + { + "epoch": 11.716171617161717, + "grad_norm": 0.04193514585494995, + "learning_rate": 2.188371887992213e-05, + "loss": 0.0034, + "num_input_tokens_seen": 22477120, + "step": 106500 + }, + { + "epoch": 11.716721672167218, + "grad_norm": 0.028593759983778, + "learning_rate": 2.1881337545602213e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22478176, + "step": 106505 + }, + { + "epoch": 11.717271727172717, + "grad_norm": 0.01157811377197504, + "learning_rate": 2.1878956240025513e-05, + "loss": 0.0462, + "num_input_tokens_seen": 22479168, + "step": 106510 + }, + { + "epoch": 11.717821782178218, + "grad_norm": 0.05384093150496483, + "learning_rate": 2.1876574963213974e-05, + "loss": 0.0096, + "num_input_tokens_seen": 22480224, + "step": 106515 + }, + { + "epoch": 11.718371837183719, + "grad_norm": 1.2536289691925049, + "learning_rate": 2.1874193715189557e-05, + "loss": 0.0119, + "num_input_tokens_seen": 22481312, + "step": 106520 + }, + { + "epoch": 11.718921892189218, + "grad_norm": 0.16791756451129913, + "learning_rate": 2.1871812495974185e-05, + "loss": 0.0027, + "num_input_tokens_seen": 22482336, + "step": 106525 + }, + { + "epoch": 11.71947194719472, + "grad_norm": 0.18342752754688263, + "learning_rate": 2.1869431305589815e-05, + "loss": 0.0043, + "num_input_tokens_seen": 22483296, + "step": 106530 + }, + { + "epoch": 11.72002200220022, + "grad_norm": 1.3341106176376343, + "learning_rate": 2.1867050144058403e-05, + "loss": 0.0845, + "num_input_tokens_seen": 22484384, + "step": 106535 + }, + { + "epoch": 11.72057205720572, + "grad_norm": 0.03307805955410004, + "learning_rate": 2.186466901140188e-05, + "loss": 0.0048, + "num_input_tokens_seen": 22485376, + "step": 106540 + }, + { + "epoch": 11.721122112211221, + "grad_norm": 0.045285459607839584, + "learning_rate": 2.18622879076422e-05, + "loss": 0.0059, + "num_input_tokens_seen": 22486400, + "step": 106545 + }, + { + "epoch": 11.721672167216722, + "grad_norm": 0.00963114108890295, + "learning_rate": 2.1859906832801313e-05, + "loss": 0.0117, + "num_input_tokens_seen": 22487424, + "step": 106550 + }, + { + "epoch": 11.722222222222221, + "grad_norm": 0.03601309657096863, + "learning_rate": 2.1857525786901147e-05, + "loss": 0.0664, + "num_input_tokens_seen": 22488448, + "step": 106555 + }, + { + "epoch": 11.722772277227723, + "grad_norm": 0.07347749173641205, + "learning_rate": 2.1855144769963666e-05, + "loss": 0.0039, + "num_input_tokens_seen": 22489472, + "step": 106560 + }, + { + "epoch": 11.723322332233224, + "grad_norm": 0.015892887488007545, + "learning_rate": 2.18527637820108e-05, + "loss": 0.0272, + "num_input_tokens_seen": 22490528, + "step": 106565 + }, + { + "epoch": 11.723872387238725, + "grad_norm": 0.007304660975933075, + "learning_rate": 2.18503828230645e-05, + "loss": 0.0006, + "num_input_tokens_seen": 22491616, + "step": 106570 + }, + { + "epoch": 11.724422442244224, + "grad_norm": 0.01022613700479269, + "learning_rate": 2.1848001893146723e-05, + "loss": 0.0011, + "num_input_tokens_seen": 22492672, + "step": 106575 + }, + { + "epoch": 11.724972497249725, + "grad_norm": 0.010693643242120743, + "learning_rate": 2.1845620992279387e-05, + "loss": 0.007, + "num_input_tokens_seen": 22493760, + "step": 106580 + }, + { + "epoch": 11.725522552255226, + "grad_norm": 0.0711003839969635, + "learning_rate": 2.184324012048445e-05, + "loss": 0.0012, + "num_input_tokens_seen": 22494816, + "step": 106585 + }, + { + "epoch": 11.726072607260726, + "grad_norm": 0.03088301606476307, + "learning_rate": 2.1840859277783866e-05, + "loss": 0.0566, + "num_input_tokens_seen": 22495872, + "step": 106590 + }, + { + "epoch": 11.726622662266227, + "grad_norm": 0.1717843860387802, + "learning_rate": 2.1838478464199548e-05, + "loss": 0.0895, + "num_input_tokens_seen": 22496928, + "step": 106595 + }, + { + "epoch": 11.727172717271728, + "grad_norm": 0.009882315061986446, + "learning_rate": 2.183609767975347e-05, + "loss": 0.1029, + "num_input_tokens_seen": 22497984, + "step": 106600 + }, + { + "epoch": 11.727722772277227, + "grad_norm": 1.3649475574493408, + "learning_rate": 2.1833716924467552e-05, + "loss": 0.0427, + "num_input_tokens_seen": 22499008, + "step": 106605 + }, + { + "epoch": 11.728272827282728, + "grad_norm": 0.01956508494913578, + "learning_rate": 2.1831336198363757e-05, + "loss": 0.0064, + "num_input_tokens_seen": 22500032, + "step": 106610 + }, + { + "epoch": 11.72882288228823, + "grad_norm": 0.0775131955742836, + "learning_rate": 2.182895550146401e-05, + "loss": 0.1047, + "num_input_tokens_seen": 22501024, + "step": 106615 + }, + { + "epoch": 11.729372937293729, + "grad_norm": 0.0285330917686224, + "learning_rate": 2.1826574833790252e-05, + "loss": 0.0405, + "num_input_tokens_seen": 22502048, + "step": 106620 + }, + { + "epoch": 11.72992299229923, + "grad_norm": 0.14063656330108643, + "learning_rate": 2.1824194195364443e-05, + "loss": 0.1189, + "num_input_tokens_seen": 22503136, + "step": 106625 + }, + { + "epoch": 11.73047304730473, + "grad_norm": 0.004044735338538885, + "learning_rate": 2.1821813586208498e-05, + "loss": 0.016, + "num_input_tokens_seen": 22504128, + "step": 106630 + }, + { + "epoch": 11.731023102310232, + "grad_norm": 0.0098729208111763, + "learning_rate": 2.1819433006344385e-05, + "loss": 0.0427, + "num_input_tokens_seen": 22505152, + "step": 106635 + }, + { + "epoch": 11.731573157315731, + "grad_norm": 1.1722874641418457, + "learning_rate": 2.181705245579403e-05, + "loss": 0.0513, + "num_input_tokens_seen": 22506240, + "step": 106640 + }, + { + "epoch": 11.732123212321232, + "grad_norm": 0.22164705395698547, + "learning_rate": 2.1814671934579366e-05, + "loss": 0.0085, + "num_input_tokens_seen": 22507296, + "step": 106645 + }, + { + "epoch": 11.732673267326733, + "grad_norm": 2.081449031829834, + "learning_rate": 2.1812291442722356e-05, + "loss": 0.0548, + "num_input_tokens_seen": 22508352, + "step": 106650 + }, + { + "epoch": 11.733223322332233, + "grad_norm": 1.1783527135849, + "learning_rate": 2.1809910980244914e-05, + "loss": 0.0601, + "num_input_tokens_seen": 22509408, + "step": 106655 + }, + { + "epoch": 11.733773377337734, + "grad_norm": 0.12350703775882721, + "learning_rate": 2.1807530547168996e-05, + "loss": 0.0721, + "num_input_tokens_seen": 22510400, + "step": 106660 + }, + { + "epoch": 11.734323432343235, + "grad_norm": 0.02693353220820427, + "learning_rate": 2.1805150143516545e-05, + "loss": 0.1043, + "num_input_tokens_seen": 22511392, + "step": 106665 + }, + { + "epoch": 11.734873487348734, + "grad_norm": 0.013583384454250336, + "learning_rate": 2.180276976930948e-05, + "loss": 0.1279, + "num_input_tokens_seen": 22512544, + "step": 106670 + }, + { + "epoch": 11.735423542354235, + "grad_norm": 0.03233233094215393, + "learning_rate": 2.1800389424569758e-05, + "loss": 0.0746, + "num_input_tokens_seen": 22513664, + "step": 106675 + }, + { + "epoch": 11.735973597359736, + "grad_norm": 1.1582564115524292, + "learning_rate": 2.1798009109319307e-05, + "loss": 0.0171, + "num_input_tokens_seen": 22514720, + "step": 106680 + }, + { + "epoch": 11.736523652365236, + "grad_norm": 0.09821640700101852, + "learning_rate": 2.1795628823580077e-05, + "loss": 0.0632, + "num_input_tokens_seen": 22515712, + "step": 106685 + }, + { + "epoch": 11.737073707370737, + "grad_norm": 0.02623278833925724, + "learning_rate": 2.1793248567373996e-05, + "loss": 0.0066, + "num_input_tokens_seen": 22516800, + "step": 106690 + }, + { + "epoch": 11.737623762376238, + "grad_norm": 2.6380679607391357, + "learning_rate": 2.1790868340723e-05, + "loss": 0.075, + "num_input_tokens_seen": 22517856, + "step": 106695 + }, + { + "epoch": 11.738173817381739, + "grad_norm": 0.008070769719779491, + "learning_rate": 2.178848814364903e-05, + "loss": 0.0142, + "num_input_tokens_seen": 22518848, + "step": 106700 + }, + { + "epoch": 11.738723872387238, + "grad_norm": 0.08659522235393524, + "learning_rate": 2.1786107976174024e-05, + "loss": 0.0025, + "num_input_tokens_seen": 22519968, + "step": 106705 + }, + { + "epoch": 11.73927392739274, + "grad_norm": 0.07800696045160294, + "learning_rate": 2.1783727838319926e-05, + "loss": 0.1333, + "num_input_tokens_seen": 22521024, + "step": 106710 + }, + { + "epoch": 11.73982398239824, + "grad_norm": 0.05744113028049469, + "learning_rate": 2.178134773010866e-05, + "loss": 0.0349, + "num_input_tokens_seen": 22522048, + "step": 106715 + }, + { + "epoch": 11.74037403740374, + "grad_norm": 1.3130055665969849, + "learning_rate": 2.1778967651562167e-05, + "loss": 0.1508, + "num_input_tokens_seen": 22523104, + "step": 106720 + }, + { + "epoch": 11.74092409240924, + "grad_norm": 0.5363555550575256, + "learning_rate": 2.177658760270239e-05, + "loss": 0.0359, + "num_input_tokens_seen": 22524064, + "step": 106725 + }, + { + "epoch": 11.741474147414742, + "grad_norm": 0.016162805259227753, + "learning_rate": 2.1774207583551245e-05, + "loss": 0.0024, + "num_input_tokens_seen": 22525088, + "step": 106730 + }, + { + "epoch": 11.742024202420241, + "grad_norm": 0.018029840663075447, + "learning_rate": 2.177182759413069e-05, + "loss": 0.002, + "num_input_tokens_seen": 22526112, + "step": 106735 + }, + { + "epoch": 11.742574257425742, + "grad_norm": 0.5384693741798401, + "learning_rate": 2.1769447634462657e-05, + "loss": 0.0134, + "num_input_tokens_seen": 22527168, + "step": 106740 + }, + { + "epoch": 11.743124312431243, + "grad_norm": 0.07989470660686493, + "learning_rate": 2.1767067704569056e-05, + "loss": 0.0043, + "num_input_tokens_seen": 22528128, + "step": 106745 + }, + { + "epoch": 11.743674367436743, + "grad_norm": 0.12921640276908875, + "learning_rate": 2.1764687804471862e-05, + "loss": 0.019, + "num_input_tokens_seen": 22529152, + "step": 106750 + }, + { + "epoch": 11.744224422442244, + "grad_norm": 0.016281399875879288, + "learning_rate": 2.1762307934192972e-05, + "loss": 0.0409, + "num_input_tokens_seen": 22530240, + "step": 106755 + }, + { + "epoch": 11.744774477447745, + "grad_norm": 1.4809589385986328, + "learning_rate": 2.1759928093754345e-05, + "loss": 0.0825, + "num_input_tokens_seen": 22531296, + "step": 106760 + }, + { + "epoch": 11.745324532453246, + "grad_norm": 0.08612687140703201, + "learning_rate": 2.1757548283177908e-05, + "loss": 0.0291, + "num_input_tokens_seen": 22532448, + "step": 106765 + }, + { + "epoch": 11.745874587458745, + "grad_norm": 0.052547868341207504, + "learning_rate": 2.1755168502485583e-05, + "loss": 0.0749, + "num_input_tokens_seen": 22533536, + "step": 106770 + }, + { + "epoch": 11.746424642464246, + "grad_norm": 0.040183085948228836, + "learning_rate": 2.175278875169932e-05, + "loss": 0.0051, + "num_input_tokens_seen": 22534656, + "step": 106775 + }, + { + "epoch": 11.746974697469748, + "grad_norm": 0.13487276434898376, + "learning_rate": 2.1750409030841048e-05, + "loss": 0.08, + "num_input_tokens_seen": 22535744, + "step": 106780 + }, + { + "epoch": 11.747524752475247, + "grad_norm": 0.0924731194972992, + "learning_rate": 2.1748029339932687e-05, + "loss": 0.0303, + "num_input_tokens_seen": 22536832, + "step": 106785 + }, + { + "epoch": 11.748074807480748, + "grad_norm": 0.021548395976424217, + "learning_rate": 2.1745649678996184e-05, + "loss": 0.0059, + "num_input_tokens_seen": 22537856, + "step": 106790 + }, + { + "epoch": 11.748624862486249, + "grad_norm": 0.35488349199295044, + "learning_rate": 2.1743270048053463e-05, + "loss": 0.0155, + "num_input_tokens_seen": 22538912, + "step": 106795 + }, + { + "epoch": 11.749174917491748, + "grad_norm": 0.028372712433338165, + "learning_rate": 2.174089044712647e-05, + "loss": 0.0423, + "num_input_tokens_seen": 22539968, + "step": 106800 + }, + { + "epoch": 11.74972497249725, + "grad_norm": 0.044657833874225616, + "learning_rate": 2.1738510876237118e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22541024, + "step": 106805 + }, + { + "epoch": 11.75027502750275, + "grad_norm": 0.012859857641160488, + "learning_rate": 2.1736131335407342e-05, + "loss": 0.0779, + "num_input_tokens_seen": 22542048, + "step": 106810 + }, + { + "epoch": 11.750825082508252, + "grad_norm": 0.035897549241781235, + "learning_rate": 2.173375182465908e-05, + "loss": 0.0272, + "num_input_tokens_seen": 22543104, + "step": 106815 + }, + { + "epoch": 11.751375137513751, + "grad_norm": 0.1434202939271927, + "learning_rate": 2.173137234401426e-05, + "loss": 0.0453, + "num_input_tokens_seen": 22544192, + "step": 106820 + }, + { + "epoch": 11.751925192519252, + "grad_norm": 0.03863957151770592, + "learning_rate": 2.172899289349482e-05, + "loss": 0.0088, + "num_input_tokens_seen": 22545248, + "step": 106825 + }, + { + "epoch": 11.752475247524753, + "grad_norm": 0.05757661536335945, + "learning_rate": 2.172661347312268e-05, + "loss": 0.0595, + "num_input_tokens_seen": 22546272, + "step": 106830 + }, + { + "epoch": 11.753025302530252, + "grad_norm": 1.6076527833938599, + "learning_rate": 2.1724234082919766e-05, + "loss": 0.0679, + "num_input_tokens_seen": 22547424, + "step": 106835 + }, + { + "epoch": 11.753575357535754, + "grad_norm": 1.6834876537322998, + "learning_rate": 2.1721854722908025e-05, + "loss": 0.0376, + "num_input_tokens_seen": 22548512, + "step": 106840 + }, + { + "epoch": 11.754125412541255, + "grad_norm": 0.049233417958021164, + "learning_rate": 2.1719475393109362e-05, + "loss": 0.0032, + "num_input_tokens_seen": 22549632, + "step": 106845 + }, + { + "epoch": 11.754675467546754, + "grad_norm": 0.012470878660678864, + "learning_rate": 2.171709609354573e-05, + "loss": 0.0461, + "num_input_tokens_seen": 22550720, + "step": 106850 + }, + { + "epoch": 11.755225522552255, + "grad_norm": 0.010272560641169548, + "learning_rate": 2.1714716824239055e-05, + "loss": 0.0288, + "num_input_tokens_seen": 22551872, + "step": 106855 + }, + { + "epoch": 11.755775577557756, + "grad_norm": 0.14644891023635864, + "learning_rate": 2.171233758521124e-05, + "loss": 0.0052, + "num_input_tokens_seen": 22552928, + "step": 106860 + }, + { + "epoch": 11.756325632563257, + "grad_norm": 0.005155028309673071, + "learning_rate": 2.1709958376484246e-05, + "loss": 0.1024, + "num_input_tokens_seen": 22553888, + "step": 106865 + }, + { + "epoch": 11.756875687568757, + "grad_norm": 0.04493376240134239, + "learning_rate": 2.1707579198079977e-05, + "loss": 0.0324, + "num_input_tokens_seen": 22554848, + "step": 106870 + }, + { + "epoch": 11.757425742574258, + "grad_norm": 0.047584205865859985, + "learning_rate": 2.170520005002038e-05, + "loss": 0.0048, + "num_input_tokens_seen": 22555872, + "step": 106875 + }, + { + "epoch": 11.757975797579759, + "grad_norm": 0.09134583920240402, + "learning_rate": 2.1702820932327372e-05, + "loss": 0.0062, + "num_input_tokens_seen": 22556960, + "step": 106880 + }, + { + "epoch": 11.758525852585258, + "grad_norm": 0.06742198020219803, + "learning_rate": 2.1700441845022876e-05, + "loss": 0.0099, + "num_input_tokens_seen": 22558016, + "step": 106885 + }, + { + "epoch": 11.75907590759076, + "grad_norm": 0.10802941024303436, + "learning_rate": 2.1698062788128827e-05, + "loss": 0.0079, + "num_input_tokens_seen": 22559104, + "step": 106890 + }, + { + "epoch": 11.75962596259626, + "grad_norm": 0.018071012571454048, + "learning_rate": 2.1695683761667144e-05, + "loss": 0.0075, + "num_input_tokens_seen": 22560192, + "step": 106895 + }, + { + "epoch": 11.76017601760176, + "grad_norm": 0.07178031653165817, + "learning_rate": 2.1693304765659768e-05, + "loss": 0.0242, + "num_input_tokens_seen": 22561216, + "step": 106900 + }, + { + "epoch": 11.76072607260726, + "grad_norm": 0.03152487054467201, + "learning_rate": 2.1690925800128614e-05, + "loss": 0.0016, + "num_input_tokens_seen": 22562272, + "step": 106905 + }, + { + "epoch": 11.761276127612762, + "grad_norm": 0.009340076707303524, + "learning_rate": 2.16885468650956e-05, + "loss": 0.0231, + "num_input_tokens_seen": 22563296, + "step": 106910 + }, + { + "epoch": 11.761826182618261, + "grad_norm": 0.009298042394220829, + "learning_rate": 2.168616796058267e-05, + "loss": 0.0477, + "num_input_tokens_seen": 22564448, + "step": 106915 + }, + { + "epoch": 11.762376237623762, + "grad_norm": 0.03151310980319977, + "learning_rate": 2.168378908661173e-05, + "loss": 0.0727, + "num_input_tokens_seen": 22565536, + "step": 106920 + }, + { + "epoch": 11.762926292629263, + "grad_norm": 0.01077175047248602, + "learning_rate": 2.1681410243204724e-05, + "loss": 0.005, + "num_input_tokens_seen": 22566528, + "step": 106925 + }, + { + "epoch": 11.763476347634764, + "grad_norm": 0.015843670815229416, + "learning_rate": 2.167903143038357e-05, + "loss": 0.0069, + "num_input_tokens_seen": 22567552, + "step": 106930 + }, + { + "epoch": 11.764026402640264, + "grad_norm": 1.5925333499908447, + "learning_rate": 2.1676652648170175e-05, + "loss": 0.1176, + "num_input_tokens_seen": 22568576, + "step": 106935 + }, + { + "epoch": 11.764576457645765, + "grad_norm": 0.24718010425567627, + "learning_rate": 2.1674273896586492e-05, + "loss": 0.0534, + "num_input_tokens_seen": 22569632, + "step": 106940 + }, + { + "epoch": 11.765126512651266, + "grad_norm": 0.19958744943141937, + "learning_rate": 2.1671895175654418e-05, + "loss": 0.1142, + "num_input_tokens_seen": 22570688, + "step": 106945 + }, + { + "epoch": 11.765676567656765, + "grad_norm": 0.4504944980144501, + "learning_rate": 2.1669516485395897e-05, + "loss": 0.0111, + "num_input_tokens_seen": 22571744, + "step": 106950 + }, + { + "epoch": 11.766226622662266, + "grad_norm": 0.1683010756969452, + "learning_rate": 2.166713782583285e-05, + "loss": 0.0115, + "num_input_tokens_seen": 22572832, + "step": 106955 + }, + { + "epoch": 11.766776677667767, + "grad_norm": 0.021740563213825226, + "learning_rate": 2.1664759196987182e-05, + "loss": 0.0724, + "num_input_tokens_seen": 22573952, + "step": 106960 + }, + { + "epoch": 11.767326732673267, + "grad_norm": 0.02510581910610199, + "learning_rate": 2.166238059888084e-05, + "loss": 0.0229, + "num_input_tokens_seen": 22574976, + "step": 106965 + }, + { + "epoch": 11.767876787678768, + "grad_norm": 0.018780430778861046, + "learning_rate": 2.1660002031535733e-05, + "loss": 0.0179, + "num_input_tokens_seen": 22576032, + "step": 106970 + }, + { + "epoch": 11.768426842684269, + "grad_norm": 0.09393957257270813, + "learning_rate": 2.1657623494973776e-05, + "loss": 0.0034, + "num_input_tokens_seen": 22577120, + "step": 106975 + }, + { + "epoch": 11.768976897689768, + "grad_norm": 0.012934273108839989, + "learning_rate": 2.1655244989216903e-05, + "loss": 0.048, + "num_input_tokens_seen": 22578208, + "step": 106980 + }, + { + "epoch": 11.76952695269527, + "grad_norm": 0.04887557029724121, + "learning_rate": 2.165286651428703e-05, + "loss": 0.0128, + "num_input_tokens_seen": 22579264, + "step": 106985 + }, + { + "epoch": 11.77007700770077, + "grad_norm": 0.0536409467458725, + "learning_rate": 2.165048807020608e-05, + "loss": 0.009, + "num_input_tokens_seen": 22580288, + "step": 106990 + }, + { + "epoch": 11.770627062706271, + "grad_norm": 0.777126133441925, + "learning_rate": 2.1648109656995984e-05, + "loss": 0.0173, + "num_input_tokens_seen": 22581344, + "step": 106995 + }, + { + "epoch": 11.77117711771177, + "grad_norm": 0.024169020354747772, + "learning_rate": 2.1645731274678643e-05, + "loss": 0.1509, + "num_input_tokens_seen": 22582368, + "step": 107000 + }, + { + "epoch": 11.771727172717272, + "grad_norm": 0.10738660395145416, + "learning_rate": 2.1643352923275988e-05, + "loss": 0.0861, + "num_input_tokens_seen": 22583392, + "step": 107005 + }, + { + "epoch": 11.772277227722773, + "grad_norm": 0.48525431752204895, + "learning_rate": 2.1640974602809937e-05, + "loss": 0.0294, + "num_input_tokens_seen": 22584512, + "step": 107010 + }, + { + "epoch": 11.772827282728272, + "grad_norm": 0.015601416118443012, + "learning_rate": 2.1638596313302422e-05, + "loss": 0.0908, + "num_input_tokens_seen": 22585600, + "step": 107015 + }, + { + "epoch": 11.773377337733773, + "grad_norm": 0.13575124740600586, + "learning_rate": 2.1636218054775344e-05, + "loss": 0.0616, + "num_input_tokens_seen": 22586656, + "step": 107020 + }, + { + "epoch": 11.773927392739274, + "grad_norm": 0.3044971227645874, + "learning_rate": 2.1633839827250628e-05, + "loss": 0.0085, + "num_input_tokens_seen": 22587744, + "step": 107025 + }, + { + "epoch": 11.774477447744774, + "grad_norm": 0.01153479516506195, + "learning_rate": 2.1631461630750204e-05, + "loss": 0.0067, + "num_input_tokens_seen": 22588736, + "step": 107030 + }, + { + "epoch": 11.775027502750275, + "grad_norm": 0.019357111304998398, + "learning_rate": 2.162908346529597e-05, + "loss": 0.084, + "num_input_tokens_seen": 22589824, + "step": 107035 + }, + { + "epoch": 11.775577557755776, + "grad_norm": 0.014110441319644451, + "learning_rate": 2.1626705330909865e-05, + "loss": 0.0028, + "num_input_tokens_seen": 22590848, + "step": 107040 + }, + { + "epoch": 11.776127612761275, + "grad_norm": 0.32584860920906067, + "learning_rate": 2.16243272276138e-05, + "loss": 0.1872, + "num_input_tokens_seen": 22591904, + "step": 107045 + }, + { + "epoch": 11.776677667766776, + "grad_norm": 0.18225976824760437, + "learning_rate": 2.1621949155429676e-05, + "loss": 0.0056, + "num_input_tokens_seen": 22592928, + "step": 107050 + }, + { + "epoch": 11.777227722772277, + "grad_norm": 0.28125107288360596, + "learning_rate": 2.1619571114379442e-05, + "loss": 0.0167, + "num_input_tokens_seen": 22593952, + "step": 107055 + }, + { + "epoch": 11.777777777777779, + "grad_norm": 0.09105248004198074, + "learning_rate": 2.1617193104484988e-05, + "loss": 0.0286, + "num_input_tokens_seen": 22594976, + "step": 107060 + }, + { + "epoch": 11.778327832783278, + "grad_norm": 0.009229599498212337, + "learning_rate": 2.1614815125768248e-05, + "loss": 0.0766, + "num_input_tokens_seen": 22596032, + "step": 107065 + }, + { + "epoch": 11.778877887788779, + "grad_norm": 2.2397961616516113, + "learning_rate": 2.161243717825114e-05, + "loss": 0.0892, + "num_input_tokens_seen": 22597120, + "step": 107070 + }, + { + "epoch": 11.77942794279428, + "grad_norm": 0.06247791647911072, + "learning_rate": 2.1610059261955557e-05, + "loss": 0.0204, + "num_input_tokens_seen": 22598080, + "step": 107075 + }, + { + "epoch": 11.77997799779978, + "grad_norm": 0.08231907337903976, + "learning_rate": 2.1607681376903442e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22599072, + "step": 107080 + }, + { + "epoch": 11.78052805280528, + "grad_norm": 0.28421086072921753, + "learning_rate": 2.1605303523116692e-05, + "loss": 0.0065, + "num_input_tokens_seen": 22600096, + "step": 107085 + }, + { + "epoch": 11.781078107810782, + "grad_norm": 1.617235779762268, + "learning_rate": 2.160292570061724e-05, + "loss": 0.1228, + "num_input_tokens_seen": 22601088, + "step": 107090 + }, + { + "epoch": 11.781628162816281, + "grad_norm": 0.04150785505771637, + "learning_rate": 2.160054790942699e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22602176, + "step": 107095 + }, + { + "epoch": 11.782178217821782, + "grad_norm": 0.5559359788894653, + "learning_rate": 2.159817014956785e-05, + "loss": 0.0326, + "num_input_tokens_seen": 22603200, + "step": 107100 + }, + { + "epoch": 11.782728272827283, + "grad_norm": 0.01523113064467907, + "learning_rate": 2.1595792421061754e-05, + "loss": 0.0706, + "num_input_tokens_seen": 22604288, + "step": 107105 + }, + { + "epoch": 11.783278327832782, + "grad_norm": 0.010569379664957523, + "learning_rate": 2.1593414723930587e-05, + "loss": 0.0315, + "num_input_tokens_seen": 22605280, + "step": 107110 + }, + { + "epoch": 11.783828382838283, + "grad_norm": 3.953463077545166, + "learning_rate": 2.1591037058196305e-05, + "loss": 0.1138, + "num_input_tokens_seen": 22606400, + "step": 107115 + }, + { + "epoch": 11.784378437843785, + "grad_norm": 0.00795299094170332, + "learning_rate": 2.158865942388079e-05, + "loss": 0.0616, + "num_input_tokens_seen": 22607424, + "step": 107120 + }, + { + "epoch": 11.784928492849286, + "grad_norm": 1.3558580875396729, + "learning_rate": 2.1586281821005958e-05, + "loss": 0.0566, + "num_input_tokens_seen": 22608480, + "step": 107125 + }, + { + "epoch": 11.785478547854785, + "grad_norm": 0.36963167786598206, + "learning_rate": 2.1583904249593738e-05, + "loss": 0.0101, + "num_input_tokens_seen": 22609504, + "step": 107130 + }, + { + "epoch": 11.786028602860286, + "grad_norm": 0.10532223433256149, + "learning_rate": 2.1581526709666024e-05, + "loss": 0.0736, + "num_input_tokens_seen": 22610560, + "step": 107135 + }, + { + "epoch": 11.786578657865787, + "grad_norm": 1.231675148010254, + "learning_rate": 2.1579149201244747e-05, + "loss": 0.146, + "num_input_tokens_seen": 22611520, + "step": 107140 + }, + { + "epoch": 11.787128712871286, + "grad_norm": 0.3973824679851532, + "learning_rate": 2.1576771724351813e-05, + "loss": 0.012, + "num_input_tokens_seen": 22612640, + "step": 107145 + }, + { + "epoch": 11.787678767876788, + "grad_norm": 0.05354364961385727, + "learning_rate": 2.157439427900912e-05, + "loss": 0.0066, + "num_input_tokens_seen": 22613728, + "step": 107150 + }, + { + "epoch": 11.788228822882289, + "grad_norm": 1.659315586090088, + "learning_rate": 2.1572016865238596e-05, + "loss": 0.0566, + "num_input_tokens_seen": 22614848, + "step": 107155 + }, + { + "epoch": 11.788778877887788, + "grad_norm": 2.0854690074920654, + "learning_rate": 2.1569639483062156e-05, + "loss": 0.1457, + "num_input_tokens_seen": 22615840, + "step": 107160 + }, + { + "epoch": 11.789328932893289, + "grad_norm": 0.030751295387744904, + "learning_rate": 2.1567262132501688e-05, + "loss": 0.0057, + "num_input_tokens_seen": 22616832, + "step": 107165 + }, + { + "epoch": 11.78987898789879, + "grad_norm": 0.8733691573143005, + "learning_rate": 2.1564884813579124e-05, + "loss": 0.1278, + "num_input_tokens_seen": 22617920, + "step": 107170 + }, + { + "epoch": 11.79042904290429, + "grad_norm": 0.8684431314468384, + "learning_rate": 2.1562507526316364e-05, + "loss": 0.0295, + "num_input_tokens_seen": 22619008, + "step": 107175 + }, + { + "epoch": 11.79097909790979, + "grad_norm": 0.16296759247779846, + "learning_rate": 2.1560130270735327e-05, + "loss": 0.005, + "num_input_tokens_seen": 22620064, + "step": 107180 + }, + { + "epoch": 11.791529152915292, + "grad_norm": 0.04345141723752022, + "learning_rate": 2.1557753046857927e-05, + "loss": 0.0492, + "num_input_tokens_seen": 22621152, + "step": 107185 + }, + { + "epoch": 11.792079207920793, + "grad_norm": 0.07135577499866486, + "learning_rate": 2.155537585470605e-05, + "loss": 0.0439, + "num_input_tokens_seen": 22622240, + "step": 107190 + }, + { + "epoch": 11.792629262926292, + "grad_norm": 3.886349678039551, + "learning_rate": 2.155299869430163e-05, + "loss": 0.0928, + "num_input_tokens_seen": 22623328, + "step": 107195 + }, + { + "epoch": 11.793179317931793, + "grad_norm": 0.014547787606716156, + "learning_rate": 2.155062156566656e-05, + "loss": 0.0718, + "num_input_tokens_seen": 22624352, + "step": 107200 + }, + { + "epoch": 11.793729372937294, + "grad_norm": 0.02707412652671337, + "learning_rate": 2.1548244468822767e-05, + "loss": 0.0388, + "num_input_tokens_seen": 22625408, + "step": 107205 + }, + { + "epoch": 11.794279427942794, + "grad_norm": 0.060631535947322845, + "learning_rate": 2.1545867403792145e-05, + "loss": 0.0213, + "num_input_tokens_seen": 22626496, + "step": 107210 + }, + { + "epoch": 11.794829482948295, + "grad_norm": 0.15403130650520325, + "learning_rate": 2.1543490370596597e-05, + "loss": 0.0041, + "num_input_tokens_seen": 22627520, + "step": 107215 + }, + { + "epoch": 11.795379537953796, + "grad_norm": 0.03653036057949066, + "learning_rate": 2.1541113369258052e-05, + "loss": 0.0505, + "num_input_tokens_seen": 22628608, + "step": 107220 + }, + { + "epoch": 11.795929592959295, + "grad_norm": 0.028923748061060905, + "learning_rate": 2.1538736399798392e-05, + "loss": 0.0072, + "num_input_tokens_seen": 22629568, + "step": 107225 + }, + { + "epoch": 11.796479647964796, + "grad_norm": 0.11926022917032242, + "learning_rate": 2.1536359462239548e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22630592, + "step": 107230 + }, + { + "epoch": 11.797029702970297, + "grad_norm": 2.0795094966888428, + "learning_rate": 2.1533982556603413e-05, + "loss": 0.0509, + "num_input_tokens_seen": 22631616, + "step": 107235 + }, + { + "epoch": 11.797579757975798, + "grad_norm": 0.04035186022520065, + "learning_rate": 2.1531605682911897e-05, + "loss": 0.0983, + "num_input_tokens_seen": 22632704, + "step": 107240 + }, + { + "epoch": 11.798129812981298, + "grad_norm": 0.04526512324810028, + "learning_rate": 2.1529228841186913e-05, + "loss": 0.0121, + "num_input_tokens_seen": 22633696, + "step": 107245 + }, + { + "epoch": 11.798679867986799, + "grad_norm": 2.0971295833587646, + "learning_rate": 2.152685203145035e-05, + "loss": 0.0372, + "num_input_tokens_seen": 22634752, + "step": 107250 + }, + { + "epoch": 11.7992299229923, + "grad_norm": 0.006613160949200392, + "learning_rate": 2.1524475253724134e-05, + "loss": 0.0071, + "num_input_tokens_seen": 22635808, + "step": 107255 + }, + { + "epoch": 11.7997799779978, + "grad_norm": 0.06851713359355927, + "learning_rate": 2.1522098508030168e-05, + "loss": 0.0075, + "num_input_tokens_seen": 22636768, + "step": 107260 + }, + { + "epoch": 11.8003300330033, + "grad_norm": 1.1058799028396606, + "learning_rate": 2.1519721794390335e-05, + "loss": 0.0494, + "num_input_tokens_seen": 22637760, + "step": 107265 + }, + { + "epoch": 11.800880088008801, + "grad_norm": 0.1056492030620575, + "learning_rate": 2.1517345112826565e-05, + "loss": 0.0082, + "num_input_tokens_seen": 22638816, + "step": 107270 + }, + { + "epoch": 11.8014301430143, + "grad_norm": 0.0346100814640522, + "learning_rate": 2.151496846336075e-05, + "loss": 0.0072, + "num_input_tokens_seen": 22639904, + "step": 107275 + }, + { + "epoch": 11.801980198019802, + "grad_norm": 0.20280218124389648, + "learning_rate": 2.1512591846014807e-05, + "loss": 0.0063, + "num_input_tokens_seen": 22640928, + "step": 107280 + }, + { + "epoch": 11.802530253025303, + "grad_norm": 0.6426182389259338, + "learning_rate": 2.151021526081063e-05, + "loss": 0.1325, + "num_input_tokens_seen": 22641952, + "step": 107285 + }, + { + "epoch": 11.803080308030804, + "grad_norm": 0.04655110836029053, + "learning_rate": 2.150783870777011e-05, + "loss": 0.0181, + "num_input_tokens_seen": 22643008, + "step": 107290 + }, + { + "epoch": 11.803630363036303, + "grad_norm": 0.0794302448630333, + "learning_rate": 2.150546218691518e-05, + "loss": 0.0807, + "num_input_tokens_seen": 22644032, + "step": 107295 + }, + { + "epoch": 11.804180418041804, + "grad_norm": 0.052391067147254944, + "learning_rate": 2.1503085698267717e-05, + "loss": 0.0049, + "num_input_tokens_seen": 22645152, + "step": 107300 + }, + { + "epoch": 11.804730473047305, + "grad_norm": 0.06321123987436295, + "learning_rate": 2.150070924184965e-05, + "loss": 0.0105, + "num_input_tokens_seen": 22646240, + "step": 107305 + }, + { + "epoch": 11.805280528052805, + "grad_norm": 0.014181124046444893, + "learning_rate": 2.149833281768286e-05, + "loss": 0.0052, + "num_input_tokens_seen": 22647328, + "step": 107310 + }, + { + "epoch": 11.805830583058306, + "grad_norm": 0.05282822251319885, + "learning_rate": 2.1495956425789248e-05, + "loss": 0.0232, + "num_input_tokens_seen": 22648448, + "step": 107315 + }, + { + "epoch": 11.806380638063807, + "grad_norm": 0.017502931877970695, + "learning_rate": 2.1493580066190738e-05, + "loss": 0.0728, + "num_input_tokens_seen": 22649568, + "step": 107320 + }, + { + "epoch": 11.806930693069306, + "grad_norm": 0.014315336011350155, + "learning_rate": 2.1491203738909205e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22650624, + "step": 107325 + }, + { + "epoch": 11.807480748074807, + "grad_norm": 0.15645645558834076, + "learning_rate": 2.148882744396657e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22651712, + "step": 107330 + }, + { + "epoch": 11.808030803080309, + "grad_norm": 0.07176104933023453, + "learning_rate": 2.1486451181384734e-05, + "loss": 0.0203, + "num_input_tokens_seen": 22652800, + "step": 107335 + }, + { + "epoch": 11.808580858085808, + "grad_norm": 0.16065187752246857, + "learning_rate": 2.1484074951185577e-05, + "loss": 0.0632, + "num_input_tokens_seen": 22653888, + "step": 107340 + }, + { + "epoch": 11.809130913091309, + "grad_norm": 0.03038998693227768, + "learning_rate": 2.1481698753391025e-05, + "loss": 0.0031, + "num_input_tokens_seen": 22654912, + "step": 107345 + }, + { + "epoch": 11.80968096809681, + "grad_norm": 0.054654400795698166, + "learning_rate": 2.147932258802297e-05, + "loss": 0.0134, + "num_input_tokens_seen": 22655968, + "step": 107350 + }, + { + "epoch": 11.810231023102311, + "grad_norm": 0.6142993569374084, + "learning_rate": 2.147694645510329e-05, + "loss": 0.0156, + "num_input_tokens_seen": 22657024, + "step": 107355 + }, + { + "epoch": 11.81078107810781, + "grad_norm": 0.8506576418876648, + "learning_rate": 2.147457035465393e-05, + "loss": 0.0194, + "num_input_tokens_seen": 22658048, + "step": 107360 + }, + { + "epoch": 11.811331133113312, + "grad_norm": 0.0467284731566906, + "learning_rate": 2.1472194286696744e-05, + "loss": 0.1391, + "num_input_tokens_seen": 22659104, + "step": 107365 + }, + { + "epoch": 11.811881188118813, + "grad_norm": 0.03380647674202919, + "learning_rate": 2.1469818251253657e-05, + "loss": 0.1058, + "num_input_tokens_seen": 22660128, + "step": 107370 + }, + { + "epoch": 11.812431243124312, + "grad_norm": 0.025665609166026115, + "learning_rate": 2.1467442248346566e-05, + "loss": 0.0074, + "num_input_tokens_seen": 22661216, + "step": 107375 + }, + { + "epoch": 11.812981298129813, + "grad_norm": 0.03341222181916237, + "learning_rate": 2.1465066277997358e-05, + "loss": 0.006, + "num_input_tokens_seen": 22662240, + "step": 107380 + }, + { + "epoch": 11.813531353135314, + "grad_norm": 0.02586643025279045, + "learning_rate": 2.146269034022794e-05, + "loss": 0.0038, + "num_input_tokens_seen": 22663264, + "step": 107385 + }, + { + "epoch": 11.814081408140813, + "grad_norm": 0.03368692472577095, + "learning_rate": 2.14603144350602e-05, + "loss": 0.0036, + "num_input_tokens_seen": 22664352, + "step": 107390 + }, + { + "epoch": 11.814631463146315, + "grad_norm": 0.09022465348243713, + "learning_rate": 2.1457938562516058e-05, + "loss": 0.0571, + "num_input_tokens_seen": 22665344, + "step": 107395 + }, + { + "epoch": 11.815181518151816, + "grad_norm": 0.8215145468711853, + "learning_rate": 2.1455562722617388e-05, + "loss": 0.0116, + "num_input_tokens_seen": 22666368, + "step": 107400 + }, + { + "epoch": 11.815731573157315, + "grad_norm": 0.7296144962310791, + "learning_rate": 2.1453186915386095e-05, + "loss": 0.0132, + "num_input_tokens_seen": 22667520, + "step": 107405 + }, + { + "epoch": 11.816281628162816, + "grad_norm": 0.044961243867874146, + "learning_rate": 2.1450811140844084e-05, + "loss": 0.1672, + "num_input_tokens_seen": 22668640, + "step": 107410 + }, + { + "epoch": 11.816831683168317, + "grad_norm": 0.06276869028806686, + "learning_rate": 2.144843539901323e-05, + "loss": 0.0176, + "num_input_tokens_seen": 22669664, + "step": 107415 + }, + { + "epoch": 11.817381738173818, + "grad_norm": 0.034165747463703156, + "learning_rate": 2.1446059689915456e-05, + "loss": 0.0031, + "num_input_tokens_seen": 22670752, + "step": 107420 + }, + { + "epoch": 11.817931793179318, + "grad_norm": 0.08595339953899384, + "learning_rate": 2.144368401357264e-05, + "loss": 0.0036, + "num_input_tokens_seen": 22671840, + "step": 107425 + }, + { + "epoch": 11.818481848184819, + "grad_norm": 0.2749493420124054, + "learning_rate": 2.1441308370006678e-05, + "loss": 0.009, + "num_input_tokens_seen": 22672896, + "step": 107430 + }, + { + "epoch": 11.81903190319032, + "grad_norm": 0.12193772196769714, + "learning_rate": 2.1438932759239476e-05, + "loss": 0.1234, + "num_input_tokens_seen": 22673984, + "step": 107435 + }, + { + "epoch": 11.819581958195819, + "grad_norm": 0.018610628321766853, + "learning_rate": 2.1436557181292914e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22674944, + "step": 107440 + }, + { + "epoch": 11.82013201320132, + "grad_norm": 0.013455077074468136, + "learning_rate": 2.14341816361889e-05, + "loss": 0.0025, + "num_input_tokens_seen": 22675968, + "step": 107445 + }, + { + "epoch": 11.820682068206821, + "grad_norm": 0.11645886301994324, + "learning_rate": 2.1431806123949326e-05, + "loss": 0.0332, + "num_input_tokens_seen": 22677056, + "step": 107450 + }, + { + "epoch": 11.82123212321232, + "grad_norm": 0.024577273055911064, + "learning_rate": 2.1429430644596078e-05, + "loss": 0.0027, + "num_input_tokens_seen": 22678112, + "step": 107455 + }, + { + "epoch": 11.821782178217822, + "grad_norm": 0.014462623745203018, + "learning_rate": 2.1427055198151056e-05, + "loss": 0.0116, + "num_input_tokens_seen": 22679136, + "step": 107460 + }, + { + "epoch": 11.822332233223323, + "grad_norm": 0.025885431095957756, + "learning_rate": 2.1424679784636144e-05, + "loss": 0.047, + "num_input_tokens_seen": 22680192, + "step": 107465 + }, + { + "epoch": 11.822882288228822, + "grad_norm": 0.02072707749903202, + "learning_rate": 2.1422304404073257e-05, + "loss": 0.0021, + "num_input_tokens_seen": 22681248, + "step": 107470 + }, + { + "epoch": 11.823432343234323, + "grad_norm": 2.531768321990967, + "learning_rate": 2.1419929056484267e-05, + "loss": 0.0419, + "num_input_tokens_seen": 22682336, + "step": 107475 + }, + { + "epoch": 11.823982398239824, + "grad_norm": 0.08583708107471466, + "learning_rate": 2.1417553741891067e-05, + "loss": 0.0051, + "num_input_tokens_seen": 22683392, + "step": 107480 + }, + { + "epoch": 11.824532453245325, + "grad_norm": 0.2955256402492523, + "learning_rate": 2.141517846031556e-05, + "loss": 0.0031, + "num_input_tokens_seen": 22684416, + "step": 107485 + }, + { + "epoch": 11.825082508250825, + "grad_norm": 0.15812285244464874, + "learning_rate": 2.141280321177963e-05, + "loss": 0.0101, + "num_input_tokens_seen": 22685504, + "step": 107490 + }, + { + "epoch": 11.825632563256326, + "grad_norm": 0.013808993622660637, + "learning_rate": 2.141042799630518e-05, + "loss": 0.1044, + "num_input_tokens_seen": 22686592, + "step": 107495 + }, + { + "epoch": 11.826182618261827, + "grad_norm": 1.2025699615478516, + "learning_rate": 2.1408052813914086e-05, + "loss": 0.0692, + "num_input_tokens_seen": 22687616, + "step": 107500 + }, + { + "epoch": 11.826732673267326, + "grad_norm": 2.4142253398895264, + "learning_rate": 2.1405677664628242e-05, + "loss": 0.0427, + "num_input_tokens_seen": 22688640, + "step": 107505 + }, + { + "epoch": 11.827282728272827, + "grad_norm": 0.023066915571689606, + "learning_rate": 2.1403302548469555e-05, + "loss": 0.0698, + "num_input_tokens_seen": 22689696, + "step": 107510 + }, + { + "epoch": 11.827832783278328, + "grad_norm": 0.10080292075872421, + "learning_rate": 2.1400927465459893e-05, + "loss": 0.0329, + "num_input_tokens_seen": 22690688, + "step": 107515 + }, + { + "epoch": 11.828382838283828, + "grad_norm": 0.18066120147705078, + "learning_rate": 2.1398552415621155e-05, + "loss": 0.0117, + "num_input_tokens_seen": 22691744, + "step": 107520 + }, + { + "epoch": 11.828932893289329, + "grad_norm": 0.3372590243816376, + "learning_rate": 2.1396177398975243e-05, + "loss": 0.0438, + "num_input_tokens_seen": 22692704, + "step": 107525 + }, + { + "epoch": 11.82948294829483, + "grad_norm": 0.1409350484609604, + "learning_rate": 2.139380241554401e-05, + "loss": 0.0149, + "num_input_tokens_seen": 22693696, + "step": 107530 + }, + { + "epoch": 11.83003300330033, + "grad_norm": 0.029623968526721, + "learning_rate": 2.1391427465349395e-05, + "loss": 0.0575, + "num_input_tokens_seen": 22694720, + "step": 107535 + }, + { + "epoch": 11.83058305830583, + "grad_norm": 0.024389689788222313, + "learning_rate": 2.138905254841325e-05, + "loss": 0.1341, + "num_input_tokens_seen": 22695808, + "step": 107540 + }, + { + "epoch": 11.831133113311331, + "grad_norm": 1.0698860883712769, + "learning_rate": 2.1386677664757474e-05, + "loss": 0.04, + "num_input_tokens_seen": 22696864, + "step": 107545 + }, + { + "epoch": 11.831683168316832, + "grad_norm": 0.14018096029758453, + "learning_rate": 2.1384302814403964e-05, + "loss": 0.0844, + "num_input_tokens_seen": 22697856, + "step": 107550 + }, + { + "epoch": 11.832233223322332, + "grad_norm": 0.04669695347547531, + "learning_rate": 2.138192799737459e-05, + "loss": 0.0034, + "num_input_tokens_seen": 22698880, + "step": 107555 + }, + { + "epoch": 11.832783278327833, + "grad_norm": 0.24611252546310425, + "learning_rate": 2.1379553213691257e-05, + "loss": 0.0049, + "num_input_tokens_seen": 22699904, + "step": 107560 + }, + { + "epoch": 11.833333333333334, + "grad_norm": 1.742897629737854, + "learning_rate": 2.1377178463375848e-05, + "loss": 0.0963, + "num_input_tokens_seen": 22700928, + "step": 107565 + }, + { + "epoch": 11.833883388338833, + "grad_norm": 0.06615012884140015, + "learning_rate": 2.1374803746450233e-05, + "loss": 0.005, + "num_input_tokens_seen": 22701984, + "step": 107570 + }, + { + "epoch": 11.834433443344334, + "grad_norm": 0.15787093341350555, + "learning_rate": 2.137242906293632e-05, + "loss": 0.0259, + "num_input_tokens_seen": 22702976, + "step": 107575 + }, + { + "epoch": 11.834983498349835, + "grad_norm": 0.27551403641700745, + "learning_rate": 2.1370054412855983e-05, + "loss": 0.0277, + "num_input_tokens_seen": 22703968, + "step": 107580 + }, + { + "epoch": 11.835533553355335, + "grad_norm": 1.095415472984314, + "learning_rate": 2.1367679796231124e-05, + "loss": 0.0797, + "num_input_tokens_seen": 22705024, + "step": 107585 + }, + { + "epoch": 11.836083608360836, + "grad_norm": 0.07232381403446198, + "learning_rate": 2.136530521308361e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22706144, + "step": 107590 + }, + { + "epoch": 11.836633663366337, + "grad_norm": 0.027147026732563972, + "learning_rate": 2.136293066343533e-05, + "loss": 0.0042, + "num_input_tokens_seen": 22707168, + "step": 107595 + }, + { + "epoch": 11.837183718371836, + "grad_norm": 0.01603945530951023, + "learning_rate": 2.136055614730818e-05, + "loss": 0.0091, + "num_input_tokens_seen": 22708224, + "step": 107600 + }, + { + "epoch": 11.837733773377337, + "grad_norm": 0.44187408685684204, + "learning_rate": 2.135818166472403e-05, + "loss": 0.0076, + "num_input_tokens_seen": 22709280, + "step": 107605 + }, + { + "epoch": 11.838283828382838, + "grad_norm": 0.04502306133508682, + "learning_rate": 2.1355807215704782e-05, + "loss": 0.0737, + "num_input_tokens_seen": 22710336, + "step": 107610 + }, + { + "epoch": 11.83883388338834, + "grad_norm": 0.06049984693527222, + "learning_rate": 2.1353432800272306e-05, + "loss": 0.1052, + "num_input_tokens_seen": 22711488, + "step": 107615 + }, + { + "epoch": 11.839383938393839, + "grad_norm": 0.044578682631254196, + "learning_rate": 2.1351058418448487e-05, + "loss": 0.0134, + "num_input_tokens_seen": 22712640, + "step": 107620 + }, + { + "epoch": 11.83993399339934, + "grad_norm": 0.09437454491853714, + "learning_rate": 2.1348684070255217e-05, + "loss": 0.075, + "num_input_tokens_seen": 22713696, + "step": 107625 + }, + { + "epoch": 11.840484048404841, + "grad_norm": 0.04453608766198158, + "learning_rate": 2.1346309755714366e-05, + "loss": 0.0116, + "num_input_tokens_seen": 22714720, + "step": 107630 + }, + { + "epoch": 11.84103410341034, + "grad_norm": 0.06333883851766586, + "learning_rate": 2.1343935474847832e-05, + "loss": 0.0331, + "num_input_tokens_seen": 22715712, + "step": 107635 + }, + { + "epoch": 11.841584158415841, + "grad_norm": 0.03861748054623604, + "learning_rate": 2.134156122767749e-05, + "loss": 0.0027, + "num_input_tokens_seen": 22716704, + "step": 107640 + }, + { + "epoch": 11.842134213421343, + "grad_norm": 0.7923757433891296, + "learning_rate": 2.133918701422522e-05, + "loss": 0.0437, + "num_input_tokens_seen": 22717792, + "step": 107645 + }, + { + "epoch": 11.842684268426842, + "grad_norm": 0.2466883510351181, + "learning_rate": 2.1336812834512907e-05, + "loss": 0.0055, + "num_input_tokens_seen": 22718848, + "step": 107650 + }, + { + "epoch": 11.843234323432343, + "grad_norm": 0.00840639229863882, + "learning_rate": 2.1334438688562428e-05, + "loss": 0.0645, + "num_input_tokens_seen": 22719936, + "step": 107655 + }, + { + "epoch": 11.843784378437844, + "grad_norm": 0.06373940408229828, + "learning_rate": 2.133206457639567e-05, + "loss": 0.0107, + "num_input_tokens_seen": 22721024, + "step": 107660 + }, + { + "epoch": 11.844334433443345, + "grad_norm": 0.058656733483076096, + "learning_rate": 2.1329690498034525e-05, + "loss": 0.0382, + "num_input_tokens_seen": 22722112, + "step": 107665 + }, + { + "epoch": 11.844884488448844, + "grad_norm": 0.04237069934606552, + "learning_rate": 2.132731645350085e-05, + "loss": 0.0154, + "num_input_tokens_seen": 22723072, + "step": 107670 + }, + { + "epoch": 11.845434543454346, + "grad_norm": 0.04079674184322357, + "learning_rate": 2.1324942442816537e-05, + "loss": 0.0023, + "num_input_tokens_seen": 22724128, + "step": 107675 + }, + { + "epoch": 11.845984598459847, + "grad_norm": 0.16331085562705994, + "learning_rate": 2.1322568466003464e-05, + "loss": 0.0951, + "num_input_tokens_seen": 22725216, + "step": 107680 + }, + { + "epoch": 11.846534653465346, + "grad_norm": 0.14583419263362885, + "learning_rate": 2.1320194523083526e-05, + "loss": 0.0083, + "num_input_tokens_seen": 22726240, + "step": 107685 + }, + { + "epoch": 11.847084708470847, + "grad_norm": 0.03472309187054634, + "learning_rate": 2.1317820614078577e-05, + "loss": 0.0249, + "num_input_tokens_seen": 22727328, + "step": 107690 + }, + { + "epoch": 11.847634763476348, + "grad_norm": 0.26067885756492615, + "learning_rate": 2.131544673901051e-05, + "loss": 0.0046, + "num_input_tokens_seen": 22728384, + "step": 107695 + }, + { + "epoch": 11.848184818481847, + "grad_norm": 0.08087015897035599, + "learning_rate": 2.131307289790121e-05, + "loss": 0.0729, + "num_input_tokens_seen": 22729408, + "step": 107700 + }, + { + "epoch": 11.848734873487349, + "grad_norm": 0.028494257479906082, + "learning_rate": 2.131069909077254e-05, + "loss": 0.0039, + "num_input_tokens_seen": 22730464, + "step": 107705 + }, + { + "epoch": 11.84928492849285, + "grad_norm": 0.017460886389017105, + "learning_rate": 2.1308325317646387e-05, + "loss": 0.0743, + "num_input_tokens_seen": 22731456, + "step": 107710 + }, + { + "epoch": 11.84983498349835, + "grad_norm": 0.04232804477214813, + "learning_rate": 2.1305951578544636e-05, + "loss": 0.005, + "num_input_tokens_seen": 22732480, + "step": 107715 + }, + { + "epoch": 11.85038503850385, + "grad_norm": 0.15554197132587433, + "learning_rate": 2.130357787348914e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22733504, + "step": 107720 + }, + { + "epoch": 11.850935093509351, + "grad_norm": 0.17255868017673492, + "learning_rate": 2.1301204202501813e-05, + "loss": 0.0053, + "num_input_tokens_seen": 22734592, + "step": 107725 + }, + { + "epoch": 11.851485148514852, + "grad_norm": 1.4012089967727661, + "learning_rate": 2.12988305656045e-05, + "loss": 0.0868, + "num_input_tokens_seen": 22735648, + "step": 107730 + }, + { + "epoch": 11.852035203520352, + "grad_norm": 0.0949435606598854, + "learning_rate": 2.129645696281909e-05, + "loss": 0.0294, + "num_input_tokens_seen": 22736736, + "step": 107735 + }, + { + "epoch": 11.852585258525853, + "grad_norm": 0.20533642172813416, + "learning_rate": 2.1294083394167467e-05, + "loss": 0.0073, + "num_input_tokens_seen": 22737856, + "step": 107740 + }, + { + "epoch": 11.853135313531354, + "grad_norm": 0.017147932201623917, + "learning_rate": 2.1291709859671488e-05, + "loss": 0.0028, + "num_input_tokens_seen": 22738912, + "step": 107745 + }, + { + "epoch": 11.853685368536853, + "grad_norm": 0.017415901646018028, + "learning_rate": 2.1289336359353045e-05, + "loss": 0.0455, + "num_input_tokens_seen": 22740000, + "step": 107750 + }, + { + "epoch": 11.854235423542354, + "grad_norm": 0.05121271684765816, + "learning_rate": 2.1286962893234014e-05, + "loss": 0.0134, + "num_input_tokens_seen": 22741088, + "step": 107755 + }, + { + "epoch": 11.854785478547855, + "grad_norm": 0.3990635573863983, + "learning_rate": 2.1284589461336254e-05, + "loss": 0.0204, + "num_input_tokens_seen": 22742112, + "step": 107760 + }, + { + "epoch": 11.855335533553355, + "grad_norm": 0.6020422577857971, + "learning_rate": 2.1282216063681656e-05, + "loss": 0.0584, + "num_input_tokens_seen": 22743232, + "step": 107765 + }, + { + "epoch": 11.855885588558856, + "grad_norm": 0.02085283026099205, + "learning_rate": 2.1279842700292083e-05, + "loss": 0.006, + "num_input_tokens_seen": 22744288, + "step": 107770 + }, + { + "epoch": 11.856435643564357, + "grad_norm": 0.0358474925160408, + "learning_rate": 2.1277469371189417e-05, + "loss": 0.0155, + "num_input_tokens_seen": 22745248, + "step": 107775 + }, + { + "epoch": 11.856985698569858, + "grad_norm": 0.028773976489901543, + "learning_rate": 2.127509607639554e-05, + "loss": 0.0667, + "num_input_tokens_seen": 22746240, + "step": 107780 + }, + { + "epoch": 11.857535753575357, + "grad_norm": 0.06674148887395859, + "learning_rate": 2.1272722815932304e-05, + "loss": 0.0078, + "num_input_tokens_seen": 22747264, + "step": 107785 + }, + { + "epoch": 11.858085808580858, + "grad_norm": 0.12342587113380432, + "learning_rate": 2.1270349589821594e-05, + "loss": 0.0919, + "num_input_tokens_seen": 22748352, + "step": 107790 + }, + { + "epoch": 11.85863586358636, + "grad_norm": 3.334540605545044, + "learning_rate": 2.126797639808528e-05, + "loss": 0.0968, + "num_input_tokens_seen": 22749408, + "step": 107795 + }, + { + "epoch": 11.859185918591859, + "grad_norm": 0.0122367013245821, + "learning_rate": 2.1265603240745246e-05, + "loss": 0.0827, + "num_input_tokens_seen": 22750432, + "step": 107800 + }, + { + "epoch": 11.85973597359736, + "grad_norm": 0.1908012479543686, + "learning_rate": 2.1263230117823352e-05, + "loss": 0.0104, + "num_input_tokens_seen": 22751488, + "step": 107805 + }, + { + "epoch": 11.86028602860286, + "grad_norm": 0.029329074546694756, + "learning_rate": 2.1260857029341466e-05, + "loss": 0.0683, + "num_input_tokens_seen": 22752544, + "step": 107810 + }, + { + "epoch": 11.86083608360836, + "grad_norm": 4.855162143707275, + "learning_rate": 2.1258483975321478e-05, + "loss": 0.0881, + "num_input_tokens_seen": 22753600, + "step": 107815 + }, + { + "epoch": 11.861386138613861, + "grad_norm": 0.07810419797897339, + "learning_rate": 2.1256110955785234e-05, + "loss": 0.0823, + "num_input_tokens_seen": 22754656, + "step": 107820 + }, + { + "epoch": 11.861936193619362, + "grad_norm": 0.13797765970230103, + "learning_rate": 2.125373797075463e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22755680, + "step": 107825 + }, + { + "epoch": 11.862486248624862, + "grad_norm": 0.2379174530506134, + "learning_rate": 2.1251365020251527e-05, + "loss": 0.0111, + "num_input_tokens_seen": 22756704, + "step": 107830 + }, + { + "epoch": 11.863036303630363, + "grad_norm": 0.008962541818618774, + "learning_rate": 2.1248992104297776e-05, + "loss": 0.0168, + "num_input_tokens_seen": 22757760, + "step": 107835 + }, + { + "epoch": 11.863586358635864, + "grad_norm": 0.027348697185516357, + "learning_rate": 2.1246619222915283e-05, + "loss": 0.0102, + "num_input_tokens_seen": 22758880, + "step": 107840 + }, + { + "epoch": 11.864136413641365, + "grad_norm": 0.31691569089889526, + "learning_rate": 2.1244246376125893e-05, + "loss": 0.0161, + "num_input_tokens_seen": 22759968, + "step": 107845 + }, + { + "epoch": 11.864686468646864, + "grad_norm": 0.016143852844834328, + "learning_rate": 2.124187356395148e-05, + "loss": 0.1047, + "num_input_tokens_seen": 22760992, + "step": 107850 + }, + { + "epoch": 11.865236523652365, + "grad_norm": 0.44679173827171326, + "learning_rate": 2.1239500786413925e-05, + "loss": 0.0122, + "num_input_tokens_seen": 22762016, + "step": 107855 + }, + { + "epoch": 11.865786578657866, + "grad_norm": 0.07943354547023773, + "learning_rate": 2.1237128043535075e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22763104, + "step": 107860 + }, + { + "epoch": 11.866336633663366, + "grad_norm": 0.029963115230202675, + "learning_rate": 2.1234755335336816e-05, + "loss": 0.0014, + "num_input_tokens_seen": 22764096, + "step": 107865 + }, + { + "epoch": 11.866886688668867, + "grad_norm": 0.9262649416923523, + "learning_rate": 2.1232382661841005e-05, + "loss": 0.1024, + "num_input_tokens_seen": 22765184, + "step": 107870 + }, + { + "epoch": 11.867436743674368, + "grad_norm": 0.09357981383800507, + "learning_rate": 2.1230010023069524e-05, + "loss": 0.0149, + "num_input_tokens_seen": 22766208, + "step": 107875 + }, + { + "epoch": 11.867986798679867, + "grad_norm": 0.03910910710692406, + "learning_rate": 2.122763741904423e-05, + "loss": 0.0025, + "num_input_tokens_seen": 22767264, + "step": 107880 + }, + { + "epoch": 11.868536853685368, + "grad_norm": 0.39837872982025146, + "learning_rate": 2.1225264849786983e-05, + "loss": 0.0397, + "num_input_tokens_seen": 22768320, + "step": 107885 + }, + { + "epoch": 11.86908690869087, + "grad_norm": 0.03466206043958664, + "learning_rate": 2.122289231531967e-05, + "loss": 0.0272, + "num_input_tokens_seen": 22769344, + "step": 107890 + }, + { + "epoch": 11.869636963696369, + "grad_norm": 0.0711694210767746, + "learning_rate": 2.122051981566413e-05, + "loss": 0.1022, + "num_input_tokens_seen": 22770464, + "step": 107895 + }, + { + "epoch": 11.87018701870187, + "grad_norm": 0.8409008383750916, + "learning_rate": 2.121814735084226e-05, + "loss": 0.013, + "num_input_tokens_seen": 22771488, + "step": 107900 + }, + { + "epoch": 11.870737073707371, + "grad_norm": 0.060230154544115067, + "learning_rate": 2.1215774920875907e-05, + "loss": 0.004, + "num_input_tokens_seen": 22772544, + "step": 107905 + }, + { + "epoch": 11.871287128712872, + "grad_norm": 0.03961599990725517, + "learning_rate": 2.1213402525786938e-05, + "loss": 0.0045, + "num_input_tokens_seen": 22773600, + "step": 107910 + }, + { + "epoch": 11.871837183718371, + "grad_norm": 0.07723477482795715, + "learning_rate": 2.1211030165597228e-05, + "loss": 0.0544, + "num_input_tokens_seen": 22774656, + "step": 107915 + }, + { + "epoch": 11.872387238723872, + "grad_norm": 0.08542167395353317, + "learning_rate": 2.120865784032863e-05, + "loss": 0.0287, + "num_input_tokens_seen": 22775744, + "step": 107920 + }, + { + "epoch": 11.872937293729374, + "grad_norm": 0.020089779049158096, + "learning_rate": 2.1206285550003012e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22776800, + "step": 107925 + }, + { + "epoch": 11.873487348734873, + "grad_norm": 0.17518828809261322, + "learning_rate": 2.1203913294642248e-05, + "loss": 0.0552, + "num_input_tokens_seen": 22777856, + "step": 107930 + }, + { + "epoch": 11.874037403740374, + "grad_norm": 0.016861235722899437, + "learning_rate": 2.120154107426818e-05, + "loss": 0.0236, + "num_input_tokens_seen": 22778848, + "step": 107935 + }, + { + "epoch": 11.874587458745875, + "grad_norm": 2.8007657527923584, + "learning_rate": 2.119916888890269e-05, + "loss": 0.0951, + "num_input_tokens_seen": 22779904, + "step": 107940 + }, + { + "epoch": 11.875137513751374, + "grad_norm": 0.22537897527217865, + "learning_rate": 2.1196796738567647e-05, + "loss": 0.0049, + "num_input_tokens_seen": 22780928, + "step": 107945 + }, + { + "epoch": 11.875687568756875, + "grad_norm": 0.047947902232408524, + "learning_rate": 2.1194424623284892e-05, + "loss": 0.0026, + "num_input_tokens_seen": 22781984, + "step": 107950 + }, + { + "epoch": 11.876237623762377, + "grad_norm": 0.03263973817229271, + "learning_rate": 2.1192052543076304e-05, + "loss": 0.0103, + "num_input_tokens_seen": 22783072, + "step": 107955 + }, + { + "epoch": 11.876787678767876, + "grad_norm": 0.0354837030172348, + "learning_rate": 2.1189680497963734e-05, + "loss": 0.0056, + "num_input_tokens_seen": 22784096, + "step": 107960 + }, + { + "epoch": 11.877337733773377, + "grad_norm": 0.34747880697250366, + "learning_rate": 2.118730848796906e-05, + "loss": 0.0145, + "num_input_tokens_seen": 22785216, + "step": 107965 + }, + { + "epoch": 11.877887788778878, + "grad_norm": 0.019231567159295082, + "learning_rate": 2.118493651311413e-05, + "loss": 0.0106, + "num_input_tokens_seen": 22786272, + "step": 107970 + }, + { + "epoch": 11.87843784378438, + "grad_norm": 0.397373765707016, + "learning_rate": 2.1182564573420805e-05, + "loss": 0.0126, + "num_input_tokens_seen": 22787360, + "step": 107975 + }, + { + "epoch": 11.878987898789878, + "grad_norm": 0.04116310551762581, + "learning_rate": 2.1180192668910957e-05, + "loss": 0.0043, + "num_input_tokens_seen": 22788416, + "step": 107980 + }, + { + "epoch": 11.87953795379538, + "grad_norm": 0.03167185187339783, + "learning_rate": 2.117782079960643e-05, + "loss": 0.0071, + "num_input_tokens_seen": 22789440, + "step": 107985 + }, + { + "epoch": 11.88008800880088, + "grad_norm": 2.5381338596343994, + "learning_rate": 2.117544896552911e-05, + "loss": 0.0261, + "num_input_tokens_seen": 22790528, + "step": 107990 + }, + { + "epoch": 11.88063806380638, + "grad_norm": 0.03713083267211914, + "learning_rate": 2.1173077166700837e-05, + "loss": 0.01, + "num_input_tokens_seen": 22791584, + "step": 107995 + }, + { + "epoch": 11.881188118811881, + "grad_norm": 0.32242241501808167, + "learning_rate": 2.117070540314347e-05, + "loss": 0.0046, + "num_input_tokens_seen": 22792608, + "step": 108000 + }, + { + "epoch": 11.881738173817382, + "grad_norm": 0.06490187346935272, + "learning_rate": 2.116833367487888e-05, + "loss": 0.0034, + "num_input_tokens_seen": 22793696, + "step": 108005 + }, + { + "epoch": 11.882288228822881, + "grad_norm": 1.228894829750061, + "learning_rate": 2.116596198192891e-05, + "loss": 0.0167, + "num_input_tokens_seen": 22794720, + "step": 108010 + }, + { + "epoch": 11.882838283828383, + "grad_norm": 0.19643060863018036, + "learning_rate": 2.1163590324315435e-05, + "loss": 0.0039, + "num_input_tokens_seen": 22795776, + "step": 108015 + }, + { + "epoch": 11.883388338833884, + "grad_norm": 0.0572015643119812, + "learning_rate": 2.116121870206031e-05, + "loss": 0.0284, + "num_input_tokens_seen": 22796896, + "step": 108020 + }, + { + "epoch": 11.883938393839383, + "grad_norm": 0.02464556321501732, + "learning_rate": 2.115884711518538e-05, + "loss": 0.0617, + "num_input_tokens_seen": 22797920, + "step": 108025 + }, + { + "epoch": 11.884488448844884, + "grad_norm": 0.09937786310911179, + "learning_rate": 2.115647556371253e-05, + "loss": 0.0227, + "num_input_tokens_seen": 22799040, + "step": 108030 + }, + { + "epoch": 11.885038503850385, + "grad_norm": 0.5027102828025818, + "learning_rate": 2.1154104047663583e-05, + "loss": 0.0784, + "num_input_tokens_seen": 22800128, + "step": 108035 + }, + { + "epoch": 11.885588558855886, + "grad_norm": 0.1602194607257843, + "learning_rate": 2.1151732567060423e-05, + "loss": 0.0088, + "num_input_tokens_seen": 22801216, + "step": 108040 + }, + { + "epoch": 11.886138613861386, + "grad_norm": 0.026003176346421242, + "learning_rate": 2.1149361121924902e-05, + "loss": 0.0138, + "num_input_tokens_seen": 22802336, + "step": 108045 + }, + { + "epoch": 11.886688668866887, + "grad_norm": 1.0571075677871704, + "learning_rate": 2.114698971227886e-05, + "loss": 0.0179, + "num_input_tokens_seen": 22803424, + "step": 108050 + }, + { + "epoch": 11.887238723872388, + "grad_norm": 0.06945277750492096, + "learning_rate": 2.1144618338144172e-05, + "loss": 0.0125, + "num_input_tokens_seen": 22804512, + "step": 108055 + }, + { + "epoch": 11.887788778877887, + "grad_norm": 0.04981284588575363, + "learning_rate": 2.114224699954268e-05, + "loss": 0.0052, + "num_input_tokens_seen": 22805632, + "step": 108060 + }, + { + "epoch": 11.888338833883388, + "grad_norm": 0.025736285373568535, + "learning_rate": 2.113987569649626e-05, + "loss": 0.0056, + "num_input_tokens_seen": 22806720, + "step": 108065 + }, + { + "epoch": 11.88888888888889, + "grad_norm": 0.05554858595132828, + "learning_rate": 2.1137504429026745e-05, + "loss": 0.0388, + "num_input_tokens_seen": 22807776, + "step": 108070 + }, + { + "epoch": 11.88943894389439, + "grad_norm": 0.0029608896002173424, + "learning_rate": 2.1135133197155992e-05, + "loss": 0.0047, + "num_input_tokens_seen": 22808864, + "step": 108075 + }, + { + "epoch": 11.88998899889989, + "grad_norm": 0.0067785209976136684, + "learning_rate": 2.1132762000905866e-05, + "loss": 0.0705, + "num_input_tokens_seen": 22809952, + "step": 108080 + }, + { + "epoch": 11.89053905390539, + "grad_norm": 0.009704271331429482, + "learning_rate": 2.1130390840298218e-05, + "loss": 0.003, + "num_input_tokens_seen": 22811072, + "step": 108085 + }, + { + "epoch": 11.891089108910892, + "grad_norm": 0.06130577623844147, + "learning_rate": 2.1128019715354906e-05, + "loss": 0.0286, + "num_input_tokens_seen": 22812160, + "step": 108090 + }, + { + "epoch": 11.891639163916391, + "grad_norm": 0.3077602982521057, + "learning_rate": 2.1125648626097774e-05, + "loss": 0.0573, + "num_input_tokens_seen": 22813280, + "step": 108095 + }, + { + "epoch": 11.892189218921892, + "grad_norm": 0.02543102763593197, + "learning_rate": 2.1123277572548673e-05, + "loss": 0.0016, + "num_input_tokens_seen": 22814336, + "step": 108100 + }, + { + "epoch": 11.892739273927393, + "grad_norm": 0.06395094096660614, + "learning_rate": 2.1120906554729476e-05, + "loss": 0.0022, + "num_input_tokens_seen": 22815360, + "step": 108105 + }, + { + "epoch": 11.893289328932893, + "grad_norm": 0.07932254672050476, + "learning_rate": 2.111853557266201e-05, + "loss": 0.0089, + "num_input_tokens_seen": 22816480, + "step": 108110 + }, + { + "epoch": 11.893839383938394, + "grad_norm": 1.6584235429763794, + "learning_rate": 2.1116164626368147e-05, + "loss": 0.0726, + "num_input_tokens_seen": 22817536, + "step": 108115 + }, + { + "epoch": 11.894389438943895, + "grad_norm": 0.22728875279426575, + "learning_rate": 2.1113793715869735e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22818656, + "step": 108120 + }, + { + "epoch": 11.894939493949394, + "grad_norm": 0.06923547387123108, + "learning_rate": 2.111142284118861e-05, + "loss": 0.0012, + "num_input_tokens_seen": 22819712, + "step": 108125 + }, + { + "epoch": 11.895489548954895, + "grad_norm": 1.3036648035049438, + "learning_rate": 2.1109052002346637e-05, + "loss": 0.0981, + "num_input_tokens_seen": 22820704, + "step": 108130 + }, + { + "epoch": 11.896039603960396, + "grad_norm": 0.27789297699928284, + "learning_rate": 2.1106681199365674e-05, + "loss": 0.0061, + "num_input_tokens_seen": 22821760, + "step": 108135 + }, + { + "epoch": 11.896589658965897, + "grad_norm": 0.044090330600738525, + "learning_rate": 2.1104310432267545e-05, + "loss": 0.0145, + "num_input_tokens_seen": 22822784, + "step": 108140 + }, + { + "epoch": 11.897139713971397, + "grad_norm": 0.1793091744184494, + "learning_rate": 2.1101939701074133e-05, + "loss": 0.004, + "num_input_tokens_seen": 22823776, + "step": 108145 + }, + { + "epoch": 11.897689768976898, + "grad_norm": 0.18223416805267334, + "learning_rate": 2.1099569005807264e-05, + "loss": 0.0103, + "num_input_tokens_seen": 22824800, + "step": 108150 + }, + { + "epoch": 11.898239823982399, + "grad_norm": 0.053188055753707886, + "learning_rate": 2.1097198346488798e-05, + "loss": 0.0045, + "num_input_tokens_seen": 22825856, + "step": 108155 + }, + { + "epoch": 11.898789878987898, + "grad_norm": 0.26654425263404846, + "learning_rate": 2.109482772314059e-05, + "loss": 0.0368, + "num_input_tokens_seen": 22826816, + "step": 108160 + }, + { + "epoch": 11.8993399339934, + "grad_norm": 0.010017650201916695, + "learning_rate": 2.1092457135784468e-05, + "loss": 0.0877, + "num_input_tokens_seen": 22827904, + "step": 108165 + }, + { + "epoch": 11.8998899889989, + "grad_norm": 0.10437098145484924, + "learning_rate": 2.10900865844423e-05, + "loss": 0.0046, + "num_input_tokens_seen": 22829024, + "step": 108170 + }, + { + "epoch": 11.9004400440044, + "grad_norm": 0.20205286145210266, + "learning_rate": 2.1087716069135922e-05, + "loss": 0.0034, + "num_input_tokens_seen": 22830048, + "step": 108175 + }, + { + "epoch": 11.900990099009901, + "grad_norm": 0.003574824659153819, + "learning_rate": 2.1085345589887198e-05, + "loss": 0.0049, + "num_input_tokens_seen": 22831104, + "step": 108180 + }, + { + "epoch": 11.901540154015402, + "grad_norm": 0.030702099204063416, + "learning_rate": 2.108297514671796e-05, + "loss": 0.1057, + "num_input_tokens_seen": 22832128, + "step": 108185 + }, + { + "epoch": 11.902090209020901, + "grad_norm": 0.053564343601465225, + "learning_rate": 2.1080604739650058e-05, + "loss": 0.0028, + "num_input_tokens_seen": 22833184, + "step": 108190 + }, + { + "epoch": 11.902640264026402, + "grad_norm": 0.018807632848620415, + "learning_rate": 2.1078234368705348e-05, + "loss": 0.079, + "num_input_tokens_seen": 22834208, + "step": 108195 + }, + { + "epoch": 11.903190319031903, + "grad_norm": 0.08818026632070541, + "learning_rate": 2.1075864033905656e-05, + "loss": 0.0138, + "num_input_tokens_seen": 22835360, + "step": 108200 + }, + { + "epoch": 11.903740374037405, + "grad_norm": 3.306227684020996, + "learning_rate": 2.107349373527286e-05, + "loss": 0.0364, + "num_input_tokens_seen": 22836384, + "step": 108205 + }, + { + "epoch": 11.904290429042904, + "grad_norm": 0.01746279001235962, + "learning_rate": 2.107112347282878e-05, + "loss": 0.041, + "num_input_tokens_seen": 22837376, + "step": 108210 + }, + { + "epoch": 11.904840484048405, + "grad_norm": 0.038111183792352676, + "learning_rate": 2.1068753246595267e-05, + "loss": 0.0069, + "num_input_tokens_seen": 22838400, + "step": 108215 + }, + { + "epoch": 11.905390539053906, + "grad_norm": 2.5024445056915283, + "learning_rate": 2.1066383056594177e-05, + "loss": 0.0513, + "num_input_tokens_seen": 22839552, + "step": 108220 + }, + { + "epoch": 11.905940594059405, + "grad_norm": 0.006849715486168861, + "learning_rate": 2.1064012902847336e-05, + "loss": 0.0032, + "num_input_tokens_seen": 22840608, + "step": 108225 + }, + { + "epoch": 11.906490649064907, + "grad_norm": 0.47842204570770264, + "learning_rate": 2.1061642785376606e-05, + "loss": 0.0392, + "num_input_tokens_seen": 22841696, + "step": 108230 + }, + { + "epoch": 11.907040704070408, + "grad_norm": 1.598039984703064, + "learning_rate": 2.105927270420383e-05, + "loss": 0.0205, + "num_input_tokens_seen": 22842752, + "step": 108235 + }, + { + "epoch": 11.907590759075907, + "grad_norm": 0.9014031887054443, + "learning_rate": 2.1056902659350836e-05, + "loss": 0.0963, + "num_input_tokens_seen": 22843808, + "step": 108240 + }, + { + "epoch": 11.908140814081408, + "grad_norm": 0.03478559851646423, + "learning_rate": 2.1054532650839484e-05, + "loss": 0.001, + "num_input_tokens_seen": 22844864, + "step": 108245 + }, + { + "epoch": 11.908690869086909, + "grad_norm": 0.15042848885059357, + "learning_rate": 2.1052162678691604e-05, + "loss": 0.0458, + "num_input_tokens_seen": 22845888, + "step": 108250 + }, + { + "epoch": 11.909240924092408, + "grad_norm": 0.03496120125055313, + "learning_rate": 2.104979274292906e-05, + "loss": 0.1, + "num_input_tokens_seen": 22846944, + "step": 108255 + }, + { + "epoch": 11.90979097909791, + "grad_norm": 0.7451954483985901, + "learning_rate": 2.1047422843573673e-05, + "loss": 0.0256, + "num_input_tokens_seen": 22848000, + "step": 108260 + }, + { + "epoch": 11.91034103410341, + "grad_norm": 0.01446770690381527, + "learning_rate": 2.1045052980647287e-05, + "loss": 0.0943, + "num_input_tokens_seen": 22849056, + "step": 108265 + }, + { + "epoch": 11.910891089108912, + "grad_norm": 0.07596997171640396, + "learning_rate": 2.1042683154171754e-05, + "loss": 0.0089, + "num_input_tokens_seen": 22850176, + "step": 108270 + }, + { + "epoch": 11.911441144114411, + "grad_norm": 0.012979518622159958, + "learning_rate": 2.104031336416891e-05, + "loss": 0.0067, + "num_input_tokens_seen": 22851136, + "step": 108275 + }, + { + "epoch": 11.911991199119912, + "grad_norm": 0.03238875791430473, + "learning_rate": 2.1037943610660605e-05, + "loss": 0.0451, + "num_input_tokens_seen": 22852192, + "step": 108280 + }, + { + "epoch": 11.912541254125413, + "grad_norm": 0.20766155421733856, + "learning_rate": 2.1035573893668672e-05, + "loss": 0.0961, + "num_input_tokens_seen": 22853280, + "step": 108285 + }, + { + "epoch": 11.913091309130913, + "grad_norm": 0.06210799515247345, + "learning_rate": 2.1033204213214945e-05, + "loss": 0.0018, + "num_input_tokens_seen": 22854304, + "step": 108290 + }, + { + "epoch": 11.913641364136414, + "grad_norm": 0.6848000884056091, + "learning_rate": 2.1030834569321278e-05, + "loss": 0.0124, + "num_input_tokens_seen": 22855296, + "step": 108295 + }, + { + "epoch": 11.914191419141915, + "grad_norm": 0.04685825854539871, + "learning_rate": 2.1028464962009496e-05, + "loss": 0.0239, + "num_input_tokens_seen": 22856320, + "step": 108300 + }, + { + "epoch": 11.914741474147414, + "grad_norm": 1.8265855312347412, + "learning_rate": 2.1026095391301454e-05, + "loss": 0.0836, + "num_input_tokens_seen": 22857376, + "step": 108305 + }, + { + "epoch": 11.915291529152915, + "grad_norm": 1.4205491542816162, + "learning_rate": 2.1023725857218988e-05, + "loss": 0.1208, + "num_input_tokens_seen": 22858464, + "step": 108310 + }, + { + "epoch": 11.915841584158416, + "grad_norm": 0.005510250572115183, + "learning_rate": 2.1021356359783917e-05, + "loss": 0.0732, + "num_input_tokens_seen": 22859520, + "step": 108315 + }, + { + "epoch": 11.916391639163916, + "grad_norm": 0.13487520813941956, + "learning_rate": 2.1018986899018112e-05, + "loss": 0.0124, + "num_input_tokens_seen": 22860544, + "step": 108320 + }, + { + "epoch": 11.916941694169417, + "grad_norm": 0.05060837045311928, + "learning_rate": 2.1016617474943386e-05, + "loss": 0.0022, + "num_input_tokens_seen": 22861568, + "step": 108325 + }, + { + "epoch": 11.917491749174918, + "grad_norm": 0.05840083956718445, + "learning_rate": 2.101424808758158e-05, + "loss": 0.0146, + "num_input_tokens_seen": 22862592, + "step": 108330 + }, + { + "epoch": 11.918041804180419, + "grad_norm": 0.00903963390737772, + "learning_rate": 2.1011878736954552e-05, + "loss": 0.0241, + "num_input_tokens_seen": 22863648, + "step": 108335 + }, + { + "epoch": 11.918591859185918, + "grad_norm": 0.06377683579921722, + "learning_rate": 2.1009509423084108e-05, + "loss": 0.0032, + "num_input_tokens_seen": 22864768, + "step": 108340 + }, + { + "epoch": 11.91914191419142, + "grad_norm": 1.9735612869262695, + "learning_rate": 2.100714014599211e-05, + "loss": 0.1305, + "num_input_tokens_seen": 22865856, + "step": 108345 + }, + { + "epoch": 11.91969196919692, + "grad_norm": 1.8490498065948486, + "learning_rate": 2.1004770905700388e-05, + "loss": 0.0569, + "num_input_tokens_seen": 22866880, + "step": 108350 + }, + { + "epoch": 11.92024202420242, + "grad_norm": 0.006432351656258106, + "learning_rate": 2.1002401702230768e-05, + "loss": 0.0031, + "num_input_tokens_seen": 22867872, + "step": 108355 + }, + { + "epoch": 11.92079207920792, + "grad_norm": 0.0916363075375557, + "learning_rate": 2.1000032535605095e-05, + "loss": 0.0042, + "num_input_tokens_seen": 22868896, + "step": 108360 + }, + { + "epoch": 11.921342134213422, + "grad_norm": 0.015937654301524162, + "learning_rate": 2.0997663405845204e-05, + "loss": 0.004, + "num_input_tokens_seen": 22869984, + "step": 108365 + }, + { + "epoch": 11.921892189218921, + "grad_norm": 2.474182605743408, + "learning_rate": 2.0995294312972933e-05, + "loss": 0.1241, + "num_input_tokens_seen": 22871040, + "step": 108370 + }, + { + "epoch": 11.922442244224422, + "grad_norm": 0.02966678887605667, + "learning_rate": 2.099292525701011e-05, + "loss": 0.017, + "num_input_tokens_seen": 22872128, + "step": 108375 + }, + { + "epoch": 11.922992299229923, + "grad_norm": 0.06772065162658691, + "learning_rate": 2.0990556237978563e-05, + "loss": 0.003, + "num_input_tokens_seen": 22873184, + "step": 108380 + }, + { + "epoch": 11.923542354235423, + "grad_norm": 0.020478293299674988, + "learning_rate": 2.0988187255900144e-05, + "loss": 0.0012, + "num_input_tokens_seen": 22874240, + "step": 108385 + }, + { + "epoch": 11.924092409240924, + "grad_norm": 0.007864104583859444, + "learning_rate": 2.0985818310796674e-05, + "loss": 0.0124, + "num_input_tokens_seen": 22875392, + "step": 108390 + }, + { + "epoch": 11.924642464246425, + "grad_norm": 0.4507907032966614, + "learning_rate": 2.0983449402689997e-05, + "loss": 0.0087, + "num_input_tokens_seen": 22876384, + "step": 108395 + }, + { + "epoch": 11.925192519251926, + "grad_norm": 0.08598481863737106, + "learning_rate": 2.0981080531601937e-05, + "loss": 0.0112, + "num_input_tokens_seen": 22877440, + "step": 108400 + }, + { + "epoch": 11.925742574257425, + "grad_norm": 0.056392256170511246, + "learning_rate": 2.0978711697554322e-05, + "loss": 0.0024, + "num_input_tokens_seen": 22878528, + "step": 108405 + }, + { + "epoch": 11.926292629262926, + "grad_norm": 0.22677303850650787, + "learning_rate": 2.0976342900569007e-05, + "loss": 0.0243, + "num_input_tokens_seen": 22879584, + "step": 108410 + }, + { + "epoch": 11.926842684268427, + "grad_norm": 0.01607857458293438, + "learning_rate": 2.0973974140667793e-05, + "loss": 0.0065, + "num_input_tokens_seen": 22880640, + "step": 108415 + }, + { + "epoch": 11.927392739273927, + "grad_norm": 0.01808926649391651, + "learning_rate": 2.0971605417872535e-05, + "loss": 0.0062, + "num_input_tokens_seen": 22881696, + "step": 108420 + }, + { + "epoch": 11.927942794279428, + "grad_norm": 0.22974230349063873, + "learning_rate": 2.0969236732205062e-05, + "loss": 0.005, + "num_input_tokens_seen": 22882848, + "step": 108425 + }, + { + "epoch": 11.928492849284929, + "grad_norm": 0.18524235486984253, + "learning_rate": 2.096686808368719e-05, + "loss": 0.0072, + "num_input_tokens_seen": 22884032, + "step": 108430 + }, + { + "epoch": 11.929042904290428, + "grad_norm": 0.015172503888607025, + "learning_rate": 2.0964499472340763e-05, + "loss": 0.0044, + "num_input_tokens_seen": 22885088, + "step": 108435 + }, + { + "epoch": 11.92959295929593, + "grad_norm": 0.0416102334856987, + "learning_rate": 2.0962130898187604e-05, + "loss": 0.0045, + "num_input_tokens_seen": 22886080, + "step": 108440 + }, + { + "epoch": 11.93014301430143, + "grad_norm": 0.015416119247674942, + "learning_rate": 2.0959762361249552e-05, + "loss": 0.078, + "num_input_tokens_seen": 22887136, + "step": 108445 + }, + { + "epoch": 11.930693069306932, + "grad_norm": 0.0381799079477787, + "learning_rate": 2.0957393861548437e-05, + "loss": 0.0457, + "num_input_tokens_seen": 22888160, + "step": 108450 + }, + { + "epoch": 11.93124312431243, + "grad_norm": 1.6848599910736084, + "learning_rate": 2.0955025399106072e-05, + "loss": 0.0518, + "num_input_tokens_seen": 22889152, + "step": 108455 + }, + { + "epoch": 11.931793179317932, + "grad_norm": 0.003552859416231513, + "learning_rate": 2.09526569739443e-05, + "loss": 0.043, + "num_input_tokens_seen": 22890240, + "step": 108460 + }, + { + "epoch": 11.932343234323433, + "grad_norm": 0.027501681819558144, + "learning_rate": 2.0950288586084945e-05, + "loss": 0.002, + "num_input_tokens_seen": 22891328, + "step": 108465 + }, + { + "epoch": 11.932893289328932, + "grad_norm": 0.20359037816524506, + "learning_rate": 2.094792023554985e-05, + "loss": 0.0039, + "num_input_tokens_seen": 22892384, + "step": 108470 + }, + { + "epoch": 11.933443344334433, + "grad_norm": 0.0074807084165513515, + "learning_rate": 2.0945551922360818e-05, + "loss": 0.0064, + "num_input_tokens_seen": 22893472, + "step": 108475 + }, + { + "epoch": 11.933993399339935, + "grad_norm": 0.0320177897810936, + "learning_rate": 2.0943183646539686e-05, + "loss": 0.0144, + "num_input_tokens_seen": 22894496, + "step": 108480 + }, + { + "epoch": 11.934543454345434, + "grad_norm": 0.010605044662952423, + "learning_rate": 2.0940815408108293e-05, + "loss": 0.0269, + "num_input_tokens_seen": 22895584, + "step": 108485 + }, + { + "epoch": 11.935093509350935, + "grad_norm": 0.03903839364647865, + "learning_rate": 2.0938447207088447e-05, + "loss": 0.0012, + "num_input_tokens_seen": 22896608, + "step": 108490 + }, + { + "epoch": 11.935643564356436, + "grad_norm": 0.019632209092378616, + "learning_rate": 2.093607904350199e-05, + "loss": 0.0102, + "num_input_tokens_seen": 22897664, + "step": 108495 + }, + { + "epoch": 11.936193619361937, + "grad_norm": 0.07810953259468079, + "learning_rate": 2.0933710917370746e-05, + "loss": 0.012, + "num_input_tokens_seen": 22898688, + "step": 108500 + }, + { + "epoch": 11.936743674367436, + "grad_norm": 0.9870508909225464, + "learning_rate": 2.0931342828716523e-05, + "loss": 0.0759, + "num_input_tokens_seen": 22899712, + "step": 108505 + }, + { + "epoch": 11.937293729372938, + "grad_norm": 2.2933712005615234, + "learning_rate": 2.0928974777561176e-05, + "loss": 0.1734, + "num_input_tokens_seen": 22900800, + "step": 108510 + }, + { + "epoch": 11.937843784378439, + "grad_norm": 0.027332335710525513, + "learning_rate": 2.0926606763926513e-05, + "loss": 0.0095, + "num_input_tokens_seen": 22901792, + "step": 108515 + }, + { + "epoch": 11.938393839383938, + "grad_norm": 1.9961148500442505, + "learning_rate": 2.0924238787834353e-05, + "loss": 0.0215, + "num_input_tokens_seen": 22902816, + "step": 108520 + }, + { + "epoch": 11.938943894389439, + "grad_norm": 0.029404157772660255, + "learning_rate": 2.0921870849306534e-05, + "loss": 0.0012, + "num_input_tokens_seen": 22903872, + "step": 108525 + }, + { + "epoch": 11.93949394939494, + "grad_norm": 0.09292270988225937, + "learning_rate": 2.091950294836487e-05, + "loss": 0.0167, + "num_input_tokens_seen": 22904896, + "step": 108530 + }, + { + "epoch": 11.94004400440044, + "grad_norm": 0.013469934463500977, + "learning_rate": 2.0917135085031194e-05, + "loss": 0.0021, + "num_input_tokens_seen": 22905952, + "step": 108535 + }, + { + "epoch": 11.94059405940594, + "grad_norm": 1.7835230827331543, + "learning_rate": 2.0914767259327327e-05, + "loss": 0.1741, + "num_input_tokens_seen": 22907008, + "step": 108540 + }, + { + "epoch": 11.941144114411442, + "grad_norm": 1.6806762218475342, + "learning_rate": 2.0912399471275085e-05, + "loss": 0.0512, + "num_input_tokens_seen": 22908064, + "step": 108545 + }, + { + "epoch": 11.941694169416941, + "grad_norm": 0.0625801607966423, + "learning_rate": 2.0910031720896293e-05, + "loss": 0.069, + "num_input_tokens_seen": 22909088, + "step": 108550 + }, + { + "epoch": 11.942244224422442, + "grad_norm": 2.113190174102783, + "learning_rate": 2.090766400821278e-05, + "loss": 0.0122, + "num_input_tokens_seen": 22910144, + "step": 108555 + }, + { + "epoch": 11.942794279427943, + "grad_norm": 1.429683804512024, + "learning_rate": 2.0905296333246363e-05, + "loss": 0.0316, + "num_input_tokens_seen": 22911136, + "step": 108560 + }, + { + "epoch": 11.943344334433444, + "grad_norm": 2.095639228820801, + "learning_rate": 2.090292869601887e-05, + "loss": 0.0367, + "num_input_tokens_seen": 22912160, + "step": 108565 + }, + { + "epoch": 11.943894389438944, + "grad_norm": 0.02412949502468109, + "learning_rate": 2.090056109655211e-05, + "loss": 0.0058, + "num_input_tokens_seen": 22913248, + "step": 108570 + }, + { + "epoch": 11.944444444444445, + "grad_norm": 0.004262794274836779, + "learning_rate": 2.0898193534867916e-05, + "loss": 0.0274, + "num_input_tokens_seen": 22914272, + "step": 108575 + }, + { + "epoch": 11.944994499449946, + "grad_norm": 0.0056312186643481255, + "learning_rate": 2.0895826010988096e-05, + "loss": 0.0054, + "num_input_tokens_seen": 22915328, + "step": 108580 + }, + { + "epoch": 11.945544554455445, + "grad_norm": 0.01387854665517807, + "learning_rate": 2.089345852493449e-05, + "loss": 0.0088, + "num_input_tokens_seen": 22916384, + "step": 108585 + }, + { + "epoch": 11.946094609460946, + "grad_norm": 0.32987526059150696, + "learning_rate": 2.0891091076728902e-05, + "loss": 0.0243, + "num_input_tokens_seen": 22917440, + "step": 108590 + }, + { + "epoch": 11.946644664466447, + "grad_norm": 0.04833708703517914, + "learning_rate": 2.088872366639315e-05, + "loss": 0.0286, + "num_input_tokens_seen": 22918528, + "step": 108595 + }, + { + "epoch": 11.947194719471947, + "grad_norm": 0.04779877886176109, + "learning_rate": 2.088635629394907e-05, + "loss": 0.002, + "num_input_tokens_seen": 22919552, + "step": 108600 + }, + { + "epoch": 11.947744774477448, + "grad_norm": 0.022594016045331955, + "learning_rate": 2.088398895941846e-05, + "loss": 0.0062, + "num_input_tokens_seen": 22920640, + "step": 108605 + }, + { + "epoch": 11.948294829482949, + "grad_norm": 1.2091056108474731, + "learning_rate": 2.0881621662823154e-05, + "loss": 0.0111, + "num_input_tokens_seen": 22921760, + "step": 108610 + }, + { + "epoch": 11.948844884488448, + "grad_norm": 0.06685319542884827, + "learning_rate": 2.0879254404184965e-05, + "loss": 0.0502, + "num_input_tokens_seen": 22922848, + "step": 108615 + }, + { + "epoch": 11.94939493949395, + "grad_norm": 0.4169768691062927, + "learning_rate": 2.0876887183525705e-05, + "loss": 0.0093, + "num_input_tokens_seen": 22923840, + "step": 108620 + }, + { + "epoch": 11.94994499449945, + "grad_norm": 0.006768313702195883, + "learning_rate": 2.0874520000867205e-05, + "loss": 0.0246, + "num_input_tokens_seen": 22924864, + "step": 108625 + }, + { + "epoch": 11.950495049504951, + "grad_norm": 0.004304874688386917, + "learning_rate": 2.0872152856231266e-05, + "loss": 0.0246, + "num_input_tokens_seen": 22925856, + "step": 108630 + }, + { + "epoch": 11.95104510451045, + "grad_norm": 2.170400381088257, + "learning_rate": 2.086978574963972e-05, + "loss": 0.1338, + "num_input_tokens_seen": 22926944, + "step": 108635 + }, + { + "epoch": 11.951595159515952, + "grad_norm": 0.010006625205278397, + "learning_rate": 2.086741868111438e-05, + "loss": 0.0009, + "num_input_tokens_seen": 22928032, + "step": 108640 + }, + { + "epoch": 11.952145214521453, + "grad_norm": 0.005612201057374477, + "learning_rate": 2.0865051650677053e-05, + "loss": 0.0047, + "num_input_tokens_seen": 22929120, + "step": 108645 + }, + { + "epoch": 11.952695269526952, + "grad_norm": 0.019375652074813843, + "learning_rate": 2.0862684658349564e-05, + "loss": 0.0012, + "num_input_tokens_seen": 22930208, + "step": 108650 + }, + { + "epoch": 11.953245324532453, + "grad_norm": 1.8778715133666992, + "learning_rate": 2.0860317704153716e-05, + "loss": 0.0454, + "num_input_tokens_seen": 22931296, + "step": 108655 + }, + { + "epoch": 11.953795379537954, + "grad_norm": 2.788461923599243, + "learning_rate": 2.085795078811135e-05, + "loss": 0.0987, + "num_input_tokens_seen": 22932384, + "step": 108660 + }, + { + "epoch": 11.954345434543454, + "grad_norm": 0.013916566967964172, + "learning_rate": 2.0855583910244252e-05, + "loss": 0.0034, + "num_input_tokens_seen": 22933408, + "step": 108665 + }, + { + "epoch": 11.954895489548955, + "grad_norm": 0.00959797017276287, + "learning_rate": 2.085321707057425e-05, + "loss": 0.0294, + "num_input_tokens_seen": 22934496, + "step": 108670 + }, + { + "epoch": 11.955445544554456, + "grad_norm": 0.06582950055599213, + "learning_rate": 2.085085026912316e-05, + "loss": 0.003, + "num_input_tokens_seen": 22935584, + "step": 108675 + }, + { + "epoch": 11.955995599559955, + "grad_norm": 0.019622281193733215, + "learning_rate": 2.084848350591278e-05, + "loss": 0.001, + "num_input_tokens_seen": 22936640, + "step": 108680 + }, + { + "epoch": 11.956545654565456, + "grad_norm": 0.06328054517507553, + "learning_rate": 2.0846116780964954e-05, + "loss": 0.0023, + "num_input_tokens_seen": 22937696, + "step": 108685 + }, + { + "epoch": 11.957095709570957, + "grad_norm": 0.5535486340522766, + "learning_rate": 2.0843750094301467e-05, + "loss": 0.0905, + "num_input_tokens_seen": 22938752, + "step": 108690 + }, + { + "epoch": 11.957645764576458, + "grad_norm": 0.0046058520674705505, + "learning_rate": 2.084138344594414e-05, + "loss": 0.0041, + "num_input_tokens_seen": 22939808, + "step": 108695 + }, + { + "epoch": 11.958195819581958, + "grad_norm": 0.06115549057722092, + "learning_rate": 2.083901683591479e-05, + "loss": 0.0056, + "num_input_tokens_seen": 22940832, + "step": 108700 + }, + { + "epoch": 11.958745874587459, + "grad_norm": 0.0075347828678786755, + "learning_rate": 2.0836650264235228e-05, + "loss": 0.0009, + "num_input_tokens_seen": 22941888, + "step": 108705 + }, + { + "epoch": 11.95929592959296, + "grad_norm": 0.009438450448215008, + "learning_rate": 2.083428373092725e-05, + "loss": 0.0047, + "num_input_tokens_seen": 22942944, + "step": 108710 + }, + { + "epoch": 11.95984598459846, + "grad_norm": 0.04663180187344551, + "learning_rate": 2.0831917236012696e-05, + "loss": 0.186, + "num_input_tokens_seen": 22943968, + "step": 108715 + }, + { + "epoch": 11.96039603960396, + "grad_norm": 0.1135031208395958, + "learning_rate": 2.082955077951335e-05, + "loss": 0.1577, + "num_input_tokens_seen": 22945024, + "step": 108720 + }, + { + "epoch": 11.960946094609461, + "grad_norm": 0.297503262758255, + "learning_rate": 2.0827184361451035e-05, + "loss": 0.0252, + "num_input_tokens_seen": 22946016, + "step": 108725 + }, + { + "epoch": 11.96149614961496, + "grad_norm": 0.05888249725103378, + "learning_rate": 2.0824817981847566e-05, + "loss": 0.0164, + "num_input_tokens_seen": 22947104, + "step": 108730 + }, + { + "epoch": 11.962046204620462, + "grad_norm": 0.034765273332595825, + "learning_rate": 2.082245164072474e-05, + "loss": 0.0047, + "num_input_tokens_seen": 22948096, + "step": 108735 + }, + { + "epoch": 11.962596259625963, + "grad_norm": 0.11997800320386887, + "learning_rate": 2.082008533810437e-05, + "loss": 0.0035, + "num_input_tokens_seen": 22949152, + "step": 108740 + }, + { + "epoch": 11.963146314631462, + "grad_norm": 0.006432862486690283, + "learning_rate": 2.0817719074008268e-05, + "loss": 0.0095, + "num_input_tokens_seen": 22950240, + "step": 108745 + }, + { + "epoch": 11.963696369636963, + "grad_norm": 2.0468382835388184, + "learning_rate": 2.0815352848458246e-05, + "loss": 0.0116, + "num_input_tokens_seen": 22951264, + "step": 108750 + }, + { + "epoch": 11.964246424642464, + "grad_norm": 0.06414537131786346, + "learning_rate": 2.0812986661476113e-05, + "loss": 0.0047, + "num_input_tokens_seen": 22952256, + "step": 108755 + }, + { + "epoch": 11.964796479647966, + "grad_norm": 0.04527723044157028, + "learning_rate": 2.0810620513083664e-05, + "loss": 0.0714, + "num_input_tokens_seen": 22953280, + "step": 108760 + }, + { + "epoch": 11.965346534653465, + "grad_norm": 0.04120012745261192, + "learning_rate": 2.080825440330272e-05, + "loss": 0.0022, + "num_input_tokens_seen": 22954336, + "step": 108765 + }, + { + "epoch": 11.965896589658966, + "grad_norm": 1.4530805349349976, + "learning_rate": 2.080588833215508e-05, + "loss": 0.05, + "num_input_tokens_seen": 22955328, + "step": 108770 + }, + { + "epoch": 11.966446644664467, + "grad_norm": 0.09823456406593323, + "learning_rate": 2.0803522299662566e-05, + "loss": 0.009, + "num_input_tokens_seen": 22956352, + "step": 108775 + }, + { + "epoch": 11.966996699669966, + "grad_norm": 0.5273795127868652, + "learning_rate": 2.0801156305846965e-05, + "loss": 0.0135, + "num_input_tokens_seen": 22957408, + "step": 108780 + }, + { + "epoch": 11.967546754675467, + "grad_norm": 0.2545296251773834, + "learning_rate": 2.0798790350730086e-05, + "loss": 0.0066, + "num_input_tokens_seen": 22958432, + "step": 108785 + }, + { + "epoch": 11.968096809680969, + "grad_norm": 0.004971607122570276, + "learning_rate": 2.0796424434333754e-05, + "loss": 0.0155, + "num_input_tokens_seen": 22959488, + "step": 108790 + }, + { + "epoch": 11.968646864686468, + "grad_norm": 1.7723455429077148, + "learning_rate": 2.0794058556679746e-05, + "loss": 0.0345, + "num_input_tokens_seen": 22960512, + "step": 108795 + }, + { + "epoch": 11.969196919691969, + "grad_norm": 0.12397713214159012, + "learning_rate": 2.079169271778989e-05, + "loss": 0.0107, + "num_input_tokens_seen": 22961664, + "step": 108800 + }, + { + "epoch": 11.96974697469747, + "grad_norm": 0.009450087323784828, + "learning_rate": 2.0789326917685983e-05, + "loss": 0.0125, + "num_input_tokens_seen": 22962720, + "step": 108805 + }, + { + "epoch": 11.97029702970297, + "grad_norm": 2.6557581424713135, + "learning_rate": 2.0786961156389827e-05, + "loss": 0.2466, + "num_input_tokens_seen": 22963840, + "step": 108810 + }, + { + "epoch": 11.97084708470847, + "grad_norm": 0.010167817585170269, + "learning_rate": 2.0784595433923236e-05, + "loss": 0.0061, + "num_input_tokens_seen": 22964960, + "step": 108815 + }, + { + "epoch": 11.971397139713972, + "grad_norm": 2.4539365768432617, + "learning_rate": 2.0782229750307993e-05, + "loss": 0.072, + "num_input_tokens_seen": 22966016, + "step": 108820 + }, + { + "epoch": 11.971947194719473, + "grad_norm": 0.04704366251826286, + "learning_rate": 2.0779864105565922e-05, + "loss": 0.0335, + "num_input_tokens_seen": 22967104, + "step": 108825 + }, + { + "epoch": 11.972497249724972, + "grad_norm": 0.005692682694643736, + "learning_rate": 2.0777498499718824e-05, + "loss": 0.007, + "num_input_tokens_seen": 22968096, + "step": 108830 + }, + { + "epoch": 11.973047304730473, + "grad_norm": 0.01649811491370201, + "learning_rate": 2.0775132932788485e-05, + "loss": 0.0115, + "num_input_tokens_seen": 22969184, + "step": 108835 + }, + { + "epoch": 11.973597359735974, + "grad_norm": 0.018967708572745323, + "learning_rate": 2.0772767404796725e-05, + "loss": 0.011, + "num_input_tokens_seen": 22970240, + "step": 108840 + }, + { + "epoch": 11.974147414741473, + "grad_norm": 1.2703298330307007, + "learning_rate": 2.0770401915765334e-05, + "loss": 0.0091, + "num_input_tokens_seen": 22971264, + "step": 108845 + }, + { + "epoch": 11.974697469746975, + "grad_norm": 0.012244755402207375, + "learning_rate": 2.076803646571613e-05, + "loss": 0.0097, + "num_input_tokens_seen": 22972384, + "step": 108850 + }, + { + "epoch": 11.975247524752476, + "grad_norm": 0.015514393337070942, + "learning_rate": 2.0765671054670895e-05, + "loss": 0.0176, + "num_input_tokens_seen": 22973408, + "step": 108855 + }, + { + "epoch": 11.975797579757975, + "grad_norm": 0.1543608009815216, + "learning_rate": 2.0763305682651434e-05, + "loss": 0.0454, + "num_input_tokens_seen": 22974496, + "step": 108860 + }, + { + "epoch": 11.976347634763476, + "grad_norm": 0.025751104578375816, + "learning_rate": 2.0760940349679557e-05, + "loss": 0.0033, + "num_input_tokens_seen": 22975584, + "step": 108865 + }, + { + "epoch": 11.976897689768977, + "grad_norm": 0.07685238867998123, + "learning_rate": 2.0758575055777058e-05, + "loss": 0.0403, + "num_input_tokens_seen": 22976640, + "step": 108870 + }, + { + "epoch": 11.977447744774478, + "grad_norm": 0.04283434897661209, + "learning_rate": 2.0756209800965743e-05, + "loss": 0.0017, + "num_input_tokens_seen": 22977728, + "step": 108875 + }, + { + "epoch": 11.977997799779978, + "grad_norm": 3.0288140773773193, + "learning_rate": 2.07538445852674e-05, + "loss": 0.0653, + "num_input_tokens_seen": 22978848, + "step": 108880 + }, + { + "epoch": 11.978547854785479, + "grad_norm": 0.020565129816532135, + "learning_rate": 2.0751479408703835e-05, + "loss": 0.054, + "num_input_tokens_seen": 22979904, + "step": 108885 + }, + { + "epoch": 11.97909790979098, + "grad_norm": 2.510249614715576, + "learning_rate": 2.074911427129685e-05, + "loss": 0.0331, + "num_input_tokens_seen": 22981024, + "step": 108890 + }, + { + "epoch": 11.979647964796479, + "grad_norm": 0.29321229457855225, + "learning_rate": 2.0746749173068235e-05, + "loss": 0.0027, + "num_input_tokens_seen": 22982080, + "step": 108895 + }, + { + "epoch": 11.98019801980198, + "grad_norm": 0.1817958503961563, + "learning_rate": 2.0744384114039787e-05, + "loss": 0.0041, + "num_input_tokens_seen": 22983200, + "step": 108900 + }, + { + "epoch": 11.980748074807481, + "grad_norm": 0.015084180980920792, + "learning_rate": 2.0742019094233318e-05, + "loss": 0.0047, + "num_input_tokens_seen": 22984224, + "step": 108905 + }, + { + "epoch": 11.98129812981298, + "grad_norm": 0.006338208913803101, + "learning_rate": 2.0739654113670607e-05, + "loss": 0.0098, + "num_input_tokens_seen": 22985248, + "step": 108910 + }, + { + "epoch": 11.981848184818482, + "grad_norm": 0.3250313997268677, + "learning_rate": 2.0737289172373463e-05, + "loss": 0.0049, + "num_input_tokens_seen": 22986368, + "step": 108915 + }, + { + "epoch": 11.982398239823983, + "grad_norm": 0.031635843217372894, + "learning_rate": 2.0734924270363687e-05, + "loss": 0.0014, + "num_input_tokens_seen": 22987424, + "step": 108920 + }, + { + "epoch": 11.982948294829484, + "grad_norm": 0.03739609196782112, + "learning_rate": 2.073255940766305e-05, + "loss": 0.0037, + "num_input_tokens_seen": 22988448, + "step": 108925 + }, + { + "epoch": 11.983498349834983, + "grad_norm": 2.2242166996002197, + "learning_rate": 2.0730194584293383e-05, + "loss": 0.017, + "num_input_tokens_seen": 22989536, + "step": 108930 + }, + { + "epoch": 11.984048404840484, + "grad_norm": 0.024465588852763176, + "learning_rate": 2.072782980027645e-05, + "loss": 0.0884, + "num_input_tokens_seen": 22990592, + "step": 108935 + }, + { + "epoch": 11.984598459845985, + "grad_norm": 0.04393136128783226, + "learning_rate": 2.0725465055634067e-05, + "loss": 0.0874, + "num_input_tokens_seen": 22991616, + "step": 108940 + }, + { + "epoch": 11.985148514851485, + "grad_norm": 0.05837548151612282, + "learning_rate": 2.0723100350388028e-05, + "loss": 0.0186, + "num_input_tokens_seen": 22992704, + "step": 108945 + }, + { + "epoch": 11.985698569856986, + "grad_norm": 0.05161342769861221, + "learning_rate": 2.0720735684560107e-05, + "loss": 0.0034, + "num_input_tokens_seen": 22993792, + "step": 108950 + }, + { + "epoch": 11.986248624862487, + "grad_norm": 0.8202314972877502, + "learning_rate": 2.071837105817212e-05, + "loss": 0.1241, + "num_input_tokens_seen": 22994880, + "step": 108955 + }, + { + "epoch": 11.986798679867986, + "grad_norm": 0.3409045338630676, + "learning_rate": 2.0716006471245844e-05, + "loss": 0.0762, + "num_input_tokens_seen": 22995936, + "step": 108960 + }, + { + "epoch": 11.987348734873487, + "grad_norm": 0.06541203707456589, + "learning_rate": 2.0713641923803094e-05, + "loss": 0.0051, + "num_input_tokens_seen": 22997024, + "step": 108965 + }, + { + "epoch": 11.987898789878988, + "grad_norm": 0.15485215187072754, + "learning_rate": 2.071127741586564e-05, + "loss": 0.0725, + "num_input_tokens_seen": 22998080, + "step": 108970 + }, + { + "epoch": 11.988448844884488, + "grad_norm": 2.359231948852539, + "learning_rate": 2.0708912947455287e-05, + "loss": 0.0567, + "num_input_tokens_seen": 22999200, + "step": 108975 + }, + { + "epoch": 11.988998899889989, + "grad_norm": 0.0785381942987442, + "learning_rate": 2.070654851859383e-05, + "loss": 0.0014, + "num_input_tokens_seen": 23000192, + "step": 108980 + }, + { + "epoch": 11.98954895489549, + "grad_norm": 0.08562452346086502, + "learning_rate": 2.070418412930304e-05, + "loss": 0.0672, + "num_input_tokens_seen": 23001280, + "step": 108985 + }, + { + "epoch": 11.990099009900991, + "grad_norm": 0.04734231159090996, + "learning_rate": 2.0701819779604738e-05, + "loss": 0.0079, + "num_input_tokens_seen": 23002400, + "step": 108990 + }, + { + "epoch": 11.99064906490649, + "grad_norm": 0.006729878485202789, + "learning_rate": 2.0699455469520694e-05, + "loss": 0.0037, + "num_input_tokens_seen": 23003456, + "step": 108995 + }, + { + "epoch": 11.991199119911991, + "grad_norm": 0.0799187496304512, + "learning_rate": 2.0697091199072703e-05, + "loss": 0.0496, + "num_input_tokens_seen": 23004512, + "step": 109000 + }, + { + "epoch": 11.991749174917492, + "grad_norm": 0.08737896382808685, + "learning_rate": 2.0694726968282567e-05, + "loss": 0.0329, + "num_input_tokens_seen": 23005600, + "step": 109005 + }, + { + "epoch": 11.992299229922992, + "grad_norm": 3.4202849864959717, + "learning_rate": 2.069236277717206e-05, + "loss": 0.0368, + "num_input_tokens_seen": 23006656, + "step": 109010 + }, + { + "epoch": 11.992849284928493, + "grad_norm": 0.006213538348674774, + "learning_rate": 2.068999862576298e-05, + "loss": 0.0183, + "num_input_tokens_seen": 23007680, + "step": 109015 + }, + { + "epoch": 11.993399339933994, + "grad_norm": 0.28254491090774536, + "learning_rate": 2.068763451407712e-05, + "loss": 0.0792, + "num_input_tokens_seen": 23008736, + "step": 109020 + }, + { + "epoch": 11.993949394939493, + "grad_norm": 0.3732132613658905, + "learning_rate": 2.068527044213625e-05, + "loss": 0.0399, + "num_input_tokens_seen": 23009792, + "step": 109025 + }, + { + "epoch": 11.994499449944994, + "grad_norm": 0.018634140491485596, + "learning_rate": 2.0682906409962182e-05, + "loss": 0.0049, + "num_input_tokens_seen": 23010816, + "step": 109030 + }, + { + "epoch": 11.995049504950495, + "grad_norm": 0.6200851798057556, + "learning_rate": 2.0680542417576687e-05, + "loss": 0.0081, + "num_input_tokens_seen": 23011904, + "step": 109035 + }, + { + "epoch": 11.995599559955995, + "grad_norm": 0.05946163833141327, + "learning_rate": 2.067817846500157e-05, + "loss": 0.0107, + "num_input_tokens_seen": 23012928, + "step": 109040 + }, + { + "epoch": 11.996149614961496, + "grad_norm": 0.09819082915782928, + "learning_rate": 2.06758145522586e-05, + "loss": 0.0019, + "num_input_tokens_seen": 23013984, + "step": 109045 + }, + { + "epoch": 11.996699669966997, + "grad_norm": 0.8703126907348633, + "learning_rate": 2.0673450679369573e-05, + "loss": 0.0495, + "num_input_tokens_seen": 23015008, + "step": 109050 + }, + { + "epoch": 11.997249724972498, + "grad_norm": 0.02154347486793995, + "learning_rate": 2.0671086846356277e-05, + "loss": 0.0246, + "num_input_tokens_seen": 23016032, + "step": 109055 + }, + { + "epoch": 11.997799779977997, + "grad_norm": 0.00925991777330637, + "learning_rate": 2.066872305324049e-05, + "loss": 0.0091, + "num_input_tokens_seen": 23017120, + "step": 109060 + }, + { + "epoch": 11.998349834983498, + "grad_norm": 2.508566379547119, + "learning_rate": 2.066635930004402e-05, + "loss": 0.0178, + "num_input_tokens_seen": 23018144, + "step": 109065 + }, + { + "epoch": 11.998899889989, + "grad_norm": 0.020544124767184258, + "learning_rate": 2.0663995586788627e-05, + "loss": 0.0063, + "num_input_tokens_seen": 23019232, + "step": 109070 + }, + { + "epoch": 11.999449944994499, + "grad_norm": 0.047450438141822815, + "learning_rate": 2.0661631913496103e-05, + "loss": 0.0055, + "num_input_tokens_seen": 23020320, + "step": 109075 + }, + { + "epoch": 12.0, + "grad_norm": 0.017068391665816307, + "learning_rate": 2.0659268280188243e-05, + "loss": 0.0008, + "num_input_tokens_seen": 23021312, + "step": 109080 + }, + { + "epoch": 12.0, + "eval_loss": 0.07049336284399033, + "eval_runtime": 36.9486, + "eval_samples_per_second": 109.341, + "eval_steps_per_second": 27.335, + "num_input_tokens_seen": 23021312, + "step": 109080 + }, + { + "epoch": 12.000550055005501, + "grad_norm": 0.013740845955908298, + "learning_rate": 2.0656904686886823e-05, + "loss": 0.1471, + "num_input_tokens_seen": 23022368, + "step": 109085 + }, + { + "epoch": 12.001100110011, + "grad_norm": 0.28806668519973755, + "learning_rate": 2.065454113361362e-05, + "loss": 0.0099, + "num_input_tokens_seen": 23023488, + "step": 109090 + }, + { + "epoch": 12.001650165016502, + "grad_norm": 0.027548858895897865, + "learning_rate": 2.065217762039044e-05, + "loss": 0.0881, + "num_input_tokens_seen": 23024448, + "step": 109095 + }, + { + "epoch": 12.002200220022003, + "grad_norm": 0.04732624441385269, + "learning_rate": 2.0649814147239034e-05, + "loss": 0.0954, + "num_input_tokens_seen": 23025568, + "step": 109100 + }, + { + "epoch": 12.002750275027502, + "grad_norm": 0.0220318716019392, + "learning_rate": 2.064745071418122e-05, + "loss": 0.0094, + "num_input_tokens_seen": 23026688, + "step": 109105 + }, + { + "epoch": 12.003300330033003, + "grad_norm": 1.7940073013305664, + "learning_rate": 2.064508732123876e-05, + "loss": 0.1536, + "num_input_tokens_seen": 23027744, + "step": 109110 + }, + { + "epoch": 12.003850385038504, + "grad_norm": 0.16883909702301025, + "learning_rate": 2.0642723968433434e-05, + "loss": 0.0041, + "num_input_tokens_seen": 23028800, + "step": 109115 + }, + { + "epoch": 12.004400440044005, + "grad_norm": 0.043157316744327545, + "learning_rate": 2.064036065578704e-05, + "loss": 0.0031, + "num_input_tokens_seen": 23029824, + "step": 109120 + }, + { + "epoch": 12.004950495049505, + "grad_norm": 0.008086210116744041, + "learning_rate": 2.063799738332134e-05, + "loss": 0.0929, + "num_input_tokens_seen": 23030880, + "step": 109125 + }, + { + "epoch": 12.005500550055006, + "grad_norm": 0.1200680211186409, + "learning_rate": 2.0635634151058134e-05, + "loss": 0.0168, + "num_input_tokens_seen": 23031904, + "step": 109130 + }, + { + "epoch": 12.006050605060507, + "grad_norm": 0.010292849503457546, + "learning_rate": 2.0633270959019196e-05, + "loss": 0.0212, + "num_input_tokens_seen": 23032960, + "step": 109135 + }, + { + "epoch": 12.006600660066006, + "grad_norm": 0.670831024646759, + "learning_rate": 2.0630907807226293e-05, + "loss": 0.0242, + "num_input_tokens_seen": 23033952, + "step": 109140 + }, + { + "epoch": 12.007150715071507, + "grad_norm": 2.384841203689575, + "learning_rate": 2.0628544695701226e-05, + "loss": 0.0954, + "num_input_tokens_seen": 23035008, + "step": 109145 + }, + { + "epoch": 12.007700770077008, + "grad_norm": 0.007517574820667505, + "learning_rate": 2.062618162446575e-05, + "loss": 0.0184, + "num_input_tokens_seen": 23036032, + "step": 109150 + }, + { + "epoch": 12.008250825082508, + "grad_norm": 0.02275899238884449, + "learning_rate": 2.062381859354168e-05, + "loss": 0.0049, + "num_input_tokens_seen": 23037088, + "step": 109155 + }, + { + "epoch": 12.008800880088009, + "grad_norm": 0.014727048575878143, + "learning_rate": 2.0621455602950763e-05, + "loss": 0.0264, + "num_input_tokens_seen": 23038144, + "step": 109160 + }, + { + "epoch": 12.00935093509351, + "grad_norm": 0.015607316978275776, + "learning_rate": 2.0619092652714785e-05, + "loss": 0.0033, + "num_input_tokens_seen": 23039168, + "step": 109165 + }, + { + "epoch": 12.009900990099009, + "grad_norm": 0.006817135028541088, + "learning_rate": 2.0616729742855533e-05, + "loss": 0.0987, + "num_input_tokens_seen": 23040224, + "step": 109170 + }, + { + "epoch": 12.01045104510451, + "grad_norm": 0.008903572335839272, + "learning_rate": 2.0614366873394775e-05, + "loss": 0.0044, + "num_input_tokens_seen": 23041248, + "step": 109175 + }, + { + "epoch": 12.011001100110011, + "grad_norm": 0.009421350434422493, + "learning_rate": 2.0612004044354304e-05, + "loss": 0.003, + "num_input_tokens_seen": 23042304, + "step": 109180 + }, + { + "epoch": 12.011551155115512, + "grad_norm": 0.04061488062143326, + "learning_rate": 2.0609641255755878e-05, + "loss": 0.0093, + "num_input_tokens_seen": 23043296, + "step": 109185 + }, + { + "epoch": 12.012101210121012, + "grad_norm": 0.018557177856564522, + "learning_rate": 2.060727850762128e-05, + "loss": 0.0022, + "num_input_tokens_seen": 23044384, + "step": 109190 + }, + { + "epoch": 12.012651265126513, + "grad_norm": 0.006735694129019976, + "learning_rate": 2.0604915799972296e-05, + "loss": 0.0017, + "num_input_tokens_seen": 23045472, + "step": 109195 + }, + { + "epoch": 12.013201320132014, + "grad_norm": 1.5316776037216187, + "learning_rate": 2.0602553132830687e-05, + "loss": 0.1107, + "num_input_tokens_seen": 23046496, + "step": 109200 + }, + { + "epoch": 12.013751375137513, + "grad_norm": 1.4242746829986572, + "learning_rate": 2.060019050621824e-05, + "loss": 0.0556, + "num_input_tokens_seen": 23047648, + "step": 109205 + }, + { + "epoch": 12.014301430143014, + "grad_norm": 0.014995060861110687, + "learning_rate": 2.0597827920156727e-05, + "loss": 0.0024, + "num_input_tokens_seen": 23048672, + "step": 109210 + }, + { + "epoch": 12.014851485148515, + "grad_norm": 0.07283422350883484, + "learning_rate": 2.0595465374667915e-05, + "loss": 0.0147, + "num_input_tokens_seen": 23049664, + "step": 109215 + }, + { + "epoch": 12.015401540154015, + "grad_norm": 0.009412611834704876, + "learning_rate": 2.059310286977359e-05, + "loss": 0.0193, + "num_input_tokens_seen": 23050720, + "step": 109220 + }, + { + "epoch": 12.015951595159516, + "grad_norm": 0.03635158762335777, + "learning_rate": 2.0590740405495514e-05, + "loss": 0.0153, + "num_input_tokens_seen": 23051744, + "step": 109225 + }, + { + "epoch": 12.016501650165017, + "grad_norm": 0.0038390331901609898, + "learning_rate": 2.0588377981855478e-05, + "loss": 0.1641, + "num_input_tokens_seen": 23052736, + "step": 109230 + }, + { + "epoch": 12.017051705170518, + "grad_norm": 0.05577091500163078, + "learning_rate": 2.058601559887525e-05, + "loss": 0.0025, + "num_input_tokens_seen": 23053792, + "step": 109235 + }, + { + "epoch": 12.017601760176017, + "grad_norm": 0.006980491802096367, + "learning_rate": 2.058365325657658e-05, + "loss": 0.0386, + "num_input_tokens_seen": 23054848, + "step": 109240 + }, + { + "epoch": 12.018151815181518, + "grad_norm": 0.013159489259123802, + "learning_rate": 2.058129095498127e-05, + "loss": 0.0024, + "num_input_tokens_seen": 23055904, + "step": 109245 + }, + { + "epoch": 12.01870187018702, + "grad_norm": 0.02373890019953251, + "learning_rate": 2.0578928694111073e-05, + "loss": 0.0091, + "num_input_tokens_seen": 23056928, + "step": 109250 + }, + { + "epoch": 12.019251925192519, + "grad_norm": 0.030327167361974716, + "learning_rate": 2.057656647398778e-05, + "loss": 0.0096, + "num_input_tokens_seen": 23057984, + "step": 109255 + }, + { + "epoch": 12.01980198019802, + "grad_norm": 0.029764289036393166, + "learning_rate": 2.0574204294633148e-05, + "loss": 0.0322, + "num_input_tokens_seen": 23059072, + "step": 109260 + }, + { + "epoch": 12.020352035203521, + "grad_norm": 0.006607276387512684, + "learning_rate": 2.0571842156068948e-05, + "loss": 0.0035, + "num_input_tokens_seen": 23060128, + "step": 109265 + }, + { + "epoch": 12.02090209020902, + "grad_norm": 0.032683201134204865, + "learning_rate": 2.056948005831696e-05, + "loss": 0.0135, + "num_input_tokens_seen": 23061184, + "step": 109270 + }, + { + "epoch": 12.021452145214521, + "grad_norm": 0.07776985317468643, + "learning_rate": 2.0567118001398944e-05, + "loss": 0.0065, + "num_input_tokens_seen": 23062240, + "step": 109275 + }, + { + "epoch": 12.022002200220022, + "grad_norm": 0.010544576682150364, + "learning_rate": 2.056475598533667e-05, + "loss": 0.0157, + "num_input_tokens_seen": 23063296, + "step": 109280 + }, + { + "epoch": 12.022552255225522, + "grad_norm": 1.048736810684204, + "learning_rate": 2.0562394010151922e-05, + "loss": 0.0444, + "num_input_tokens_seen": 23064352, + "step": 109285 + }, + { + "epoch": 12.023102310231023, + "grad_norm": 0.5792057514190674, + "learning_rate": 2.056003207586644e-05, + "loss": 0.0391, + "num_input_tokens_seen": 23065472, + "step": 109290 + }, + { + "epoch": 12.023652365236524, + "grad_norm": 3.1511549949645996, + "learning_rate": 2.0557670182502033e-05, + "loss": 0.0979, + "num_input_tokens_seen": 23066560, + "step": 109295 + }, + { + "epoch": 12.024202420242025, + "grad_norm": 0.24231581389904022, + "learning_rate": 2.055530833008044e-05, + "loss": 0.0876, + "num_input_tokens_seen": 23067648, + "step": 109300 + }, + { + "epoch": 12.024752475247524, + "grad_norm": 0.02113467827439308, + "learning_rate": 2.0552946518623434e-05, + "loss": 0.0037, + "num_input_tokens_seen": 23068704, + "step": 109305 + }, + { + "epoch": 12.025302530253025, + "grad_norm": 1.2983490228652954, + "learning_rate": 2.0550584748152797e-05, + "loss": 0.013, + "num_input_tokens_seen": 23069760, + "step": 109310 + }, + { + "epoch": 12.025852585258527, + "grad_norm": 0.03332002833485603, + "learning_rate": 2.054822301869027e-05, + "loss": 0.0666, + "num_input_tokens_seen": 23070784, + "step": 109315 + }, + { + "epoch": 12.026402640264026, + "grad_norm": 0.015539980493485928, + "learning_rate": 2.0545861330257647e-05, + "loss": 0.0439, + "num_input_tokens_seen": 23071808, + "step": 109320 + }, + { + "epoch": 12.026952695269527, + "grad_norm": 0.015486929565668106, + "learning_rate": 2.0543499682876686e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23072768, + "step": 109325 + }, + { + "epoch": 12.027502750275028, + "grad_norm": 0.837267279624939, + "learning_rate": 2.0541138076569135e-05, + "loss": 0.045, + "num_input_tokens_seen": 23073888, + "step": 109330 + }, + { + "epoch": 12.028052805280527, + "grad_norm": 0.0259484201669693, + "learning_rate": 2.0538776511356787e-05, + "loss": 0.0275, + "num_input_tokens_seen": 23074912, + "step": 109335 + }, + { + "epoch": 12.028602860286028, + "grad_norm": 0.023400720208883286, + "learning_rate": 2.0536414987261383e-05, + "loss": 0.0034, + "num_input_tokens_seen": 23076000, + "step": 109340 + }, + { + "epoch": 12.02915291529153, + "grad_norm": 0.028049703687429428, + "learning_rate": 2.053405350430471e-05, + "loss": 0.0084, + "num_input_tokens_seen": 23076992, + "step": 109345 + }, + { + "epoch": 12.029702970297029, + "grad_norm": 0.46456092596054077, + "learning_rate": 2.0531692062508528e-05, + "loss": 0.0738, + "num_input_tokens_seen": 23078048, + "step": 109350 + }, + { + "epoch": 12.03025302530253, + "grad_norm": 0.013148603960871696, + "learning_rate": 2.0529330661894586e-05, + "loss": 0.011, + "num_input_tokens_seen": 23079072, + "step": 109355 + }, + { + "epoch": 12.030803080308031, + "grad_norm": 1.29367995262146, + "learning_rate": 2.052696930248466e-05, + "loss": 0.0361, + "num_input_tokens_seen": 23080064, + "step": 109360 + }, + { + "epoch": 12.031353135313532, + "grad_norm": 0.01980806142091751, + "learning_rate": 2.052460798430051e-05, + "loss": 0.0025, + "num_input_tokens_seen": 23081088, + "step": 109365 + }, + { + "epoch": 12.031903190319031, + "grad_norm": 1.189164638519287, + "learning_rate": 2.0522246707363906e-05, + "loss": 0.013, + "num_input_tokens_seen": 23082144, + "step": 109370 + }, + { + "epoch": 12.032453245324533, + "grad_norm": 0.013579318299889565, + "learning_rate": 2.0519885471696608e-05, + "loss": 0.0194, + "num_input_tokens_seen": 23083200, + "step": 109375 + }, + { + "epoch": 12.033003300330034, + "grad_norm": 1.3815745115280151, + "learning_rate": 2.0517524277320363e-05, + "loss": 0.091, + "num_input_tokens_seen": 23084224, + "step": 109380 + }, + { + "epoch": 12.033553355335533, + "grad_norm": 0.020455962046980858, + "learning_rate": 2.051516312425696e-05, + "loss": 0.0026, + "num_input_tokens_seen": 23085312, + "step": 109385 + }, + { + "epoch": 12.034103410341034, + "grad_norm": 1.1820340156555176, + "learning_rate": 2.0512802012528136e-05, + "loss": 0.0685, + "num_input_tokens_seen": 23086336, + "step": 109390 + }, + { + "epoch": 12.034653465346535, + "grad_norm": 0.11515168100595474, + "learning_rate": 2.0510440942155668e-05, + "loss": 0.0216, + "num_input_tokens_seen": 23087392, + "step": 109395 + }, + { + "epoch": 12.035203520352034, + "grad_norm": 0.007786223199218512, + "learning_rate": 2.0508079913161315e-05, + "loss": 0.0584, + "num_input_tokens_seen": 23088416, + "step": 109400 + }, + { + "epoch": 12.035753575357536, + "grad_norm": 0.06285369396209717, + "learning_rate": 2.050571892556682e-05, + "loss": 0.0628, + "num_input_tokens_seen": 23089472, + "step": 109405 + }, + { + "epoch": 12.036303630363037, + "grad_norm": 0.02178160287439823, + "learning_rate": 2.050335797939397e-05, + "loss": 0.0072, + "num_input_tokens_seen": 23090464, + "step": 109410 + }, + { + "epoch": 12.036853685368538, + "grad_norm": 0.05256297066807747, + "learning_rate": 2.050099707466451e-05, + "loss": 0.0759, + "num_input_tokens_seen": 23091488, + "step": 109415 + }, + { + "epoch": 12.037403740374037, + "grad_norm": 0.07801936566829681, + "learning_rate": 2.0498636211400195e-05, + "loss": 0.0077, + "num_input_tokens_seen": 23092512, + "step": 109420 + }, + { + "epoch": 12.037953795379538, + "grad_norm": 0.03283878415822983, + "learning_rate": 2.0496275389622803e-05, + "loss": 0.0101, + "num_input_tokens_seen": 23093600, + "step": 109425 + }, + { + "epoch": 12.03850385038504, + "grad_norm": 0.010597867891192436, + "learning_rate": 2.0493914609354064e-05, + "loss": 0.0118, + "num_input_tokens_seen": 23094656, + "step": 109430 + }, + { + "epoch": 12.039053905390539, + "grad_norm": 0.03081027790904045, + "learning_rate": 2.049155387061576e-05, + "loss": 0.0016, + "num_input_tokens_seen": 23095648, + "step": 109435 + }, + { + "epoch": 12.03960396039604, + "grad_norm": 0.00741636473685503, + "learning_rate": 2.0489193173429635e-05, + "loss": 0.0202, + "num_input_tokens_seen": 23096672, + "step": 109440 + }, + { + "epoch": 12.04015401540154, + "grad_norm": 0.09413633495569229, + "learning_rate": 2.0486832517817464e-05, + "loss": 0.0093, + "num_input_tokens_seen": 23097760, + "step": 109445 + }, + { + "epoch": 12.04070407040704, + "grad_norm": 0.0672784373164177, + "learning_rate": 2.0484471903800984e-05, + "loss": 0.0028, + "num_input_tokens_seen": 23098752, + "step": 109450 + }, + { + "epoch": 12.041254125412541, + "grad_norm": 0.42584940791130066, + "learning_rate": 2.0482111331401957e-05, + "loss": 0.0047, + "num_input_tokens_seen": 23099872, + "step": 109455 + }, + { + "epoch": 12.041804180418042, + "grad_norm": 0.39731651544570923, + "learning_rate": 2.047975080064215e-05, + "loss": 0.0033, + "num_input_tokens_seen": 23100928, + "step": 109460 + }, + { + "epoch": 12.042354235423542, + "grad_norm": 0.009176562540233135, + "learning_rate": 2.0477390311543306e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23101984, + "step": 109465 + }, + { + "epoch": 12.042904290429043, + "grad_norm": 0.291491836309433, + "learning_rate": 2.0475029864127182e-05, + "loss": 0.0059, + "num_input_tokens_seen": 23103072, + "step": 109470 + }, + { + "epoch": 12.043454345434544, + "grad_norm": 0.16905736923217773, + "learning_rate": 2.0472669458415542e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23104064, + "step": 109475 + }, + { + "epoch": 12.044004400440045, + "grad_norm": 0.11761096119880676, + "learning_rate": 2.047030909443013e-05, + "loss": 0.0078, + "num_input_tokens_seen": 23105088, + "step": 109480 + }, + { + "epoch": 12.044554455445544, + "grad_norm": 0.015486803837120533, + "learning_rate": 2.0467948772192713e-05, + "loss": 0.0215, + "num_input_tokens_seen": 23106112, + "step": 109485 + }, + { + "epoch": 12.045104510451045, + "grad_norm": 0.07832198590040207, + "learning_rate": 2.046558849172503e-05, + "loss": 0.0035, + "num_input_tokens_seen": 23107168, + "step": 109490 + }, + { + "epoch": 12.045654565456546, + "grad_norm": 0.10102080553770065, + "learning_rate": 2.0463228253048844e-05, + "loss": 0.1202, + "num_input_tokens_seen": 23108192, + "step": 109495 + }, + { + "epoch": 12.046204620462046, + "grad_norm": 0.008911027573049068, + "learning_rate": 2.046086805618591e-05, + "loss": 0.0257, + "num_input_tokens_seen": 23109376, + "step": 109500 + }, + { + "epoch": 12.046754675467547, + "grad_norm": 0.035691097378730774, + "learning_rate": 2.0458507901157973e-05, + "loss": 0.0044, + "num_input_tokens_seen": 23110368, + "step": 109505 + }, + { + "epoch": 12.047304730473048, + "grad_norm": 0.11501327157020569, + "learning_rate": 2.045614778798679e-05, + "loss": 0.1445, + "num_input_tokens_seen": 23111456, + "step": 109510 + }, + { + "epoch": 12.047854785478547, + "grad_norm": 0.06856998056173325, + "learning_rate": 2.045378771669412e-05, + "loss": 0.0036, + "num_input_tokens_seen": 23112448, + "step": 109515 + }, + { + "epoch": 12.048404840484048, + "grad_norm": 1.2580972909927368, + "learning_rate": 2.0451427687301696e-05, + "loss": 0.048, + "num_input_tokens_seen": 23113536, + "step": 109520 + }, + { + "epoch": 12.04895489548955, + "grad_norm": 0.012583190575242043, + "learning_rate": 2.0449067699831286e-05, + "loss": 0.0007, + "num_input_tokens_seen": 23114560, + "step": 109525 + }, + { + "epoch": 12.049504950495049, + "grad_norm": 0.6379572749137878, + "learning_rate": 2.044670775430463e-05, + "loss": 0.0606, + "num_input_tokens_seen": 23115648, + "step": 109530 + }, + { + "epoch": 12.05005500550055, + "grad_norm": 0.012178981676697731, + "learning_rate": 2.044434785074349e-05, + "loss": 0.0028, + "num_input_tokens_seen": 23116704, + "step": 109535 + }, + { + "epoch": 12.05060506050605, + "grad_norm": 0.022222600877285004, + "learning_rate": 2.0441987989169612e-05, + "loss": 0.0301, + "num_input_tokens_seen": 23117728, + "step": 109540 + }, + { + "epoch": 12.051155115511552, + "grad_norm": 0.022899076342582703, + "learning_rate": 2.0439628169604734e-05, + "loss": 0.0026, + "num_input_tokens_seen": 23118816, + "step": 109545 + }, + { + "epoch": 12.051705170517051, + "grad_norm": 1.4418576955795288, + "learning_rate": 2.043726839207062e-05, + "loss": 0.0189, + "num_input_tokens_seen": 23119904, + "step": 109550 + }, + { + "epoch": 12.052255225522552, + "grad_norm": 0.05859708413481712, + "learning_rate": 2.0434908656589012e-05, + "loss": 0.0073, + "num_input_tokens_seen": 23120928, + "step": 109555 + }, + { + "epoch": 12.052805280528053, + "grad_norm": 1.2675524950027466, + "learning_rate": 2.0432548963181665e-05, + "loss": 0.0465, + "num_input_tokens_seen": 23122016, + "step": 109560 + }, + { + "epoch": 12.053355335533553, + "grad_norm": 0.2742975056171417, + "learning_rate": 2.0430189311870324e-05, + "loss": 0.0776, + "num_input_tokens_seen": 23123040, + "step": 109565 + }, + { + "epoch": 12.053905390539054, + "grad_norm": 0.009734734892845154, + "learning_rate": 2.0427829702676728e-05, + "loss": 0.0026, + "num_input_tokens_seen": 23124064, + "step": 109570 + }, + { + "epoch": 12.054455445544555, + "grad_norm": 0.014970668591558933, + "learning_rate": 2.0425470135622637e-05, + "loss": 0.0398, + "num_input_tokens_seen": 23125152, + "step": 109575 + }, + { + "epoch": 12.055005500550054, + "grad_norm": 0.0128599489107728, + "learning_rate": 2.0423110610729788e-05, + "loss": 0.0048, + "num_input_tokens_seen": 23126240, + "step": 109580 + }, + { + "epoch": 12.055555555555555, + "grad_norm": 0.14420954883098602, + "learning_rate": 2.0420751128019935e-05, + "loss": 0.004, + "num_input_tokens_seen": 23127296, + "step": 109585 + }, + { + "epoch": 12.056105610561056, + "grad_norm": 0.08388978987932205, + "learning_rate": 2.041839168751482e-05, + "loss": 0.0087, + "num_input_tokens_seen": 23128288, + "step": 109590 + }, + { + "epoch": 12.056655665566556, + "grad_norm": 1.3574044704437256, + "learning_rate": 2.0416032289236188e-05, + "loss": 0.0309, + "num_input_tokens_seen": 23129408, + "step": 109595 + }, + { + "epoch": 12.057205720572057, + "grad_norm": 0.16751043498516083, + "learning_rate": 2.0413672933205798e-05, + "loss": 0.1445, + "num_input_tokens_seen": 23130432, + "step": 109600 + }, + { + "epoch": 12.057755775577558, + "grad_norm": 0.07944311201572418, + "learning_rate": 2.041131361944537e-05, + "loss": 0.0719, + "num_input_tokens_seen": 23131520, + "step": 109605 + }, + { + "epoch": 12.058305830583059, + "grad_norm": 0.11643285304307938, + "learning_rate": 2.040895434797667e-05, + "loss": 0.0078, + "num_input_tokens_seen": 23132544, + "step": 109610 + }, + { + "epoch": 12.058855885588558, + "grad_norm": 0.10920886695384979, + "learning_rate": 2.0406595118821438e-05, + "loss": 0.0041, + "num_input_tokens_seen": 23133568, + "step": 109615 + }, + { + "epoch": 12.05940594059406, + "grad_norm": 1.723318099975586, + "learning_rate": 2.040423593200141e-05, + "loss": 0.1222, + "num_input_tokens_seen": 23134560, + "step": 109620 + }, + { + "epoch": 12.05995599559956, + "grad_norm": 0.008390169590711594, + "learning_rate": 2.0401876787538332e-05, + "loss": 0.0011, + "num_input_tokens_seen": 23135616, + "step": 109625 + }, + { + "epoch": 12.06050605060506, + "grad_norm": 1.7740696668624878, + "learning_rate": 2.039951768545395e-05, + "loss": 0.1926, + "num_input_tokens_seen": 23136640, + "step": 109630 + }, + { + "epoch": 12.061056105610561, + "grad_norm": 1.1078287363052368, + "learning_rate": 2.039715862577001e-05, + "loss": 0.0483, + "num_input_tokens_seen": 23137696, + "step": 109635 + }, + { + "epoch": 12.061606160616062, + "grad_norm": 0.2828141748905182, + "learning_rate": 2.039479960850825e-05, + "loss": 0.0095, + "num_input_tokens_seen": 23138720, + "step": 109640 + }, + { + "epoch": 12.062156215621561, + "grad_norm": 0.029680613428354263, + "learning_rate": 2.0392440633690407e-05, + "loss": 0.0414, + "num_input_tokens_seen": 23139744, + "step": 109645 + }, + { + "epoch": 12.062706270627062, + "grad_norm": 1.8797683715820312, + "learning_rate": 2.0390081701338233e-05, + "loss": 0.0633, + "num_input_tokens_seen": 23140864, + "step": 109650 + }, + { + "epoch": 12.063256325632564, + "grad_norm": 0.14368821680545807, + "learning_rate": 2.0387722811473452e-05, + "loss": 0.0204, + "num_input_tokens_seen": 23142048, + "step": 109655 + }, + { + "epoch": 12.063806380638065, + "grad_norm": 0.017492227256298065, + "learning_rate": 2.0385363964117836e-05, + "loss": 0.0238, + "num_input_tokens_seen": 23143040, + "step": 109660 + }, + { + "epoch": 12.064356435643564, + "grad_norm": 0.09197627007961273, + "learning_rate": 2.0383005159293096e-05, + "loss": 0.0396, + "num_input_tokens_seen": 23144064, + "step": 109665 + }, + { + "epoch": 12.064906490649065, + "grad_norm": 0.23301494121551514, + "learning_rate": 2.0380646397020977e-05, + "loss": 0.0594, + "num_input_tokens_seen": 23145152, + "step": 109670 + }, + { + "epoch": 12.065456545654566, + "grad_norm": 2.8180768489837646, + "learning_rate": 2.0378287677323237e-05, + "loss": 0.1785, + "num_input_tokens_seen": 23146176, + "step": 109675 + }, + { + "epoch": 12.066006600660065, + "grad_norm": 0.010001473128795624, + "learning_rate": 2.0375929000221592e-05, + "loss": 0.0022, + "num_input_tokens_seen": 23147200, + "step": 109680 + }, + { + "epoch": 12.066556655665567, + "grad_norm": 0.1440306007862091, + "learning_rate": 2.037357036573779e-05, + "loss": 0.0088, + "num_input_tokens_seen": 23148288, + "step": 109685 + }, + { + "epoch": 12.067106710671068, + "grad_norm": 0.010300848633050919, + "learning_rate": 2.037121177389358e-05, + "loss": 0.0042, + "num_input_tokens_seen": 23149376, + "step": 109690 + }, + { + "epoch": 12.067656765676567, + "grad_norm": 0.01160070113837719, + "learning_rate": 2.0368853224710677e-05, + "loss": 0.0035, + "num_input_tokens_seen": 23150368, + "step": 109695 + }, + { + "epoch": 12.068206820682068, + "grad_norm": 0.08149411529302597, + "learning_rate": 2.036649471821084e-05, + "loss": 0.0022, + "num_input_tokens_seen": 23151456, + "step": 109700 + }, + { + "epoch": 12.06875687568757, + "grad_norm": 0.05541582405567169, + "learning_rate": 2.0364136254415797e-05, + "loss": 0.0049, + "num_input_tokens_seen": 23152512, + "step": 109705 + }, + { + "epoch": 12.069306930693068, + "grad_norm": 0.0590091198682785, + "learning_rate": 2.0361777833347272e-05, + "loss": 0.0082, + "num_input_tokens_seen": 23153600, + "step": 109710 + }, + { + "epoch": 12.06985698569857, + "grad_norm": 0.013176586478948593, + "learning_rate": 2.0359419455027033e-05, + "loss": 0.0833, + "num_input_tokens_seen": 23154688, + "step": 109715 + }, + { + "epoch": 12.07040704070407, + "grad_norm": 2.0324530601501465, + "learning_rate": 2.0357061119476788e-05, + "loss": 0.0334, + "num_input_tokens_seen": 23155776, + "step": 109720 + }, + { + "epoch": 12.070957095709572, + "grad_norm": 0.24924515187740326, + "learning_rate": 2.0354702826718287e-05, + "loss": 0.0438, + "num_input_tokens_seen": 23156864, + "step": 109725 + }, + { + "epoch": 12.071507150715071, + "grad_norm": 0.02754046954214573, + "learning_rate": 2.0352344576773267e-05, + "loss": 0.0028, + "num_input_tokens_seen": 23157952, + "step": 109730 + }, + { + "epoch": 12.072057205720572, + "grad_norm": 0.027209322899580002, + "learning_rate": 2.0349986369663445e-05, + "loss": 0.0065, + "num_input_tokens_seen": 23159072, + "step": 109735 + }, + { + "epoch": 12.072607260726073, + "grad_norm": 0.9954952001571655, + "learning_rate": 2.0347628205410573e-05, + "loss": 0.0667, + "num_input_tokens_seen": 23160256, + "step": 109740 + }, + { + "epoch": 12.073157315731573, + "grad_norm": 2.7734456062316895, + "learning_rate": 2.0345270084036376e-05, + "loss": 0.1722, + "num_input_tokens_seen": 23161408, + "step": 109745 + }, + { + "epoch": 12.073707370737074, + "grad_norm": 0.10248680412769318, + "learning_rate": 2.03429120055626e-05, + "loss": 0.0056, + "num_input_tokens_seen": 23162400, + "step": 109750 + }, + { + "epoch": 12.074257425742575, + "grad_norm": 0.2259010225534439, + "learning_rate": 2.034055397001096e-05, + "loss": 0.0079, + "num_input_tokens_seen": 23163456, + "step": 109755 + }, + { + "epoch": 12.074807480748074, + "grad_norm": 0.029614489525556564, + "learning_rate": 2.03381959774032e-05, + "loss": 0.1378, + "num_input_tokens_seen": 23164544, + "step": 109760 + }, + { + "epoch": 12.075357535753575, + "grad_norm": 0.006774798035621643, + "learning_rate": 2.0335838027761058e-05, + "loss": 0.0088, + "num_input_tokens_seen": 23165600, + "step": 109765 + }, + { + "epoch": 12.075907590759076, + "grad_norm": 0.6177176833152771, + "learning_rate": 2.0333480121106243e-05, + "loss": 0.0114, + "num_input_tokens_seen": 23166656, + "step": 109770 + }, + { + "epoch": 12.076457645764576, + "grad_norm": 0.40577197074890137, + "learning_rate": 2.033112225746052e-05, + "loss": 0.0211, + "num_input_tokens_seen": 23167712, + "step": 109775 + }, + { + "epoch": 12.077007700770077, + "grad_norm": 0.10305886715650558, + "learning_rate": 2.0328764436845597e-05, + "loss": 0.0024, + "num_input_tokens_seen": 23168768, + "step": 109780 + }, + { + "epoch": 12.077557755775578, + "grad_norm": 0.029866181313991547, + "learning_rate": 2.0326406659283204e-05, + "loss": 0.0633, + "num_input_tokens_seen": 23169760, + "step": 109785 + }, + { + "epoch": 12.078107810781079, + "grad_norm": 0.1911557912826538, + "learning_rate": 2.032404892479509e-05, + "loss": 0.007, + "num_input_tokens_seen": 23170848, + "step": 109790 + }, + { + "epoch": 12.078657865786578, + "grad_norm": 0.031209981068968773, + "learning_rate": 2.0321691233402962e-05, + "loss": 0.0134, + "num_input_tokens_seen": 23171904, + "step": 109795 + }, + { + "epoch": 12.07920792079208, + "grad_norm": 0.0035427496768534184, + "learning_rate": 2.0319333585128565e-05, + "loss": 0.0438, + "num_input_tokens_seen": 23172928, + "step": 109800 + }, + { + "epoch": 12.07975797579758, + "grad_norm": 0.015247927978634834, + "learning_rate": 2.031697597999363e-05, + "loss": 0.0025, + "num_input_tokens_seen": 23174016, + "step": 109805 + }, + { + "epoch": 12.08030803080308, + "grad_norm": 0.03707171231508255, + "learning_rate": 2.031461841801987e-05, + "loss": 0.0054, + "num_input_tokens_seen": 23175072, + "step": 109810 + }, + { + "epoch": 12.08085808580858, + "grad_norm": 0.011769055388867855, + "learning_rate": 2.031226089922903e-05, + "loss": 0.0383, + "num_input_tokens_seen": 23176192, + "step": 109815 + }, + { + "epoch": 12.081408140814082, + "grad_norm": 0.05967089906334877, + "learning_rate": 2.030990342364283e-05, + "loss": 0.0082, + "num_input_tokens_seen": 23177248, + "step": 109820 + }, + { + "epoch": 12.081958195819581, + "grad_norm": 0.06209677457809448, + "learning_rate": 2.0307545991283005e-05, + "loss": 0.0061, + "num_input_tokens_seen": 23178272, + "step": 109825 + }, + { + "epoch": 12.082508250825082, + "grad_norm": 0.021566318348050117, + "learning_rate": 2.0305188602171274e-05, + "loss": 0.0166, + "num_input_tokens_seen": 23179360, + "step": 109830 + }, + { + "epoch": 12.083058305830583, + "grad_norm": 0.13527041673660278, + "learning_rate": 2.030283125632936e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23180384, + "step": 109835 + }, + { + "epoch": 12.083608360836084, + "grad_norm": 3.1188395023345947, + "learning_rate": 2.0300473953779003e-05, + "loss": 0.0737, + "num_input_tokens_seen": 23181504, + "step": 109840 + }, + { + "epoch": 12.084158415841584, + "grad_norm": 0.25249814987182617, + "learning_rate": 2.0298116694541916e-05, + "loss": 0.0072, + "num_input_tokens_seen": 23182528, + "step": 109845 + }, + { + "epoch": 12.084708470847085, + "grad_norm": 0.5825029611587524, + "learning_rate": 2.029575947863984e-05, + "loss": 0.0847, + "num_input_tokens_seen": 23183552, + "step": 109850 + }, + { + "epoch": 12.085258525852586, + "grad_norm": 0.07705424726009369, + "learning_rate": 2.029340230609449e-05, + "loss": 0.012, + "num_input_tokens_seen": 23184576, + "step": 109855 + }, + { + "epoch": 12.085808580858085, + "grad_norm": 0.03174031525850296, + "learning_rate": 2.0291045176927587e-05, + "loss": 0.0209, + "num_input_tokens_seen": 23185664, + "step": 109860 + }, + { + "epoch": 12.086358635863586, + "grad_norm": 1.2400853633880615, + "learning_rate": 2.0288688091160872e-05, + "loss": 0.0994, + "num_input_tokens_seen": 23186720, + "step": 109865 + }, + { + "epoch": 12.086908690869087, + "grad_norm": 0.3459571301937103, + "learning_rate": 2.028633104881605e-05, + "loss": 0.0498, + "num_input_tokens_seen": 23187744, + "step": 109870 + }, + { + "epoch": 12.087458745874587, + "grad_norm": 0.03694647550582886, + "learning_rate": 2.028397404991485e-05, + "loss": 0.0028, + "num_input_tokens_seen": 23188768, + "step": 109875 + }, + { + "epoch": 12.088008800880088, + "grad_norm": 0.20795409381389618, + "learning_rate": 2.028161709447901e-05, + "loss": 0.0035, + "num_input_tokens_seen": 23189824, + "step": 109880 + }, + { + "epoch": 12.088558855885589, + "grad_norm": 0.09285852313041687, + "learning_rate": 2.0279260182530228e-05, + "loss": 0.0079, + "num_input_tokens_seen": 23190912, + "step": 109885 + }, + { + "epoch": 12.089108910891088, + "grad_norm": 0.009812536649405956, + "learning_rate": 2.0276903314090246e-05, + "loss": 0.033, + "num_input_tokens_seen": 23191936, + "step": 109890 + }, + { + "epoch": 12.08965896589659, + "grad_norm": 0.09317240118980408, + "learning_rate": 2.0274546489180778e-05, + "loss": 0.0042, + "num_input_tokens_seen": 23192992, + "step": 109895 + }, + { + "epoch": 12.09020902090209, + "grad_norm": 0.00993774738162756, + "learning_rate": 2.0272189707823545e-05, + "loss": 0.0158, + "num_input_tokens_seen": 23194048, + "step": 109900 + }, + { + "epoch": 12.090759075907592, + "grad_norm": 0.7906057834625244, + "learning_rate": 2.026983297004028e-05, + "loss": 0.0114, + "num_input_tokens_seen": 23195072, + "step": 109905 + }, + { + "epoch": 12.091309130913091, + "grad_norm": 0.006734913215041161, + "learning_rate": 2.026747627585268e-05, + "loss": 0.0979, + "num_input_tokens_seen": 23196128, + "step": 109910 + }, + { + "epoch": 12.091859185918592, + "grad_norm": 0.02456767112016678, + "learning_rate": 2.0265119625282492e-05, + "loss": 0.0038, + "num_input_tokens_seen": 23197152, + "step": 109915 + }, + { + "epoch": 12.092409240924093, + "grad_norm": 0.03189678117632866, + "learning_rate": 2.0262763018351427e-05, + "loss": 0.0046, + "num_input_tokens_seen": 23198208, + "step": 109920 + }, + { + "epoch": 12.092959295929592, + "grad_norm": 0.044858306646347046, + "learning_rate": 2.0260406455081193e-05, + "loss": 0.0065, + "num_input_tokens_seen": 23199328, + "step": 109925 + }, + { + "epoch": 12.093509350935093, + "grad_norm": 0.010204542428255081, + "learning_rate": 2.0258049935493523e-05, + "loss": 0.1095, + "num_input_tokens_seen": 23200352, + "step": 109930 + }, + { + "epoch": 12.094059405940595, + "grad_norm": 0.018154451623558998, + "learning_rate": 2.0255693459610127e-05, + "loss": 0.1009, + "num_input_tokens_seen": 23201440, + "step": 109935 + }, + { + "epoch": 12.094609460946094, + "grad_norm": 0.048153121024370193, + "learning_rate": 2.0253337027452733e-05, + "loss": 0.011, + "num_input_tokens_seen": 23202496, + "step": 109940 + }, + { + "epoch": 12.095159515951595, + "grad_norm": 0.07271378487348557, + "learning_rate": 2.0250980639043053e-05, + "loss": 0.003, + "num_input_tokens_seen": 23203456, + "step": 109945 + }, + { + "epoch": 12.095709570957096, + "grad_norm": 2.2731285095214844, + "learning_rate": 2.02486242944028e-05, + "loss": 0.0273, + "num_input_tokens_seen": 23204544, + "step": 109950 + }, + { + "epoch": 12.096259625962595, + "grad_norm": 0.07930523157119751, + "learning_rate": 2.0246267993553702e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23205632, + "step": 109955 + }, + { + "epoch": 12.096809680968097, + "grad_norm": 0.031048143282532692, + "learning_rate": 2.0243911736517466e-05, + "loss": 0.0467, + "num_input_tokens_seen": 23206656, + "step": 109960 + }, + { + "epoch": 12.097359735973598, + "grad_norm": 1.8569514751434326, + "learning_rate": 2.0241555523315826e-05, + "loss": 0.1575, + "num_input_tokens_seen": 23207712, + "step": 109965 + }, + { + "epoch": 12.097909790979099, + "grad_norm": 0.018971996381878853, + "learning_rate": 2.023919935397048e-05, + "loss": 0.0034, + "num_input_tokens_seen": 23208768, + "step": 109970 + }, + { + "epoch": 12.098459845984598, + "grad_norm": 0.06076095253229141, + "learning_rate": 2.023684322850314e-05, + "loss": 0.0018, + "num_input_tokens_seen": 23209824, + "step": 109975 + }, + { + "epoch": 12.099009900990099, + "grad_norm": 0.3156352937221527, + "learning_rate": 2.023448714693554e-05, + "loss": 0.0065, + "num_input_tokens_seen": 23210912, + "step": 109980 + }, + { + "epoch": 12.0995599559956, + "grad_norm": 0.08439714461565018, + "learning_rate": 2.023213110928938e-05, + "loss": 0.0037, + "num_input_tokens_seen": 23212032, + "step": 109985 + }, + { + "epoch": 12.1001100110011, + "grad_norm": 1.5032877922058105, + "learning_rate": 2.022977511558638e-05, + "loss": 0.0365, + "num_input_tokens_seen": 23213120, + "step": 109990 + }, + { + "epoch": 12.1006600660066, + "grad_norm": 0.03517638146877289, + "learning_rate": 2.0227419165848262e-05, + "loss": 0.0266, + "num_input_tokens_seen": 23214272, + "step": 109995 + }, + { + "epoch": 12.101210121012102, + "grad_norm": 0.053916677832603455, + "learning_rate": 2.0225063260096723e-05, + "loss": 0.0764, + "num_input_tokens_seen": 23215296, + "step": 110000 + }, + { + "epoch": 12.101760176017601, + "grad_norm": 3.2106521129608154, + "learning_rate": 2.0222707398353484e-05, + "loss": 0.0954, + "num_input_tokens_seen": 23216352, + "step": 110005 + }, + { + "epoch": 12.102310231023102, + "grad_norm": 0.005511240568011999, + "learning_rate": 2.0220351580640255e-05, + "loss": 0.0473, + "num_input_tokens_seen": 23217440, + "step": 110010 + }, + { + "epoch": 12.102860286028603, + "grad_norm": 0.3764977753162384, + "learning_rate": 2.0217995806978763e-05, + "loss": 0.0407, + "num_input_tokens_seen": 23218432, + "step": 110015 + }, + { + "epoch": 12.103410341034103, + "grad_norm": 0.07487514615058899, + "learning_rate": 2.0215640077390706e-05, + "loss": 0.0675, + "num_input_tokens_seen": 23219424, + "step": 110020 + }, + { + "epoch": 12.103960396039604, + "grad_norm": 0.07148420065641403, + "learning_rate": 2.0213284391897792e-05, + "loss": 0.0367, + "num_input_tokens_seen": 23220416, + "step": 110025 + }, + { + "epoch": 12.104510451045105, + "grad_norm": 0.007297937758266926, + "learning_rate": 2.0210928750521745e-05, + "loss": 0.0023, + "num_input_tokens_seen": 23221472, + "step": 110030 + }, + { + "epoch": 12.105060506050606, + "grad_norm": 0.027753008529543877, + "learning_rate": 2.0208573153284267e-05, + "loss": 0.0469, + "num_input_tokens_seen": 23222432, + "step": 110035 + }, + { + "epoch": 12.105610561056105, + "grad_norm": 0.06746090948581696, + "learning_rate": 2.020621760020708e-05, + "loss": 0.0033, + "num_input_tokens_seen": 23223456, + "step": 110040 + }, + { + "epoch": 12.106160616061606, + "grad_norm": 0.06481079012155533, + "learning_rate": 2.0203862091311877e-05, + "loss": 0.0085, + "num_input_tokens_seen": 23224544, + "step": 110045 + }, + { + "epoch": 12.106710671067107, + "grad_norm": 0.07543829828500748, + "learning_rate": 2.020150662662037e-05, + "loss": 0.0332, + "num_input_tokens_seen": 23225600, + "step": 110050 + }, + { + "epoch": 12.107260726072607, + "grad_norm": 0.5252038240432739, + "learning_rate": 2.0199151206154288e-05, + "loss": 0.0101, + "num_input_tokens_seen": 23226688, + "step": 110055 + }, + { + "epoch": 12.107810781078108, + "grad_norm": 0.011073529720306396, + "learning_rate": 2.0196795829935315e-05, + "loss": 0.0051, + "num_input_tokens_seen": 23227712, + "step": 110060 + }, + { + "epoch": 12.108360836083609, + "grad_norm": 1.1355820894241333, + "learning_rate": 2.019444049798517e-05, + "loss": 0.0437, + "num_input_tokens_seen": 23228736, + "step": 110065 + }, + { + "epoch": 12.108910891089108, + "grad_norm": 0.030736593529582024, + "learning_rate": 2.019208521032557e-05, + "loss": 0.0899, + "num_input_tokens_seen": 23229824, + "step": 110070 + }, + { + "epoch": 12.10946094609461, + "grad_norm": 0.04472794756293297, + "learning_rate": 2.0189729966978196e-05, + "loss": 0.1891, + "num_input_tokens_seen": 23230944, + "step": 110075 + }, + { + "epoch": 12.11001100110011, + "grad_norm": 0.4874995946884155, + "learning_rate": 2.0187374767964788e-05, + "loss": 0.1967, + "num_input_tokens_seen": 23231968, + "step": 110080 + }, + { + "epoch": 12.110561056105611, + "grad_norm": 0.5516405701637268, + "learning_rate": 2.018501961330704e-05, + "loss": 0.0428, + "num_input_tokens_seen": 23233056, + "step": 110085 + }, + { + "epoch": 12.11111111111111, + "grad_norm": 0.0224937554448843, + "learning_rate": 2.018266450302664e-05, + "loss": 0.0304, + "num_input_tokens_seen": 23234112, + "step": 110090 + }, + { + "epoch": 12.111661166116612, + "grad_norm": 0.04723275452852249, + "learning_rate": 2.0180309437145327e-05, + "loss": 0.0022, + "num_input_tokens_seen": 23235168, + "step": 110095 + }, + { + "epoch": 12.112211221122113, + "grad_norm": 0.24106182157993317, + "learning_rate": 2.0177954415684776e-05, + "loss": 0.0075, + "num_input_tokens_seen": 23236256, + "step": 110100 + }, + { + "epoch": 12.112761276127612, + "grad_norm": 0.15871359407901764, + "learning_rate": 2.0175599438666713e-05, + "loss": 0.0024, + "num_input_tokens_seen": 23237280, + "step": 110105 + }, + { + "epoch": 12.113311331133113, + "grad_norm": 0.04060009494423866, + "learning_rate": 2.0173244506112837e-05, + "loss": 0.0024, + "num_input_tokens_seen": 23238304, + "step": 110110 + }, + { + "epoch": 12.113861386138614, + "grad_norm": 2.074287176132202, + "learning_rate": 2.017088961804484e-05, + "loss": 0.1236, + "num_input_tokens_seen": 23239328, + "step": 110115 + }, + { + "epoch": 12.114411441144114, + "grad_norm": 0.04266772046685219, + "learning_rate": 2.0168534774484445e-05, + "loss": 0.0092, + "num_input_tokens_seen": 23240320, + "step": 110120 + }, + { + "epoch": 12.114961496149615, + "grad_norm": 0.023112379014492035, + "learning_rate": 2.0166179975453336e-05, + "loss": 0.0709, + "num_input_tokens_seen": 23241280, + "step": 110125 + }, + { + "epoch": 12.115511551155116, + "grad_norm": 0.014973907731473446, + "learning_rate": 2.0163825220973237e-05, + "loss": 0.0025, + "num_input_tokens_seen": 23242272, + "step": 110130 + }, + { + "epoch": 12.116061606160615, + "grad_norm": 1.3966871500015259, + "learning_rate": 2.0161470511065844e-05, + "loss": 0.0461, + "num_input_tokens_seen": 23243328, + "step": 110135 + }, + { + "epoch": 12.116611661166116, + "grad_norm": 0.09661589562892914, + "learning_rate": 2.0159115845752842e-05, + "loss": 0.0128, + "num_input_tokens_seen": 23244352, + "step": 110140 + }, + { + "epoch": 12.117161716171617, + "grad_norm": 0.004948119167238474, + "learning_rate": 2.0156761225055956e-05, + "loss": 0.0626, + "num_input_tokens_seen": 23245472, + "step": 110145 + }, + { + "epoch": 12.117711771177119, + "grad_norm": 0.05553450807929039, + "learning_rate": 2.0154406648996872e-05, + "loss": 0.177, + "num_input_tokens_seen": 23246560, + "step": 110150 + }, + { + "epoch": 12.118261826182618, + "grad_norm": 0.018161846324801445, + "learning_rate": 2.0152052117597303e-05, + "loss": 0.0541, + "num_input_tokens_seen": 23247552, + "step": 110155 + }, + { + "epoch": 12.118811881188119, + "grad_norm": 1.8980203866958618, + "learning_rate": 2.014969763087894e-05, + "loss": 0.1811, + "num_input_tokens_seen": 23248672, + "step": 110160 + }, + { + "epoch": 12.11936193619362, + "grad_norm": 1.1897507905960083, + "learning_rate": 2.0147343188863485e-05, + "loss": 0.0756, + "num_input_tokens_seen": 23249632, + "step": 110165 + }, + { + "epoch": 12.11991199119912, + "grad_norm": 0.0453537218272686, + "learning_rate": 2.0144988791572647e-05, + "loss": 0.0193, + "num_input_tokens_seen": 23250656, + "step": 110170 + }, + { + "epoch": 12.12046204620462, + "grad_norm": 0.10242487490177155, + "learning_rate": 2.014263443902811e-05, + "loss": 0.0235, + "num_input_tokens_seen": 23251712, + "step": 110175 + }, + { + "epoch": 12.121012101210122, + "grad_norm": 0.022518092766404152, + "learning_rate": 2.014028013125158e-05, + "loss": 0.0046, + "num_input_tokens_seen": 23252800, + "step": 110180 + }, + { + "epoch": 12.12156215621562, + "grad_norm": 0.016325846314430237, + "learning_rate": 2.0137925868264764e-05, + "loss": 0.0114, + "num_input_tokens_seen": 23253824, + "step": 110185 + }, + { + "epoch": 12.122112211221122, + "grad_norm": 0.35122954845428467, + "learning_rate": 2.013557165008934e-05, + "loss": 0.0229, + "num_input_tokens_seen": 23254944, + "step": 110190 + }, + { + "epoch": 12.122662266226623, + "grad_norm": 0.06262455135583878, + "learning_rate": 2.0133217476747028e-05, + "loss": 0.0252, + "num_input_tokens_seen": 23256000, + "step": 110195 + }, + { + "epoch": 12.123212321232122, + "grad_norm": 0.014688091352581978, + "learning_rate": 2.0130863348259507e-05, + "loss": 0.0639, + "num_input_tokens_seen": 23257024, + "step": 110200 + }, + { + "epoch": 12.123762376237623, + "grad_norm": 0.7899251580238342, + "learning_rate": 2.012850926464849e-05, + "loss": 0.0329, + "num_input_tokens_seen": 23258080, + "step": 110205 + }, + { + "epoch": 12.124312431243125, + "grad_norm": 0.1433659940958023, + "learning_rate": 2.012615522593567e-05, + "loss": 0.0646, + "num_input_tokens_seen": 23259136, + "step": 110210 + }, + { + "epoch": 12.124862486248626, + "grad_norm": 2.164318561553955, + "learning_rate": 2.0123801232142723e-05, + "loss": 0.1254, + "num_input_tokens_seen": 23260224, + "step": 110215 + }, + { + "epoch": 12.125412541254125, + "grad_norm": 3.3300514221191406, + "learning_rate": 2.012144728329137e-05, + "loss": 0.0883, + "num_input_tokens_seen": 23261312, + "step": 110220 + }, + { + "epoch": 12.125962596259626, + "grad_norm": 0.02222386747598648, + "learning_rate": 2.011909337940329e-05, + "loss": 0.0348, + "num_input_tokens_seen": 23262336, + "step": 110225 + }, + { + "epoch": 12.126512651265127, + "grad_norm": 0.32359740138053894, + "learning_rate": 2.0116739520500194e-05, + "loss": 0.0094, + "num_input_tokens_seen": 23263392, + "step": 110230 + }, + { + "epoch": 12.127062706270626, + "grad_norm": 1.0783601999282837, + "learning_rate": 2.0114385706603762e-05, + "loss": 0.0194, + "num_input_tokens_seen": 23264416, + "step": 110235 + }, + { + "epoch": 12.127612761276128, + "grad_norm": 0.07220403850078583, + "learning_rate": 2.0112031937735686e-05, + "loss": 0.0068, + "num_input_tokens_seen": 23265504, + "step": 110240 + }, + { + "epoch": 12.128162816281629, + "grad_norm": 0.006709603127092123, + "learning_rate": 2.010967821391768e-05, + "loss": 0.0075, + "num_input_tokens_seen": 23266560, + "step": 110245 + }, + { + "epoch": 12.128712871287128, + "grad_norm": 1.2570164203643799, + "learning_rate": 2.0107324535171417e-05, + "loss": 0.0418, + "num_input_tokens_seen": 23267616, + "step": 110250 + }, + { + "epoch": 12.129262926292629, + "grad_norm": 0.17849481105804443, + "learning_rate": 2.0104970901518588e-05, + "loss": 0.0538, + "num_input_tokens_seen": 23268640, + "step": 110255 + }, + { + "epoch": 12.12981298129813, + "grad_norm": 0.09114939719438553, + "learning_rate": 2.0102617312980903e-05, + "loss": 0.0726, + "num_input_tokens_seen": 23269728, + "step": 110260 + }, + { + "epoch": 12.130363036303631, + "grad_norm": 0.08034680038690567, + "learning_rate": 2.0100263769580034e-05, + "loss": 0.0739, + "num_input_tokens_seen": 23270784, + "step": 110265 + }, + { + "epoch": 12.13091309130913, + "grad_norm": 0.28964963555336, + "learning_rate": 2.0097910271337698e-05, + "loss": 0.0241, + "num_input_tokens_seen": 23271840, + "step": 110270 + }, + { + "epoch": 12.131463146314632, + "grad_norm": 0.02140648663043976, + "learning_rate": 2.0095556818275563e-05, + "loss": 0.0089, + "num_input_tokens_seen": 23272928, + "step": 110275 + }, + { + "epoch": 12.132013201320133, + "grad_norm": 1.102315068244934, + "learning_rate": 2.0093203410415324e-05, + "loss": 0.0207, + "num_input_tokens_seen": 23273984, + "step": 110280 + }, + { + "epoch": 12.132563256325632, + "grad_norm": 0.010910880751907825, + "learning_rate": 2.0090850047778683e-05, + "loss": 0.0062, + "num_input_tokens_seen": 23275040, + "step": 110285 + }, + { + "epoch": 12.133113311331133, + "grad_norm": 0.04145684093236923, + "learning_rate": 2.0088496730387313e-05, + "loss": 0.0091, + "num_input_tokens_seen": 23276096, + "step": 110290 + }, + { + "epoch": 12.133663366336634, + "grad_norm": 0.18177731335163116, + "learning_rate": 2.0086143458262912e-05, + "loss": 0.0856, + "num_input_tokens_seen": 23277184, + "step": 110295 + }, + { + "epoch": 12.134213421342134, + "grad_norm": 0.1347111314535141, + "learning_rate": 2.008379023142718e-05, + "loss": 0.0949, + "num_input_tokens_seen": 23278240, + "step": 110300 + }, + { + "epoch": 12.134763476347635, + "grad_norm": 0.5072401762008667, + "learning_rate": 2.0081437049901782e-05, + "loss": 0.0234, + "num_input_tokens_seen": 23279264, + "step": 110305 + }, + { + "epoch": 12.135313531353136, + "grad_norm": 0.0072789983823895454, + "learning_rate": 2.0079083913708425e-05, + "loss": 0.0317, + "num_input_tokens_seen": 23280320, + "step": 110310 + }, + { + "epoch": 12.135863586358635, + "grad_norm": 0.5335843563079834, + "learning_rate": 2.0076730822868784e-05, + "loss": 0.0149, + "num_input_tokens_seen": 23281408, + "step": 110315 + }, + { + "epoch": 12.136413641364136, + "grad_norm": 0.8758626580238342, + "learning_rate": 2.0074377777404558e-05, + "loss": 0.029, + "num_input_tokens_seen": 23282464, + "step": 110320 + }, + { + "epoch": 12.136963696369637, + "grad_norm": 0.02277202159166336, + "learning_rate": 2.0072024777337432e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23283488, + "step": 110325 + }, + { + "epoch": 12.137513751375138, + "grad_norm": 0.022879553958773613, + "learning_rate": 2.0069671822689077e-05, + "loss": 0.1, + "num_input_tokens_seen": 23284544, + "step": 110330 + }, + { + "epoch": 12.138063806380638, + "grad_norm": 0.0843629464507103, + "learning_rate": 2.00673189134812e-05, + "loss": 0.1397, + "num_input_tokens_seen": 23285664, + "step": 110335 + }, + { + "epoch": 12.138613861386139, + "grad_norm": 0.21260599792003632, + "learning_rate": 2.0064966049735473e-05, + "loss": 0.0181, + "num_input_tokens_seen": 23286688, + "step": 110340 + }, + { + "epoch": 12.13916391639164, + "grad_norm": 0.3491048514842987, + "learning_rate": 2.006261323147359e-05, + "loss": 0.1415, + "num_input_tokens_seen": 23287712, + "step": 110345 + }, + { + "epoch": 12.13971397139714, + "grad_norm": 0.04898796230554581, + "learning_rate": 2.0060260458717234e-05, + "loss": 0.0234, + "num_input_tokens_seen": 23288768, + "step": 110350 + }, + { + "epoch": 12.14026402640264, + "grad_norm": 0.018054010346531868, + "learning_rate": 2.005790773148808e-05, + "loss": 0.0124, + "num_input_tokens_seen": 23289824, + "step": 110355 + }, + { + "epoch": 12.140814081408141, + "grad_norm": 0.19746823608875275, + "learning_rate": 2.005555504980783e-05, + "loss": 0.0119, + "num_input_tokens_seen": 23291008, + "step": 110360 + }, + { + "epoch": 12.14136413641364, + "grad_norm": 0.49338698387145996, + "learning_rate": 2.005320241369814e-05, + "loss": 0.0135, + "num_input_tokens_seen": 23292096, + "step": 110365 + }, + { + "epoch": 12.141914191419142, + "grad_norm": 0.5198517441749573, + "learning_rate": 2.005084982318072e-05, + "loss": 0.0069, + "num_input_tokens_seen": 23293120, + "step": 110370 + }, + { + "epoch": 12.142464246424643, + "grad_norm": 0.010053453035652637, + "learning_rate": 2.0048497278277247e-05, + "loss": 0.0026, + "num_input_tokens_seen": 23294240, + "step": 110375 + }, + { + "epoch": 12.143014301430142, + "grad_norm": 0.13586904108524323, + "learning_rate": 2.0046144779009386e-05, + "loss": 0.017, + "num_input_tokens_seen": 23295232, + "step": 110380 + }, + { + "epoch": 12.143564356435643, + "grad_norm": 0.030571918934583664, + "learning_rate": 2.004379232539884e-05, + "loss": 0.0791, + "num_input_tokens_seen": 23296352, + "step": 110385 + }, + { + "epoch": 12.144114411441144, + "grad_norm": 0.014700356870889664, + "learning_rate": 2.004143991746728e-05, + "loss": 0.0186, + "num_input_tokens_seen": 23297440, + "step": 110390 + }, + { + "epoch": 12.144664466446645, + "grad_norm": 0.368054062128067, + "learning_rate": 2.003908755523639e-05, + "loss": 0.0314, + "num_input_tokens_seen": 23298592, + "step": 110395 + }, + { + "epoch": 12.145214521452145, + "grad_norm": 0.026950718834996223, + "learning_rate": 2.0036735238727854e-05, + "loss": 0.017, + "num_input_tokens_seen": 23299680, + "step": 110400 + }, + { + "epoch": 12.145764576457646, + "grad_norm": 0.11712007224559784, + "learning_rate": 2.0034382967963336e-05, + "loss": 0.0925, + "num_input_tokens_seen": 23300736, + "step": 110405 + }, + { + "epoch": 12.146314631463147, + "grad_norm": 0.5050343871116638, + "learning_rate": 2.0032030742964537e-05, + "loss": 0.0314, + "num_input_tokens_seen": 23301824, + "step": 110410 + }, + { + "epoch": 12.146864686468646, + "grad_norm": 1.5402231216430664, + "learning_rate": 2.0029678563753114e-05, + "loss": 0.0685, + "num_input_tokens_seen": 23302880, + "step": 110415 + }, + { + "epoch": 12.147414741474147, + "grad_norm": 0.013680849224328995, + "learning_rate": 2.0027326430350778e-05, + "loss": 0.0104, + "num_input_tokens_seen": 23303968, + "step": 110420 + }, + { + "epoch": 12.147964796479648, + "grad_norm": 1.0117404460906982, + "learning_rate": 2.0024974342779178e-05, + "loss": 0.0643, + "num_input_tokens_seen": 23305088, + "step": 110425 + }, + { + "epoch": 12.148514851485148, + "grad_norm": 0.07463479787111282, + "learning_rate": 2.0022622301059996e-05, + "loss": 0.0019, + "num_input_tokens_seen": 23306176, + "step": 110430 + }, + { + "epoch": 12.149064906490649, + "grad_norm": 0.02058347873389721, + "learning_rate": 2.0020270305214926e-05, + "loss": 0.0459, + "num_input_tokens_seen": 23307232, + "step": 110435 + }, + { + "epoch": 12.14961496149615, + "grad_norm": 0.0688895657658577, + "learning_rate": 2.0017918355265635e-05, + "loss": 0.0177, + "num_input_tokens_seen": 23308288, + "step": 110440 + }, + { + "epoch": 12.150165016501651, + "grad_norm": 0.04413817450404167, + "learning_rate": 2.0015566451233793e-05, + "loss": 0.0084, + "num_input_tokens_seen": 23309376, + "step": 110445 + }, + { + "epoch": 12.15071507150715, + "grad_norm": 0.03347209095954895, + "learning_rate": 2.0013214593141084e-05, + "loss": 0.1087, + "num_input_tokens_seen": 23310464, + "step": 110450 + }, + { + "epoch": 12.151265126512651, + "grad_norm": 1.1790661811828613, + "learning_rate": 2.0010862781009183e-05, + "loss": 0.0443, + "num_input_tokens_seen": 23311616, + "step": 110455 + }, + { + "epoch": 12.151815181518153, + "grad_norm": 0.010510695166885853, + "learning_rate": 2.0008511014859777e-05, + "loss": 0.0056, + "num_input_tokens_seen": 23312608, + "step": 110460 + }, + { + "epoch": 12.152365236523652, + "grad_norm": 0.021365078166127205, + "learning_rate": 2.0006159294714522e-05, + "loss": 0.0353, + "num_input_tokens_seen": 23313696, + "step": 110465 + }, + { + "epoch": 12.152915291529153, + "grad_norm": 0.4100249707698822, + "learning_rate": 2.0003807620595093e-05, + "loss": 0.0357, + "num_input_tokens_seen": 23314784, + "step": 110470 + }, + { + "epoch": 12.153465346534654, + "grad_norm": 1.0231056213378906, + "learning_rate": 2.0001455992523184e-05, + "loss": 0.0151, + "num_input_tokens_seen": 23315808, + "step": 110475 + }, + { + "epoch": 12.154015401540153, + "grad_norm": 0.031169012188911438, + "learning_rate": 1.9999104410520448e-05, + "loss": 0.0059, + "num_input_tokens_seen": 23316960, + "step": 110480 + }, + { + "epoch": 12.154565456545654, + "grad_norm": 0.020275508984923363, + "learning_rate": 1.9996752874608572e-05, + "loss": 0.1461, + "num_input_tokens_seen": 23318048, + "step": 110485 + }, + { + "epoch": 12.155115511551156, + "grad_norm": 0.08695300668478012, + "learning_rate": 1.999440138480923e-05, + "loss": 0.0556, + "num_input_tokens_seen": 23319072, + "step": 110490 + }, + { + "epoch": 12.155665566556655, + "grad_norm": 0.028062736615538597, + "learning_rate": 1.9992049941144066e-05, + "loss": 0.009, + "num_input_tokens_seen": 23320160, + "step": 110495 + }, + { + "epoch": 12.156215621562156, + "grad_norm": 0.28239190578460693, + "learning_rate": 1.9989698543634796e-05, + "loss": 0.0135, + "num_input_tokens_seen": 23321184, + "step": 110500 + }, + { + "epoch": 12.156765676567657, + "grad_norm": 0.20970728993415833, + "learning_rate": 1.9987347192303062e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23322240, + "step": 110505 + }, + { + "epoch": 12.157315731573158, + "grad_norm": 0.02149984799325466, + "learning_rate": 1.9984995887170544e-05, + "loss": 0.0495, + "num_input_tokens_seen": 23323360, + "step": 110510 + }, + { + "epoch": 12.157865786578657, + "grad_norm": 0.06699982285499573, + "learning_rate": 1.998264462825892e-05, + "loss": 0.0101, + "num_input_tokens_seen": 23324448, + "step": 110515 + }, + { + "epoch": 12.158415841584159, + "grad_norm": 0.07385141402482986, + "learning_rate": 1.998029341558984e-05, + "loss": 0.0044, + "num_input_tokens_seen": 23325472, + "step": 110520 + }, + { + "epoch": 12.15896589658966, + "grad_norm": 0.00921916589140892, + "learning_rate": 1.9977942249184993e-05, + "loss": 0.0108, + "num_input_tokens_seen": 23326432, + "step": 110525 + }, + { + "epoch": 12.159515951595159, + "grad_norm": 0.11498454958200455, + "learning_rate": 1.997559112906604e-05, + "loss": 0.0414, + "num_input_tokens_seen": 23327520, + "step": 110530 + }, + { + "epoch": 12.16006600660066, + "grad_norm": 0.03935238718986511, + "learning_rate": 1.9973240055254663e-05, + "loss": 0.0119, + "num_input_tokens_seen": 23328544, + "step": 110535 + }, + { + "epoch": 12.160616061606161, + "grad_norm": 0.03510189428925514, + "learning_rate": 1.997088902777251e-05, + "loss": 0.0672, + "num_input_tokens_seen": 23329632, + "step": 110540 + }, + { + "epoch": 12.16116611661166, + "grad_norm": 1.816656231880188, + "learning_rate": 1.996853804664126e-05, + "loss": 0.036, + "num_input_tokens_seen": 23330688, + "step": 110545 + }, + { + "epoch": 12.161716171617162, + "grad_norm": 0.03748100623488426, + "learning_rate": 1.996618711188259e-05, + "loss": 0.0509, + "num_input_tokens_seen": 23331712, + "step": 110550 + }, + { + "epoch": 12.162266226622663, + "grad_norm": 0.05356979742646217, + "learning_rate": 1.996383622351814e-05, + "loss": 0.0324, + "num_input_tokens_seen": 23332832, + "step": 110555 + }, + { + "epoch": 12.162816281628162, + "grad_norm": 0.07772811502218246, + "learning_rate": 1.9961485381569614e-05, + "loss": 0.011, + "num_input_tokens_seen": 23333856, + "step": 110560 + }, + { + "epoch": 12.163366336633663, + "grad_norm": 0.03669213131070137, + "learning_rate": 1.9959134586058654e-05, + "loss": 0.0041, + "num_input_tokens_seen": 23334880, + "step": 110565 + }, + { + "epoch": 12.163916391639164, + "grad_norm": 0.2676081955432892, + "learning_rate": 1.9956783837006922e-05, + "loss": 0.0084, + "num_input_tokens_seen": 23335872, + "step": 110570 + }, + { + "epoch": 12.164466446644665, + "grad_norm": 0.00949207041412592, + "learning_rate": 1.9954433134436106e-05, + "loss": 0.0149, + "num_input_tokens_seen": 23336896, + "step": 110575 + }, + { + "epoch": 12.165016501650165, + "grad_norm": 0.034655388444662094, + "learning_rate": 1.995208247836785e-05, + "loss": 0.0488, + "num_input_tokens_seen": 23337952, + "step": 110580 + }, + { + "epoch": 12.165566556655666, + "grad_norm": 0.009537292644381523, + "learning_rate": 1.994973186882383e-05, + "loss": 0.0432, + "num_input_tokens_seen": 23339008, + "step": 110585 + }, + { + "epoch": 12.166116611661167, + "grad_norm": 0.8507912755012512, + "learning_rate": 1.9947381305825717e-05, + "loss": 0.0746, + "num_input_tokens_seen": 23340096, + "step": 110590 + }, + { + "epoch": 12.166666666666666, + "grad_norm": 1.5042637586593628, + "learning_rate": 1.9945030789395152e-05, + "loss": 0.0834, + "num_input_tokens_seen": 23341088, + "step": 110595 + }, + { + "epoch": 12.167216721672167, + "grad_norm": 0.020377637818455696, + "learning_rate": 1.9942680319553818e-05, + "loss": 0.0504, + "num_input_tokens_seen": 23342144, + "step": 110600 + }, + { + "epoch": 12.167766776677668, + "grad_norm": 0.01528916321694851, + "learning_rate": 1.994032989632337e-05, + "loss": 0.0479, + "num_input_tokens_seen": 23343136, + "step": 110605 + }, + { + "epoch": 12.168316831683168, + "grad_norm": 0.020525168627500534, + "learning_rate": 1.9937979519725485e-05, + "loss": 0.0841, + "num_input_tokens_seen": 23344160, + "step": 110610 + }, + { + "epoch": 12.168866886688669, + "grad_norm": 0.3480483293533325, + "learning_rate": 1.9935629189781803e-05, + "loss": 0.0467, + "num_input_tokens_seen": 23345152, + "step": 110615 + }, + { + "epoch": 12.16941694169417, + "grad_norm": 0.0497538298368454, + "learning_rate": 1.9933278906513997e-05, + "loss": 0.0398, + "num_input_tokens_seen": 23346240, + "step": 110620 + }, + { + "epoch": 12.16996699669967, + "grad_norm": 0.015483078546822071, + "learning_rate": 1.993092866994373e-05, + "loss": 0.0234, + "num_input_tokens_seen": 23347296, + "step": 110625 + }, + { + "epoch": 12.17051705170517, + "grad_norm": 0.14140407741069794, + "learning_rate": 1.9928578480092665e-05, + "loss": 0.004, + "num_input_tokens_seen": 23348352, + "step": 110630 + }, + { + "epoch": 12.171067106710671, + "grad_norm": 0.3037475049495697, + "learning_rate": 1.992622833698245e-05, + "loss": 0.0039, + "num_input_tokens_seen": 23349376, + "step": 110635 + }, + { + "epoch": 12.171617161716172, + "grad_norm": 0.05464792251586914, + "learning_rate": 1.992387824063476e-05, + "loss": 0.0075, + "num_input_tokens_seen": 23350432, + "step": 110640 + }, + { + "epoch": 12.172167216721672, + "grad_norm": 2.1106855869293213, + "learning_rate": 1.992152819107124e-05, + "loss": 0.0348, + "num_input_tokens_seen": 23351520, + "step": 110645 + }, + { + "epoch": 12.172717271727173, + "grad_norm": 0.04268481209874153, + "learning_rate": 1.9919178188313575e-05, + "loss": 0.0039, + "num_input_tokens_seen": 23352576, + "step": 110650 + }, + { + "epoch": 12.173267326732674, + "grad_norm": 0.11431887745857239, + "learning_rate": 1.9916828232383396e-05, + "loss": 0.0065, + "num_input_tokens_seen": 23353600, + "step": 110655 + }, + { + "epoch": 12.173817381738173, + "grad_norm": 0.1252332627773285, + "learning_rate": 1.991447832330237e-05, + "loss": 0.0072, + "num_input_tokens_seen": 23354688, + "step": 110660 + }, + { + "epoch": 12.174367436743674, + "grad_norm": 0.07613702863454819, + "learning_rate": 1.9912128461092165e-05, + "loss": 0.0164, + "num_input_tokens_seen": 23355776, + "step": 110665 + }, + { + "epoch": 12.174917491749175, + "grad_norm": 0.6321103572845459, + "learning_rate": 1.990977864577442e-05, + "loss": 0.0112, + "num_input_tokens_seen": 23356832, + "step": 110670 + }, + { + "epoch": 12.175467546754675, + "grad_norm": 0.05745282024145126, + "learning_rate": 1.990742887737081e-05, + "loss": 0.005, + "num_input_tokens_seen": 23357920, + "step": 110675 + }, + { + "epoch": 12.176017601760176, + "grad_norm": 0.23311153054237366, + "learning_rate": 1.990507915590298e-05, + "loss": 0.0069, + "num_input_tokens_seen": 23358944, + "step": 110680 + }, + { + "epoch": 12.176567656765677, + "grad_norm": 0.041278280317783356, + "learning_rate": 1.990272948139259e-05, + "loss": 0.0104, + "num_input_tokens_seen": 23360000, + "step": 110685 + }, + { + "epoch": 12.177117711771178, + "grad_norm": 0.02662292867898941, + "learning_rate": 1.9900379853861306e-05, + "loss": 0.0039, + "num_input_tokens_seen": 23361056, + "step": 110690 + }, + { + "epoch": 12.177667766776677, + "grad_norm": 0.1031142920255661, + "learning_rate": 1.9898030273330763e-05, + "loss": 0.0788, + "num_input_tokens_seen": 23362112, + "step": 110695 + }, + { + "epoch": 12.178217821782178, + "grad_norm": 0.060701481997966766, + "learning_rate": 1.989568073982263e-05, + "loss": 0.0061, + "num_input_tokens_seen": 23363168, + "step": 110700 + }, + { + "epoch": 12.17876787678768, + "grad_norm": 0.04800145700573921, + "learning_rate": 1.9893331253358566e-05, + "loss": 0.0113, + "num_input_tokens_seen": 23364224, + "step": 110705 + }, + { + "epoch": 12.179317931793179, + "grad_norm": 0.07597306370735168, + "learning_rate": 1.9890981813960208e-05, + "loss": 0.007, + "num_input_tokens_seen": 23365216, + "step": 110710 + }, + { + "epoch": 12.17986798679868, + "grad_norm": 0.03771926462650299, + "learning_rate": 1.9888632421649222e-05, + "loss": 0.0502, + "num_input_tokens_seen": 23366304, + "step": 110715 + }, + { + "epoch": 12.180418041804181, + "grad_norm": 0.00805632770061493, + "learning_rate": 1.9886283076447254e-05, + "loss": 0.0385, + "num_input_tokens_seen": 23367392, + "step": 110720 + }, + { + "epoch": 12.18096809680968, + "grad_norm": 0.02899163030087948, + "learning_rate": 1.9883933778375968e-05, + "loss": 0.0085, + "num_input_tokens_seen": 23368416, + "step": 110725 + }, + { + "epoch": 12.181518151815181, + "grad_norm": 1.960247278213501, + "learning_rate": 1.988158452745701e-05, + "loss": 0.0376, + "num_input_tokens_seen": 23369472, + "step": 110730 + }, + { + "epoch": 12.182068206820682, + "grad_norm": 0.01298485603183508, + "learning_rate": 1.987923532371202e-05, + "loss": 0.013, + "num_input_tokens_seen": 23370560, + "step": 110735 + }, + { + "epoch": 12.182618261826182, + "grad_norm": 3.051293134689331, + "learning_rate": 1.987688616716267e-05, + "loss": 0.134, + "num_input_tokens_seen": 23371584, + "step": 110740 + }, + { + "epoch": 12.183168316831683, + "grad_norm": 0.01557027455419302, + "learning_rate": 1.98745370578306e-05, + "loss": 0.004, + "num_input_tokens_seen": 23372640, + "step": 110745 + }, + { + "epoch": 12.183718371837184, + "grad_norm": 0.05721030384302139, + "learning_rate": 1.987218799573747e-05, + "loss": 0.0729, + "num_input_tokens_seen": 23373696, + "step": 110750 + }, + { + "epoch": 12.184268426842685, + "grad_norm": 0.020656829699873924, + "learning_rate": 1.9869838980904914e-05, + "loss": 0.0039, + "num_input_tokens_seen": 23374688, + "step": 110755 + }, + { + "epoch": 12.184818481848184, + "grad_norm": 0.058892872184515, + "learning_rate": 1.986749001335459e-05, + "loss": 0.0645, + "num_input_tokens_seen": 23375744, + "step": 110760 + }, + { + "epoch": 12.185368536853685, + "grad_norm": 0.00967680849134922, + "learning_rate": 1.9865141093108155e-05, + "loss": 0.0085, + "num_input_tokens_seen": 23376864, + "step": 110765 + }, + { + "epoch": 12.185918591859187, + "grad_norm": 0.015950459986925125, + "learning_rate": 1.986279222018724e-05, + "loss": 0.0546, + "num_input_tokens_seen": 23377888, + "step": 110770 + }, + { + "epoch": 12.186468646864686, + "grad_norm": 0.01892750710248947, + "learning_rate": 1.986044339461351e-05, + "loss": 0.0684, + "num_input_tokens_seen": 23378912, + "step": 110775 + }, + { + "epoch": 12.187018701870187, + "grad_norm": 0.0416920967400074, + "learning_rate": 1.9858094616408613e-05, + "loss": 0.0016, + "num_input_tokens_seen": 23379968, + "step": 110780 + }, + { + "epoch": 12.187568756875688, + "grad_norm": 0.022671164944767952, + "learning_rate": 1.985574588559418e-05, + "loss": 0.0022, + "num_input_tokens_seen": 23381024, + "step": 110785 + }, + { + "epoch": 12.188118811881187, + "grad_norm": 0.020148133859038353, + "learning_rate": 1.9853397202191874e-05, + "loss": 0.0028, + "num_input_tokens_seen": 23382016, + "step": 110790 + }, + { + "epoch": 12.188668866886688, + "grad_norm": 0.020121386274695396, + "learning_rate": 1.9851048566223332e-05, + "loss": 0.0031, + "num_input_tokens_seen": 23383040, + "step": 110795 + }, + { + "epoch": 12.18921892189219, + "grad_norm": 0.34523457288742065, + "learning_rate": 1.9848699977710208e-05, + "loss": 0.0692, + "num_input_tokens_seen": 23384160, + "step": 110800 + }, + { + "epoch": 12.189768976897689, + "grad_norm": 0.052941594272851944, + "learning_rate": 1.984635143667415e-05, + "loss": 0.0082, + "num_input_tokens_seen": 23385216, + "step": 110805 + }, + { + "epoch": 12.19031903190319, + "grad_norm": 0.11522682756185532, + "learning_rate": 1.9844002943136788e-05, + "loss": 0.0112, + "num_input_tokens_seen": 23386240, + "step": 110810 + }, + { + "epoch": 12.190869086908691, + "grad_norm": 2.033831834793091, + "learning_rate": 1.9841654497119782e-05, + "loss": 0.1652, + "num_input_tokens_seen": 23387232, + "step": 110815 + }, + { + "epoch": 12.191419141914192, + "grad_norm": 0.013385108672082424, + "learning_rate": 1.9839306098644777e-05, + "loss": 0.0015, + "num_input_tokens_seen": 23388288, + "step": 110820 + }, + { + "epoch": 12.191969196919691, + "grad_norm": 0.16645489633083344, + "learning_rate": 1.98369577477334e-05, + "loss": 0.0035, + "num_input_tokens_seen": 23389312, + "step": 110825 + }, + { + "epoch": 12.192519251925193, + "grad_norm": 0.1047118529677391, + "learning_rate": 1.983460944440731e-05, + "loss": 0.0061, + "num_input_tokens_seen": 23390400, + "step": 110830 + }, + { + "epoch": 12.193069306930694, + "grad_norm": 0.028850506991147995, + "learning_rate": 1.9832261188688146e-05, + "loss": 0.0023, + "num_input_tokens_seen": 23391456, + "step": 110835 + }, + { + "epoch": 12.193619361936193, + "grad_norm": 0.009810320101678371, + "learning_rate": 1.9829912980597555e-05, + "loss": 0.0084, + "num_input_tokens_seen": 23392512, + "step": 110840 + }, + { + "epoch": 12.194169416941694, + "grad_norm": 1.515531301498413, + "learning_rate": 1.9827564820157172e-05, + "loss": 0.0747, + "num_input_tokens_seen": 23393600, + "step": 110845 + }, + { + "epoch": 12.194719471947195, + "grad_norm": 0.03083246760070324, + "learning_rate": 1.9825216707388637e-05, + "loss": 0.0521, + "num_input_tokens_seen": 23394656, + "step": 110850 + }, + { + "epoch": 12.195269526952695, + "grad_norm": 0.009355181828141212, + "learning_rate": 1.982286864231361e-05, + "loss": 0.0018, + "num_input_tokens_seen": 23395648, + "step": 110855 + }, + { + "epoch": 12.195819581958196, + "grad_norm": 0.034341610968112946, + "learning_rate": 1.98205206249537e-05, + "loss": 0.0354, + "num_input_tokens_seen": 23396704, + "step": 110860 + }, + { + "epoch": 12.196369636963697, + "grad_norm": 0.003956349100917578, + "learning_rate": 1.9818172655330583e-05, + "loss": 0.0088, + "num_input_tokens_seen": 23397792, + "step": 110865 + }, + { + "epoch": 12.196919691969198, + "grad_norm": 0.08583714812994003, + "learning_rate": 1.9815824733465876e-05, + "loss": 0.1486, + "num_input_tokens_seen": 23398848, + "step": 110870 + }, + { + "epoch": 12.197469746974697, + "grad_norm": 0.022372618317604065, + "learning_rate": 1.981347685938122e-05, + "loss": 0.0034, + "num_input_tokens_seen": 23399840, + "step": 110875 + }, + { + "epoch": 12.198019801980198, + "grad_norm": 0.024394430220127106, + "learning_rate": 1.981112903309827e-05, + "loss": 0.0394, + "num_input_tokens_seen": 23400928, + "step": 110880 + }, + { + "epoch": 12.1985698569857, + "grad_norm": 0.0411057211458683, + "learning_rate": 1.9808781254638643e-05, + "loss": 0.0667, + "num_input_tokens_seen": 23402080, + "step": 110885 + }, + { + "epoch": 12.199119911991199, + "grad_norm": 1.1979329586029053, + "learning_rate": 1.980643352402399e-05, + "loss": 0.0446, + "num_input_tokens_seen": 23403072, + "step": 110890 + }, + { + "epoch": 12.1996699669967, + "grad_norm": 0.017318595200777054, + "learning_rate": 1.9804085841275957e-05, + "loss": 0.0107, + "num_input_tokens_seen": 23404128, + "step": 110895 + }, + { + "epoch": 12.2002200220022, + "grad_norm": 0.1099901795387268, + "learning_rate": 1.980173820641616e-05, + "loss": 0.0677, + "num_input_tokens_seen": 23405184, + "step": 110900 + }, + { + "epoch": 12.2007700770077, + "grad_norm": 0.2985727787017822, + "learning_rate": 1.979939061946625e-05, + "loss": 0.0605, + "num_input_tokens_seen": 23406240, + "step": 110905 + }, + { + "epoch": 12.201320132013201, + "grad_norm": 0.031622909009456635, + "learning_rate": 1.979704308044786e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23407296, + "step": 110910 + }, + { + "epoch": 12.201870187018702, + "grad_norm": 0.020384078845381737, + "learning_rate": 1.9794695589382637e-05, + "loss": 0.0776, + "num_input_tokens_seen": 23408352, + "step": 110915 + }, + { + "epoch": 12.202420242024202, + "grad_norm": 0.14308470487594604, + "learning_rate": 1.97923481462922e-05, + "loss": 0.0267, + "num_input_tokens_seen": 23409440, + "step": 110920 + }, + { + "epoch": 12.202970297029703, + "grad_norm": 0.027315469458699226, + "learning_rate": 1.9790000751198188e-05, + "loss": 0.0023, + "num_input_tokens_seen": 23410496, + "step": 110925 + }, + { + "epoch": 12.203520352035204, + "grad_norm": 0.016273213550448418, + "learning_rate": 1.9787653404122243e-05, + "loss": 0.0196, + "num_input_tokens_seen": 23411552, + "step": 110930 + }, + { + "epoch": 12.204070407040705, + "grad_norm": 0.3556426465511322, + "learning_rate": 1.978530610508599e-05, + "loss": 0.0053, + "num_input_tokens_seen": 23412576, + "step": 110935 + }, + { + "epoch": 12.204620462046204, + "grad_norm": 0.03732222318649292, + "learning_rate": 1.978295885411108e-05, + "loss": 0.0206, + "num_input_tokens_seen": 23413600, + "step": 110940 + }, + { + "epoch": 12.205170517051705, + "grad_norm": 0.18878990411758423, + "learning_rate": 1.9780611651219128e-05, + "loss": 0.0048, + "num_input_tokens_seen": 23414624, + "step": 110945 + }, + { + "epoch": 12.205720572057206, + "grad_norm": 0.03711145371198654, + "learning_rate": 1.9778264496431768e-05, + "loss": 0.0159, + "num_input_tokens_seen": 23415744, + "step": 110950 + }, + { + "epoch": 12.206270627062706, + "grad_norm": 0.05159680172801018, + "learning_rate": 1.9775917389770652e-05, + "loss": 0.0178, + "num_input_tokens_seen": 23416832, + "step": 110955 + }, + { + "epoch": 12.206820682068207, + "grad_norm": 0.018725555390119553, + "learning_rate": 1.9773570331257385e-05, + "loss": 0.0035, + "num_input_tokens_seen": 23417856, + "step": 110960 + }, + { + "epoch": 12.207370737073708, + "grad_norm": 2.181626081466675, + "learning_rate": 1.977122332091362e-05, + "loss": 0.0504, + "num_input_tokens_seen": 23418912, + "step": 110965 + }, + { + "epoch": 12.207920792079207, + "grad_norm": 0.005884845275431871, + "learning_rate": 1.9768876358760985e-05, + "loss": 0.0626, + "num_input_tokens_seen": 23419968, + "step": 110970 + }, + { + "epoch": 12.208470847084708, + "grad_norm": 0.01934560015797615, + "learning_rate": 1.976652944482109e-05, + "loss": 0.1287, + "num_input_tokens_seen": 23421024, + "step": 110975 + }, + { + "epoch": 12.20902090209021, + "grad_norm": 0.01779816672205925, + "learning_rate": 1.9764182579115596e-05, + "loss": 0.0323, + "num_input_tokens_seen": 23422080, + "step": 110980 + }, + { + "epoch": 12.209570957095709, + "grad_norm": 0.07492861896753311, + "learning_rate": 1.9761835761666114e-05, + "loss": 0.0075, + "num_input_tokens_seen": 23423136, + "step": 110985 + }, + { + "epoch": 12.21012101210121, + "grad_norm": 0.017870711162686348, + "learning_rate": 1.9759488992494278e-05, + "loss": 0.0269, + "num_input_tokens_seen": 23424192, + "step": 110990 + }, + { + "epoch": 12.210671067106711, + "grad_norm": 0.0035973277408629656, + "learning_rate": 1.9757142271621727e-05, + "loss": 0.0507, + "num_input_tokens_seen": 23425248, + "step": 110995 + }, + { + "epoch": 12.211221122112212, + "grad_norm": 0.01342963520437479, + "learning_rate": 1.9754795599070068e-05, + "loss": 0.0884, + "num_input_tokens_seen": 23426272, + "step": 111000 + }, + { + "epoch": 12.211771177117711, + "grad_norm": 0.061771027743816376, + "learning_rate": 1.9752448974860945e-05, + "loss": 0.0062, + "num_input_tokens_seen": 23427328, + "step": 111005 + }, + { + "epoch": 12.212321232123212, + "grad_norm": 1.8772259950637817, + "learning_rate": 1.9750102399015986e-05, + "loss": 0.1225, + "num_input_tokens_seen": 23428416, + "step": 111010 + }, + { + "epoch": 12.212871287128714, + "grad_norm": 0.0261052418500185, + "learning_rate": 1.9747755871556805e-05, + "loss": 0.0018, + "num_input_tokens_seen": 23429472, + "step": 111015 + }, + { + "epoch": 12.213421342134213, + "grad_norm": 0.026227138936519623, + "learning_rate": 1.9745409392505043e-05, + "loss": 0.0486, + "num_input_tokens_seen": 23430528, + "step": 111020 + }, + { + "epoch": 12.213971397139714, + "grad_norm": 0.05962378904223442, + "learning_rate": 1.9743062961882318e-05, + "loss": 0.03, + "num_input_tokens_seen": 23431616, + "step": 111025 + }, + { + "epoch": 12.214521452145215, + "grad_norm": 0.035992082208395004, + "learning_rate": 1.9740716579710267e-05, + "loss": 0.0065, + "num_input_tokens_seen": 23432640, + "step": 111030 + }, + { + "epoch": 12.215071507150714, + "grad_norm": 0.05759146809577942, + "learning_rate": 1.9738370246010503e-05, + "loss": 0.0081, + "num_input_tokens_seen": 23433696, + "step": 111035 + }, + { + "epoch": 12.215621562156215, + "grad_norm": 0.05678822100162506, + "learning_rate": 1.973602396080465e-05, + "loss": 0.0355, + "num_input_tokens_seen": 23434752, + "step": 111040 + }, + { + "epoch": 12.216171617161717, + "grad_norm": 1.1474113464355469, + "learning_rate": 1.9733677724114342e-05, + "loss": 0.05, + "num_input_tokens_seen": 23435776, + "step": 111045 + }, + { + "epoch": 12.216721672167218, + "grad_norm": 1.4243500232696533, + "learning_rate": 1.97313315359612e-05, + "loss": 0.03, + "num_input_tokens_seen": 23436896, + "step": 111050 + }, + { + "epoch": 12.217271727172717, + "grad_norm": 0.40186163783073425, + "learning_rate": 1.972898539636685e-05, + "loss": 0.0196, + "num_input_tokens_seen": 23438048, + "step": 111055 + }, + { + "epoch": 12.217821782178218, + "grad_norm": 0.0067073507234454155, + "learning_rate": 1.9726639305352913e-05, + "loss": 0.0907, + "num_input_tokens_seen": 23439072, + "step": 111060 + }, + { + "epoch": 12.218371837183719, + "grad_norm": 1.588433027267456, + "learning_rate": 1.9724293262941003e-05, + "loss": 0.1159, + "num_input_tokens_seen": 23440160, + "step": 111065 + }, + { + "epoch": 12.218921892189218, + "grad_norm": 0.6874143481254578, + "learning_rate": 1.9721947269152763e-05, + "loss": 0.0111, + "num_input_tokens_seen": 23441248, + "step": 111070 + }, + { + "epoch": 12.21947194719472, + "grad_norm": 1.2644731998443604, + "learning_rate": 1.971960132400979e-05, + "loss": 0.0368, + "num_input_tokens_seen": 23442304, + "step": 111075 + }, + { + "epoch": 12.22002200220022, + "grad_norm": 0.03846825659275055, + "learning_rate": 1.9717255427533723e-05, + "loss": 0.0571, + "num_input_tokens_seen": 23443296, + "step": 111080 + }, + { + "epoch": 12.22057205720572, + "grad_norm": 1.0459345579147339, + "learning_rate": 1.9714909579746186e-05, + "loss": 0.0186, + "num_input_tokens_seen": 23444352, + "step": 111085 + }, + { + "epoch": 12.221122112211221, + "grad_norm": 0.15367887914180756, + "learning_rate": 1.9712563780668777e-05, + "loss": 0.0051, + "num_input_tokens_seen": 23445408, + "step": 111090 + }, + { + "epoch": 12.221672167216722, + "grad_norm": 0.02337607368826866, + "learning_rate": 1.9710218030323138e-05, + "loss": 0.0044, + "num_input_tokens_seen": 23446496, + "step": 111095 + }, + { + "epoch": 12.222222222222221, + "grad_norm": 0.02200441248714924, + "learning_rate": 1.9707872328730876e-05, + "loss": 0.0069, + "num_input_tokens_seen": 23447552, + "step": 111100 + }, + { + "epoch": 12.222772277227723, + "grad_norm": 0.014505107887089252, + "learning_rate": 1.9705526675913617e-05, + "loss": 0.0534, + "num_input_tokens_seen": 23448576, + "step": 111105 + }, + { + "epoch": 12.223322332233224, + "grad_norm": 0.008921146392822266, + "learning_rate": 1.9703181071892988e-05, + "loss": 0.001, + "num_input_tokens_seen": 23449600, + "step": 111110 + }, + { + "epoch": 12.223872387238725, + "grad_norm": 0.633385181427002, + "learning_rate": 1.9700835516690584e-05, + "loss": 0.0591, + "num_input_tokens_seen": 23450624, + "step": 111115 + }, + { + "epoch": 12.224422442244224, + "grad_norm": 0.01379534974694252, + "learning_rate": 1.9698490010328043e-05, + "loss": 0.0067, + "num_input_tokens_seen": 23451680, + "step": 111120 + }, + { + "epoch": 12.224972497249725, + "grad_norm": 0.020715737715363503, + "learning_rate": 1.969614455282697e-05, + "loss": 0.0021, + "num_input_tokens_seen": 23452800, + "step": 111125 + }, + { + "epoch": 12.225522552255226, + "grad_norm": 1.923896312713623, + "learning_rate": 1.9693799144208997e-05, + "loss": 0.0663, + "num_input_tokens_seen": 23453856, + "step": 111130 + }, + { + "epoch": 12.226072607260726, + "grad_norm": 0.3631644546985626, + "learning_rate": 1.969145378449573e-05, + "loss": 0.0263, + "num_input_tokens_seen": 23454944, + "step": 111135 + }, + { + "epoch": 12.226622662266227, + "grad_norm": 0.019158899784088135, + "learning_rate": 1.9689108473708776e-05, + "loss": 0.0066, + "num_input_tokens_seen": 23456032, + "step": 111140 + }, + { + "epoch": 12.227172717271728, + "grad_norm": 0.018562769517302513, + "learning_rate": 1.9686763211869776e-05, + "loss": 0.0059, + "num_input_tokens_seen": 23457152, + "step": 111145 + }, + { + "epoch": 12.227722772277227, + "grad_norm": 0.7722204923629761, + "learning_rate": 1.968441799900031e-05, + "loss": 0.0144, + "num_input_tokens_seen": 23458240, + "step": 111150 + }, + { + "epoch": 12.228272827282728, + "grad_norm": 0.5422900319099426, + "learning_rate": 1.968207283512203e-05, + "loss": 0.0075, + "num_input_tokens_seen": 23459232, + "step": 111155 + }, + { + "epoch": 12.22882288228823, + "grad_norm": 0.009642400778830051, + "learning_rate": 1.9679727720256524e-05, + "loss": 0.0021, + "num_input_tokens_seen": 23460288, + "step": 111160 + }, + { + "epoch": 12.229372937293729, + "grad_norm": 0.021146520972251892, + "learning_rate": 1.9677382654425414e-05, + "loss": 0.0052, + "num_input_tokens_seen": 23461408, + "step": 111165 + }, + { + "epoch": 12.22992299229923, + "grad_norm": 0.48552945256233215, + "learning_rate": 1.9675037637650325e-05, + "loss": 0.0077, + "num_input_tokens_seen": 23462464, + "step": 111170 + }, + { + "epoch": 12.23047304730473, + "grad_norm": 1.8533107042312622, + "learning_rate": 1.9672692669952843e-05, + "loss": 0.058, + "num_input_tokens_seen": 23463520, + "step": 111175 + }, + { + "epoch": 12.231023102310232, + "grad_norm": 1.3535981178283691, + "learning_rate": 1.9670347751354605e-05, + "loss": 0.0226, + "num_input_tokens_seen": 23464608, + "step": 111180 + }, + { + "epoch": 12.231573157315731, + "grad_norm": 0.04270043596625328, + "learning_rate": 1.9668002881877224e-05, + "loss": 0.0272, + "num_input_tokens_seen": 23465568, + "step": 111185 + }, + { + "epoch": 12.232123212321232, + "grad_norm": 0.0627623125910759, + "learning_rate": 1.9665658061542286e-05, + "loss": 0.0071, + "num_input_tokens_seen": 23466624, + "step": 111190 + }, + { + "epoch": 12.232673267326733, + "grad_norm": 0.7063885927200317, + "learning_rate": 1.9663313290371423e-05, + "loss": 0.0083, + "num_input_tokens_seen": 23467680, + "step": 111195 + }, + { + "epoch": 12.233223322332233, + "grad_norm": 1.1914494037628174, + "learning_rate": 1.966096856838624e-05, + "loss": 0.0568, + "num_input_tokens_seen": 23468768, + "step": 111200 + }, + { + "epoch": 12.233773377337734, + "grad_norm": 2.238741159439087, + "learning_rate": 1.9658623895608354e-05, + "loss": 0.0103, + "num_input_tokens_seen": 23469856, + "step": 111205 + }, + { + "epoch": 12.234323432343235, + "grad_norm": 0.2906837463378906, + "learning_rate": 1.9656279272059362e-05, + "loss": 0.0049, + "num_input_tokens_seen": 23470880, + "step": 111210 + }, + { + "epoch": 12.234873487348734, + "grad_norm": 0.04279372841119766, + "learning_rate": 1.9653934697760878e-05, + "loss": 0.0034, + "num_input_tokens_seen": 23471872, + "step": 111215 + }, + { + "epoch": 12.235423542354235, + "grad_norm": 0.05918637290596962, + "learning_rate": 1.965159017273452e-05, + "loss": 0.0416, + "num_input_tokens_seen": 23472864, + "step": 111220 + }, + { + "epoch": 12.235973597359736, + "grad_norm": 0.009935532696545124, + "learning_rate": 1.964924569700189e-05, + "loss": 0.0507, + "num_input_tokens_seen": 23473856, + "step": 111225 + }, + { + "epoch": 12.236523652365236, + "grad_norm": 0.055081941187381744, + "learning_rate": 1.9646901270584583e-05, + "loss": 0.004, + "num_input_tokens_seen": 23474976, + "step": 111230 + }, + { + "epoch": 12.237073707370737, + "grad_norm": 0.08040013909339905, + "learning_rate": 1.964455689350423e-05, + "loss": 0.0108, + "num_input_tokens_seen": 23476032, + "step": 111235 + }, + { + "epoch": 12.237623762376238, + "grad_norm": 0.0252887811511755, + "learning_rate": 1.9642212565782416e-05, + "loss": 0.1404, + "num_input_tokens_seen": 23477056, + "step": 111240 + }, + { + "epoch": 12.238173817381739, + "grad_norm": 1.4275624752044678, + "learning_rate": 1.963986828744077e-05, + "loss": 0.0118, + "num_input_tokens_seen": 23478144, + "step": 111245 + }, + { + "epoch": 12.238723872387238, + "grad_norm": 0.016260966658592224, + "learning_rate": 1.9637524058500878e-05, + "loss": 0.0423, + "num_input_tokens_seen": 23479264, + "step": 111250 + }, + { + "epoch": 12.23927392739274, + "grad_norm": 0.08612287789583206, + "learning_rate": 1.963517987898435e-05, + "loss": 0.0348, + "num_input_tokens_seen": 23480320, + "step": 111255 + }, + { + "epoch": 12.23982398239824, + "grad_norm": 0.04407993331551552, + "learning_rate": 1.963283574891281e-05, + "loss": 0.0267, + "num_input_tokens_seen": 23481408, + "step": 111260 + }, + { + "epoch": 12.24037403740374, + "grad_norm": 0.009695591405034065, + "learning_rate": 1.963049166830783e-05, + "loss": 0.0125, + "num_input_tokens_seen": 23482496, + "step": 111265 + }, + { + "epoch": 12.24092409240924, + "grad_norm": 0.08158744126558304, + "learning_rate": 1.9628147637191042e-05, + "loss": 0.0064, + "num_input_tokens_seen": 23483552, + "step": 111270 + }, + { + "epoch": 12.241474147414742, + "grad_norm": 0.07785646617412567, + "learning_rate": 1.9625803655584042e-05, + "loss": 0.0249, + "num_input_tokens_seen": 23484640, + "step": 111275 + }, + { + "epoch": 12.242024202420241, + "grad_norm": 0.044657450169324875, + "learning_rate": 1.9623459723508415e-05, + "loss": 0.0419, + "num_input_tokens_seen": 23485728, + "step": 111280 + }, + { + "epoch": 12.242574257425742, + "grad_norm": 0.03081057220697403, + "learning_rate": 1.9621115840985795e-05, + "loss": 0.0013, + "num_input_tokens_seen": 23486752, + "step": 111285 + }, + { + "epoch": 12.243124312431243, + "grad_norm": 2.00361704826355, + "learning_rate": 1.9618772008037763e-05, + "loss": 0.1254, + "num_input_tokens_seen": 23487872, + "step": 111290 + }, + { + "epoch": 12.243674367436745, + "grad_norm": 2.4225053787231445, + "learning_rate": 1.961642822468593e-05, + "loss": 0.028, + "num_input_tokens_seen": 23488896, + "step": 111295 + }, + { + "epoch": 12.244224422442244, + "grad_norm": 1.5273925065994263, + "learning_rate": 1.96140844909519e-05, + "loss": 0.0854, + "num_input_tokens_seen": 23489920, + "step": 111300 + }, + { + "epoch": 12.244774477447745, + "grad_norm": 2.1562883853912354, + "learning_rate": 1.9611740806857253e-05, + "loss": 0.1432, + "num_input_tokens_seen": 23491008, + "step": 111305 + }, + { + "epoch": 12.245324532453246, + "grad_norm": 2.233703136444092, + "learning_rate": 1.9609397172423617e-05, + "loss": 0.154, + "num_input_tokens_seen": 23492128, + "step": 111310 + }, + { + "epoch": 12.245874587458745, + "grad_norm": 0.4512510895729065, + "learning_rate": 1.9607053587672576e-05, + "loss": 0.0199, + "num_input_tokens_seen": 23493184, + "step": 111315 + }, + { + "epoch": 12.246424642464246, + "grad_norm": 0.08378447592258453, + "learning_rate": 1.960471005262574e-05, + "loss": 0.0052, + "num_input_tokens_seen": 23494176, + "step": 111320 + }, + { + "epoch": 12.246974697469748, + "grad_norm": 0.05036405473947525, + "learning_rate": 1.96023665673047e-05, + "loss": 0.0765, + "num_input_tokens_seen": 23495200, + "step": 111325 + }, + { + "epoch": 12.247524752475247, + "grad_norm": 1.0413401126861572, + "learning_rate": 1.9600023131731054e-05, + "loss": 0.0819, + "num_input_tokens_seen": 23496256, + "step": 111330 + }, + { + "epoch": 12.248074807480748, + "grad_norm": 0.023421593010425568, + "learning_rate": 1.9597679745926412e-05, + "loss": 0.0031, + "num_input_tokens_seen": 23497280, + "step": 111335 + }, + { + "epoch": 12.248624862486249, + "grad_norm": 0.28906184434890747, + "learning_rate": 1.9595336409912346e-05, + "loss": 0.0096, + "num_input_tokens_seen": 23498304, + "step": 111340 + }, + { + "epoch": 12.249174917491748, + "grad_norm": 0.06518832594156265, + "learning_rate": 1.959299312371049e-05, + "loss": 0.1033, + "num_input_tokens_seen": 23499392, + "step": 111345 + }, + { + "epoch": 12.24972497249725, + "grad_norm": 0.22214855253696442, + "learning_rate": 1.9590649887342418e-05, + "loss": 0.0227, + "num_input_tokens_seen": 23500384, + "step": 111350 + }, + { + "epoch": 12.25027502750275, + "grad_norm": 0.04196471348404884, + "learning_rate": 1.958830670082972e-05, + "loss": 0.0043, + "num_input_tokens_seen": 23501504, + "step": 111355 + }, + { + "epoch": 12.250825082508252, + "grad_norm": 0.49233901500701904, + "learning_rate": 1.9585963564194017e-05, + "loss": 0.0216, + "num_input_tokens_seen": 23502528, + "step": 111360 + }, + { + "epoch": 12.251375137513751, + "grad_norm": 0.018693946301937103, + "learning_rate": 1.9583620477456878e-05, + "loss": 0.002, + "num_input_tokens_seen": 23503584, + "step": 111365 + }, + { + "epoch": 12.251925192519252, + "grad_norm": 0.05629327520728111, + "learning_rate": 1.958127744063991e-05, + "loss": 0.0421, + "num_input_tokens_seen": 23504640, + "step": 111370 + }, + { + "epoch": 12.252475247524753, + "grad_norm": 0.00643382640555501, + "learning_rate": 1.9578934453764718e-05, + "loss": 0.0043, + "num_input_tokens_seen": 23505664, + "step": 111375 + }, + { + "epoch": 12.253025302530252, + "grad_norm": 1.111213207244873, + "learning_rate": 1.9576591516852878e-05, + "loss": 0.119, + "num_input_tokens_seen": 23506752, + "step": 111380 + }, + { + "epoch": 12.253575357535754, + "grad_norm": 0.6281939744949341, + "learning_rate": 1.9574248629925993e-05, + "loss": 0.052, + "num_input_tokens_seen": 23507744, + "step": 111385 + }, + { + "epoch": 12.254125412541255, + "grad_norm": 1.3062057495117188, + "learning_rate": 1.957190579300565e-05, + "loss": 0.0338, + "num_input_tokens_seen": 23508768, + "step": 111390 + }, + { + "epoch": 12.254675467546754, + "grad_norm": 0.16693352162837982, + "learning_rate": 1.956956300611346e-05, + "loss": 0.0351, + "num_input_tokens_seen": 23509856, + "step": 111395 + }, + { + "epoch": 12.255225522552255, + "grad_norm": 0.03673608973622322, + "learning_rate": 1.956722026927099e-05, + "loss": 0.0761, + "num_input_tokens_seen": 23510912, + "step": 111400 + }, + { + "epoch": 12.255775577557756, + "grad_norm": 1.3450270891189575, + "learning_rate": 1.9564877582499845e-05, + "loss": 0.0103, + "num_input_tokens_seen": 23511968, + "step": 111405 + }, + { + "epoch": 12.256325632563255, + "grad_norm": 0.39107799530029297, + "learning_rate": 1.956253494582162e-05, + "loss": 0.081, + "num_input_tokens_seen": 23512992, + "step": 111410 + }, + { + "epoch": 12.256875687568757, + "grad_norm": 0.07453873753547668, + "learning_rate": 1.9560192359257906e-05, + "loss": 0.0079, + "num_input_tokens_seen": 23513984, + "step": 111415 + }, + { + "epoch": 12.257425742574258, + "grad_norm": 0.003671528771519661, + "learning_rate": 1.9557849822830277e-05, + "loss": 0.0158, + "num_input_tokens_seen": 23515072, + "step": 111420 + }, + { + "epoch": 12.257975797579759, + "grad_norm": 0.0954749658703804, + "learning_rate": 1.955550733656034e-05, + "loss": 0.0378, + "num_input_tokens_seen": 23516128, + "step": 111425 + }, + { + "epoch": 12.258525852585258, + "grad_norm": 0.17773829400539398, + "learning_rate": 1.9553164900469677e-05, + "loss": 0.0029, + "num_input_tokens_seen": 23517216, + "step": 111430 + }, + { + "epoch": 12.25907590759076, + "grad_norm": 0.04328804463148117, + "learning_rate": 1.9550822514579887e-05, + "loss": 0.0045, + "num_input_tokens_seen": 23518240, + "step": 111435 + }, + { + "epoch": 12.25962596259626, + "grad_norm": 0.29245200753211975, + "learning_rate": 1.9548480178912545e-05, + "loss": 0.0426, + "num_input_tokens_seen": 23519264, + "step": 111440 + }, + { + "epoch": 12.26017601760176, + "grad_norm": 0.7417306303977966, + "learning_rate": 1.9546137893489242e-05, + "loss": 0.0097, + "num_input_tokens_seen": 23520288, + "step": 111445 + }, + { + "epoch": 12.26072607260726, + "grad_norm": 0.05202911049127579, + "learning_rate": 1.9543795658331578e-05, + "loss": 0.0023, + "num_input_tokens_seen": 23521344, + "step": 111450 + }, + { + "epoch": 12.261276127612762, + "grad_norm": 0.10595200210809708, + "learning_rate": 1.954145347346112e-05, + "loss": 0.0942, + "num_input_tokens_seen": 23522400, + "step": 111455 + }, + { + "epoch": 12.261826182618261, + "grad_norm": 0.10246240347623825, + "learning_rate": 1.9539111338899475e-05, + "loss": 0.0693, + "num_input_tokens_seen": 23523424, + "step": 111460 + }, + { + "epoch": 12.262376237623762, + "grad_norm": 0.023145444691181183, + "learning_rate": 1.953676925466822e-05, + "loss": 0.0091, + "num_input_tokens_seen": 23524512, + "step": 111465 + }, + { + "epoch": 12.262926292629263, + "grad_norm": 0.038166921585798264, + "learning_rate": 1.953442722078893e-05, + "loss": 0.0029, + "num_input_tokens_seen": 23525504, + "step": 111470 + }, + { + "epoch": 12.263476347634764, + "grad_norm": 0.13615944981575012, + "learning_rate": 1.953208523728322e-05, + "loss": 0.05, + "num_input_tokens_seen": 23526528, + "step": 111475 + }, + { + "epoch": 12.264026402640264, + "grad_norm": 2.4010586738586426, + "learning_rate": 1.9529743304172643e-05, + "loss": 0.0496, + "num_input_tokens_seen": 23527616, + "step": 111480 + }, + { + "epoch": 12.264576457645765, + "grad_norm": 0.7888060808181763, + "learning_rate": 1.9527401421478802e-05, + "loss": 0.0936, + "num_input_tokens_seen": 23528640, + "step": 111485 + }, + { + "epoch": 12.265126512651266, + "grad_norm": 3.3760547637939453, + "learning_rate": 1.952505958922328e-05, + "loss": 0.0966, + "num_input_tokens_seen": 23529696, + "step": 111490 + }, + { + "epoch": 12.265676567656765, + "grad_norm": 0.06926446408033371, + "learning_rate": 1.9522717807427647e-05, + "loss": 0.0114, + "num_input_tokens_seen": 23530720, + "step": 111495 + }, + { + "epoch": 12.266226622662266, + "grad_norm": 0.14127206802368164, + "learning_rate": 1.9520376076113504e-05, + "loss": 0.0021, + "num_input_tokens_seen": 23531776, + "step": 111500 + }, + { + "epoch": 12.266776677667767, + "grad_norm": 0.032115038484334946, + "learning_rate": 1.9518034395302414e-05, + "loss": 0.044, + "num_input_tokens_seen": 23532832, + "step": 111505 + }, + { + "epoch": 12.267326732673267, + "grad_norm": 0.050578124821186066, + "learning_rate": 1.9515692765015985e-05, + "loss": 0.0018, + "num_input_tokens_seen": 23533856, + "step": 111510 + }, + { + "epoch": 12.267876787678768, + "grad_norm": 0.15985552966594696, + "learning_rate": 1.951335118527578e-05, + "loss": 0.0061, + "num_input_tokens_seen": 23534880, + "step": 111515 + }, + { + "epoch": 12.268426842684269, + "grad_norm": 0.031501106917858124, + "learning_rate": 1.9511009656103377e-05, + "loss": 0.0543, + "num_input_tokens_seen": 23535904, + "step": 111520 + }, + { + "epoch": 12.268976897689768, + "grad_norm": 0.12487692385911942, + "learning_rate": 1.950866817752037e-05, + "loss": 0.0105, + "num_input_tokens_seen": 23536928, + "step": 111525 + }, + { + "epoch": 12.26952695269527, + "grad_norm": 0.006101865321397781, + "learning_rate": 1.950632674954833e-05, + "loss": 0.0232, + "num_input_tokens_seen": 23537920, + "step": 111530 + }, + { + "epoch": 12.27007700770077, + "grad_norm": 0.21515901386737823, + "learning_rate": 1.950398537220885e-05, + "loss": 0.0302, + "num_input_tokens_seen": 23539040, + "step": 111535 + }, + { + "epoch": 12.270627062706271, + "grad_norm": 0.0653003454208374, + "learning_rate": 1.950164404552349e-05, + "loss": 0.125, + "num_input_tokens_seen": 23540096, + "step": 111540 + }, + { + "epoch": 12.27117711771177, + "grad_norm": 0.27864786982536316, + "learning_rate": 1.9499302769513837e-05, + "loss": 0.087, + "num_input_tokens_seen": 23541088, + "step": 111545 + }, + { + "epoch": 12.271727172717272, + "grad_norm": 0.020829275250434875, + "learning_rate": 1.949696154420148e-05, + "loss": 0.0701, + "num_input_tokens_seen": 23542112, + "step": 111550 + }, + { + "epoch": 12.272277227722773, + "grad_norm": 0.07504879683256149, + "learning_rate": 1.949462036960798e-05, + "loss": 0.029, + "num_input_tokens_seen": 23543168, + "step": 111555 + }, + { + "epoch": 12.272827282728272, + "grad_norm": 1.5777188539505005, + "learning_rate": 1.9492279245754923e-05, + "loss": 0.0877, + "num_input_tokens_seen": 23544224, + "step": 111560 + }, + { + "epoch": 12.273377337733773, + "grad_norm": 0.01655750349164009, + "learning_rate": 1.9489938172663897e-05, + "loss": 0.0014, + "num_input_tokens_seen": 23545248, + "step": 111565 + }, + { + "epoch": 12.273927392739274, + "grad_norm": 0.08863350749015808, + "learning_rate": 1.948759715035645e-05, + "loss": 0.0354, + "num_input_tokens_seen": 23546240, + "step": 111570 + }, + { + "epoch": 12.274477447744774, + "grad_norm": 1.3980785608291626, + "learning_rate": 1.9485256178854182e-05, + "loss": 0.0464, + "num_input_tokens_seen": 23547328, + "step": 111575 + }, + { + "epoch": 12.275027502750275, + "grad_norm": 1.3081647157669067, + "learning_rate": 1.9482915258178658e-05, + "loss": 0.0538, + "num_input_tokens_seen": 23548352, + "step": 111580 + }, + { + "epoch": 12.275577557755776, + "grad_norm": 0.020168889313936234, + "learning_rate": 1.9480574388351464e-05, + "loss": 0.0094, + "num_input_tokens_seen": 23549408, + "step": 111585 + }, + { + "epoch": 12.276127612761275, + "grad_norm": 0.232477068901062, + "learning_rate": 1.9478233569394168e-05, + "loss": 0.0089, + "num_input_tokens_seen": 23550432, + "step": 111590 + }, + { + "epoch": 12.276677667766776, + "grad_norm": 0.030403701588511467, + "learning_rate": 1.9475892801328336e-05, + "loss": 0.0015, + "num_input_tokens_seen": 23551392, + "step": 111595 + }, + { + "epoch": 12.277227722772277, + "grad_norm": 1.0948841571807861, + "learning_rate": 1.9473552084175554e-05, + "loss": 0.1733, + "num_input_tokens_seen": 23552416, + "step": 111600 + }, + { + "epoch": 12.277777777777779, + "grad_norm": 0.111507348716259, + "learning_rate": 1.9471211417957394e-05, + "loss": 0.0491, + "num_input_tokens_seen": 23553536, + "step": 111605 + }, + { + "epoch": 12.278327832783278, + "grad_norm": 0.15634487569332123, + "learning_rate": 1.9468870802695415e-05, + "loss": 0.0238, + "num_input_tokens_seen": 23554528, + "step": 111610 + }, + { + "epoch": 12.278877887788779, + "grad_norm": 0.27220070362091064, + "learning_rate": 1.9466530238411205e-05, + "loss": 0.1534, + "num_input_tokens_seen": 23555616, + "step": 111615 + }, + { + "epoch": 12.27942794279428, + "grad_norm": 1.6004170179367065, + "learning_rate": 1.9464189725126328e-05, + "loss": 0.0807, + "num_input_tokens_seen": 23556704, + "step": 111620 + }, + { + "epoch": 12.27997799779978, + "grad_norm": 0.02328583225607872, + "learning_rate": 1.9461849262862365e-05, + "loss": 0.0904, + "num_input_tokens_seen": 23557824, + "step": 111625 + }, + { + "epoch": 12.28052805280528, + "grad_norm": 1.9879831075668335, + "learning_rate": 1.945950885164088e-05, + "loss": 0.2218, + "num_input_tokens_seen": 23558912, + "step": 111630 + }, + { + "epoch": 12.281078107810782, + "grad_norm": 0.03833777830004692, + "learning_rate": 1.9457168491483434e-05, + "loss": 0.0183, + "num_input_tokens_seen": 23559936, + "step": 111635 + }, + { + "epoch": 12.281628162816281, + "grad_norm": 0.05423051863908768, + "learning_rate": 1.9454828182411617e-05, + "loss": 0.0293, + "num_input_tokens_seen": 23560928, + "step": 111640 + }, + { + "epoch": 12.282178217821782, + "grad_norm": 0.21344077587127686, + "learning_rate": 1.9452487924446973e-05, + "loss": 0.007, + "num_input_tokens_seen": 23561952, + "step": 111645 + }, + { + "epoch": 12.282728272827283, + "grad_norm": 0.18135711550712585, + "learning_rate": 1.94501477176111e-05, + "loss": 0.0033, + "num_input_tokens_seen": 23563008, + "step": 111650 + }, + { + "epoch": 12.283278327832782, + "grad_norm": 0.043531838804483414, + "learning_rate": 1.944780756192555e-05, + "loss": 0.004, + "num_input_tokens_seen": 23564032, + "step": 111655 + }, + { + "epoch": 12.283828382838283, + "grad_norm": 0.21732033789157867, + "learning_rate": 1.9445467457411888e-05, + "loss": 0.0093, + "num_input_tokens_seen": 23565120, + "step": 111660 + }, + { + "epoch": 12.284378437843785, + "grad_norm": 1.4990122318267822, + "learning_rate": 1.9443127404091695e-05, + "loss": 0.0517, + "num_input_tokens_seen": 23566112, + "step": 111665 + }, + { + "epoch": 12.284928492849286, + "grad_norm": 0.7005138993263245, + "learning_rate": 1.944078740198652e-05, + "loss": 0.0102, + "num_input_tokens_seen": 23567200, + "step": 111670 + }, + { + "epoch": 12.285478547854785, + "grad_norm": 0.06914503872394562, + "learning_rate": 1.9438447451117945e-05, + "loss": 0.0029, + "num_input_tokens_seen": 23568256, + "step": 111675 + }, + { + "epoch": 12.286028602860286, + "grad_norm": 0.10390513390302658, + "learning_rate": 1.9436107551507537e-05, + "loss": 0.0058, + "num_input_tokens_seen": 23569312, + "step": 111680 + }, + { + "epoch": 12.286578657865787, + "grad_norm": 0.013547536917030811, + "learning_rate": 1.9433767703176846e-05, + "loss": 0.0028, + "num_input_tokens_seen": 23570400, + "step": 111685 + }, + { + "epoch": 12.287128712871286, + "grad_norm": 0.656035304069519, + "learning_rate": 1.9431427906147453e-05, + "loss": 0.0505, + "num_input_tokens_seen": 23571488, + "step": 111690 + }, + { + "epoch": 12.287678767876788, + "grad_norm": 0.0510450042784214, + "learning_rate": 1.9429088160440912e-05, + "loss": 0.0041, + "num_input_tokens_seen": 23572512, + "step": 111695 + }, + { + "epoch": 12.288228822882289, + "grad_norm": 0.01922447979450226, + "learning_rate": 1.94267484660788e-05, + "loss": 0.037, + "num_input_tokens_seen": 23573600, + "step": 111700 + }, + { + "epoch": 12.288778877887788, + "grad_norm": 0.030417371541261673, + "learning_rate": 1.9424408823082672e-05, + "loss": 0.007, + "num_input_tokens_seen": 23574688, + "step": 111705 + }, + { + "epoch": 12.289328932893289, + "grad_norm": 0.09726843982934952, + "learning_rate": 1.9422069231474084e-05, + "loss": 0.0068, + "num_input_tokens_seen": 23575744, + "step": 111710 + }, + { + "epoch": 12.28987898789879, + "grad_norm": 0.020541708916425705, + "learning_rate": 1.9419729691274612e-05, + "loss": 0.1276, + "num_input_tokens_seen": 23576736, + "step": 111715 + }, + { + "epoch": 12.290429042904291, + "grad_norm": 2.5234179496765137, + "learning_rate": 1.941739020250581e-05, + "loss": 0.0536, + "num_input_tokens_seen": 23577760, + "step": 111720 + }, + { + "epoch": 12.29097909790979, + "grad_norm": 0.27655717730522156, + "learning_rate": 1.9415050765189252e-05, + "loss": 0.0162, + "num_input_tokens_seen": 23578784, + "step": 111725 + }, + { + "epoch": 12.291529152915292, + "grad_norm": 0.03696785494685173, + "learning_rate": 1.9412711379346492e-05, + "loss": 0.0077, + "num_input_tokens_seen": 23579808, + "step": 111730 + }, + { + "epoch": 12.292079207920793, + "grad_norm": 0.10223780572414398, + "learning_rate": 1.941037204499908e-05, + "loss": 0.0273, + "num_input_tokens_seen": 23580832, + "step": 111735 + }, + { + "epoch": 12.292629262926292, + "grad_norm": 0.08683915436267853, + "learning_rate": 1.9408032762168596e-05, + "loss": 0.092, + "num_input_tokens_seen": 23581856, + "step": 111740 + }, + { + "epoch": 12.293179317931793, + "grad_norm": 0.12579397857189178, + "learning_rate": 1.9405693530876586e-05, + "loss": 0.0035, + "num_input_tokens_seen": 23582880, + "step": 111745 + }, + { + "epoch": 12.293729372937294, + "grad_norm": 0.08134254068136215, + "learning_rate": 1.9403354351144612e-05, + "loss": 0.0025, + "num_input_tokens_seen": 23583968, + "step": 111750 + }, + { + "epoch": 12.294279427942794, + "grad_norm": 0.014364717528223991, + "learning_rate": 1.9401015222994245e-05, + "loss": 0.0097, + "num_input_tokens_seen": 23584992, + "step": 111755 + }, + { + "epoch": 12.294829482948295, + "grad_norm": 0.44942858815193176, + "learning_rate": 1.9398676146447018e-05, + "loss": 0.0509, + "num_input_tokens_seen": 23585952, + "step": 111760 + }, + { + "epoch": 12.295379537953796, + "grad_norm": 0.13540905714035034, + "learning_rate": 1.9396337121524522e-05, + "loss": 0.005, + "num_input_tokens_seen": 23587008, + "step": 111765 + }, + { + "epoch": 12.295929592959295, + "grad_norm": 0.5409436225891113, + "learning_rate": 1.9393998148248285e-05, + "loss": 0.0082, + "num_input_tokens_seen": 23588064, + "step": 111770 + }, + { + "epoch": 12.296479647964796, + "grad_norm": 0.017983712255954742, + "learning_rate": 1.939165922663988e-05, + "loss": 0.0078, + "num_input_tokens_seen": 23589152, + "step": 111775 + }, + { + "epoch": 12.297029702970297, + "grad_norm": 0.024708257988095284, + "learning_rate": 1.9389320356720874e-05, + "loss": 0.0077, + "num_input_tokens_seen": 23590144, + "step": 111780 + }, + { + "epoch": 12.297579757975798, + "grad_norm": 0.12607617676258087, + "learning_rate": 1.9386981538512794e-05, + "loss": 0.051, + "num_input_tokens_seen": 23591232, + "step": 111785 + }, + { + "epoch": 12.298129812981298, + "grad_norm": 0.08365999162197113, + "learning_rate": 1.9384642772037218e-05, + "loss": 0.0024, + "num_input_tokens_seen": 23592320, + "step": 111790 + }, + { + "epoch": 12.298679867986799, + "grad_norm": 0.3675709664821625, + "learning_rate": 1.93823040573157e-05, + "loss": 0.0565, + "num_input_tokens_seen": 23593312, + "step": 111795 + }, + { + "epoch": 12.2992299229923, + "grad_norm": 0.1145833432674408, + "learning_rate": 1.937996539436978e-05, + "loss": 0.0098, + "num_input_tokens_seen": 23594368, + "step": 111800 + }, + { + "epoch": 12.2997799779978, + "grad_norm": 1.2131953239440918, + "learning_rate": 1.9377626783221026e-05, + "loss": 0.1354, + "num_input_tokens_seen": 23595392, + "step": 111805 + }, + { + "epoch": 12.3003300330033, + "grad_norm": 0.014408892020583153, + "learning_rate": 1.9375288223890985e-05, + "loss": 0.0135, + "num_input_tokens_seen": 23596480, + "step": 111810 + }, + { + "epoch": 12.300880088008801, + "grad_norm": 0.8162006139755249, + "learning_rate": 1.9372949716401222e-05, + "loss": 0.0149, + "num_input_tokens_seen": 23597536, + "step": 111815 + }, + { + "epoch": 12.3014301430143, + "grad_norm": 0.007331563625484705, + "learning_rate": 1.9370611260773276e-05, + "loss": 0.014, + "num_input_tokens_seen": 23598624, + "step": 111820 + }, + { + "epoch": 12.301980198019802, + "grad_norm": 2.731835126876831, + "learning_rate": 1.93682728570287e-05, + "loss": 0.0398, + "num_input_tokens_seen": 23599680, + "step": 111825 + }, + { + "epoch": 12.302530253025303, + "grad_norm": 0.017052263021469116, + "learning_rate": 1.936593450518906e-05, + "loss": 0.0297, + "num_input_tokens_seen": 23600672, + "step": 111830 + }, + { + "epoch": 12.303080308030804, + "grad_norm": 0.10388454049825668, + "learning_rate": 1.9363596205275888e-05, + "loss": 0.028, + "num_input_tokens_seen": 23601728, + "step": 111835 + }, + { + "epoch": 12.303630363036303, + "grad_norm": 0.6634169816970825, + "learning_rate": 1.9361257957310758e-05, + "loss": 0.0047, + "num_input_tokens_seen": 23602752, + "step": 111840 + }, + { + "epoch": 12.304180418041804, + "grad_norm": 0.05391551926732063, + "learning_rate": 1.93589197613152e-05, + "loss": 0.0201, + "num_input_tokens_seen": 23603808, + "step": 111845 + }, + { + "epoch": 12.304730473047305, + "grad_norm": 4.140171527862549, + "learning_rate": 1.9356581617310766e-05, + "loss": 0.0809, + "num_input_tokens_seen": 23604960, + "step": 111850 + }, + { + "epoch": 12.305280528052805, + "grad_norm": 0.025276124477386475, + "learning_rate": 1.9354243525319025e-05, + "loss": 0.004, + "num_input_tokens_seen": 23606048, + "step": 111855 + }, + { + "epoch": 12.305830583058306, + "grad_norm": 0.02931048348546028, + "learning_rate": 1.93519054853615e-05, + "loss": 0.0019, + "num_input_tokens_seen": 23607104, + "step": 111860 + }, + { + "epoch": 12.306380638063807, + "grad_norm": 0.05744883790612221, + "learning_rate": 1.9349567497459758e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23608128, + "step": 111865 + }, + { + "epoch": 12.306930693069306, + "grad_norm": 0.46032559871673584, + "learning_rate": 1.9347229561635346e-05, + "loss": 0.0048, + "num_input_tokens_seen": 23609216, + "step": 111870 + }, + { + "epoch": 12.307480748074807, + "grad_norm": 0.9953637719154358, + "learning_rate": 1.93448916779098e-05, + "loss": 0.0412, + "num_input_tokens_seen": 23610240, + "step": 111875 + }, + { + "epoch": 12.308030803080309, + "grad_norm": 0.0813576802611351, + "learning_rate": 1.9342553846304673e-05, + "loss": 0.0131, + "num_input_tokens_seen": 23611296, + "step": 111880 + }, + { + "epoch": 12.308580858085808, + "grad_norm": 0.04504280164837837, + "learning_rate": 1.934021606684151e-05, + "loss": 0.0374, + "num_input_tokens_seen": 23612384, + "step": 111885 + }, + { + "epoch": 12.309130913091309, + "grad_norm": 0.07537494599819183, + "learning_rate": 1.933787833954186e-05, + "loss": 0.0049, + "num_input_tokens_seen": 23613408, + "step": 111890 + }, + { + "epoch": 12.30968096809681, + "grad_norm": 2.0456292629241943, + "learning_rate": 1.933554066442728e-05, + "loss": 0.0783, + "num_input_tokens_seen": 23614464, + "step": 111895 + }, + { + "epoch": 12.310231023102311, + "grad_norm": 0.006484316196292639, + "learning_rate": 1.9333203041519293e-05, + "loss": 0.0067, + "num_input_tokens_seen": 23615520, + "step": 111900 + }, + { + "epoch": 12.31078107810781, + "grad_norm": 0.09582455456256866, + "learning_rate": 1.9330865470839458e-05, + "loss": 0.1526, + "num_input_tokens_seen": 23616544, + "step": 111905 + }, + { + "epoch": 12.311331133113312, + "grad_norm": 1.313324213027954, + "learning_rate": 1.9328527952409308e-05, + "loss": 0.0367, + "num_input_tokens_seen": 23617568, + "step": 111910 + }, + { + "epoch": 12.311881188118813, + "grad_norm": 0.9346924424171448, + "learning_rate": 1.9326190486250408e-05, + "loss": 0.0267, + "num_input_tokens_seen": 23618592, + "step": 111915 + }, + { + "epoch": 12.312431243124312, + "grad_norm": 0.07327421754598618, + "learning_rate": 1.9323853072384283e-05, + "loss": 0.0931, + "num_input_tokens_seen": 23619648, + "step": 111920 + }, + { + "epoch": 12.312981298129813, + "grad_norm": 0.009645669721066952, + "learning_rate": 1.932151571083247e-05, + "loss": 0.0048, + "num_input_tokens_seen": 23620672, + "step": 111925 + }, + { + "epoch": 12.313531353135314, + "grad_norm": 3.178403854370117, + "learning_rate": 1.9319178401616538e-05, + "loss": 0.1134, + "num_input_tokens_seen": 23621696, + "step": 111930 + }, + { + "epoch": 12.314081408140813, + "grad_norm": 0.05022364482283592, + "learning_rate": 1.9316841144758002e-05, + "loss": 0.0053, + "num_input_tokens_seen": 23622816, + "step": 111935 + }, + { + "epoch": 12.314631463146315, + "grad_norm": 0.48232564330101013, + "learning_rate": 1.9314503940278418e-05, + "loss": 0.064, + "num_input_tokens_seen": 23623904, + "step": 111940 + }, + { + "epoch": 12.315181518151816, + "grad_norm": 0.010165400803089142, + "learning_rate": 1.9312166788199328e-05, + "loss": 0.0258, + "num_input_tokens_seen": 23624992, + "step": 111945 + }, + { + "epoch": 12.315731573157315, + "grad_norm": 0.008293481543660164, + "learning_rate": 1.9309829688542253e-05, + "loss": 0.0147, + "num_input_tokens_seen": 23626048, + "step": 111950 + }, + { + "epoch": 12.316281628162816, + "grad_norm": 0.09562880545854568, + "learning_rate": 1.9307492641328762e-05, + "loss": 0.0197, + "num_input_tokens_seen": 23627072, + "step": 111955 + }, + { + "epoch": 12.316831683168317, + "grad_norm": 0.27447929978370667, + "learning_rate": 1.9305155646580367e-05, + "loss": 0.0099, + "num_input_tokens_seen": 23628192, + "step": 111960 + }, + { + "epoch": 12.317381738173818, + "grad_norm": 0.015669358894228935, + "learning_rate": 1.9302818704318626e-05, + "loss": 0.0246, + "num_input_tokens_seen": 23629216, + "step": 111965 + }, + { + "epoch": 12.317931793179318, + "grad_norm": 0.027773266658186913, + "learning_rate": 1.930048181456508e-05, + "loss": 0.0305, + "num_input_tokens_seen": 23630208, + "step": 111970 + }, + { + "epoch": 12.318481848184819, + "grad_norm": 0.04874830320477486, + "learning_rate": 1.9298144977341242e-05, + "loss": 0.0075, + "num_input_tokens_seen": 23631200, + "step": 111975 + }, + { + "epoch": 12.31903190319032, + "grad_norm": 0.09508305042982101, + "learning_rate": 1.9295808192668677e-05, + "loss": 0.0173, + "num_input_tokens_seen": 23632224, + "step": 111980 + }, + { + "epoch": 12.319581958195819, + "grad_norm": 0.283621221780777, + "learning_rate": 1.929347146056891e-05, + "loss": 0.0474, + "num_input_tokens_seen": 23633280, + "step": 111985 + }, + { + "epoch": 12.32013201320132, + "grad_norm": 1.3815420866012573, + "learning_rate": 1.9291134781063475e-05, + "loss": 0.0874, + "num_input_tokens_seen": 23634304, + "step": 111990 + }, + { + "epoch": 12.320682068206821, + "grad_norm": 0.027602557092905045, + "learning_rate": 1.9288798154173913e-05, + "loss": 0.016, + "num_input_tokens_seen": 23635360, + "step": 111995 + }, + { + "epoch": 12.32123212321232, + "grad_norm": 0.021474771201610565, + "learning_rate": 1.928646157992175e-05, + "loss": 0.0394, + "num_input_tokens_seen": 23636416, + "step": 112000 + }, + { + "epoch": 12.321782178217822, + "grad_norm": 0.3680967092514038, + "learning_rate": 1.928412505832854e-05, + "loss": 0.0548, + "num_input_tokens_seen": 23637536, + "step": 112005 + }, + { + "epoch": 12.322332233223323, + "grad_norm": 0.6201640367507935, + "learning_rate": 1.9281788589415804e-05, + "loss": 0.009, + "num_input_tokens_seen": 23638560, + "step": 112010 + }, + { + "epoch": 12.322882288228822, + "grad_norm": 0.015735233202576637, + "learning_rate": 1.9279452173205078e-05, + "loss": 0.0014, + "num_input_tokens_seen": 23639552, + "step": 112015 + }, + { + "epoch": 12.323432343234323, + "grad_norm": 1.4469070434570312, + "learning_rate": 1.9277115809717893e-05, + "loss": 0.1523, + "num_input_tokens_seen": 23640544, + "step": 112020 + }, + { + "epoch": 12.323982398239824, + "grad_norm": 0.032510045915842056, + "learning_rate": 1.9274779498975782e-05, + "loss": 0.0037, + "num_input_tokens_seen": 23641632, + "step": 112025 + }, + { + "epoch": 12.324532453245325, + "grad_norm": 0.03337704390287399, + "learning_rate": 1.9272443241000297e-05, + "loss": 0.0155, + "num_input_tokens_seen": 23642720, + "step": 112030 + }, + { + "epoch": 12.325082508250825, + "grad_norm": 0.13406944274902344, + "learning_rate": 1.9270107035812942e-05, + "loss": 0.002, + "num_input_tokens_seen": 23643776, + "step": 112035 + }, + { + "epoch": 12.325632563256326, + "grad_norm": 0.504833996295929, + "learning_rate": 1.926777088343526e-05, + "loss": 0.0067, + "num_input_tokens_seen": 23644864, + "step": 112040 + }, + { + "epoch": 12.326182618261827, + "grad_norm": 0.15121228992938995, + "learning_rate": 1.9265434783888792e-05, + "loss": 0.0088, + "num_input_tokens_seen": 23645984, + "step": 112045 + }, + { + "epoch": 12.326732673267326, + "grad_norm": 0.7675992250442505, + "learning_rate": 1.926309873719505e-05, + "loss": 0.0096, + "num_input_tokens_seen": 23647104, + "step": 112050 + }, + { + "epoch": 12.327282728272827, + "grad_norm": 0.017457829788327217, + "learning_rate": 1.926076274337558e-05, + "loss": 0.0012, + "num_input_tokens_seen": 23648128, + "step": 112055 + }, + { + "epoch": 12.327832783278328, + "grad_norm": 0.3931926190853119, + "learning_rate": 1.9258426802451912e-05, + "loss": 0.0219, + "num_input_tokens_seen": 23649184, + "step": 112060 + }, + { + "epoch": 12.328382838283828, + "grad_norm": 0.006950932089239359, + "learning_rate": 1.925609091444555e-05, + "loss": 0.0038, + "num_input_tokens_seen": 23650176, + "step": 112065 + }, + { + "epoch": 12.328932893289329, + "grad_norm": 0.9580169916152954, + "learning_rate": 1.9253755079378062e-05, + "loss": 0.0184, + "num_input_tokens_seen": 23651200, + "step": 112070 + }, + { + "epoch": 12.32948294829483, + "grad_norm": 0.061920613050460815, + "learning_rate": 1.9251419297270938e-05, + "loss": 0.023, + "num_input_tokens_seen": 23652288, + "step": 112075 + }, + { + "epoch": 12.33003300330033, + "grad_norm": 0.12956520915031433, + "learning_rate": 1.924908356814574e-05, + "loss": 0.1128, + "num_input_tokens_seen": 23653312, + "step": 112080 + }, + { + "epoch": 12.33058305830583, + "grad_norm": 0.046273354440927505, + "learning_rate": 1.9246747892023975e-05, + "loss": 0.0073, + "num_input_tokens_seen": 23654368, + "step": 112085 + }, + { + "epoch": 12.331133113311331, + "grad_norm": 0.08752616494894028, + "learning_rate": 1.9244412268927168e-05, + "loss": 0.0191, + "num_input_tokens_seen": 23655456, + "step": 112090 + }, + { + "epoch": 12.331683168316832, + "grad_norm": 0.022264748811721802, + "learning_rate": 1.9242076698876854e-05, + "loss": 0.0034, + "num_input_tokens_seen": 23656512, + "step": 112095 + }, + { + "epoch": 12.332233223322332, + "grad_norm": 0.027843205258250237, + "learning_rate": 1.9239741181894556e-05, + "loss": 0.0053, + "num_input_tokens_seen": 23657632, + "step": 112100 + }, + { + "epoch": 12.332783278327833, + "grad_norm": 0.043701495975255966, + "learning_rate": 1.9237405718001806e-05, + "loss": 0.0839, + "num_input_tokens_seen": 23658656, + "step": 112105 + }, + { + "epoch": 12.333333333333334, + "grad_norm": 1.0038397312164307, + "learning_rate": 1.9235070307220117e-05, + "loss": 0.0134, + "num_input_tokens_seen": 23659712, + "step": 112110 + }, + { + "epoch": 12.333883388338833, + "grad_norm": 0.1168970912694931, + "learning_rate": 1.923273494957101e-05, + "loss": 0.003, + "num_input_tokens_seen": 23660768, + "step": 112115 + }, + { + "epoch": 12.334433443344334, + "grad_norm": 0.20618678629398346, + "learning_rate": 1.9230399645076036e-05, + "loss": 0.0038, + "num_input_tokens_seen": 23661888, + "step": 112120 + }, + { + "epoch": 12.334983498349835, + "grad_norm": 0.07415329664945602, + "learning_rate": 1.9228064393756677e-05, + "loss": 0.0053, + "num_input_tokens_seen": 23662880, + "step": 112125 + }, + { + "epoch": 12.335533553355335, + "grad_norm": 0.028254808858036995, + "learning_rate": 1.92257291956345e-05, + "loss": 0.0045, + "num_input_tokens_seen": 23663936, + "step": 112130 + }, + { + "epoch": 12.336083608360836, + "grad_norm": 0.04750978946685791, + "learning_rate": 1.9223394050731e-05, + "loss": 0.0213, + "num_input_tokens_seen": 23664992, + "step": 112135 + }, + { + "epoch": 12.336633663366337, + "grad_norm": 0.749151885509491, + "learning_rate": 1.9221058959067698e-05, + "loss": 0.0071, + "num_input_tokens_seen": 23666016, + "step": 112140 + }, + { + "epoch": 12.337183718371838, + "grad_norm": 0.04352365806698799, + "learning_rate": 1.9218723920666134e-05, + "loss": 0.0545, + "num_input_tokens_seen": 23667072, + "step": 112145 + }, + { + "epoch": 12.337733773377337, + "grad_norm": 0.005021955352276564, + "learning_rate": 1.9216388935547807e-05, + "loss": 0.0105, + "num_input_tokens_seen": 23668192, + "step": 112150 + }, + { + "epoch": 12.338283828382838, + "grad_norm": 0.12915632128715515, + "learning_rate": 1.9214054003734254e-05, + "loss": 0.0122, + "num_input_tokens_seen": 23669248, + "step": 112155 + }, + { + "epoch": 12.33883388338834, + "grad_norm": 0.31273841857910156, + "learning_rate": 1.921171912524699e-05, + "loss": 0.0126, + "num_input_tokens_seen": 23670368, + "step": 112160 + }, + { + "epoch": 12.339383938393839, + "grad_norm": 0.044459421187639236, + "learning_rate": 1.9209384300107526e-05, + "loss": 0.0019, + "num_input_tokens_seen": 23671424, + "step": 112165 + }, + { + "epoch": 12.33993399339934, + "grad_norm": 0.02336876466870308, + "learning_rate": 1.9207049528337395e-05, + "loss": 0.0082, + "num_input_tokens_seen": 23672512, + "step": 112170 + }, + { + "epoch": 12.340484048404841, + "grad_norm": 0.027398958802223206, + "learning_rate": 1.9204714809958113e-05, + "loss": 0.1133, + "num_input_tokens_seen": 23673504, + "step": 112175 + }, + { + "epoch": 12.34103410341034, + "grad_norm": 0.02195039950311184, + "learning_rate": 1.9202380144991183e-05, + "loss": 0.0017, + "num_input_tokens_seen": 23674560, + "step": 112180 + }, + { + "epoch": 12.341584158415841, + "grad_norm": 0.05416739359498024, + "learning_rate": 1.9200045533458137e-05, + "loss": 0.0019, + "num_input_tokens_seen": 23675552, + "step": 112185 + }, + { + "epoch": 12.342134213421343, + "grad_norm": 0.05413444712758064, + "learning_rate": 1.9197710975380487e-05, + "loss": 0.0023, + "num_input_tokens_seen": 23676608, + "step": 112190 + }, + { + "epoch": 12.342684268426842, + "grad_norm": 0.010394756682217121, + "learning_rate": 1.9195376470779754e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23677696, + "step": 112195 + }, + { + "epoch": 12.343234323432343, + "grad_norm": 0.026216035708785057, + "learning_rate": 1.9193042019677455e-05, + "loss": 0.0022, + "num_input_tokens_seen": 23678752, + "step": 112200 + }, + { + "epoch": 12.343784378437844, + "grad_norm": 0.3566052317619324, + "learning_rate": 1.919070762209509e-05, + "loss": 0.1247, + "num_input_tokens_seen": 23679808, + "step": 112205 + }, + { + "epoch": 12.344334433443345, + "grad_norm": 0.011606224812567234, + "learning_rate": 1.918837327805419e-05, + "loss": 0.0361, + "num_input_tokens_seen": 23680800, + "step": 112210 + }, + { + "epoch": 12.344884488448844, + "grad_norm": 1.0653489828109741, + "learning_rate": 1.9186038987576262e-05, + "loss": 0.0166, + "num_input_tokens_seen": 23681888, + "step": 112215 + }, + { + "epoch": 12.345434543454346, + "grad_norm": 0.32593539357185364, + "learning_rate": 1.9183704750682833e-05, + "loss": 0.0097, + "num_input_tokens_seen": 23682912, + "step": 112220 + }, + { + "epoch": 12.345984598459847, + "grad_norm": 0.025391491129994392, + "learning_rate": 1.91813705673954e-05, + "loss": 0.117, + "num_input_tokens_seen": 23683936, + "step": 112225 + }, + { + "epoch": 12.346534653465346, + "grad_norm": 1.0528260469436646, + "learning_rate": 1.917903643773548e-05, + "loss": 0.0205, + "num_input_tokens_seen": 23684960, + "step": 112230 + }, + { + "epoch": 12.347084708470847, + "grad_norm": 0.4893682301044464, + "learning_rate": 1.9176702361724594e-05, + "loss": 0.0154, + "num_input_tokens_seen": 23686016, + "step": 112235 + }, + { + "epoch": 12.347634763476348, + "grad_norm": 0.5064583420753479, + "learning_rate": 1.917436833938424e-05, + "loss": 0.0059, + "num_input_tokens_seen": 23687040, + "step": 112240 + }, + { + "epoch": 12.348184818481847, + "grad_norm": 0.03307325020432472, + "learning_rate": 1.917203437073594e-05, + "loss": 0.0019, + "num_input_tokens_seen": 23688064, + "step": 112245 + }, + { + "epoch": 12.348734873487349, + "grad_norm": 0.0968216210603714, + "learning_rate": 1.916970045580121e-05, + "loss": 0.0073, + "num_input_tokens_seen": 23689120, + "step": 112250 + }, + { + "epoch": 12.34928492849285, + "grad_norm": 2.0940499305725098, + "learning_rate": 1.916736659460154e-05, + "loss": 0.1209, + "num_input_tokens_seen": 23690112, + "step": 112255 + }, + { + "epoch": 12.34983498349835, + "grad_norm": 0.04352913051843643, + "learning_rate": 1.916503278715847e-05, + "loss": 0.0012, + "num_input_tokens_seen": 23691200, + "step": 112260 + }, + { + "epoch": 12.35038503850385, + "grad_norm": 0.007021281868219376, + "learning_rate": 1.9162699033493478e-05, + "loss": 0.0436, + "num_input_tokens_seen": 23692256, + "step": 112265 + }, + { + "epoch": 12.350935093509351, + "grad_norm": 0.011578687466681004, + "learning_rate": 1.9160365333628095e-05, + "loss": 0.0088, + "num_input_tokens_seen": 23693216, + "step": 112270 + }, + { + "epoch": 12.351485148514852, + "grad_norm": 0.07095815241336823, + "learning_rate": 1.9158031687583826e-05, + "loss": 0.0466, + "num_input_tokens_seen": 23694304, + "step": 112275 + }, + { + "epoch": 12.352035203520352, + "grad_norm": 0.03551071137189865, + "learning_rate": 1.915569809538217e-05, + "loss": 0.0093, + "num_input_tokens_seen": 23695328, + "step": 112280 + }, + { + "epoch": 12.352585258525853, + "grad_norm": 0.03158121928572655, + "learning_rate": 1.9153364557044638e-05, + "loss": 0.0159, + "num_input_tokens_seen": 23696352, + "step": 112285 + }, + { + "epoch": 12.353135313531354, + "grad_norm": 0.06640642136335373, + "learning_rate": 1.915103107259274e-05, + "loss": 0.069, + "num_input_tokens_seen": 23697408, + "step": 112290 + }, + { + "epoch": 12.353685368536853, + "grad_norm": 0.094061940908432, + "learning_rate": 1.9148697642047993e-05, + "loss": 0.0064, + "num_input_tokens_seen": 23698400, + "step": 112295 + }, + { + "epoch": 12.354235423542354, + "grad_norm": 0.14589834213256836, + "learning_rate": 1.9146364265431884e-05, + "loss": 0.0659, + "num_input_tokens_seen": 23699456, + "step": 112300 + }, + { + "epoch": 12.354785478547855, + "grad_norm": 2.710508346557617, + "learning_rate": 1.9144030942765923e-05, + "loss": 0.0892, + "num_input_tokens_seen": 23700416, + "step": 112305 + }, + { + "epoch": 12.355335533553355, + "grad_norm": 4.960779666900635, + "learning_rate": 1.9141697674071623e-05, + "loss": 0.0353, + "num_input_tokens_seen": 23701504, + "step": 112310 + }, + { + "epoch": 12.355885588558856, + "grad_norm": 0.34515658020973206, + "learning_rate": 1.913936445937048e-05, + "loss": 0.0262, + "num_input_tokens_seen": 23702592, + "step": 112315 + }, + { + "epoch": 12.356435643564357, + "grad_norm": 0.09532682597637177, + "learning_rate": 1.9137031298684012e-05, + "loss": 0.0036, + "num_input_tokens_seen": 23703648, + "step": 112320 + }, + { + "epoch": 12.356985698569858, + "grad_norm": 0.023032797500491142, + "learning_rate": 1.913469819203371e-05, + "loss": 0.0283, + "num_input_tokens_seen": 23704768, + "step": 112325 + }, + { + "epoch": 12.357535753575357, + "grad_norm": 0.02063939720392227, + "learning_rate": 1.9132365139441072e-05, + "loss": 0.0495, + "num_input_tokens_seen": 23705760, + "step": 112330 + }, + { + "epoch": 12.358085808580858, + "grad_norm": 1.852968454360962, + "learning_rate": 1.9130032140927623e-05, + "loss": 0.0285, + "num_input_tokens_seen": 23706912, + "step": 112335 + }, + { + "epoch": 12.35863586358636, + "grad_norm": 0.1843467801809311, + "learning_rate": 1.9127699196514836e-05, + "loss": 0.0021, + "num_input_tokens_seen": 23708000, + "step": 112340 + }, + { + "epoch": 12.359185918591859, + "grad_norm": 0.048380330204963684, + "learning_rate": 1.912536630622424e-05, + "loss": 0.0012, + "num_input_tokens_seen": 23709024, + "step": 112345 + }, + { + "epoch": 12.35973597359736, + "grad_norm": 0.010293334722518921, + "learning_rate": 1.9123033470077322e-05, + "loss": 0.0078, + "num_input_tokens_seen": 23710048, + "step": 112350 + }, + { + "epoch": 12.36028602860286, + "grad_norm": 0.46971800923347473, + "learning_rate": 1.9120700688095578e-05, + "loss": 0.0045, + "num_input_tokens_seen": 23711200, + "step": 112355 + }, + { + "epoch": 12.36083608360836, + "grad_norm": 0.020493702962994576, + "learning_rate": 1.9118367960300517e-05, + "loss": 0.005, + "num_input_tokens_seen": 23712224, + "step": 112360 + }, + { + "epoch": 12.361386138613861, + "grad_norm": 0.026611091569066048, + "learning_rate": 1.9116035286713643e-05, + "loss": 0.0024, + "num_input_tokens_seen": 23713280, + "step": 112365 + }, + { + "epoch": 12.361936193619362, + "grad_norm": 1.779586672782898, + "learning_rate": 1.911370266735643e-05, + "loss": 0.1458, + "num_input_tokens_seen": 23714336, + "step": 112370 + }, + { + "epoch": 12.362486248624862, + "grad_norm": 0.009632192552089691, + "learning_rate": 1.911137010225042e-05, + "loss": 0.0134, + "num_input_tokens_seen": 23715456, + "step": 112375 + }, + { + "epoch": 12.363036303630363, + "grad_norm": 0.8039903044700623, + "learning_rate": 1.9109037591417063e-05, + "loss": 0.0061, + "num_input_tokens_seen": 23716576, + "step": 112380 + }, + { + "epoch": 12.363586358635864, + "grad_norm": 0.14447762072086334, + "learning_rate": 1.9106705134877896e-05, + "loss": 0.0287, + "num_input_tokens_seen": 23717600, + "step": 112385 + }, + { + "epoch": 12.364136413641365, + "grad_norm": 0.3596899211406708, + "learning_rate": 1.9104372732654395e-05, + "loss": 0.0963, + "num_input_tokens_seen": 23718688, + "step": 112390 + }, + { + "epoch": 12.364686468646864, + "grad_norm": 0.009376454167068005, + "learning_rate": 1.9102040384768057e-05, + "loss": 0.0108, + "num_input_tokens_seen": 23719680, + "step": 112395 + }, + { + "epoch": 12.365236523652365, + "grad_norm": 0.08634833991527557, + "learning_rate": 1.909970809124039e-05, + "loss": 0.0023, + "num_input_tokens_seen": 23720768, + "step": 112400 + }, + { + "epoch": 12.365786578657866, + "grad_norm": 0.01758180931210518, + "learning_rate": 1.909737585209287e-05, + "loss": 0.0718, + "num_input_tokens_seen": 23721824, + "step": 112405 + }, + { + "epoch": 12.366336633663366, + "grad_norm": 0.005954386200755835, + "learning_rate": 1.9095043667347024e-05, + "loss": 0.0034, + "num_input_tokens_seen": 23722912, + "step": 112410 + }, + { + "epoch": 12.366886688668867, + "grad_norm": 0.08670122921466827, + "learning_rate": 1.9092711537024317e-05, + "loss": 0.027, + "num_input_tokens_seen": 23724000, + "step": 112415 + }, + { + "epoch": 12.367436743674368, + "grad_norm": 0.012621181085705757, + "learning_rate": 1.9090379461146246e-05, + "loss": 0.1493, + "num_input_tokens_seen": 23724992, + "step": 112420 + }, + { + "epoch": 12.367986798679867, + "grad_norm": 0.010012864135205746, + "learning_rate": 1.9088047439734325e-05, + "loss": 0.0016, + "num_input_tokens_seen": 23726048, + "step": 112425 + }, + { + "epoch": 12.368536853685368, + "grad_norm": 0.0072736977599561214, + "learning_rate": 1.9085715472810017e-05, + "loss": 0.0025, + "num_input_tokens_seen": 23727072, + "step": 112430 + }, + { + "epoch": 12.36908690869087, + "grad_norm": 0.720096230506897, + "learning_rate": 1.9083383560394848e-05, + "loss": 0.0896, + "num_input_tokens_seen": 23728096, + "step": 112435 + }, + { + "epoch": 12.369636963696369, + "grad_norm": 0.03069625422358513, + "learning_rate": 1.9081051702510287e-05, + "loss": 0.033, + "num_input_tokens_seen": 23729120, + "step": 112440 + }, + { + "epoch": 12.37018701870187, + "grad_norm": 0.02856285870075226, + "learning_rate": 1.9078719899177826e-05, + "loss": 0.0015, + "num_input_tokens_seen": 23730208, + "step": 112445 + }, + { + "epoch": 12.370737073707371, + "grad_norm": 0.03747507184743881, + "learning_rate": 1.9076388150418977e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23731232, + "step": 112450 + }, + { + "epoch": 12.371287128712872, + "grad_norm": 0.09852378070354462, + "learning_rate": 1.9074056456255202e-05, + "loss": 0.0065, + "num_input_tokens_seen": 23732256, + "step": 112455 + }, + { + "epoch": 12.371837183718371, + "grad_norm": 0.02884564734995365, + "learning_rate": 1.907172481670801e-05, + "loss": 0.0024, + "num_input_tokens_seen": 23733344, + "step": 112460 + }, + { + "epoch": 12.372387238723872, + "grad_norm": 0.012846885249018669, + "learning_rate": 1.9069393231798896e-05, + "loss": 0.0009, + "num_input_tokens_seen": 23734432, + "step": 112465 + }, + { + "epoch": 12.372937293729374, + "grad_norm": 0.003965931013226509, + "learning_rate": 1.9067061701549325e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23735488, + "step": 112470 + }, + { + "epoch": 12.373487348734873, + "grad_norm": 0.05117589607834816, + "learning_rate": 1.9064730225980804e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23736480, + "step": 112475 + }, + { + "epoch": 12.374037403740374, + "grad_norm": 0.006793085485696793, + "learning_rate": 1.906239880511481e-05, + "loss": 0.0692, + "num_input_tokens_seen": 23737568, + "step": 112480 + }, + { + "epoch": 12.374587458745875, + "grad_norm": 0.015513220801949501, + "learning_rate": 1.906006743897285e-05, + "loss": 0.0539, + "num_input_tokens_seen": 23738624, + "step": 112485 + }, + { + "epoch": 12.375137513751374, + "grad_norm": 0.02321596071124077, + "learning_rate": 1.9057736127576393e-05, + "loss": 0.0074, + "num_input_tokens_seen": 23739648, + "step": 112490 + }, + { + "epoch": 12.375687568756875, + "grad_norm": 0.16472023725509644, + "learning_rate": 1.9055404870946925e-05, + "loss": 0.0036, + "num_input_tokens_seen": 23740672, + "step": 112495 + }, + { + "epoch": 12.376237623762377, + "grad_norm": 0.0328768715262413, + "learning_rate": 1.9053073669105943e-05, + "loss": 0.0666, + "num_input_tokens_seen": 23741760, + "step": 112500 + }, + { + "epoch": 12.376787678767876, + "grad_norm": 0.36548781394958496, + "learning_rate": 1.9050742522074922e-05, + "loss": 0.0048, + "num_input_tokens_seen": 23742816, + "step": 112505 + }, + { + "epoch": 12.377337733773377, + "grad_norm": 0.09301643818616867, + "learning_rate": 1.9048411429875362e-05, + "loss": 0.0066, + "num_input_tokens_seen": 23743872, + "step": 112510 + }, + { + "epoch": 12.377887788778878, + "grad_norm": 0.004443852696567774, + "learning_rate": 1.9046080392528735e-05, + "loss": 0.0004, + "num_input_tokens_seen": 23744896, + "step": 112515 + }, + { + "epoch": 12.37843784378438, + "grad_norm": 0.008479908108711243, + "learning_rate": 1.9043749410056523e-05, + "loss": 0.0267, + "num_input_tokens_seen": 23745920, + "step": 112520 + }, + { + "epoch": 12.378987898789878, + "grad_norm": 2.5175869464874268, + "learning_rate": 1.9041418482480222e-05, + "loss": 0.032, + "num_input_tokens_seen": 23747008, + "step": 112525 + }, + { + "epoch": 12.37953795379538, + "grad_norm": 0.05339397117495537, + "learning_rate": 1.90390876098213e-05, + "loss": 0.0556, + "num_input_tokens_seen": 23748128, + "step": 112530 + }, + { + "epoch": 12.38008800880088, + "grad_norm": 1.8780906200408936, + "learning_rate": 1.9036756792101248e-05, + "loss": 0.0753, + "num_input_tokens_seen": 23749248, + "step": 112535 + }, + { + "epoch": 12.38063806380638, + "grad_norm": 0.12786865234375, + "learning_rate": 1.9034426029341556e-05, + "loss": 0.0458, + "num_input_tokens_seen": 23750336, + "step": 112540 + }, + { + "epoch": 12.381188118811881, + "grad_norm": 0.018162120133638382, + "learning_rate": 1.903209532156368e-05, + "loss": 0.0513, + "num_input_tokens_seen": 23751392, + "step": 112545 + }, + { + "epoch": 12.381738173817382, + "grad_norm": 0.41563665866851807, + "learning_rate": 1.9029764668789134e-05, + "loss": 0.0081, + "num_input_tokens_seen": 23752416, + "step": 112550 + }, + { + "epoch": 12.382288228822881, + "grad_norm": 0.005339703522622585, + "learning_rate": 1.902743407103938e-05, + "loss": 0.0012, + "num_input_tokens_seen": 23753408, + "step": 112555 + }, + { + "epoch": 12.382838283828383, + "grad_norm": 0.5302613973617554, + "learning_rate": 1.9025103528335893e-05, + "loss": 0.0538, + "num_input_tokens_seen": 23754400, + "step": 112560 + }, + { + "epoch": 12.383388338833884, + "grad_norm": 0.026826197281479836, + "learning_rate": 1.9022773040700167e-05, + "loss": 0.0154, + "num_input_tokens_seen": 23755456, + "step": 112565 + }, + { + "epoch": 12.383938393839385, + "grad_norm": 0.04752935841679573, + "learning_rate": 1.9020442608153664e-05, + "loss": 0.0115, + "num_input_tokens_seen": 23756448, + "step": 112570 + }, + { + "epoch": 12.384488448844884, + "grad_norm": 0.029205793514847755, + "learning_rate": 1.901811223071788e-05, + "loss": 0.0043, + "num_input_tokens_seen": 23757504, + "step": 112575 + }, + { + "epoch": 12.385038503850385, + "grad_norm": 0.010334921069443226, + "learning_rate": 1.901578190841429e-05, + "loss": 0.0034, + "num_input_tokens_seen": 23758528, + "step": 112580 + }, + { + "epoch": 12.385588558855886, + "grad_norm": 0.017644571140408516, + "learning_rate": 1.9013451641264354e-05, + "loss": 0.0137, + "num_input_tokens_seen": 23759648, + "step": 112585 + }, + { + "epoch": 12.386138613861386, + "grad_norm": 1.6321290731430054, + "learning_rate": 1.9011121429289568e-05, + "loss": 0.0261, + "num_input_tokens_seen": 23760768, + "step": 112590 + }, + { + "epoch": 12.386688668866887, + "grad_norm": 0.024049747735261917, + "learning_rate": 1.90087912725114e-05, + "loss": 0.0106, + "num_input_tokens_seen": 23761856, + "step": 112595 + }, + { + "epoch": 12.387238723872388, + "grad_norm": 0.5417534112930298, + "learning_rate": 1.9006461170951335e-05, + "loss": 0.0988, + "num_input_tokens_seen": 23762912, + "step": 112600 + }, + { + "epoch": 12.387788778877887, + "grad_norm": 0.0029901089146733284, + "learning_rate": 1.9004131124630837e-05, + "loss": 0.0074, + "num_input_tokens_seen": 23764000, + "step": 112605 + }, + { + "epoch": 12.388338833883388, + "grad_norm": 0.08464313298463821, + "learning_rate": 1.9001801133571378e-05, + "loss": 0.0158, + "num_input_tokens_seen": 23765088, + "step": 112610 + }, + { + "epoch": 12.38888888888889, + "grad_norm": 0.23480361700057983, + "learning_rate": 1.899947119779445e-05, + "loss": 0.0208, + "num_input_tokens_seen": 23766144, + "step": 112615 + }, + { + "epoch": 12.389438943894389, + "grad_norm": 0.3890409469604492, + "learning_rate": 1.8997141317321507e-05, + "loss": 0.0075, + "num_input_tokens_seen": 23767200, + "step": 112620 + }, + { + "epoch": 12.38998899889989, + "grad_norm": 0.17088764905929565, + "learning_rate": 1.8994811492174044e-05, + "loss": 0.0029, + "num_input_tokens_seen": 23768256, + "step": 112625 + }, + { + "epoch": 12.39053905390539, + "grad_norm": 0.012480373494327068, + "learning_rate": 1.899248172237352e-05, + "loss": 0.0038, + "num_input_tokens_seen": 23769312, + "step": 112630 + }, + { + "epoch": 12.391089108910892, + "grad_norm": 0.07298798114061356, + "learning_rate": 1.8990152007941397e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23770400, + "step": 112635 + }, + { + "epoch": 12.391639163916391, + "grad_norm": 0.08729181438684464, + "learning_rate": 1.8987822348899174e-05, + "loss": 0.1765, + "num_input_tokens_seen": 23771424, + "step": 112640 + }, + { + "epoch": 12.392189218921892, + "grad_norm": 0.01166969072073698, + "learning_rate": 1.8985492745268296e-05, + "loss": 0.0061, + "num_input_tokens_seen": 23772512, + "step": 112645 + }, + { + "epoch": 12.392739273927393, + "grad_norm": 0.014335397630929947, + "learning_rate": 1.898316319707025e-05, + "loss": 0.0015, + "num_input_tokens_seen": 23773568, + "step": 112650 + }, + { + "epoch": 12.393289328932893, + "grad_norm": 1.8465508222579956, + "learning_rate": 1.8980833704326507e-05, + "loss": 0.0381, + "num_input_tokens_seen": 23774592, + "step": 112655 + }, + { + "epoch": 12.393839383938394, + "grad_norm": 0.03772418573498726, + "learning_rate": 1.8978504267058523e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23775616, + "step": 112660 + }, + { + "epoch": 12.394389438943895, + "grad_norm": 0.6134648323059082, + "learning_rate": 1.897617488528778e-05, + "loss": 0.0183, + "num_input_tokens_seen": 23776672, + "step": 112665 + }, + { + "epoch": 12.394939493949394, + "grad_norm": 0.029645832255482674, + "learning_rate": 1.8973845559035736e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23777760, + "step": 112670 + }, + { + "epoch": 12.395489548954895, + "grad_norm": 1.4221128225326538, + "learning_rate": 1.897151628832387e-05, + "loss": 0.1575, + "num_input_tokens_seen": 23778784, + "step": 112675 + }, + { + "epoch": 12.396039603960396, + "grad_norm": 0.06591728329658508, + "learning_rate": 1.8969187073173654e-05, + "loss": 0.0088, + "num_input_tokens_seen": 23779872, + "step": 112680 + }, + { + "epoch": 12.396589658965897, + "grad_norm": 0.5025357604026794, + "learning_rate": 1.8966857913606534e-05, + "loss": 0.0741, + "num_input_tokens_seen": 23780960, + "step": 112685 + }, + { + "epoch": 12.397139713971397, + "grad_norm": 0.02049141190946102, + "learning_rate": 1.8964528809643996e-05, + "loss": 0.0017, + "num_input_tokens_seen": 23781984, + "step": 112690 + }, + { + "epoch": 12.397689768976898, + "grad_norm": 0.0046780770644545555, + "learning_rate": 1.8962199761307497e-05, + "loss": 0.0174, + "num_input_tokens_seen": 23783104, + "step": 112695 + }, + { + "epoch": 12.398239823982399, + "grad_norm": 0.014355508610606194, + "learning_rate": 1.8959870768618516e-05, + "loss": 0.0102, + "num_input_tokens_seen": 23784192, + "step": 112700 + }, + { + "epoch": 12.398789878987898, + "grad_norm": 0.039581041783094406, + "learning_rate": 1.8957541831598497e-05, + "loss": 0.0176, + "num_input_tokens_seen": 23785248, + "step": 112705 + }, + { + "epoch": 12.3993399339934, + "grad_norm": 0.021409321576356888, + "learning_rate": 1.8955212950268913e-05, + "loss": 0.0217, + "num_input_tokens_seen": 23786368, + "step": 112710 + }, + { + "epoch": 12.3998899889989, + "grad_norm": 0.012338947504758835, + "learning_rate": 1.8952884124651245e-05, + "loss": 0.0758, + "num_input_tokens_seen": 23787392, + "step": 112715 + }, + { + "epoch": 12.4004400440044, + "grad_norm": 0.010954918339848518, + "learning_rate": 1.8950555354766927e-05, + "loss": 0.188, + "num_input_tokens_seen": 23788416, + "step": 112720 + }, + { + "epoch": 12.400990099009901, + "grad_norm": 0.011762527748942375, + "learning_rate": 1.8948226640637446e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23789472, + "step": 112725 + }, + { + "epoch": 12.401540154015402, + "grad_norm": 0.342987984418869, + "learning_rate": 1.8945897982284257e-05, + "loss": 0.0176, + "num_input_tokens_seen": 23790496, + "step": 112730 + }, + { + "epoch": 12.402090209020901, + "grad_norm": 0.0048225936479866505, + "learning_rate": 1.894356937972881e-05, + "loss": 0.0007, + "num_input_tokens_seen": 23791584, + "step": 112735 + }, + { + "epoch": 12.402640264026402, + "grad_norm": 1.4903461933135986, + "learning_rate": 1.894124083299259e-05, + "loss": 0.0108, + "num_input_tokens_seen": 23792640, + "step": 112740 + }, + { + "epoch": 12.403190319031903, + "grad_norm": 0.03685179725289345, + "learning_rate": 1.8938912342097043e-05, + "loss": 0.072, + "num_input_tokens_seen": 23793664, + "step": 112745 + }, + { + "epoch": 12.403740374037405, + "grad_norm": 0.026810619980096817, + "learning_rate": 1.8936583907063627e-05, + "loss": 0.064, + "num_input_tokens_seen": 23794720, + "step": 112750 + }, + { + "epoch": 12.404290429042904, + "grad_norm": 0.12446889281272888, + "learning_rate": 1.893425552791382e-05, + "loss": 0.0072, + "num_input_tokens_seen": 23795776, + "step": 112755 + }, + { + "epoch": 12.404840484048405, + "grad_norm": 0.045411575585603714, + "learning_rate": 1.8931927204669057e-05, + "loss": 0.0093, + "num_input_tokens_seen": 23796864, + "step": 112760 + }, + { + "epoch": 12.405390539053906, + "grad_norm": 0.01539030484855175, + "learning_rate": 1.892959893735081e-05, + "loss": 0.0114, + "num_input_tokens_seen": 23797888, + "step": 112765 + }, + { + "epoch": 12.405940594059405, + "grad_norm": 0.010701664723455906, + "learning_rate": 1.892727072598055e-05, + "loss": 0.0086, + "num_input_tokens_seen": 23798944, + "step": 112770 + }, + { + "epoch": 12.406490649064907, + "grad_norm": 0.030183326452970505, + "learning_rate": 1.8924942570579707e-05, + "loss": 0.0528, + "num_input_tokens_seen": 23800000, + "step": 112775 + }, + { + "epoch": 12.407040704070408, + "grad_norm": 0.04097088426351547, + "learning_rate": 1.8922614471169757e-05, + "loss": 0.0018, + "num_input_tokens_seen": 23801024, + "step": 112780 + }, + { + "epoch": 12.407590759075907, + "grad_norm": 0.09906892478466034, + "learning_rate": 1.8920286427772148e-05, + "loss": 0.0014, + "num_input_tokens_seen": 23802112, + "step": 112785 + }, + { + "epoch": 12.408140814081408, + "grad_norm": 0.03131239861249924, + "learning_rate": 1.891795844040835e-05, + "loss": 0.0229, + "num_input_tokens_seen": 23803168, + "step": 112790 + }, + { + "epoch": 12.408690869086909, + "grad_norm": 0.03820054233074188, + "learning_rate": 1.8915630509099816e-05, + "loss": 0.0308, + "num_input_tokens_seen": 23804224, + "step": 112795 + }, + { + "epoch": 12.409240924092408, + "grad_norm": 0.1505601555109024, + "learning_rate": 1.8913302633867985e-05, + "loss": 0.0044, + "num_input_tokens_seen": 23805216, + "step": 112800 + }, + { + "epoch": 12.40979097909791, + "grad_norm": 0.1812973916530609, + "learning_rate": 1.891097481473433e-05, + "loss": 0.0028, + "num_input_tokens_seen": 23806208, + "step": 112805 + }, + { + "epoch": 12.41034103410341, + "grad_norm": 0.7051252722740173, + "learning_rate": 1.8908647051720285e-05, + "loss": 0.007, + "num_input_tokens_seen": 23807200, + "step": 112810 + }, + { + "epoch": 12.410891089108912, + "grad_norm": 0.2293645590543747, + "learning_rate": 1.8906319344847334e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23808288, + "step": 112815 + }, + { + "epoch": 12.411441144114411, + "grad_norm": 0.005988016724586487, + "learning_rate": 1.8903991694136906e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23809344, + "step": 112820 + }, + { + "epoch": 12.411991199119912, + "grad_norm": 0.6710920929908752, + "learning_rate": 1.8901664099610462e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23810432, + "step": 112825 + }, + { + "epoch": 12.412541254125413, + "grad_norm": 0.006686422973871231, + "learning_rate": 1.8899336561289458e-05, + "loss": 0.0131, + "num_input_tokens_seen": 23811488, + "step": 112830 + }, + { + "epoch": 12.413091309130913, + "grad_norm": 2.8677685260772705, + "learning_rate": 1.889700907919533e-05, + "loss": 0.0328, + "num_input_tokens_seen": 23812544, + "step": 112835 + }, + { + "epoch": 12.413641364136414, + "grad_norm": 0.7698342204093933, + "learning_rate": 1.889468165334955e-05, + "loss": 0.0162, + "num_input_tokens_seen": 23813632, + "step": 112840 + }, + { + "epoch": 12.414191419141915, + "grad_norm": 0.7340851426124573, + "learning_rate": 1.8892354283773567e-05, + "loss": 0.0093, + "num_input_tokens_seen": 23814688, + "step": 112845 + }, + { + "epoch": 12.414741474147414, + "grad_norm": 0.19010387361049652, + "learning_rate": 1.8890026970488804e-05, + "loss": 0.1014, + "num_input_tokens_seen": 23815680, + "step": 112850 + }, + { + "epoch": 12.415291529152915, + "grad_norm": 0.01077097188681364, + "learning_rate": 1.8887699713516746e-05, + "loss": 0.0011, + "num_input_tokens_seen": 23816736, + "step": 112855 + }, + { + "epoch": 12.415841584158416, + "grad_norm": 1.1511625051498413, + "learning_rate": 1.888537251287882e-05, + "loss": 0.0274, + "num_input_tokens_seen": 23817792, + "step": 112860 + }, + { + "epoch": 12.416391639163916, + "grad_norm": 2.084899425506592, + "learning_rate": 1.8883045368596482e-05, + "loss": 0.0142, + "num_input_tokens_seen": 23818880, + "step": 112865 + }, + { + "epoch": 12.416941694169417, + "grad_norm": 0.009592367336153984, + "learning_rate": 1.8880718280691187e-05, + "loss": 0.0008, + "num_input_tokens_seen": 23819936, + "step": 112870 + }, + { + "epoch": 12.417491749174918, + "grad_norm": 0.014316447079181671, + "learning_rate": 1.887839124918436e-05, + "loss": 0.0736, + "num_input_tokens_seen": 23821024, + "step": 112875 + }, + { + "epoch": 12.418041804180419, + "grad_norm": 0.09735186398029327, + "learning_rate": 1.887606427409748e-05, + "loss": 0.1484, + "num_input_tokens_seen": 23822048, + "step": 112880 + }, + { + "epoch": 12.418591859185918, + "grad_norm": 0.08222629874944687, + "learning_rate": 1.8873737355451962e-05, + "loss": 0.1117, + "num_input_tokens_seen": 23823072, + "step": 112885 + }, + { + "epoch": 12.41914191419142, + "grad_norm": 0.15647198259830475, + "learning_rate": 1.8871410493269282e-05, + "loss": 0.0018, + "num_input_tokens_seen": 23824160, + "step": 112890 + }, + { + "epoch": 12.41969196919692, + "grad_norm": 0.8712233901023865, + "learning_rate": 1.8869083687570866e-05, + "loss": 0.013, + "num_input_tokens_seen": 23825248, + "step": 112895 + }, + { + "epoch": 12.42024202420242, + "grad_norm": 0.006695268210023642, + "learning_rate": 1.8866756938378156e-05, + "loss": 0.004, + "num_input_tokens_seen": 23826336, + "step": 112900 + }, + { + "epoch": 12.42079207920792, + "grad_norm": 0.4542827308177948, + "learning_rate": 1.8864430245712617e-05, + "loss": 0.038, + "num_input_tokens_seen": 23827392, + "step": 112905 + }, + { + "epoch": 12.421342134213422, + "grad_norm": 4.456386089324951, + "learning_rate": 1.886210360959566e-05, + "loss": 0.0099, + "num_input_tokens_seen": 23828384, + "step": 112910 + }, + { + "epoch": 12.421892189218921, + "grad_norm": 1.7612665891647339, + "learning_rate": 1.885977703004877e-05, + "loss": 0.0191, + "num_input_tokens_seen": 23829472, + "step": 112915 + }, + { + "epoch": 12.422442244224422, + "grad_norm": 0.07770148664712906, + "learning_rate": 1.885745050709336e-05, + "loss": 0.0036, + "num_input_tokens_seen": 23830496, + "step": 112920 + }, + { + "epoch": 12.422992299229923, + "grad_norm": 0.004485161975026131, + "learning_rate": 1.8855124040750878e-05, + "loss": 0.0015, + "num_input_tokens_seen": 23831520, + "step": 112925 + }, + { + "epoch": 12.423542354235423, + "grad_norm": 0.05715520307421684, + "learning_rate": 1.885279763104278e-05, + "loss": 0.0017, + "num_input_tokens_seen": 23832672, + "step": 112930 + }, + { + "epoch": 12.424092409240924, + "grad_norm": 0.03694511577486992, + "learning_rate": 1.8850471277990483e-05, + "loss": 0.0043, + "num_input_tokens_seen": 23833728, + "step": 112935 + }, + { + "epoch": 12.424642464246425, + "grad_norm": 0.015329647809267044, + "learning_rate": 1.8848144981615448e-05, + "loss": 0.0351, + "num_input_tokens_seen": 23834720, + "step": 112940 + }, + { + "epoch": 12.425192519251926, + "grad_norm": 0.037214674055576324, + "learning_rate": 1.8845818741939114e-05, + "loss": 0.0174, + "num_input_tokens_seen": 23835744, + "step": 112945 + }, + { + "epoch": 12.425742574257425, + "grad_norm": 0.003738060826435685, + "learning_rate": 1.8843492558982905e-05, + "loss": 0.059, + "num_input_tokens_seen": 23836736, + "step": 112950 + }, + { + "epoch": 12.426292629262926, + "grad_norm": 0.03722662106156349, + "learning_rate": 1.8841166432768278e-05, + "loss": 0.0056, + "num_input_tokens_seen": 23837792, + "step": 112955 + }, + { + "epoch": 12.426842684268427, + "grad_norm": 0.010892423801124096, + "learning_rate": 1.883884036331667e-05, + "loss": 0.1577, + "num_input_tokens_seen": 23838816, + "step": 112960 + }, + { + "epoch": 12.427392739273927, + "grad_norm": 0.007413272745907307, + "learning_rate": 1.8836514350649502e-05, + "loss": 0.064, + "num_input_tokens_seen": 23839840, + "step": 112965 + }, + { + "epoch": 12.427942794279428, + "grad_norm": 1.3986632823944092, + "learning_rate": 1.8834188394788227e-05, + "loss": 0.0658, + "num_input_tokens_seen": 23840896, + "step": 112970 + }, + { + "epoch": 12.428492849284929, + "grad_norm": 0.12081455439329147, + "learning_rate": 1.8831862495754277e-05, + "loss": 0.0913, + "num_input_tokens_seen": 23841888, + "step": 112975 + }, + { + "epoch": 12.429042904290428, + "grad_norm": 0.032447509467601776, + "learning_rate": 1.8829536653569092e-05, + "loss": 0.0017, + "num_input_tokens_seen": 23842944, + "step": 112980 + }, + { + "epoch": 12.42959295929593, + "grad_norm": 0.004545341711491346, + "learning_rate": 1.8827210868254117e-05, + "loss": 0.0014, + "num_input_tokens_seen": 23844000, + "step": 112985 + }, + { + "epoch": 12.43014301430143, + "grad_norm": 0.0063478415831923485, + "learning_rate": 1.8824885139830763e-05, + "loss": 0.0301, + "num_input_tokens_seen": 23845088, + "step": 112990 + }, + { + "epoch": 12.430693069306932, + "grad_norm": 1.4421395063400269, + "learning_rate": 1.8822559468320488e-05, + "loss": 0.0413, + "num_input_tokens_seen": 23846176, + "step": 112995 + }, + { + "epoch": 12.43124312431243, + "grad_norm": 0.01874176226556301, + "learning_rate": 1.8820233853744708e-05, + "loss": 0.0133, + "num_input_tokens_seen": 23847232, + "step": 113000 + }, + { + "epoch": 12.431793179317932, + "grad_norm": 0.02491438016295433, + "learning_rate": 1.8817908296124883e-05, + "loss": 0.0012, + "num_input_tokens_seen": 23848256, + "step": 113005 + }, + { + "epoch": 12.432343234323433, + "grad_norm": 0.039053719490766525, + "learning_rate": 1.881558279548242e-05, + "loss": 0.0562, + "num_input_tokens_seen": 23849312, + "step": 113010 + }, + { + "epoch": 12.432893289328932, + "grad_norm": 0.3556765615940094, + "learning_rate": 1.8813257351838764e-05, + "loss": 0.0139, + "num_input_tokens_seen": 23850336, + "step": 113015 + }, + { + "epoch": 12.433443344334433, + "grad_norm": 0.012821570038795471, + "learning_rate": 1.8810931965215356e-05, + "loss": 0.0545, + "num_input_tokens_seen": 23851392, + "step": 113020 + }, + { + "epoch": 12.433993399339935, + "grad_norm": 0.043556664139032364, + "learning_rate": 1.8808606635633606e-05, + "loss": 0.0074, + "num_input_tokens_seen": 23852448, + "step": 113025 + }, + { + "epoch": 12.434543454345434, + "grad_norm": 0.013027409091591835, + "learning_rate": 1.8806281363114962e-05, + "loss": 0.0353, + "num_input_tokens_seen": 23853600, + "step": 113030 + }, + { + "epoch": 12.435093509350935, + "grad_norm": 0.16430823504924774, + "learning_rate": 1.8803956147680853e-05, + "loss": 0.0358, + "num_input_tokens_seen": 23854656, + "step": 113035 + }, + { + "epoch": 12.435643564356436, + "grad_norm": 3.0892035961151123, + "learning_rate": 1.88016309893527e-05, + "loss": 0.1054, + "num_input_tokens_seen": 23855680, + "step": 113040 + }, + { + "epoch": 12.436193619361935, + "grad_norm": 0.3296353220939636, + "learning_rate": 1.8799305888151947e-05, + "loss": 0.0171, + "num_input_tokens_seen": 23856736, + "step": 113045 + }, + { + "epoch": 12.436743674367436, + "grad_norm": 0.025077957659959793, + "learning_rate": 1.879698084410001e-05, + "loss": 0.0568, + "num_input_tokens_seen": 23857760, + "step": 113050 + }, + { + "epoch": 12.437293729372938, + "grad_norm": 0.34561359882354736, + "learning_rate": 1.879465585721833e-05, + "loss": 0.0545, + "num_input_tokens_seen": 23858816, + "step": 113055 + }, + { + "epoch": 12.437843784378439, + "grad_norm": 0.008574268780648708, + "learning_rate": 1.8792330927528333e-05, + "loss": 0.0039, + "num_input_tokens_seen": 23859808, + "step": 113060 + }, + { + "epoch": 12.438393839383938, + "grad_norm": 0.49820080399513245, + "learning_rate": 1.879000605505143e-05, + "loss": 0.0036, + "num_input_tokens_seen": 23860864, + "step": 113065 + }, + { + "epoch": 12.438943894389439, + "grad_norm": 0.009535351768136024, + "learning_rate": 1.8787681239809072e-05, + "loss": 0.0541, + "num_input_tokens_seen": 23861952, + "step": 113070 + }, + { + "epoch": 12.43949394939494, + "grad_norm": 0.11337874829769135, + "learning_rate": 1.8785356481822665e-05, + "loss": 0.0112, + "num_input_tokens_seen": 23862976, + "step": 113075 + }, + { + "epoch": 12.44004400440044, + "grad_norm": 0.3897630572319031, + "learning_rate": 1.878303178111366e-05, + "loss": 0.0144, + "num_input_tokens_seen": 23864032, + "step": 113080 + }, + { + "epoch": 12.44059405940594, + "grad_norm": 1.0125566720962524, + "learning_rate": 1.878070713770346e-05, + "loss": 0.0557, + "num_input_tokens_seen": 23865056, + "step": 113085 + }, + { + "epoch": 12.441144114411442, + "grad_norm": 0.011857116594910622, + "learning_rate": 1.8778382551613494e-05, + "loss": 0.0036, + "num_input_tokens_seen": 23866144, + "step": 113090 + }, + { + "epoch": 12.441694169416941, + "grad_norm": 0.08003848791122437, + "learning_rate": 1.8776058022865193e-05, + "loss": 0.0038, + "num_input_tokens_seen": 23867168, + "step": 113095 + }, + { + "epoch": 12.442244224422442, + "grad_norm": 0.004580851644277573, + "learning_rate": 1.8773733551479977e-05, + "loss": 0.0011, + "num_input_tokens_seen": 23868192, + "step": 113100 + }, + { + "epoch": 12.442794279427943, + "grad_norm": 0.009679313749074936, + "learning_rate": 1.8771409137479277e-05, + "loss": 0.001, + "num_input_tokens_seen": 23869280, + "step": 113105 + }, + { + "epoch": 12.443344334433444, + "grad_norm": 0.010214713402092457, + "learning_rate": 1.876908478088451e-05, + "loss": 0.0312, + "num_input_tokens_seen": 23870368, + "step": 113110 + }, + { + "epoch": 12.443894389438944, + "grad_norm": 2.335069417953491, + "learning_rate": 1.8766760481717092e-05, + "loss": 0.0438, + "num_input_tokens_seen": 23871424, + "step": 113115 + }, + { + "epoch": 12.444444444444445, + "grad_norm": 0.9382518529891968, + "learning_rate": 1.8764436239998462e-05, + "loss": 0.0083, + "num_input_tokens_seen": 23872544, + "step": 113120 + }, + { + "epoch": 12.444994499449946, + "grad_norm": 0.05247019603848457, + "learning_rate": 1.876211205575002e-05, + "loss": 0.067, + "num_input_tokens_seen": 23873600, + "step": 113125 + }, + { + "epoch": 12.445544554455445, + "grad_norm": 0.3237760663032532, + "learning_rate": 1.8759787928993204e-05, + "loss": 0.0088, + "num_input_tokens_seen": 23874656, + "step": 113130 + }, + { + "epoch": 12.446094609460946, + "grad_norm": 0.03230748698115349, + "learning_rate": 1.8757463859749433e-05, + "loss": 0.0062, + "num_input_tokens_seen": 23875808, + "step": 113135 + }, + { + "epoch": 12.446644664466447, + "grad_norm": 1.0613741874694824, + "learning_rate": 1.875513984804011e-05, + "loss": 0.0185, + "num_input_tokens_seen": 23876864, + "step": 113140 + }, + { + "epoch": 12.447194719471947, + "grad_norm": 0.05949188768863678, + "learning_rate": 1.8752815893886675e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23877920, + "step": 113145 + }, + { + "epoch": 12.447744774477448, + "grad_norm": 0.2827100455760956, + "learning_rate": 1.8750491997310543e-05, + "loss": 0.0055, + "num_input_tokens_seen": 23878912, + "step": 113150 + }, + { + "epoch": 12.448294829482949, + "grad_norm": 0.045226845890283585, + "learning_rate": 1.8748168158333108e-05, + "loss": 0.0018, + "num_input_tokens_seen": 23879936, + "step": 113155 + }, + { + "epoch": 12.448844884488448, + "grad_norm": 0.12804335355758667, + "learning_rate": 1.8745844376975828e-05, + "loss": 0.0955, + "num_input_tokens_seen": 23880960, + "step": 113160 + }, + { + "epoch": 12.44939493949395, + "grad_norm": 0.020697075873613358, + "learning_rate": 1.8743520653260082e-05, + "loss": 0.0359, + "num_input_tokens_seen": 23882016, + "step": 113165 + }, + { + "epoch": 12.44994499449945, + "grad_norm": 0.3481443226337433, + "learning_rate": 1.8741196987207317e-05, + "loss": 0.059, + "num_input_tokens_seen": 23883136, + "step": 113170 + }, + { + "epoch": 12.450495049504951, + "grad_norm": 0.12115703523159027, + "learning_rate": 1.873887337883894e-05, + "loss": 0.0841, + "num_input_tokens_seen": 23884160, + "step": 113175 + }, + { + "epoch": 12.45104510451045, + "grad_norm": 0.012989924289286137, + "learning_rate": 1.8736549828176347e-05, + "loss": 0.0012, + "num_input_tokens_seen": 23885216, + "step": 113180 + }, + { + "epoch": 12.451595159515952, + "grad_norm": 0.254513680934906, + "learning_rate": 1.873422633524098e-05, + "loss": 0.0157, + "num_input_tokens_seen": 23886208, + "step": 113185 + }, + { + "epoch": 12.452145214521453, + "grad_norm": 1.560178279876709, + "learning_rate": 1.8731902900054233e-05, + "loss": 0.1013, + "num_input_tokens_seen": 23887296, + "step": 113190 + }, + { + "epoch": 12.452695269526952, + "grad_norm": 0.09216199815273285, + "learning_rate": 1.872957952263754e-05, + "loss": 0.0286, + "num_input_tokens_seen": 23888384, + "step": 113195 + }, + { + "epoch": 12.453245324532453, + "grad_norm": 0.026453964412212372, + "learning_rate": 1.87272562030123e-05, + "loss": 0.0011, + "num_input_tokens_seen": 23889440, + "step": 113200 + }, + { + "epoch": 12.453795379537954, + "grad_norm": 0.28039804100990295, + "learning_rate": 1.8724932941199923e-05, + "loss": 0.0495, + "num_input_tokens_seen": 23890528, + "step": 113205 + }, + { + "epoch": 12.454345434543454, + "grad_norm": 1.0878313779830933, + "learning_rate": 1.8722609737221838e-05, + "loss": 0.0248, + "num_input_tokens_seen": 23891616, + "step": 113210 + }, + { + "epoch": 12.454895489548955, + "grad_norm": 0.14759103953838348, + "learning_rate": 1.8720286591099434e-05, + "loss": 0.0041, + "num_input_tokens_seen": 23892704, + "step": 113215 + }, + { + "epoch": 12.455445544554456, + "grad_norm": 0.034413013607263565, + "learning_rate": 1.8717963502854148e-05, + "loss": 0.0149, + "num_input_tokens_seen": 23893792, + "step": 113220 + }, + { + "epoch": 12.455995599559955, + "grad_norm": 5.001830101013184, + "learning_rate": 1.8715640472507372e-05, + "loss": 0.0073, + "num_input_tokens_seen": 23894848, + "step": 113225 + }, + { + "epoch": 12.456545654565456, + "grad_norm": 0.0238770991563797, + "learning_rate": 1.8713317500080523e-05, + "loss": 0.0039, + "num_input_tokens_seen": 23895872, + "step": 113230 + }, + { + "epoch": 12.457095709570957, + "grad_norm": 0.00922898855060339, + "learning_rate": 1.8710994585595016e-05, + "loss": 0.013, + "num_input_tokens_seen": 23896864, + "step": 113235 + }, + { + "epoch": 12.457645764576458, + "grad_norm": 0.06663955748081207, + "learning_rate": 1.8708671729072242e-05, + "loss": 0.0078, + "num_input_tokens_seen": 23897920, + "step": 113240 + }, + { + "epoch": 12.458195819581958, + "grad_norm": 0.015142772346735, + "learning_rate": 1.870634893053363e-05, + "loss": 0.0769, + "num_input_tokens_seen": 23899072, + "step": 113245 + }, + { + "epoch": 12.458745874587459, + "grad_norm": 0.01963566243648529, + "learning_rate": 1.8704026190000583e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23900128, + "step": 113250 + }, + { + "epoch": 12.45929592959296, + "grad_norm": 0.016667891293764114, + "learning_rate": 1.8701703507494497e-05, + "loss": 0.0019, + "num_input_tokens_seen": 23901280, + "step": 113255 + }, + { + "epoch": 12.45984598459846, + "grad_norm": 2.3157122135162354, + "learning_rate": 1.8699380883036792e-05, + "loss": 0.0289, + "num_input_tokens_seen": 23902304, + "step": 113260 + }, + { + "epoch": 12.46039603960396, + "grad_norm": 0.0714123323559761, + "learning_rate": 1.8697058316648864e-05, + "loss": 0.0465, + "num_input_tokens_seen": 23903424, + "step": 113265 + }, + { + "epoch": 12.460946094609461, + "grad_norm": 0.01239117980003357, + "learning_rate": 1.869473580835214e-05, + "loss": 0.0009, + "num_input_tokens_seen": 23904480, + "step": 113270 + }, + { + "epoch": 12.46149614961496, + "grad_norm": 1.4262248277664185, + "learning_rate": 1.8692413358168e-05, + "loss": 0.0143, + "num_input_tokens_seen": 23905568, + "step": 113275 + }, + { + "epoch": 12.462046204620462, + "grad_norm": 0.007387694902718067, + "learning_rate": 1.8690090966117857e-05, + "loss": 0.0013, + "num_input_tokens_seen": 23906624, + "step": 113280 + }, + { + "epoch": 12.462596259625963, + "grad_norm": 2.8949146270751953, + "learning_rate": 1.868776863222312e-05, + "loss": 0.0984, + "num_input_tokens_seen": 23907712, + "step": 113285 + }, + { + "epoch": 12.463146314631462, + "grad_norm": 2.9436187744140625, + "learning_rate": 1.8685446356505188e-05, + "loss": 0.0698, + "num_input_tokens_seen": 23908704, + "step": 113290 + }, + { + "epoch": 12.463696369636963, + "grad_norm": 3.31345796585083, + "learning_rate": 1.8683124138985475e-05, + "loss": 0.0815, + "num_input_tokens_seen": 23909824, + "step": 113295 + }, + { + "epoch": 12.464246424642464, + "grad_norm": 1.8499490022659302, + "learning_rate": 1.8680801979685373e-05, + "loss": 0.0289, + "num_input_tokens_seen": 23910912, + "step": 113300 + }, + { + "epoch": 12.464796479647966, + "grad_norm": 0.04876350238919258, + "learning_rate": 1.867847987862628e-05, + "loss": 0.0193, + "num_input_tokens_seen": 23912000, + "step": 113305 + }, + { + "epoch": 12.465346534653465, + "grad_norm": 1.281102180480957, + "learning_rate": 1.8676157835829617e-05, + "loss": 0.0233, + "num_input_tokens_seen": 23913088, + "step": 113310 + }, + { + "epoch": 12.465896589658966, + "grad_norm": 0.028807975351810455, + "learning_rate": 1.8673835851316762e-05, + "loss": 0.0074, + "num_input_tokens_seen": 23914176, + "step": 113315 + }, + { + "epoch": 12.466446644664467, + "grad_norm": 1.8108494281768799, + "learning_rate": 1.867151392510913e-05, + "loss": 0.0448, + "num_input_tokens_seen": 23915296, + "step": 113320 + }, + { + "epoch": 12.466996699669966, + "grad_norm": 0.10393620282411575, + "learning_rate": 1.8669192057228123e-05, + "loss": 0.1105, + "num_input_tokens_seen": 23916352, + "step": 113325 + }, + { + "epoch": 12.467546754675467, + "grad_norm": 0.07123374193906784, + "learning_rate": 1.8666870247695118e-05, + "loss": 0.0022, + "num_input_tokens_seen": 23917408, + "step": 113330 + }, + { + "epoch": 12.468096809680969, + "grad_norm": 0.27305862307548523, + "learning_rate": 1.866454849653155e-05, + "loss": 0.0936, + "num_input_tokens_seen": 23918496, + "step": 113335 + }, + { + "epoch": 12.468646864686468, + "grad_norm": 0.002930727554485202, + "learning_rate": 1.8662226803758793e-05, + "loss": 0.0026, + "num_input_tokens_seen": 23919488, + "step": 113340 + }, + { + "epoch": 12.469196919691969, + "grad_norm": 0.03201329708099365, + "learning_rate": 1.8659905169398242e-05, + "loss": 0.1167, + "num_input_tokens_seen": 23920544, + "step": 113345 + }, + { + "epoch": 12.46974697469747, + "grad_norm": 0.009588774293661118, + "learning_rate": 1.8657583593471316e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23921600, + "step": 113350 + }, + { + "epoch": 12.47029702970297, + "grad_norm": 0.059311944991350174, + "learning_rate": 1.8655262075999384e-05, + "loss": 0.0258, + "num_input_tokens_seen": 23922656, + "step": 113355 + }, + { + "epoch": 12.47084708470847, + "grad_norm": 0.01115559320896864, + "learning_rate": 1.8652940617003866e-05, + "loss": 0.0444, + "num_input_tokens_seen": 23923712, + "step": 113360 + }, + { + "epoch": 12.471397139713972, + "grad_norm": 1.8294119834899902, + "learning_rate": 1.8650619216506154e-05, + "loss": 0.0406, + "num_input_tokens_seen": 23924704, + "step": 113365 + }, + { + "epoch": 12.471947194719473, + "grad_norm": 0.002833379665389657, + "learning_rate": 1.8648297874527627e-05, + "loss": 0.1044, + "num_input_tokens_seen": 23925824, + "step": 113370 + }, + { + "epoch": 12.472497249724972, + "grad_norm": 0.02232940122485161, + "learning_rate": 1.86459765910897e-05, + "loss": 0.0512, + "num_input_tokens_seen": 23926880, + "step": 113375 + }, + { + "epoch": 12.473047304730473, + "grad_norm": 0.009820754639804363, + "learning_rate": 1.8643655366213745e-05, + "loss": 0.0008, + "num_input_tokens_seen": 23927936, + "step": 113380 + }, + { + "epoch": 12.473597359735974, + "grad_norm": 0.0346769355237484, + "learning_rate": 1.8641334199921182e-05, + "loss": 0.002, + "num_input_tokens_seen": 23929056, + "step": 113385 + }, + { + "epoch": 12.474147414741473, + "grad_norm": 0.057820167392492294, + "learning_rate": 1.863901309223339e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23930112, + "step": 113390 + }, + { + "epoch": 12.474697469746975, + "grad_norm": 0.08384356647729874, + "learning_rate": 1.8636692043171747e-05, + "loss": 0.0045, + "num_input_tokens_seen": 23931104, + "step": 113395 + }, + { + "epoch": 12.475247524752476, + "grad_norm": 0.002516085049137473, + "learning_rate": 1.863437105275767e-05, + "loss": 0.1735, + "num_input_tokens_seen": 23932160, + "step": 113400 + }, + { + "epoch": 12.475797579757975, + "grad_norm": 0.3701344132423401, + "learning_rate": 1.8632050121012538e-05, + "loss": 0.1212, + "num_input_tokens_seen": 23933216, + "step": 113405 + }, + { + "epoch": 12.476347634763476, + "grad_norm": 0.030698981136083603, + "learning_rate": 1.862972924795775e-05, + "loss": 0.0096, + "num_input_tokens_seen": 23934304, + "step": 113410 + }, + { + "epoch": 12.476897689768977, + "grad_norm": 0.013049857690930367, + "learning_rate": 1.862740843361469e-05, + "loss": 0.0066, + "num_input_tokens_seen": 23935328, + "step": 113415 + }, + { + "epoch": 12.477447744774478, + "grad_norm": 0.03378107026219368, + "learning_rate": 1.862508767800474e-05, + "loss": 0.0119, + "num_input_tokens_seen": 23936384, + "step": 113420 + }, + { + "epoch": 12.477997799779978, + "grad_norm": 0.07857826352119446, + "learning_rate": 1.862276698114931e-05, + "loss": 0.0044, + "num_input_tokens_seen": 23937440, + "step": 113425 + }, + { + "epoch": 12.478547854785479, + "grad_norm": 0.8550015091896057, + "learning_rate": 1.8620446343069768e-05, + "loss": 0.0085, + "num_input_tokens_seen": 23938496, + "step": 113430 + }, + { + "epoch": 12.47909790979098, + "grad_norm": 0.021661698818206787, + "learning_rate": 1.8618125763787514e-05, + "loss": 0.1412, + "num_input_tokens_seen": 23939552, + "step": 113435 + }, + { + "epoch": 12.479647964796479, + "grad_norm": 0.036307696253061295, + "learning_rate": 1.8615805243323935e-05, + "loss": 0.003, + "num_input_tokens_seen": 23940576, + "step": 113440 + }, + { + "epoch": 12.48019801980198, + "grad_norm": 0.339893102645874, + "learning_rate": 1.861348478170041e-05, + "loss": 0.0239, + "num_input_tokens_seen": 23941664, + "step": 113445 + }, + { + "epoch": 12.480748074807481, + "grad_norm": 0.026091769337654114, + "learning_rate": 1.8611164378938333e-05, + "loss": 0.0807, + "num_input_tokens_seen": 23942720, + "step": 113450 + }, + { + "epoch": 12.48129812981298, + "grad_norm": 0.020786743611097336, + "learning_rate": 1.8608844035059088e-05, + "loss": 0.003, + "num_input_tokens_seen": 23943808, + "step": 113455 + }, + { + "epoch": 12.481848184818482, + "grad_norm": 0.2742413282394409, + "learning_rate": 1.860652375008406e-05, + "loss": 0.0318, + "num_input_tokens_seen": 23944832, + "step": 113460 + }, + { + "epoch": 12.482398239823983, + "grad_norm": 0.02290143445134163, + "learning_rate": 1.860420352403464e-05, + "loss": 0.001, + "num_input_tokens_seen": 23945920, + "step": 113465 + }, + { + "epoch": 12.482948294829482, + "grad_norm": 0.010835487395524979, + "learning_rate": 1.8601883356932202e-05, + "loss": 0.0248, + "num_input_tokens_seen": 23947008, + "step": 113470 + }, + { + "epoch": 12.483498349834983, + "grad_norm": 4.366533279418945, + "learning_rate": 1.8599563248798137e-05, + "loss": 0.0319, + "num_input_tokens_seen": 23948032, + "step": 113475 + }, + { + "epoch": 12.484048404840484, + "grad_norm": 0.003369030775502324, + "learning_rate": 1.8597243199653817e-05, + "loss": 0.0088, + "num_input_tokens_seen": 23949120, + "step": 113480 + }, + { + "epoch": 12.484598459845985, + "grad_norm": 0.03429488465189934, + "learning_rate": 1.8594923209520654e-05, + "loss": 0.0425, + "num_input_tokens_seen": 23950144, + "step": 113485 + }, + { + "epoch": 12.485148514851485, + "grad_norm": 0.11036000400781631, + "learning_rate": 1.8592603278419998e-05, + "loss": 0.0087, + "num_input_tokens_seen": 23951200, + "step": 113490 + }, + { + "epoch": 12.485698569856986, + "grad_norm": 0.08124057948589325, + "learning_rate": 1.859028340637324e-05, + "loss": 0.0039, + "num_input_tokens_seen": 23952256, + "step": 113495 + }, + { + "epoch": 12.486248624862487, + "grad_norm": 0.00957114715129137, + "learning_rate": 1.8587963593401773e-05, + "loss": 0.0037, + "num_input_tokens_seen": 23953344, + "step": 113500 + }, + { + "epoch": 12.486798679867986, + "grad_norm": 0.00624071666970849, + "learning_rate": 1.8585643839526963e-05, + "loss": 0.0541, + "num_input_tokens_seen": 23954368, + "step": 113505 + }, + { + "epoch": 12.487348734873487, + "grad_norm": 0.009346570819616318, + "learning_rate": 1.8583324144770197e-05, + "loss": 0.0057, + "num_input_tokens_seen": 23955456, + "step": 113510 + }, + { + "epoch": 12.487898789878988, + "grad_norm": 0.09125746041536331, + "learning_rate": 1.858100450915286e-05, + "loss": 0.0031, + "num_input_tokens_seen": 23956544, + "step": 113515 + }, + { + "epoch": 12.488448844884488, + "grad_norm": 0.006191182415932417, + "learning_rate": 1.8578684932696305e-05, + "loss": 0.0849, + "num_input_tokens_seen": 23957536, + "step": 113520 + }, + { + "epoch": 12.488998899889989, + "grad_norm": 3.721590280532837, + "learning_rate": 1.857636541542195e-05, + "loss": 0.0701, + "num_input_tokens_seen": 23958592, + "step": 113525 + }, + { + "epoch": 12.48954895489549, + "grad_norm": 0.01949491910636425, + "learning_rate": 1.8574045957351144e-05, + "loss": 0.001, + "num_input_tokens_seen": 23959712, + "step": 113530 + }, + { + "epoch": 12.490099009900991, + "grad_norm": 0.11432497203350067, + "learning_rate": 1.857172655850527e-05, + "loss": 0.0097, + "num_input_tokens_seen": 23960768, + "step": 113535 + }, + { + "epoch": 12.49064906490649, + "grad_norm": 0.0035378863103687763, + "learning_rate": 1.856940721890572e-05, + "loss": 0.0332, + "num_input_tokens_seen": 23961824, + "step": 113540 + }, + { + "epoch": 12.491199119911991, + "grad_norm": 1.2145954370498657, + "learning_rate": 1.8567087938573843e-05, + "loss": 0.0153, + "num_input_tokens_seen": 23962912, + "step": 113545 + }, + { + "epoch": 12.491749174917492, + "grad_norm": 0.009242219850420952, + "learning_rate": 1.856476871753104e-05, + "loss": 0.016, + "num_input_tokens_seen": 23964000, + "step": 113550 + }, + { + "epoch": 12.492299229922992, + "grad_norm": 0.008508261293172836, + "learning_rate": 1.8562449555798677e-05, + "loss": 0.0296, + "num_input_tokens_seen": 23964992, + "step": 113555 + }, + { + "epoch": 12.492849284928493, + "grad_norm": 0.008627135306596756, + "learning_rate": 1.8560130453398116e-05, + "loss": 0.0036, + "num_input_tokens_seen": 23966048, + "step": 113560 + }, + { + "epoch": 12.493399339933994, + "grad_norm": 0.004922179970890284, + "learning_rate": 1.855781141035075e-05, + "loss": 0.0455, + "num_input_tokens_seen": 23967136, + "step": 113565 + }, + { + "epoch": 12.493949394939493, + "grad_norm": 0.004327928181737661, + "learning_rate": 1.855549242667794e-05, + "loss": 0.0559, + "num_input_tokens_seen": 23968192, + "step": 113570 + }, + { + "epoch": 12.494499449944994, + "grad_norm": 0.013967812992632389, + "learning_rate": 1.8553173502401065e-05, + "loss": 0.0692, + "num_input_tokens_seen": 23969216, + "step": 113575 + }, + { + "epoch": 12.495049504950495, + "grad_norm": 0.012616750784218311, + "learning_rate": 1.855085463754151e-05, + "loss": 0.0894, + "num_input_tokens_seen": 23970336, + "step": 113580 + }, + { + "epoch": 12.495599559955995, + "grad_norm": 0.020946942269802094, + "learning_rate": 1.8548535832120617e-05, + "loss": 0.0941, + "num_input_tokens_seen": 23971392, + "step": 113585 + }, + { + "epoch": 12.496149614961496, + "grad_norm": 0.061822906136512756, + "learning_rate": 1.854621708615978e-05, + "loss": 0.0095, + "num_input_tokens_seen": 23972544, + "step": 113590 + }, + { + "epoch": 12.496699669966997, + "grad_norm": 0.7743729948997498, + "learning_rate": 1.8543898399680358e-05, + "loss": 0.0605, + "num_input_tokens_seen": 23973600, + "step": 113595 + }, + { + "epoch": 12.497249724972498, + "grad_norm": 0.09422292560338974, + "learning_rate": 1.8541579772703737e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23974656, + "step": 113600 + }, + { + "epoch": 12.497799779977997, + "grad_norm": 0.08083156496286392, + "learning_rate": 1.853926120525127e-05, + "loss": 0.0542, + "num_input_tokens_seen": 23975712, + "step": 113605 + }, + { + "epoch": 12.498349834983498, + "grad_norm": 0.04016323760151863, + "learning_rate": 1.853694269734433e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23976736, + "step": 113610 + }, + { + "epoch": 12.498899889989, + "grad_norm": 0.20571500062942505, + "learning_rate": 1.8534624249004294e-05, + "loss": 0.0442, + "num_input_tokens_seen": 23977728, + "step": 113615 + }, + { + "epoch": 12.499449944994499, + "grad_norm": 0.023993831127882004, + "learning_rate": 1.8532305860252515e-05, + "loss": 0.0041, + "num_input_tokens_seen": 23978784, + "step": 113620 + }, + { + "epoch": 12.5, + "grad_norm": 0.2983201742172241, + "learning_rate": 1.8529987531110378e-05, + "loss": 0.0111, + "num_input_tokens_seen": 23979808, + "step": 113625 + }, + { + "epoch": 12.500550055005501, + "grad_norm": 0.33997493982315063, + "learning_rate": 1.852766926159924e-05, + "loss": 0.0883, + "num_input_tokens_seen": 23980832, + "step": 113630 + }, + { + "epoch": 12.501100110011, + "grad_norm": 0.016804607585072517, + "learning_rate": 1.852535105174046e-05, + "loss": 0.0462, + "num_input_tokens_seen": 23981856, + "step": 113635 + }, + { + "epoch": 12.501650165016502, + "grad_norm": 0.05472705140709877, + "learning_rate": 1.852303290155542e-05, + "loss": 0.0045, + "num_input_tokens_seen": 23982880, + "step": 113640 + }, + { + "epoch": 12.502200220022003, + "grad_norm": 0.138014554977417, + "learning_rate": 1.8520714811065477e-05, + "loss": 0.0489, + "num_input_tokens_seen": 23983968, + "step": 113645 + }, + { + "epoch": 12.502750275027502, + "grad_norm": 0.02953839674592018, + "learning_rate": 1.8518396780291994e-05, + "loss": 0.0026, + "num_input_tokens_seen": 23984992, + "step": 113650 + }, + { + "epoch": 12.503300330033003, + "grad_norm": 0.13352419435977936, + "learning_rate": 1.8516078809256345e-05, + "loss": 0.1707, + "num_input_tokens_seen": 23986048, + "step": 113655 + }, + { + "epoch": 12.503850385038504, + "grad_norm": 0.05618375539779663, + "learning_rate": 1.8513760897979874e-05, + "loss": 0.0061, + "num_input_tokens_seen": 23987040, + "step": 113660 + }, + { + "epoch": 12.504400440044005, + "grad_norm": 0.13065873086452484, + "learning_rate": 1.8511443046483967e-05, + "loss": 0.0064, + "num_input_tokens_seen": 23988096, + "step": 113665 + }, + { + "epoch": 12.504950495049505, + "grad_norm": 0.013602317310869694, + "learning_rate": 1.850912525478997e-05, + "loss": 0.0021, + "num_input_tokens_seen": 23989088, + "step": 113670 + }, + { + "epoch": 12.505500550055006, + "grad_norm": 2.9222605228424072, + "learning_rate": 1.850680752291926e-05, + "loss": 0.0239, + "num_input_tokens_seen": 23990112, + "step": 113675 + }, + { + "epoch": 12.506050605060507, + "grad_norm": 0.20914246141910553, + "learning_rate": 1.850448985089318e-05, + "loss": 0.0068, + "num_input_tokens_seen": 23991136, + "step": 113680 + }, + { + "epoch": 12.506600660066006, + "grad_norm": 0.10242699086666107, + "learning_rate": 1.85021722387331e-05, + "loss": 0.0472, + "num_input_tokens_seen": 23992192, + "step": 113685 + }, + { + "epoch": 12.507150715071507, + "grad_norm": 0.03987769037485123, + "learning_rate": 1.8499854686460393e-05, + "loss": 0.0028, + "num_input_tokens_seen": 23993216, + "step": 113690 + }, + { + "epoch": 12.507700770077008, + "grad_norm": 2.3014602661132812, + "learning_rate": 1.8497537194096385e-05, + "loss": 0.1224, + "num_input_tokens_seen": 23994240, + "step": 113695 + }, + { + "epoch": 12.508250825082508, + "grad_norm": 0.017496248707175255, + "learning_rate": 1.8495219761662476e-05, + "loss": 0.0011, + "num_input_tokens_seen": 23995296, + "step": 113700 + }, + { + "epoch": 12.508800880088009, + "grad_norm": 0.008638747036457062, + "learning_rate": 1.8492902389179993e-05, + "loss": 0.0094, + "num_input_tokens_seen": 23996320, + "step": 113705 + }, + { + "epoch": 12.50935093509351, + "grad_norm": 0.1744728833436966, + "learning_rate": 1.849058507667031e-05, + "loss": 0.0377, + "num_input_tokens_seen": 23997312, + "step": 113710 + }, + { + "epoch": 12.509900990099009, + "grad_norm": 0.0752522349357605, + "learning_rate": 1.8488267824154787e-05, + "loss": 0.0019, + "num_input_tokens_seen": 23998464, + "step": 113715 + }, + { + "epoch": 12.51045104510451, + "grad_norm": 0.005991199519485235, + "learning_rate": 1.8485950631654768e-05, + "loss": 0.0447, + "num_input_tokens_seen": 23999456, + "step": 113720 + }, + { + "epoch": 12.511001100110011, + "grad_norm": 0.026249052956700325, + "learning_rate": 1.8483633499191618e-05, + "loss": 0.1062, + "num_input_tokens_seen": 24000512, + "step": 113725 + }, + { + "epoch": 12.511551155115512, + "grad_norm": 0.5449499487876892, + "learning_rate": 1.8481316426786694e-05, + "loss": 0.0132, + "num_input_tokens_seen": 24001568, + "step": 113730 + }, + { + "epoch": 12.512101210121012, + "grad_norm": 1.056717038154602, + "learning_rate": 1.847899941446134e-05, + "loss": 0.1102, + "num_input_tokens_seen": 24002656, + "step": 113735 + }, + { + "epoch": 12.512651265126513, + "grad_norm": 0.183928981423378, + "learning_rate": 1.8476682462236928e-05, + "loss": 0.0035, + "num_input_tokens_seen": 24003712, + "step": 113740 + }, + { + "epoch": 12.513201320132014, + "grad_norm": 0.04730607196688652, + "learning_rate": 1.8474365570134805e-05, + "loss": 0.0276, + "num_input_tokens_seen": 24004864, + "step": 113745 + }, + { + "epoch": 12.513751375137513, + "grad_norm": 0.4489598870277405, + "learning_rate": 1.847204873817631e-05, + "loss": 0.005, + "num_input_tokens_seen": 24005920, + "step": 113750 + }, + { + "epoch": 12.514301430143014, + "grad_norm": 0.11151622980833054, + "learning_rate": 1.8469731966382822e-05, + "loss": 0.0035, + "num_input_tokens_seen": 24006976, + "step": 113755 + }, + { + "epoch": 12.514851485148515, + "grad_norm": 0.02220333367586136, + "learning_rate": 1.8467415254775672e-05, + "loss": 0.0055, + "num_input_tokens_seen": 24008096, + "step": 113760 + }, + { + "epoch": 12.515401540154015, + "grad_norm": 0.3691350221633911, + "learning_rate": 1.8465098603376227e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24009120, + "step": 113765 + }, + { + "epoch": 12.515951595159516, + "grad_norm": 0.7791423797607422, + "learning_rate": 1.8462782012205836e-05, + "loss": 0.028, + "num_input_tokens_seen": 24010208, + "step": 113770 + }, + { + "epoch": 12.516501650165017, + "grad_norm": 0.0299359280616045, + "learning_rate": 1.8460465481285837e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24011200, + "step": 113775 + }, + { + "epoch": 12.517051705170516, + "grad_norm": 0.04401461407542229, + "learning_rate": 1.8458149010637595e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24012320, + "step": 113780 + }, + { + "epoch": 12.517601760176017, + "grad_norm": 0.103756844997406, + "learning_rate": 1.845583260028245e-05, + "loss": 0.0024, + "num_input_tokens_seen": 24013440, + "step": 113785 + }, + { + "epoch": 12.518151815181518, + "grad_norm": 0.012673113495111465, + "learning_rate": 1.8453516250241765e-05, + "loss": 0.0151, + "num_input_tokens_seen": 24014496, + "step": 113790 + }, + { + "epoch": 12.51870187018702, + "grad_norm": 0.0517859011888504, + "learning_rate": 1.8451199960536874e-05, + "loss": 0.0047, + "num_input_tokens_seen": 24015520, + "step": 113795 + }, + { + "epoch": 12.519251925192519, + "grad_norm": 0.27629461884498596, + "learning_rate": 1.8448883731189126e-05, + "loss": 0.006, + "num_input_tokens_seen": 24016672, + "step": 113800 + }, + { + "epoch": 12.51980198019802, + "grad_norm": 0.06102503463625908, + "learning_rate": 1.8446567562219884e-05, + "loss": 0.0204, + "num_input_tokens_seen": 24017728, + "step": 113805 + }, + { + "epoch": 12.520352035203521, + "grad_norm": 0.024310583248734474, + "learning_rate": 1.8444251453650475e-05, + "loss": 0.0138, + "num_input_tokens_seen": 24018752, + "step": 113810 + }, + { + "epoch": 12.52090209020902, + "grad_norm": 0.01559207309037447, + "learning_rate": 1.8441935405502257e-05, + "loss": 0.0024, + "num_input_tokens_seen": 24019744, + "step": 113815 + }, + { + "epoch": 12.521452145214521, + "grad_norm": 1.250430941581726, + "learning_rate": 1.8439619417796573e-05, + "loss": 0.0545, + "num_input_tokens_seen": 24020800, + "step": 113820 + }, + { + "epoch": 12.522002200220022, + "grad_norm": 0.014978417195379734, + "learning_rate": 1.8437303490554763e-05, + "loss": 0.0361, + "num_input_tokens_seen": 24021888, + "step": 113825 + }, + { + "epoch": 12.522552255225522, + "grad_norm": 0.014848335646092892, + "learning_rate": 1.8434987623798193e-05, + "loss": 0.0089, + "num_input_tokens_seen": 24022912, + "step": 113830 + }, + { + "epoch": 12.523102310231023, + "grad_norm": 0.4149784445762634, + "learning_rate": 1.843267181754818e-05, + "loss": 0.0086, + "num_input_tokens_seen": 24023968, + "step": 113835 + }, + { + "epoch": 12.523652365236524, + "grad_norm": 0.31496503949165344, + "learning_rate": 1.8430356071826082e-05, + "loss": 0.0302, + "num_input_tokens_seen": 24024992, + "step": 113840 + }, + { + "epoch": 12.524202420242025, + "grad_norm": 0.10929623246192932, + "learning_rate": 1.8428040386653244e-05, + "loss": 0.0287, + "num_input_tokens_seen": 24026016, + "step": 113845 + }, + { + "epoch": 12.524752475247524, + "grad_norm": 0.008380859158933163, + "learning_rate": 1.8425724762051e-05, + "loss": 0.0178, + "num_input_tokens_seen": 24027072, + "step": 113850 + }, + { + "epoch": 12.525302530253025, + "grad_norm": 0.08392161130905151, + "learning_rate": 1.8423409198040696e-05, + "loss": 0.0039, + "num_input_tokens_seen": 24028096, + "step": 113855 + }, + { + "epoch": 12.525852585258527, + "grad_norm": 0.020505523309111595, + "learning_rate": 1.842109369464367e-05, + "loss": 0.0141, + "num_input_tokens_seen": 24029088, + "step": 113860 + }, + { + "epoch": 12.526402640264026, + "grad_norm": 1.6285814046859741, + "learning_rate": 1.8418778251881273e-05, + "loss": 0.0229, + "num_input_tokens_seen": 24030208, + "step": 113865 + }, + { + "epoch": 12.526952695269527, + "grad_norm": 0.08262188732624054, + "learning_rate": 1.841646286977484e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24031328, + "step": 113870 + }, + { + "epoch": 12.527502750275028, + "grad_norm": 0.7768985629081726, + "learning_rate": 1.8414147548345703e-05, + "loss": 0.0333, + "num_input_tokens_seen": 24032416, + "step": 113875 + }, + { + "epoch": 12.528052805280527, + "grad_norm": 0.0038690068759024143, + "learning_rate": 1.841183228761521e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24033440, + "step": 113880 + }, + { + "epoch": 12.528602860286028, + "grad_norm": 1.5532817840576172, + "learning_rate": 1.8409517087604695e-05, + "loss": 0.062, + "num_input_tokens_seen": 24034592, + "step": 113885 + }, + { + "epoch": 12.52915291529153, + "grad_norm": 0.054025642573833466, + "learning_rate": 1.8407201948335508e-05, + "loss": 0.0408, + "num_input_tokens_seen": 24035680, + "step": 113890 + }, + { + "epoch": 12.52970297029703, + "grad_norm": 0.01640200801193714, + "learning_rate": 1.8404886869828974e-05, + "loss": 0.0068, + "num_input_tokens_seen": 24036704, + "step": 113895 + }, + { + "epoch": 12.53025302530253, + "grad_norm": 2.508552312850952, + "learning_rate": 1.8402571852106427e-05, + "loss": 0.0559, + "num_input_tokens_seen": 24037760, + "step": 113900 + }, + { + "epoch": 12.530803080308031, + "grad_norm": 0.054520487785339355, + "learning_rate": 1.8400256895189217e-05, + "loss": 0.0025, + "num_input_tokens_seen": 24038816, + "step": 113905 + }, + { + "epoch": 12.531353135313532, + "grad_norm": 0.013837341219186783, + "learning_rate": 1.839794199909867e-05, + "loss": 0.0121, + "num_input_tokens_seen": 24039872, + "step": 113910 + }, + { + "epoch": 12.531903190319031, + "grad_norm": 0.0327664278447628, + "learning_rate": 1.8395627163856116e-05, + "loss": 0.0908, + "num_input_tokens_seen": 24040992, + "step": 113915 + }, + { + "epoch": 12.532453245324533, + "grad_norm": 0.08634637296199799, + "learning_rate": 1.8393312389482912e-05, + "loss": 0.002, + "num_input_tokens_seen": 24042048, + "step": 113920 + }, + { + "epoch": 12.533003300330034, + "grad_norm": 0.021408963948488235, + "learning_rate": 1.8390997676000362e-05, + "loss": 0.021, + "num_input_tokens_seen": 24043136, + "step": 113925 + }, + { + "epoch": 12.533553355335533, + "grad_norm": 0.005049980711191893, + "learning_rate": 1.8388683023429825e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24044192, + "step": 113930 + }, + { + "epoch": 12.534103410341034, + "grad_norm": 0.14312204718589783, + "learning_rate": 1.838636843179263e-05, + "loss": 0.0038, + "num_input_tokens_seen": 24045248, + "step": 113935 + }, + { + "epoch": 12.534653465346535, + "grad_norm": 0.003729949938133359, + "learning_rate": 1.8384053901110082e-05, + "loss": 0.0273, + "num_input_tokens_seen": 24046368, + "step": 113940 + }, + { + "epoch": 12.535203520352034, + "grad_norm": 0.011109253391623497, + "learning_rate": 1.8381739431403554e-05, + "loss": 0.0137, + "num_input_tokens_seen": 24047392, + "step": 113945 + }, + { + "epoch": 12.535753575357536, + "grad_norm": 0.01539607159793377, + "learning_rate": 1.8379425022694348e-05, + "loss": 0.0022, + "num_input_tokens_seen": 24048544, + "step": 113950 + }, + { + "epoch": 12.536303630363037, + "grad_norm": 0.012874647974967957, + "learning_rate": 1.8377110675003806e-05, + "loss": 0.0087, + "num_input_tokens_seen": 24049632, + "step": 113955 + }, + { + "epoch": 12.536853685368538, + "grad_norm": 0.09394843131303787, + "learning_rate": 1.8374796388353264e-05, + "loss": 0.0533, + "num_input_tokens_seen": 24050688, + "step": 113960 + }, + { + "epoch": 12.537403740374037, + "grad_norm": 2.0859012603759766, + "learning_rate": 1.8372482162764033e-05, + "loss": 0.0369, + "num_input_tokens_seen": 24051680, + "step": 113965 + }, + { + "epoch": 12.537953795379538, + "grad_norm": 1.607842206954956, + "learning_rate": 1.8370167998257465e-05, + "loss": 0.0866, + "num_input_tokens_seen": 24052736, + "step": 113970 + }, + { + "epoch": 12.53850385038504, + "grad_norm": 4.319972515106201, + "learning_rate": 1.8367853894854868e-05, + "loss": 0.0907, + "num_input_tokens_seen": 24053760, + "step": 113975 + }, + { + "epoch": 12.539053905390539, + "grad_norm": 0.016209181398153305, + "learning_rate": 1.8365539852577587e-05, + "loss": 0.0527, + "num_input_tokens_seen": 24054720, + "step": 113980 + }, + { + "epoch": 12.53960396039604, + "grad_norm": 0.011808923445641994, + "learning_rate": 1.836322587144694e-05, + "loss": 0.0069, + "num_input_tokens_seen": 24055712, + "step": 113985 + }, + { + "epoch": 12.54015401540154, + "grad_norm": 0.003397237043827772, + "learning_rate": 1.836091195148425e-05, + "loss": 0.0771, + "num_input_tokens_seen": 24056800, + "step": 113990 + }, + { + "epoch": 12.54070407040704, + "grad_norm": 2.3898441791534424, + "learning_rate": 1.8358598092710858e-05, + "loss": 0.0875, + "num_input_tokens_seen": 24057824, + "step": 113995 + }, + { + "epoch": 12.541254125412541, + "grad_norm": 0.013379900716245174, + "learning_rate": 1.8356284295148067e-05, + "loss": 0.0026, + "num_input_tokens_seen": 24058880, + "step": 114000 + }, + { + "epoch": 12.541804180418042, + "grad_norm": 0.046897564083337784, + "learning_rate": 1.835397055881723e-05, + "loss": 0.0288, + "num_input_tokens_seen": 24059904, + "step": 114005 + }, + { + "epoch": 12.542354235423542, + "grad_norm": 0.005157775245606899, + "learning_rate": 1.8351656883739647e-05, + "loss": 0.0392, + "num_input_tokens_seen": 24060960, + "step": 114010 + }, + { + "epoch": 12.542904290429043, + "grad_norm": 0.029011158272624016, + "learning_rate": 1.8349343269936652e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24062048, + "step": 114015 + }, + { + "epoch": 12.543454345434544, + "grad_norm": 0.21482345461845398, + "learning_rate": 1.8347029717429583e-05, + "loss": 0.0076, + "num_input_tokens_seen": 24063168, + "step": 114020 + }, + { + "epoch": 12.544004400440045, + "grad_norm": 0.00965277198702097, + "learning_rate": 1.8344716226239733e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24064224, + "step": 114025 + }, + { + "epoch": 12.544554455445544, + "grad_norm": 0.041214510798454285, + "learning_rate": 1.8342402796388445e-05, + "loss": 0.0021, + "num_input_tokens_seen": 24065248, + "step": 114030 + }, + { + "epoch": 12.545104510451045, + "grad_norm": 0.03444405645132065, + "learning_rate": 1.8340089427897044e-05, + "loss": 0.0056, + "num_input_tokens_seen": 24066304, + "step": 114035 + }, + { + "epoch": 12.545654565456546, + "grad_norm": 0.9743524193763733, + "learning_rate": 1.833777612078683e-05, + "loss": 0.0203, + "num_input_tokens_seen": 24067360, + "step": 114040 + }, + { + "epoch": 12.546204620462046, + "grad_norm": 0.07059656828641891, + "learning_rate": 1.8335462875079147e-05, + "loss": 0.1133, + "num_input_tokens_seen": 24068416, + "step": 114045 + }, + { + "epoch": 12.546754675467547, + "grad_norm": 0.1701379418373108, + "learning_rate": 1.8333149690795294e-05, + "loss": 0.0766, + "num_input_tokens_seen": 24069504, + "step": 114050 + }, + { + "epoch": 12.547304730473048, + "grad_norm": 1.6655254364013672, + "learning_rate": 1.8330836567956614e-05, + "loss": 0.0792, + "num_input_tokens_seen": 24070624, + "step": 114055 + }, + { + "epoch": 12.547854785478547, + "grad_norm": 0.05210261046886444, + "learning_rate": 1.8328523506584404e-05, + "loss": 0.0064, + "num_input_tokens_seen": 24071648, + "step": 114060 + }, + { + "epoch": 12.548404840484048, + "grad_norm": 0.06558574736118317, + "learning_rate": 1.8326210506699993e-05, + "loss": 0.0056, + "num_input_tokens_seen": 24072736, + "step": 114065 + }, + { + "epoch": 12.54895489548955, + "grad_norm": 0.012758993543684483, + "learning_rate": 1.83238975683247e-05, + "loss": 0.0106, + "num_input_tokens_seen": 24073856, + "step": 114070 + }, + { + "epoch": 12.549504950495049, + "grad_norm": 0.012155258096754551, + "learning_rate": 1.8321584691479833e-05, + "loss": 0.0095, + "num_input_tokens_seen": 24074912, + "step": 114075 + }, + { + "epoch": 12.55005500550055, + "grad_norm": 0.03243435174226761, + "learning_rate": 1.8319271876186723e-05, + "loss": 0.0114, + "num_input_tokens_seen": 24075968, + "step": 114080 + }, + { + "epoch": 12.55060506050605, + "grad_norm": 0.6881588697433472, + "learning_rate": 1.831695912246668e-05, + "loss": 0.0308, + "num_input_tokens_seen": 24076992, + "step": 114085 + }, + { + "epoch": 12.551155115511552, + "grad_norm": 0.01518958155065775, + "learning_rate": 1.831464643034101e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24077984, + "step": 114090 + }, + { + "epoch": 12.551705170517051, + "grad_norm": 0.03017381578683853, + "learning_rate": 1.8312333799831045e-05, + "loss": 0.0015, + "num_input_tokens_seen": 24079072, + "step": 114095 + }, + { + "epoch": 12.552255225522552, + "grad_norm": 1.0409736633300781, + "learning_rate": 1.8310021230958085e-05, + "loss": 0.0257, + "num_input_tokens_seen": 24080160, + "step": 114100 + }, + { + "epoch": 12.552805280528053, + "grad_norm": 0.01806940883398056, + "learning_rate": 1.8307708723743446e-05, + "loss": 0.1063, + "num_input_tokens_seen": 24081216, + "step": 114105 + }, + { + "epoch": 12.553355335533553, + "grad_norm": 0.04061044007539749, + "learning_rate": 1.8305396278208454e-05, + "loss": 0.0706, + "num_input_tokens_seen": 24082240, + "step": 114110 + }, + { + "epoch": 12.553905390539054, + "grad_norm": 0.005994291510432959, + "learning_rate": 1.8303083894374392e-05, + "loss": 0.0603, + "num_input_tokens_seen": 24083264, + "step": 114115 + }, + { + "epoch": 12.554455445544555, + "grad_norm": 0.029989616945385933, + "learning_rate": 1.8300771572262615e-05, + "loss": 0.0384, + "num_input_tokens_seen": 24084288, + "step": 114120 + }, + { + "epoch": 12.555005500550054, + "grad_norm": 0.019854426383972168, + "learning_rate": 1.82984593118944e-05, + "loss": 0.1024, + "num_input_tokens_seen": 24085344, + "step": 114125 + }, + { + "epoch": 12.555555555555555, + "grad_norm": 0.02169816754758358, + "learning_rate": 1.8296147113291068e-05, + "loss": 0.0157, + "num_input_tokens_seen": 24086400, + "step": 114130 + }, + { + "epoch": 12.556105610561056, + "grad_norm": 0.022369569167494774, + "learning_rate": 1.8293834976473944e-05, + "loss": 0.1496, + "num_input_tokens_seen": 24087424, + "step": 114135 + }, + { + "epoch": 12.556655665566556, + "grad_norm": 0.3510085344314575, + "learning_rate": 1.829152290146431e-05, + "loss": 0.0524, + "num_input_tokens_seen": 24088480, + "step": 114140 + }, + { + "epoch": 12.557205720572057, + "grad_norm": 2.0648412704467773, + "learning_rate": 1.8289210888283498e-05, + "loss": 0.0833, + "num_input_tokens_seen": 24089472, + "step": 114145 + }, + { + "epoch": 12.557755775577558, + "grad_norm": 0.010651815682649612, + "learning_rate": 1.8286898936952817e-05, + "loss": 0.0862, + "num_input_tokens_seen": 24090464, + "step": 114150 + }, + { + "epoch": 12.558305830583059, + "grad_norm": 0.6179704666137695, + "learning_rate": 1.828458704749355e-05, + "loss": 0.0061, + "num_input_tokens_seen": 24091456, + "step": 114155 + }, + { + "epoch": 12.558855885588558, + "grad_norm": 0.12232392281293869, + "learning_rate": 1.8282275219927034e-05, + "loss": 0.0064, + "num_input_tokens_seen": 24092544, + "step": 114160 + }, + { + "epoch": 12.55940594059406, + "grad_norm": 0.17190487682819366, + "learning_rate": 1.8279963454274557e-05, + "loss": 0.0813, + "num_input_tokens_seen": 24093536, + "step": 114165 + }, + { + "epoch": 12.55995599559956, + "grad_norm": 0.07646667957305908, + "learning_rate": 1.827765175055744e-05, + "loss": 0.0981, + "num_input_tokens_seen": 24094624, + "step": 114170 + }, + { + "epoch": 12.56050605060506, + "grad_norm": 0.03919572755694389, + "learning_rate": 1.8275340108796978e-05, + "loss": 0.0912, + "num_input_tokens_seen": 24095680, + "step": 114175 + }, + { + "epoch": 12.561056105610561, + "grad_norm": 0.11777619272470474, + "learning_rate": 1.8273028529014474e-05, + "loss": 0.0034, + "num_input_tokens_seen": 24096736, + "step": 114180 + }, + { + "epoch": 12.561606160616062, + "grad_norm": 0.0808170959353447, + "learning_rate": 1.827071701123124e-05, + "loss": 0.0039, + "num_input_tokens_seen": 24097728, + "step": 114185 + }, + { + "epoch": 12.562156215621561, + "grad_norm": 0.053592268377542496, + "learning_rate": 1.8268405555468577e-05, + "loss": 0.0132, + "num_input_tokens_seen": 24098816, + "step": 114190 + }, + { + "epoch": 12.562706270627062, + "grad_norm": 1.8144153356552124, + "learning_rate": 1.82660941617478e-05, + "loss": 0.0503, + "num_input_tokens_seen": 24099936, + "step": 114195 + }, + { + "epoch": 12.563256325632564, + "grad_norm": 0.12192433327436447, + "learning_rate": 1.8263782830090195e-05, + "loss": 0.0634, + "num_input_tokens_seen": 24101056, + "step": 114200 + }, + { + "epoch": 12.563806380638063, + "grad_norm": 0.15111052989959717, + "learning_rate": 1.826147156051707e-05, + "loss": 0.0064, + "num_input_tokens_seen": 24102112, + "step": 114205 + }, + { + "epoch": 12.564356435643564, + "grad_norm": 0.055816445499658585, + "learning_rate": 1.8259160353049733e-05, + "loss": 0.0342, + "num_input_tokens_seen": 24103136, + "step": 114210 + }, + { + "epoch": 12.564906490649065, + "grad_norm": 3.057920217514038, + "learning_rate": 1.825684920770947e-05, + "loss": 0.065, + "num_input_tokens_seen": 24104192, + "step": 114215 + }, + { + "epoch": 12.565456545654566, + "grad_norm": 0.09012199938297272, + "learning_rate": 1.82545381245176e-05, + "loss": 0.0038, + "num_input_tokens_seen": 24105344, + "step": 114220 + }, + { + "epoch": 12.566006600660065, + "grad_norm": 0.028583763167262077, + "learning_rate": 1.8252227103495424e-05, + "loss": 0.0068, + "num_input_tokens_seen": 24106400, + "step": 114225 + }, + { + "epoch": 12.566556655665567, + "grad_norm": 0.027046676725149155, + "learning_rate": 1.824991614466422e-05, + "loss": 0.0017, + "num_input_tokens_seen": 24107456, + "step": 114230 + }, + { + "epoch": 12.567106710671068, + "grad_norm": 0.022002551704645157, + "learning_rate": 1.8247605248045303e-05, + "loss": 0.0428, + "num_input_tokens_seen": 24108576, + "step": 114235 + }, + { + "epoch": 12.567656765676567, + "grad_norm": 1.7108368873596191, + "learning_rate": 1.8245294413659964e-05, + "loss": 0.0446, + "num_input_tokens_seen": 24109600, + "step": 114240 + }, + { + "epoch": 12.568206820682068, + "grad_norm": 0.034256160259246826, + "learning_rate": 1.8242983641529508e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24110656, + "step": 114245 + }, + { + "epoch": 12.56875687568757, + "grad_norm": 0.04285002872347832, + "learning_rate": 1.824067293167524e-05, + "loss": 0.0102, + "num_input_tokens_seen": 24111712, + "step": 114250 + }, + { + "epoch": 12.569306930693068, + "grad_norm": 0.08283250033855438, + "learning_rate": 1.8238362284118433e-05, + "loss": 0.0161, + "num_input_tokens_seen": 24112832, + "step": 114255 + }, + { + "epoch": 12.56985698569857, + "grad_norm": 0.028166458010673523, + "learning_rate": 1.82360516988804e-05, + "loss": 0.061, + "num_input_tokens_seen": 24113888, + "step": 114260 + }, + { + "epoch": 12.57040704070407, + "grad_norm": 0.16271422803401947, + "learning_rate": 1.8233741175982428e-05, + "loss": 0.0314, + "num_input_tokens_seen": 24115008, + "step": 114265 + }, + { + "epoch": 12.570957095709572, + "grad_norm": 0.009317801333963871, + "learning_rate": 1.823143071544583e-05, + "loss": 0.0724, + "num_input_tokens_seen": 24116032, + "step": 114270 + }, + { + "epoch": 12.571507150715071, + "grad_norm": 0.0332842655479908, + "learning_rate": 1.8229120317291877e-05, + "loss": 0.075, + "num_input_tokens_seen": 24117088, + "step": 114275 + }, + { + "epoch": 12.572057205720572, + "grad_norm": 0.36095091700553894, + "learning_rate": 1.8226809981541872e-05, + "loss": 0.0093, + "num_input_tokens_seen": 24118112, + "step": 114280 + }, + { + "epoch": 12.572607260726073, + "grad_norm": 0.04478329047560692, + "learning_rate": 1.8224499708217117e-05, + "loss": 0.0112, + "num_input_tokens_seen": 24119168, + "step": 114285 + }, + { + "epoch": 12.573157315731573, + "grad_norm": 1.2988996505737305, + "learning_rate": 1.8222189497338895e-05, + "loss": 0.1433, + "num_input_tokens_seen": 24120160, + "step": 114290 + }, + { + "epoch": 12.573707370737074, + "grad_norm": 0.04279254004359245, + "learning_rate": 1.821987934892849e-05, + "loss": 0.0011, + "num_input_tokens_seen": 24121248, + "step": 114295 + }, + { + "epoch": 12.574257425742575, + "grad_norm": 0.035706061869859695, + "learning_rate": 1.821756926300722e-05, + "loss": 0.0388, + "num_input_tokens_seen": 24122240, + "step": 114300 + }, + { + "epoch": 12.574807480748074, + "grad_norm": 0.08981942385435104, + "learning_rate": 1.8215259239596343e-05, + "loss": 0.0149, + "num_input_tokens_seen": 24123360, + "step": 114305 + }, + { + "epoch": 12.575357535753575, + "grad_norm": 0.04383710399270058, + "learning_rate": 1.8212949278717178e-05, + "loss": 0.0052, + "num_input_tokens_seen": 24124448, + "step": 114310 + }, + { + "epoch": 12.575907590759076, + "grad_norm": 0.5220504999160767, + "learning_rate": 1.8210639380391002e-05, + "loss": 0.0635, + "num_input_tokens_seen": 24125568, + "step": 114315 + }, + { + "epoch": 12.576457645764577, + "grad_norm": 0.14199034869670868, + "learning_rate": 1.8208329544639098e-05, + "loss": 0.0051, + "num_input_tokens_seen": 24126624, + "step": 114320 + }, + { + "epoch": 12.577007700770077, + "grad_norm": 0.06450343132019043, + "learning_rate": 1.8206019771482772e-05, + "loss": 0.0718, + "num_input_tokens_seen": 24127648, + "step": 114325 + }, + { + "epoch": 12.577557755775578, + "grad_norm": 0.0104566290974617, + "learning_rate": 1.820371006094329e-05, + "loss": 0.0117, + "num_input_tokens_seen": 24128736, + "step": 114330 + }, + { + "epoch": 12.578107810781079, + "grad_norm": 0.03681085631251335, + "learning_rate": 1.8201400413041957e-05, + "loss": 0.045, + "num_input_tokens_seen": 24129728, + "step": 114335 + }, + { + "epoch": 12.578657865786578, + "grad_norm": 0.28735169768333435, + "learning_rate": 1.8199090827800057e-05, + "loss": 0.0207, + "num_input_tokens_seen": 24130816, + "step": 114340 + }, + { + "epoch": 12.57920792079208, + "grad_norm": 0.03149935603141785, + "learning_rate": 1.819678130523887e-05, + "loss": 0.0216, + "num_input_tokens_seen": 24131840, + "step": 114345 + }, + { + "epoch": 12.57975797579758, + "grad_norm": 1.7733399868011475, + "learning_rate": 1.8194471845379685e-05, + "loss": 0.0482, + "num_input_tokens_seen": 24132896, + "step": 114350 + }, + { + "epoch": 12.58030803080308, + "grad_norm": 0.8313104510307312, + "learning_rate": 1.8192162448243782e-05, + "loss": 0.0093, + "num_input_tokens_seen": 24133952, + "step": 114355 + }, + { + "epoch": 12.58085808580858, + "grad_norm": 0.5249998569488525, + "learning_rate": 1.8189853113852458e-05, + "loss": 0.0061, + "num_input_tokens_seen": 24135008, + "step": 114360 + }, + { + "epoch": 12.581408140814082, + "grad_norm": 0.07164414972066879, + "learning_rate": 1.8187543842226993e-05, + "loss": 0.0098, + "num_input_tokens_seen": 24136128, + "step": 114365 + }, + { + "epoch": 12.581958195819581, + "grad_norm": 0.010026929900050163, + "learning_rate": 1.818523463338866e-05, + "loss": 0.0209, + "num_input_tokens_seen": 24137216, + "step": 114370 + }, + { + "epoch": 12.582508250825082, + "grad_norm": 0.0572495236992836, + "learning_rate": 1.818292548735875e-05, + "loss": 0.0024, + "num_input_tokens_seen": 24138208, + "step": 114375 + }, + { + "epoch": 12.583058305830583, + "grad_norm": 0.013836841098964214, + "learning_rate": 1.8180616404158546e-05, + "loss": 0.0028, + "num_input_tokens_seen": 24139264, + "step": 114380 + }, + { + "epoch": 12.583608360836084, + "grad_norm": 0.30461037158966064, + "learning_rate": 1.8178307383809335e-05, + "loss": 0.0119, + "num_input_tokens_seen": 24140288, + "step": 114385 + }, + { + "epoch": 12.584158415841584, + "grad_norm": 0.3759044110774994, + "learning_rate": 1.8175998426332385e-05, + "loss": 0.0065, + "num_input_tokens_seen": 24141408, + "step": 114390 + }, + { + "epoch": 12.584708470847085, + "grad_norm": 0.166439026594162, + "learning_rate": 1.8173689531748977e-05, + "loss": 0.0028, + "num_input_tokens_seen": 24142464, + "step": 114395 + }, + { + "epoch": 12.585258525852586, + "grad_norm": 0.05735420808196068, + "learning_rate": 1.817138070008041e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24143488, + "step": 114400 + }, + { + "epoch": 12.585808580858085, + "grad_norm": 0.012091557495296001, + "learning_rate": 1.816907193134794e-05, + "loss": 0.0575, + "num_input_tokens_seen": 24144608, + "step": 114405 + }, + { + "epoch": 12.586358635863586, + "grad_norm": 0.03531729802489281, + "learning_rate": 1.8166763225572862e-05, + "loss": 0.008, + "num_input_tokens_seen": 24145728, + "step": 114410 + }, + { + "epoch": 12.586908690869087, + "grad_norm": 0.010661713778972626, + "learning_rate": 1.816445458277645e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24146720, + "step": 114415 + }, + { + "epoch": 12.587458745874587, + "grad_norm": 0.38583478331565857, + "learning_rate": 1.816214600297997e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24147776, + "step": 114420 + }, + { + "epoch": 12.588008800880088, + "grad_norm": 0.011081326752901077, + "learning_rate": 1.815983748620472e-05, + "loss": 0.007, + "num_input_tokens_seen": 24148864, + "step": 114425 + }, + { + "epoch": 12.588558855885589, + "grad_norm": 0.047576967626810074, + "learning_rate": 1.815752903247196e-05, + "loss": 0.0308, + "num_input_tokens_seen": 24149888, + "step": 114430 + }, + { + "epoch": 12.589108910891088, + "grad_norm": 1.1046067476272583, + "learning_rate": 1.8155220641802973e-05, + "loss": 0.0672, + "num_input_tokens_seen": 24150880, + "step": 114435 + }, + { + "epoch": 12.58965896589659, + "grad_norm": 0.041274379938840866, + "learning_rate": 1.815291231421904e-05, + "loss": 0.1027, + "num_input_tokens_seen": 24151968, + "step": 114440 + }, + { + "epoch": 12.59020902090209, + "grad_norm": 1.5013203620910645, + "learning_rate": 1.8150604049741417e-05, + "loss": 0.1016, + "num_input_tokens_seen": 24152992, + "step": 114445 + }, + { + "epoch": 12.590759075907592, + "grad_norm": 3.054410934448242, + "learning_rate": 1.8148295848391395e-05, + "loss": 0.0665, + "num_input_tokens_seen": 24154080, + "step": 114450 + }, + { + "epoch": 12.591309130913091, + "grad_norm": 2.260035276412964, + "learning_rate": 1.8145987710190238e-05, + "loss": 0.0817, + "num_input_tokens_seen": 24155136, + "step": 114455 + }, + { + "epoch": 12.591859185918592, + "grad_norm": 0.06990143656730652, + "learning_rate": 1.8143679635159232e-05, + "loss": 0.0075, + "num_input_tokens_seen": 24156224, + "step": 114460 + }, + { + "epoch": 12.592409240924093, + "grad_norm": 4.353559494018555, + "learning_rate": 1.814137162331964e-05, + "loss": 0.0859, + "num_input_tokens_seen": 24157344, + "step": 114465 + }, + { + "epoch": 12.592959295929592, + "grad_norm": 0.014553777873516083, + "learning_rate": 1.8139063674692723e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24158400, + "step": 114470 + }, + { + "epoch": 12.593509350935093, + "grad_norm": 0.025617098435759544, + "learning_rate": 1.813675578929978e-05, + "loss": 0.001, + "num_input_tokens_seen": 24159456, + "step": 114475 + }, + { + "epoch": 12.594059405940595, + "grad_norm": 1.6519477367401123, + "learning_rate": 1.813444796716205e-05, + "loss": 0.0529, + "num_input_tokens_seen": 24160480, + "step": 114480 + }, + { + "epoch": 12.594609460946094, + "grad_norm": 1.6518042087554932, + "learning_rate": 1.813214020830083e-05, + "loss": 0.0815, + "num_input_tokens_seen": 24161536, + "step": 114485 + }, + { + "epoch": 12.595159515951595, + "grad_norm": 0.08912354707717896, + "learning_rate": 1.8129832512737376e-05, + "loss": 0.0688, + "num_input_tokens_seen": 24162624, + "step": 114490 + }, + { + "epoch": 12.595709570957096, + "grad_norm": 0.0759839117527008, + "learning_rate": 1.8127524880492952e-05, + "loss": 0.0021, + "num_input_tokens_seen": 24163680, + "step": 114495 + }, + { + "epoch": 12.596259625962595, + "grad_norm": 0.026729809120297432, + "learning_rate": 1.8125217311588846e-05, + "loss": 0.083, + "num_input_tokens_seen": 24164672, + "step": 114500 + }, + { + "epoch": 12.596809680968097, + "grad_norm": 0.43335849046707153, + "learning_rate": 1.8122909806046307e-05, + "loss": 0.0065, + "num_input_tokens_seen": 24165696, + "step": 114505 + }, + { + "epoch": 12.597359735973598, + "grad_norm": 0.31669697165489197, + "learning_rate": 1.81206023638866e-05, + "loss": 0.0039, + "num_input_tokens_seen": 24166784, + "step": 114510 + }, + { + "epoch": 12.597909790979099, + "grad_norm": 0.0187861155718565, + "learning_rate": 1.811829498513102e-05, + "loss": 0.0225, + "num_input_tokens_seen": 24167872, + "step": 114515 + }, + { + "epoch": 12.598459845984598, + "grad_norm": 0.0501592755317688, + "learning_rate": 1.8115987669800795e-05, + "loss": 0.037, + "num_input_tokens_seen": 24168960, + "step": 114520 + }, + { + "epoch": 12.599009900990099, + "grad_norm": 0.03618689626455307, + "learning_rate": 1.811368041791721e-05, + "loss": 0.0539, + "num_input_tokens_seen": 24169952, + "step": 114525 + }, + { + "epoch": 12.5995599559956, + "grad_norm": 0.006473650690168142, + "learning_rate": 1.8111373229501538e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24171008, + "step": 114530 + }, + { + "epoch": 12.6001100110011, + "grad_norm": 0.013398510403931141, + "learning_rate": 1.8109066104575023e-05, + "loss": 0.0231, + "num_input_tokens_seen": 24172160, + "step": 114535 + }, + { + "epoch": 12.6006600660066, + "grad_norm": 0.1749749779701233, + "learning_rate": 1.8106759043158943e-05, + "loss": 0.0066, + "num_input_tokens_seen": 24173216, + "step": 114540 + }, + { + "epoch": 12.601210121012102, + "grad_norm": 0.04021377116441727, + "learning_rate": 1.8104452045274556e-05, + "loss": 0.0042, + "num_input_tokens_seen": 24174272, + "step": 114545 + }, + { + "epoch": 12.601760176017601, + "grad_norm": 0.013869086280465126, + "learning_rate": 1.810214511094313e-05, + "loss": 0.0086, + "num_input_tokens_seen": 24175360, + "step": 114550 + }, + { + "epoch": 12.602310231023102, + "grad_norm": 0.7940022349357605, + "learning_rate": 1.8099838240185923e-05, + "loss": 0.0568, + "num_input_tokens_seen": 24176448, + "step": 114555 + }, + { + "epoch": 12.602860286028603, + "grad_norm": 0.009507793001830578, + "learning_rate": 1.809753143302419e-05, + "loss": 0.0129, + "num_input_tokens_seen": 24177504, + "step": 114560 + }, + { + "epoch": 12.603410341034103, + "grad_norm": 0.009057764895260334, + "learning_rate": 1.80952246894792e-05, + "loss": 0.0079, + "num_input_tokens_seen": 24178560, + "step": 114565 + }, + { + "epoch": 12.603960396039604, + "grad_norm": 0.15868988633155823, + "learning_rate": 1.8092918009572208e-05, + "loss": 0.0101, + "num_input_tokens_seen": 24179616, + "step": 114570 + }, + { + "epoch": 12.604510451045105, + "grad_norm": 0.9610435366630554, + "learning_rate": 1.8090611393324485e-05, + "loss": 0.0295, + "num_input_tokens_seen": 24180768, + "step": 114575 + }, + { + "epoch": 12.605060506050606, + "grad_norm": 0.041332218796014786, + "learning_rate": 1.8088304840757275e-05, + "loss": 0.0487, + "num_input_tokens_seen": 24181856, + "step": 114580 + }, + { + "epoch": 12.605610561056105, + "grad_norm": 0.010948209092020988, + "learning_rate": 1.8085998351891842e-05, + "loss": 0.0012, + "num_input_tokens_seen": 24182880, + "step": 114585 + }, + { + "epoch": 12.606160616061606, + "grad_norm": 0.49854961037635803, + "learning_rate": 1.808369192674945e-05, + "loss": 0.0686, + "num_input_tokens_seen": 24183904, + "step": 114590 + }, + { + "epoch": 12.606710671067107, + "grad_norm": 0.09002776443958282, + "learning_rate": 1.8081385565351343e-05, + "loss": 0.1085, + "num_input_tokens_seen": 24184992, + "step": 114595 + }, + { + "epoch": 12.607260726072607, + "grad_norm": 1.9051066637039185, + "learning_rate": 1.807907926771879e-05, + "loss": 0.0387, + "num_input_tokens_seen": 24186080, + "step": 114600 + }, + { + "epoch": 12.607810781078108, + "grad_norm": 3.7960877418518066, + "learning_rate": 1.8076773033873046e-05, + "loss": 0.1355, + "num_input_tokens_seen": 24187136, + "step": 114605 + }, + { + "epoch": 12.608360836083609, + "grad_norm": 0.0194424856454134, + "learning_rate": 1.8074466863835355e-05, + "loss": 0.0045, + "num_input_tokens_seen": 24188192, + "step": 114610 + }, + { + "epoch": 12.608910891089108, + "grad_norm": 0.019132178276777267, + "learning_rate": 1.8072160757626987e-05, + "loss": 0.002, + "num_input_tokens_seen": 24189216, + "step": 114615 + }, + { + "epoch": 12.60946094609461, + "grad_norm": 1.785589337348938, + "learning_rate": 1.806985471526918e-05, + "loss": 0.0297, + "num_input_tokens_seen": 24190272, + "step": 114620 + }, + { + "epoch": 12.61001100110011, + "grad_norm": 0.016688385978341103, + "learning_rate": 1.8067548736783204e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24191328, + "step": 114625 + }, + { + "epoch": 12.61056105610561, + "grad_norm": 0.30011242628097534, + "learning_rate": 1.806524282219031e-05, + "loss": 0.023, + "num_input_tokens_seen": 24192384, + "step": 114630 + }, + { + "epoch": 12.61111111111111, + "grad_norm": 0.008864756673574448, + "learning_rate": 1.8062936971511735e-05, + "loss": 0.0077, + "num_input_tokens_seen": 24193440, + "step": 114635 + }, + { + "epoch": 12.611661166116612, + "grad_norm": 0.11122319847345352, + "learning_rate": 1.8060631184768744e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24194528, + "step": 114640 + }, + { + "epoch": 12.612211221122113, + "grad_norm": 0.008704468607902527, + "learning_rate": 1.8058325461982582e-05, + "loss": 0.0168, + "num_input_tokens_seen": 24195616, + "step": 114645 + }, + { + "epoch": 12.612761276127612, + "grad_norm": 0.04704107716679573, + "learning_rate": 1.8056019803174513e-05, + "loss": 0.0084, + "num_input_tokens_seen": 24196704, + "step": 114650 + }, + { + "epoch": 12.613311331133113, + "grad_norm": 0.1239408403635025, + "learning_rate": 1.8053714208365773e-05, + "loss": 0.0104, + "num_input_tokens_seen": 24197760, + "step": 114655 + }, + { + "epoch": 12.613861386138614, + "grad_norm": 0.06646318733692169, + "learning_rate": 1.805140867757761e-05, + "loss": 0.0317, + "num_input_tokens_seen": 24198784, + "step": 114660 + }, + { + "epoch": 12.614411441144114, + "grad_norm": 0.026357393711805344, + "learning_rate": 1.804910321083128e-05, + "loss": 0.0017, + "num_input_tokens_seen": 24199872, + "step": 114665 + }, + { + "epoch": 12.614961496149615, + "grad_norm": 0.05678282305598259, + "learning_rate": 1.804679780814803e-05, + "loss": 0.01, + "num_input_tokens_seen": 24201024, + "step": 114670 + }, + { + "epoch": 12.615511551155116, + "grad_norm": 0.029307691380381584, + "learning_rate": 1.804449246954912e-05, + "loss": 0.0065, + "num_input_tokens_seen": 24202112, + "step": 114675 + }, + { + "epoch": 12.616061606160617, + "grad_norm": 0.026649706065654755, + "learning_rate": 1.8042187195055776e-05, + "loss": 0.0082, + "num_input_tokens_seen": 24203200, + "step": 114680 + }, + { + "epoch": 12.616611661166116, + "grad_norm": 1.5373163223266602, + "learning_rate": 1.8039881984689253e-05, + "loss": 0.0419, + "num_input_tokens_seen": 24204224, + "step": 114685 + }, + { + "epoch": 12.617161716171617, + "grad_norm": 0.007258086930960417, + "learning_rate": 1.803757683847081e-05, + "loss": 0.015, + "num_input_tokens_seen": 24205312, + "step": 114690 + }, + { + "epoch": 12.617711771177119, + "grad_norm": 0.06641701608896255, + "learning_rate": 1.8035271756421666e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24206432, + "step": 114695 + }, + { + "epoch": 12.618261826182618, + "grad_norm": 2.144305944442749, + "learning_rate": 1.8032966738563086e-05, + "loss": 0.1352, + "num_input_tokens_seen": 24207520, + "step": 114700 + }, + { + "epoch": 12.618811881188119, + "grad_norm": 0.8886732459068298, + "learning_rate": 1.8030661784916308e-05, + "loss": 0.0557, + "num_input_tokens_seen": 24208576, + "step": 114705 + }, + { + "epoch": 12.61936193619362, + "grad_norm": 2.6461029052734375, + "learning_rate": 1.8028356895502573e-05, + "loss": 0.1141, + "num_input_tokens_seen": 24209696, + "step": 114710 + }, + { + "epoch": 12.61991199119912, + "grad_norm": 0.015074132941663265, + "learning_rate": 1.802605207034313e-05, + "loss": 0.0309, + "num_input_tokens_seen": 24210752, + "step": 114715 + }, + { + "epoch": 12.62046204620462, + "grad_norm": 2.03307843208313, + "learning_rate": 1.8023747309459225e-05, + "loss": 0.0281, + "num_input_tokens_seen": 24211712, + "step": 114720 + }, + { + "epoch": 12.621012101210122, + "grad_norm": 0.0097417701035738, + "learning_rate": 1.8021442612872076e-05, + "loss": 0.0028, + "num_input_tokens_seen": 24212832, + "step": 114725 + }, + { + "epoch": 12.62156215621562, + "grad_norm": 0.015091360546648502, + "learning_rate": 1.801913798060296e-05, + "loss": 0.0033, + "num_input_tokens_seen": 24213920, + "step": 114730 + }, + { + "epoch": 12.622112211221122, + "grad_norm": 0.05483239144086838, + "learning_rate": 1.8016833412673087e-05, + "loss": 0.0037, + "num_input_tokens_seen": 24214944, + "step": 114735 + }, + { + "epoch": 12.622662266226623, + "grad_norm": 1.2570011615753174, + "learning_rate": 1.8014528909103716e-05, + "loss": 0.0151, + "num_input_tokens_seen": 24215936, + "step": 114740 + }, + { + "epoch": 12.623212321232124, + "grad_norm": 0.01383251789957285, + "learning_rate": 1.8012224469916084e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24217024, + "step": 114745 + }, + { + "epoch": 12.623762376237623, + "grad_norm": 0.09171174466609955, + "learning_rate": 1.8009920095131415e-05, + "loss": 0.007, + "num_input_tokens_seen": 24218048, + "step": 114750 + }, + { + "epoch": 12.624312431243125, + "grad_norm": 0.020242897793650627, + "learning_rate": 1.8007615784770965e-05, + "loss": 0.0055, + "num_input_tokens_seen": 24219072, + "step": 114755 + }, + { + "epoch": 12.624862486248626, + "grad_norm": 0.05513587221503258, + "learning_rate": 1.8005311538855963e-05, + "loss": 0.0975, + "num_input_tokens_seen": 24220160, + "step": 114760 + }, + { + "epoch": 12.625412541254125, + "grad_norm": 0.09679923951625824, + "learning_rate": 1.8003007357407652e-05, + "loss": 0.0068, + "num_input_tokens_seen": 24221248, + "step": 114765 + }, + { + "epoch": 12.625962596259626, + "grad_norm": 0.02392997033894062, + "learning_rate": 1.8000703240447263e-05, + "loss": 0.0021, + "num_input_tokens_seen": 24222304, + "step": 114770 + }, + { + "epoch": 12.626512651265127, + "grad_norm": 0.05441474914550781, + "learning_rate": 1.799839918799603e-05, + "loss": 0.0025, + "num_input_tokens_seen": 24223328, + "step": 114775 + }, + { + "epoch": 12.627062706270626, + "grad_norm": 0.15437085926532745, + "learning_rate": 1.79960952000752e-05, + "loss": 0.0704, + "num_input_tokens_seen": 24224416, + "step": 114780 + }, + { + "epoch": 12.627612761276128, + "grad_norm": 0.06645704805850983, + "learning_rate": 1.7993791276705985e-05, + "loss": 0.0053, + "num_input_tokens_seen": 24225472, + "step": 114785 + }, + { + "epoch": 12.628162816281629, + "grad_norm": 0.01137420255690813, + "learning_rate": 1.7991487417909647e-05, + "loss": 0.0089, + "num_input_tokens_seen": 24226560, + "step": 114790 + }, + { + "epoch": 12.628712871287128, + "grad_norm": 0.24401716887950897, + "learning_rate": 1.7989183623707406e-05, + "loss": 0.0221, + "num_input_tokens_seen": 24227648, + "step": 114795 + }, + { + "epoch": 12.629262926292629, + "grad_norm": 0.23352371156215668, + "learning_rate": 1.7986879894120486e-05, + "loss": 0.0731, + "num_input_tokens_seen": 24228672, + "step": 114800 + }, + { + "epoch": 12.62981298129813, + "grad_norm": 0.10304965823888779, + "learning_rate": 1.7984576229170142e-05, + "loss": 0.0943, + "num_input_tokens_seen": 24229664, + "step": 114805 + }, + { + "epoch": 12.630363036303631, + "grad_norm": 5.406233787536621, + "learning_rate": 1.798227262887758e-05, + "loss": 0.0375, + "num_input_tokens_seen": 24230720, + "step": 114810 + }, + { + "epoch": 12.63091309130913, + "grad_norm": 0.01037838589400053, + "learning_rate": 1.7979969093264053e-05, + "loss": 0.0032, + "num_input_tokens_seen": 24231776, + "step": 114815 + }, + { + "epoch": 12.631463146314632, + "grad_norm": 0.0077848974615335464, + "learning_rate": 1.797766562235078e-05, + "loss": 0.0045, + "num_input_tokens_seen": 24232832, + "step": 114820 + }, + { + "epoch": 12.632013201320133, + "grad_norm": 0.19098930060863495, + "learning_rate": 1.7975362216158993e-05, + "loss": 0.0783, + "num_input_tokens_seen": 24233888, + "step": 114825 + }, + { + "epoch": 12.632563256325632, + "grad_norm": 0.06379356980323792, + "learning_rate": 1.797305887470992e-05, + "loss": 0.0032, + "num_input_tokens_seen": 24234944, + "step": 114830 + }, + { + "epoch": 12.633113311331133, + "grad_norm": 1.595929741859436, + "learning_rate": 1.7970755598024784e-05, + "loss": 0.0515, + "num_input_tokens_seen": 24236032, + "step": 114835 + }, + { + "epoch": 12.633663366336634, + "grad_norm": 0.03392508253455162, + "learning_rate": 1.7968452386124836e-05, + "loss": 0.009, + "num_input_tokens_seen": 24237152, + "step": 114840 + }, + { + "epoch": 12.634213421342134, + "grad_norm": 0.055688727647066116, + "learning_rate": 1.7966149239031283e-05, + "loss": 0.0076, + "num_input_tokens_seen": 24238208, + "step": 114845 + }, + { + "epoch": 12.634763476347635, + "grad_norm": 0.0030927197076380253, + "learning_rate": 1.7963846156765353e-05, + "loss": 0.1149, + "num_input_tokens_seen": 24239264, + "step": 114850 + }, + { + "epoch": 12.635313531353136, + "grad_norm": 0.008149903267621994, + "learning_rate": 1.796154313934828e-05, + "loss": 0.0022, + "num_input_tokens_seen": 24240320, + "step": 114855 + }, + { + "epoch": 12.635863586358635, + "grad_norm": 0.027008865028619766, + "learning_rate": 1.795924018680128e-05, + "loss": 0.0052, + "num_input_tokens_seen": 24241280, + "step": 114860 + }, + { + "epoch": 12.636413641364136, + "grad_norm": 0.05012377351522446, + "learning_rate": 1.7956937299145595e-05, + "loss": 0.0403, + "num_input_tokens_seen": 24242368, + "step": 114865 + }, + { + "epoch": 12.636963696369637, + "grad_norm": 1.0450176000595093, + "learning_rate": 1.7954634476402438e-05, + "loss": 0.01, + "num_input_tokens_seen": 24243456, + "step": 114870 + }, + { + "epoch": 12.637513751375138, + "grad_norm": 0.021556919440627098, + "learning_rate": 1.7952331718593024e-05, + "loss": 0.0353, + "num_input_tokens_seen": 24244544, + "step": 114875 + }, + { + "epoch": 12.638063806380638, + "grad_norm": 0.04935586825013161, + "learning_rate": 1.7950029025738602e-05, + "loss": 0.017, + "num_input_tokens_seen": 24245632, + "step": 114880 + }, + { + "epoch": 12.638613861386139, + "grad_norm": 0.09112007170915604, + "learning_rate": 1.794772639786037e-05, + "loss": 0.0035, + "num_input_tokens_seen": 24246624, + "step": 114885 + }, + { + "epoch": 12.63916391639164, + "grad_norm": 0.9510229229927063, + "learning_rate": 1.794542383497956e-05, + "loss": 0.0116, + "num_input_tokens_seen": 24247648, + "step": 114890 + }, + { + "epoch": 12.63971397139714, + "grad_norm": 0.058220285922288895, + "learning_rate": 1.7943121337117397e-05, + "loss": 0.0503, + "num_input_tokens_seen": 24248640, + "step": 114895 + }, + { + "epoch": 12.64026402640264, + "grad_norm": 3.074319362640381, + "learning_rate": 1.7940818904295094e-05, + "loss": 0.0229, + "num_input_tokens_seen": 24249760, + "step": 114900 + }, + { + "epoch": 12.640814081408141, + "grad_norm": 0.052687257528305054, + "learning_rate": 1.7938516536533876e-05, + "loss": 0.0071, + "num_input_tokens_seen": 24250784, + "step": 114905 + }, + { + "epoch": 12.64136413641364, + "grad_norm": 0.06480596959590912, + "learning_rate": 1.7936214233854966e-05, + "loss": 0.0034, + "num_input_tokens_seen": 24251840, + "step": 114910 + }, + { + "epoch": 12.641914191419142, + "grad_norm": 0.76939457654953, + "learning_rate": 1.7933911996279574e-05, + "loss": 0.0176, + "num_input_tokens_seen": 24252864, + "step": 114915 + }, + { + "epoch": 12.642464246424643, + "grad_norm": 0.0373072624206543, + "learning_rate": 1.793160982382893e-05, + "loss": 0.0164, + "num_input_tokens_seen": 24253984, + "step": 114920 + }, + { + "epoch": 12.643014301430142, + "grad_norm": 0.05678539350628853, + "learning_rate": 1.7929307716524246e-05, + "loss": 0.1025, + "num_input_tokens_seen": 24255072, + "step": 114925 + }, + { + "epoch": 12.643564356435643, + "grad_norm": 0.018675168976187706, + "learning_rate": 1.7927005674386738e-05, + "loss": 0.069, + "num_input_tokens_seen": 24256064, + "step": 114930 + }, + { + "epoch": 12.644114411441144, + "grad_norm": 0.10055375099182129, + "learning_rate": 1.7924703697437635e-05, + "loss": 0.0068, + "num_input_tokens_seen": 24257088, + "step": 114935 + }, + { + "epoch": 12.644664466446645, + "grad_norm": 1.5410531759262085, + "learning_rate": 1.792240178569813e-05, + "loss": 0.1696, + "num_input_tokens_seen": 24258208, + "step": 114940 + }, + { + "epoch": 12.645214521452145, + "grad_norm": 0.16259755194187164, + "learning_rate": 1.7920099939189453e-05, + "loss": 0.0923, + "num_input_tokens_seen": 24259232, + "step": 114945 + }, + { + "epoch": 12.645764576457646, + "grad_norm": 0.017971675843000412, + "learning_rate": 1.7917798157932814e-05, + "loss": 0.0354, + "num_input_tokens_seen": 24260288, + "step": 114950 + }, + { + "epoch": 12.646314631463147, + "grad_norm": 0.048663169145584106, + "learning_rate": 1.7915496441949446e-05, + "loss": 0.008, + "num_input_tokens_seen": 24261344, + "step": 114955 + }, + { + "epoch": 12.646864686468646, + "grad_norm": 3.1399128437042236, + "learning_rate": 1.791319479126054e-05, + "loss": 0.0392, + "num_input_tokens_seen": 24262336, + "step": 114960 + }, + { + "epoch": 12.647414741474147, + "grad_norm": 1.057359218597412, + "learning_rate": 1.7910893205887308e-05, + "loss": 0.0486, + "num_input_tokens_seen": 24263456, + "step": 114965 + }, + { + "epoch": 12.647964796479648, + "grad_norm": 0.2547925114631653, + "learning_rate": 1.7908591685850985e-05, + "loss": 0.0064, + "num_input_tokens_seen": 24264480, + "step": 114970 + }, + { + "epoch": 12.648514851485148, + "grad_norm": 0.14681513607501984, + "learning_rate": 1.7906290231172758e-05, + "loss": 0.0176, + "num_input_tokens_seen": 24265536, + "step": 114975 + }, + { + "epoch": 12.649064906490649, + "grad_norm": 0.36793017387390137, + "learning_rate": 1.7903988841873863e-05, + "loss": 0.0063, + "num_input_tokens_seen": 24266592, + "step": 114980 + }, + { + "epoch": 12.64961496149615, + "grad_norm": 0.02239694446325302, + "learning_rate": 1.790168751797549e-05, + "loss": 0.0365, + "num_input_tokens_seen": 24267584, + "step": 114985 + }, + { + "epoch": 12.65016501650165, + "grad_norm": 0.04044688120484352, + "learning_rate": 1.7899386259498852e-05, + "loss": 0.0023, + "num_input_tokens_seen": 24268640, + "step": 114990 + }, + { + "epoch": 12.65071507150715, + "grad_norm": 0.3672526776790619, + "learning_rate": 1.789708506646518e-05, + "loss": 0.0066, + "num_input_tokens_seen": 24269632, + "step": 114995 + }, + { + "epoch": 12.651265126512651, + "grad_norm": 3.6804428100585938, + "learning_rate": 1.789478393889565e-05, + "loss": 0.1238, + "num_input_tokens_seen": 24270688, + "step": 115000 + }, + { + "epoch": 12.651815181518153, + "grad_norm": 0.04888693243265152, + "learning_rate": 1.7892482876811494e-05, + "loss": 0.0084, + "num_input_tokens_seen": 24271776, + "step": 115005 + }, + { + "epoch": 12.652365236523652, + "grad_norm": 0.0816841647028923, + "learning_rate": 1.789018188023392e-05, + "loss": 0.0102, + "num_input_tokens_seen": 24272832, + "step": 115010 + }, + { + "epoch": 12.652915291529153, + "grad_norm": 0.046141691505908966, + "learning_rate": 1.788788094918411e-05, + "loss": 0.0023, + "num_input_tokens_seen": 24273888, + "step": 115015 + }, + { + "epoch": 12.653465346534654, + "grad_norm": 0.019196702167391777, + "learning_rate": 1.78855800836833e-05, + "loss": 0.014, + "num_input_tokens_seen": 24274976, + "step": 115020 + }, + { + "epoch": 12.654015401540153, + "grad_norm": 0.0055049206130206585, + "learning_rate": 1.788327928375268e-05, + "loss": 0.0034, + "num_input_tokens_seen": 24275968, + "step": 115025 + }, + { + "epoch": 12.654565456545654, + "grad_norm": 0.07626672089099884, + "learning_rate": 1.7880978549413464e-05, + "loss": 0.1155, + "num_input_tokens_seen": 24276960, + "step": 115030 + }, + { + "epoch": 12.655115511551156, + "grad_norm": 0.8961281776428223, + "learning_rate": 1.7878677880686863e-05, + "loss": 0.0178, + "num_input_tokens_seen": 24277952, + "step": 115035 + }, + { + "epoch": 12.655665566556655, + "grad_norm": 0.14811433851718903, + "learning_rate": 1.7876377277594053e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24279008, + "step": 115040 + }, + { + "epoch": 12.656215621562156, + "grad_norm": 0.04227769747376442, + "learning_rate": 1.787407674015626e-05, + "loss": 0.0678, + "num_input_tokens_seen": 24280000, + "step": 115045 + }, + { + "epoch": 12.656765676567657, + "grad_norm": 0.020474474877119064, + "learning_rate": 1.787177626839468e-05, + "loss": 0.007, + "num_input_tokens_seen": 24281024, + "step": 115050 + }, + { + "epoch": 12.657315731573158, + "grad_norm": 0.7676312923431396, + "learning_rate": 1.7869475862330532e-05, + "loss": 0.0073, + "num_input_tokens_seen": 24282080, + "step": 115055 + }, + { + "epoch": 12.657865786578657, + "grad_norm": 0.01866152510046959, + "learning_rate": 1.786717552198499e-05, + "loss": 0.0308, + "num_input_tokens_seen": 24283136, + "step": 115060 + }, + { + "epoch": 12.658415841584159, + "grad_norm": 0.012331638485193253, + "learning_rate": 1.786487524737927e-05, + "loss": 0.0128, + "num_input_tokens_seen": 24284160, + "step": 115065 + }, + { + "epoch": 12.65896589658966, + "grad_norm": 0.07119853049516678, + "learning_rate": 1.786257503853458e-05, + "loss": 0.0672, + "num_input_tokens_seen": 24285248, + "step": 115070 + }, + { + "epoch": 12.659515951595159, + "grad_norm": 1.081937551498413, + "learning_rate": 1.7860274895472103e-05, + "loss": 0.0841, + "num_input_tokens_seen": 24286272, + "step": 115075 + }, + { + "epoch": 12.66006600660066, + "grad_norm": 0.008345485664904118, + "learning_rate": 1.7857974818213044e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24287264, + "step": 115080 + }, + { + "epoch": 12.660616061606161, + "grad_norm": 0.051015354692935944, + "learning_rate": 1.7855674806778612e-05, + "loss": 0.0361, + "num_input_tokens_seen": 24288352, + "step": 115085 + }, + { + "epoch": 12.66116611661166, + "grad_norm": 0.07422881573438644, + "learning_rate": 1.7853374861189988e-05, + "loss": 0.0012, + "num_input_tokens_seen": 24289408, + "step": 115090 + }, + { + "epoch": 12.661716171617162, + "grad_norm": 0.668803870677948, + "learning_rate": 1.7851074981468384e-05, + "loss": 0.0133, + "num_input_tokens_seen": 24290400, + "step": 115095 + }, + { + "epoch": 12.662266226622663, + "grad_norm": 0.007168805226683617, + "learning_rate": 1.7848775167634992e-05, + "loss": 0.0079, + "num_input_tokens_seen": 24291520, + "step": 115100 + }, + { + "epoch": 12.662816281628164, + "grad_norm": 0.03936435282230377, + "learning_rate": 1.7846475419711003e-05, + "loss": 0.0015, + "num_input_tokens_seen": 24292640, + "step": 115105 + }, + { + "epoch": 12.663366336633663, + "grad_norm": 0.06927154213190079, + "learning_rate": 1.784417573771763e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24293696, + "step": 115110 + }, + { + "epoch": 12.663916391639164, + "grad_norm": 2.648000955581665, + "learning_rate": 1.784187612167604e-05, + "loss": 0.0408, + "num_input_tokens_seen": 24294816, + "step": 115115 + }, + { + "epoch": 12.664466446644665, + "grad_norm": 0.05172906070947647, + "learning_rate": 1.783957657160745e-05, + "loss": 0.0137, + "num_input_tokens_seen": 24295872, + "step": 115120 + }, + { + "epoch": 12.665016501650165, + "grad_norm": 0.08900728821754456, + "learning_rate": 1.7837277087533052e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24296896, + "step": 115125 + }, + { + "epoch": 12.665566556655666, + "grad_norm": 0.006581608671694994, + "learning_rate": 1.7834977669474023e-05, + "loss": 0.0007, + "num_input_tokens_seen": 24297920, + "step": 115130 + }, + { + "epoch": 12.666116611661167, + "grad_norm": 0.23128284513950348, + "learning_rate": 1.7832678317451573e-05, + "loss": 0.0022, + "num_input_tokens_seen": 24299008, + "step": 115135 + }, + { + "epoch": 12.666666666666666, + "grad_norm": 0.027336524799466133, + "learning_rate": 1.783037903148688e-05, + "loss": 0.0011, + "num_input_tokens_seen": 24300032, + "step": 115140 + }, + { + "epoch": 12.667216721672167, + "grad_norm": 1.6083283424377441, + "learning_rate": 1.7828079811601155e-05, + "loss": 0.204, + "num_input_tokens_seen": 24301088, + "step": 115145 + }, + { + "epoch": 12.667766776677668, + "grad_norm": 0.17816615104675293, + "learning_rate": 1.782578065781558e-05, + "loss": 0.003, + "num_input_tokens_seen": 24302304, + "step": 115150 + }, + { + "epoch": 12.668316831683168, + "grad_norm": 0.01746392250061035, + "learning_rate": 1.782348157015133e-05, + "loss": 0.015, + "num_input_tokens_seen": 24303360, + "step": 115155 + }, + { + "epoch": 12.668866886688669, + "grad_norm": 0.3815113306045532, + "learning_rate": 1.782118254862961e-05, + "loss": 0.0184, + "num_input_tokens_seen": 24304384, + "step": 115160 + }, + { + "epoch": 12.66941694169417, + "grad_norm": 0.008754939772188663, + "learning_rate": 1.78188835932716e-05, + "loss": 0.0074, + "num_input_tokens_seen": 24305408, + "step": 115165 + }, + { + "epoch": 12.66996699669967, + "grad_norm": 0.020035527646541595, + "learning_rate": 1.7816584704098507e-05, + "loss": 0.0045, + "num_input_tokens_seen": 24306432, + "step": 115170 + }, + { + "epoch": 12.67051705170517, + "grad_norm": 0.040532950311899185, + "learning_rate": 1.78142858811315e-05, + "loss": 0.0655, + "num_input_tokens_seen": 24307424, + "step": 115175 + }, + { + "epoch": 12.671067106710671, + "grad_norm": 0.4895229637622833, + "learning_rate": 1.7811987124391768e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24308448, + "step": 115180 + }, + { + "epoch": 12.671617161716172, + "grad_norm": 0.03915462642908096, + "learning_rate": 1.7809688433900512e-05, + "loss": 0.0993, + "num_input_tokens_seen": 24309440, + "step": 115185 + }, + { + "epoch": 12.672167216721672, + "grad_norm": 2.340120315551758, + "learning_rate": 1.7807389809678894e-05, + "loss": 0.1025, + "num_input_tokens_seen": 24310528, + "step": 115190 + }, + { + "epoch": 12.672717271727173, + "grad_norm": 0.018190566450357437, + "learning_rate": 1.780509125174812e-05, + "loss": 0.0427, + "num_input_tokens_seen": 24311584, + "step": 115195 + }, + { + "epoch": 12.673267326732674, + "grad_norm": 0.3146221935749054, + "learning_rate": 1.7802792760129372e-05, + "loss": 0.0054, + "num_input_tokens_seen": 24312640, + "step": 115200 + }, + { + "epoch": 12.673817381738173, + "grad_norm": 0.027897298336029053, + "learning_rate": 1.7800494334843813e-05, + "loss": 0.0068, + "num_input_tokens_seen": 24313696, + "step": 115205 + }, + { + "epoch": 12.674367436743674, + "grad_norm": 0.008179303258657455, + "learning_rate": 1.7798195975912664e-05, + "loss": 0.0046, + "num_input_tokens_seen": 24314816, + "step": 115210 + }, + { + "epoch": 12.674917491749175, + "grad_norm": 0.01618390530347824, + "learning_rate": 1.779589768335707e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24315840, + "step": 115215 + }, + { + "epoch": 12.675467546754675, + "grad_norm": 0.09670811146497726, + "learning_rate": 1.779359945719824e-05, + "loss": 0.0122, + "num_input_tokens_seen": 24316864, + "step": 115220 + }, + { + "epoch": 12.676017601760176, + "grad_norm": 0.007036436814814806, + "learning_rate": 1.7791301297457354e-05, + "loss": 0.0717, + "num_input_tokens_seen": 24317920, + "step": 115225 + }, + { + "epoch": 12.676567656765677, + "grad_norm": 0.045359790325164795, + "learning_rate": 1.7789003204155567e-05, + "loss": 0.0504, + "num_input_tokens_seen": 24318944, + "step": 115230 + }, + { + "epoch": 12.677117711771178, + "grad_norm": 0.03150225430727005, + "learning_rate": 1.7786705177314083e-05, + "loss": 0.0347, + "num_input_tokens_seen": 24320032, + "step": 115235 + }, + { + "epoch": 12.677667766776677, + "grad_norm": 0.04945005476474762, + "learning_rate": 1.7784407216954074e-05, + "loss": 0.0529, + "num_input_tokens_seen": 24321056, + "step": 115240 + }, + { + "epoch": 12.678217821782178, + "grad_norm": 0.02111821435391903, + "learning_rate": 1.7782109323096728e-05, + "loss": 0.0017, + "num_input_tokens_seen": 24322016, + "step": 115245 + }, + { + "epoch": 12.67876787678768, + "grad_norm": 0.07702270150184631, + "learning_rate": 1.777981149576322e-05, + "loss": 0.0031, + "num_input_tokens_seen": 24323072, + "step": 115250 + }, + { + "epoch": 12.679317931793179, + "grad_norm": 0.032782237976789474, + "learning_rate": 1.7777513734974713e-05, + "loss": 0.0218, + "num_input_tokens_seen": 24324096, + "step": 115255 + }, + { + "epoch": 12.67986798679868, + "grad_norm": 0.04262614995241165, + "learning_rate": 1.7775216040752408e-05, + "loss": 0.0021, + "num_input_tokens_seen": 24325184, + "step": 115260 + }, + { + "epoch": 12.680418041804181, + "grad_norm": 0.6638199090957642, + "learning_rate": 1.7772918413117463e-05, + "loss": 0.0178, + "num_input_tokens_seen": 24326272, + "step": 115265 + }, + { + "epoch": 12.68096809680968, + "grad_norm": 0.1460024118423462, + "learning_rate": 1.777062085209106e-05, + "loss": 0.0136, + "num_input_tokens_seen": 24327328, + "step": 115270 + }, + { + "epoch": 12.681518151815181, + "grad_norm": 0.04342317953705788, + "learning_rate": 1.7768323357694377e-05, + "loss": 0.0098, + "num_input_tokens_seen": 24328288, + "step": 115275 + }, + { + "epoch": 12.682068206820682, + "grad_norm": 3.583709955215454, + "learning_rate": 1.7766025929948583e-05, + "loss": 0.0145, + "num_input_tokens_seen": 24329344, + "step": 115280 + }, + { + "epoch": 12.682618261826182, + "grad_norm": 0.009278065524995327, + "learning_rate": 1.776372856887487e-05, + "loss": 0.003, + "num_input_tokens_seen": 24330368, + "step": 115285 + }, + { + "epoch": 12.683168316831683, + "grad_norm": 3.360741376876831, + "learning_rate": 1.776143127449439e-05, + "loss": 0.0914, + "num_input_tokens_seen": 24331424, + "step": 115290 + }, + { + "epoch": 12.683718371837184, + "grad_norm": 0.015860509127378464, + "learning_rate": 1.775913404682832e-05, + "loss": 0.0401, + "num_input_tokens_seen": 24332480, + "step": 115295 + }, + { + "epoch": 12.684268426842685, + "grad_norm": 0.020842041820287704, + "learning_rate": 1.775683688589785e-05, + "loss": 0.064, + "num_input_tokens_seen": 24333536, + "step": 115300 + }, + { + "epoch": 12.684818481848184, + "grad_norm": 0.21093927323818207, + "learning_rate": 1.7754539791724123e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24334624, + "step": 115305 + }, + { + "epoch": 12.685368536853685, + "grad_norm": 0.00752227334305644, + "learning_rate": 1.7752242764328336e-05, + "loss": 0.0015, + "num_input_tokens_seen": 24335616, + "step": 115310 + }, + { + "epoch": 12.685918591859187, + "grad_norm": 0.05249205604195595, + "learning_rate": 1.7749945803731655e-05, + "loss": 0.0029, + "num_input_tokens_seen": 24336672, + "step": 115315 + }, + { + "epoch": 12.686468646864686, + "grad_norm": 0.039849258959293365, + "learning_rate": 1.774764890995523e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24337792, + "step": 115320 + }, + { + "epoch": 12.687018701870187, + "grad_norm": 0.011294020339846611, + "learning_rate": 1.7745352083020254e-05, + "loss": 0.0015, + "num_input_tokens_seen": 24338976, + "step": 115325 + }, + { + "epoch": 12.687568756875688, + "grad_norm": 0.018224775791168213, + "learning_rate": 1.774305532294788e-05, + "loss": 0.001, + "num_input_tokens_seen": 24340000, + "step": 115330 + }, + { + "epoch": 12.688118811881187, + "grad_norm": 0.2302846759557724, + "learning_rate": 1.7740758629759286e-05, + "loss": 0.0055, + "num_input_tokens_seen": 24340992, + "step": 115335 + }, + { + "epoch": 12.688668866886688, + "grad_norm": 0.016749758273363113, + "learning_rate": 1.7738462003475646e-05, + "loss": 0.0026, + "num_input_tokens_seen": 24342080, + "step": 115340 + }, + { + "epoch": 12.68921892189219, + "grad_norm": 0.10445710271596909, + "learning_rate": 1.7736165444118103e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24343104, + "step": 115345 + }, + { + "epoch": 12.689768976897689, + "grad_norm": 0.003966368269175291, + "learning_rate": 1.7733868951707845e-05, + "loss": 0.0009, + "num_input_tokens_seen": 24344160, + "step": 115350 + }, + { + "epoch": 12.69031903190319, + "grad_norm": 0.5334576964378357, + "learning_rate": 1.7731572526266027e-05, + "loss": 0.0046, + "num_input_tokens_seen": 24345248, + "step": 115355 + }, + { + "epoch": 12.690869086908691, + "grad_norm": 0.02216925658285618, + "learning_rate": 1.7729276167813823e-05, + "loss": 0.0029, + "num_input_tokens_seen": 24346304, + "step": 115360 + }, + { + "epoch": 12.691419141914192, + "grad_norm": 0.15720845758914948, + "learning_rate": 1.772697987637239e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24347296, + "step": 115365 + }, + { + "epoch": 12.691969196919691, + "grad_norm": 0.06507238745689392, + "learning_rate": 1.7724683651962886e-05, + "loss": 0.0156, + "num_input_tokens_seen": 24348320, + "step": 115370 + }, + { + "epoch": 12.692519251925193, + "grad_norm": 0.08669717609882355, + "learning_rate": 1.7722387494606495e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24349408, + "step": 115375 + }, + { + "epoch": 12.693069306930694, + "grad_norm": 0.04132651537656784, + "learning_rate": 1.7720091404324352e-05, + "loss": 0.1277, + "num_input_tokens_seen": 24350496, + "step": 115380 + }, + { + "epoch": 12.693619361936193, + "grad_norm": 1.8657678365707397, + "learning_rate": 1.771779538113764e-05, + "loss": 0.1119, + "num_input_tokens_seen": 24351488, + "step": 115385 + }, + { + "epoch": 12.694169416941694, + "grad_norm": 0.5269753932952881, + "learning_rate": 1.7715499425067517e-05, + "loss": 0.0144, + "num_input_tokens_seen": 24352576, + "step": 115390 + }, + { + "epoch": 12.694719471947195, + "grad_norm": 0.0435778833925724, + "learning_rate": 1.771320353613513e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24353600, + "step": 115395 + }, + { + "epoch": 12.695269526952695, + "grad_norm": 0.021144239231944084, + "learning_rate": 1.7710907714361664e-05, + "loss": 0.0215, + "num_input_tokens_seen": 24354624, + "step": 115400 + }, + { + "epoch": 12.695819581958196, + "grad_norm": 0.005404651630669832, + "learning_rate": 1.770861195976825e-05, + "loss": 0.016, + "num_input_tokens_seen": 24355680, + "step": 115405 + }, + { + "epoch": 12.696369636963697, + "grad_norm": 2.9794046878814697, + "learning_rate": 1.770631627237607e-05, + "loss": 0.0953, + "num_input_tokens_seen": 24356704, + "step": 115410 + }, + { + "epoch": 12.696919691969196, + "grad_norm": 0.013483925722539425, + "learning_rate": 1.7704020652206276e-05, + "loss": 0.1028, + "num_input_tokens_seen": 24357760, + "step": 115415 + }, + { + "epoch": 12.697469746974697, + "grad_norm": 0.007286310661584139, + "learning_rate": 1.7701725099280015e-05, + "loss": 0.0209, + "num_input_tokens_seen": 24358720, + "step": 115420 + }, + { + "epoch": 12.698019801980198, + "grad_norm": 0.009459658525884151, + "learning_rate": 1.769942961361845e-05, + "loss": 0.0014, + "num_input_tokens_seen": 24359808, + "step": 115425 + }, + { + "epoch": 12.6985698569857, + "grad_norm": 0.021971968933939934, + "learning_rate": 1.769713419524274e-05, + "loss": 0.0045, + "num_input_tokens_seen": 24360960, + "step": 115430 + }, + { + "epoch": 12.699119911991199, + "grad_norm": 0.011071779765188694, + "learning_rate": 1.769483884417405e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24361984, + "step": 115435 + }, + { + "epoch": 12.6996699669967, + "grad_norm": 0.015268893912434578, + "learning_rate": 1.7692543560433517e-05, + "loss": 0.0617, + "num_input_tokens_seen": 24363008, + "step": 115440 + }, + { + "epoch": 12.7002200220022, + "grad_norm": 0.4780924618244171, + "learning_rate": 1.76902483440423e-05, + "loss": 0.0061, + "num_input_tokens_seen": 24364000, + "step": 115445 + }, + { + "epoch": 12.7007700770077, + "grad_norm": 0.24448257684707642, + "learning_rate": 1.7687953195021563e-05, + "loss": 0.0075, + "num_input_tokens_seen": 24365056, + "step": 115450 + }, + { + "epoch": 12.701320132013201, + "grad_norm": 0.02274664305150509, + "learning_rate": 1.7685658113392457e-05, + "loss": 0.0069, + "num_input_tokens_seen": 24366112, + "step": 115455 + }, + { + "epoch": 12.701870187018702, + "grad_norm": 0.017184490337967873, + "learning_rate": 1.7683363099176116e-05, + "loss": 0.0481, + "num_input_tokens_seen": 24367264, + "step": 115460 + }, + { + "epoch": 12.702420242024202, + "grad_norm": 0.21518535912036896, + "learning_rate": 1.7681068152393714e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24368384, + "step": 115465 + }, + { + "epoch": 12.702970297029703, + "grad_norm": 0.14274905622005463, + "learning_rate": 1.7678773273066394e-05, + "loss": 0.0385, + "num_input_tokens_seen": 24369472, + "step": 115470 + }, + { + "epoch": 12.703520352035204, + "grad_norm": 0.2052963525056839, + "learning_rate": 1.7676478461215314e-05, + "loss": 0.0117, + "num_input_tokens_seen": 24370528, + "step": 115475 + }, + { + "epoch": 12.704070407040705, + "grad_norm": 2.056103467941284, + "learning_rate": 1.7674183716861613e-05, + "loss": 0.1001, + "num_input_tokens_seen": 24371552, + "step": 115480 + }, + { + "epoch": 12.704620462046204, + "grad_norm": 0.01183338463306427, + "learning_rate": 1.767188904002644e-05, + "loss": 0.025, + "num_input_tokens_seen": 24372608, + "step": 115485 + }, + { + "epoch": 12.705170517051705, + "grad_norm": 0.0032701489981263876, + "learning_rate": 1.7669594430730963e-05, + "loss": 0.0721, + "num_input_tokens_seen": 24373632, + "step": 115490 + }, + { + "epoch": 12.705720572057206, + "grad_norm": 0.032201871275901794, + "learning_rate": 1.76672998889963e-05, + "loss": 0.0761, + "num_input_tokens_seen": 24374624, + "step": 115495 + }, + { + "epoch": 12.706270627062706, + "grad_norm": 0.14651663601398468, + "learning_rate": 1.7665005414843627e-05, + "loss": 0.1249, + "num_input_tokens_seen": 24375648, + "step": 115500 + }, + { + "epoch": 12.706820682068207, + "grad_norm": 0.001519666169770062, + "learning_rate": 1.7662711008294082e-05, + "loss": 0.0428, + "num_input_tokens_seen": 24376704, + "step": 115505 + }, + { + "epoch": 12.707370737073708, + "grad_norm": 0.03680454567074776, + "learning_rate": 1.766041666936879e-05, + "loss": 0.009, + "num_input_tokens_seen": 24377856, + "step": 115510 + }, + { + "epoch": 12.707920792079207, + "grad_norm": 0.15202708542346954, + "learning_rate": 1.7658122398088937e-05, + "loss": 0.0035, + "num_input_tokens_seen": 24378880, + "step": 115515 + }, + { + "epoch": 12.708470847084708, + "grad_norm": 0.271280974149704, + "learning_rate": 1.7655828194475632e-05, + "loss": 0.0246, + "num_input_tokens_seen": 24379872, + "step": 115520 + }, + { + "epoch": 12.70902090209021, + "grad_norm": 0.08957716077566147, + "learning_rate": 1.765353405855004e-05, + "loss": 0.0089, + "num_input_tokens_seen": 24380928, + "step": 115525 + }, + { + "epoch": 12.70957095709571, + "grad_norm": 0.036652594804763794, + "learning_rate": 1.765123999033331e-05, + "loss": 0.0463, + "num_input_tokens_seen": 24381984, + "step": 115530 + }, + { + "epoch": 12.71012101210121, + "grad_norm": 0.0029474825132638216, + "learning_rate": 1.764894598984656e-05, + "loss": 0.139, + "num_input_tokens_seen": 24383008, + "step": 115535 + }, + { + "epoch": 12.710671067106711, + "grad_norm": 0.008611074648797512, + "learning_rate": 1.7646652057110953e-05, + "loss": 0.0026, + "num_input_tokens_seen": 24384032, + "step": 115540 + }, + { + "epoch": 12.711221122112212, + "grad_norm": 0.017796741798520088, + "learning_rate": 1.764435819214762e-05, + "loss": 0.0012, + "num_input_tokens_seen": 24385088, + "step": 115545 + }, + { + "epoch": 12.711771177117711, + "grad_norm": 0.08502573519945145, + "learning_rate": 1.764206439497772e-05, + "loss": 0.0732, + "num_input_tokens_seen": 24386112, + "step": 115550 + }, + { + "epoch": 12.712321232123212, + "grad_norm": 0.05314083397388458, + "learning_rate": 1.7639770665622374e-05, + "loss": 0.0425, + "num_input_tokens_seen": 24387168, + "step": 115555 + }, + { + "epoch": 12.712871287128714, + "grad_norm": 0.04660731181502342, + "learning_rate": 1.7637477004102725e-05, + "loss": 0.001, + "num_input_tokens_seen": 24388192, + "step": 115560 + }, + { + "epoch": 12.713421342134213, + "grad_norm": 1.0095208883285522, + "learning_rate": 1.7635183410439932e-05, + "loss": 0.1008, + "num_input_tokens_seen": 24389248, + "step": 115565 + }, + { + "epoch": 12.713971397139714, + "grad_norm": 0.012950685806572437, + "learning_rate": 1.76328898846551e-05, + "loss": 0.0327, + "num_input_tokens_seen": 24390368, + "step": 115570 + }, + { + "epoch": 12.714521452145215, + "grad_norm": 0.002886029426008463, + "learning_rate": 1.76305964267694e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24391424, + "step": 115575 + }, + { + "epoch": 12.715071507150714, + "grad_norm": 1.8851908445358276, + "learning_rate": 1.7628303036803954e-05, + "loss": 0.0421, + "num_input_tokens_seen": 24392512, + "step": 115580 + }, + { + "epoch": 12.715621562156215, + "grad_norm": 0.09882369637489319, + "learning_rate": 1.76260097147799e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24393632, + "step": 115585 + }, + { + "epoch": 12.716171617161717, + "grad_norm": 0.4158982038497925, + "learning_rate": 1.7623716460718383e-05, + "loss": 0.0102, + "num_input_tokens_seen": 24394656, + "step": 115590 + }, + { + "epoch": 12.716721672167218, + "grad_norm": 0.4395512640476227, + "learning_rate": 1.7621423274640526e-05, + "loss": 0.0088, + "num_input_tokens_seen": 24395712, + "step": 115595 + }, + { + "epoch": 12.717271727172717, + "grad_norm": 0.3676702380180359, + "learning_rate": 1.761913015656747e-05, + "loss": 0.0059, + "num_input_tokens_seen": 24396800, + "step": 115600 + }, + { + "epoch": 12.717821782178218, + "grad_norm": 0.042546506971120834, + "learning_rate": 1.7616837106520358e-05, + "loss": 0.0849, + "num_input_tokens_seen": 24397888, + "step": 115605 + }, + { + "epoch": 12.718371837183719, + "grad_norm": 0.04481887072324753, + "learning_rate": 1.76145441245203e-05, + "loss": 0.0086, + "num_input_tokens_seen": 24398944, + "step": 115610 + }, + { + "epoch": 12.718921892189218, + "grad_norm": 0.007431684527546167, + "learning_rate": 1.7612251210588454e-05, + "loss": 0.0118, + "num_input_tokens_seen": 24400032, + "step": 115615 + }, + { + "epoch": 12.71947194719472, + "grad_norm": 0.13411200046539307, + "learning_rate": 1.7609958364745937e-05, + "loss": 0.0075, + "num_input_tokens_seen": 24401120, + "step": 115620 + }, + { + "epoch": 12.72002200220022, + "grad_norm": 0.10298454761505127, + "learning_rate": 1.76076655870139e-05, + "loss": 0.0021, + "num_input_tokens_seen": 24402208, + "step": 115625 + }, + { + "epoch": 12.72057205720572, + "grad_norm": 0.04375193640589714, + "learning_rate": 1.7605372877413457e-05, + "loss": 0.0063, + "num_input_tokens_seen": 24403264, + "step": 115630 + }, + { + "epoch": 12.721122112211221, + "grad_norm": 0.02101701684296131, + "learning_rate": 1.7603080235965737e-05, + "loss": 0.0106, + "num_input_tokens_seen": 24404288, + "step": 115635 + }, + { + "epoch": 12.721672167216722, + "grad_norm": 0.004810609854757786, + "learning_rate": 1.7600787662691883e-05, + "loss": 0.0009, + "num_input_tokens_seen": 24405376, + "step": 115640 + }, + { + "epoch": 12.722222222222221, + "grad_norm": 0.03783124312758446, + "learning_rate": 1.7598495157613025e-05, + "loss": 0.0419, + "num_input_tokens_seen": 24406400, + "step": 115645 + }, + { + "epoch": 12.722772277227723, + "grad_norm": 0.014668487943708897, + "learning_rate": 1.759620272075027e-05, + "loss": 0.0065, + "num_input_tokens_seen": 24407456, + "step": 115650 + }, + { + "epoch": 12.723322332233224, + "grad_norm": 0.04528952017426491, + "learning_rate": 1.7593910352124778e-05, + "loss": 0.002, + "num_input_tokens_seen": 24408512, + "step": 115655 + }, + { + "epoch": 12.723872387238725, + "grad_norm": 0.10069569945335388, + "learning_rate": 1.7591618051757645e-05, + "loss": 0.0083, + "num_input_tokens_seen": 24409536, + "step": 115660 + }, + { + "epoch": 12.724422442244224, + "grad_norm": 0.07119341939687729, + "learning_rate": 1.758932581967003e-05, + "loss": 0.0563, + "num_input_tokens_seen": 24410592, + "step": 115665 + }, + { + "epoch": 12.724972497249725, + "grad_norm": 0.0244772769510746, + "learning_rate": 1.7587033655883032e-05, + "loss": 0.0017, + "num_input_tokens_seen": 24411584, + "step": 115670 + }, + { + "epoch": 12.725522552255226, + "grad_norm": 0.46705174446105957, + "learning_rate": 1.7584741560417784e-05, + "loss": 0.0076, + "num_input_tokens_seen": 24412672, + "step": 115675 + }, + { + "epoch": 12.726072607260726, + "grad_norm": 1.2750862836837769, + "learning_rate": 1.7582449533295427e-05, + "loss": 0.0807, + "num_input_tokens_seen": 24413760, + "step": 115680 + }, + { + "epoch": 12.726622662266227, + "grad_norm": 0.06652535498142242, + "learning_rate": 1.7580157574537065e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24414816, + "step": 115685 + }, + { + "epoch": 12.727172717271728, + "grad_norm": 0.0037314945366233587, + "learning_rate": 1.757786568416383e-05, + "loss": 0.0169, + "num_input_tokens_seen": 24415904, + "step": 115690 + }, + { + "epoch": 12.727722772277227, + "grad_norm": 0.030223960056900978, + "learning_rate": 1.757557386219685e-05, + "loss": 0.0273, + "num_input_tokens_seen": 24416896, + "step": 115695 + }, + { + "epoch": 12.728272827282728, + "grad_norm": 0.04676602780818939, + "learning_rate": 1.7573282108657235e-05, + "loss": 0.0954, + "num_input_tokens_seen": 24417984, + "step": 115700 + }, + { + "epoch": 12.72882288228823, + "grad_norm": 0.0035190528724342585, + "learning_rate": 1.757099042356613e-05, + "loss": 0.0181, + "num_input_tokens_seen": 24419040, + "step": 115705 + }, + { + "epoch": 12.729372937293729, + "grad_norm": 0.03670646250247955, + "learning_rate": 1.7568698806944627e-05, + "loss": 0.0006, + "num_input_tokens_seen": 24420096, + "step": 115710 + }, + { + "epoch": 12.72992299229923, + "grad_norm": 2.2616395950317383, + "learning_rate": 1.7566407258813866e-05, + "loss": 0.1029, + "num_input_tokens_seen": 24421184, + "step": 115715 + }, + { + "epoch": 12.73047304730473, + "grad_norm": 0.037715550512075424, + "learning_rate": 1.7564115779194974e-05, + "loss": 0.0039, + "num_input_tokens_seen": 24422240, + "step": 115720 + }, + { + "epoch": 12.731023102310232, + "grad_norm": 0.2313435673713684, + "learning_rate": 1.756182436810904e-05, + "loss": 0.0062, + "num_input_tokens_seen": 24423264, + "step": 115725 + }, + { + "epoch": 12.731573157315731, + "grad_norm": 0.004091896116733551, + "learning_rate": 1.7559533025577214e-05, + "loss": 0.0445, + "num_input_tokens_seen": 24424384, + "step": 115730 + }, + { + "epoch": 12.732123212321232, + "grad_norm": 1.1939040422439575, + "learning_rate": 1.755724175162059e-05, + "loss": 0.078, + "num_input_tokens_seen": 24425536, + "step": 115735 + }, + { + "epoch": 12.732673267326733, + "grad_norm": 0.0730220228433609, + "learning_rate": 1.7554950546260313e-05, + "loss": 0.0259, + "num_input_tokens_seen": 24426592, + "step": 115740 + }, + { + "epoch": 12.733223322332233, + "grad_norm": 0.08482849597930908, + "learning_rate": 1.7552659409517475e-05, + "loss": 0.1059, + "num_input_tokens_seen": 24427584, + "step": 115745 + }, + { + "epoch": 12.733773377337734, + "grad_norm": 0.01961476169526577, + "learning_rate": 1.7550368341413197e-05, + "loss": 0.0523, + "num_input_tokens_seen": 24428672, + "step": 115750 + }, + { + "epoch": 12.734323432343235, + "grad_norm": 0.021487699821591377, + "learning_rate": 1.7548077341968606e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24429728, + "step": 115755 + }, + { + "epoch": 12.734873487348734, + "grad_norm": 0.16336914896965027, + "learning_rate": 1.7545786411204808e-05, + "loss": 0.0107, + "num_input_tokens_seen": 24430784, + "step": 115760 + }, + { + "epoch": 12.735423542354235, + "grad_norm": 0.19328373670578003, + "learning_rate": 1.7543495549142924e-05, + "loss": 0.0049, + "num_input_tokens_seen": 24431904, + "step": 115765 + }, + { + "epoch": 12.735973597359736, + "grad_norm": 0.08218204975128174, + "learning_rate": 1.7541204755804065e-05, + "loss": 0.0042, + "num_input_tokens_seen": 24432960, + "step": 115770 + }, + { + "epoch": 12.736523652365236, + "grad_norm": 0.7676203846931458, + "learning_rate": 1.753891403120933e-05, + "loss": 0.0186, + "num_input_tokens_seen": 24433920, + "step": 115775 + }, + { + "epoch": 12.737073707370737, + "grad_norm": 0.035173822194337845, + "learning_rate": 1.753662337537986e-05, + "loss": 0.0492, + "num_input_tokens_seen": 24435008, + "step": 115780 + }, + { + "epoch": 12.737623762376238, + "grad_norm": 0.05555211752653122, + "learning_rate": 1.753433278833674e-05, + "loss": 0.092, + "num_input_tokens_seen": 24436000, + "step": 115785 + }, + { + "epoch": 12.738173817381739, + "grad_norm": 1.2593731880187988, + "learning_rate": 1.7532042270101096e-05, + "loss": 0.0624, + "num_input_tokens_seen": 24437056, + "step": 115790 + }, + { + "epoch": 12.738723872387238, + "grad_norm": 0.031173406168818474, + "learning_rate": 1.752975182069404e-05, + "loss": 0.0146, + "num_input_tokens_seen": 24438112, + "step": 115795 + }, + { + "epoch": 12.73927392739274, + "grad_norm": 0.014264233410358429, + "learning_rate": 1.7527461440136667e-05, + "loss": 0.0255, + "num_input_tokens_seen": 24439168, + "step": 115800 + }, + { + "epoch": 12.73982398239824, + "grad_norm": 0.012016265653073788, + "learning_rate": 1.75251711284501e-05, + "loss": 0.0204, + "num_input_tokens_seen": 24440256, + "step": 115805 + }, + { + "epoch": 12.74037403740374, + "grad_norm": 0.004279498942196369, + "learning_rate": 1.7522880885655444e-05, + "loss": 0.0005, + "num_input_tokens_seen": 24441312, + "step": 115810 + }, + { + "epoch": 12.74092409240924, + "grad_norm": 0.19118480384349823, + "learning_rate": 1.7520590711773806e-05, + "loss": 0.0046, + "num_input_tokens_seen": 24442336, + "step": 115815 + }, + { + "epoch": 12.741474147414742, + "grad_norm": 0.019573604688048363, + "learning_rate": 1.7518300606826304e-05, + "loss": 0.0236, + "num_input_tokens_seen": 24443360, + "step": 115820 + }, + { + "epoch": 12.742024202420241, + "grad_norm": 0.004965207539498806, + "learning_rate": 1.7516010570834023e-05, + "loss": 0.0738, + "num_input_tokens_seen": 24444416, + "step": 115825 + }, + { + "epoch": 12.742574257425742, + "grad_norm": 0.0034887732472270727, + "learning_rate": 1.7513720603818088e-05, + "loss": 0.0094, + "num_input_tokens_seen": 24445472, + "step": 115830 + }, + { + "epoch": 12.743124312431243, + "grad_norm": 0.7614947557449341, + "learning_rate": 1.7511430705799604e-05, + "loss": 0.0125, + "num_input_tokens_seen": 24446528, + "step": 115835 + }, + { + "epoch": 12.743674367436743, + "grad_norm": 0.028759844601154327, + "learning_rate": 1.7509140876799654e-05, + "loss": 0.0496, + "num_input_tokens_seen": 24447584, + "step": 115840 + }, + { + "epoch": 12.744224422442244, + "grad_norm": 1.779366135597229, + "learning_rate": 1.750685111683937e-05, + "loss": 0.0613, + "num_input_tokens_seen": 24448608, + "step": 115845 + }, + { + "epoch": 12.744774477447745, + "grad_norm": 0.30906593799591064, + "learning_rate": 1.750456142593983e-05, + "loss": 0.0058, + "num_input_tokens_seen": 24449632, + "step": 115850 + }, + { + "epoch": 12.745324532453246, + "grad_norm": 0.15666741132736206, + "learning_rate": 1.7502271804122167e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24450656, + "step": 115855 + }, + { + "epoch": 12.745874587458745, + "grad_norm": 0.10382714122533798, + "learning_rate": 1.7499982251407462e-05, + "loss": 0.006, + "num_input_tokens_seen": 24451744, + "step": 115860 + }, + { + "epoch": 12.746424642464246, + "grad_norm": 0.14829933643341064, + "learning_rate": 1.7497692767816815e-05, + "loss": 0.0081, + "num_input_tokens_seen": 24452832, + "step": 115865 + }, + { + "epoch": 12.746974697469748, + "grad_norm": 0.07622353732585907, + "learning_rate": 1.7495403353371348e-05, + "loss": 0.0026, + "num_input_tokens_seen": 24453920, + "step": 115870 + }, + { + "epoch": 12.747524752475247, + "grad_norm": 0.08490131795406342, + "learning_rate": 1.7493114008092128e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24455008, + "step": 115875 + }, + { + "epoch": 12.748074807480748, + "grad_norm": 0.018879111856222153, + "learning_rate": 1.7490824732000287e-05, + "loss": 0.0147, + "num_input_tokens_seen": 24456064, + "step": 115880 + }, + { + "epoch": 12.748624862486249, + "grad_norm": 0.13998501002788544, + "learning_rate": 1.7488535525116913e-05, + "loss": 0.0345, + "num_input_tokens_seen": 24457056, + "step": 115885 + }, + { + "epoch": 12.749174917491748, + "grad_norm": 0.11986677348613739, + "learning_rate": 1.748624638746309e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24458144, + "step": 115890 + }, + { + "epoch": 12.74972497249725, + "grad_norm": 0.00436215428635478, + "learning_rate": 1.7483957319059943e-05, + "loss": 0.042, + "num_input_tokens_seen": 24459168, + "step": 115895 + }, + { + "epoch": 12.75027502750275, + "grad_norm": 0.0038041137158870697, + "learning_rate": 1.7481668319928547e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24460224, + "step": 115900 + }, + { + "epoch": 12.750825082508252, + "grad_norm": 0.07194703072309494, + "learning_rate": 1.7479379390090006e-05, + "loss": 0.0247, + "num_input_tokens_seen": 24461280, + "step": 115905 + }, + { + "epoch": 12.751375137513751, + "grad_norm": 0.001997235929593444, + "learning_rate": 1.7477090529565427e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24462400, + "step": 115910 + }, + { + "epoch": 12.751925192519252, + "grad_norm": 0.37486323714256287, + "learning_rate": 1.747480173837588e-05, + "loss": 0.005, + "num_input_tokens_seen": 24463456, + "step": 115915 + }, + { + "epoch": 12.752475247524753, + "grad_norm": 0.029187753796577454, + "learning_rate": 1.747251301654248e-05, + "loss": 0.0899, + "num_input_tokens_seen": 24464512, + "step": 115920 + }, + { + "epoch": 12.753025302530252, + "grad_norm": 0.0696701779961586, + "learning_rate": 1.7470224364086314e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24465504, + "step": 115925 + }, + { + "epoch": 12.753575357535754, + "grad_norm": 0.027943240478634834, + "learning_rate": 1.7467935781028483e-05, + "loss": 0.0013, + "num_input_tokens_seen": 24466656, + "step": 115930 + }, + { + "epoch": 12.754125412541255, + "grad_norm": 0.01079727802425623, + "learning_rate": 1.746564726739007e-05, + "loss": 0.0547, + "num_input_tokens_seen": 24467680, + "step": 115935 + }, + { + "epoch": 12.754675467546754, + "grad_norm": 1.3606599569320679, + "learning_rate": 1.7463358823192166e-05, + "loss": 0.0334, + "num_input_tokens_seen": 24468832, + "step": 115940 + }, + { + "epoch": 12.755225522552255, + "grad_norm": 0.011600272729992867, + "learning_rate": 1.7461070448455874e-05, + "loss": 0.004, + "num_input_tokens_seen": 24469856, + "step": 115945 + }, + { + "epoch": 12.755775577557756, + "grad_norm": 0.010850343853235245, + "learning_rate": 1.7458782143202267e-05, + "loss": 0.0017, + "num_input_tokens_seen": 24470912, + "step": 115950 + }, + { + "epoch": 12.756325632563257, + "grad_norm": 0.15328094363212585, + "learning_rate": 1.7456493907452463e-05, + "loss": 0.0029, + "num_input_tokens_seen": 24471904, + "step": 115955 + }, + { + "epoch": 12.756875687568757, + "grad_norm": 0.07905122637748718, + "learning_rate": 1.7454205741227528e-05, + "loss": 0.0064, + "num_input_tokens_seen": 24473024, + "step": 115960 + }, + { + "epoch": 12.757425742574258, + "grad_norm": 3.4647421836853027, + "learning_rate": 1.7451917644548554e-05, + "loss": 0.0686, + "num_input_tokens_seen": 24474016, + "step": 115965 + }, + { + "epoch": 12.757975797579759, + "grad_norm": 0.042871952056884766, + "learning_rate": 1.7449629617436643e-05, + "loss": 0.0074, + "num_input_tokens_seen": 24475008, + "step": 115970 + }, + { + "epoch": 12.758525852585258, + "grad_norm": 0.028025100007653236, + "learning_rate": 1.744734165991286e-05, + "loss": 0.011, + "num_input_tokens_seen": 24476064, + "step": 115975 + }, + { + "epoch": 12.75907590759076, + "grad_norm": 0.12342268973588943, + "learning_rate": 1.7445053771998308e-05, + "loss": 0.0329, + "num_input_tokens_seen": 24477152, + "step": 115980 + }, + { + "epoch": 12.75962596259626, + "grad_norm": 1.4488276243209839, + "learning_rate": 1.744276595371408e-05, + "loss": 0.0451, + "num_input_tokens_seen": 24478240, + "step": 115985 + }, + { + "epoch": 12.76017601760176, + "grad_norm": 0.01942513696849346, + "learning_rate": 1.744047820508124e-05, + "loss": 0.0838, + "num_input_tokens_seen": 24479296, + "step": 115990 + }, + { + "epoch": 12.76072607260726, + "grad_norm": 0.0654435083270073, + "learning_rate": 1.7438190526120893e-05, + "loss": 0.014, + "num_input_tokens_seen": 24480288, + "step": 115995 + }, + { + "epoch": 12.761276127612762, + "grad_norm": 2.192683696746826, + "learning_rate": 1.743590291685411e-05, + "loss": 0.0635, + "num_input_tokens_seen": 24481376, + "step": 116000 + }, + { + "epoch": 12.761826182618261, + "grad_norm": 0.0036722898948937654, + "learning_rate": 1.7433615377301983e-05, + "loss": 0.0066, + "num_input_tokens_seen": 24482400, + "step": 116005 + }, + { + "epoch": 12.762376237623762, + "grad_norm": 0.38412654399871826, + "learning_rate": 1.7431327907485595e-05, + "loss": 0.0046, + "num_input_tokens_seen": 24483424, + "step": 116010 + }, + { + "epoch": 12.762926292629263, + "grad_norm": 0.024880696088075638, + "learning_rate": 1.742904050742602e-05, + "loss": 0.0134, + "num_input_tokens_seen": 24484416, + "step": 116015 + }, + { + "epoch": 12.763476347634764, + "grad_norm": 0.04380696266889572, + "learning_rate": 1.7426753177144346e-05, + "loss": 0.2018, + "num_input_tokens_seen": 24485568, + "step": 116020 + }, + { + "epoch": 12.764026402640264, + "grad_norm": 0.014094282872974873, + "learning_rate": 1.7424465916661664e-05, + "loss": 0.0949, + "num_input_tokens_seen": 24486592, + "step": 116025 + }, + { + "epoch": 12.764576457645765, + "grad_norm": 1.8305716514587402, + "learning_rate": 1.7422178725999034e-05, + "loss": 0.0716, + "num_input_tokens_seen": 24487712, + "step": 116030 + }, + { + "epoch": 12.765126512651266, + "grad_norm": 1.2224934101104736, + "learning_rate": 1.7419891605177547e-05, + "loss": 0.0263, + "num_input_tokens_seen": 24488736, + "step": 116035 + }, + { + "epoch": 12.765676567656765, + "grad_norm": 0.010871064849197865, + "learning_rate": 1.741760455421828e-05, + "loss": 0.0455, + "num_input_tokens_seen": 24489824, + "step": 116040 + }, + { + "epoch": 12.766226622662266, + "grad_norm": 0.02556600607931614, + "learning_rate": 1.7415317573142325e-05, + "loss": 0.0173, + "num_input_tokens_seen": 24490816, + "step": 116045 + }, + { + "epoch": 12.766776677667767, + "grad_norm": 0.03688249737024307, + "learning_rate": 1.7413030661970742e-05, + "loss": 0.0231, + "num_input_tokens_seen": 24491904, + "step": 116050 + }, + { + "epoch": 12.767326732673267, + "grad_norm": 0.06948556751012802, + "learning_rate": 1.7410743820724606e-05, + "loss": 0.0625, + "num_input_tokens_seen": 24492928, + "step": 116055 + }, + { + "epoch": 12.767876787678768, + "grad_norm": 0.010905199684202671, + "learning_rate": 1.7408457049425013e-05, + "loss": 0.0075, + "num_input_tokens_seen": 24493984, + "step": 116060 + }, + { + "epoch": 12.768426842684269, + "grad_norm": 0.003463160479441285, + "learning_rate": 1.7406170348093022e-05, + "loss": 0.0663, + "num_input_tokens_seen": 24495072, + "step": 116065 + }, + { + "epoch": 12.768976897689768, + "grad_norm": 0.13273751735687256, + "learning_rate": 1.7403883716749723e-05, + "loss": 0.1266, + "num_input_tokens_seen": 24496064, + "step": 116070 + }, + { + "epoch": 12.76952695269527, + "grad_norm": 0.007421276532113552, + "learning_rate": 1.740159715541618e-05, + "loss": 0.0025, + "num_input_tokens_seen": 24497120, + "step": 116075 + }, + { + "epoch": 12.77007700770077, + "grad_norm": 0.022798791527748108, + "learning_rate": 1.7399310664113467e-05, + "loss": 0.0029, + "num_input_tokens_seen": 24498208, + "step": 116080 + }, + { + "epoch": 12.770627062706271, + "grad_norm": 0.03347473591566086, + "learning_rate": 1.7397024242862668e-05, + "loss": 0.1194, + "num_input_tokens_seen": 24499264, + "step": 116085 + }, + { + "epoch": 12.77117711771177, + "grad_norm": 0.08995874226093292, + "learning_rate": 1.7394737891684837e-05, + "loss": 0.0112, + "num_input_tokens_seen": 24500352, + "step": 116090 + }, + { + "epoch": 12.771727172717272, + "grad_norm": 0.6332334280014038, + "learning_rate": 1.7392451610601063e-05, + "loss": 0.0244, + "num_input_tokens_seen": 24501408, + "step": 116095 + }, + { + "epoch": 12.772277227722773, + "grad_norm": 2.2165122032165527, + "learning_rate": 1.7390165399632417e-05, + "loss": 0.0402, + "num_input_tokens_seen": 24502432, + "step": 116100 + }, + { + "epoch": 12.772827282728272, + "grad_norm": 0.024393659085035324, + "learning_rate": 1.7387879258799956e-05, + "loss": 0.0402, + "num_input_tokens_seen": 24503520, + "step": 116105 + }, + { + "epoch": 12.773377337733773, + "grad_norm": 0.030698738992214203, + "learning_rate": 1.7385593188124767e-05, + "loss": 0.0086, + "num_input_tokens_seen": 24504544, + "step": 116110 + }, + { + "epoch": 12.773927392739274, + "grad_norm": 2.363083839416504, + "learning_rate": 1.73833071876279e-05, + "loss": 0.1143, + "num_input_tokens_seen": 24505632, + "step": 116115 + }, + { + "epoch": 12.774477447744774, + "grad_norm": 1.7512527704238892, + "learning_rate": 1.738102125733044e-05, + "loss": 0.0678, + "num_input_tokens_seen": 24506688, + "step": 116120 + }, + { + "epoch": 12.775027502750275, + "grad_norm": 0.023076709359884262, + "learning_rate": 1.7378735397253464e-05, + "loss": 0.0078, + "num_input_tokens_seen": 24507712, + "step": 116125 + }, + { + "epoch": 12.775577557755776, + "grad_norm": 0.018686510622501373, + "learning_rate": 1.7376449607418015e-05, + "loss": 0.0102, + "num_input_tokens_seen": 24508800, + "step": 116130 + }, + { + "epoch": 12.776127612761275, + "grad_norm": 1.9854933023452759, + "learning_rate": 1.7374163887845172e-05, + "loss": 0.1353, + "num_input_tokens_seen": 24509856, + "step": 116135 + }, + { + "epoch": 12.776677667766776, + "grad_norm": 1.068699598312378, + "learning_rate": 1.7371878238556003e-05, + "loss": 0.0107, + "num_input_tokens_seen": 24510944, + "step": 116140 + }, + { + "epoch": 12.777227722772277, + "grad_norm": 0.11718728393316269, + "learning_rate": 1.7369592659571578e-05, + "loss": 0.0045, + "num_input_tokens_seen": 24512064, + "step": 116145 + }, + { + "epoch": 12.777777777777779, + "grad_norm": 0.4935269057750702, + "learning_rate": 1.7367307150912955e-05, + "loss": 0.0178, + "num_input_tokens_seen": 24513120, + "step": 116150 + }, + { + "epoch": 12.778327832783278, + "grad_norm": 0.009304918348789215, + "learning_rate": 1.7365021712601186e-05, + "loss": 0.089, + "num_input_tokens_seen": 24514176, + "step": 116155 + }, + { + "epoch": 12.778877887788779, + "grad_norm": 0.11921186745166779, + "learning_rate": 1.7362736344657365e-05, + "loss": 0.154, + "num_input_tokens_seen": 24515232, + "step": 116160 + }, + { + "epoch": 12.77942794279428, + "grad_norm": 0.024137835949659348, + "learning_rate": 1.7360451047102527e-05, + "loss": 0.0039, + "num_input_tokens_seen": 24516288, + "step": 116165 + }, + { + "epoch": 12.77997799779978, + "grad_norm": 1.2988805770874023, + "learning_rate": 1.735816581995775e-05, + "loss": 0.0765, + "num_input_tokens_seen": 24517280, + "step": 116170 + }, + { + "epoch": 12.78052805280528, + "grad_norm": 0.2944655418395996, + "learning_rate": 1.7355880663244094e-05, + "loss": 0.0049, + "num_input_tokens_seen": 24518368, + "step": 116175 + }, + { + "epoch": 12.781078107810782, + "grad_norm": 0.02140960469841957, + "learning_rate": 1.735359557698261e-05, + "loss": 0.0017, + "num_input_tokens_seen": 24519424, + "step": 116180 + }, + { + "epoch": 12.781628162816281, + "grad_norm": 1.0481781959533691, + "learning_rate": 1.735131056119438e-05, + "loss": 0.1089, + "num_input_tokens_seen": 24520448, + "step": 116185 + }, + { + "epoch": 12.782178217821782, + "grad_norm": 2.1153080463409424, + "learning_rate": 1.7349025615900436e-05, + "loss": 0.0775, + "num_input_tokens_seen": 24521440, + "step": 116190 + }, + { + "epoch": 12.782728272827283, + "grad_norm": 0.21231505274772644, + "learning_rate": 1.734674074112186e-05, + "loss": 0.0052, + "num_input_tokens_seen": 24522464, + "step": 116195 + }, + { + "epoch": 12.783278327832782, + "grad_norm": 1.2577481269836426, + "learning_rate": 1.7344455936879704e-05, + "loss": 0.0129, + "num_input_tokens_seen": 24523584, + "step": 116200 + }, + { + "epoch": 12.783828382838283, + "grad_norm": 0.01215166412293911, + "learning_rate": 1.7342171203195014e-05, + "loss": 0.0036, + "num_input_tokens_seen": 24524608, + "step": 116205 + }, + { + "epoch": 12.784378437843785, + "grad_norm": 0.04410878196358681, + "learning_rate": 1.7339886540088866e-05, + "loss": 0.0047, + "num_input_tokens_seen": 24525632, + "step": 116210 + }, + { + "epoch": 12.784928492849286, + "grad_norm": 0.034336913377046585, + "learning_rate": 1.7337601947582298e-05, + "loss": 0.0525, + "num_input_tokens_seen": 24526688, + "step": 116215 + }, + { + "epoch": 12.785478547854785, + "grad_norm": 0.03345119580626488, + "learning_rate": 1.7335317425696394e-05, + "loss": 0.0234, + "num_input_tokens_seen": 24527744, + "step": 116220 + }, + { + "epoch": 12.786028602860286, + "grad_norm": 1.5047603845596313, + "learning_rate": 1.733303297445218e-05, + "loss": 0.0193, + "num_input_tokens_seen": 24528800, + "step": 116225 + }, + { + "epoch": 12.786578657865787, + "grad_norm": 0.033191993832588196, + "learning_rate": 1.733074859387072e-05, + "loss": 0.084, + "num_input_tokens_seen": 24529856, + "step": 116230 + }, + { + "epoch": 12.787128712871286, + "grad_norm": 0.010751497931778431, + "learning_rate": 1.7328464283973074e-05, + "loss": 0.0086, + "num_input_tokens_seen": 24530880, + "step": 116235 + }, + { + "epoch": 12.787678767876788, + "grad_norm": 0.0691601037979126, + "learning_rate": 1.7326180044780296e-05, + "loss": 0.0432, + "num_input_tokens_seen": 24531904, + "step": 116240 + }, + { + "epoch": 12.788228822882289, + "grad_norm": 0.10883469134569168, + "learning_rate": 1.7323895876313424e-05, + "loss": 0.0427, + "num_input_tokens_seen": 24532992, + "step": 116245 + }, + { + "epoch": 12.788778877887788, + "grad_norm": 0.06977395713329315, + "learning_rate": 1.7321611778593526e-05, + "loss": 0.0034, + "num_input_tokens_seen": 24534048, + "step": 116250 + }, + { + "epoch": 12.789328932893289, + "grad_norm": 0.04697199538350105, + "learning_rate": 1.7319327751641644e-05, + "loss": 0.0214, + "num_input_tokens_seen": 24535136, + "step": 116255 + }, + { + "epoch": 12.78987898789879, + "grad_norm": 0.5411428213119507, + "learning_rate": 1.7317043795478844e-05, + "loss": 0.0363, + "num_input_tokens_seen": 24536224, + "step": 116260 + }, + { + "epoch": 12.79042904290429, + "grad_norm": 1.9310308694839478, + "learning_rate": 1.7314759910126155e-05, + "loss": 0.074, + "num_input_tokens_seen": 24537312, + "step": 116265 + }, + { + "epoch": 12.79097909790979, + "grad_norm": 0.05642737075686455, + "learning_rate": 1.7312476095604634e-05, + "loss": 0.0088, + "num_input_tokens_seen": 24538336, + "step": 116270 + }, + { + "epoch": 12.791529152915292, + "grad_norm": 0.016359761357307434, + "learning_rate": 1.7310192351935343e-05, + "loss": 0.0858, + "num_input_tokens_seen": 24539328, + "step": 116275 + }, + { + "epoch": 12.792079207920793, + "grad_norm": 0.3800690770149231, + "learning_rate": 1.7307908679139307e-05, + "loss": 0.006, + "num_input_tokens_seen": 24540384, + "step": 116280 + }, + { + "epoch": 12.792629262926292, + "grad_norm": 0.17552606761455536, + "learning_rate": 1.730562507723759e-05, + "loss": 0.0161, + "num_input_tokens_seen": 24541440, + "step": 116285 + }, + { + "epoch": 12.793179317931793, + "grad_norm": 0.0036750747822225094, + "learning_rate": 1.7303341546251242e-05, + "loss": 0.0187, + "num_input_tokens_seen": 24542432, + "step": 116290 + }, + { + "epoch": 12.793729372937294, + "grad_norm": 2.338322639465332, + "learning_rate": 1.7301058086201284e-05, + "loss": 0.0667, + "num_input_tokens_seen": 24543488, + "step": 116295 + }, + { + "epoch": 12.794279427942794, + "grad_norm": 0.028594261035323143, + "learning_rate": 1.72987746971088e-05, + "loss": 0.0074, + "num_input_tokens_seen": 24544512, + "step": 116300 + }, + { + "epoch": 12.794829482948295, + "grad_norm": 1.2973445653915405, + "learning_rate": 1.7296491378994797e-05, + "loss": 0.027, + "num_input_tokens_seen": 24545600, + "step": 116305 + }, + { + "epoch": 12.795379537953796, + "grad_norm": 0.28628507256507874, + "learning_rate": 1.7294208131880346e-05, + "loss": 0.0453, + "num_input_tokens_seen": 24546656, + "step": 116310 + }, + { + "epoch": 12.795929592959295, + "grad_norm": 0.06861753761768341, + "learning_rate": 1.7291924955786484e-05, + "loss": 0.0021, + "num_input_tokens_seen": 24547744, + "step": 116315 + }, + { + "epoch": 12.796479647964796, + "grad_norm": 0.17698562145233154, + "learning_rate": 1.728964185073424e-05, + "loss": 0.0078, + "num_input_tokens_seen": 24548768, + "step": 116320 + }, + { + "epoch": 12.797029702970297, + "grad_norm": 2.009477138519287, + "learning_rate": 1.7287358816744677e-05, + "loss": 0.0295, + "num_input_tokens_seen": 24549856, + "step": 116325 + }, + { + "epoch": 12.797579757975798, + "grad_norm": 0.009507044218480587, + "learning_rate": 1.7285075853838816e-05, + "loss": 0.0034, + "num_input_tokens_seen": 24550944, + "step": 116330 + }, + { + "epoch": 12.798129812981298, + "grad_norm": 0.09103613346815109, + "learning_rate": 1.7282792962037726e-05, + "loss": 0.1081, + "num_input_tokens_seen": 24552032, + "step": 116335 + }, + { + "epoch": 12.798679867986799, + "grad_norm": 0.0805731862783432, + "learning_rate": 1.728051014136242e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24553088, + "step": 116340 + }, + { + "epoch": 12.7992299229923, + "grad_norm": 1.8619318008422852, + "learning_rate": 1.7278227391833942e-05, + "loss": 0.0153, + "num_input_tokens_seen": 24554144, + "step": 116345 + }, + { + "epoch": 12.7997799779978, + "grad_norm": 0.09374063462018967, + "learning_rate": 1.727594471347335e-05, + "loss": 0.0419, + "num_input_tokens_seen": 24555168, + "step": 116350 + }, + { + "epoch": 12.8003300330033, + "grad_norm": 0.01666279323399067, + "learning_rate": 1.7273662106301652e-05, + "loss": 0.0296, + "num_input_tokens_seen": 24556320, + "step": 116355 + }, + { + "epoch": 12.800880088008801, + "grad_norm": 0.2986571788787842, + "learning_rate": 1.727137957033992e-05, + "loss": 0.0052, + "num_input_tokens_seen": 24557408, + "step": 116360 + }, + { + "epoch": 12.8014301430143, + "grad_norm": 0.08031042665243149, + "learning_rate": 1.7269097105609168e-05, + "loss": 0.0087, + "num_input_tokens_seen": 24558464, + "step": 116365 + }, + { + "epoch": 12.801980198019802, + "grad_norm": 0.011521991342306137, + "learning_rate": 1.726681471213043e-05, + "loss": 0.0438, + "num_input_tokens_seen": 24559520, + "step": 116370 + }, + { + "epoch": 12.802530253025303, + "grad_norm": 0.01750272698700428, + "learning_rate": 1.7264532389924764e-05, + "loss": 0.0042, + "num_input_tokens_seen": 24560608, + "step": 116375 + }, + { + "epoch": 12.803080308030804, + "grad_norm": 0.005602648947387934, + "learning_rate": 1.726225013901318e-05, + "loss": 0.0022, + "num_input_tokens_seen": 24561696, + "step": 116380 + }, + { + "epoch": 12.803630363036303, + "grad_norm": 0.017650391906499863, + "learning_rate": 1.7259967959416727e-05, + "loss": 0.1032, + "num_input_tokens_seen": 24562752, + "step": 116385 + }, + { + "epoch": 12.804180418041804, + "grad_norm": 0.05570811778306961, + "learning_rate": 1.7257685851156447e-05, + "loss": 0.0514, + "num_input_tokens_seen": 24563808, + "step": 116390 + }, + { + "epoch": 12.804730473047305, + "grad_norm": 1.6791892051696777, + "learning_rate": 1.7255403814253347e-05, + "loss": 0.0067, + "num_input_tokens_seen": 24564864, + "step": 116395 + }, + { + "epoch": 12.805280528052805, + "grad_norm": 0.06918440759181976, + "learning_rate": 1.7253121848728477e-05, + "loss": 0.0074, + "num_input_tokens_seen": 24565920, + "step": 116400 + }, + { + "epoch": 12.805830583058306, + "grad_norm": 1.9146157503128052, + "learning_rate": 1.725083995460286e-05, + "loss": 0.0283, + "num_input_tokens_seen": 24566976, + "step": 116405 + }, + { + "epoch": 12.806380638063807, + "grad_norm": 0.02983895316720009, + "learning_rate": 1.7248558131897546e-05, + "loss": 0.0024, + "num_input_tokens_seen": 24568000, + "step": 116410 + }, + { + "epoch": 12.806930693069306, + "grad_norm": 0.08654060959815979, + "learning_rate": 1.7246276380633546e-05, + "loss": 0.0267, + "num_input_tokens_seen": 24569056, + "step": 116415 + }, + { + "epoch": 12.807480748074807, + "grad_norm": 0.1561482697725296, + "learning_rate": 1.724399470083189e-05, + "loss": 0.0045, + "num_input_tokens_seen": 24570080, + "step": 116420 + }, + { + "epoch": 12.808030803080309, + "grad_norm": 0.03454488515853882, + "learning_rate": 1.7241713092513616e-05, + "loss": 0.0805, + "num_input_tokens_seen": 24571136, + "step": 116425 + }, + { + "epoch": 12.808580858085808, + "grad_norm": 0.047858718782663345, + "learning_rate": 1.7239431555699757e-05, + "loss": 0.0445, + "num_input_tokens_seen": 24572160, + "step": 116430 + }, + { + "epoch": 12.809130913091309, + "grad_norm": 1.0906320810317993, + "learning_rate": 1.7237150090411326e-05, + "loss": 0.0569, + "num_input_tokens_seen": 24573184, + "step": 116435 + }, + { + "epoch": 12.80968096809681, + "grad_norm": 0.2219868302345276, + "learning_rate": 1.7234868696669362e-05, + "loss": 0.0045, + "num_input_tokens_seen": 24574240, + "step": 116440 + }, + { + "epoch": 12.810231023102311, + "grad_norm": 0.1161281019449234, + "learning_rate": 1.7232587374494878e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24575296, + "step": 116445 + }, + { + "epoch": 12.81078107810781, + "grad_norm": 0.06702185422182083, + "learning_rate": 1.7230306123908923e-05, + "loss": 0.0022, + "num_input_tokens_seen": 24576320, + "step": 116450 + }, + { + "epoch": 12.811331133113312, + "grad_norm": 0.01012458000332117, + "learning_rate": 1.72280249449325e-05, + "loss": 0.0042, + "num_input_tokens_seen": 24577376, + "step": 116455 + }, + { + "epoch": 12.811881188118813, + "grad_norm": 0.1088775098323822, + "learning_rate": 1.722574383758664e-05, + "loss": 0.0042, + "num_input_tokens_seen": 24578464, + "step": 116460 + }, + { + "epoch": 12.812431243124312, + "grad_norm": 0.020440105348825455, + "learning_rate": 1.722346280189238e-05, + "loss": 0.0505, + "num_input_tokens_seen": 24579552, + "step": 116465 + }, + { + "epoch": 12.812981298129813, + "grad_norm": 0.006187452934682369, + "learning_rate": 1.722118183787072e-05, + "loss": 0.0034, + "num_input_tokens_seen": 24580576, + "step": 116470 + }, + { + "epoch": 12.813531353135314, + "grad_norm": 1.2627172470092773, + "learning_rate": 1.7218900945542698e-05, + "loss": 0.0577, + "num_input_tokens_seen": 24581536, + "step": 116475 + }, + { + "epoch": 12.814081408140813, + "grad_norm": 0.5837905406951904, + "learning_rate": 1.7216620124929335e-05, + "loss": 0.0098, + "num_input_tokens_seen": 24582592, + "step": 116480 + }, + { + "epoch": 12.814631463146315, + "grad_norm": 0.48610737919807434, + "learning_rate": 1.721433937605164e-05, + "loss": 0.0214, + "num_input_tokens_seen": 24583584, + "step": 116485 + }, + { + "epoch": 12.815181518151816, + "grad_norm": 0.009401973336935043, + "learning_rate": 1.7212058698930655e-05, + "loss": 0.0623, + "num_input_tokens_seen": 24584640, + "step": 116490 + }, + { + "epoch": 12.815731573157315, + "grad_norm": 0.68342125415802, + "learning_rate": 1.720977809358738e-05, + "loss": 0.0089, + "num_input_tokens_seen": 24585664, + "step": 116495 + }, + { + "epoch": 12.816281628162816, + "grad_norm": 0.27005520462989807, + "learning_rate": 1.7207497560042843e-05, + "loss": 0.0025, + "num_input_tokens_seen": 24586752, + "step": 116500 + }, + { + "epoch": 12.816831683168317, + "grad_norm": 0.008340273052453995, + "learning_rate": 1.720521709831807e-05, + "loss": 0.0752, + "num_input_tokens_seen": 24587776, + "step": 116505 + }, + { + "epoch": 12.817381738173818, + "grad_norm": 0.012825395911931992, + "learning_rate": 1.720293670843406e-05, + "loss": 0.0031, + "num_input_tokens_seen": 24588800, + "step": 116510 + }, + { + "epoch": 12.817931793179318, + "grad_norm": 0.023042622953653336, + "learning_rate": 1.7200656390411847e-05, + "loss": 0.0297, + "num_input_tokens_seen": 24589792, + "step": 116515 + }, + { + "epoch": 12.818481848184819, + "grad_norm": 0.052632324397563934, + "learning_rate": 1.719837614427243e-05, + "loss": 0.008, + "num_input_tokens_seen": 24590816, + "step": 116520 + }, + { + "epoch": 12.81903190319032, + "grad_norm": 0.08075510710477829, + "learning_rate": 1.719609597003686e-05, + "loss": 0.0043, + "num_input_tokens_seen": 24591936, + "step": 116525 + }, + { + "epoch": 12.819581958195819, + "grad_norm": 1.7210413217544556, + "learning_rate": 1.7193815867726114e-05, + "loss": 0.0573, + "num_input_tokens_seen": 24592928, + "step": 116530 + }, + { + "epoch": 12.82013201320132, + "grad_norm": 0.012811249122023582, + "learning_rate": 1.7191535837361215e-05, + "loss": 0.0406, + "num_input_tokens_seen": 24594016, + "step": 116535 + }, + { + "epoch": 12.820682068206821, + "grad_norm": 0.01752656325697899, + "learning_rate": 1.718925587896319e-05, + "loss": 0.0296, + "num_input_tokens_seen": 24595072, + "step": 116540 + }, + { + "epoch": 12.82123212321232, + "grad_norm": 0.462998628616333, + "learning_rate": 1.7186975992553047e-05, + "loss": 0.0073, + "num_input_tokens_seen": 24596096, + "step": 116545 + }, + { + "epoch": 12.821782178217822, + "grad_norm": 0.11223665624856949, + "learning_rate": 1.71846961781518e-05, + "loss": 0.0023, + "num_input_tokens_seen": 24597152, + "step": 116550 + }, + { + "epoch": 12.822332233223323, + "grad_norm": 0.21026891469955444, + "learning_rate": 1.7182416435780454e-05, + "loss": 0.0178, + "num_input_tokens_seen": 24598176, + "step": 116555 + }, + { + "epoch": 12.822882288228822, + "grad_norm": 0.037576477974653244, + "learning_rate": 1.718013676546002e-05, + "loss": 0.0263, + "num_input_tokens_seen": 24599232, + "step": 116560 + }, + { + "epoch": 12.823432343234323, + "grad_norm": 0.03964557871222496, + "learning_rate": 1.7177857167211527e-05, + "loss": 0.0796, + "num_input_tokens_seen": 24600224, + "step": 116565 + }, + { + "epoch": 12.823982398239824, + "grad_norm": 0.13455116748809814, + "learning_rate": 1.7175577641055957e-05, + "loss": 0.0074, + "num_input_tokens_seen": 24601248, + "step": 116570 + }, + { + "epoch": 12.824532453245325, + "grad_norm": 2.8469464778900146, + "learning_rate": 1.717329818701434e-05, + "loss": 0.2243, + "num_input_tokens_seen": 24602272, + "step": 116575 + }, + { + "epoch": 12.825082508250825, + "grad_norm": 0.5829986333847046, + "learning_rate": 1.717101880510768e-05, + "loss": 0.0635, + "num_input_tokens_seen": 24603360, + "step": 116580 + }, + { + "epoch": 12.825632563256326, + "grad_norm": 0.01684149168431759, + "learning_rate": 1.7168739495356977e-05, + "loss": 0.0025, + "num_input_tokens_seen": 24604448, + "step": 116585 + }, + { + "epoch": 12.826182618261827, + "grad_norm": 0.006078936625272036, + "learning_rate": 1.716646025778325e-05, + "loss": 0.0397, + "num_input_tokens_seen": 24605536, + "step": 116590 + }, + { + "epoch": 12.826732673267326, + "grad_norm": 0.04382658004760742, + "learning_rate": 1.716418109240749e-05, + "loss": 0.113, + "num_input_tokens_seen": 24606656, + "step": 116595 + }, + { + "epoch": 12.827282728272827, + "grad_norm": 0.11887959390878677, + "learning_rate": 1.716190199925072e-05, + "loss": 0.0249, + "num_input_tokens_seen": 24607712, + "step": 116600 + }, + { + "epoch": 12.827832783278328, + "grad_norm": 0.04378858953714371, + "learning_rate": 1.7159622978333945e-05, + "loss": 0.0085, + "num_input_tokens_seen": 24608768, + "step": 116605 + }, + { + "epoch": 12.828382838283828, + "grad_norm": 0.4038788974285126, + "learning_rate": 1.715734402967815e-05, + "loss": 0.0815, + "num_input_tokens_seen": 24609824, + "step": 116610 + }, + { + "epoch": 12.828932893289329, + "grad_norm": 1.0931795835494995, + "learning_rate": 1.715506515330436e-05, + "loss": 0.0298, + "num_input_tokens_seen": 24610816, + "step": 116615 + }, + { + "epoch": 12.82948294829483, + "grad_norm": 0.7259512543678284, + "learning_rate": 1.7152786349233573e-05, + "loss": 0.1381, + "num_input_tokens_seen": 24611808, + "step": 116620 + }, + { + "epoch": 12.83003300330033, + "grad_norm": 0.012637123465538025, + "learning_rate": 1.715050761748678e-05, + "loss": 0.0295, + "num_input_tokens_seen": 24612832, + "step": 116625 + }, + { + "epoch": 12.83058305830583, + "grad_norm": 0.06163882464170456, + "learning_rate": 1.7148228958084994e-05, + "loss": 0.2167, + "num_input_tokens_seen": 24613888, + "step": 116630 + }, + { + "epoch": 12.831133113311331, + "grad_norm": 0.13236209750175476, + "learning_rate": 1.7145950371049215e-05, + "loss": 0.0484, + "num_input_tokens_seen": 24614976, + "step": 116635 + }, + { + "epoch": 12.831683168316832, + "grad_norm": 0.009752027690410614, + "learning_rate": 1.7143671856400444e-05, + "loss": 0.0774, + "num_input_tokens_seen": 24616032, + "step": 116640 + }, + { + "epoch": 12.832233223322332, + "grad_norm": 0.21898743510246277, + "learning_rate": 1.714139341415968e-05, + "loss": 0.0368, + "num_input_tokens_seen": 24617088, + "step": 116645 + }, + { + "epoch": 12.832783278327833, + "grad_norm": 0.062367819249629974, + "learning_rate": 1.713911504434792e-05, + "loss": 0.0111, + "num_input_tokens_seen": 24618176, + "step": 116650 + }, + { + "epoch": 12.833333333333334, + "grad_norm": 0.20685428380966187, + "learning_rate": 1.713683674698617e-05, + "loss": 0.0025, + "num_input_tokens_seen": 24619232, + "step": 116655 + }, + { + "epoch": 12.833883388338833, + "grad_norm": 0.05880739912390709, + "learning_rate": 1.7134558522095407e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24620320, + "step": 116660 + }, + { + "epoch": 12.834433443344334, + "grad_norm": 0.09056884795427322, + "learning_rate": 1.713228036969666e-05, + "loss": 0.0306, + "num_input_tokens_seen": 24621440, + "step": 116665 + }, + { + "epoch": 12.834983498349835, + "grad_norm": 0.14206144213676453, + "learning_rate": 1.7130002289810902e-05, + "loss": 0.0964, + "num_input_tokens_seen": 24622464, + "step": 116670 + }, + { + "epoch": 12.835533553355335, + "grad_norm": 0.024768203496932983, + "learning_rate": 1.7127724282459136e-05, + "loss": 0.0321, + "num_input_tokens_seen": 24623488, + "step": 116675 + }, + { + "epoch": 12.836083608360836, + "grad_norm": 0.2503580152988434, + "learning_rate": 1.7125446347662365e-05, + "loss": 0.022, + "num_input_tokens_seen": 24624480, + "step": 116680 + }, + { + "epoch": 12.836633663366337, + "grad_norm": 0.09057138115167618, + "learning_rate": 1.712316848544157e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24625568, + "step": 116685 + }, + { + "epoch": 12.837183718371836, + "grad_norm": 0.1976321041584015, + "learning_rate": 1.7120890695817752e-05, + "loss": 0.0371, + "num_input_tokens_seen": 24626624, + "step": 116690 + }, + { + "epoch": 12.837733773377337, + "grad_norm": 0.306232750415802, + "learning_rate": 1.7118612978811913e-05, + "loss": 0.0583, + "num_input_tokens_seen": 24627680, + "step": 116695 + }, + { + "epoch": 12.838283828382838, + "grad_norm": 0.059436775743961334, + "learning_rate": 1.7116335334445023e-05, + "loss": 0.0053, + "num_input_tokens_seen": 24628768, + "step": 116700 + }, + { + "epoch": 12.83883388338834, + "grad_norm": 1.2521179914474487, + "learning_rate": 1.7114057762738094e-05, + "loss": 0.04, + "num_input_tokens_seen": 24629792, + "step": 116705 + }, + { + "epoch": 12.839383938393839, + "grad_norm": 0.6470771431922913, + "learning_rate": 1.7111780263712103e-05, + "loss": 0.0123, + "num_input_tokens_seen": 24630880, + "step": 116710 + }, + { + "epoch": 12.83993399339934, + "grad_norm": 0.6011210083961487, + "learning_rate": 1.7109502837388062e-05, + "loss": 0.0143, + "num_input_tokens_seen": 24631968, + "step": 116715 + }, + { + "epoch": 12.840484048404841, + "grad_norm": 0.29258161783218384, + "learning_rate": 1.710722548378694e-05, + "loss": 0.0148, + "num_input_tokens_seen": 24632960, + "step": 116720 + }, + { + "epoch": 12.84103410341034, + "grad_norm": 0.03596848249435425, + "learning_rate": 1.710494820292973e-05, + "loss": 0.1276, + "num_input_tokens_seen": 24634016, + "step": 116725 + }, + { + "epoch": 12.841584158415841, + "grad_norm": 0.9533481597900391, + "learning_rate": 1.7102670994837428e-05, + "loss": 0.0696, + "num_input_tokens_seen": 24635040, + "step": 116730 + }, + { + "epoch": 12.842134213421343, + "grad_norm": 1.8119450807571411, + "learning_rate": 1.710039385953101e-05, + "loss": 0.0459, + "num_input_tokens_seen": 24636128, + "step": 116735 + }, + { + "epoch": 12.842684268426842, + "grad_norm": 1.0840471982955933, + "learning_rate": 1.7098116797031483e-05, + "loss": 0.0662, + "num_input_tokens_seen": 24637184, + "step": 116740 + }, + { + "epoch": 12.843234323432343, + "grad_norm": 0.11381171643733978, + "learning_rate": 1.709583980735982e-05, + "loss": 0.0052, + "num_input_tokens_seen": 24638176, + "step": 116745 + }, + { + "epoch": 12.843784378437844, + "grad_norm": 2.153642177581787, + "learning_rate": 1.7093562890537e-05, + "loss": 0.0295, + "num_input_tokens_seen": 24639168, + "step": 116750 + }, + { + "epoch": 12.844334433443345, + "grad_norm": 0.008902572095394135, + "learning_rate": 1.7091286046584026e-05, + "loss": 0.003, + "num_input_tokens_seen": 24640192, + "step": 116755 + }, + { + "epoch": 12.844884488448844, + "grad_norm": 0.0680762305855751, + "learning_rate": 1.7089009275521867e-05, + "loss": 0.009, + "num_input_tokens_seen": 24641280, + "step": 116760 + }, + { + "epoch": 12.845434543454346, + "grad_norm": 0.010864008218050003, + "learning_rate": 1.7086732577371516e-05, + "loss": 0.0418, + "num_input_tokens_seen": 24642336, + "step": 116765 + }, + { + "epoch": 12.845984598459847, + "grad_norm": 0.025128765031695366, + "learning_rate": 1.708445595215396e-05, + "loss": 0.1166, + "num_input_tokens_seen": 24643360, + "step": 116770 + }, + { + "epoch": 12.846534653465346, + "grad_norm": 0.01167375035583973, + "learning_rate": 1.708217939989016e-05, + "loss": 0.0364, + "num_input_tokens_seen": 24644416, + "step": 116775 + }, + { + "epoch": 12.847084708470847, + "grad_norm": 0.016002008691430092, + "learning_rate": 1.707990292060113e-05, + "loss": 0.0914, + "num_input_tokens_seen": 24645408, + "step": 116780 + }, + { + "epoch": 12.847634763476348, + "grad_norm": 0.031530868262052536, + "learning_rate": 1.707762651430782e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24646528, + "step": 116785 + }, + { + "epoch": 12.848184818481847, + "grad_norm": 0.22802859544754028, + "learning_rate": 1.7075350181031235e-05, + "loss": 0.0466, + "num_input_tokens_seen": 24647616, + "step": 116790 + }, + { + "epoch": 12.848734873487349, + "grad_norm": 0.02939465083181858, + "learning_rate": 1.707307392079235e-05, + "loss": 0.0558, + "num_input_tokens_seen": 24648672, + "step": 116795 + }, + { + "epoch": 12.84928492849285, + "grad_norm": 0.32163846492767334, + "learning_rate": 1.7070797733612123e-05, + "loss": 0.0067, + "num_input_tokens_seen": 24649696, + "step": 116800 + }, + { + "epoch": 12.84983498349835, + "grad_norm": 0.008856185711920261, + "learning_rate": 1.7068521619511558e-05, + "loss": 0.0073, + "num_input_tokens_seen": 24650688, + "step": 116805 + }, + { + "epoch": 12.85038503850385, + "grad_norm": 0.008460802026093006, + "learning_rate": 1.7066245578511623e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24651744, + "step": 116810 + }, + { + "epoch": 12.850935093509351, + "grad_norm": 0.052296269685029984, + "learning_rate": 1.7063969610633292e-05, + "loss": 0.0376, + "num_input_tokens_seen": 24652736, + "step": 116815 + }, + { + "epoch": 12.851485148514852, + "grad_norm": 0.29574161767959595, + "learning_rate": 1.7061693715897546e-05, + "loss": 0.0062, + "num_input_tokens_seen": 24653760, + "step": 116820 + }, + { + "epoch": 12.852035203520352, + "grad_norm": 0.10305901616811752, + "learning_rate": 1.705941789432536e-05, + "loss": 0.0091, + "num_input_tokens_seen": 24654848, + "step": 116825 + }, + { + "epoch": 12.852585258525853, + "grad_norm": 0.12510846555233002, + "learning_rate": 1.7057142145937717e-05, + "loss": 0.0247, + "num_input_tokens_seen": 24655840, + "step": 116830 + }, + { + "epoch": 12.853135313531354, + "grad_norm": 0.026210596784949303, + "learning_rate": 1.7054866470755574e-05, + "loss": 0.0367, + "num_input_tokens_seen": 24656896, + "step": 116835 + }, + { + "epoch": 12.853685368536853, + "grad_norm": 0.03630821779370308, + "learning_rate": 1.705259086879991e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24657984, + "step": 116840 + }, + { + "epoch": 12.854235423542354, + "grad_norm": 0.017515502870082855, + "learning_rate": 1.705031534009171e-05, + "loss": 0.0204, + "num_input_tokens_seen": 24659040, + "step": 116845 + }, + { + "epoch": 12.854785478547855, + "grad_norm": 0.04973214492201805, + "learning_rate": 1.7048039884651934e-05, + "loss": 0.0321, + "num_input_tokens_seen": 24660096, + "step": 116850 + }, + { + "epoch": 12.855335533553355, + "grad_norm": 2.507568836212158, + "learning_rate": 1.7045764502501568e-05, + "loss": 0.1316, + "num_input_tokens_seen": 24661184, + "step": 116855 + }, + { + "epoch": 12.855885588558856, + "grad_norm": 0.10199599713087082, + "learning_rate": 1.704348919366157e-05, + "loss": 0.11, + "num_input_tokens_seen": 24662240, + "step": 116860 + }, + { + "epoch": 12.856435643564357, + "grad_norm": 0.40578359365463257, + "learning_rate": 1.7041213958152907e-05, + "loss": 0.0303, + "num_input_tokens_seen": 24663200, + "step": 116865 + }, + { + "epoch": 12.856985698569858, + "grad_norm": 0.16852933168411255, + "learning_rate": 1.7038938795996567e-05, + "loss": 0.0076, + "num_input_tokens_seen": 24664224, + "step": 116870 + }, + { + "epoch": 12.857535753575357, + "grad_norm": 0.034947507083415985, + "learning_rate": 1.7036663707213498e-05, + "loss": 0.071, + "num_input_tokens_seen": 24665248, + "step": 116875 + }, + { + "epoch": 12.858085808580858, + "grad_norm": 0.13056732714176178, + "learning_rate": 1.7034388691824684e-05, + "loss": 0.0093, + "num_input_tokens_seen": 24666272, + "step": 116880 + }, + { + "epoch": 12.85863586358636, + "grad_norm": 0.00795821938663721, + "learning_rate": 1.7032113749851093e-05, + "loss": 0.0012, + "num_input_tokens_seen": 24667296, + "step": 116885 + }, + { + "epoch": 12.859185918591859, + "grad_norm": 0.04128454625606537, + "learning_rate": 1.7029838881313677e-05, + "loss": 0.0092, + "num_input_tokens_seen": 24668352, + "step": 116890 + }, + { + "epoch": 12.85973597359736, + "grad_norm": 0.03507761284708977, + "learning_rate": 1.702756408623341e-05, + "loss": 0.0628, + "num_input_tokens_seen": 24669408, + "step": 116895 + }, + { + "epoch": 12.86028602860286, + "grad_norm": 0.19318139553070068, + "learning_rate": 1.702528936463126e-05, + "loss": 0.0105, + "num_input_tokens_seen": 24670496, + "step": 116900 + }, + { + "epoch": 12.86083608360836, + "grad_norm": 0.026148851960897446, + "learning_rate": 1.7023014716528197e-05, + "loss": 0.0015, + "num_input_tokens_seen": 24671520, + "step": 116905 + }, + { + "epoch": 12.861386138613861, + "grad_norm": 0.010413167998194695, + "learning_rate": 1.702074014194518e-05, + "loss": 0.0021, + "num_input_tokens_seen": 24672576, + "step": 116910 + }, + { + "epoch": 12.861936193619362, + "grad_norm": 0.028271790593862534, + "learning_rate": 1.7018465640903164e-05, + "loss": 0.1369, + "num_input_tokens_seen": 24673664, + "step": 116915 + }, + { + "epoch": 12.862486248624862, + "grad_norm": 0.023191003128886223, + "learning_rate": 1.7016191213423128e-05, + "loss": 0.005, + "num_input_tokens_seen": 24674688, + "step": 116920 + }, + { + "epoch": 12.863036303630363, + "grad_norm": 0.014522550627589226, + "learning_rate": 1.701391685952602e-05, + "loss": 0.0009, + "num_input_tokens_seen": 24675712, + "step": 116925 + }, + { + "epoch": 12.863586358635864, + "grad_norm": 0.01388042327016592, + "learning_rate": 1.701164257923282e-05, + "loss": 0.0064, + "num_input_tokens_seen": 24676736, + "step": 116930 + }, + { + "epoch": 12.864136413641365, + "grad_norm": 1.0717508792877197, + "learning_rate": 1.7009368372564466e-05, + "loss": 0.0161, + "num_input_tokens_seen": 24677824, + "step": 116935 + }, + { + "epoch": 12.864686468646864, + "grad_norm": 0.0819491595029831, + "learning_rate": 1.7007094239541933e-05, + "loss": 0.0129, + "num_input_tokens_seen": 24678880, + "step": 116940 + }, + { + "epoch": 12.865236523652365, + "grad_norm": 0.0757511705160141, + "learning_rate": 1.7004820180186183e-05, + "loss": 0.0125, + "num_input_tokens_seen": 24679936, + "step": 116945 + }, + { + "epoch": 12.865786578657866, + "grad_norm": 0.011108385398983955, + "learning_rate": 1.7002546194518153e-05, + "loss": 0.0009, + "num_input_tokens_seen": 24680928, + "step": 116950 + }, + { + "epoch": 12.866336633663366, + "grad_norm": 1.6596680879592896, + "learning_rate": 1.7000272282558828e-05, + "loss": 0.0766, + "num_input_tokens_seen": 24682016, + "step": 116955 + }, + { + "epoch": 12.866886688668867, + "grad_norm": 0.07010769844055176, + "learning_rate": 1.699799844432916e-05, + "loss": 0.0036, + "num_input_tokens_seen": 24683168, + "step": 116960 + }, + { + "epoch": 12.867436743674368, + "grad_norm": 0.011795750819146633, + "learning_rate": 1.6995724679850087e-05, + "loss": 0.001, + "num_input_tokens_seen": 24684224, + "step": 116965 + }, + { + "epoch": 12.867986798679867, + "grad_norm": 0.027150006964802742, + "learning_rate": 1.699345098914259e-05, + "loss": 0.0187, + "num_input_tokens_seen": 24685312, + "step": 116970 + }, + { + "epoch": 12.868536853685368, + "grad_norm": 0.020266180858016014, + "learning_rate": 1.6991177372227607e-05, + "loss": 0.0328, + "num_input_tokens_seen": 24686336, + "step": 116975 + }, + { + "epoch": 12.86908690869087, + "grad_norm": 0.36015811562538147, + "learning_rate": 1.6988903829126102e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24687392, + "step": 116980 + }, + { + "epoch": 12.869636963696369, + "grad_norm": 0.01707354001700878, + "learning_rate": 1.6986630359859034e-05, + "loss": 0.0035, + "num_input_tokens_seen": 24688480, + "step": 116985 + }, + { + "epoch": 12.87018701870187, + "grad_norm": 1.3003817796707153, + "learning_rate": 1.6984356964447336e-05, + "loss": 0.0114, + "num_input_tokens_seen": 24689504, + "step": 116990 + }, + { + "epoch": 12.870737073707371, + "grad_norm": 0.3311695456504822, + "learning_rate": 1.6982083642911978e-05, + "loss": 0.1269, + "num_input_tokens_seen": 24690496, + "step": 116995 + }, + { + "epoch": 12.871287128712872, + "grad_norm": 0.025372609496116638, + "learning_rate": 1.6979810395273912e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24691520, + "step": 117000 + }, + { + "epoch": 12.871837183718371, + "grad_norm": 0.09920946508646011, + "learning_rate": 1.6977537221554077e-05, + "loss": 0.0102, + "num_input_tokens_seen": 24692544, + "step": 117005 + }, + { + "epoch": 12.872387238723872, + "grad_norm": 0.14248047769069672, + "learning_rate": 1.697526412177344e-05, + "loss": 0.1227, + "num_input_tokens_seen": 24693600, + "step": 117010 + }, + { + "epoch": 12.872937293729374, + "grad_norm": 0.01928173191845417, + "learning_rate": 1.697299109595293e-05, + "loss": 0.0034, + "num_input_tokens_seen": 24694656, + "step": 117015 + }, + { + "epoch": 12.873487348734873, + "grad_norm": 0.030506515875458717, + "learning_rate": 1.697071814411352e-05, + "loss": 0.0042, + "num_input_tokens_seen": 24695712, + "step": 117020 + }, + { + "epoch": 12.874037403740374, + "grad_norm": 0.21952340006828308, + "learning_rate": 1.6968445266276152e-05, + "loss": 0.0104, + "num_input_tokens_seen": 24696768, + "step": 117025 + }, + { + "epoch": 12.874587458745875, + "grad_norm": 0.4293183386325836, + "learning_rate": 1.696617246246176e-05, + "loss": 0.0145, + "num_input_tokens_seen": 24697856, + "step": 117030 + }, + { + "epoch": 12.875137513751374, + "grad_norm": 0.048262424767017365, + "learning_rate": 1.6963899732691308e-05, + "loss": 0.0011, + "num_input_tokens_seen": 24698912, + "step": 117035 + }, + { + "epoch": 12.875687568756875, + "grad_norm": 0.08256267011165619, + "learning_rate": 1.6961627076985728e-05, + "loss": 0.0626, + "num_input_tokens_seen": 24699936, + "step": 117040 + }, + { + "epoch": 12.876237623762377, + "grad_norm": 1.708141565322876, + "learning_rate": 1.6959354495365986e-05, + "loss": 0.1096, + "num_input_tokens_seen": 24700928, + "step": 117045 + }, + { + "epoch": 12.876787678767876, + "grad_norm": 1.840608835220337, + "learning_rate": 1.695708198785301e-05, + "loss": 0.0121, + "num_input_tokens_seen": 24702016, + "step": 117050 + }, + { + "epoch": 12.877337733773377, + "grad_norm": 0.00833926908671856, + "learning_rate": 1.6954809554467744e-05, + "loss": 0.0204, + "num_input_tokens_seen": 24703008, + "step": 117055 + }, + { + "epoch": 12.877887788778878, + "grad_norm": 0.10911231487989426, + "learning_rate": 1.695253719523115e-05, + "loss": 0.0052, + "num_input_tokens_seen": 24704064, + "step": 117060 + }, + { + "epoch": 12.87843784378438, + "grad_norm": 1.3060413599014282, + "learning_rate": 1.695026491016415e-05, + "loss": 0.0588, + "num_input_tokens_seen": 24705120, + "step": 117065 + }, + { + "epoch": 12.878987898789878, + "grad_norm": 0.13831554353237152, + "learning_rate": 1.6947992699287697e-05, + "loss": 0.0075, + "num_input_tokens_seen": 24706144, + "step": 117070 + }, + { + "epoch": 12.87953795379538, + "grad_norm": 0.00416301004588604, + "learning_rate": 1.6945720562622737e-05, + "loss": 0.0015, + "num_input_tokens_seen": 24707200, + "step": 117075 + }, + { + "epoch": 12.88008800880088, + "grad_norm": 0.29830077290534973, + "learning_rate": 1.6943448500190193e-05, + "loss": 0.0763, + "num_input_tokens_seen": 24708192, + "step": 117080 + }, + { + "epoch": 12.88063806380638, + "grad_norm": 0.005112926010042429, + "learning_rate": 1.694117651201103e-05, + "loss": 0.0013, + "num_input_tokens_seen": 24709248, + "step": 117085 + }, + { + "epoch": 12.881188118811881, + "grad_norm": 2.4534082412719727, + "learning_rate": 1.693890459810617e-05, + "loss": 0.0756, + "num_input_tokens_seen": 24710368, + "step": 117090 + }, + { + "epoch": 12.881738173817382, + "grad_norm": 0.008810286410152912, + "learning_rate": 1.693663275849656e-05, + "loss": 0.0369, + "num_input_tokens_seen": 24711520, + "step": 117095 + }, + { + "epoch": 12.882288228822881, + "grad_norm": 3.0254697799682617, + "learning_rate": 1.6934360993203145e-05, + "loss": 0.1924, + "num_input_tokens_seen": 24712512, + "step": 117100 + }, + { + "epoch": 12.882838283828383, + "grad_norm": 0.6990963816642761, + "learning_rate": 1.693208930224684e-05, + "loss": 0.1194, + "num_input_tokens_seen": 24713568, + "step": 117105 + }, + { + "epoch": 12.883388338833884, + "grad_norm": 0.1187061220407486, + "learning_rate": 1.692981768564861e-05, + "loss": 0.008, + "num_input_tokens_seen": 24714624, + "step": 117110 + }, + { + "epoch": 12.883938393839383, + "grad_norm": 0.012531655840575695, + "learning_rate": 1.6927546143429362e-05, + "loss": 0.0255, + "num_input_tokens_seen": 24715680, + "step": 117115 + }, + { + "epoch": 12.884488448844884, + "grad_norm": 0.007150754332542419, + "learning_rate": 1.6925274675610065e-05, + "loss": 0.0087, + "num_input_tokens_seen": 24716736, + "step": 117120 + }, + { + "epoch": 12.885038503850385, + "grad_norm": 0.02224000357091427, + "learning_rate": 1.692300328221163e-05, + "loss": 0.0079, + "num_input_tokens_seen": 24717760, + "step": 117125 + }, + { + "epoch": 12.885588558855886, + "grad_norm": 0.2427866905927658, + "learning_rate": 1.6920731963254992e-05, + "loss": 0.0074, + "num_input_tokens_seen": 24718816, + "step": 117130 + }, + { + "epoch": 12.886138613861386, + "grad_norm": 0.047325942665338516, + "learning_rate": 1.69184607187611e-05, + "loss": 0.0038, + "num_input_tokens_seen": 24719904, + "step": 117135 + }, + { + "epoch": 12.886688668866887, + "grad_norm": 0.057707808911800385, + "learning_rate": 1.6916189548750863e-05, + "loss": 0.0052, + "num_input_tokens_seen": 24720992, + "step": 117140 + }, + { + "epoch": 12.887238723872388, + "grad_norm": 0.004047395195811987, + "learning_rate": 1.6913918453245246e-05, + "loss": 0.0432, + "num_input_tokens_seen": 24721984, + "step": 117145 + }, + { + "epoch": 12.887788778877887, + "grad_norm": 0.2547886073589325, + "learning_rate": 1.6911647432265153e-05, + "loss": 0.0597, + "num_input_tokens_seen": 24723040, + "step": 117150 + }, + { + "epoch": 12.888338833883388, + "grad_norm": 0.049304258078336716, + "learning_rate": 1.6909376485831524e-05, + "loss": 0.0049, + "num_input_tokens_seen": 24724096, + "step": 117155 + }, + { + "epoch": 12.88888888888889, + "grad_norm": 0.14723707735538483, + "learning_rate": 1.6907105613965296e-05, + "loss": 0.0146, + "num_input_tokens_seen": 24725184, + "step": 117160 + }, + { + "epoch": 12.88943894389439, + "grad_norm": 0.07646752148866653, + "learning_rate": 1.6904834816687383e-05, + "loss": 0.0013, + "num_input_tokens_seen": 24726240, + "step": 117165 + }, + { + "epoch": 12.88998899889989, + "grad_norm": 3.6496312618255615, + "learning_rate": 1.690256409401873e-05, + "loss": 0.1717, + "num_input_tokens_seen": 24727296, + "step": 117170 + }, + { + "epoch": 12.89053905390539, + "grad_norm": 0.101103775203228, + "learning_rate": 1.690029344598026e-05, + "loss": 0.1153, + "num_input_tokens_seen": 24728352, + "step": 117175 + }, + { + "epoch": 12.891089108910892, + "grad_norm": 0.05903579294681549, + "learning_rate": 1.689802287259289e-05, + "loss": 0.0453, + "num_input_tokens_seen": 24729440, + "step": 117180 + }, + { + "epoch": 12.891639163916391, + "grad_norm": 0.007333370856940746, + "learning_rate": 1.6895752373877562e-05, + "loss": 0.0023, + "num_input_tokens_seen": 24730496, + "step": 117185 + }, + { + "epoch": 12.892189218921892, + "grad_norm": 0.05480213463306427, + "learning_rate": 1.68934819498552e-05, + "loss": 0.006, + "num_input_tokens_seen": 24731584, + "step": 117190 + }, + { + "epoch": 12.892739273927393, + "grad_norm": 0.032065559178590775, + "learning_rate": 1.6891211600546714e-05, + "loss": 0.0031, + "num_input_tokens_seen": 24732704, + "step": 117195 + }, + { + "epoch": 12.893289328932893, + "grad_norm": 0.014140002429485321, + "learning_rate": 1.6888941325973046e-05, + "loss": 0.0338, + "num_input_tokens_seen": 24733888, + "step": 117200 + }, + { + "epoch": 12.893839383938394, + "grad_norm": 0.011676001362502575, + "learning_rate": 1.6886671126155108e-05, + "loss": 0.0125, + "num_input_tokens_seen": 24734944, + "step": 117205 + }, + { + "epoch": 12.894389438943895, + "grad_norm": 0.04524506255984306, + "learning_rate": 1.6884401001113838e-05, + "loss": 0.0073, + "num_input_tokens_seen": 24736000, + "step": 117210 + }, + { + "epoch": 12.894939493949394, + "grad_norm": 0.010347329080104828, + "learning_rate": 1.6882130950870148e-05, + "loss": 0.0068, + "num_input_tokens_seen": 24737056, + "step": 117215 + }, + { + "epoch": 12.895489548954895, + "grad_norm": 0.03509661927819252, + "learning_rate": 1.6879860975444956e-05, + "loss": 0.0135, + "num_input_tokens_seen": 24738112, + "step": 117220 + }, + { + "epoch": 12.896039603960396, + "grad_norm": 0.1524520069360733, + "learning_rate": 1.687759107485919e-05, + "loss": 0.0127, + "num_input_tokens_seen": 24739168, + "step": 117225 + }, + { + "epoch": 12.896589658965897, + "grad_norm": 0.011252395808696747, + "learning_rate": 1.6875321249133767e-05, + "loss": 0.0205, + "num_input_tokens_seen": 24740224, + "step": 117230 + }, + { + "epoch": 12.897139713971397, + "grad_norm": 0.005048193037509918, + "learning_rate": 1.687305149828962e-05, + "loss": 0.0066, + "num_input_tokens_seen": 24741216, + "step": 117235 + }, + { + "epoch": 12.897689768976898, + "grad_norm": 0.0054192086681723595, + "learning_rate": 1.6870781822347652e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24742304, + "step": 117240 + }, + { + "epoch": 12.898239823982399, + "grad_norm": 0.09674245864152908, + "learning_rate": 1.686851222132878e-05, + "loss": 0.0622, + "num_input_tokens_seen": 24743456, + "step": 117245 + }, + { + "epoch": 12.898789878987898, + "grad_norm": 0.014803353697061539, + "learning_rate": 1.6866242695253943e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24744448, + "step": 117250 + }, + { + "epoch": 12.8993399339934, + "grad_norm": 0.4036315679550171, + "learning_rate": 1.6863973244144033e-05, + "loss": 0.0541, + "num_input_tokens_seen": 24745408, + "step": 117255 + }, + { + "epoch": 12.8998899889989, + "grad_norm": 1.2179980278015137, + "learning_rate": 1.686170386801998e-05, + "loss": 0.0168, + "num_input_tokens_seen": 24746464, + "step": 117260 + }, + { + "epoch": 12.9004400440044, + "grad_norm": 0.01043616607785225, + "learning_rate": 1.6859434566902702e-05, + "loss": 0.0222, + "num_input_tokens_seen": 24747552, + "step": 117265 + }, + { + "epoch": 12.900990099009901, + "grad_norm": 0.04999662563204765, + "learning_rate": 1.68571653408131e-05, + "loss": 0.0039, + "num_input_tokens_seen": 24748608, + "step": 117270 + }, + { + "epoch": 12.901540154015402, + "grad_norm": 0.14909681677818298, + "learning_rate": 1.6854896189772114e-05, + "loss": 0.0032, + "num_input_tokens_seen": 24749696, + "step": 117275 + }, + { + "epoch": 12.902090209020901, + "grad_norm": 0.05821448564529419, + "learning_rate": 1.6852627113800624e-05, + "loss": 0.0992, + "num_input_tokens_seen": 24750784, + "step": 117280 + }, + { + "epoch": 12.902640264026402, + "grad_norm": 0.4365694224834442, + "learning_rate": 1.6850358112919567e-05, + "loss": 0.0241, + "num_input_tokens_seen": 24751808, + "step": 117285 + }, + { + "epoch": 12.903190319031903, + "grad_norm": 0.9204643368721008, + "learning_rate": 1.6848089187149858e-05, + "loss": 0.0246, + "num_input_tokens_seen": 24752864, + "step": 117290 + }, + { + "epoch": 12.903740374037405, + "grad_norm": 2.6844940185546875, + "learning_rate": 1.6845820336512385e-05, + "loss": 0.0646, + "num_input_tokens_seen": 24753888, + "step": 117295 + }, + { + "epoch": 12.904290429042904, + "grad_norm": 0.04457797855138779, + "learning_rate": 1.684355156102808e-05, + "loss": 0.0011, + "num_input_tokens_seen": 24754944, + "step": 117300 + }, + { + "epoch": 12.904840484048405, + "grad_norm": 0.011964286677539349, + "learning_rate": 1.684128286071784e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24756000, + "step": 117305 + }, + { + "epoch": 12.905390539053906, + "grad_norm": 0.011387749575078487, + "learning_rate": 1.683901423560259e-05, + "loss": 0.0814, + "num_input_tokens_seen": 24757056, + "step": 117310 + }, + { + "epoch": 12.905940594059405, + "grad_norm": 0.018410107120871544, + "learning_rate": 1.6836745685703224e-05, + "loss": 0.0029, + "num_input_tokens_seen": 24758144, + "step": 117315 + }, + { + "epoch": 12.906490649064907, + "grad_norm": 0.6982165575027466, + "learning_rate": 1.6834477211040654e-05, + "loss": 0.011, + "num_input_tokens_seen": 24759232, + "step": 117320 + }, + { + "epoch": 12.907040704070408, + "grad_norm": 0.253863126039505, + "learning_rate": 1.6832208811635796e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24760288, + "step": 117325 + }, + { + "epoch": 12.907590759075907, + "grad_norm": 0.03276235610246658, + "learning_rate": 1.6829940487509544e-05, + "loss": 0.0028, + "num_input_tokens_seen": 24761280, + "step": 117330 + }, + { + "epoch": 12.908140814081408, + "grad_norm": 0.013996109366416931, + "learning_rate": 1.6827672238682817e-05, + "loss": 0.2039, + "num_input_tokens_seen": 24762336, + "step": 117335 + }, + { + "epoch": 12.908690869086909, + "grad_norm": 0.013407266698777676, + "learning_rate": 1.6825404065176513e-05, + "loss": 0.0237, + "num_input_tokens_seen": 24763360, + "step": 117340 + }, + { + "epoch": 12.909240924092408, + "grad_norm": 0.006786946672946215, + "learning_rate": 1.6823135967011526e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24764384, + "step": 117345 + }, + { + "epoch": 12.90979097909791, + "grad_norm": 0.6261762380599976, + "learning_rate": 1.6820867944208792e-05, + "loss": 0.0061, + "num_input_tokens_seen": 24765440, + "step": 117350 + }, + { + "epoch": 12.91034103410341, + "grad_norm": 0.0227833092212677, + "learning_rate": 1.681859999678917e-05, + "loss": 0.0182, + "num_input_tokens_seen": 24766592, + "step": 117355 + }, + { + "epoch": 12.910891089108912, + "grad_norm": 0.004074383527040482, + "learning_rate": 1.68163321247736e-05, + "loss": 0.0033, + "num_input_tokens_seen": 24767616, + "step": 117360 + }, + { + "epoch": 12.911441144114411, + "grad_norm": 2.159492015838623, + "learning_rate": 1.681406432818297e-05, + "loss": 0.2461, + "num_input_tokens_seen": 24768704, + "step": 117365 + }, + { + "epoch": 12.911991199119912, + "grad_norm": 0.024084150791168213, + "learning_rate": 1.6811796607038176e-05, + "loss": 0.0015, + "num_input_tokens_seen": 24769792, + "step": 117370 + }, + { + "epoch": 12.912541254125413, + "grad_norm": 0.05650252476334572, + "learning_rate": 1.6809528961360127e-05, + "loss": 0.0064, + "num_input_tokens_seen": 24770816, + "step": 117375 + }, + { + "epoch": 12.913091309130913, + "grad_norm": 0.6324335336685181, + "learning_rate": 1.680726139116972e-05, + "loss": 0.0054, + "num_input_tokens_seen": 24771904, + "step": 117380 + }, + { + "epoch": 12.913641364136414, + "grad_norm": 0.006590310018509626, + "learning_rate": 1.680499389648784e-05, + "loss": 0.0341, + "num_input_tokens_seen": 24772992, + "step": 117385 + }, + { + "epoch": 12.914191419141915, + "grad_norm": 1.26344633102417, + "learning_rate": 1.6802726477335418e-05, + "loss": 0.0681, + "num_input_tokens_seen": 24774080, + "step": 117390 + }, + { + "epoch": 12.914741474147414, + "grad_norm": 0.005794468801468611, + "learning_rate": 1.680045913373332e-05, + "loss": 0.0065, + "num_input_tokens_seen": 24775168, + "step": 117395 + }, + { + "epoch": 12.915291529152915, + "grad_norm": 0.011744266375899315, + "learning_rate": 1.679819186570246e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24776160, + "step": 117400 + }, + { + "epoch": 12.915841584158416, + "grad_norm": 0.09478046745061874, + "learning_rate": 1.6795924673263737e-05, + "loss": 0.0039, + "num_input_tokens_seen": 24777216, + "step": 117405 + }, + { + "epoch": 12.916391639163916, + "grad_norm": 0.2654370665550232, + "learning_rate": 1.679365755643803e-05, + "loss": 0.0613, + "num_input_tokens_seen": 24778304, + "step": 117410 + }, + { + "epoch": 12.916941694169417, + "grad_norm": 0.03196285665035248, + "learning_rate": 1.6791390515246248e-05, + "loss": 0.0008, + "num_input_tokens_seen": 24779264, + "step": 117415 + }, + { + "epoch": 12.917491749174918, + "grad_norm": 0.015118950977921486, + "learning_rate": 1.6789123549709273e-05, + "loss": 0.0402, + "num_input_tokens_seen": 24780256, + "step": 117420 + }, + { + "epoch": 12.918041804180419, + "grad_norm": 1.7836885452270508, + "learning_rate": 1.678685665984802e-05, + "loss": 0.0707, + "num_input_tokens_seen": 24781344, + "step": 117425 + }, + { + "epoch": 12.918591859185918, + "grad_norm": 1.8020048141479492, + "learning_rate": 1.678458984568336e-05, + "loss": 0.1152, + "num_input_tokens_seen": 24782464, + "step": 117430 + }, + { + "epoch": 12.91914191419142, + "grad_norm": 0.00435736496001482, + "learning_rate": 1.678232310723619e-05, + "loss": 0.0006, + "num_input_tokens_seen": 24783488, + "step": 117435 + }, + { + "epoch": 12.91969196919692, + "grad_norm": 0.2513815462589264, + "learning_rate": 1.6780056444527416e-05, + "loss": 0.0162, + "num_input_tokens_seen": 24784512, + "step": 117440 + }, + { + "epoch": 12.92024202420242, + "grad_norm": 0.18248237669467926, + "learning_rate": 1.6777789857577898e-05, + "loss": 0.0203, + "num_input_tokens_seen": 24785600, + "step": 117445 + }, + { + "epoch": 12.92079207920792, + "grad_norm": 0.07578159868717194, + "learning_rate": 1.6775523346408566e-05, + "loss": 0.0073, + "num_input_tokens_seen": 24786624, + "step": 117450 + }, + { + "epoch": 12.921342134213422, + "grad_norm": 0.18715594708919525, + "learning_rate": 1.677325691104028e-05, + "loss": 0.0037, + "num_input_tokens_seen": 24787680, + "step": 117455 + }, + { + "epoch": 12.921892189218921, + "grad_norm": 0.05860108137130737, + "learning_rate": 1.677099055149393e-05, + "loss": 0.0084, + "num_input_tokens_seen": 24788736, + "step": 117460 + }, + { + "epoch": 12.922442244224422, + "grad_norm": 0.007724703289568424, + "learning_rate": 1.6768724267790425e-05, + "loss": 0.0436, + "num_input_tokens_seen": 24789696, + "step": 117465 + }, + { + "epoch": 12.922992299229923, + "grad_norm": 0.00294024427421391, + "learning_rate": 1.676645805995063e-05, + "loss": 0.0506, + "num_input_tokens_seen": 24790720, + "step": 117470 + }, + { + "epoch": 12.923542354235423, + "grad_norm": 0.026891563087701797, + "learning_rate": 1.6764191927995443e-05, + "loss": 0.0153, + "num_input_tokens_seen": 24791744, + "step": 117475 + }, + { + "epoch": 12.924092409240924, + "grad_norm": 0.006915939040482044, + "learning_rate": 1.6761925871945756e-05, + "loss": 0.0361, + "num_input_tokens_seen": 24792832, + "step": 117480 + }, + { + "epoch": 12.924642464246425, + "grad_norm": 0.12968137860298157, + "learning_rate": 1.675965989182243e-05, + "loss": 0.0222, + "num_input_tokens_seen": 24793856, + "step": 117485 + }, + { + "epoch": 12.925192519251926, + "grad_norm": 0.5177050232887268, + "learning_rate": 1.6757393987646374e-05, + "loss": 0.087, + "num_input_tokens_seen": 24794944, + "step": 117490 + }, + { + "epoch": 12.925742574257425, + "grad_norm": 1.9332126379013062, + "learning_rate": 1.6755128159438453e-05, + "loss": 0.1392, + "num_input_tokens_seen": 24796032, + "step": 117495 + }, + { + "epoch": 12.926292629262926, + "grad_norm": 0.2827162444591522, + "learning_rate": 1.6752862407219573e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24797056, + "step": 117500 + }, + { + "epoch": 12.926842684268427, + "grad_norm": 0.036616139113903046, + "learning_rate": 1.6750596731010593e-05, + "loss": 0.0049, + "num_input_tokens_seen": 24798112, + "step": 117505 + }, + { + "epoch": 12.927392739273927, + "grad_norm": 0.052444227039813995, + "learning_rate": 1.6748331130832402e-05, + "loss": 0.0012, + "num_input_tokens_seen": 24799136, + "step": 117510 + }, + { + "epoch": 12.927942794279428, + "grad_norm": 0.022202355787158012, + "learning_rate": 1.674606560670589e-05, + "loss": 0.0242, + "num_input_tokens_seen": 24800128, + "step": 117515 + }, + { + "epoch": 12.928492849284929, + "grad_norm": 0.009169146418571472, + "learning_rate": 1.6743800158651924e-05, + "loss": 0.1871, + "num_input_tokens_seen": 24801248, + "step": 117520 + }, + { + "epoch": 12.929042904290428, + "grad_norm": 0.022580444812774658, + "learning_rate": 1.6741534786691403e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24802336, + "step": 117525 + }, + { + "epoch": 12.92959295929593, + "grad_norm": 0.6588152050971985, + "learning_rate": 1.6739269490845187e-05, + "loss": 0.0699, + "num_input_tokens_seen": 24803424, + "step": 117530 + }, + { + "epoch": 12.93014301430143, + "grad_norm": 0.0996236577630043, + "learning_rate": 1.6737004271134156e-05, + "loss": 0.0623, + "num_input_tokens_seen": 24804544, + "step": 117535 + }, + { + "epoch": 12.930693069306932, + "grad_norm": 0.03368092700839043, + "learning_rate": 1.67347391275792e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24805600, + "step": 117540 + }, + { + "epoch": 12.93124312431243, + "grad_norm": 0.014740983955562115, + "learning_rate": 1.6732474060201174e-05, + "loss": 0.0011, + "num_input_tokens_seen": 24806656, + "step": 117545 + }, + { + "epoch": 12.931793179317932, + "grad_norm": 0.00550640607252717, + "learning_rate": 1.673020906902098e-05, + "loss": 0.0071, + "num_input_tokens_seen": 24807680, + "step": 117550 + }, + { + "epoch": 12.932343234323433, + "grad_norm": 0.03060946986079216, + "learning_rate": 1.6727944154059484e-05, + "loss": 0.0525, + "num_input_tokens_seen": 24808768, + "step": 117555 + }, + { + "epoch": 12.932893289328932, + "grad_norm": 2.3151752948760986, + "learning_rate": 1.6725679315337536e-05, + "loss": 0.1739, + "num_input_tokens_seen": 24809824, + "step": 117560 + }, + { + "epoch": 12.933443344334433, + "grad_norm": 0.08574223518371582, + "learning_rate": 1.672341455287605e-05, + "loss": 0.0808, + "num_input_tokens_seen": 24810848, + "step": 117565 + }, + { + "epoch": 12.933993399339935, + "grad_norm": 0.7783398628234863, + "learning_rate": 1.6721149866695878e-05, + "loss": 0.0919, + "num_input_tokens_seen": 24811936, + "step": 117570 + }, + { + "epoch": 12.934543454345434, + "grad_norm": 0.006907428614795208, + "learning_rate": 1.6718885256817888e-05, + "loss": 0.0009, + "num_input_tokens_seen": 24812928, + "step": 117575 + }, + { + "epoch": 12.935093509350935, + "grad_norm": 0.03861735761165619, + "learning_rate": 1.671662072326297e-05, + "loss": 0.0036, + "num_input_tokens_seen": 24814048, + "step": 117580 + }, + { + "epoch": 12.935643564356436, + "grad_norm": 0.013699183240532875, + "learning_rate": 1.671435626605197e-05, + "loss": 0.1577, + "num_input_tokens_seen": 24815104, + "step": 117585 + }, + { + "epoch": 12.936193619361937, + "grad_norm": 1.406812310218811, + "learning_rate": 1.6712091885205782e-05, + "loss": 0.0135, + "num_input_tokens_seen": 24816192, + "step": 117590 + }, + { + "epoch": 12.936743674367436, + "grad_norm": 0.2070905566215515, + "learning_rate": 1.670982758074527e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24817248, + "step": 117595 + }, + { + "epoch": 12.937293729372938, + "grad_norm": 0.08154206722974777, + "learning_rate": 1.670756335269129e-05, + "loss": 0.0028, + "num_input_tokens_seen": 24818304, + "step": 117600 + }, + { + "epoch": 12.937843784378439, + "grad_norm": 0.015180781483650208, + "learning_rate": 1.6705299201064722e-05, + "loss": 0.0049, + "num_input_tokens_seen": 24819392, + "step": 117605 + }, + { + "epoch": 12.938393839383938, + "grad_norm": 0.03174001723527908, + "learning_rate": 1.6703035125886428e-05, + "loss": 0.015, + "num_input_tokens_seen": 24820416, + "step": 117610 + }, + { + "epoch": 12.938943894389439, + "grad_norm": 0.052573926746845245, + "learning_rate": 1.670077112717729e-05, + "loss": 0.1029, + "num_input_tokens_seen": 24821472, + "step": 117615 + }, + { + "epoch": 12.93949394939494, + "grad_norm": 0.14843866229057312, + "learning_rate": 1.6698507204958153e-05, + "loss": 0.0034, + "num_input_tokens_seen": 24822528, + "step": 117620 + }, + { + "epoch": 12.94004400440044, + "grad_norm": 0.6114689707756042, + "learning_rate": 1.6696243359249887e-05, + "loss": 0.124, + "num_input_tokens_seen": 24823584, + "step": 117625 + }, + { + "epoch": 12.94059405940594, + "grad_norm": 0.020299002528190613, + "learning_rate": 1.669397959007337e-05, + "loss": 0.0025, + "num_input_tokens_seen": 24824672, + "step": 117630 + }, + { + "epoch": 12.941144114411442, + "grad_norm": 0.01958102360367775, + "learning_rate": 1.6691715897449452e-05, + "loss": 0.0015, + "num_input_tokens_seen": 24825728, + "step": 117635 + }, + { + "epoch": 12.941694169416941, + "grad_norm": 0.2148684710264206, + "learning_rate": 1.668945228139901e-05, + "loss": 0.0083, + "num_input_tokens_seen": 24826752, + "step": 117640 + }, + { + "epoch": 12.942244224422442, + "grad_norm": 0.027043521404266357, + "learning_rate": 1.668718874194289e-05, + "loss": 0.0021, + "num_input_tokens_seen": 24827808, + "step": 117645 + }, + { + "epoch": 12.942794279427943, + "grad_norm": 0.16984005272388458, + "learning_rate": 1.6684925279101962e-05, + "loss": 0.0105, + "num_input_tokens_seen": 24828928, + "step": 117650 + }, + { + "epoch": 12.943344334433444, + "grad_norm": 0.11638345569372177, + "learning_rate": 1.6682661892897097e-05, + "loss": 0.0845, + "num_input_tokens_seen": 24829920, + "step": 117655 + }, + { + "epoch": 12.943894389438944, + "grad_norm": 0.016215475276112556, + "learning_rate": 1.668039858334914e-05, + "loss": 0.0031, + "num_input_tokens_seen": 24830912, + "step": 117660 + }, + { + "epoch": 12.944444444444445, + "grad_norm": 0.011816955171525478, + "learning_rate": 1.667813535047896e-05, + "loss": 0.0073, + "num_input_tokens_seen": 24832064, + "step": 117665 + }, + { + "epoch": 12.944994499449946, + "grad_norm": 1.7573094367980957, + "learning_rate": 1.6675872194307414e-05, + "loss": 0.145, + "num_input_tokens_seen": 24833120, + "step": 117670 + }, + { + "epoch": 12.945544554455445, + "grad_norm": 0.05818018317222595, + "learning_rate": 1.6673609114855347e-05, + "loss": 0.003, + "num_input_tokens_seen": 24834208, + "step": 117675 + }, + { + "epoch": 12.946094609460946, + "grad_norm": 0.018906449899077415, + "learning_rate": 1.6671346112143642e-05, + "loss": 0.0692, + "num_input_tokens_seen": 24835232, + "step": 117680 + }, + { + "epoch": 12.946644664466447, + "grad_norm": 0.0077171893790364265, + "learning_rate": 1.6669083186193135e-05, + "loss": 0.0071, + "num_input_tokens_seen": 24836288, + "step": 117685 + }, + { + "epoch": 12.947194719471947, + "grad_norm": 0.020114043727517128, + "learning_rate": 1.6666820337024695e-05, + "loss": 0.007, + "num_input_tokens_seen": 24837344, + "step": 117690 + }, + { + "epoch": 12.947744774477448, + "grad_norm": 0.02398977242410183, + "learning_rate": 1.6664557564659185e-05, + "loss": 0.0104, + "num_input_tokens_seen": 24838464, + "step": 117695 + }, + { + "epoch": 12.948294829482949, + "grad_norm": 0.021315397694706917, + "learning_rate": 1.6662294869117436e-05, + "loss": 0.0071, + "num_input_tokens_seen": 24839488, + "step": 117700 + }, + { + "epoch": 12.948844884488448, + "grad_norm": 1.7278180122375488, + "learning_rate": 1.6660032250420314e-05, + "loss": 0.0352, + "num_input_tokens_seen": 24840544, + "step": 117705 + }, + { + "epoch": 12.94939493949395, + "grad_norm": 0.0730428621172905, + "learning_rate": 1.6657769708588668e-05, + "loss": 0.0956, + "num_input_tokens_seen": 24841568, + "step": 117710 + }, + { + "epoch": 12.94994499449945, + "grad_norm": 0.017183013260364532, + "learning_rate": 1.665550724364337e-05, + "loss": 0.0386, + "num_input_tokens_seen": 24842624, + "step": 117715 + }, + { + "epoch": 12.950495049504951, + "grad_norm": 0.018551694229245186, + "learning_rate": 1.6653244855605248e-05, + "loss": 0.0153, + "num_input_tokens_seen": 24843648, + "step": 117720 + }, + { + "epoch": 12.95104510451045, + "grad_norm": 1.2981469631195068, + "learning_rate": 1.6650982544495165e-05, + "loss": 0.1672, + "num_input_tokens_seen": 24844640, + "step": 117725 + }, + { + "epoch": 12.951595159515952, + "grad_norm": 0.03580554574728012, + "learning_rate": 1.6648720310333973e-05, + "loss": 0.0039, + "num_input_tokens_seen": 24845696, + "step": 117730 + }, + { + "epoch": 12.952145214521453, + "grad_norm": 0.017403773963451385, + "learning_rate": 1.6646458153142512e-05, + "loss": 0.0025, + "num_input_tokens_seen": 24846752, + "step": 117735 + }, + { + "epoch": 12.952695269526952, + "grad_norm": 0.3986909091472626, + "learning_rate": 1.6644196072941644e-05, + "loss": 0.0069, + "num_input_tokens_seen": 24847872, + "step": 117740 + }, + { + "epoch": 12.953245324532453, + "grad_norm": 3.213726282119751, + "learning_rate": 1.664193406975221e-05, + "loss": 0.0907, + "num_input_tokens_seen": 24848896, + "step": 117745 + }, + { + "epoch": 12.953795379537954, + "grad_norm": 0.1378946751356125, + "learning_rate": 1.663967214359505e-05, + "loss": 0.0216, + "num_input_tokens_seen": 24850016, + "step": 117750 + }, + { + "epoch": 12.954345434543454, + "grad_norm": 0.08997222781181335, + "learning_rate": 1.6637410294491036e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24851008, + "step": 117755 + }, + { + "epoch": 12.954895489548955, + "grad_norm": 0.010284395888447762, + "learning_rate": 1.6635148522460985e-05, + "loss": 0.0059, + "num_input_tokens_seen": 24852064, + "step": 117760 + }, + { + "epoch": 12.955445544554456, + "grad_norm": 2.0797247886657715, + "learning_rate": 1.6632886827525758e-05, + "loss": 0.1126, + "num_input_tokens_seen": 24853056, + "step": 117765 + }, + { + "epoch": 12.955995599559955, + "grad_norm": 0.048307664692401886, + "learning_rate": 1.6630625209706206e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24854112, + "step": 117770 + }, + { + "epoch": 12.956545654565456, + "grad_norm": 0.007395296823233366, + "learning_rate": 1.6628363669023163e-05, + "loss": 0.0035, + "num_input_tokens_seen": 24855104, + "step": 117775 + }, + { + "epoch": 12.957095709570957, + "grad_norm": 0.032810311764478683, + "learning_rate": 1.662610220549747e-05, + "loss": 0.0179, + "num_input_tokens_seen": 24856128, + "step": 117780 + }, + { + "epoch": 12.957645764576458, + "grad_norm": 0.15884479880332947, + "learning_rate": 1.6623840819149982e-05, + "loss": 0.0113, + "num_input_tokens_seen": 24857216, + "step": 117785 + }, + { + "epoch": 12.958195819581958, + "grad_norm": 0.010008919052779675, + "learning_rate": 1.662157951000153e-05, + "loss": 0.0094, + "num_input_tokens_seen": 24858240, + "step": 117790 + }, + { + "epoch": 12.958745874587459, + "grad_norm": 0.03656716272234917, + "learning_rate": 1.6619318278072956e-05, + "loss": 0.0502, + "num_input_tokens_seen": 24859232, + "step": 117795 + }, + { + "epoch": 12.95929592959296, + "grad_norm": 0.01258651539683342, + "learning_rate": 1.6617057123385104e-05, + "loss": 0.003, + "num_input_tokens_seen": 24860352, + "step": 117800 + }, + { + "epoch": 12.95984598459846, + "grad_norm": 0.044890690594911575, + "learning_rate": 1.6614796045958816e-05, + "loss": 0.0022, + "num_input_tokens_seen": 24861376, + "step": 117805 + }, + { + "epoch": 12.96039603960396, + "grad_norm": 0.006251107901334763, + "learning_rate": 1.6612535045814938e-05, + "loss": 0.0009, + "num_input_tokens_seen": 24862432, + "step": 117810 + }, + { + "epoch": 12.960946094609461, + "grad_norm": 0.010257644578814507, + "learning_rate": 1.6610274122974288e-05, + "loss": 0.0112, + "num_input_tokens_seen": 24863456, + "step": 117815 + }, + { + "epoch": 12.96149614961496, + "grad_norm": 0.004924559500068426, + "learning_rate": 1.6608013277457717e-05, + "loss": 0.072, + "num_input_tokens_seen": 24864576, + "step": 117820 + }, + { + "epoch": 12.962046204620462, + "grad_norm": 0.015234031714498997, + "learning_rate": 1.660575250928606e-05, + "loss": 0.0104, + "num_input_tokens_seen": 24865632, + "step": 117825 + }, + { + "epoch": 12.962596259625963, + "grad_norm": 0.02036857418715954, + "learning_rate": 1.6603491818480164e-05, + "loss": 0.0029, + "num_input_tokens_seen": 24866752, + "step": 117830 + }, + { + "epoch": 12.963146314631462, + "grad_norm": 0.018691793084144592, + "learning_rate": 1.660123120506085e-05, + "loss": 0.1025, + "num_input_tokens_seen": 24867808, + "step": 117835 + }, + { + "epoch": 12.963696369636963, + "grad_norm": 0.01749337464570999, + "learning_rate": 1.6598970669048953e-05, + "loss": 0.0146, + "num_input_tokens_seen": 24868896, + "step": 117840 + }, + { + "epoch": 12.964246424642464, + "grad_norm": 0.08019812405109406, + "learning_rate": 1.659671021046532e-05, + "loss": 0.0022, + "num_input_tokens_seen": 24870048, + "step": 117845 + }, + { + "epoch": 12.964796479647966, + "grad_norm": 0.027903392910957336, + "learning_rate": 1.6594449829330767e-05, + "loss": 0.0024, + "num_input_tokens_seen": 24871072, + "step": 117850 + }, + { + "epoch": 12.965346534653465, + "grad_norm": 0.06749775260686874, + "learning_rate": 1.659218952566614e-05, + "loss": 0.0383, + "num_input_tokens_seen": 24872160, + "step": 117855 + }, + { + "epoch": 12.965896589658966, + "grad_norm": 0.1925431191921234, + "learning_rate": 1.6589929299492278e-05, + "loss": 0.0698, + "num_input_tokens_seen": 24873248, + "step": 117860 + }, + { + "epoch": 12.966446644664467, + "grad_norm": 3.6930229663848877, + "learning_rate": 1.6587669150829984e-05, + "loss": 0.0476, + "num_input_tokens_seen": 24874304, + "step": 117865 + }, + { + "epoch": 12.966996699669966, + "grad_norm": 1.746177077293396, + "learning_rate": 1.6585409079700122e-05, + "loss": 0.0115, + "num_input_tokens_seen": 24875328, + "step": 117870 + }, + { + "epoch": 12.967546754675467, + "grad_norm": 0.03714320808649063, + "learning_rate": 1.6583149086123496e-05, + "loss": 0.004, + "num_input_tokens_seen": 24876384, + "step": 117875 + }, + { + "epoch": 12.968096809680969, + "grad_norm": 0.06779482960700989, + "learning_rate": 1.658088917012095e-05, + "loss": 0.0231, + "num_input_tokens_seen": 24877472, + "step": 117880 + }, + { + "epoch": 12.968646864686468, + "grad_norm": 0.03901408612728119, + "learning_rate": 1.657862933171332e-05, + "loss": 0.0722, + "num_input_tokens_seen": 24878464, + "step": 117885 + }, + { + "epoch": 12.969196919691969, + "grad_norm": 0.00902838259935379, + "learning_rate": 1.6576369570921404e-05, + "loss": 0.0278, + "num_input_tokens_seen": 24879424, + "step": 117890 + }, + { + "epoch": 12.96974697469747, + "grad_norm": 0.004918292630463839, + "learning_rate": 1.6574109887766056e-05, + "loss": 0.0026, + "num_input_tokens_seen": 24880512, + "step": 117895 + }, + { + "epoch": 12.97029702970297, + "grad_norm": 0.007910648360848427, + "learning_rate": 1.657185028226809e-05, + "loss": 0.0105, + "num_input_tokens_seen": 24881568, + "step": 117900 + }, + { + "epoch": 12.97084708470847, + "grad_norm": 0.018599802628159523, + "learning_rate": 1.6569590754448344e-05, + "loss": 0.0137, + "num_input_tokens_seen": 24882560, + "step": 117905 + }, + { + "epoch": 12.971397139713972, + "grad_norm": 0.006058042868971825, + "learning_rate": 1.6567331304327633e-05, + "loss": 0.0044, + "num_input_tokens_seen": 24883584, + "step": 117910 + }, + { + "epoch": 12.971947194719473, + "grad_norm": 2.054483413696289, + "learning_rate": 1.6565071931926776e-05, + "loss": 0.0683, + "num_input_tokens_seen": 24884640, + "step": 117915 + }, + { + "epoch": 12.972497249724972, + "grad_norm": 0.011281229555606842, + "learning_rate": 1.6562812637266613e-05, + "loss": 0.016, + "num_input_tokens_seen": 24885728, + "step": 117920 + }, + { + "epoch": 12.973047304730473, + "grad_norm": 0.052059005945920944, + "learning_rate": 1.656055342036794e-05, + "loss": 0.0106, + "num_input_tokens_seen": 24886688, + "step": 117925 + }, + { + "epoch": 12.973597359735974, + "grad_norm": 0.015397224575281143, + "learning_rate": 1.655829428125162e-05, + "loss": 0.0557, + "num_input_tokens_seen": 24887744, + "step": 117930 + }, + { + "epoch": 12.974147414741473, + "grad_norm": 0.006570310331881046, + "learning_rate": 1.655603521993844e-05, + "loss": 0.0852, + "num_input_tokens_seen": 24888864, + "step": 117935 + }, + { + "epoch": 12.974697469746975, + "grad_norm": 1.2103065252304077, + "learning_rate": 1.6553776236449225e-05, + "loss": 0.0373, + "num_input_tokens_seen": 24889920, + "step": 117940 + }, + { + "epoch": 12.975247524752476, + "grad_norm": 0.03191114962100983, + "learning_rate": 1.655151733080481e-05, + "loss": 0.0069, + "num_input_tokens_seen": 24891040, + "step": 117945 + }, + { + "epoch": 12.975797579757975, + "grad_norm": 0.23168157041072845, + "learning_rate": 1.6549258503026002e-05, + "loss": 0.008, + "num_input_tokens_seen": 24892064, + "step": 117950 + }, + { + "epoch": 12.976347634763476, + "grad_norm": 1.8901582956314087, + "learning_rate": 1.6546999753133622e-05, + "loss": 0.035, + "num_input_tokens_seen": 24893152, + "step": 117955 + }, + { + "epoch": 12.976897689768977, + "grad_norm": 0.1904064118862152, + "learning_rate": 1.65447410811485e-05, + "loss": 0.0038, + "num_input_tokens_seen": 24894240, + "step": 117960 + }, + { + "epoch": 12.977447744774478, + "grad_norm": 0.09523554891347885, + "learning_rate": 1.6542482487091433e-05, + "loss": 0.006, + "num_input_tokens_seen": 24895360, + "step": 117965 + }, + { + "epoch": 12.977997799779978, + "grad_norm": 0.2142425775527954, + "learning_rate": 1.6540223970983247e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24896416, + "step": 117970 + }, + { + "epoch": 12.978547854785479, + "grad_norm": 0.09559143334627151, + "learning_rate": 1.6537965532844762e-05, + "loss": 0.0155, + "num_input_tokens_seen": 24897440, + "step": 117975 + }, + { + "epoch": 12.97909790979098, + "grad_norm": 0.017496265470981598, + "learning_rate": 1.6535707172696778e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24898496, + "step": 117980 + }, + { + "epoch": 12.979647964796479, + "grad_norm": 0.1921353042125702, + "learning_rate": 1.653344889056013e-05, + "loss": 0.0755, + "num_input_tokens_seen": 24899520, + "step": 117985 + }, + { + "epoch": 12.98019801980198, + "grad_norm": 0.0017385866958647966, + "learning_rate": 1.653119068645561e-05, + "loss": 0.0012, + "num_input_tokens_seen": 24900576, + "step": 117990 + }, + { + "epoch": 12.980748074807481, + "grad_norm": 0.11482954770326614, + "learning_rate": 1.6528932560404046e-05, + "loss": 0.0111, + "num_input_tokens_seen": 24901632, + "step": 117995 + }, + { + "epoch": 12.98129812981298, + "grad_norm": 0.030810020864009857, + "learning_rate": 1.6526674512426257e-05, + "loss": 0.0588, + "num_input_tokens_seen": 24902784, + "step": 118000 + }, + { + "epoch": 12.981848184818482, + "grad_norm": 0.11448341608047485, + "learning_rate": 1.6524416542543032e-05, + "loss": 0.1058, + "num_input_tokens_seen": 24903872, + "step": 118005 + }, + { + "epoch": 12.982398239823983, + "grad_norm": 0.05494708567857742, + "learning_rate": 1.6522158650775194e-05, + "loss": 0.0016, + "num_input_tokens_seen": 24905024, + "step": 118010 + }, + { + "epoch": 12.982948294829484, + "grad_norm": 0.020389478653669357, + "learning_rate": 1.6519900837143547e-05, + "loss": 0.0732, + "num_input_tokens_seen": 24906048, + "step": 118015 + }, + { + "epoch": 12.983498349834983, + "grad_norm": 4.001580715179443, + "learning_rate": 1.6517643101668918e-05, + "loss": 0.0728, + "num_input_tokens_seen": 24907040, + "step": 118020 + }, + { + "epoch": 12.984048404840484, + "grad_norm": 0.045668575912714005, + "learning_rate": 1.6515385444372095e-05, + "loss": 0.0051, + "num_input_tokens_seen": 24908128, + "step": 118025 + }, + { + "epoch": 12.984598459845985, + "grad_norm": 0.1280720978975296, + "learning_rate": 1.6513127865273887e-05, + "loss": 0.0027, + "num_input_tokens_seen": 24909216, + "step": 118030 + }, + { + "epoch": 12.985148514851485, + "grad_norm": 0.3589882254600525, + "learning_rate": 1.651087036439512e-05, + "loss": 0.0114, + "num_input_tokens_seen": 24910304, + "step": 118035 + }, + { + "epoch": 12.985698569856986, + "grad_norm": 0.19847138226032257, + "learning_rate": 1.6508612941756573e-05, + "loss": 0.0627, + "num_input_tokens_seen": 24911328, + "step": 118040 + }, + { + "epoch": 12.986248624862487, + "grad_norm": 0.02673657052218914, + "learning_rate": 1.6506355597379075e-05, + "loss": 0.0019, + "num_input_tokens_seen": 24912480, + "step": 118045 + }, + { + "epoch": 12.986798679867986, + "grad_norm": 0.1771986335515976, + "learning_rate": 1.650409833128342e-05, + "loss": 0.0023, + "num_input_tokens_seen": 24913600, + "step": 118050 + }, + { + "epoch": 12.987348734873487, + "grad_norm": 0.14017798006534576, + "learning_rate": 1.650184114349041e-05, + "loss": 0.0022, + "num_input_tokens_seen": 24914656, + "step": 118055 + }, + { + "epoch": 12.987898789878988, + "grad_norm": 0.007656489498913288, + "learning_rate": 1.6499584034020865e-05, + "loss": 0.0026, + "num_input_tokens_seen": 24915776, + "step": 118060 + }, + { + "epoch": 12.988448844884488, + "grad_norm": 0.2618308961391449, + "learning_rate": 1.649732700289556e-05, + "loss": 0.0038, + "num_input_tokens_seen": 24916832, + "step": 118065 + }, + { + "epoch": 12.988998899889989, + "grad_norm": 0.010329204611480236, + "learning_rate": 1.649507005013532e-05, + "loss": 0.0035, + "num_input_tokens_seen": 24917856, + "step": 118070 + }, + { + "epoch": 12.98954895489549, + "grad_norm": 0.01905226893723011, + "learning_rate": 1.649281317576094e-05, + "loss": 0.0011, + "num_input_tokens_seen": 24918848, + "step": 118075 + }, + { + "epoch": 12.990099009900991, + "grad_norm": 2.4006996154785156, + "learning_rate": 1.6490556379793213e-05, + "loss": 0.0206, + "num_input_tokens_seen": 24919968, + "step": 118080 + }, + { + "epoch": 12.99064906490649, + "grad_norm": 0.2487478405237198, + "learning_rate": 1.6488299662252947e-05, + "loss": 0.0122, + "num_input_tokens_seen": 24920992, + "step": 118085 + }, + { + "epoch": 12.991199119911991, + "grad_norm": 0.28302279114723206, + "learning_rate": 1.6486043023160936e-05, + "loss": 0.0091, + "num_input_tokens_seen": 24922144, + "step": 118090 + }, + { + "epoch": 12.991749174917492, + "grad_norm": 0.03595560044050217, + "learning_rate": 1.6483786462537985e-05, + "loss": 0.0682, + "num_input_tokens_seen": 24923232, + "step": 118095 + }, + { + "epoch": 12.992299229922992, + "grad_norm": 0.04196653142571449, + "learning_rate": 1.6481529980404892e-05, + "loss": 0.0123, + "num_input_tokens_seen": 24924256, + "step": 118100 + }, + { + "epoch": 12.992849284928493, + "grad_norm": 1.3496118783950806, + "learning_rate": 1.647927357678244e-05, + "loss": 0.0602, + "num_input_tokens_seen": 24925312, + "step": 118105 + }, + { + "epoch": 12.993399339933994, + "grad_norm": 0.033678531646728516, + "learning_rate": 1.647701725169144e-05, + "loss": 0.0048, + "num_input_tokens_seen": 24926272, + "step": 118110 + }, + { + "epoch": 12.993949394939493, + "grad_norm": 0.019952045753598213, + "learning_rate": 1.6474761005152674e-05, + "loss": 0.0043, + "num_input_tokens_seen": 24927328, + "step": 118115 + }, + { + "epoch": 12.994499449944994, + "grad_norm": 0.10768046230077744, + "learning_rate": 1.6472504837186958e-05, + "loss": 0.0099, + "num_input_tokens_seen": 24928352, + "step": 118120 + }, + { + "epoch": 12.995049504950495, + "grad_norm": 0.501858651638031, + "learning_rate": 1.647024874781507e-05, + "loss": 0.0064, + "num_input_tokens_seen": 24929440, + "step": 118125 + }, + { + "epoch": 12.995599559955995, + "grad_norm": 0.09700275957584381, + "learning_rate": 1.6467992737057798e-05, + "loss": 0.0632, + "num_input_tokens_seen": 24930560, + "step": 118130 + }, + { + "epoch": 12.996149614961496, + "grad_norm": 0.028016112744808197, + "learning_rate": 1.6465736804935954e-05, + "loss": 0.005, + "num_input_tokens_seen": 24931584, + "step": 118135 + }, + { + "epoch": 12.996699669966997, + "grad_norm": 3.0905377864837646, + "learning_rate": 1.646348095147031e-05, + "loss": 0.1665, + "num_input_tokens_seen": 24932544, + "step": 118140 + }, + { + "epoch": 12.997249724972498, + "grad_norm": 0.08683820068836212, + "learning_rate": 1.646122517668167e-05, + "loss": 0.1612, + "num_input_tokens_seen": 24933632, + "step": 118145 + }, + { + "epoch": 12.997799779977997, + "grad_norm": 0.26592981815338135, + "learning_rate": 1.6458969480590826e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24934656, + "step": 118150 + }, + { + "epoch": 12.998349834983498, + "grad_norm": 0.018327103927731514, + "learning_rate": 1.6456713863218548e-05, + "loss": 0.1046, + "num_input_tokens_seen": 24935680, + "step": 118155 + }, + { + "epoch": 12.998899889989, + "grad_norm": 0.026906276121735573, + "learning_rate": 1.6454458324585646e-05, + "loss": 0.0011, + "num_input_tokens_seen": 24936704, + "step": 118160 + }, + { + "epoch": 12.999449944994499, + "grad_norm": 0.8416022062301636, + "learning_rate": 1.6452202864712907e-05, + "loss": 0.0735, + "num_input_tokens_seen": 24937792, + "step": 118165 + }, + { + "epoch": 13.0, + "grad_norm": 0.0032995271030813456, + "learning_rate": 1.6449947483621096e-05, + "loss": 0.002, + "num_input_tokens_seen": 24938688, + "step": 118170 + }, + { + "epoch": 13.0, + "eval_loss": 0.07189643383026123, + "eval_runtime": 36.9723, + "eval_samples_per_second": 109.271, + "eval_steps_per_second": 27.318, + "num_input_tokens_seen": 24938688, + "step": 118170 + }, + { + "epoch": 13.000550055005501, + "grad_norm": 0.0627431645989418, + "learning_rate": 1.6447692181331038e-05, + "loss": 0.0299, + "num_input_tokens_seen": 24939744, + "step": 118175 + }, + { + "epoch": 13.001100110011, + "grad_norm": 0.01356345321983099, + "learning_rate": 1.644543695786348e-05, + "loss": 0.0982, + "num_input_tokens_seen": 24940736, + "step": 118180 + }, + { + "epoch": 13.001650165016502, + "grad_norm": 0.014599241316318512, + "learning_rate": 1.644318181323923e-05, + "loss": 0.0205, + "num_input_tokens_seen": 24941824, + "step": 118185 + }, + { + "epoch": 13.002200220022003, + "grad_norm": 0.10880894958972931, + "learning_rate": 1.6440926747479073e-05, + "loss": 0.002, + "num_input_tokens_seen": 24942848, + "step": 118190 + }, + { + "epoch": 13.002750275027502, + "grad_norm": 0.004131580702960491, + "learning_rate": 1.643867176060378e-05, + "loss": 0.0082, + "num_input_tokens_seen": 24943904, + "step": 118195 + }, + { + "epoch": 13.003300330033003, + "grad_norm": 0.020524887368083, + "learning_rate": 1.6436416852634147e-05, + "loss": 0.0127, + "num_input_tokens_seen": 24944960, + "step": 118200 + }, + { + "epoch": 13.003850385038504, + "grad_norm": 0.016403259709477425, + "learning_rate": 1.643416202359094e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24946016, + "step": 118205 + }, + { + "epoch": 13.004400440044005, + "grad_norm": 1.6125060319900513, + "learning_rate": 1.6431907273494965e-05, + "loss": 0.0302, + "num_input_tokens_seen": 24947072, + "step": 118210 + }, + { + "epoch": 13.004950495049505, + "grad_norm": 0.02371882274746895, + "learning_rate": 1.6429652602366984e-05, + "loss": 0.0329, + "num_input_tokens_seen": 24948096, + "step": 118215 + }, + { + "epoch": 13.005500550055006, + "grad_norm": 0.037804946303367615, + "learning_rate": 1.6427398010227776e-05, + "loss": 0.0126, + "num_input_tokens_seen": 24949184, + "step": 118220 + }, + { + "epoch": 13.006050605060507, + "grad_norm": 0.06031009182333946, + "learning_rate": 1.6425143497098138e-05, + "loss": 0.0109, + "num_input_tokens_seen": 24950176, + "step": 118225 + }, + { + "epoch": 13.006600660066006, + "grad_norm": 0.023150531575083733, + "learning_rate": 1.6422889062998826e-05, + "loss": 0.0074, + "num_input_tokens_seen": 24951296, + "step": 118230 + }, + { + "epoch": 13.007150715071507, + "grad_norm": 0.07634890079498291, + "learning_rate": 1.642063470795064e-05, + "loss": 0.0589, + "num_input_tokens_seen": 24952352, + "step": 118235 + }, + { + "epoch": 13.007700770077008, + "grad_norm": 1.31108558177948, + "learning_rate": 1.6418380431974345e-05, + "loss": 0.0342, + "num_input_tokens_seen": 24953408, + "step": 118240 + }, + { + "epoch": 13.008250825082508, + "grad_norm": 0.005220603197813034, + "learning_rate": 1.6416126235090713e-05, + "loss": 0.0304, + "num_input_tokens_seen": 24954432, + "step": 118245 + }, + { + "epoch": 13.008800880088009, + "grad_norm": 2.944181203842163, + "learning_rate": 1.6413872117320537e-05, + "loss": 0.1179, + "num_input_tokens_seen": 24955552, + "step": 118250 + }, + { + "epoch": 13.00935093509351, + "grad_norm": 0.48014041781425476, + "learning_rate": 1.641161807868457e-05, + "loss": 0.0162, + "num_input_tokens_seen": 24956608, + "step": 118255 + }, + { + "epoch": 13.009900990099009, + "grad_norm": 0.37512046098709106, + "learning_rate": 1.6409364119203608e-05, + "loss": 0.0094, + "num_input_tokens_seen": 24957600, + "step": 118260 + }, + { + "epoch": 13.01045104510451, + "grad_norm": 0.03998442739248276, + "learning_rate": 1.6407110238898414e-05, + "loss": 0.0179, + "num_input_tokens_seen": 24958624, + "step": 118265 + }, + { + "epoch": 13.011001100110011, + "grad_norm": 0.004208785016089678, + "learning_rate": 1.6404856437789757e-05, + "loss": 0.0072, + "num_input_tokens_seen": 24959680, + "step": 118270 + }, + { + "epoch": 13.011551155115512, + "grad_norm": 0.6249334216117859, + "learning_rate": 1.6402602715898413e-05, + "loss": 0.0643, + "num_input_tokens_seen": 24960704, + "step": 118275 + }, + { + "epoch": 13.012101210121012, + "grad_norm": 0.027726352214813232, + "learning_rate": 1.640034907324515e-05, + "loss": 0.0056, + "num_input_tokens_seen": 24961824, + "step": 118280 + }, + { + "epoch": 13.012651265126513, + "grad_norm": 0.007456731051206589, + "learning_rate": 1.639809550985076e-05, + "loss": 0.0007, + "num_input_tokens_seen": 24962944, + "step": 118285 + }, + { + "epoch": 13.013201320132014, + "grad_norm": 0.09315823763608932, + "learning_rate": 1.6395842025735986e-05, + "loss": 0.0636, + "num_input_tokens_seen": 24964000, + "step": 118290 + }, + { + "epoch": 13.013751375137513, + "grad_norm": 0.00986612681299448, + "learning_rate": 1.63935886209216e-05, + "loss": 0.0139, + "num_input_tokens_seen": 24965056, + "step": 118295 + }, + { + "epoch": 13.014301430143014, + "grad_norm": 0.004246850963681936, + "learning_rate": 1.639133529542839e-05, + "loss": 0.0269, + "num_input_tokens_seen": 24966144, + "step": 118300 + }, + { + "epoch": 13.014851485148515, + "grad_norm": 0.007590136025100946, + "learning_rate": 1.63890820492771e-05, + "loss": 0.0079, + "num_input_tokens_seen": 24967168, + "step": 118305 + }, + { + "epoch": 13.015401540154015, + "grad_norm": 0.011690405197441578, + "learning_rate": 1.6386828882488524e-05, + "loss": 0.0065, + "num_input_tokens_seen": 24968224, + "step": 118310 + }, + { + "epoch": 13.015951595159516, + "grad_norm": 0.006315089762210846, + "learning_rate": 1.6384575795083404e-05, + "loss": 0.0647, + "num_input_tokens_seen": 24969248, + "step": 118315 + }, + { + "epoch": 13.016501650165017, + "grad_norm": 0.017767686396837234, + "learning_rate": 1.638232278708251e-05, + "loss": 0.0006, + "num_input_tokens_seen": 24970336, + "step": 118320 + }, + { + "epoch": 13.017051705170518, + "grad_norm": 0.009977818466722965, + "learning_rate": 1.6380069858506626e-05, + "loss": 0.0217, + "num_input_tokens_seen": 24971392, + "step": 118325 + }, + { + "epoch": 13.017601760176017, + "grad_norm": 0.004491935018450022, + "learning_rate": 1.637781700937649e-05, + "loss": 0.0033, + "num_input_tokens_seen": 24972512, + "step": 118330 + }, + { + "epoch": 13.018151815181518, + "grad_norm": 0.76633220911026, + "learning_rate": 1.637556423971288e-05, + "loss": 0.0056, + "num_input_tokens_seen": 24973600, + "step": 118335 + }, + { + "epoch": 13.01870187018702, + "grad_norm": 0.08373960852622986, + "learning_rate": 1.6373311549536563e-05, + "loss": 0.0417, + "num_input_tokens_seen": 24974560, + "step": 118340 + }, + { + "epoch": 13.019251925192519, + "grad_norm": 0.040478192269802094, + "learning_rate": 1.6371058938868278e-05, + "loss": 0.0072, + "num_input_tokens_seen": 24975584, + "step": 118345 + }, + { + "epoch": 13.01980198019802, + "grad_norm": 0.5751674771308899, + "learning_rate": 1.6368806407728814e-05, + "loss": 0.1085, + "num_input_tokens_seen": 24976640, + "step": 118350 + }, + { + "epoch": 13.020352035203521, + "grad_norm": 0.10171868652105331, + "learning_rate": 1.636655395613892e-05, + "loss": 0.0088, + "num_input_tokens_seen": 24977664, + "step": 118355 + }, + { + "epoch": 13.02090209020902, + "grad_norm": 0.02475913241505623, + "learning_rate": 1.636430158411935e-05, + "loss": 0.0105, + "num_input_tokens_seen": 24978720, + "step": 118360 + }, + { + "epoch": 13.021452145214521, + "grad_norm": 0.4674474000930786, + "learning_rate": 1.6362049291690877e-05, + "loss": 0.0658, + "num_input_tokens_seen": 24979776, + "step": 118365 + }, + { + "epoch": 13.022002200220022, + "grad_norm": 0.5118508338928223, + "learning_rate": 1.6359797078874238e-05, + "loss": 0.0057, + "num_input_tokens_seen": 24980832, + "step": 118370 + }, + { + "epoch": 13.022552255225522, + "grad_norm": 0.02225012704730034, + "learning_rate": 1.6357544945690213e-05, + "loss": 0.0233, + "num_input_tokens_seen": 24981856, + "step": 118375 + }, + { + "epoch": 13.023102310231023, + "grad_norm": 0.019066225737333298, + "learning_rate": 1.6355292892159553e-05, + "loss": 0.0432, + "num_input_tokens_seen": 24982912, + "step": 118380 + }, + { + "epoch": 13.023652365236524, + "grad_norm": 0.02565029263496399, + "learning_rate": 1.6353040918303e-05, + "loss": 0.0973, + "num_input_tokens_seen": 24984000, + "step": 118385 + }, + { + "epoch": 13.024202420242025, + "grad_norm": 0.0033116010017693043, + "learning_rate": 1.6350789024141326e-05, + "loss": 0.0352, + "num_input_tokens_seen": 24985088, + "step": 118390 + }, + { + "epoch": 13.024752475247524, + "grad_norm": 0.005996114108711481, + "learning_rate": 1.634853720969527e-05, + "loss": 0.0537, + "num_input_tokens_seen": 24986080, + "step": 118395 + }, + { + "epoch": 13.025302530253025, + "grad_norm": 3.949247121810913, + "learning_rate": 1.634628547498561e-05, + "loss": 0.0403, + "num_input_tokens_seen": 24987168, + "step": 118400 + }, + { + "epoch": 13.025852585258527, + "grad_norm": 0.021745873615145683, + "learning_rate": 1.634403382003308e-05, + "loss": 0.03, + "num_input_tokens_seen": 24988192, + "step": 118405 + }, + { + "epoch": 13.026402640264026, + "grad_norm": 0.11876936256885529, + "learning_rate": 1.6341782244858425e-05, + "loss": 0.0111, + "num_input_tokens_seen": 24989216, + "step": 118410 + }, + { + "epoch": 13.026952695269527, + "grad_norm": 0.15922026336193085, + "learning_rate": 1.6339530749482416e-05, + "loss": 0.0071, + "num_input_tokens_seen": 24990272, + "step": 118415 + }, + { + "epoch": 13.027502750275028, + "grad_norm": 0.005242678802460432, + "learning_rate": 1.633727933392579e-05, + "loss": 0.1192, + "num_input_tokens_seen": 24991328, + "step": 118420 + }, + { + "epoch": 13.028052805280527, + "grad_norm": 0.11243577301502228, + "learning_rate": 1.6335027998209317e-05, + "loss": 0.0035, + "num_input_tokens_seen": 24992480, + "step": 118425 + }, + { + "epoch": 13.028602860286028, + "grad_norm": 0.023661065846681595, + "learning_rate": 1.633277674235373e-05, + "loss": 0.0049, + "num_input_tokens_seen": 24993568, + "step": 118430 + }, + { + "epoch": 13.02915291529153, + "grad_norm": 0.023897025734186172, + "learning_rate": 1.6330525566379772e-05, + "loss": 0.0175, + "num_input_tokens_seen": 24994624, + "step": 118435 + }, + { + "epoch": 13.029702970297029, + "grad_norm": 1.972880244255066, + "learning_rate": 1.6328274470308213e-05, + "loss": 0.0597, + "num_input_tokens_seen": 24995616, + "step": 118440 + }, + { + "epoch": 13.03025302530253, + "grad_norm": 2.175610303878784, + "learning_rate": 1.6326023454159776e-05, + "loss": 0.0994, + "num_input_tokens_seen": 24996736, + "step": 118445 + }, + { + "epoch": 13.030803080308031, + "grad_norm": 0.14652417600154877, + "learning_rate": 1.632377251795522e-05, + "loss": 0.0958, + "num_input_tokens_seen": 24997792, + "step": 118450 + }, + { + "epoch": 13.031353135313532, + "grad_norm": 0.018243921920657158, + "learning_rate": 1.63215216617153e-05, + "loss": 0.0614, + "num_input_tokens_seen": 24998880, + "step": 118455 + }, + { + "epoch": 13.031903190319031, + "grad_norm": 3.798330307006836, + "learning_rate": 1.6319270885460734e-05, + "loss": 0.0719, + "num_input_tokens_seen": 24999968, + "step": 118460 + }, + { + "epoch": 13.032453245324533, + "grad_norm": 0.042312126606702805, + "learning_rate": 1.6317020189212294e-05, + "loss": 0.0096, + "num_input_tokens_seen": 25000992, + "step": 118465 + }, + { + "epoch": 13.033003300330034, + "grad_norm": 0.011192476376891136, + "learning_rate": 1.6314769572990708e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25002080, + "step": 118470 + }, + { + "epoch": 13.033553355335533, + "grad_norm": 0.006523795425891876, + "learning_rate": 1.631251903681672e-05, + "loss": 0.005, + "num_input_tokens_seen": 25003072, + "step": 118475 + }, + { + "epoch": 13.034103410341034, + "grad_norm": 0.009559323079884052, + "learning_rate": 1.631026858071109e-05, + "loss": 0.0422, + "num_input_tokens_seen": 25004160, + "step": 118480 + }, + { + "epoch": 13.034653465346535, + "grad_norm": 0.008834924548864365, + "learning_rate": 1.6308018204694527e-05, + "loss": 0.0071, + "num_input_tokens_seen": 25005248, + "step": 118485 + }, + { + "epoch": 13.035203520352034, + "grad_norm": 3.554163932800293, + "learning_rate": 1.63057679087878e-05, + "loss": 0.1128, + "num_input_tokens_seen": 25006304, + "step": 118490 + }, + { + "epoch": 13.035753575357536, + "grad_norm": 0.04166461154818535, + "learning_rate": 1.630351769301163e-05, + "loss": 0.0027, + "num_input_tokens_seen": 25007360, + "step": 118495 + }, + { + "epoch": 13.036303630363037, + "grad_norm": 0.008369666524231434, + "learning_rate": 1.6301267557386777e-05, + "loss": 0.0024, + "num_input_tokens_seen": 25008416, + "step": 118500 + }, + { + "epoch": 13.036853685368538, + "grad_norm": 0.0322888046503067, + "learning_rate": 1.629901750193396e-05, + "loss": 0.0072, + "num_input_tokens_seen": 25009440, + "step": 118505 + }, + { + "epoch": 13.037403740374037, + "grad_norm": 0.037579018622636795, + "learning_rate": 1.629676752667392e-05, + "loss": 0.0921, + "num_input_tokens_seen": 25010464, + "step": 118510 + }, + { + "epoch": 13.037953795379538, + "grad_norm": 0.04381691291928291, + "learning_rate": 1.6294517631627405e-05, + "loss": 0.0458, + "num_input_tokens_seen": 25011520, + "step": 118515 + }, + { + "epoch": 13.03850385038504, + "grad_norm": 1.2549290657043457, + "learning_rate": 1.6292267816815135e-05, + "loss": 0.0214, + "num_input_tokens_seen": 25012608, + "step": 118520 + }, + { + "epoch": 13.039053905390539, + "grad_norm": 0.11348607391119003, + "learning_rate": 1.6290018082257857e-05, + "loss": 0.0529, + "num_input_tokens_seen": 25013664, + "step": 118525 + }, + { + "epoch": 13.03960396039604, + "grad_norm": 0.9460042715072632, + "learning_rate": 1.6287768427976315e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25014720, + "step": 118530 + }, + { + "epoch": 13.04015401540154, + "grad_norm": 0.02147730067372322, + "learning_rate": 1.628551885399121e-05, + "loss": 0.1915, + "num_input_tokens_seen": 25015808, + "step": 118535 + }, + { + "epoch": 13.04070407040704, + "grad_norm": 0.04567424952983856, + "learning_rate": 1.6283269360323312e-05, + "loss": 0.151, + "num_input_tokens_seen": 25016800, + "step": 118540 + }, + { + "epoch": 13.041254125412541, + "grad_norm": 0.014637690037488937, + "learning_rate": 1.6281019946993332e-05, + "loss": 0.0008, + "num_input_tokens_seen": 25017888, + "step": 118545 + }, + { + "epoch": 13.041804180418042, + "grad_norm": 0.045549873262643814, + "learning_rate": 1.6278770614022002e-05, + "loss": 0.0128, + "num_input_tokens_seen": 25018912, + "step": 118550 + }, + { + "epoch": 13.042354235423542, + "grad_norm": 0.015264945104718208, + "learning_rate": 1.627652136143007e-05, + "loss": 0.0203, + "num_input_tokens_seen": 25020000, + "step": 118555 + }, + { + "epoch": 13.042904290429043, + "grad_norm": 0.1521703153848648, + "learning_rate": 1.6274272189238244e-05, + "loss": 0.0099, + "num_input_tokens_seen": 25021056, + "step": 118560 + }, + { + "epoch": 13.043454345434544, + "grad_norm": 0.03650771826505661, + "learning_rate": 1.627202309746727e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25022112, + "step": 118565 + }, + { + "epoch": 13.044004400440045, + "grad_norm": 1.7867389917373657, + "learning_rate": 1.6269774086137874e-05, + "loss": 0.0309, + "num_input_tokens_seen": 25023168, + "step": 118570 + }, + { + "epoch": 13.044554455445544, + "grad_norm": 0.05084417387843132, + "learning_rate": 1.6267525155270773e-05, + "loss": 0.0022, + "num_input_tokens_seen": 25024224, + "step": 118575 + }, + { + "epoch": 13.045104510451045, + "grad_norm": 0.3780803382396698, + "learning_rate": 1.6265276304886707e-05, + "loss": 0.1849, + "num_input_tokens_seen": 25025280, + "step": 118580 + }, + { + "epoch": 13.045654565456546, + "grad_norm": 0.012091688811779022, + "learning_rate": 1.6263027535006393e-05, + "loss": 0.0992, + "num_input_tokens_seen": 25026368, + "step": 118585 + }, + { + "epoch": 13.046204620462046, + "grad_norm": 1.0131765604019165, + "learning_rate": 1.6260778845650564e-05, + "loss": 0.0508, + "num_input_tokens_seen": 25027456, + "step": 118590 + }, + { + "epoch": 13.046754675467547, + "grad_norm": 0.17717345058918, + "learning_rate": 1.6258530236839952e-05, + "loss": 0.0053, + "num_input_tokens_seen": 25028448, + "step": 118595 + }, + { + "epoch": 13.047304730473048, + "grad_norm": 0.03796539828181267, + "learning_rate": 1.625628170859526e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25029504, + "step": 118600 + }, + { + "epoch": 13.047854785478547, + "grad_norm": 0.16363126039505005, + "learning_rate": 1.625403326093723e-05, + "loss": 0.0044, + "num_input_tokens_seen": 25030592, + "step": 118605 + }, + { + "epoch": 13.048404840484048, + "grad_norm": 0.9932324886322021, + "learning_rate": 1.6251784893886573e-05, + "loss": 0.0108, + "num_input_tokens_seen": 25031680, + "step": 118610 + }, + { + "epoch": 13.04895489548955, + "grad_norm": 1.0281075239181519, + "learning_rate": 1.6249536607464026e-05, + "loss": 0.0093, + "num_input_tokens_seen": 25032704, + "step": 118615 + }, + { + "epoch": 13.049504950495049, + "grad_norm": 0.024781066924333572, + "learning_rate": 1.62472884016903e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25033792, + "step": 118620 + }, + { + "epoch": 13.05005500550055, + "grad_norm": 0.045266687870025635, + "learning_rate": 1.624504027658611e-05, + "loss": 0.0027, + "num_input_tokens_seen": 25034848, + "step": 118625 + }, + { + "epoch": 13.05060506050605, + "grad_norm": 0.03371025621891022, + "learning_rate": 1.6242792232172193e-05, + "loss": 0.0019, + "num_input_tokens_seen": 25035904, + "step": 118630 + }, + { + "epoch": 13.051155115511552, + "grad_norm": 0.062340084463357925, + "learning_rate": 1.6240544268469246e-05, + "loss": 0.0472, + "num_input_tokens_seen": 25036960, + "step": 118635 + }, + { + "epoch": 13.051705170517051, + "grad_norm": 0.05194506421685219, + "learning_rate": 1.6238296385498002e-05, + "loss": 0.0025, + "num_input_tokens_seen": 25038016, + "step": 118640 + }, + { + "epoch": 13.052255225522552, + "grad_norm": 0.7354145646095276, + "learning_rate": 1.6236048583279186e-05, + "loss": 0.0098, + "num_input_tokens_seen": 25039040, + "step": 118645 + }, + { + "epoch": 13.052805280528053, + "grad_norm": 0.0430569164454937, + "learning_rate": 1.6233800861833486e-05, + "loss": 0.0059, + "num_input_tokens_seen": 25040096, + "step": 118650 + }, + { + "epoch": 13.053355335533553, + "grad_norm": 0.04864649847149849, + "learning_rate": 1.6231553221181655e-05, + "loss": 0.0014, + "num_input_tokens_seen": 25041056, + "step": 118655 + }, + { + "epoch": 13.053905390539054, + "grad_norm": 0.0072577884420752525, + "learning_rate": 1.622930566134438e-05, + "loss": 0.0044, + "num_input_tokens_seen": 25042112, + "step": 118660 + }, + { + "epoch": 13.054455445544555, + "grad_norm": 0.023212654516100883, + "learning_rate": 1.6227058182342393e-05, + "loss": 0.0204, + "num_input_tokens_seen": 25043200, + "step": 118665 + }, + { + "epoch": 13.055005500550054, + "grad_norm": 0.19343534111976624, + "learning_rate": 1.6224810784196403e-05, + "loss": 0.1177, + "num_input_tokens_seen": 25044256, + "step": 118670 + }, + { + "epoch": 13.055555555555555, + "grad_norm": 0.09235160797834396, + "learning_rate": 1.622256346692711e-05, + "loss": 0.0129, + "num_input_tokens_seen": 25045312, + "step": 118675 + }, + { + "epoch": 13.056105610561056, + "grad_norm": 2.4476351737976074, + "learning_rate": 1.6220316230555245e-05, + "loss": 0.1899, + "num_input_tokens_seen": 25046368, + "step": 118680 + }, + { + "epoch": 13.056655665566556, + "grad_norm": 0.012220168486237526, + "learning_rate": 1.6218069075101507e-05, + "loss": 0.0798, + "num_input_tokens_seen": 25047456, + "step": 118685 + }, + { + "epoch": 13.057205720572057, + "grad_norm": 0.8644677996635437, + "learning_rate": 1.6215822000586623e-05, + "loss": 0.0079, + "num_input_tokens_seen": 25048544, + "step": 118690 + }, + { + "epoch": 13.057755775577558, + "grad_norm": 0.05322332680225372, + "learning_rate": 1.6213575007031287e-05, + "loss": 0.0103, + "num_input_tokens_seen": 25049600, + "step": 118695 + }, + { + "epoch": 13.058305830583059, + "grad_norm": 0.0063255284912884235, + "learning_rate": 1.6211328094456207e-05, + "loss": 0.0027, + "num_input_tokens_seen": 25050688, + "step": 118700 + }, + { + "epoch": 13.058855885588558, + "grad_norm": 0.04128258675336838, + "learning_rate": 1.6209081262882115e-05, + "loss": 0.034, + "num_input_tokens_seen": 25051744, + "step": 118705 + }, + { + "epoch": 13.05940594059406, + "grad_norm": 0.033488765358924866, + "learning_rate": 1.620683451232968e-05, + "loss": 0.0073, + "num_input_tokens_seen": 25052768, + "step": 118710 + }, + { + "epoch": 13.05995599559956, + "grad_norm": 0.02557407133281231, + "learning_rate": 1.620458784281965e-05, + "loss": 0.014, + "num_input_tokens_seen": 25053792, + "step": 118715 + }, + { + "epoch": 13.06050605060506, + "grad_norm": 0.04942759498953819, + "learning_rate": 1.620234125437271e-05, + "loss": 0.0024, + "num_input_tokens_seen": 25054880, + "step": 118720 + }, + { + "epoch": 13.061056105610561, + "grad_norm": 0.24245120584964752, + "learning_rate": 1.6200094747009565e-05, + "loss": 0.0305, + "num_input_tokens_seen": 25056000, + "step": 118725 + }, + { + "epoch": 13.061606160616062, + "grad_norm": 0.04348784312605858, + "learning_rate": 1.619784832075093e-05, + "loss": 0.0923, + "num_input_tokens_seen": 25057056, + "step": 118730 + }, + { + "epoch": 13.062156215621561, + "grad_norm": 0.017082443460822105, + "learning_rate": 1.61956019756175e-05, + "loss": 0.0087, + "num_input_tokens_seen": 25058112, + "step": 118735 + }, + { + "epoch": 13.062706270627062, + "grad_norm": 0.05036589130759239, + "learning_rate": 1.6193355711629977e-05, + "loss": 0.001, + "num_input_tokens_seen": 25059136, + "step": 118740 + }, + { + "epoch": 13.063256325632564, + "grad_norm": 0.37677890062332153, + "learning_rate": 1.619110952880908e-05, + "loss": 0.0119, + "num_input_tokens_seen": 25060224, + "step": 118745 + }, + { + "epoch": 13.063806380638065, + "grad_norm": 0.2217153012752533, + "learning_rate": 1.618886342717549e-05, + "loss": 0.0312, + "num_input_tokens_seen": 25061184, + "step": 118750 + }, + { + "epoch": 13.064356435643564, + "grad_norm": 0.05503050982952118, + "learning_rate": 1.618661740674992e-05, + "loss": 0.0053, + "num_input_tokens_seen": 25062176, + "step": 118755 + }, + { + "epoch": 13.064906490649065, + "grad_norm": 0.036364760249853134, + "learning_rate": 1.618437146755307e-05, + "loss": 0.0851, + "num_input_tokens_seen": 25063168, + "step": 118760 + }, + { + "epoch": 13.065456545654566, + "grad_norm": 0.03734968975186348, + "learning_rate": 1.6182125609605635e-05, + "loss": 0.0109, + "num_input_tokens_seen": 25064224, + "step": 118765 + }, + { + "epoch": 13.066006600660065, + "grad_norm": 0.007724975235760212, + "learning_rate": 1.6179879832928317e-05, + "loss": 0.0021, + "num_input_tokens_seen": 25065312, + "step": 118770 + }, + { + "epoch": 13.066556655665567, + "grad_norm": 1.8414736986160278, + "learning_rate": 1.6177634137541815e-05, + "loss": 0.1123, + "num_input_tokens_seen": 25066400, + "step": 118775 + }, + { + "epoch": 13.067106710671068, + "grad_norm": 0.28626030683517456, + "learning_rate": 1.6175388523466824e-05, + "loss": 0.0047, + "num_input_tokens_seen": 25067456, + "step": 118780 + }, + { + "epoch": 13.067656765676567, + "grad_norm": 0.01736164651811123, + "learning_rate": 1.617314299072405e-05, + "loss": 0.0033, + "num_input_tokens_seen": 25068544, + "step": 118785 + }, + { + "epoch": 13.068206820682068, + "grad_norm": 0.20836226642131805, + "learning_rate": 1.6170897539334175e-05, + "loss": 0.0779, + "num_input_tokens_seen": 25069568, + "step": 118790 + }, + { + "epoch": 13.06875687568757, + "grad_norm": 0.0072373393923044205, + "learning_rate": 1.6168652169317904e-05, + "loss": 0.0164, + "num_input_tokens_seen": 25070560, + "step": 118795 + }, + { + "epoch": 13.069306930693068, + "grad_norm": 0.07859248667955399, + "learning_rate": 1.6166406880695922e-05, + "loss": 0.0025, + "num_input_tokens_seen": 25071648, + "step": 118800 + }, + { + "epoch": 13.06985698569857, + "grad_norm": 1.6627511978149414, + "learning_rate": 1.616416167348894e-05, + "loss": 0.0529, + "num_input_tokens_seen": 25072704, + "step": 118805 + }, + { + "epoch": 13.07040704070407, + "grad_norm": 0.4130421280860901, + "learning_rate": 1.6161916547717635e-05, + "loss": 0.0084, + "num_input_tokens_seen": 25073696, + "step": 118810 + }, + { + "epoch": 13.070957095709572, + "grad_norm": 2.906381368637085, + "learning_rate": 1.6159671503402697e-05, + "loss": 0.1257, + "num_input_tokens_seen": 25074816, + "step": 118815 + }, + { + "epoch": 13.071507150715071, + "grad_norm": 0.01626303233206272, + "learning_rate": 1.615742654056484e-05, + "loss": 0.0088, + "num_input_tokens_seen": 25075904, + "step": 118820 + }, + { + "epoch": 13.072057205720572, + "grad_norm": 0.0571935661137104, + "learning_rate": 1.615518165922472e-05, + "loss": 0.0219, + "num_input_tokens_seen": 25076992, + "step": 118825 + }, + { + "epoch": 13.072607260726073, + "grad_norm": 0.013493368402123451, + "learning_rate": 1.615293685940306e-05, + "loss": 0.0035, + "num_input_tokens_seen": 25078080, + "step": 118830 + }, + { + "epoch": 13.073157315731573, + "grad_norm": 0.01566704362630844, + "learning_rate": 1.6150692141120533e-05, + "loss": 0.0315, + "num_input_tokens_seen": 25079136, + "step": 118835 + }, + { + "epoch": 13.073707370737074, + "grad_norm": 0.08240853995084763, + "learning_rate": 1.6148447504397825e-05, + "loss": 0.0233, + "num_input_tokens_seen": 25080128, + "step": 118840 + }, + { + "epoch": 13.074257425742575, + "grad_norm": 0.018086925148963928, + "learning_rate": 1.614620294925564e-05, + "loss": 0.0811, + "num_input_tokens_seen": 25081184, + "step": 118845 + }, + { + "epoch": 13.074807480748074, + "grad_norm": 0.005923519376665354, + "learning_rate": 1.6143958475714644e-05, + "loss": 0.0041, + "num_input_tokens_seen": 25082272, + "step": 118850 + }, + { + "epoch": 13.075357535753575, + "grad_norm": 0.01945318840444088, + "learning_rate": 1.614171408379554e-05, + "loss": 0.0018, + "num_input_tokens_seen": 25083296, + "step": 118855 + }, + { + "epoch": 13.075907590759076, + "grad_norm": 0.0978735014796257, + "learning_rate": 1.6139469773519007e-05, + "loss": 0.022, + "num_input_tokens_seen": 25084288, + "step": 118860 + }, + { + "epoch": 13.076457645764576, + "grad_norm": 0.012686926871538162, + "learning_rate": 1.613722554490572e-05, + "loss": 0.0456, + "num_input_tokens_seen": 25085344, + "step": 118865 + }, + { + "epoch": 13.077007700770077, + "grad_norm": 0.25464609265327454, + "learning_rate": 1.613498139797638e-05, + "loss": 0.0471, + "num_input_tokens_seen": 25086400, + "step": 118870 + }, + { + "epoch": 13.077557755775578, + "grad_norm": 0.018449073657393456, + "learning_rate": 1.613273733275166e-05, + "loss": 0.0086, + "num_input_tokens_seen": 25087488, + "step": 118875 + }, + { + "epoch": 13.078107810781079, + "grad_norm": 1.1631830930709839, + "learning_rate": 1.613049334925225e-05, + "loss": 0.012, + "num_input_tokens_seen": 25088512, + "step": 118880 + }, + { + "epoch": 13.078657865786578, + "grad_norm": 0.021412314847111702, + "learning_rate": 1.6128249447498822e-05, + "loss": 0.0012, + "num_input_tokens_seen": 25089536, + "step": 118885 + }, + { + "epoch": 13.07920792079208, + "grad_norm": 0.22522053122520447, + "learning_rate": 1.6126005627512058e-05, + "loss": 0.0506, + "num_input_tokens_seen": 25090528, + "step": 118890 + }, + { + "epoch": 13.07975797579758, + "grad_norm": 0.2646140158176422, + "learning_rate": 1.612376188931265e-05, + "loss": 0.0088, + "num_input_tokens_seen": 25091616, + "step": 118895 + }, + { + "epoch": 13.08030803080308, + "grad_norm": 0.08337726444005966, + "learning_rate": 1.612151823292126e-05, + "loss": 0.0017, + "num_input_tokens_seen": 25092640, + "step": 118900 + }, + { + "epoch": 13.08085808580858, + "grad_norm": 0.011979465372860432, + "learning_rate": 1.6119274658358586e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25093760, + "step": 118905 + }, + { + "epoch": 13.081408140814082, + "grad_norm": 0.06147519871592522, + "learning_rate": 1.6117031165645295e-05, + "loss": 0.0521, + "num_input_tokens_seen": 25094816, + "step": 118910 + }, + { + "epoch": 13.081958195819581, + "grad_norm": 0.01880500465631485, + "learning_rate": 1.611478775480206e-05, + "loss": 0.0446, + "num_input_tokens_seen": 25095776, + "step": 118915 + }, + { + "epoch": 13.082508250825082, + "grad_norm": 0.17284737527370453, + "learning_rate": 1.6112544425849567e-05, + "loss": 0.043, + "num_input_tokens_seen": 25096832, + "step": 118920 + }, + { + "epoch": 13.083058305830583, + "grad_norm": 0.003356093307957053, + "learning_rate": 1.6110301178808486e-05, + "loss": 0.0098, + "num_input_tokens_seen": 25097888, + "step": 118925 + }, + { + "epoch": 13.083608360836084, + "grad_norm": 0.010355811566114426, + "learning_rate": 1.610805801369949e-05, + "loss": 0.0542, + "num_input_tokens_seen": 25098880, + "step": 118930 + }, + { + "epoch": 13.084158415841584, + "grad_norm": 0.11173464357852936, + "learning_rate": 1.6105814930543266e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25099904, + "step": 118935 + }, + { + "epoch": 13.084708470847085, + "grad_norm": 0.011019453406333923, + "learning_rate": 1.6103571929360467e-05, + "loss": 0.0165, + "num_input_tokens_seen": 25100992, + "step": 118940 + }, + { + "epoch": 13.085258525852586, + "grad_norm": 0.002504441887140274, + "learning_rate": 1.6101329010171782e-05, + "loss": 0.1103, + "num_input_tokens_seen": 25102016, + "step": 118945 + }, + { + "epoch": 13.085808580858085, + "grad_norm": 0.41727375984191895, + "learning_rate": 1.6099086172997882e-05, + "loss": 0.038, + "num_input_tokens_seen": 25103072, + "step": 118950 + }, + { + "epoch": 13.086358635863586, + "grad_norm": 0.01179437804967165, + "learning_rate": 1.609684341785942e-05, + "loss": 0.0026, + "num_input_tokens_seen": 25104160, + "step": 118955 + }, + { + "epoch": 13.086908690869087, + "grad_norm": 0.02803724817931652, + "learning_rate": 1.6094600744777094e-05, + "loss": 0.0929, + "num_input_tokens_seen": 25105216, + "step": 118960 + }, + { + "epoch": 13.087458745874587, + "grad_norm": 0.48402678966522217, + "learning_rate": 1.6092358153771547e-05, + "loss": 0.0092, + "num_input_tokens_seen": 25106272, + "step": 118965 + }, + { + "epoch": 13.088008800880088, + "grad_norm": 0.015465657226741314, + "learning_rate": 1.6090115644863467e-05, + "loss": 0.0022, + "num_input_tokens_seen": 25107328, + "step": 118970 + }, + { + "epoch": 13.088558855885589, + "grad_norm": 0.1932622641324997, + "learning_rate": 1.6087873218073523e-05, + "loss": 0.0051, + "num_input_tokens_seen": 25108352, + "step": 118975 + }, + { + "epoch": 13.089108910891088, + "grad_norm": 1.4858378171920776, + "learning_rate": 1.6085630873422363e-05, + "loss": 0.0374, + "num_input_tokens_seen": 25109376, + "step": 118980 + }, + { + "epoch": 13.08965896589659, + "grad_norm": 0.11010708659887314, + "learning_rate": 1.608338861093067e-05, + "loss": 0.0869, + "num_input_tokens_seen": 25110432, + "step": 118985 + }, + { + "epoch": 13.09020902090209, + "grad_norm": 0.1315566450357437, + "learning_rate": 1.6081146430619102e-05, + "loss": 0.0637, + "num_input_tokens_seen": 25111520, + "step": 118990 + }, + { + "epoch": 13.090759075907592, + "grad_norm": 0.017440903931856155, + "learning_rate": 1.6078904332508344e-05, + "loss": 0.0466, + "num_input_tokens_seen": 25112544, + "step": 118995 + }, + { + "epoch": 13.091309130913091, + "grad_norm": 0.06580030918121338, + "learning_rate": 1.6076662316619033e-05, + "loss": 0.0722, + "num_input_tokens_seen": 25113664, + "step": 119000 + }, + { + "epoch": 13.091859185918592, + "grad_norm": 0.012450899928808212, + "learning_rate": 1.607442038297184e-05, + "loss": 0.0014, + "num_input_tokens_seen": 25114688, + "step": 119005 + }, + { + "epoch": 13.092409240924093, + "grad_norm": 1.3967915773391724, + "learning_rate": 1.6072178531587437e-05, + "loss": 0.0607, + "num_input_tokens_seen": 25115680, + "step": 119010 + }, + { + "epoch": 13.092959295929592, + "grad_norm": 0.020596494898200035, + "learning_rate": 1.6069936762486467e-05, + "loss": 0.0604, + "num_input_tokens_seen": 25116736, + "step": 119015 + }, + { + "epoch": 13.093509350935093, + "grad_norm": 0.7097257375717163, + "learning_rate": 1.6067695075689622e-05, + "loss": 0.0093, + "num_input_tokens_seen": 25117760, + "step": 119020 + }, + { + "epoch": 13.094059405940595, + "grad_norm": 0.03354417905211449, + "learning_rate": 1.606545347121754e-05, + "loss": 0.0299, + "num_input_tokens_seen": 25118816, + "step": 119025 + }, + { + "epoch": 13.094609460946094, + "grad_norm": 0.02960655838251114, + "learning_rate": 1.6063211949090882e-05, + "loss": 0.1727, + "num_input_tokens_seen": 25119872, + "step": 119030 + }, + { + "epoch": 13.095159515951595, + "grad_norm": 0.011702664196491241, + "learning_rate": 1.606097050933032e-05, + "loss": 0.0201, + "num_input_tokens_seen": 25120864, + "step": 119035 + }, + { + "epoch": 13.095709570957096, + "grad_norm": 0.3326641321182251, + "learning_rate": 1.6058729151956493e-05, + "loss": 0.0087, + "num_input_tokens_seen": 25121856, + "step": 119040 + }, + { + "epoch": 13.096259625962595, + "grad_norm": 0.03328481316566467, + "learning_rate": 1.6056487876990072e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25122912, + "step": 119045 + }, + { + "epoch": 13.096809680968097, + "grad_norm": 0.01343738753348589, + "learning_rate": 1.605424668445172e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25123904, + "step": 119050 + }, + { + "epoch": 13.097359735973598, + "grad_norm": 0.9817541241645813, + "learning_rate": 1.6052005574362073e-05, + "loss": 0.0965, + "num_input_tokens_seen": 25125056, + "step": 119055 + }, + { + "epoch": 13.097909790979099, + "grad_norm": 0.05100097879767418, + "learning_rate": 1.60497645467418e-05, + "loss": 0.0475, + "num_input_tokens_seen": 25126112, + "step": 119060 + }, + { + "epoch": 13.098459845984598, + "grad_norm": 0.3159683644771576, + "learning_rate": 1.6047523601611546e-05, + "loss": 0.0049, + "num_input_tokens_seen": 25127136, + "step": 119065 + }, + { + "epoch": 13.099009900990099, + "grad_norm": 0.010292529128491879, + "learning_rate": 1.6045282738991985e-05, + "loss": 0.0105, + "num_input_tokens_seen": 25128224, + "step": 119070 + }, + { + "epoch": 13.0995599559956, + "grad_norm": 0.09173671156167984, + "learning_rate": 1.6043041958903747e-05, + "loss": 0.0027, + "num_input_tokens_seen": 25129248, + "step": 119075 + }, + { + "epoch": 13.1001100110011, + "grad_norm": 0.007387902121990919, + "learning_rate": 1.6040801261367493e-05, + "loss": 0.0552, + "num_input_tokens_seen": 25130304, + "step": 119080 + }, + { + "epoch": 13.1006600660066, + "grad_norm": 0.012877301312983036, + "learning_rate": 1.6038560646403876e-05, + "loss": 0.0083, + "num_input_tokens_seen": 25131360, + "step": 119085 + }, + { + "epoch": 13.101210121012102, + "grad_norm": 0.6000218391418457, + "learning_rate": 1.6036320114033544e-05, + "loss": 0.1131, + "num_input_tokens_seen": 25132416, + "step": 119090 + }, + { + "epoch": 13.101760176017601, + "grad_norm": 0.032514579594135284, + "learning_rate": 1.6034079664277158e-05, + "loss": 0.002, + "num_input_tokens_seen": 25133536, + "step": 119095 + }, + { + "epoch": 13.102310231023102, + "grad_norm": 0.019308404996991158, + "learning_rate": 1.6031839297155348e-05, + "loss": 0.0731, + "num_input_tokens_seen": 25134560, + "step": 119100 + }, + { + "epoch": 13.102860286028603, + "grad_norm": 0.20531733334064484, + "learning_rate": 1.6029599012688774e-05, + "loss": 0.0502, + "num_input_tokens_seen": 25135584, + "step": 119105 + }, + { + "epoch": 13.103410341034103, + "grad_norm": 0.10181255638599396, + "learning_rate": 1.6027358810898085e-05, + "loss": 0.0042, + "num_input_tokens_seen": 25136640, + "step": 119110 + }, + { + "epoch": 13.103960396039604, + "grad_norm": 1.8605360984802246, + "learning_rate": 1.602511869180392e-05, + "loss": 0.0901, + "num_input_tokens_seen": 25137696, + "step": 119115 + }, + { + "epoch": 13.104510451045105, + "grad_norm": 0.009821498766541481, + "learning_rate": 1.6022878655426927e-05, + "loss": 0.0871, + "num_input_tokens_seen": 25138816, + "step": 119120 + }, + { + "epoch": 13.105060506050606, + "grad_norm": 0.06316907703876495, + "learning_rate": 1.6020638701787764e-05, + "loss": 0.0031, + "num_input_tokens_seen": 25139808, + "step": 119125 + }, + { + "epoch": 13.105610561056105, + "grad_norm": 0.03566030040383339, + "learning_rate": 1.6018398830907046e-05, + "loss": 0.0021, + "num_input_tokens_seen": 25140864, + "step": 119130 + }, + { + "epoch": 13.106160616061606, + "grad_norm": 0.12630288302898407, + "learning_rate": 1.6016159042805456e-05, + "loss": 0.0021, + "num_input_tokens_seen": 25141888, + "step": 119135 + }, + { + "epoch": 13.106710671067107, + "grad_norm": 0.015185444615781307, + "learning_rate": 1.6013919337503606e-05, + "loss": 0.039, + "num_input_tokens_seen": 25142912, + "step": 119140 + }, + { + "epoch": 13.107260726072607, + "grad_norm": 2.379120111465454, + "learning_rate": 1.6011679715022155e-05, + "loss": 0.1431, + "num_input_tokens_seen": 25143872, + "step": 119145 + }, + { + "epoch": 13.107810781078108, + "grad_norm": 0.09675081074237823, + "learning_rate": 1.6009440175381742e-05, + "loss": 0.0034, + "num_input_tokens_seen": 25144928, + "step": 119150 + }, + { + "epoch": 13.108360836083609, + "grad_norm": 0.4324854612350464, + "learning_rate": 1.6007200718602994e-05, + "loss": 0.0378, + "num_input_tokens_seen": 25145984, + "step": 119155 + }, + { + "epoch": 13.108910891089108, + "grad_norm": 0.025827795267105103, + "learning_rate": 1.600496134470657e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25147040, + "step": 119160 + }, + { + "epoch": 13.10946094609461, + "grad_norm": 0.09062430262565613, + "learning_rate": 1.6002722053713106e-05, + "loss": 0.0023, + "num_input_tokens_seen": 25148096, + "step": 119165 + }, + { + "epoch": 13.11001100110011, + "grad_norm": 0.041754115372896194, + "learning_rate": 1.600048284564322e-05, + "loss": 0.0368, + "num_input_tokens_seen": 25149152, + "step": 119170 + }, + { + "epoch": 13.110561056105611, + "grad_norm": 0.027683991938829422, + "learning_rate": 1.5998243720517575e-05, + "loss": 0.0594, + "num_input_tokens_seen": 25150208, + "step": 119175 + }, + { + "epoch": 13.11111111111111, + "grad_norm": 0.016632571816444397, + "learning_rate": 1.5996004678356793e-05, + "loss": 0.0085, + "num_input_tokens_seen": 25151296, + "step": 119180 + }, + { + "epoch": 13.111661166116612, + "grad_norm": 0.05962717905640602, + "learning_rate": 1.5993765719181523e-05, + "loss": 0.0138, + "num_input_tokens_seen": 25152352, + "step": 119185 + }, + { + "epoch": 13.112211221122113, + "grad_norm": 0.16118597984313965, + "learning_rate": 1.5991526843012396e-05, + "loss": 0.0596, + "num_input_tokens_seen": 25153344, + "step": 119190 + }, + { + "epoch": 13.112761276127612, + "grad_norm": 0.029387492686510086, + "learning_rate": 1.5989288049870027e-05, + "loss": 0.0542, + "num_input_tokens_seen": 25154464, + "step": 119195 + }, + { + "epoch": 13.113311331133113, + "grad_norm": 0.037049975246191025, + "learning_rate": 1.5987049339775078e-05, + "loss": 0.0984, + "num_input_tokens_seen": 25155520, + "step": 119200 + }, + { + "epoch": 13.113861386138614, + "grad_norm": 0.010200093500316143, + "learning_rate": 1.5984810712748163e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25156576, + "step": 119205 + }, + { + "epoch": 13.114411441144114, + "grad_norm": 0.15489462018013, + "learning_rate": 1.5982572168809935e-05, + "loss": 0.055, + "num_input_tokens_seen": 25157664, + "step": 119210 + }, + { + "epoch": 13.114961496149615, + "grad_norm": 0.04164690896868706, + "learning_rate": 1.5980333707981004e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25158688, + "step": 119215 + }, + { + "epoch": 13.115511551155116, + "grad_norm": 0.15940386056900024, + "learning_rate": 1.5978095330282007e-05, + "loss": 0.0056, + "num_input_tokens_seen": 25159744, + "step": 119220 + }, + { + "epoch": 13.116061606160615, + "grad_norm": 0.16046620905399323, + "learning_rate": 1.5975857035733582e-05, + "loss": 0.0182, + "num_input_tokens_seen": 25160896, + "step": 119225 + }, + { + "epoch": 13.116611661166116, + "grad_norm": 1.1303819417953491, + "learning_rate": 1.5973618824356345e-05, + "loss": 0.0382, + "num_input_tokens_seen": 25162016, + "step": 119230 + }, + { + "epoch": 13.117161716171617, + "grad_norm": 0.4540170431137085, + "learning_rate": 1.5971380696170934e-05, + "loss": 0.1041, + "num_input_tokens_seen": 25163104, + "step": 119235 + }, + { + "epoch": 13.117711771177119, + "grad_norm": 0.0403839610517025, + "learning_rate": 1.5969142651197985e-05, + "loss": 0.0128, + "num_input_tokens_seen": 25164160, + "step": 119240 + }, + { + "epoch": 13.118261826182618, + "grad_norm": 2.9187304973602295, + "learning_rate": 1.5966904689458102e-05, + "loss": 0.1285, + "num_input_tokens_seen": 25165184, + "step": 119245 + }, + { + "epoch": 13.118811881188119, + "grad_norm": 0.9022109508514404, + "learning_rate": 1.596466681097193e-05, + "loss": 0.142, + "num_input_tokens_seen": 25166272, + "step": 119250 + }, + { + "epoch": 13.11936193619362, + "grad_norm": 4.461977958679199, + "learning_rate": 1.5962429015760082e-05, + "loss": 0.0325, + "num_input_tokens_seen": 25167360, + "step": 119255 + }, + { + "epoch": 13.11991199119912, + "grad_norm": 0.0068044946528971195, + "learning_rate": 1.5960191303843195e-05, + "loss": 0.0898, + "num_input_tokens_seen": 25168384, + "step": 119260 + }, + { + "epoch": 13.12046204620462, + "grad_norm": 0.03279376029968262, + "learning_rate": 1.5957953675241892e-05, + "loss": 0.0273, + "num_input_tokens_seen": 25169472, + "step": 119265 + }, + { + "epoch": 13.121012101210122, + "grad_norm": 0.14340735971927643, + "learning_rate": 1.595571612997678e-05, + "loss": 0.0116, + "num_input_tokens_seen": 25170464, + "step": 119270 + }, + { + "epoch": 13.12156215621562, + "grad_norm": 0.025656307116150856, + "learning_rate": 1.59534786680685e-05, + "loss": 0.0033, + "num_input_tokens_seen": 25171520, + "step": 119275 + }, + { + "epoch": 13.122112211221122, + "grad_norm": 1.9002389907836914, + "learning_rate": 1.595124128953766e-05, + "loss": 0.0777, + "num_input_tokens_seen": 25172640, + "step": 119280 + }, + { + "epoch": 13.122662266226623, + "grad_norm": 0.029647592455148697, + "learning_rate": 1.5949003994404897e-05, + "loss": 0.0075, + "num_input_tokens_seen": 25173728, + "step": 119285 + }, + { + "epoch": 13.123212321232122, + "grad_norm": 0.02366536855697632, + "learning_rate": 1.5946766782690815e-05, + "loss": 0.0078, + "num_input_tokens_seen": 25174752, + "step": 119290 + }, + { + "epoch": 13.123762376237623, + "grad_norm": 0.8928076028823853, + "learning_rate": 1.5944529654416034e-05, + "loss": 0.01, + "num_input_tokens_seen": 25175776, + "step": 119295 + }, + { + "epoch": 13.124312431243125, + "grad_norm": 1.7001512050628662, + "learning_rate": 1.594229260960119e-05, + "loss": 0.0883, + "num_input_tokens_seen": 25176832, + "step": 119300 + }, + { + "epoch": 13.124862486248626, + "grad_norm": 0.10133384168148041, + "learning_rate": 1.5940055648266878e-05, + "loss": 0.0033, + "num_input_tokens_seen": 25177920, + "step": 119305 + }, + { + "epoch": 13.125412541254125, + "grad_norm": 0.0926947072148323, + "learning_rate": 1.5937818770433733e-05, + "loss": 0.0261, + "num_input_tokens_seen": 25178976, + "step": 119310 + }, + { + "epoch": 13.125962596259626, + "grad_norm": 0.33057674765586853, + "learning_rate": 1.5935581976122366e-05, + "loss": 0.0077, + "num_input_tokens_seen": 25180064, + "step": 119315 + }, + { + "epoch": 13.126512651265127, + "grad_norm": 0.05390430986881256, + "learning_rate": 1.593334526535337e-05, + "loss": 0.0073, + "num_input_tokens_seen": 25181120, + "step": 119320 + }, + { + "epoch": 13.127062706270626, + "grad_norm": 0.05123491585254669, + "learning_rate": 1.593110863814741e-05, + "loss": 0.0047, + "num_input_tokens_seen": 25182112, + "step": 119325 + }, + { + "epoch": 13.127612761276128, + "grad_norm": 0.035157445818185806, + "learning_rate": 1.592887209452505e-05, + "loss": 0.0062, + "num_input_tokens_seen": 25183200, + "step": 119330 + }, + { + "epoch": 13.128162816281629, + "grad_norm": 0.007823296822607517, + "learning_rate": 1.5926635634506926e-05, + "loss": 0.0019, + "num_input_tokens_seen": 25184224, + "step": 119335 + }, + { + "epoch": 13.128712871287128, + "grad_norm": 0.04628341645002365, + "learning_rate": 1.5924399258113653e-05, + "loss": 0.0057, + "num_input_tokens_seen": 25185248, + "step": 119340 + }, + { + "epoch": 13.129262926292629, + "grad_norm": 0.022409027442336082, + "learning_rate": 1.5922162965365833e-05, + "loss": 0.0023, + "num_input_tokens_seen": 25186272, + "step": 119345 + }, + { + "epoch": 13.12981298129813, + "grad_norm": 0.03596121072769165, + "learning_rate": 1.591992675628408e-05, + "loss": 0.0682, + "num_input_tokens_seen": 25187264, + "step": 119350 + }, + { + "epoch": 13.130363036303631, + "grad_norm": 0.038348838686943054, + "learning_rate": 1.5917690630889015e-05, + "loss": 0.0052, + "num_input_tokens_seen": 25188352, + "step": 119355 + }, + { + "epoch": 13.13091309130913, + "grad_norm": 0.05565108731389046, + "learning_rate": 1.5915454589201222e-05, + "loss": 0.0123, + "num_input_tokens_seen": 25189440, + "step": 119360 + }, + { + "epoch": 13.131463146314632, + "grad_norm": 0.009772663936018944, + "learning_rate": 1.5913218631241337e-05, + "loss": 0.0038, + "num_input_tokens_seen": 25190496, + "step": 119365 + }, + { + "epoch": 13.132013201320133, + "grad_norm": 0.01889793947339058, + "learning_rate": 1.5910982757029945e-05, + "loss": 0.0595, + "num_input_tokens_seen": 25191552, + "step": 119370 + }, + { + "epoch": 13.132563256325632, + "grad_norm": 0.06603728234767914, + "learning_rate": 1.590874696658767e-05, + "loss": 0.0585, + "num_input_tokens_seen": 25192640, + "step": 119375 + }, + { + "epoch": 13.133113311331133, + "grad_norm": 0.011675474233925343, + "learning_rate": 1.590651125993512e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25193664, + "step": 119380 + }, + { + "epoch": 13.133663366336634, + "grad_norm": 0.02216639555990696, + "learning_rate": 1.5904275637092878e-05, + "loss": 0.003, + "num_input_tokens_seen": 25194688, + "step": 119385 + }, + { + "epoch": 13.134213421342134, + "grad_norm": 4.834566116333008, + "learning_rate": 1.590204009808157e-05, + "loss": 0.0517, + "num_input_tokens_seen": 25195712, + "step": 119390 + }, + { + "epoch": 13.134763476347635, + "grad_norm": 0.008824612013995647, + "learning_rate": 1.5899804642921786e-05, + "loss": 0.0069, + "num_input_tokens_seen": 25196832, + "step": 119395 + }, + { + "epoch": 13.135313531353136, + "grad_norm": 0.02529318630695343, + "learning_rate": 1.5897569271634146e-05, + "loss": 0.0051, + "num_input_tokens_seen": 25197952, + "step": 119400 + }, + { + "epoch": 13.135863586358635, + "grad_norm": 0.3463803231716156, + "learning_rate": 1.5895333984239237e-05, + "loss": 0.0165, + "num_input_tokens_seen": 25198912, + "step": 119405 + }, + { + "epoch": 13.136413641364136, + "grad_norm": 0.29609546065330505, + "learning_rate": 1.5893098780757664e-05, + "loss": 0.0298, + "num_input_tokens_seen": 25200032, + "step": 119410 + }, + { + "epoch": 13.136963696369637, + "grad_norm": 0.0733293741941452, + "learning_rate": 1.5890863661210035e-05, + "loss": 0.099, + "num_input_tokens_seen": 25201056, + "step": 119415 + }, + { + "epoch": 13.137513751375138, + "grad_norm": 0.02973201312124729, + "learning_rate": 1.588862862561694e-05, + "loss": 0.0052, + "num_input_tokens_seen": 25202112, + "step": 119420 + }, + { + "epoch": 13.138063806380638, + "grad_norm": 0.2703460156917572, + "learning_rate": 1.5886393673998985e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25203200, + "step": 119425 + }, + { + "epoch": 13.138613861386139, + "grad_norm": 0.015020244754850864, + "learning_rate": 1.588415880637677e-05, + "loss": 0.0042, + "num_input_tokens_seen": 25204224, + "step": 119430 + }, + { + "epoch": 13.13916391639164, + "grad_norm": 0.0037185600958764553, + "learning_rate": 1.588192402277088e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25205280, + "step": 119435 + }, + { + "epoch": 13.13971397139714, + "grad_norm": 0.01297481544315815, + "learning_rate": 1.587968932320193e-05, + "loss": 0.0083, + "num_input_tokens_seen": 25206336, + "step": 119440 + }, + { + "epoch": 13.14026402640264, + "grad_norm": 1.2682455778121948, + "learning_rate": 1.5877454707690497e-05, + "loss": 0.0081, + "num_input_tokens_seen": 25207360, + "step": 119445 + }, + { + "epoch": 13.140814081408141, + "grad_norm": 0.13195031881332397, + "learning_rate": 1.5875220176257193e-05, + "loss": 0.0544, + "num_input_tokens_seen": 25208352, + "step": 119450 + }, + { + "epoch": 13.14136413641364, + "grad_norm": 1.5543217658996582, + "learning_rate": 1.5872985728922613e-05, + "loss": 0.0812, + "num_input_tokens_seen": 25209312, + "step": 119455 + }, + { + "epoch": 13.141914191419142, + "grad_norm": 0.11639714986085892, + "learning_rate": 1.5870751365707332e-05, + "loss": 0.0034, + "num_input_tokens_seen": 25210336, + "step": 119460 + }, + { + "epoch": 13.142464246424643, + "grad_norm": 0.06259596347808838, + "learning_rate": 1.586851708663196e-05, + "loss": 0.0068, + "num_input_tokens_seen": 25211488, + "step": 119465 + }, + { + "epoch": 13.143014301430142, + "grad_norm": 0.05579780787229538, + "learning_rate": 1.586628289171708e-05, + "loss": 0.0033, + "num_input_tokens_seen": 25212576, + "step": 119470 + }, + { + "epoch": 13.143564356435643, + "grad_norm": 0.044165126979351044, + "learning_rate": 1.5864048780983297e-05, + "loss": 0.0138, + "num_input_tokens_seen": 25213664, + "step": 119475 + }, + { + "epoch": 13.144114411441144, + "grad_norm": 0.017571277916431427, + "learning_rate": 1.586181475445119e-05, + "loss": 0.0019, + "num_input_tokens_seen": 25214720, + "step": 119480 + }, + { + "epoch": 13.144664466446645, + "grad_norm": 0.016417765989899635, + "learning_rate": 1.5859580812141344e-05, + "loss": 0.002, + "num_input_tokens_seen": 25215744, + "step": 119485 + }, + { + "epoch": 13.145214521452145, + "grad_norm": 0.014610753394663334, + "learning_rate": 1.5857346954074365e-05, + "loss": 0.059, + "num_input_tokens_seen": 25216800, + "step": 119490 + }, + { + "epoch": 13.145764576457646, + "grad_norm": 0.054804082959890366, + "learning_rate": 1.585511318027082e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25217824, + "step": 119495 + }, + { + "epoch": 13.146314631463147, + "grad_norm": 0.006591046694666147, + "learning_rate": 1.585287949075132e-05, + "loss": 0.0469, + "num_input_tokens_seen": 25218912, + "step": 119500 + }, + { + "epoch": 13.146864686468646, + "grad_norm": 0.725534975528717, + "learning_rate": 1.5850645885536436e-05, + "loss": 0.0722, + "num_input_tokens_seen": 25220000, + "step": 119505 + }, + { + "epoch": 13.147414741474147, + "grad_norm": 0.0013300289865583181, + "learning_rate": 1.5848412364646757e-05, + "loss": 0.0149, + "num_input_tokens_seen": 25220992, + "step": 119510 + }, + { + "epoch": 13.147964796479648, + "grad_norm": 0.10161056369543076, + "learning_rate": 1.5846178928102878e-05, + "loss": 0.0202, + "num_input_tokens_seen": 25222080, + "step": 119515 + }, + { + "epoch": 13.148514851485148, + "grad_norm": 0.008302475325763226, + "learning_rate": 1.584394557592537e-05, + "loss": 0.0215, + "num_input_tokens_seen": 25223168, + "step": 119520 + }, + { + "epoch": 13.149064906490649, + "grad_norm": 0.025262149050831795, + "learning_rate": 1.584171230813482e-05, + "loss": 0.0819, + "num_input_tokens_seen": 25224288, + "step": 119525 + }, + { + "epoch": 13.14961496149615, + "grad_norm": 0.007945218123495579, + "learning_rate": 1.5839479124751818e-05, + "loss": 0.0012, + "num_input_tokens_seen": 25225312, + "step": 119530 + }, + { + "epoch": 13.150165016501651, + "grad_norm": 0.27703213691711426, + "learning_rate": 1.5837246025796934e-05, + "loss": 0.0034, + "num_input_tokens_seen": 25226304, + "step": 119535 + }, + { + "epoch": 13.15071507150715, + "grad_norm": 0.08459891378879547, + "learning_rate": 1.5835013011290762e-05, + "loss": 0.0361, + "num_input_tokens_seen": 25227328, + "step": 119540 + }, + { + "epoch": 13.151265126512651, + "grad_norm": 0.02335742861032486, + "learning_rate": 1.5832780081253882e-05, + "loss": 0.0159, + "num_input_tokens_seen": 25228352, + "step": 119545 + }, + { + "epoch": 13.151815181518153, + "grad_norm": 0.011405989527702332, + "learning_rate": 1.583054723570686e-05, + "loss": 0.004, + "num_input_tokens_seen": 25229472, + "step": 119550 + }, + { + "epoch": 13.152365236523652, + "grad_norm": 1.1693766117095947, + "learning_rate": 1.5828314474670285e-05, + "loss": 0.0358, + "num_input_tokens_seen": 25230528, + "step": 119555 + }, + { + "epoch": 13.152915291529153, + "grad_norm": 0.02404775656759739, + "learning_rate": 1.5826081798164732e-05, + "loss": 0.0061, + "num_input_tokens_seen": 25231680, + "step": 119560 + }, + { + "epoch": 13.153465346534654, + "grad_norm": 0.032704178243875504, + "learning_rate": 1.5823849206210788e-05, + "loss": 0.0054, + "num_input_tokens_seen": 25232736, + "step": 119565 + }, + { + "epoch": 13.154015401540153, + "grad_norm": 0.03979768231511116, + "learning_rate": 1.5821616698829027e-05, + "loss": 0.0389, + "num_input_tokens_seen": 25233856, + "step": 119570 + }, + { + "epoch": 13.154565456545654, + "grad_norm": 0.46144038438796997, + "learning_rate": 1.581938427604001e-05, + "loss": 0.0173, + "num_input_tokens_seen": 25234912, + "step": 119575 + }, + { + "epoch": 13.155115511551156, + "grad_norm": 0.2753364145755768, + "learning_rate": 1.5817151937864326e-05, + "loss": 0.0241, + "num_input_tokens_seen": 25235968, + "step": 119580 + }, + { + "epoch": 13.155665566556655, + "grad_norm": 0.0335087887942791, + "learning_rate": 1.5814919684322545e-05, + "loss": 0.0027, + "num_input_tokens_seen": 25236992, + "step": 119585 + }, + { + "epoch": 13.156215621562156, + "grad_norm": 0.013777632266283035, + "learning_rate": 1.581268751543525e-05, + "loss": 0.0046, + "num_input_tokens_seen": 25238144, + "step": 119590 + }, + { + "epoch": 13.156765676567657, + "grad_norm": 0.3315429091453552, + "learning_rate": 1.5810455431222997e-05, + "loss": 0.005, + "num_input_tokens_seen": 25239200, + "step": 119595 + }, + { + "epoch": 13.157315731573158, + "grad_norm": 0.05668791010975838, + "learning_rate": 1.580822343170637e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25240256, + "step": 119600 + }, + { + "epoch": 13.157865786578657, + "grad_norm": 1.196004033088684, + "learning_rate": 1.5805991516905938e-05, + "loss": 0.0752, + "num_input_tokens_seen": 25241280, + "step": 119605 + }, + { + "epoch": 13.158415841584159, + "grad_norm": 0.028201237320899963, + "learning_rate": 1.5803759686842267e-05, + "loss": 0.0178, + "num_input_tokens_seen": 25242368, + "step": 119610 + }, + { + "epoch": 13.15896589658966, + "grad_norm": 0.028235943987965584, + "learning_rate": 1.580152794153593e-05, + "loss": 0.009, + "num_input_tokens_seen": 25243424, + "step": 119615 + }, + { + "epoch": 13.159515951595159, + "grad_norm": 0.16724494099617004, + "learning_rate": 1.5799296281007502e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25244480, + "step": 119620 + }, + { + "epoch": 13.16006600660066, + "grad_norm": 2.046044111251831, + "learning_rate": 1.5797064705277538e-05, + "loss": 0.0254, + "num_input_tokens_seen": 25245504, + "step": 119625 + }, + { + "epoch": 13.160616061606161, + "grad_norm": 1.1466537714004517, + "learning_rate": 1.579483321436662e-05, + "loss": 0.0187, + "num_input_tokens_seen": 25246528, + "step": 119630 + }, + { + "epoch": 13.16116611661166, + "grad_norm": 0.5970932245254517, + "learning_rate": 1.57926018082953e-05, + "loss": 0.0057, + "num_input_tokens_seen": 25247648, + "step": 119635 + }, + { + "epoch": 13.161716171617162, + "grad_norm": 0.1080993041396141, + "learning_rate": 1.5790370487084157e-05, + "loss": 0.0584, + "num_input_tokens_seen": 25248672, + "step": 119640 + }, + { + "epoch": 13.162266226622663, + "grad_norm": 0.004548074211925268, + "learning_rate": 1.578813925075375e-05, + "loss": 0.0044, + "num_input_tokens_seen": 25249728, + "step": 119645 + }, + { + "epoch": 13.162816281628162, + "grad_norm": 0.09217608720064163, + "learning_rate": 1.5785908099324638e-05, + "loss": 0.045, + "num_input_tokens_seen": 25250720, + "step": 119650 + }, + { + "epoch": 13.163366336633663, + "grad_norm": 0.19405393302440643, + "learning_rate": 1.578367703281739e-05, + "loss": 0.0038, + "num_input_tokens_seen": 25251744, + "step": 119655 + }, + { + "epoch": 13.163916391639164, + "grad_norm": 0.022867826744914055, + "learning_rate": 1.5781446051252568e-05, + "loss": 0.0054, + "num_input_tokens_seen": 25252800, + "step": 119660 + }, + { + "epoch": 13.164466446644665, + "grad_norm": 0.017574172466993332, + "learning_rate": 1.577921515465074e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25253856, + "step": 119665 + }, + { + "epoch": 13.165016501650165, + "grad_norm": 0.01983277127146721, + "learning_rate": 1.5776984343032465e-05, + "loss": 0.0147, + "num_input_tokens_seen": 25254880, + "step": 119670 + }, + { + "epoch": 13.165566556655666, + "grad_norm": 0.0016576533671468496, + "learning_rate": 1.5774753616418285e-05, + "loss": 0.0677, + "num_input_tokens_seen": 25255904, + "step": 119675 + }, + { + "epoch": 13.166116611661167, + "grad_norm": 0.005496303550899029, + "learning_rate": 1.5772522974828784e-05, + "loss": 0.0081, + "num_input_tokens_seen": 25256928, + "step": 119680 + }, + { + "epoch": 13.166666666666666, + "grad_norm": 0.005870073568075895, + "learning_rate": 1.5770292418284505e-05, + "loss": 0.1271, + "num_input_tokens_seen": 25258016, + "step": 119685 + }, + { + "epoch": 13.167216721672167, + "grad_norm": 0.4892513155937195, + "learning_rate": 1.576806194680602e-05, + "loss": 0.0099, + "num_input_tokens_seen": 25259136, + "step": 119690 + }, + { + "epoch": 13.167766776677668, + "grad_norm": 0.060557860881090164, + "learning_rate": 1.5765831560413873e-05, + "loss": 0.0044, + "num_input_tokens_seen": 25260160, + "step": 119695 + }, + { + "epoch": 13.168316831683168, + "grad_norm": 0.008709731511771679, + "learning_rate": 1.5763601259128624e-05, + "loss": 0.1117, + "num_input_tokens_seen": 25261248, + "step": 119700 + }, + { + "epoch": 13.168866886688669, + "grad_norm": 0.03386683017015457, + "learning_rate": 1.576137104297084e-05, + "loss": 0.1426, + "num_input_tokens_seen": 25262240, + "step": 119705 + }, + { + "epoch": 13.16941694169417, + "grad_norm": 0.016509097069501877, + "learning_rate": 1.575914091196106e-05, + "loss": 0.0006, + "num_input_tokens_seen": 25263328, + "step": 119710 + }, + { + "epoch": 13.16996699669967, + "grad_norm": 0.014838832430541515, + "learning_rate": 1.575691086611984e-05, + "loss": 0.0337, + "num_input_tokens_seen": 25264352, + "step": 119715 + }, + { + "epoch": 13.17051705170517, + "grad_norm": 0.015387468971312046, + "learning_rate": 1.5754680905467743e-05, + "loss": 0.0629, + "num_input_tokens_seen": 25265312, + "step": 119720 + }, + { + "epoch": 13.171067106710671, + "grad_norm": 0.2771329879760742, + "learning_rate": 1.575245103002531e-05, + "loss": 0.0261, + "num_input_tokens_seen": 25266336, + "step": 119725 + }, + { + "epoch": 13.171617161716172, + "grad_norm": 0.02965403161942959, + "learning_rate": 1.57502212398131e-05, + "loss": 0.1622, + "num_input_tokens_seen": 25267360, + "step": 119730 + }, + { + "epoch": 13.172167216721672, + "grad_norm": 0.00415837112814188, + "learning_rate": 1.5747991534851668e-05, + "loss": 0.0044, + "num_input_tokens_seen": 25268416, + "step": 119735 + }, + { + "epoch": 13.172717271727173, + "grad_norm": 0.29527533054351807, + "learning_rate": 1.5745761915161546e-05, + "loss": 0.0022, + "num_input_tokens_seen": 25269440, + "step": 119740 + }, + { + "epoch": 13.173267326732674, + "grad_norm": 0.07903750985860825, + "learning_rate": 1.574353238076331e-05, + "loss": 0.0992, + "num_input_tokens_seen": 25270496, + "step": 119745 + }, + { + "epoch": 13.173817381738173, + "grad_norm": 2.2678334712982178, + "learning_rate": 1.5741302931677485e-05, + "loss": 0.1163, + "num_input_tokens_seen": 25271520, + "step": 119750 + }, + { + "epoch": 13.174367436743674, + "grad_norm": 0.01921854540705681, + "learning_rate": 1.5739073567924633e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25272544, + "step": 119755 + }, + { + "epoch": 13.174917491749175, + "grad_norm": 0.41293251514434814, + "learning_rate": 1.57368442895253e-05, + "loss": 0.0038, + "num_input_tokens_seen": 25273664, + "step": 119760 + }, + { + "epoch": 13.175467546754675, + "grad_norm": 0.03751532360911369, + "learning_rate": 1.5734615096500015e-05, + "loss": 0.0466, + "num_input_tokens_seen": 25274784, + "step": 119765 + }, + { + "epoch": 13.176017601760176, + "grad_norm": 0.054483186453580856, + "learning_rate": 1.5732385988869347e-05, + "loss": 0.0099, + "num_input_tokens_seen": 25275840, + "step": 119770 + }, + { + "epoch": 13.176567656765677, + "grad_norm": 0.01254955306649208, + "learning_rate": 1.5730156966653824e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25276928, + "step": 119775 + }, + { + "epoch": 13.177117711771178, + "grad_norm": 1.4911773204803467, + "learning_rate": 1.5727928029874007e-05, + "loss": 0.0766, + "num_input_tokens_seen": 25277920, + "step": 119780 + }, + { + "epoch": 13.177667766776677, + "grad_norm": 2.0157079696655273, + "learning_rate": 1.572569917855042e-05, + "loss": 0.2713, + "num_input_tokens_seen": 25278976, + "step": 119785 + }, + { + "epoch": 13.178217821782178, + "grad_norm": 0.015105734579265118, + "learning_rate": 1.572347041270361e-05, + "loss": 0.0047, + "num_input_tokens_seen": 25280032, + "step": 119790 + }, + { + "epoch": 13.17876787678768, + "grad_norm": 0.10585006326436996, + "learning_rate": 1.5721241732354132e-05, + "loss": 0.0078, + "num_input_tokens_seen": 25281056, + "step": 119795 + }, + { + "epoch": 13.179317931793179, + "grad_norm": 0.01960689201951027, + "learning_rate": 1.5719013137522503e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25282048, + "step": 119800 + }, + { + "epoch": 13.17986798679868, + "grad_norm": 0.07542402297258377, + "learning_rate": 1.571678462822929e-05, + "loss": 0.0839, + "num_input_tokens_seen": 25283072, + "step": 119805 + }, + { + "epoch": 13.180418041804181, + "grad_norm": 0.013183050788939, + "learning_rate": 1.5714556204495014e-05, + "loss": 0.0233, + "num_input_tokens_seen": 25284096, + "step": 119810 + }, + { + "epoch": 13.18096809680968, + "grad_norm": 0.02257232367992401, + "learning_rate": 1.5712327866340212e-05, + "loss": 0.0068, + "num_input_tokens_seen": 25285120, + "step": 119815 + }, + { + "epoch": 13.181518151815181, + "grad_norm": 0.06778368353843689, + "learning_rate": 1.571009961378544e-05, + "loss": 0.0081, + "num_input_tokens_seen": 25286208, + "step": 119820 + }, + { + "epoch": 13.182068206820682, + "grad_norm": 0.0336771197617054, + "learning_rate": 1.5707871446851208e-05, + "loss": 0.0261, + "num_input_tokens_seen": 25287200, + "step": 119825 + }, + { + "epoch": 13.182618261826182, + "grad_norm": 0.013911861926317215, + "learning_rate": 1.5705643365558077e-05, + "loss": 0.0012, + "num_input_tokens_seen": 25288256, + "step": 119830 + }, + { + "epoch": 13.183168316831683, + "grad_norm": 0.012980887666344643, + "learning_rate": 1.5703415369926573e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25289280, + "step": 119835 + }, + { + "epoch": 13.183718371837184, + "grad_norm": 0.007347458507865667, + "learning_rate": 1.5701187459977216e-05, + "loss": 0.0035, + "num_input_tokens_seen": 25290336, + "step": 119840 + }, + { + "epoch": 13.184268426842685, + "grad_norm": 1.808254361152649, + "learning_rate": 1.5698959635730558e-05, + "loss": 0.0869, + "num_input_tokens_seen": 25291296, + "step": 119845 + }, + { + "epoch": 13.184818481848184, + "grad_norm": 0.1797972023487091, + "learning_rate": 1.5696731897207123e-05, + "loss": 0.0677, + "num_input_tokens_seen": 25292288, + "step": 119850 + }, + { + "epoch": 13.185368536853685, + "grad_norm": 0.03637246787548065, + "learning_rate": 1.5694504244427455e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25293376, + "step": 119855 + }, + { + "epoch": 13.185918591859187, + "grad_norm": 0.37315449118614197, + "learning_rate": 1.5692276677412072e-05, + "loss": 0.0397, + "num_input_tokens_seen": 25294368, + "step": 119860 + }, + { + "epoch": 13.186468646864686, + "grad_norm": 3.5907533168792725, + "learning_rate": 1.56900491961815e-05, + "loss": 0.1648, + "num_input_tokens_seen": 25295360, + "step": 119865 + }, + { + "epoch": 13.187018701870187, + "grad_norm": 0.014210055582225323, + "learning_rate": 1.5687821800756285e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25296416, + "step": 119870 + }, + { + "epoch": 13.187568756875688, + "grad_norm": 0.06971901655197144, + "learning_rate": 1.5685594491156945e-05, + "loss": 0.0815, + "num_input_tokens_seen": 25297504, + "step": 119875 + }, + { + "epoch": 13.188118811881187, + "grad_norm": 0.09567039459943771, + "learning_rate": 1.568336726740402e-05, + "loss": 0.0084, + "num_input_tokens_seen": 25298528, + "step": 119880 + }, + { + "epoch": 13.188668866886688, + "grad_norm": 1.3024628162384033, + "learning_rate": 1.5681140129518023e-05, + "loss": 0.0594, + "num_input_tokens_seen": 25299552, + "step": 119885 + }, + { + "epoch": 13.18921892189219, + "grad_norm": 1.3387895822525024, + "learning_rate": 1.5678913077519482e-05, + "loss": 0.1366, + "num_input_tokens_seen": 25300640, + "step": 119890 + }, + { + "epoch": 13.189768976897689, + "grad_norm": 0.013221727684140205, + "learning_rate": 1.5676686111428933e-05, + "loss": 0.0482, + "num_input_tokens_seen": 25301664, + "step": 119895 + }, + { + "epoch": 13.19031903190319, + "grad_norm": 0.052503183484077454, + "learning_rate": 1.567445923126689e-05, + "loss": 0.0787, + "num_input_tokens_seen": 25302784, + "step": 119900 + }, + { + "epoch": 13.190869086908691, + "grad_norm": 0.031928349286317825, + "learning_rate": 1.567223243705388e-05, + "loss": 0.0024, + "num_input_tokens_seen": 25303872, + "step": 119905 + }, + { + "epoch": 13.191419141914192, + "grad_norm": 0.4734790325164795, + "learning_rate": 1.5670005728810432e-05, + "loss": 0.0128, + "num_input_tokens_seen": 25304928, + "step": 119910 + }, + { + "epoch": 13.191969196919691, + "grad_norm": 0.007344445679336786, + "learning_rate": 1.5667779106557055e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25305952, + "step": 119915 + }, + { + "epoch": 13.192519251925193, + "grad_norm": 0.07062804698944092, + "learning_rate": 1.566555257031429e-05, + "loss": 0.008, + "num_input_tokens_seen": 25307104, + "step": 119920 + }, + { + "epoch": 13.193069306930694, + "grad_norm": 0.1431453675031662, + "learning_rate": 1.566332612010265e-05, + "loss": 0.0363, + "num_input_tokens_seen": 25308192, + "step": 119925 + }, + { + "epoch": 13.193619361936193, + "grad_norm": 0.21769452095031738, + "learning_rate": 1.566109975594264e-05, + "loss": 0.0045, + "num_input_tokens_seen": 25309312, + "step": 119930 + }, + { + "epoch": 13.194169416941694, + "grad_norm": 0.021470144391059875, + "learning_rate": 1.565887347785481e-05, + "loss": 0.0084, + "num_input_tokens_seen": 25310400, + "step": 119935 + }, + { + "epoch": 13.194719471947195, + "grad_norm": 0.042381882667541504, + "learning_rate": 1.5656647285859644e-05, + "loss": 0.0018, + "num_input_tokens_seen": 25311392, + "step": 119940 + }, + { + "epoch": 13.195269526952695, + "grad_norm": 0.07004862278699875, + "learning_rate": 1.5654421179977684e-05, + "loss": 0.0369, + "num_input_tokens_seen": 25312448, + "step": 119945 + }, + { + "epoch": 13.195819581958196, + "grad_norm": 0.9317771792411804, + "learning_rate": 1.5652195160229443e-05, + "loss": 0.0723, + "num_input_tokens_seen": 25313568, + "step": 119950 + }, + { + "epoch": 13.196369636963697, + "grad_norm": 1.118721604347229, + "learning_rate": 1.5649969226635425e-05, + "loss": 0.0309, + "num_input_tokens_seen": 25314528, + "step": 119955 + }, + { + "epoch": 13.196919691969198, + "grad_norm": 0.9585808515548706, + "learning_rate": 1.5647743379216163e-05, + "loss": 0.2125, + "num_input_tokens_seen": 25315552, + "step": 119960 + }, + { + "epoch": 13.197469746974697, + "grad_norm": 0.03638710454106331, + "learning_rate": 1.564551761799215e-05, + "loss": 0.0566, + "num_input_tokens_seen": 25316608, + "step": 119965 + }, + { + "epoch": 13.198019801980198, + "grad_norm": 0.3983447253704071, + "learning_rate": 1.564329194298393e-05, + "loss": 0.0523, + "num_input_tokens_seen": 25317696, + "step": 119970 + }, + { + "epoch": 13.1985698569857, + "grad_norm": 0.03439343720674515, + "learning_rate": 1.5641066354211986e-05, + "loss": 0.0087, + "num_input_tokens_seen": 25318752, + "step": 119975 + }, + { + "epoch": 13.199119911991199, + "grad_norm": 0.13997963070869446, + "learning_rate": 1.5638840851696846e-05, + "loss": 0.0065, + "num_input_tokens_seen": 25319872, + "step": 119980 + }, + { + "epoch": 13.1996699669967, + "grad_norm": 0.032108914107084274, + "learning_rate": 1.5636615435459018e-05, + "loss": 0.0103, + "num_input_tokens_seen": 25320928, + "step": 119985 + }, + { + "epoch": 13.2002200220022, + "grad_norm": 0.18002961575984955, + "learning_rate": 1.5634390105519007e-05, + "loss": 0.1086, + "num_input_tokens_seen": 25321888, + "step": 119990 + }, + { + "epoch": 13.2007700770077, + "grad_norm": 0.9260240197181702, + "learning_rate": 1.5632164861897336e-05, + "loss": 0.0828, + "num_input_tokens_seen": 25322944, + "step": 119995 + }, + { + "epoch": 13.201320132013201, + "grad_norm": 0.9092060923576355, + "learning_rate": 1.5629939704614506e-05, + "loss": 0.0134, + "num_input_tokens_seen": 25324000, + "step": 120000 + }, + { + "epoch": 13.201870187018702, + "grad_norm": 0.06924891471862793, + "learning_rate": 1.562771463369102e-05, + "loss": 0.0067, + "num_input_tokens_seen": 25325024, + "step": 120005 + }, + { + "epoch": 13.202420242024202, + "grad_norm": 0.011685656383633614, + "learning_rate": 1.56254896491474e-05, + "loss": 0.0025, + "num_input_tokens_seen": 25326048, + "step": 120010 + }, + { + "epoch": 13.202970297029703, + "grad_norm": 0.019129455089569092, + "learning_rate": 1.5623264751004136e-05, + "loss": 0.0651, + "num_input_tokens_seen": 25327136, + "step": 120015 + }, + { + "epoch": 13.203520352035204, + "grad_norm": 0.2493441104888916, + "learning_rate": 1.562103993928174e-05, + "loss": 0.0033, + "num_input_tokens_seen": 25328128, + "step": 120020 + }, + { + "epoch": 13.204070407040705, + "grad_norm": 0.004207003396004438, + "learning_rate": 1.5618815214000727e-05, + "loss": 0.0075, + "num_input_tokens_seen": 25329184, + "step": 120025 + }, + { + "epoch": 13.204620462046204, + "grad_norm": 0.15251880884170532, + "learning_rate": 1.5616590575181583e-05, + "loss": 0.0867, + "num_input_tokens_seen": 25330240, + "step": 120030 + }, + { + "epoch": 13.205170517051705, + "grad_norm": 0.04976392537355423, + "learning_rate": 1.561436602284483e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25331296, + "step": 120035 + }, + { + "epoch": 13.205720572057206, + "grad_norm": 0.28831928968429565, + "learning_rate": 1.561214155701095e-05, + "loss": 0.0107, + "num_input_tokens_seen": 25332320, + "step": 120040 + }, + { + "epoch": 13.206270627062706, + "grad_norm": 0.9732663035392761, + "learning_rate": 1.5609917177700466e-05, + "loss": 0.0246, + "num_input_tokens_seen": 25333376, + "step": 120045 + }, + { + "epoch": 13.206820682068207, + "grad_norm": 0.054959725588560104, + "learning_rate": 1.560769288493387e-05, + "loss": 0.008, + "num_input_tokens_seen": 25334464, + "step": 120050 + }, + { + "epoch": 13.207370737073708, + "grad_norm": 0.008734139613807201, + "learning_rate": 1.560546867873166e-05, + "loss": 0.0074, + "num_input_tokens_seen": 25335552, + "step": 120055 + }, + { + "epoch": 13.207920792079207, + "grad_norm": 0.026820819824934006, + "learning_rate": 1.5603244559114333e-05, + "loss": 0.0063, + "num_input_tokens_seen": 25336576, + "step": 120060 + }, + { + "epoch": 13.208470847084708, + "grad_norm": 0.006326289847493172, + "learning_rate": 1.5601020526102392e-05, + "loss": 0.0185, + "num_input_tokens_seen": 25337664, + "step": 120065 + }, + { + "epoch": 13.20902090209021, + "grad_norm": 0.259503036737442, + "learning_rate": 1.5598796579716345e-05, + "loss": 0.0077, + "num_input_tokens_seen": 25338688, + "step": 120070 + }, + { + "epoch": 13.209570957095709, + "grad_norm": 2.474141836166382, + "learning_rate": 1.559657271997667e-05, + "loss": 0.0422, + "num_input_tokens_seen": 25339776, + "step": 120075 + }, + { + "epoch": 13.21012101210121, + "grad_norm": 0.01614106260240078, + "learning_rate": 1.559434894690387e-05, + "loss": 0.0062, + "num_input_tokens_seen": 25340864, + "step": 120080 + }, + { + "epoch": 13.210671067106711, + "grad_norm": 0.21374769508838654, + "learning_rate": 1.5592125260518452e-05, + "loss": 0.0051, + "num_input_tokens_seen": 25341888, + "step": 120085 + }, + { + "epoch": 13.211221122112212, + "grad_norm": 0.03699326515197754, + "learning_rate": 1.5589901660840896e-05, + "loss": 0.0289, + "num_input_tokens_seen": 25342912, + "step": 120090 + }, + { + "epoch": 13.211771177117711, + "grad_norm": 0.01073975395411253, + "learning_rate": 1.5587678147891698e-05, + "loss": 0.0148, + "num_input_tokens_seen": 25343968, + "step": 120095 + }, + { + "epoch": 13.212321232123212, + "grad_norm": 1.3498138189315796, + "learning_rate": 1.5585454721691362e-05, + "loss": 0.0549, + "num_input_tokens_seen": 25345088, + "step": 120100 + }, + { + "epoch": 13.212871287128714, + "grad_norm": 0.8712267875671387, + "learning_rate": 1.5583231382260357e-05, + "loss": 0.0148, + "num_input_tokens_seen": 25346208, + "step": 120105 + }, + { + "epoch": 13.213421342134213, + "grad_norm": 0.2198614925146103, + "learning_rate": 1.5581008129619206e-05, + "loss": 0.0114, + "num_input_tokens_seen": 25347232, + "step": 120110 + }, + { + "epoch": 13.213971397139714, + "grad_norm": 0.019764818251132965, + "learning_rate": 1.557878496378838e-05, + "loss": 0.004, + "num_input_tokens_seen": 25348288, + "step": 120115 + }, + { + "epoch": 13.214521452145215, + "grad_norm": 0.005455693230032921, + "learning_rate": 1.557656188478836e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25349312, + "step": 120120 + }, + { + "epoch": 13.215071507150714, + "grad_norm": 0.010988929308950901, + "learning_rate": 1.5574338892639666e-05, + "loss": 0.0017, + "num_input_tokens_seen": 25350368, + "step": 120125 + }, + { + "epoch": 13.215621562156215, + "grad_norm": 0.015680115669965744, + "learning_rate": 1.557211598736275e-05, + "loss": 0.0634, + "num_input_tokens_seen": 25351392, + "step": 120130 + }, + { + "epoch": 13.216171617161717, + "grad_norm": 0.015161367133259773, + "learning_rate": 1.556989316897813e-05, + "loss": 0.0143, + "num_input_tokens_seen": 25352416, + "step": 120135 + }, + { + "epoch": 13.216721672167218, + "grad_norm": 1.145586371421814, + "learning_rate": 1.5567670437506278e-05, + "loss": 0.0507, + "num_input_tokens_seen": 25353472, + "step": 120140 + }, + { + "epoch": 13.217271727172717, + "grad_norm": 0.004934362135827541, + "learning_rate": 1.5565447792967675e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25354496, + "step": 120145 + }, + { + "epoch": 13.217821782178218, + "grad_norm": 0.011214037425816059, + "learning_rate": 1.5563225235382818e-05, + "loss": 0.0087, + "num_input_tokens_seen": 25355584, + "step": 120150 + }, + { + "epoch": 13.218371837183719, + "grad_norm": 0.053743015974760056, + "learning_rate": 1.5561002764772175e-05, + "loss": 0.0062, + "num_input_tokens_seen": 25356640, + "step": 120155 + }, + { + "epoch": 13.218921892189218, + "grad_norm": 0.07332861423492432, + "learning_rate": 1.555878038115625e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25357664, + "step": 120160 + }, + { + "epoch": 13.21947194719472, + "grad_norm": 0.012225953862071037, + "learning_rate": 1.5556558084555523e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25358688, + "step": 120165 + }, + { + "epoch": 13.22002200220022, + "grad_norm": 1.532242774963379, + "learning_rate": 1.5554335874990456e-05, + "loss": 0.0268, + "num_input_tokens_seen": 25359680, + "step": 120170 + }, + { + "epoch": 13.22057205720572, + "grad_norm": 0.025731228291988373, + "learning_rate": 1.5552113752481545e-05, + "loss": 0.1329, + "num_input_tokens_seen": 25360672, + "step": 120175 + }, + { + "epoch": 13.221122112211221, + "grad_norm": 0.01821015775203705, + "learning_rate": 1.5549891717049267e-05, + "loss": 0.0281, + "num_input_tokens_seen": 25361728, + "step": 120180 + }, + { + "epoch": 13.221672167216722, + "grad_norm": 0.08223793655633926, + "learning_rate": 1.554766976871411e-05, + "loss": 0.0167, + "num_input_tokens_seen": 25362784, + "step": 120185 + }, + { + "epoch": 13.222222222222221, + "grad_norm": 0.11216707527637482, + "learning_rate": 1.5545447907496545e-05, + "loss": 0.108, + "num_input_tokens_seen": 25363840, + "step": 120190 + }, + { + "epoch": 13.222772277227723, + "grad_norm": 0.039723340421915054, + "learning_rate": 1.554322613341704e-05, + "loss": 0.0045, + "num_input_tokens_seen": 25364896, + "step": 120195 + }, + { + "epoch": 13.223322332233224, + "grad_norm": 0.01591704785823822, + "learning_rate": 1.5541004446496095e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25365984, + "step": 120200 + }, + { + "epoch": 13.223872387238725, + "grad_norm": 0.04612886160612106, + "learning_rate": 1.5538782846754157e-05, + "loss": 0.0041, + "num_input_tokens_seen": 25367040, + "step": 120205 + }, + { + "epoch": 13.224422442244224, + "grad_norm": 0.04278651624917984, + "learning_rate": 1.553656133421173e-05, + "loss": 0.1251, + "num_input_tokens_seen": 25368032, + "step": 120210 + }, + { + "epoch": 13.224972497249725, + "grad_norm": 1.513936996459961, + "learning_rate": 1.5534339908889278e-05, + "loss": 0.0389, + "num_input_tokens_seen": 25369024, + "step": 120215 + }, + { + "epoch": 13.225522552255226, + "grad_norm": 0.425458163022995, + "learning_rate": 1.553211857080726e-05, + "loss": 0.0034, + "num_input_tokens_seen": 25370048, + "step": 120220 + }, + { + "epoch": 13.226072607260726, + "grad_norm": 0.002976587973535061, + "learning_rate": 1.552989731998617e-05, + "loss": 0.017, + "num_input_tokens_seen": 25371104, + "step": 120225 + }, + { + "epoch": 13.226622662266227, + "grad_norm": 0.026356101036071777, + "learning_rate": 1.552767615644647e-05, + "loss": 0.0678, + "num_input_tokens_seen": 25372192, + "step": 120230 + }, + { + "epoch": 13.227172717271728, + "grad_norm": 0.0283469520509243, + "learning_rate": 1.5525455080208636e-05, + "loss": 0.0826, + "num_input_tokens_seen": 25373344, + "step": 120235 + }, + { + "epoch": 13.227722772277227, + "grad_norm": 0.04183095693588257, + "learning_rate": 1.5523234091293144e-05, + "loss": 0.0269, + "num_input_tokens_seen": 25374400, + "step": 120240 + }, + { + "epoch": 13.228272827282728, + "grad_norm": 1.5474399328231812, + "learning_rate": 1.552101318972044e-05, + "loss": 0.0457, + "num_input_tokens_seen": 25375456, + "step": 120245 + }, + { + "epoch": 13.22882288228823, + "grad_norm": 2.366273880004883, + "learning_rate": 1.5518792375511016e-05, + "loss": 0.1131, + "num_input_tokens_seen": 25376544, + "step": 120250 + }, + { + "epoch": 13.229372937293729, + "grad_norm": 0.4776642620563507, + "learning_rate": 1.551657164868533e-05, + "loss": 0.0073, + "num_input_tokens_seen": 25377600, + "step": 120255 + }, + { + "epoch": 13.22992299229923, + "grad_norm": 0.46110546588897705, + "learning_rate": 1.551435100926386e-05, + "loss": 0.1307, + "num_input_tokens_seen": 25378624, + "step": 120260 + }, + { + "epoch": 13.23047304730473, + "grad_norm": 0.005110376980155706, + "learning_rate": 1.5512130457267066e-05, + "loss": 0.0005, + "num_input_tokens_seen": 25379680, + "step": 120265 + }, + { + "epoch": 13.231023102310232, + "grad_norm": 0.3064146935939789, + "learning_rate": 1.55099099927154e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25380736, + "step": 120270 + }, + { + "epoch": 13.231573157315731, + "grad_norm": 0.08773080259561539, + "learning_rate": 1.5507689615629357e-05, + "loss": 0.0221, + "num_input_tokens_seen": 25381792, + "step": 120275 + }, + { + "epoch": 13.232123212321232, + "grad_norm": 0.02459793910384178, + "learning_rate": 1.5505469326029372e-05, + "loss": 0.1601, + "num_input_tokens_seen": 25382848, + "step": 120280 + }, + { + "epoch": 13.232673267326733, + "grad_norm": 1.6548799276351929, + "learning_rate": 1.550324912393592e-05, + "loss": 0.0895, + "num_input_tokens_seen": 25383904, + "step": 120285 + }, + { + "epoch": 13.233223322332233, + "grad_norm": 0.03298134729266167, + "learning_rate": 1.5501029009369467e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25384896, + "step": 120290 + }, + { + "epoch": 13.233773377337734, + "grad_norm": 0.036244235932826996, + "learning_rate": 1.549880898235047e-05, + "loss": 0.003, + "num_input_tokens_seen": 25385952, + "step": 120295 + }, + { + "epoch": 13.234323432343235, + "grad_norm": 0.008616319857537746, + "learning_rate": 1.5496589042899396e-05, + "loss": 0.0049, + "num_input_tokens_seen": 25387008, + "step": 120300 + }, + { + "epoch": 13.234873487348734, + "grad_norm": 0.7895151376724243, + "learning_rate": 1.54943691910367e-05, + "loss": 0.0065, + "num_input_tokens_seen": 25388000, + "step": 120305 + }, + { + "epoch": 13.235423542354235, + "grad_norm": 0.03130010887980461, + "learning_rate": 1.5492149426782834e-05, + "loss": 0.0716, + "num_input_tokens_seen": 25389024, + "step": 120310 + }, + { + "epoch": 13.235973597359736, + "grad_norm": 0.005236582364886999, + "learning_rate": 1.548992975015828e-05, + "loss": 0.0283, + "num_input_tokens_seen": 25390016, + "step": 120315 + }, + { + "epoch": 13.236523652365236, + "grad_norm": 0.7369796633720398, + "learning_rate": 1.548771016118346e-05, + "loss": 0.0125, + "num_input_tokens_seen": 25391072, + "step": 120320 + }, + { + "epoch": 13.237073707370737, + "grad_norm": 0.10001201927661896, + "learning_rate": 1.5485490659878865e-05, + "loss": 0.0683, + "num_input_tokens_seen": 25392096, + "step": 120325 + }, + { + "epoch": 13.237623762376238, + "grad_norm": 0.21490341424942017, + "learning_rate": 1.5483271246264938e-05, + "loss": 0.0189, + "num_input_tokens_seen": 25393152, + "step": 120330 + }, + { + "epoch": 13.238173817381739, + "grad_norm": 0.010056648403406143, + "learning_rate": 1.5481051920362123e-05, + "loss": 0.0587, + "num_input_tokens_seen": 25394176, + "step": 120335 + }, + { + "epoch": 13.238723872387238, + "grad_norm": 0.04659925401210785, + "learning_rate": 1.5478832682190892e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25395200, + "step": 120340 + }, + { + "epoch": 13.23927392739274, + "grad_norm": 2.0178468227386475, + "learning_rate": 1.5476613531771685e-05, + "loss": 0.0441, + "num_input_tokens_seen": 25396288, + "step": 120345 + }, + { + "epoch": 13.23982398239824, + "grad_norm": 0.06502532958984375, + "learning_rate": 1.547439446912497e-05, + "loss": 0.0135, + "num_input_tokens_seen": 25397344, + "step": 120350 + }, + { + "epoch": 13.24037403740374, + "grad_norm": 2.9257326126098633, + "learning_rate": 1.547217549427119e-05, + "loss": 0.0378, + "num_input_tokens_seen": 25398336, + "step": 120355 + }, + { + "epoch": 13.24092409240924, + "grad_norm": 0.00648814532905817, + "learning_rate": 1.5469956607230787e-05, + "loss": 0.0018, + "num_input_tokens_seen": 25399392, + "step": 120360 + }, + { + "epoch": 13.241474147414742, + "grad_norm": 0.020959246903657913, + "learning_rate": 1.546773780802423e-05, + "loss": 0.0935, + "num_input_tokens_seen": 25400384, + "step": 120365 + }, + { + "epoch": 13.242024202420241, + "grad_norm": 0.10461276769638062, + "learning_rate": 1.546551909667195e-05, + "loss": 0.0756, + "num_input_tokens_seen": 25401440, + "step": 120370 + }, + { + "epoch": 13.242574257425742, + "grad_norm": 0.011910231783986092, + "learning_rate": 1.546330047319442e-05, + "loss": 0.0319, + "num_input_tokens_seen": 25402528, + "step": 120375 + }, + { + "epoch": 13.243124312431243, + "grad_norm": 0.02639074996113777, + "learning_rate": 1.5461081937612064e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25403584, + "step": 120380 + }, + { + "epoch": 13.243674367436745, + "grad_norm": 0.02877216972410679, + "learning_rate": 1.5458863489945335e-05, + "loss": 0.0508, + "num_input_tokens_seen": 25404640, + "step": 120385 + }, + { + "epoch": 13.244224422442244, + "grad_norm": 0.12879370152950287, + "learning_rate": 1.5456645130214694e-05, + "loss": 0.0127, + "num_input_tokens_seen": 25405728, + "step": 120390 + }, + { + "epoch": 13.244774477447745, + "grad_norm": 0.02817522920668125, + "learning_rate": 1.5454426858440567e-05, + "loss": 0.0771, + "num_input_tokens_seen": 25406816, + "step": 120395 + }, + { + "epoch": 13.245324532453246, + "grad_norm": 0.06678333133459091, + "learning_rate": 1.545220867464341e-05, + "loss": 0.0137, + "num_input_tokens_seen": 25407904, + "step": 120400 + }, + { + "epoch": 13.245874587458745, + "grad_norm": 2.799116611480713, + "learning_rate": 1.5449990578843667e-05, + "loss": 0.1254, + "num_input_tokens_seen": 25408960, + "step": 120405 + }, + { + "epoch": 13.246424642464246, + "grad_norm": 0.06368236243724823, + "learning_rate": 1.544777257106177e-05, + "loss": 0.006, + "num_input_tokens_seen": 25409984, + "step": 120410 + }, + { + "epoch": 13.246974697469748, + "grad_norm": 0.4622279107570648, + "learning_rate": 1.544555465131818e-05, + "loss": 0.0202, + "num_input_tokens_seen": 25411040, + "step": 120415 + }, + { + "epoch": 13.247524752475247, + "grad_norm": 0.007938853465020657, + "learning_rate": 1.5443336819633315e-05, + "loss": 0.0426, + "num_input_tokens_seen": 25412064, + "step": 120420 + }, + { + "epoch": 13.248074807480748, + "grad_norm": 0.056736987084150314, + "learning_rate": 1.5441119076027637e-05, + "loss": 0.0262, + "num_input_tokens_seen": 25413184, + "step": 120425 + }, + { + "epoch": 13.248624862486249, + "grad_norm": 0.05049882084131241, + "learning_rate": 1.5438901420521584e-05, + "loss": 0.0064, + "num_input_tokens_seen": 25414208, + "step": 120430 + }, + { + "epoch": 13.249174917491748, + "grad_norm": 0.14073660969734192, + "learning_rate": 1.5436683853135577e-05, + "loss": 0.0043, + "num_input_tokens_seen": 25415232, + "step": 120435 + }, + { + "epoch": 13.24972497249725, + "grad_norm": 0.08175969123840332, + "learning_rate": 1.5434466373890073e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25416224, + "step": 120440 + }, + { + "epoch": 13.25027502750275, + "grad_norm": 0.04863037168979645, + "learning_rate": 1.5432248982805497e-05, + "loss": 0.0603, + "num_input_tokens_seen": 25417216, + "step": 120445 + }, + { + "epoch": 13.250825082508252, + "grad_norm": 2.484863519668579, + "learning_rate": 1.5430031679902303e-05, + "loss": 0.1349, + "num_input_tokens_seen": 25418336, + "step": 120450 + }, + { + "epoch": 13.251375137513751, + "grad_norm": 0.13785023987293243, + "learning_rate": 1.5427814465200903e-05, + "loss": 0.0236, + "num_input_tokens_seen": 25419424, + "step": 120455 + }, + { + "epoch": 13.251925192519252, + "grad_norm": 0.06072824448347092, + "learning_rate": 1.5425597338721742e-05, + "loss": 0.0165, + "num_input_tokens_seen": 25420416, + "step": 120460 + }, + { + "epoch": 13.252475247524753, + "grad_norm": 0.49630188941955566, + "learning_rate": 1.5423380300485264e-05, + "loss": 0.0527, + "num_input_tokens_seen": 25421472, + "step": 120465 + }, + { + "epoch": 13.253025302530252, + "grad_norm": 0.9145739674568176, + "learning_rate": 1.5421163350511896e-05, + "loss": 0.0234, + "num_input_tokens_seen": 25422464, + "step": 120470 + }, + { + "epoch": 13.253575357535754, + "grad_norm": 1.5644878149032593, + "learning_rate": 1.541894648882206e-05, + "loss": 0.066, + "num_input_tokens_seen": 25423488, + "step": 120475 + }, + { + "epoch": 13.254125412541255, + "grad_norm": 0.30303001403808594, + "learning_rate": 1.5416729715436205e-05, + "loss": 0.0066, + "num_input_tokens_seen": 25424576, + "step": 120480 + }, + { + "epoch": 13.254675467546754, + "grad_norm": 0.12778161466121674, + "learning_rate": 1.5414513030374746e-05, + "loss": 0.0038, + "num_input_tokens_seen": 25425664, + "step": 120485 + }, + { + "epoch": 13.255225522552255, + "grad_norm": 0.06836335361003876, + "learning_rate": 1.541229643365813e-05, + "loss": 0.007, + "num_input_tokens_seen": 25426752, + "step": 120490 + }, + { + "epoch": 13.255775577557756, + "grad_norm": 0.024890027940273285, + "learning_rate": 1.5410079925306777e-05, + "loss": 0.0053, + "num_input_tokens_seen": 25427744, + "step": 120495 + }, + { + "epoch": 13.256325632563255, + "grad_norm": 0.05299581214785576, + "learning_rate": 1.5407863505341104e-05, + "loss": 0.0566, + "num_input_tokens_seen": 25428864, + "step": 120500 + }, + { + "epoch": 13.256875687568757, + "grad_norm": 0.04118259623646736, + "learning_rate": 1.540564717378156e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25429888, + "step": 120505 + }, + { + "epoch": 13.257425742574258, + "grad_norm": 0.014723586849868298, + "learning_rate": 1.5403430930648558e-05, + "loss": 0.0219, + "num_input_tokens_seen": 25430976, + "step": 120510 + }, + { + "epoch": 13.257975797579759, + "grad_norm": 0.5933873653411865, + "learning_rate": 1.5401214775962532e-05, + "loss": 0.0089, + "num_input_tokens_seen": 25432000, + "step": 120515 + }, + { + "epoch": 13.258525852585258, + "grad_norm": 0.2252485156059265, + "learning_rate": 1.5398998709743906e-05, + "loss": 0.0062, + "num_input_tokens_seen": 25433056, + "step": 120520 + }, + { + "epoch": 13.25907590759076, + "grad_norm": 0.06787153333425522, + "learning_rate": 1.539678273201309e-05, + "loss": 0.0067, + "num_input_tokens_seen": 25434112, + "step": 120525 + }, + { + "epoch": 13.25962596259626, + "grad_norm": 0.032815683633089066, + "learning_rate": 1.539456684279053e-05, + "loss": 0.0127, + "num_input_tokens_seen": 25435168, + "step": 120530 + }, + { + "epoch": 13.26017601760176, + "grad_norm": 0.013169772922992706, + "learning_rate": 1.5392351042096627e-05, + "loss": 0.0092, + "num_input_tokens_seen": 25436256, + "step": 120535 + }, + { + "epoch": 13.26072607260726, + "grad_norm": 1.1785982847213745, + "learning_rate": 1.5390135329951824e-05, + "loss": 0.0858, + "num_input_tokens_seen": 25437312, + "step": 120540 + }, + { + "epoch": 13.261276127612762, + "grad_norm": 0.4649844765663147, + "learning_rate": 1.5387919706376537e-05, + "loss": 0.0096, + "num_input_tokens_seen": 25438368, + "step": 120545 + }, + { + "epoch": 13.261826182618261, + "grad_norm": 0.061695944517850876, + "learning_rate": 1.538570417139117e-05, + "loss": 0.016, + "num_input_tokens_seen": 25439424, + "step": 120550 + }, + { + "epoch": 13.262376237623762, + "grad_norm": 0.20564796030521393, + "learning_rate": 1.538348872501616e-05, + "loss": 0.0043, + "num_input_tokens_seen": 25440480, + "step": 120555 + }, + { + "epoch": 13.262926292629263, + "grad_norm": 0.044584374874830246, + "learning_rate": 1.5381273367271915e-05, + "loss": 0.0023, + "num_input_tokens_seen": 25441536, + "step": 120560 + }, + { + "epoch": 13.263476347634764, + "grad_norm": 0.034349750727415085, + "learning_rate": 1.537905809817887e-05, + "loss": 0.0077, + "num_input_tokens_seen": 25442592, + "step": 120565 + }, + { + "epoch": 13.264026402640264, + "grad_norm": 1.4350076913833618, + "learning_rate": 1.5376842917757423e-05, + "loss": 0.0293, + "num_input_tokens_seen": 25443648, + "step": 120570 + }, + { + "epoch": 13.264576457645765, + "grad_norm": 0.5936359763145447, + "learning_rate": 1.5374627826027995e-05, + "loss": 0.0091, + "num_input_tokens_seen": 25444768, + "step": 120575 + }, + { + "epoch": 13.265126512651266, + "grad_norm": 0.12636403739452362, + "learning_rate": 1.5372412823011006e-05, + "loss": 0.0823, + "num_input_tokens_seen": 25445856, + "step": 120580 + }, + { + "epoch": 13.265676567656765, + "grad_norm": 0.04832906648516655, + "learning_rate": 1.537019790872686e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25446880, + "step": 120585 + }, + { + "epoch": 13.266226622662266, + "grad_norm": 0.025647655129432678, + "learning_rate": 1.5367983083195992e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25447968, + "step": 120590 + }, + { + "epoch": 13.266776677667767, + "grad_norm": 0.36111390590667725, + "learning_rate": 1.5365768346438797e-05, + "loss": 0.0087, + "num_input_tokens_seen": 25449088, + "step": 120595 + }, + { + "epoch": 13.267326732673267, + "grad_norm": 0.046749114990234375, + "learning_rate": 1.5363553698475685e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25450112, + "step": 120600 + }, + { + "epoch": 13.267876787678768, + "grad_norm": 0.06221366301178932, + "learning_rate": 1.536133913932709e-05, + "loss": 0.0244, + "num_input_tokens_seen": 25451136, + "step": 120605 + }, + { + "epoch": 13.268426842684269, + "grad_norm": 0.09790633618831635, + "learning_rate": 1.535912466901339e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25452288, + "step": 120610 + }, + { + "epoch": 13.268976897689768, + "grad_norm": 0.2883903980255127, + "learning_rate": 1.535691028755502e-05, + "loss": 0.0063, + "num_input_tokens_seen": 25453344, + "step": 120615 + }, + { + "epoch": 13.26952695269527, + "grad_norm": 0.6124384999275208, + "learning_rate": 1.535469599497239e-05, + "loss": 0.0046, + "num_input_tokens_seen": 25454336, + "step": 120620 + }, + { + "epoch": 13.27007700770077, + "grad_norm": 0.013626636937260628, + "learning_rate": 1.5352481791285884e-05, + "loss": 0.0477, + "num_input_tokens_seen": 25455424, + "step": 120625 + }, + { + "epoch": 13.270627062706271, + "grad_norm": 0.01783752627670765, + "learning_rate": 1.535026767651593e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25456544, + "step": 120630 + }, + { + "epoch": 13.27117711771177, + "grad_norm": 0.010371098294854164, + "learning_rate": 1.5348053650682925e-05, + "loss": 0.0022, + "num_input_tokens_seen": 25457568, + "step": 120635 + }, + { + "epoch": 13.271727172717272, + "grad_norm": 0.12488260865211487, + "learning_rate": 1.5345839713807287e-05, + "loss": 0.0024, + "num_input_tokens_seen": 25458624, + "step": 120640 + }, + { + "epoch": 13.272277227722773, + "grad_norm": 0.015980873256921768, + "learning_rate": 1.5343625865909407e-05, + "loss": 0.0583, + "num_input_tokens_seen": 25459712, + "step": 120645 + }, + { + "epoch": 13.272827282728272, + "grad_norm": 0.15287649631500244, + "learning_rate": 1.5341412107009685e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25460800, + "step": 120650 + }, + { + "epoch": 13.273377337733773, + "grad_norm": 0.06516789644956589, + "learning_rate": 1.5339198437128542e-05, + "loss": 0.0039, + "num_input_tokens_seen": 25461856, + "step": 120655 + }, + { + "epoch": 13.273927392739274, + "grad_norm": 0.025626476854085922, + "learning_rate": 1.5336984856286374e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25462944, + "step": 120660 + }, + { + "epoch": 13.274477447744774, + "grad_norm": 0.004857848864048719, + "learning_rate": 1.533477136450357e-05, + "loss": 0.1265, + "num_input_tokens_seen": 25464032, + "step": 120665 + }, + { + "epoch": 13.275027502750275, + "grad_norm": 0.008798143826425076, + "learning_rate": 1.533255796180055e-05, + "loss": 0.0109, + "num_input_tokens_seen": 25465056, + "step": 120670 + }, + { + "epoch": 13.275577557755776, + "grad_norm": 0.03574181720614433, + "learning_rate": 1.5330344648197697e-05, + "loss": 0.0686, + "num_input_tokens_seen": 25466176, + "step": 120675 + }, + { + "epoch": 13.276127612761275, + "grad_norm": 1.2070326805114746, + "learning_rate": 1.5328131423715426e-05, + "loss": 0.0073, + "num_input_tokens_seen": 25467232, + "step": 120680 + }, + { + "epoch": 13.276677667766776, + "grad_norm": 0.04160361737012863, + "learning_rate": 1.5325918288374123e-05, + "loss": 0.0429, + "num_input_tokens_seen": 25468352, + "step": 120685 + }, + { + "epoch": 13.277227722772277, + "grad_norm": 0.007622856181114912, + "learning_rate": 1.532370524219418e-05, + "loss": 0.1123, + "num_input_tokens_seen": 25469408, + "step": 120690 + }, + { + "epoch": 13.277777777777779, + "grad_norm": 0.01566368155181408, + "learning_rate": 1.5321492285196017e-05, + "loss": 0.103, + "num_input_tokens_seen": 25470400, + "step": 120695 + }, + { + "epoch": 13.278327832783278, + "grad_norm": 0.009662341326475143, + "learning_rate": 1.5319279417400008e-05, + "loss": 0.0021, + "num_input_tokens_seen": 25471456, + "step": 120700 + }, + { + "epoch": 13.278877887788779, + "grad_norm": 0.00883371289819479, + "learning_rate": 1.5317066638826557e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25472480, + "step": 120705 + }, + { + "epoch": 13.27942794279428, + "grad_norm": 0.8304991722106934, + "learning_rate": 1.5314853949496056e-05, + "loss": 0.0093, + "num_input_tokens_seen": 25473472, + "step": 120710 + }, + { + "epoch": 13.27997799779978, + "grad_norm": 5.268264293670654, + "learning_rate": 1.5312641349428892e-05, + "loss": 0.0471, + "num_input_tokens_seen": 25474624, + "step": 120715 + }, + { + "epoch": 13.28052805280528, + "grad_norm": 2.2284770011901855, + "learning_rate": 1.5310428838645473e-05, + "loss": 0.0689, + "num_input_tokens_seen": 25475680, + "step": 120720 + }, + { + "epoch": 13.281078107810782, + "grad_norm": 0.365719735622406, + "learning_rate": 1.5308216417166177e-05, + "loss": 0.0053, + "num_input_tokens_seen": 25476736, + "step": 120725 + }, + { + "epoch": 13.281628162816281, + "grad_norm": 0.0478791706264019, + "learning_rate": 1.5306004085011404e-05, + "loss": 0.0413, + "num_input_tokens_seen": 25477728, + "step": 120730 + }, + { + "epoch": 13.282178217821782, + "grad_norm": 0.026086851954460144, + "learning_rate": 1.5303791842201542e-05, + "loss": 0.003, + "num_input_tokens_seen": 25478816, + "step": 120735 + }, + { + "epoch": 13.282728272827283, + "grad_norm": 0.2092236876487732, + "learning_rate": 1.530157968875697e-05, + "loss": 0.0042, + "num_input_tokens_seen": 25479840, + "step": 120740 + }, + { + "epoch": 13.283278327832782, + "grad_norm": 0.14971542358398438, + "learning_rate": 1.5299367624698092e-05, + "loss": 0.0213, + "num_input_tokens_seen": 25480896, + "step": 120745 + }, + { + "epoch": 13.283828382838283, + "grad_norm": 1.9585884809494019, + "learning_rate": 1.5297155650045274e-05, + "loss": 0.015, + "num_input_tokens_seen": 25481920, + "step": 120750 + }, + { + "epoch": 13.284378437843785, + "grad_norm": 0.03805548697710037, + "learning_rate": 1.5294943764818936e-05, + "loss": 0.046, + "num_input_tokens_seen": 25483008, + "step": 120755 + }, + { + "epoch": 13.284928492849286, + "grad_norm": 0.6990798115730286, + "learning_rate": 1.5292731969039432e-05, + "loss": 0.0106, + "num_input_tokens_seen": 25484032, + "step": 120760 + }, + { + "epoch": 13.285478547854785, + "grad_norm": 0.056368615478277206, + "learning_rate": 1.529052026272716e-05, + "loss": 0.046, + "num_input_tokens_seen": 25485120, + "step": 120765 + }, + { + "epoch": 13.286028602860286, + "grad_norm": 2.4185471534729004, + "learning_rate": 1.5288308645902507e-05, + "loss": 0.0835, + "num_input_tokens_seen": 25486144, + "step": 120770 + }, + { + "epoch": 13.286578657865787, + "grad_norm": 0.011225647293031216, + "learning_rate": 1.5286097118585847e-05, + "loss": 0.0214, + "num_input_tokens_seen": 25487232, + "step": 120775 + }, + { + "epoch": 13.287128712871286, + "grad_norm": 0.012404520064592361, + "learning_rate": 1.5283885680797578e-05, + "loss": 0.0077, + "num_input_tokens_seen": 25488352, + "step": 120780 + }, + { + "epoch": 13.287678767876788, + "grad_norm": 0.42270177602767944, + "learning_rate": 1.5281674332558073e-05, + "loss": 0.0114, + "num_input_tokens_seen": 25489440, + "step": 120785 + }, + { + "epoch": 13.288228822882289, + "grad_norm": 0.00685531459748745, + "learning_rate": 1.52794630738877e-05, + "loss": 0.0014, + "num_input_tokens_seen": 25490528, + "step": 120790 + }, + { + "epoch": 13.288778877887788, + "grad_norm": 0.022339634597301483, + "learning_rate": 1.5277251904806865e-05, + "loss": 0.0405, + "num_input_tokens_seen": 25491616, + "step": 120795 + }, + { + "epoch": 13.289328932893289, + "grad_norm": 0.013279814273118973, + "learning_rate": 1.5275040825335922e-05, + "loss": 0.01, + "num_input_tokens_seen": 25492736, + "step": 120800 + }, + { + "epoch": 13.28987898789879, + "grad_norm": 2.12347412109375, + "learning_rate": 1.5272829835495267e-05, + "loss": 0.1441, + "num_input_tokens_seen": 25493728, + "step": 120805 + }, + { + "epoch": 13.290429042904291, + "grad_norm": 0.06959115713834763, + "learning_rate": 1.5270618935305278e-05, + "loss": 0.0643, + "num_input_tokens_seen": 25494752, + "step": 120810 + }, + { + "epoch": 13.29097909790979, + "grad_norm": 0.33579692244529724, + "learning_rate": 1.5268408124786316e-05, + "loss": 0.081, + "num_input_tokens_seen": 25495808, + "step": 120815 + }, + { + "epoch": 13.291529152915292, + "grad_norm": 0.02216832898557186, + "learning_rate": 1.5266197403958772e-05, + "loss": 0.006, + "num_input_tokens_seen": 25496896, + "step": 120820 + }, + { + "epoch": 13.292079207920793, + "grad_norm": 0.46562308073043823, + "learning_rate": 1.526398677284301e-05, + "loss": 0.0088, + "num_input_tokens_seen": 25497952, + "step": 120825 + }, + { + "epoch": 13.292629262926292, + "grad_norm": 0.028533361852169037, + "learning_rate": 1.526177623145941e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25499008, + "step": 120830 + }, + { + "epoch": 13.293179317931793, + "grad_norm": 0.1996426284313202, + "learning_rate": 1.5259565779828357e-05, + "loss": 0.0233, + "num_input_tokens_seen": 25500000, + "step": 120835 + }, + { + "epoch": 13.293729372937294, + "grad_norm": 0.0827748104929924, + "learning_rate": 1.5257355417970201e-05, + "loss": 0.0053, + "num_input_tokens_seen": 25501056, + "step": 120840 + }, + { + "epoch": 13.294279427942794, + "grad_norm": 0.2111549824476242, + "learning_rate": 1.525514514590533e-05, + "loss": 0.0174, + "num_input_tokens_seen": 25502080, + "step": 120845 + }, + { + "epoch": 13.294829482948295, + "grad_norm": 0.03320338577032089, + "learning_rate": 1.5252934963654114e-05, + "loss": 0.0162, + "num_input_tokens_seen": 25503136, + "step": 120850 + }, + { + "epoch": 13.295379537953796, + "grad_norm": 0.21091081202030182, + "learning_rate": 1.525072487123691e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25504128, + "step": 120855 + }, + { + "epoch": 13.295929592959295, + "grad_norm": 0.4029223918914795, + "learning_rate": 1.5248514868674102e-05, + "loss": 0.0072, + "num_input_tokens_seen": 25505152, + "step": 120860 + }, + { + "epoch": 13.296479647964796, + "grad_norm": 0.021197224035859108, + "learning_rate": 1.5246304955986046e-05, + "loss": 0.003, + "num_input_tokens_seen": 25506240, + "step": 120865 + }, + { + "epoch": 13.297029702970297, + "grad_norm": 0.01682635210454464, + "learning_rate": 1.524409513319313e-05, + "loss": 0.0293, + "num_input_tokens_seen": 25507296, + "step": 120870 + }, + { + "epoch": 13.297579757975798, + "grad_norm": 0.06065301597118378, + "learning_rate": 1.5241885400315704e-05, + "loss": 0.0829, + "num_input_tokens_seen": 25508320, + "step": 120875 + }, + { + "epoch": 13.298129812981298, + "grad_norm": 0.02138221077620983, + "learning_rate": 1.5239675757374128e-05, + "loss": 0.0747, + "num_input_tokens_seen": 25509408, + "step": 120880 + }, + { + "epoch": 13.298679867986799, + "grad_norm": 0.010035622864961624, + "learning_rate": 1.5237466204388789e-05, + "loss": 0.0035, + "num_input_tokens_seen": 25510464, + "step": 120885 + }, + { + "epoch": 13.2992299229923, + "grad_norm": 0.007483079098165035, + "learning_rate": 1.5235256741380027e-05, + "loss": 0.0324, + "num_input_tokens_seen": 25511552, + "step": 120890 + }, + { + "epoch": 13.2997799779978, + "grad_norm": 0.013713247142732143, + "learning_rate": 1.523304736836823e-05, + "loss": 0.1104, + "num_input_tokens_seen": 25512704, + "step": 120895 + }, + { + "epoch": 13.3003300330033, + "grad_norm": 0.06413280218839645, + "learning_rate": 1.5230838085373747e-05, + "loss": 0.0338, + "num_input_tokens_seen": 25513792, + "step": 120900 + }, + { + "epoch": 13.300880088008801, + "grad_norm": 0.013576777651906013, + "learning_rate": 1.5228628892416935e-05, + "loss": 0.0042, + "num_input_tokens_seen": 25514848, + "step": 120905 + }, + { + "epoch": 13.3014301430143, + "grad_norm": 0.06588106602430344, + "learning_rate": 1.522641978951817e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25515936, + "step": 120910 + }, + { + "epoch": 13.301980198019802, + "grad_norm": 5.421051025390625, + "learning_rate": 1.5224210776697794e-05, + "loss": 0.0916, + "num_input_tokens_seen": 25516960, + "step": 120915 + }, + { + "epoch": 13.302530253025303, + "grad_norm": 0.041271913796663284, + "learning_rate": 1.5222001853976187e-05, + "loss": 0.0339, + "num_input_tokens_seen": 25518048, + "step": 120920 + }, + { + "epoch": 13.303080308030804, + "grad_norm": 0.16769391298294067, + "learning_rate": 1.5219793021373696e-05, + "loss": 0.0221, + "num_input_tokens_seen": 25519168, + "step": 120925 + }, + { + "epoch": 13.303630363036303, + "grad_norm": 0.252515584230423, + "learning_rate": 1.5217584278910674e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25520224, + "step": 120930 + }, + { + "epoch": 13.304180418041804, + "grad_norm": 0.24658040702342987, + "learning_rate": 1.5215375626607483e-05, + "loss": 0.013, + "num_input_tokens_seen": 25521312, + "step": 120935 + }, + { + "epoch": 13.304730473047305, + "grad_norm": 0.25187844038009644, + "learning_rate": 1.521316706448448e-05, + "loss": 0.006, + "num_input_tokens_seen": 25522336, + "step": 120940 + }, + { + "epoch": 13.305280528052805, + "grad_norm": 0.1499790996313095, + "learning_rate": 1.5210958592562021e-05, + "loss": 0.0062, + "num_input_tokens_seen": 25523424, + "step": 120945 + }, + { + "epoch": 13.305830583058306, + "grad_norm": 0.10789914429187775, + "learning_rate": 1.5208750210860467e-05, + "loss": 0.0099, + "num_input_tokens_seen": 25524448, + "step": 120950 + }, + { + "epoch": 13.306380638063807, + "grad_norm": 0.25608280301094055, + "learning_rate": 1.5206541919400152e-05, + "loss": 0.0031, + "num_input_tokens_seen": 25525504, + "step": 120955 + }, + { + "epoch": 13.306930693069306, + "grad_norm": 0.13003018498420715, + "learning_rate": 1.5204333718201446e-05, + "loss": 0.021, + "num_input_tokens_seen": 25526560, + "step": 120960 + }, + { + "epoch": 13.307480748074807, + "grad_norm": 0.054183363914489746, + "learning_rate": 1.5202125607284694e-05, + "loss": 0.0074, + "num_input_tokens_seen": 25527584, + "step": 120965 + }, + { + "epoch": 13.308030803080309, + "grad_norm": 4.167004585266113, + "learning_rate": 1.5199917586670257e-05, + "loss": 0.1107, + "num_input_tokens_seen": 25528672, + "step": 120970 + }, + { + "epoch": 13.308580858085808, + "grad_norm": 0.061486802995204926, + "learning_rate": 1.5197709656378473e-05, + "loss": 0.0056, + "num_input_tokens_seen": 25529760, + "step": 120975 + }, + { + "epoch": 13.309130913091309, + "grad_norm": 0.005252250004559755, + "learning_rate": 1.5195501816429686e-05, + "loss": 0.1075, + "num_input_tokens_seen": 25530848, + "step": 120980 + }, + { + "epoch": 13.30968096809681, + "grad_norm": 0.1890227198600769, + "learning_rate": 1.5193294066844268e-05, + "loss": 0.0083, + "num_input_tokens_seen": 25531936, + "step": 120985 + }, + { + "epoch": 13.310231023102311, + "grad_norm": 0.023425094783306122, + "learning_rate": 1.519108640764254e-05, + "loss": 0.1361, + "num_input_tokens_seen": 25532928, + "step": 120990 + }, + { + "epoch": 13.31078107810781, + "grad_norm": 1.6444909572601318, + "learning_rate": 1.5188878838844872e-05, + "loss": 0.0704, + "num_input_tokens_seen": 25533952, + "step": 120995 + }, + { + "epoch": 13.311331133113312, + "grad_norm": 0.06691955775022507, + "learning_rate": 1.5186671360471602e-05, + "loss": 0.0666, + "num_input_tokens_seen": 25535072, + "step": 121000 + }, + { + "epoch": 13.311881188118813, + "grad_norm": 0.08617236465215683, + "learning_rate": 1.5184463972543056e-05, + "loss": 0.0653, + "num_input_tokens_seen": 25536096, + "step": 121005 + }, + { + "epoch": 13.312431243124312, + "grad_norm": 0.06992917507886887, + "learning_rate": 1.5182256675079614e-05, + "loss": 0.0017, + "num_input_tokens_seen": 25537184, + "step": 121010 + }, + { + "epoch": 13.312981298129813, + "grad_norm": 2.2093138694763184, + "learning_rate": 1.518004946810159e-05, + "loss": 0.0665, + "num_input_tokens_seen": 25538176, + "step": 121015 + }, + { + "epoch": 13.313531353135314, + "grad_norm": 0.007332727313041687, + "learning_rate": 1.5177842351629344e-05, + "loss": 0.0068, + "num_input_tokens_seen": 25539232, + "step": 121020 + }, + { + "epoch": 13.314081408140813, + "grad_norm": 0.05986398085951805, + "learning_rate": 1.5175635325683213e-05, + "loss": 0.113, + "num_input_tokens_seen": 25540288, + "step": 121025 + }, + { + "epoch": 13.314631463146315, + "grad_norm": 0.04998031631112099, + "learning_rate": 1.5173428390283532e-05, + "loss": 0.0431, + "num_input_tokens_seen": 25541376, + "step": 121030 + }, + { + "epoch": 13.315181518151816, + "grad_norm": 0.17128685116767883, + "learning_rate": 1.5171221545450648e-05, + "loss": 0.0144, + "num_input_tokens_seen": 25542400, + "step": 121035 + }, + { + "epoch": 13.315731573157315, + "grad_norm": 0.0040847426280379295, + "learning_rate": 1.5169014791204894e-05, + "loss": 0.008, + "num_input_tokens_seen": 25543392, + "step": 121040 + }, + { + "epoch": 13.316281628162816, + "grad_norm": 0.025447748601436615, + "learning_rate": 1.5166808127566625e-05, + "loss": 0.003, + "num_input_tokens_seen": 25544416, + "step": 121045 + }, + { + "epoch": 13.316831683168317, + "grad_norm": 0.0852644070982933, + "learning_rate": 1.5164601554556162e-05, + "loss": 0.0367, + "num_input_tokens_seen": 25545408, + "step": 121050 + }, + { + "epoch": 13.317381738173818, + "grad_norm": 0.006941956002265215, + "learning_rate": 1.5162395072193842e-05, + "loss": 0.0054, + "num_input_tokens_seen": 25546464, + "step": 121055 + }, + { + "epoch": 13.317931793179318, + "grad_norm": 0.03959326073527336, + "learning_rate": 1.5160188680500015e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25547584, + "step": 121060 + }, + { + "epoch": 13.318481848184819, + "grad_norm": 0.010035563260316849, + "learning_rate": 1.5157982379495001e-05, + "loss": 0.0094, + "num_input_tokens_seen": 25548640, + "step": 121065 + }, + { + "epoch": 13.31903190319032, + "grad_norm": 0.0017413003370165825, + "learning_rate": 1.5155776169199137e-05, + "loss": 0.0051, + "num_input_tokens_seen": 25549600, + "step": 121070 + }, + { + "epoch": 13.319581958195819, + "grad_norm": 0.02826342172920704, + "learning_rate": 1.5153570049632764e-05, + "loss": 0.0242, + "num_input_tokens_seen": 25550624, + "step": 121075 + }, + { + "epoch": 13.32013201320132, + "grad_norm": 0.015629967674613, + "learning_rate": 1.5151364020816211e-05, + "loss": 0.0026, + "num_input_tokens_seen": 25551744, + "step": 121080 + }, + { + "epoch": 13.320682068206821, + "grad_norm": 0.2649279832839966, + "learning_rate": 1.5149158082769816e-05, + "loss": 0.0055, + "num_input_tokens_seen": 25552768, + "step": 121085 + }, + { + "epoch": 13.32123212321232, + "grad_norm": 0.14758805930614471, + "learning_rate": 1.51469522355139e-05, + "loss": 0.0061, + "num_input_tokens_seen": 25553792, + "step": 121090 + }, + { + "epoch": 13.321782178217822, + "grad_norm": 1.1863454580307007, + "learning_rate": 1.5144746479068789e-05, + "loss": 0.0243, + "num_input_tokens_seen": 25554880, + "step": 121095 + }, + { + "epoch": 13.322332233223323, + "grad_norm": 0.010657526552677155, + "learning_rate": 1.5142540813454836e-05, + "loss": 0.0039, + "num_input_tokens_seen": 25555904, + "step": 121100 + }, + { + "epoch": 13.322882288228822, + "grad_norm": 0.035257238894701004, + "learning_rate": 1.514033523869234e-05, + "loss": 0.0009, + "num_input_tokens_seen": 25556896, + "step": 121105 + }, + { + "epoch": 13.323432343234323, + "grad_norm": 0.7088279128074646, + "learning_rate": 1.513812975480165e-05, + "loss": 0.0061, + "num_input_tokens_seen": 25557984, + "step": 121110 + }, + { + "epoch": 13.323982398239824, + "grad_norm": 0.017506301403045654, + "learning_rate": 1.5135924361803092e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25559072, + "step": 121115 + }, + { + "epoch": 13.324532453245325, + "grad_norm": 0.008179697208106518, + "learning_rate": 1.5133719059716976e-05, + "loss": 0.0154, + "num_input_tokens_seen": 25560096, + "step": 121120 + }, + { + "epoch": 13.325082508250825, + "grad_norm": 0.05669284984469414, + "learning_rate": 1.5131513848563639e-05, + "loss": 0.0221, + "num_input_tokens_seen": 25561184, + "step": 121125 + }, + { + "epoch": 13.325632563256326, + "grad_norm": 1.1387985944747925, + "learning_rate": 1.51293087283634e-05, + "loss": 0.1125, + "num_input_tokens_seen": 25562208, + "step": 121130 + }, + { + "epoch": 13.326182618261827, + "grad_norm": 0.09745440632104874, + "learning_rate": 1.5127103699136591e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25563200, + "step": 121135 + }, + { + "epoch": 13.326732673267326, + "grad_norm": 0.18573935329914093, + "learning_rate": 1.5124898760903536e-05, + "loss": 0.0291, + "num_input_tokens_seen": 25564288, + "step": 121140 + }, + { + "epoch": 13.327282728272827, + "grad_norm": 0.007201810833066702, + "learning_rate": 1.5122693913684538e-05, + "loss": 0.0259, + "num_input_tokens_seen": 25565280, + "step": 121145 + }, + { + "epoch": 13.327832783278328, + "grad_norm": 0.005793724674731493, + "learning_rate": 1.5120489157499939e-05, + "loss": 0.0078, + "num_input_tokens_seen": 25566272, + "step": 121150 + }, + { + "epoch": 13.328382838283828, + "grad_norm": 0.00958110112696886, + "learning_rate": 1.5118284492370044e-05, + "loss": 0.0795, + "num_input_tokens_seen": 25567392, + "step": 121155 + }, + { + "epoch": 13.328932893289329, + "grad_norm": 0.724082350730896, + "learning_rate": 1.511607991831519e-05, + "loss": 0.0799, + "num_input_tokens_seen": 25568512, + "step": 121160 + }, + { + "epoch": 13.32948294829483, + "grad_norm": 0.3083716332912445, + "learning_rate": 1.511387543535568e-05, + "loss": 0.0346, + "num_input_tokens_seen": 25569568, + "step": 121165 + }, + { + "epoch": 13.33003300330033, + "grad_norm": 0.004040246829390526, + "learning_rate": 1.5111671043511827e-05, + "loss": 0.0042, + "num_input_tokens_seen": 25570624, + "step": 121170 + }, + { + "epoch": 13.33058305830583, + "grad_norm": 0.00418048957362771, + "learning_rate": 1.5109466742803971e-05, + "loss": 0.0017, + "num_input_tokens_seen": 25571648, + "step": 121175 + }, + { + "epoch": 13.331133113311331, + "grad_norm": 0.0018192483112215996, + "learning_rate": 1.5107262533252404e-05, + "loss": 0.0392, + "num_input_tokens_seen": 25572736, + "step": 121180 + }, + { + "epoch": 13.331683168316832, + "grad_norm": 0.03625437244772911, + "learning_rate": 1.5105058414877455e-05, + "loss": 0.0007, + "num_input_tokens_seen": 25573856, + "step": 121185 + }, + { + "epoch": 13.332233223322332, + "grad_norm": 0.09237545728683472, + "learning_rate": 1.5102854387699436e-05, + "loss": 0.1529, + "num_input_tokens_seen": 25574912, + "step": 121190 + }, + { + "epoch": 13.332783278327833, + "grad_norm": 0.057383451610803604, + "learning_rate": 1.5100650451738652e-05, + "loss": 0.0009, + "num_input_tokens_seen": 25575968, + "step": 121195 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 2.0670595169067383, + "learning_rate": 1.5098446607015435e-05, + "loss": 0.1967, + "num_input_tokens_seen": 25577024, + "step": 121200 + }, + { + "epoch": 13.333883388338833, + "grad_norm": 0.017400525510311127, + "learning_rate": 1.5096242853550072e-05, + "loss": 0.0035, + "num_input_tokens_seen": 25578016, + "step": 121205 + }, + { + "epoch": 13.334433443344334, + "grad_norm": 0.3982073962688446, + "learning_rate": 1.5094039191362891e-05, + "loss": 0.058, + "num_input_tokens_seen": 25579104, + "step": 121210 + }, + { + "epoch": 13.334983498349835, + "grad_norm": 0.17491331696510315, + "learning_rate": 1.5091835620474205e-05, + "loss": 0.0301, + "num_input_tokens_seen": 25580160, + "step": 121215 + }, + { + "epoch": 13.335533553355335, + "grad_norm": 0.021167118102312088, + "learning_rate": 1.5089632140904306e-05, + "loss": 0.0014, + "num_input_tokens_seen": 25581184, + "step": 121220 + }, + { + "epoch": 13.336083608360836, + "grad_norm": 0.025958670303225517, + "learning_rate": 1.5087428752673516e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25582240, + "step": 121225 + }, + { + "epoch": 13.336633663366337, + "grad_norm": 0.2535720467567444, + "learning_rate": 1.5085225455802131e-05, + "loss": 0.0215, + "num_input_tokens_seen": 25583360, + "step": 121230 + }, + { + "epoch": 13.337183718371838, + "grad_norm": 0.06138727068901062, + "learning_rate": 1.508302225031048e-05, + "loss": 0.0096, + "num_input_tokens_seen": 25584352, + "step": 121235 + }, + { + "epoch": 13.337733773377337, + "grad_norm": 0.057009294629096985, + "learning_rate": 1.5080819136218846e-05, + "loss": 0.0022, + "num_input_tokens_seen": 25585408, + "step": 121240 + }, + { + "epoch": 13.338283828382838, + "grad_norm": 0.03746757283806801, + "learning_rate": 1.5078616113547538e-05, + "loss": 0.036, + "num_input_tokens_seen": 25586496, + "step": 121245 + }, + { + "epoch": 13.33883388338834, + "grad_norm": 0.011623544618487358, + "learning_rate": 1.507641318231687e-05, + "loss": 0.0024, + "num_input_tokens_seen": 25587584, + "step": 121250 + }, + { + "epoch": 13.339383938393839, + "grad_norm": 0.2470942586660385, + "learning_rate": 1.5074210342547145e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25588608, + "step": 121255 + }, + { + "epoch": 13.33993399339934, + "grad_norm": 0.08754722774028778, + "learning_rate": 1.507200759425865e-05, + "loss": 0.0218, + "num_input_tokens_seen": 25589664, + "step": 121260 + }, + { + "epoch": 13.340484048404841, + "grad_norm": 0.01174034085124731, + "learning_rate": 1.5069804937471702e-05, + "loss": 0.0754, + "num_input_tokens_seen": 25590752, + "step": 121265 + }, + { + "epoch": 13.34103410341034, + "grad_norm": 0.07756946235895157, + "learning_rate": 1.506760237220659e-05, + "loss": 0.0052, + "num_input_tokens_seen": 25591840, + "step": 121270 + }, + { + "epoch": 13.341584158415841, + "grad_norm": 0.08953718096017838, + "learning_rate": 1.5065399898483635e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25592864, + "step": 121275 + }, + { + "epoch": 13.342134213421343, + "grad_norm": 0.25537434220314026, + "learning_rate": 1.5063197516323116e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25593920, + "step": 121280 + }, + { + "epoch": 13.342684268426842, + "grad_norm": 0.046071190387010574, + "learning_rate": 1.506099522574533e-05, + "loss": 0.0022, + "num_input_tokens_seen": 25594944, + "step": 121285 + }, + { + "epoch": 13.343234323432343, + "grad_norm": 0.07907766848802567, + "learning_rate": 1.5058793026770596e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25596000, + "step": 121290 + }, + { + "epoch": 13.343784378437844, + "grad_norm": 0.21122217178344727, + "learning_rate": 1.5056590919419184e-05, + "loss": 0.0051, + "num_input_tokens_seen": 25597056, + "step": 121295 + }, + { + "epoch": 13.344334433443345, + "grad_norm": 0.004096193704754114, + "learning_rate": 1.5054388903711406e-05, + "loss": 0.002, + "num_input_tokens_seen": 25598112, + "step": 121300 + }, + { + "epoch": 13.344884488448844, + "grad_norm": 0.8303377628326416, + "learning_rate": 1.505218697966756e-05, + "loss": 0.0114, + "num_input_tokens_seen": 25599200, + "step": 121305 + }, + { + "epoch": 13.345434543454346, + "grad_norm": 0.0695909857749939, + "learning_rate": 1.5049985147307915e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25600224, + "step": 121310 + }, + { + "epoch": 13.345984598459847, + "grad_norm": 0.005262613296508789, + "learning_rate": 1.5047783406652798e-05, + "loss": 0.0044, + "num_input_tokens_seen": 25601216, + "step": 121315 + }, + { + "epoch": 13.346534653465346, + "grad_norm": 0.006789712235331535, + "learning_rate": 1.504558175772248e-05, + "loss": 0.013, + "num_input_tokens_seen": 25602240, + "step": 121320 + }, + { + "epoch": 13.347084708470847, + "grad_norm": 1.7820332050323486, + "learning_rate": 1.504338020053726e-05, + "loss": 0.2078, + "num_input_tokens_seen": 25603328, + "step": 121325 + }, + { + "epoch": 13.347634763476348, + "grad_norm": 0.5271599292755127, + "learning_rate": 1.5041178735117434e-05, + "loss": 0.0496, + "num_input_tokens_seen": 25604416, + "step": 121330 + }, + { + "epoch": 13.348184818481847, + "grad_norm": 0.017845068126916885, + "learning_rate": 1.5038977361483272e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25605408, + "step": 121335 + }, + { + "epoch": 13.348734873487349, + "grad_norm": 0.017848998308181763, + "learning_rate": 1.5036776079655084e-05, + "loss": 0.0406, + "num_input_tokens_seen": 25606464, + "step": 121340 + }, + { + "epoch": 13.34928492849285, + "grad_norm": 0.009429698809981346, + "learning_rate": 1.5034574889653142e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25607584, + "step": 121345 + }, + { + "epoch": 13.34983498349835, + "grad_norm": 0.0027867683675140142, + "learning_rate": 1.5032373791497754e-05, + "loss": 0.0294, + "num_input_tokens_seen": 25608672, + "step": 121350 + }, + { + "epoch": 13.35038503850385, + "grad_norm": 0.1381349265575409, + "learning_rate": 1.5030172785209187e-05, + "loss": 0.0043, + "num_input_tokens_seen": 25609696, + "step": 121355 + }, + { + "epoch": 13.350935093509351, + "grad_norm": 0.17195917665958405, + "learning_rate": 1.5027971870807727e-05, + "loss": 0.0495, + "num_input_tokens_seen": 25610752, + "step": 121360 + }, + { + "epoch": 13.351485148514852, + "grad_norm": 0.018165983259677887, + "learning_rate": 1.502577104831368e-05, + "loss": 0.0014, + "num_input_tokens_seen": 25611808, + "step": 121365 + }, + { + "epoch": 13.352035203520352, + "grad_norm": 0.03238287195563316, + "learning_rate": 1.50235703177473e-05, + "loss": 0.0006, + "num_input_tokens_seen": 25612928, + "step": 121370 + }, + { + "epoch": 13.352585258525853, + "grad_norm": 0.012660296633839607, + "learning_rate": 1.5021369679128894e-05, + "loss": 0.0008, + "num_input_tokens_seen": 25613984, + "step": 121375 + }, + { + "epoch": 13.353135313531354, + "grad_norm": 0.011678926646709442, + "learning_rate": 1.5019169132478737e-05, + "loss": 0.0031, + "num_input_tokens_seen": 25615008, + "step": 121380 + }, + { + "epoch": 13.353685368536853, + "grad_norm": 0.09001568704843521, + "learning_rate": 1.50169686778171e-05, + "loss": 0.0043, + "num_input_tokens_seen": 25616000, + "step": 121385 + }, + { + "epoch": 13.354235423542354, + "grad_norm": 0.025108572095632553, + "learning_rate": 1.5014768315164285e-05, + "loss": 0.0428, + "num_input_tokens_seen": 25617088, + "step": 121390 + }, + { + "epoch": 13.354785478547855, + "grad_norm": 0.021705660969018936, + "learning_rate": 1.5012568044540548e-05, + "loss": 0.0495, + "num_input_tokens_seen": 25618112, + "step": 121395 + }, + { + "epoch": 13.355335533553355, + "grad_norm": 4.908166408538818, + "learning_rate": 1.5010367865966183e-05, + "loss": 0.1431, + "num_input_tokens_seen": 25619168, + "step": 121400 + }, + { + "epoch": 13.355885588558856, + "grad_norm": 0.00805138610303402, + "learning_rate": 1.5008167779461468e-05, + "loss": 0.0009, + "num_input_tokens_seen": 25620288, + "step": 121405 + }, + { + "epoch": 13.356435643564357, + "grad_norm": 0.01660512387752533, + "learning_rate": 1.5005967785046668e-05, + "loss": 0.0106, + "num_input_tokens_seen": 25621344, + "step": 121410 + }, + { + "epoch": 13.356985698569858, + "grad_norm": 0.01051709707826376, + "learning_rate": 1.500376788274207e-05, + "loss": 0.0106, + "num_input_tokens_seen": 25622432, + "step": 121415 + }, + { + "epoch": 13.357535753575357, + "grad_norm": 0.020178036764264107, + "learning_rate": 1.5001568072567945e-05, + "loss": 0.0465, + "num_input_tokens_seen": 25623456, + "step": 121420 + }, + { + "epoch": 13.358085808580858, + "grad_norm": 0.003873707726597786, + "learning_rate": 1.499936835454458e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25624480, + "step": 121425 + }, + { + "epoch": 13.35863586358636, + "grad_norm": 0.008874798193573952, + "learning_rate": 1.499716872869223e-05, + "loss": 0.0068, + "num_input_tokens_seen": 25625504, + "step": 121430 + }, + { + "epoch": 13.359185918591859, + "grad_norm": 0.0030574286356568336, + "learning_rate": 1.4994969195031172e-05, + "loss": 0.011, + "num_input_tokens_seen": 25626560, + "step": 121435 + }, + { + "epoch": 13.35973597359736, + "grad_norm": 0.010684583336114883, + "learning_rate": 1.4992769753581688e-05, + "loss": 0.0894, + "num_input_tokens_seen": 25627648, + "step": 121440 + }, + { + "epoch": 13.36028602860286, + "grad_norm": 0.020320730283856392, + "learning_rate": 1.4990570404364045e-05, + "loss": 0.129, + "num_input_tokens_seen": 25628640, + "step": 121445 + }, + { + "epoch": 13.36083608360836, + "grad_norm": 0.509706437587738, + "learning_rate": 1.4988371147398506e-05, + "loss": 0.0395, + "num_input_tokens_seen": 25629728, + "step": 121450 + }, + { + "epoch": 13.361386138613861, + "grad_norm": 0.05794699117541313, + "learning_rate": 1.4986171982705346e-05, + "loss": 0.0089, + "num_input_tokens_seen": 25630784, + "step": 121455 + }, + { + "epoch": 13.361936193619362, + "grad_norm": 0.013668258674442768, + "learning_rate": 1.4983972910304834e-05, + "loss": 0.1135, + "num_input_tokens_seen": 25631808, + "step": 121460 + }, + { + "epoch": 13.362486248624862, + "grad_norm": 0.035585954785346985, + "learning_rate": 1.4981773930217243e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25632832, + "step": 121465 + }, + { + "epoch": 13.363036303630363, + "grad_norm": 2.4923465251922607, + "learning_rate": 1.4979575042462835e-05, + "loss": 0.03, + "num_input_tokens_seen": 25633920, + "step": 121470 + }, + { + "epoch": 13.363586358635864, + "grad_norm": 0.00974262598901987, + "learning_rate": 1.4977376247061865e-05, + "loss": 0.0296, + "num_input_tokens_seen": 25635008, + "step": 121475 + }, + { + "epoch": 13.364136413641365, + "grad_norm": 0.020096896216273308, + "learning_rate": 1.497517754403462e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25636032, + "step": 121480 + }, + { + "epoch": 13.364686468646864, + "grad_norm": 0.3212309181690216, + "learning_rate": 1.4972978933401343e-05, + "loss": 0.009, + "num_input_tokens_seen": 25637088, + "step": 121485 + }, + { + "epoch": 13.365236523652365, + "grad_norm": 0.5319961905479431, + "learning_rate": 1.4970780415182318e-05, + "loss": 0.0441, + "num_input_tokens_seen": 25638176, + "step": 121490 + }, + { + "epoch": 13.365786578657866, + "grad_norm": 1.5127348899841309, + "learning_rate": 1.496858198939779e-05, + "loss": 0.017, + "num_input_tokens_seen": 25639328, + "step": 121495 + }, + { + "epoch": 13.366336633663366, + "grad_norm": 1.9241210222244263, + "learning_rate": 1.4966383656068029e-05, + "loss": 0.0697, + "num_input_tokens_seen": 25640448, + "step": 121500 + }, + { + "epoch": 13.366886688668867, + "grad_norm": 0.01725032925605774, + "learning_rate": 1.4964185415213303e-05, + "loss": 0.0033, + "num_input_tokens_seen": 25641472, + "step": 121505 + }, + { + "epoch": 13.367436743674368, + "grad_norm": 0.08673596382141113, + "learning_rate": 1.4961987266853855e-05, + "loss": 0.0022, + "num_input_tokens_seen": 25642528, + "step": 121510 + }, + { + "epoch": 13.367986798679867, + "grad_norm": 0.05233996734023094, + "learning_rate": 1.4959789211009956e-05, + "loss": 0.0024, + "num_input_tokens_seen": 25643552, + "step": 121515 + }, + { + "epoch": 13.368536853685368, + "grad_norm": 0.11885521560907364, + "learning_rate": 1.4957591247701869e-05, + "loss": 0.0026, + "num_input_tokens_seen": 25644576, + "step": 121520 + }, + { + "epoch": 13.36908690869087, + "grad_norm": 0.027564050629734993, + "learning_rate": 1.4955393376949833e-05, + "loss": 0.001, + "num_input_tokens_seen": 25645568, + "step": 121525 + }, + { + "epoch": 13.369636963696369, + "grad_norm": 0.029131250455975533, + "learning_rate": 1.4953195598774125e-05, + "loss": 0.0661, + "num_input_tokens_seen": 25646592, + "step": 121530 + }, + { + "epoch": 13.37018701870187, + "grad_norm": 0.03359561786055565, + "learning_rate": 1.4950997913194987e-05, + "loss": 0.2027, + "num_input_tokens_seen": 25647584, + "step": 121535 + }, + { + "epoch": 13.370737073707371, + "grad_norm": 0.09308013319969177, + "learning_rate": 1.4948800320232686e-05, + "loss": 0.0076, + "num_input_tokens_seen": 25648608, + "step": 121540 + }, + { + "epoch": 13.371287128712872, + "grad_norm": 0.014979906380176544, + "learning_rate": 1.4946602819907468e-05, + "loss": 0.0239, + "num_input_tokens_seen": 25649664, + "step": 121545 + }, + { + "epoch": 13.371837183718371, + "grad_norm": 0.3289802372455597, + "learning_rate": 1.494440541223958e-05, + "loss": 0.007, + "num_input_tokens_seen": 25650720, + "step": 121550 + }, + { + "epoch": 13.372387238723872, + "grad_norm": 0.03273859992623329, + "learning_rate": 1.4942208097249288e-05, + "loss": 0.0532, + "num_input_tokens_seen": 25651776, + "step": 121555 + }, + { + "epoch": 13.372937293729374, + "grad_norm": 0.03453008830547333, + "learning_rate": 1.494001087495683e-05, + "loss": 0.0231, + "num_input_tokens_seen": 25652832, + "step": 121560 + }, + { + "epoch": 13.373487348734873, + "grad_norm": 0.014563057571649551, + "learning_rate": 1.4937813745382478e-05, + "loss": 0.0019, + "num_input_tokens_seen": 25653920, + "step": 121565 + }, + { + "epoch": 13.374037403740374, + "grad_norm": 0.04901229590177536, + "learning_rate": 1.4935616708546462e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25654976, + "step": 121570 + }, + { + "epoch": 13.374587458745875, + "grad_norm": 0.03629901260137558, + "learning_rate": 1.4933419764469034e-05, + "loss": 0.004, + "num_input_tokens_seen": 25655968, + "step": 121575 + }, + { + "epoch": 13.375137513751374, + "grad_norm": 3.090639591217041, + "learning_rate": 1.4931222913170456e-05, + "loss": 0.0454, + "num_input_tokens_seen": 25656992, + "step": 121580 + }, + { + "epoch": 13.375687568756875, + "grad_norm": 5.6557817459106445, + "learning_rate": 1.4929026154670952e-05, + "loss": 0.1798, + "num_input_tokens_seen": 25658048, + "step": 121585 + }, + { + "epoch": 13.376237623762377, + "grad_norm": 0.026585353538393974, + "learning_rate": 1.492682948899079e-05, + "loss": 0.008, + "num_input_tokens_seen": 25659104, + "step": 121590 + }, + { + "epoch": 13.376787678767876, + "grad_norm": 0.05850420147180557, + "learning_rate": 1.4924632916150208e-05, + "loss": 0.0013, + "num_input_tokens_seen": 25660192, + "step": 121595 + }, + { + "epoch": 13.377337733773377, + "grad_norm": 0.006805990356951952, + "learning_rate": 1.492243643616944e-05, + "loss": 0.005, + "num_input_tokens_seen": 25661216, + "step": 121600 + }, + { + "epoch": 13.377887788778878, + "grad_norm": 0.059285011142492294, + "learning_rate": 1.4920240049068748e-05, + "loss": 0.0198, + "num_input_tokens_seen": 25662240, + "step": 121605 + }, + { + "epoch": 13.37843784378438, + "grad_norm": 2.172950267791748, + "learning_rate": 1.491804375486836e-05, + "loss": 0.0492, + "num_input_tokens_seen": 25663296, + "step": 121610 + }, + { + "epoch": 13.378987898789878, + "grad_norm": 0.06431090086698532, + "learning_rate": 1.491584755358853e-05, + "loss": 0.0076, + "num_input_tokens_seen": 25664352, + "step": 121615 + }, + { + "epoch": 13.37953795379538, + "grad_norm": 0.08255041390657425, + "learning_rate": 1.49136514452495e-05, + "loss": 0.0038, + "num_input_tokens_seen": 25665376, + "step": 121620 + }, + { + "epoch": 13.38008800880088, + "grad_norm": 0.17197208106517792, + "learning_rate": 1.4911455429871496e-05, + "loss": 0.0512, + "num_input_tokens_seen": 25666464, + "step": 121625 + }, + { + "epoch": 13.38063806380638, + "grad_norm": 0.022762980312108994, + "learning_rate": 1.4909259507474765e-05, + "loss": 0.0478, + "num_input_tokens_seen": 25667520, + "step": 121630 + }, + { + "epoch": 13.381188118811881, + "grad_norm": 0.04065529257059097, + "learning_rate": 1.4907063678079558e-05, + "loss": 0.0046, + "num_input_tokens_seen": 25668512, + "step": 121635 + }, + { + "epoch": 13.381738173817382, + "grad_norm": 0.012137919664382935, + "learning_rate": 1.490486794170609e-05, + "loss": 0.0453, + "num_input_tokens_seen": 25669664, + "step": 121640 + }, + { + "epoch": 13.382288228822881, + "grad_norm": 0.903899073600769, + "learning_rate": 1.4902672298374617e-05, + "loss": 0.1394, + "num_input_tokens_seen": 25670816, + "step": 121645 + }, + { + "epoch": 13.382838283828383, + "grad_norm": 2.058558225631714, + "learning_rate": 1.4900476748105362e-05, + "loss": 0.0635, + "num_input_tokens_seen": 25671840, + "step": 121650 + }, + { + "epoch": 13.383388338833884, + "grad_norm": 0.008590099401772022, + "learning_rate": 1.4898281290918576e-05, + "loss": 0.0361, + "num_input_tokens_seen": 25673024, + "step": 121655 + }, + { + "epoch": 13.383938393839385, + "grad_norm": 0.10475532710552216, + "learning_rate": 1.489608592683448e-05, + "loss": 0.0038, + "num_input_tokens_seen": 25674112, + "step": 121660 + }, + { + "epoch": 13.384488448844884, + "grad_norm": 0.03402800112962723, + "learning_rate": 1.4893890655873306e-05, + "loss": 0.0041, + "num_input_tokens_seen": 25675168, + "step": 121665 + }, + { + "epoch": 13.385038503850385, + "grad_norm": 0.21171724796295166, + "learning_rate": 1.4891695478055304e-05, + "loss": 0.0484, + "num_input_tokens_seen": 25676192, + "step": 121670 + }, + { + "epoch": 13.385588558855886, + "grad_norm": 0.14475367963314056, + "learning_rate": 1.4889500393400677e-05, + "loss": 0.0117, + "num_input_tokens_seen": 25677216, + "step": 121675 + }, + { + "epoch": 13.386138613861386, + "grad_norm": 0.07012619823217392, + "learning_rate": 1.488730540192969e-05, + "loss": 0.0014, + "num_input_tokens_seen": 25678304, + "step": 121680 + }, + { + "epoch": 13.386688668866887, + "grad_norm": 1.685553789138794, + "learning_rate": 1.488511050366255e-05, + "loss": 0.0182, + "num_input_tokens_seen": 25679360, + "step": 121685 + }, + { + "epoch": 13.387238723872388, + "grad_norm": 0.023267488926649094, + "learning_rate": 1.4882915698619487e-05, + "loss": 0.0785, + "num_input_tokens_seen": 25680448, + "step": 121690 + }, + { + "epoch": 13.387788778877887, + "grad_norm": 0.018727941438555717, + "learning_rate": 1.488072098682075e-05, + "loss": 0.0956, + "num_input_tokens_seen": 25681504, + "step": 121695 + }, + { + "epoch": 13.388338833883388, + "grad_norm": 0.8631610870361328, + "learning_rate": 1.4878526368286538e-05, + "loss": 0.01, + "num_input_tokens_seen": 25682528, + "step": 121700 + }, + { + "epoch": 13.38888888888889, + "grad_norm": 0.015507700853049755, + "learning_rate": 1.48763318430371e-05, + "loss": 0.0277, + "num_input_tokens_seen": 25683616, + "step": 121705 + }, + { + "epoch": 13.389438943894389, + "grad_norm": 0.20549525320529938, + "learning_rate": 1.4874137411092653e-05, + "loss": 0.0783, + "num_input_tokens_seen": 25684640, + "step": 121710 + }, + { + "epoch": 13.38998899889989, + "grad_norm": 2.7955451011657715, + "learning_rate": 1.4871943072473418e-05, + "loss": 0.0404, + "num_input_tokens_seen": 25685664, + "step": 121715 + }, + { + "epoch": 13.39053905390539, + "grad_norm": 0.38406726717948914, + "learning_rate": 1.4869748827199626e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25686752, + "step": 121720 + }, + { + "epoch": 13.391089108910892, + "grad_norm": 0.15427836775779724, + "learning_rate": 1.4867554675291495e-05, + "loss": 0.0466, + "num_input_tokens_seen": 25687808, + "step": 121725 + }, + { + "epoch": 13.391639163916391, + "grad_norm": 0.02262415550649166, + "learning_rate": 1.4865360616769259e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25688864, + "step": 121730 + }, + { + "epoch": 13.392189218921892, + "grad_norm": 0.006975209340453148, + "learning_rate": 1.4863166651653126e-05, + "loss": 0.0018, + "num_input_tokens_seen": 25689856, + "step": 121735 + }, + { + "epoch": 13.392739273927393, + "grad_norm": 0.011003333143889904, + "learning_rate": 1.4860972779963317e-05, + "loss": 0.03, + "num_input_tokens_seen": 25690912, + "step": 121740 + }, + { + "epoch": 13.393289328932893, + "grad_norm": 0.04765262082219124, + "learning_rate": 1.4858779001720063e-05, + "loss": 0.0321, + "num_input_tokens_seen": 25691872, + "step": 121745 + }, + { + "epoch": 13.393839383938394, + "grad_norm": 0.025711238384246826, + "learning_rate": 1.4856585316943573e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25692928, + "step": 121750 + }, + { + "epoch": 13.394389438943895, + "grad_norm": 2.9943201541900635, + "learning_rate": 1.4854391725654076e-05, + "loss": 0.0974, + "num_input_tokens_seen": 25693952, + "step": 121755 + }, + { + "epoch": 13.394939493949394, + "grad_norm": 0.015207544900476933, + "learning_rate": 1.4852198227871778e-05, + "loss": 0.0023, + "num_input_tokens_seen": 25695040, + "step": 121760 + }, + { + "epoch": 13.395489548954895, + "grad_norm": 0.009334784001111984, + "learning_rate": 1.4850004823616892e-05, + "loss": 0.0437, + "num_input_tokens_seen": 25696096, + "step": 121765 + }, + { + "epoch": 13.396039603960396, + "grad_norm": 0.012976765632629395, + "learning_rate": 1.4847811512909655e-05, + "loss": 0.0009, + "num_input_tokens_seen": 25697152, + "step": 121770 + }, + { + "epoch": 13.396589658965897, + "grad_norm": 0.059200797230005264, + "learning_rate": 1.4845618295770256e-05, + "loss": 0.064, + "num_input_tokens_seen": 25698272, + "step": 121775 + }, + { + "epoch": 13.397139713971397, + "grad_norm": 0.005864364560693502, + "learning_rate": 1.4843425172218928e-05, + "loss": 0.1227, + "num_input_tokens_seen": 25699360, + "step": 121780 + }, + { + "epoch": 13.397689768976898, + "grad_norm": 0.1231144517660141, + "learning_rate": 1.4841232142275879e-05, + "loss": 0.0055, + "num_input_tokens_seen": 25700352, + "step": 121785 + }, + { + "epoch": 13.398239823982399, + "grad_norm": 0.1893797665834427, + "learning_rate": 1.4839039205961302e-05, + "loss": 0.0051, + "num_input_tokens_seen": 25701408, + "step": 121790 + }, + { + "epoch": 13.398789878987898, + "grad_norm": 0.025126857683062553, + "learning_rate": 1.4836846363295442e-05, + "loss": 0.0053, + "num_input_tokens_seen": 25702432, + "step": 121795 + }, + { + "epoch": 13.3993399339934, + "grad_norm": 3.2050249576568604, + "learning_rate": 1.4834653614298482e-05, + "loss": 0.0188, + "num_input_tokens_seen": 25703456, + "step": 121800 + }, + { + "epoch": 13.3998899889989, + "grad_norm": 0.012841659598052502, + "learning_rate": 1.4832460958990648e-05, + "loss": 0.0738, + "num_input_tokens_seen": 25704544, + "step": 121805 + }, + { + "epoch": 13.4004400440044, + "grad_norm": 0.030670883134007454, + "learning_rate": 1.4830268397392146e-05, + "loss": 0.0246, + "num_input_tokens_seen": 25705632, + "step": 121810 + }, + { + "epoch": 13.400990099009901, + "grad_norm": 0.05257135257124901, + "learning_rate": 1.4828075929523171e-05, + "loss": 0.0108, + "num_input_tokens_seen": 25706656, + "step": 121815 + }, + { + "epoch": 13.401540154015402, + "grad_norm": 0.02442670427262783, + "learning_rate": 1.4825883555403947e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25707744, + "step": 121820 + }, + { + "epoch": 13.402090209020901, + "grad_norm": 0.07723961770534515, + "learning_rate": 1.4823691275054675e-05, + "loss": 0.0983, + "num_input_tokens_seen": 25708800, + "step": 121825 + }, + { + "epoch": 13.402640264026402, + "grad_norm": 0.28761065006256104, + "learning_rate": 1.4821499088495543e-05, + "loss": 0.0047, + "num_input_tokens_seen": 25709888, + "step": 121830 + }, + { + "epoch": 13.403190319031903, + "grad_norm": 0.010266097262501717, + "learning_rate": 1.481930699574678e-05, + "loss": 0.0031, + "num_input_tokens_seen": 25711008, + "step": 121835 + }, + { + "epoch": 13.403740374037405, + "grad_norm": 0.40841686725616455, + "learning_rate": 1.4817114996828573e-05, + "loss": 0.0225, + "num_input_tokens_seen": 25712032, + "step": 121840 + }, + { + "epoch": 13.404290429042904, + "grad_norm": 0.37446147203445435, + "learning_rate": 1.481492309176114e-05, + "loss": 0.0173, + "num_input_tokens_seen": 25713088, + "step": 121845 + }, + { + "epoch": 13.404840484048405, + "grad_norm": 0.025501888245344162, + "learning_rate": 1.481273128056467e-05, + "loss": 0.0007, + "num_input_tokens_seen": 25714112, + "step": 121850 + }, + { + "epoch": 13.405390539053906, + "grad_norm": 0.08594174683094025, + "learning_rate": 1.4810539563259363e-05, + "loss": 0.0045, + "num_input_tokens_seen": 25715200, + "step": 121855 + }, + { + "epoch": 13.405940594059405, + "grad_norm": 0.24989919364452362, + "learning_rate": 1.4808347939865428e-05, + "loss": 0.004, + "num_input_tokens_seen": 25716256, + "step": 121860 + }, + { + "epoch": 13.406490649064907, + "grad_norm": 0.1963845044374466, + "learning_rate": 1.4806156410403055e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25717376, + "step": 121865 + }, + { + "epoch": 13.407040704070408, + "grad_norm": 0.11867985874414444, + "learning_rate": 1.4803964974892453e-05, + "loss": 0.0231, + "num_input_tokens_seen": 25718464, + "step": 121870 + }, + { + "epoch": 13.407590759075907, + "grad_norm": 0.026173803955316544, + "learning_rate": 1.480177363335381e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25719584, + "step": 121875 + }, + { + "epoch": 13.408140814081408, + "grad_norm": 0.052950311452150345, + "learning_rate": 1.4799582385807326e-05, + "loss": 0.005, + "num_input_tokens_seen": 25720608, + "step": 121880 + }, + { + "epoch": 13.408690869086909, + "grad_norm": 0.11444168537855148, + "learning_rate": 1.47973912322732e-05, + "loss": 0.0135, + "num_input_tokens_seen": 25721632, + "step": 121885 + }, + { + "epoch": 13.409240924092408, + "grad_norm": 1.1237870454788208, + "learning_rate": 1.4795200172771618e-05, + "loss": 0.0171, + "num_input_tokens_seen": 25722720, + "step": 121890 + }, + { + "epoch": 13.40979097909791, + "grad_norm": 0.22200052440166473, + "learning_rate": 1.4793009207322781e-05, + "loss": 0.0151, + "num_input_tokens_seen": 25723776, + "step": 121895 + }, + { + "epoch": 13.41034103410341, + "grad_norm": 0.018772467970848083, + "learning_rate": 1.4790818335946885e-05, + "loss": 0.002, + "num_input_tokens_seen": 25724832, + "step": 121900 + }, + { + "epoch": 13.410891089108912, + "grad_norm": 0.006998189724981785, + "learning_rate": 1.4788627558664109e-05, + "loss": 0.0026, + "num_input_tokens_seen": 25725888, + "step": 121905 + }, + { + "epoch": 13.411441144114411, + "grad_norm": 0.06733861565589905, + "learning_rate": 1.4786436875494658e-05, + "loss": 0.0053, + "num_input_tokens_seen": 25726976, + "step": 121910 + }, + { + "epoch": 13.411991199119912, + "grad_norm": 0.031902097165584564, + "learning_rate": 1.4784246286458713e-05, + "loss": 0.0018, + "num_input_tokens_seen": 25727968, + "step": 121915 + }, + { + "epoch": 13.412541254125413, + "grad_norm": 0.008345665410161018, + "learning_rate": 1.4782055791576471e-05, + "loss": 0.0163, + "num_input_tokens_seen": 25729120, + "step": 121920 + }, + { + "epoch": 13.413091309130913, + "grad_norm": 1.674123764038086, + "learning_rate": 1.4779865390868126e-05, + "loss": 0.0488, + "num_input_tokens_seen": 25730144, + "step": 121925 + }, + { + "epoch": 13.413641364136414, + "grad_norm": 0.011546154506504536, + "learning_rate": 1.4777675084353842e-05, + "loss": 0.0294, + "num_input_tokens_seen": 25731200, + "step": 121930 + }, + { + "epoch": 13.414191419141915, + "grad_norm": 0.11716082692146301, + "learning_rate": 1.4775484872053832e-05, + "loss": 0.0043, + "num_input_tokens_seen": 25732256, + "step": 121935 + }, + { + "epoch": 13.414741474147414, + "grad_norm": 0.0127512626349926, + "learning_rate": 1.4773294753988265e-05, + "loss": 0.0053, + "num_input_tokens_seen": 25733312, + "step": 121940 + }, + { + "epoch": 13.415291529152915, + "grad_norm": 0.0019541513174772263, + "learning_rate": 1.4771104730177343e-05, + "loss": 0.0046, + "num_input_tokens_seen": 25734368, + "step": 121945 + }, + { + "epoch": 13.415841584158416, + "grad_norm": 1.5737788677215576, + "learning_rate": 1.4768914800641237e-05, + "loss": 0.0395, + "num_input_tokens_seen": 25735424, + "step": 121950 + }, + { + "epoch": 13.416391639163916, + "grad_norm": 0.013607924804091454, + "learning_rate": 1.4766724965400124e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25736448, + "step": 121955 + }, + { + "epoch": 13.416941694169417, + "grad_norm": 0.0851660892367363, + "learning_rate": 1.4764535224474214e-05, + "loss": 0.0008, + "num_input_tokens_seen": 25737504, + "step": 121960 + }, + { + "epoch": 13.417491749174918, + "grad_norm": 0.013817057013511658, + "learning_rate": 1.4762345577883655e-05, + "loss": 0.002, + "num_input_tokens_seen": 25738560, + "step": 121965 + }, + { + "epoch": 13.418041804180419, + "grad_norm": 0.006480727344751358, + "learning_rate": 1.4760156025648653e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25739680, + "step": 121970 + }, + { + "epoch": 13.418591859185918, + "grad_norm": 4.634519100189209, + "learning_rate": 1.4757966567789378e-05, + "loss": 0.0611, + "num_input_tokens_seen": 25740768, + "step": 121975 + }, + { + "epoch": 13.41914191419142, + "grad_norm": 0.22724558413028717, + "learning_rate": 1.4755777204326004e-05, + "loss": 0.0056, + "num_input_tokens_seen": 25741856, + "step": 121980 + }, + { + "epoch": 13.41969196919692, + "grad_norm": 0.07177133858203888, + "learning_rate": 1.4753587935278725e-05, + "loss": 0.0045, + "num_input_tokens_seen": 25742848, + "step": 121985 + }, + { + "epoch": 13.42024202420242, + "grad_norm": 0.04259601980447769, + "learning_rate": 1.4751398760667703e-05, + "loss": 0.0092, + "num_input_tokens_seen": 25743904, + "step": 121990 + }, + { + "epoch": 13.42079207920792, + "grad_norm": 0.26794183254241943, + "learning_rate": 1.4749209680513123e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25744992, + "step": 121995 + }, + { + "epoch": 13.421342134213422, + "grad_norm": 0.0065993391908705235, + "learning_rate": 1.4747020694835168e-05, + "loss": 0.0017, + "num_input_tokens_seen": 25746016, + "step": 122000 + }, + { + "epoch": 13.421892189218921, + "grad_norm": 0.00826339889317751, + "learning_rate": 1.474483180365399e-05, + "loss": 0.0084, + "num_input_tokens_seen": 25747104, + "step": 122005 + }, + { + "epoch": 13.422442244224422, + "grad_norm": 0.11648602783679962, + "learning_rate": 1.4742643006989781e-05, + "loss": 0.0619, + "num_input_tokens_seen": 25748192, + "step": 122010 + }, + { + "epoch": 13.422992299229923, + "grad_norm": 0.019407624378800392, + "learning_rate": 1.4740454304862716e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25749280, + "step": 122015 + }, + { + "epoch": 13.423542354235423, + "grad_norm": 0.055679675191640854, + "learning_rate": 1.4738265697292947e-05, + "loss": 0.0934, + "num_input_tokens_seen": 25750368, + "step": 122020 + }, + { + "epoch": 13.424092409240924, + "grad_norm": 0.243874192237854, + "learning_rate": 1.4736077184300668e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25751392, + "step": 122025 + }, + { + "epoch": 13.424642464246425, + "grad_norm": 0.01959337294101715, + "learning_rate": 1.4733888765906034e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25752448, + "step": 122030 + }, + { + "epoch": 13.425192519251926, + "grad_norm": 0.00626281276345253, + "learning_rate": 1.4731700442129226e-05, + "loss": 0.0797, + "num_input_tokens_seen": 25753504, + "step": 122035 + }, + { + "epoch": 13.425742574257425, + "grad_norm": 0.04402102157473564, + "learning_rate": 1.4729512212990415e-05, + "loss": 0.0694, + "num_input_tokens_seen": 25754528, + "step": 122040 + }, + { + "epoch": 13.426292629262926, + "grad_norm": 0.08292177319526672, + "learning_rate": 1.4727324078509749e-05, + "loss": 0.0021, + "num_input_tokens_seen": 25755584, + "step": 122045 + }, + { + "epoch": 13.426842684268427, + "grad_norm": 0.019029412418603897, + "learning_rate": 1.4725136038707413e-05, + "loss": 0.0017, + "num_input_tokens_seen": 25756640, + "step": 122050 + }, + { + "epoch": 13.427392739273927, + "grad_norm": 0.0973803848028183, + "learning_rate": 1.4722948093603563e-05, + "loss": 0.0054, + "num_input_tokens_seen": 25757664, + "step": 122055 + }, + { + "epoch": 13.427942794279428, + "grad_norm": 0.04236505553126335, + "learning_rate": 1.4720760243218379e-05, + "loss": 0.1184, + "num_input_tokens_seen": 25758688, + "step": 122060 + }, + { + "epoch": 13.428492849284929, + "grad_norm": 0.006682112812995911, + "learning_rate": 1.4718572487572014e-05, + "loss": 0.009, + "num_input_tokens_seen": 25759776, + "step": 122065 + }, + { + "epoch": 13.429042904290428, + "grad_norm": 4.905400276184082, + "learning_rate": 1.471638482668462e-05, + "loss": 0.065, + "num_input_tokens_seen": 25760864, + "step": 122070 + }, + { + "epoch": 13.42959295929593, + "grad_norm": 0.02208196371793747, + "learning_rate": 1.471419726057639e-05, + "loss": 0.0405, + "num_input_tokens_seen": 25761888, + "step": 122075 + }, + { + "epoch": 13.43014301430143, + "grad_norm": 0.041058141738176346, + "learning_rate": 1.4712009789267455e-05, + "loss": 0.0967, + "num_input_tokens_seen": 25762976, + "step": 122080 + }, + { + "epoch": 13.430693069306932, + "grad_norm": 0.0024052022490650415, + "learning_rate": 1.4709822412777996e-05, + "loss": 0.0203, + "num_input_tokens_seen": 25764000, + "step": 122085 + }, + { + "epoch": 13.43124312431243, + "grad_norm": 0.03508080542087555, + "learning_rate": 1.4707635131128166e-05, + "loss": 0.1114, + "num_input_tokens_seen": 25765056, + "step": 122090 + }, + { + "epoch": 13.431793179317932, + "grad_norm": 0.008805795572698116, + "learning_rate": 1.4705447944338114e-05, + "loss": 0.0014, + "num_input_tokens_seen": 25766144, + "step": 122095 + }, + { + "epoch": 13.432343234323433, + "grad_norm": 0.019046589732170105, + "learning_rate": 1.4703260852428025e-05, + "loss": 0.0418, + "num_input_tokens_seen": 25767232, + "step": 122100 + }, + { + "epoch": 13.432893289328932, + "grad_norm": 0.010293981991708279, + "learning_rate": 1.4701073855418025e-05, + "loss": 0.0101, + "num_input_tokens_seen": 25768320, + "step": 122105 + }, + { + "epoch": 13.433443344334433, + "grad_norm": 0.007854824885725975, + "learning_rate": 1.4698886953328292e-05, + "loss": 0.0154, + "num_input_tokens_seen": 25769376, + "step": 122110 + }, + { + "epoch": 13.433993399339935, + "grad_norm": 0.0029308416415005922, + "learning_rate": 1.4696700146178982e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25770400, + "step": 122115 + }, + { + "epoch": 13.434543454345434, + "grad_norm": 0.06482184678316116, + "learning_rate": 1.4694513433990231e-05, + "loss": 0.0177, + "num_input_tokens_seen": 25771456, + "step": 122120 + }, + { + "epoch": 13.435093509350935, + "grad_norm": 0.06975792348384857, + "learning_rate": 1.4692326816782208e-05, + "loss": 0.0034, + "num_input_tokens_seen": 25772544, + "step": 122125 + }, + { + "epoch": 13.435643564356436, + "grad_norm": 0.17943869531154633, + "learning_rate": 1.4690140294575062e-05, + "loss": 0.1495, + "num_input_tokens_seen": 25773568, + "step": 122130 + }, + { + "epoch": 13.436193619361935, + "grad_norm": 0.022088441997766495, + "learning_rate": 1.4687953867388952e-05, + "loss": 0.0168, + "num_input_tokens_seen": 25774624, + "step": 122135 + }, + { + "epoch": 13.436743674367436, + "grad_norm": 0.13237138092517853, + "learning_rate": 1.4685767535244018e-05, + "loss": 0.0349, + "num_input_tokens_seen": 25775680, + "step": 122140 + }, + { + "epoch": 13.437293729372938, + "grad_norm": 0.18217836320400238, + "learning_rate": 1.4683581298160413e-05, + "loss": 0.0023, + "num_input_tokens_seen": 25776736, + "step": 122145 + }, + { + "epoch": 13.437843784378439, + "grad_norm": 0.036177411675453186, + "learning_rate": 1.4681395156158296e-05, + "loss": 0.0102, + "num_input_tokens_seen": 25777856, + "step": 122150 + }, + { + "epoch": 13.438393839383938, + "grad_norm": 0.10821671038866043, + "learning_rate": 1.4679209109257797e-05, + "loss": 0.013, + "num_input_tokens_seen": 25778912, + "step": 122155 + }, + { + "epoch": 13.438943894389439, + "grad_norm": 1.3263741731643677, + "learning_rate": 1.4677023157479086e-05, + "loss": 0.0269, + "num_input_tokens_seen": 25779872, + "step": 122160 + }, + { + "epoch": 13.43949394939494, + "grad_norm": 0.04948689788579941, + "learning_rate": 1.4674837300842298e-05, + "loss": 0.1824, + "num_input_tokens_seen": 25780864, + "step": 122165 + }, + { + "epoch": 13.44004400440044, + "grad_norm": 0.005746161099523306, + "learning_rate": 1.467265153936757e-05, + "loss": 0.0006, + "num_input_tokens_seen": 25781888, + "step": 122170 + }, + { + "epoch": 13.44059405940594, + "grad_norm": 0.03032015822827816, + "learning_rate": 1.4670465873075074e-05, + "loss": 0.0041, + "num_input_tokens_seen": 25782912, + "step": 122175 + }, + { + "epoch": 13.441144114411442, + "grad_norm": 0.06013929471373558, + "learning_rate": 1.4668280301984923e-05, + "loss": 0.0787, + "num_input_tokens_seen": 25784000, + "step": 122180 + }, + { + "epoch": 13.441694169416941, + "grad_norm": 0.4080130457878113, + "learning_rate": 1.4666094826117283e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25785056, + "step": 122185 + }, + { + "epoch": 13.442244224422442, + "grad_norm": 1.046566128730774, + "learning_rate": 1.4663909445492291e-05, + "loss": 0.0146, + "num_input_tokens_seen": 25786112, + "step": 122190 + }, + { + "epoch": 13.442794279427943, + "grad_norm": 0.10329635441303253, + "learning_rate": 1.4661724160130076e-05, + "loss": 0.0016, + "num_input_tokens_seen": 25787168, + "step": 122195 + }, + { + "epoch": 13.443344334433444, + "grad_norm": 0.4140474200248718, + "learning_rate": 1.4659538970050793e-05, + "loss": 0.031, + "num_input_tokens_seen": 25788224, + "step": 122200 + }, + { + "epoch": 13.443894389438944, + "grad_norm": 0.006889081560075283, + "learning_rate": 1.4657353875274582e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25789312, + "step": 122205 + }, + { + "epoch": 13.444444444444445, + "grad_norm": 0.0446246974170208, + "learning_rate": 1.465516887582157e-05, + "loss": 0.0008, + "num_input_tokens_seen": 25790400, + "step": 122210 + }, + { + "epoch": 13.444994499449946, + "grad_norm": 0.07065709680318832, + "learning_rate": 1.4652983971711906e-05, + "loss": 0.0401, + "num_input_tokens_seen": 25791488, + "step": 122215 + }, + { + "epoch": 13.445544554455445, + "grad_norm": 2.4033336639404297, + "learning_rate": 1.465079916296572e-05, + "loss": 0.1802, + "num_input_tokens_seen": 25792544, + "step": 122220 + }, + { + "epoch": 13.446094609460946, + "grad_norm": 0.006230258848518133, + "learning_rate": 1.4648614449603154e-05, + "loss": 0.1226, + "num_input_tokens_seen": 25793632, + "step": 122225 + }, + { + "epoch": 13.446644664466447, + "grad_norm": 0.32450565695762634, + "learning_rate": 1.4646429831644348e-05, + "loss": 0.0041, + "num_input_tokens_seen": 25794688, + "step": 122230 + }, + { + "epoch": 13.447194719471947, + "grad_norm": 5.957287311553955, + "learning_rate": 1.4644245309109422e-05, + "loss": 0.0311, + "num_input_tokens_seen": 25795808, + "step": 122235 + }, + { + "epoch": 13.447744774477448, + "grad_norm": 0.3572753667831421, + "learning_rate": 1.464206088201852e-05, + "loss": 0.0287, + "num_input_tokens_seen": 25796928, + "step": 122240 + }, + { + "epoch": 13.448294829482949, + "grad_norm": 0.1015309989452362, + "learning_rate": 1.4639876550391773e-05, + "loss": 0.0064, + "num_input_tokens_seen": 25798048, + "step": 122245 + }, + { + "epoch": 13.448844884488448, + "grad_norm": 0.03989998251199722, + "learning_rate": 1.4637692314249319e-05, + "loss": 0.0064, + "num_input_tokens_seen": 25799072, + "step": 122250 + }, + { + "epoch": 13.44939493949395, + "grad_norm": 0.01673954352736473, + "learning_rate": 1.4635508173611276e-05, + "loss": 0.1983, + "num_input_tokens_seen": 25800128, + "step": 122255 + }, + { + "epoch": 13.44994499449945, + "grad_norm": 0.09311916679143906, + "learning_rate": 1.4633324128497778e-05, + "loss": 0.0674, + "num_input_tokens_seen": 25801216, + "step": 122260 + }, + { + "epoch": 13.450495049504951, + "grad_norm": 0.3369082808494568, + "learning_rate": 1.4631140178928965e-05, + "loss": 0.041, + "num_input_tokens_seen": 25802240, + "step": 122265 + }, + { + "epoch": 13.45104510451045, + "grad_norm": 0.006809399928897619, + "learning_rate": 1.4628956324924947e-05, + "loss": 0.0213, + "num_input_tokens_seen": 25803232, + "step": 122270 + }, + { + "epoch": 13.451595159515952, + "grad_norm": 0.27366045117378235, + "learning_rate": 1.462677256650587e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25804256, + "step": 122275 + }, + { + "epoch": 13.452145214521453, + "grad_norm": 0.023090019822120667, + "learning_rate": 1.4624588903691853e-05, + "loss": 0.0122, + "num_input_tokens_seen": 25805312, + "step": 122280 + }, + { + "epoch": 13.452695269526952, + "grad_norm": 0.051749009639024734, + "learning_rate": 1.4622405336503015e-05, + "loss": 0.0776, + "num_input_tokens_seen": 25806368, + "step": 122285 + }, + { + "epoch": 13.453245324532453, + "grad_norm": 0.0512290820479393, + "learning_rate": 1.4620221864959497e-05, + "loss": 0.003, + "num_input_tokens_seen": 25807488, + "step": 122290 + }, + { + "epoch": 13.453795379537954, + "grad_norm": 0.009034747257828712, + "learning_rate": 1.4618038489081403e-05, + "loss": 0.094, + "num_input_tokens_seen": 25808480, + "step": 122295 + }, + { + "epoch": 13.454345434543454, + "grad_norm": 0.0034423568286001682, + "learning_rate": 1.4615855208888874e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25809568, + "step": 122300 + }, + { + "epoch": 13.454895489548955, + "grad_norm": 3.376527786254883, + "learning_rate": 1.4613672024402025e-05, + "loss": 0.1737, + "num_input_tokens_seen": 25810656, + "step": 122305 + }, + { + "epoch": 13.455445544554456, + "grad_norm": 0.4234525263309479, + "learning_rate": 1.4611488935640972e-05, + "loss": 0.0039, + "num_input_tokens_seen": 25811744, + "step": 122310 + }, + { + "epoch": 13.455995599559955, + "grad_norm": 0.010032557882368565, + "learning_rate": 1.4609305942625843e-05, + "loss": 0.0177, + "num_input_tokens_seen": 25812832, + "step": 122315 + }, + { + "epoch": 13.456545654565456, + "grad_norm": 0.7331368923187256, + "learning_rate": 1.4607123045376753e-05, + "loss": 0.022, + "num_input_tokens_seen": 25813920, + "step": 122320 + }, + { + "epoch": 13.457095709570957, + "grad_norm": 0.11748532205820084, + "learning_rate": 1.4604940243913828e-05, + "loss": 0.0442, + "num_input_tokens_seen": 25815008, + "step": 122325 + }, + { + "epoch": 13.457645764576458, + "grad_norm": 0.04007377475500107, + "learning_rate": 1.4602757538257179e-05, + "loss": 0.0334, + "num_input_tokens_seen": 25816064, + "step": 122330 + }, + { + "epoch": 13.458195819581958, + "grad_norm": 0.2792276442050934, + "learning_rate": 1.4600574928426919e-05, + "loss": 0.078, + "num_input_tokens_seen": 25817088, + "step": 122335 + }, + { + "epoch": 13.458745874587459, + "grad_norm": 0.031463075429201126, + "learning_rate": 1.4598392414443174e-05, + "loss": 0.0041, + "num_input_tokens_seen": 25818144, + "step": 122340 + }, + { + "epoch": 13.45929592959296, + "grad_norm": 0.2718115448951721, + "learning_rate": 1.4596209996326052e-05, + "loss": 0.0424, + "num_input_tokens_seen": 25819232, + "step": 122345 + }, + { + "epoch": 13.45984598459846, + "grad_norm": 3.201089382171631, + "learning_rate": 1.4594027674095679e-05, + "loss": 0.0927, + "num_input_tokens_seen": 25820320, + "step": 122350 + }, + { + "epoch": 13.46039603960396, + "grad_norm": 0.0417354442179203, + "learning_rate": 1.459184544777216e-05, + "loss": 0.0359, + "num_input_tokens_seen": 25821376, + "step": 122355 + }, + { + "epoch": 13.460946094609461, + "grad_norm": 0.01663007214665413, + "learning_rate": 1.4589663317375596e-05, + "loss": 0.007, + "num_input_tokens_seen": 25822464, + "step": 122360 + }, + { + "epoch": 13.46149614961496, + "grad_norm": 0.32667770981788635, + "learning_rate": 1.4587481282926113e-05, + "loss": 0.0059, + "num_input_tokens_seen": 25823456, + "step": 122365 + }, + { + "epoch": 13.462046204620462, + "grad_norm": 0.03977758809924126, + "learning_rate": 1.4585299344443815e-05, + "loss": 0.001, + "num_input_tokens_seen": 25824544, + "step": 122370 + }, + { + "epoch": 13.462596259625963, + "grad_norm": 0.057193975895643234, + "learning_rate": 1.4583117501948825e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25825536, + "step": 122375 + }, + { + "epoch": 13.463146314631462, + "grad_norm": 0.03211965784430504, + "learning_rate": 1.4580935755461244e-05, + "loss": 0.0768, + "num_input_tokens_seen": 25826592, + "step": 122380 + }, + { + "epoch": 13.463696369636963, + "grad_norm": 0.023418985307216644, + "learning_rate": 1.457875410500117e-05, + "loss": 0.0457, + "num_input_tokens_seen": 25827712, + "step": 122385 + }, + { + "epoch": 13.464246424642464, + "grad_norm": 0.06556093692779541, + "learning_rate": 1.457657255058873e-05, + "loss": 0.07, + "num_input_tokens_seen": 25828736, + "step": 122390 + }, + { + "epoch": 13.464796479647966, + "grad_norm": 0.055409640073776245, + "learning_rate": 1.4574391092244005e-05, + "loss": 0.0068, + "num_input_tokens_seen": 25829760, + "step": 122395 + }, + { + "epoch": 13.465346534653465, + "grad_norm": 0.18669423460960388, + "learning_rate": 1.4572209729987118e-05, + "loss": 0.0717, + "num_input_tokens_seen": 25830848, + "step": 122400 + }, + { + "epoch": 13.465896589658966, + "grad_norm": 1.0534268617630005, + "learning_rate": 1.4570028463838175e-05, + "loss": 0.1016, + "num_input_tokens_seen": 25831904, + "step": 122405 + }, + { + "epoch": 13.466446644664467, + "grad_norm": 0.20975945889949799, + "learning_rate": 1.4567847293817272e-05, + "loss": 0.0045, + "num_input_tokens_seen": 25832992, + "step": 122410 + }, + { + "epoch": 13.466996699669966, + "grad_norm": 0.047745294868946075, + "learning_rate": 1.4565666219944521e-05, + "loss": 0.0067, + "num_input_tokens_seen": 25834080, + "step": 122415 + }, + { + "epoch": 13.467546754675467, + "grad_norm": 0.018715228885412216, + "learning_rate": 1.4563485242240008e-05, + "loss": 0.1521, + "num_input_tokens_seen": 25835136, + "step": 122420 + }, + { + "epoch": 13.468096809680969, + "grad_norm": 0.03314239904284477, + "learning_rate": 1.4561304360723843e-05, + "loss": 0.0027, + "num_input_tokens_seen": 25836192, + "step": 122425 + }, + { + "epoch": 13.468646864686468, + "grad_norm": 0.0107414023950696, + "learning_rate": 1.4559123575416136e-05, + "loss": 0.0054, + "num_input_tokens_seen": 25837248, + "step": 122430 + }, + { + "epoch": 13.469196919691969, + "grad_norm": 0.07910645753145218, + "learning_rate": 1.455694288633697e-05, + "loss": 0.0061, + "num_input_tokens_seen": 25838304, + "step": 122435 + }, + { + "epoch": 13.46974697469747, + "grad_norm": 3.177513360977173, + "learning_rate": 1.4554762293506458e-05, + "loss": 0.0797, + "num_input_tokens_seen": 25839392, + "step": 122440 + }, + { + "epoch": 13.47029702970297, + "grad_norm": 0.037825219333171844, + "learning_rate": 1.4552581796944689e-05, + "loss": 0.0041, + "num_input_tokens_seen": 25840448, + "step": 122445 + }, + { + "epoch": 13.47084708470847, + "grad_norm": 0.013854334130883217, + "learning_rate": 1.4550401396671753e-05, + "loss": 0.0021, + "num_input_tokens_seen": 25841472, + "step": 122450 + }, + { + "epoch": 13.471397139713972, + "grad_norm": 2.3758020401000977, + "learning_rate": 1.454822109270775e-05, + "loss": 0.0337, + "num_input_tokens_seen": 25842560, + "step": 122455 + }, + { + "epoch": 13.471947194719473, + "grad_norm": 0.012571328319609165, + "learning_rate": 1.454604088507278e-05, + "loss": 0.0035, + "num_input_tokens_seen": 25843616, + "step": 122460 + }, + { + "epoch": 13.472497249724972, + "grad_norm": 0.018719179555773735, + "learning_rate": 1.4543860773786944e-05, + "loss": 0.0183, + "num_input_tokens_seen": 25844576, + "step": 122465 + }, + { + "epoch": 13.473047304730473, + "grad_norm": 0.612960934638977, + "learning_rate": 1.454168075887033e-05, + "loss": 0.015, + "num_input_tokens_seen": 25845664, + "step": 122470 + }, + { + "epoch": 13.473597359735974, + "grad_norm": 0.02076708897948265, + "learning_rate": 1.4539500840343012e-05, + "loss": 0.0041, + "num_input_tokens_seen": 25846720, + "step": 122475 + }, + { + "epoch": 13.474147414741473, + "grad_norm": 2.2080295085906982, + "learning_rate": 1.4537321018225094e-05, + "loss": 0.0712, + "num_input_tokens_seen": 25847712, + "step": 122480 + }, + { + "epoch": 13.474697469746975, + "grad_norm": 0.003345941659063101, + "learning_rate": 1.4535141292536668e-05, + "loss": 0.1718, + "num_input_tokens_seen": 25848800, + "step": 122485 + }, + { + "epoch": 13.475247524752476, + "grad_norm": 1.6441277265548706, + "learning_rate": 1.4532961663297836e-05, + "loss": 0.1343, + "num_input_tokens_seen": 25849856, + "step": 122490 + }, + { + "epoch": 13.475797579757975, + "grad_norm": 0.9465174674987793, + "learning_rate": 1.4530782130528672e-05, + "loss": 0.0229, + "num_input_tokens_seen": 25850880, + "step": 122495 + }, + { + "epoch": 13.476347634763476, + "grad_norm": 0.24509450793266296, + "learning_rate": 1.4528602694249258e-05, + "loss": 0.0434, + "num_input_tokens_seen": 25851968, + "step": 122500 + }, + { + "epoch": 13.476897689768977, + "grad_norm": 0.012137924320995808, + "learning_rate": 1.4526423354479695e-05, + "loss": 0.0404, + "num_input_tokens_seen": 25852992, + "step": 122505 + }, + { + "epoch": 13.477447744774478, + "grad_norm": 0.027017302811145782, + "learning_rate": 1.4524244111240054e-05, + "loss": 0.0068, + "num_input_tokens_seen": 25854048, + "step": 122510 + }, + { + "epoch": 13.477997799779978, + "grad_norm": 0.031280700117349625, + "learning_rate": 1.4522064964550428e-05, + "loss": 0.0763, + "num_input_tokens_seen": 25855040, + "step": 122515 + }, + { + "epoch": 13.478547854785479, + "grad_norm": 0.3058684766292572, + "learning_rate": 1.4519885914430912e-05, + "loss": 0.0781, + "num_input_tokens_seen": 25856064, + "step": 122520 + }, + { + "epoch": 13.47909790979098, + "grad_norm": 0.03244372457265854, + "learning_rate": 1.4517706960901566e-05, + "loss": 0.006, + "num_input_tokens_seen": 25857120, + "step": 122525 + }, + { + "epoch": 13.479647964796479, + "grad_norm": 1.9012099504470825, + "learning_rate": 1.4515528103982498e-05, + "loss": 0.0365, + "num_input_tokens_seen": 25858208, + "step": 122530 + }, + { + "epoch": 13.48019801980198, + "grad_norm": 0.21095064282417297, + "learning_rate": 1.4513349343693767e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25859232, + "step": 122535 + }, + { + "epoch": 13.480748074807481, + "grad_norm": 0.11983462423086166, + "learning_rate": 1.4511170680055457e-05, + "loss": 0.0032, + "num_input_tokens_seen": 25860256, + "step": 122540 + }, + { + "epoch": 13.48129812981298, + "grad_norm": 0.9775000810623169, + "learning_rate": 1.4508992113087671e-05, + "loss": 0.0177, + "num_input_tokens_seen": 25861280, + "step": 122545 + }, + { + "epoch": 13.481848184818482, + "grad_norm": 0.0676254853606224, + "learning_rate": 1.4506813642810458e-05, + "loss": 0.0055, + "num_input_tokens_seen": 25862336, + "step": 122550 + }, + { + "epoch": 13.482398239823983, + "grad_norm": 0.03319278731942177, + "learning_rate": 1.450463526924392e-05, + "loss": 0.0484, + "num_input_tokens_seen": 25863488, + "step": 122555 + }, + { + "epoch": 13.482948294829482, + "grad_norm": 1.584501028060913, + "learning_rate": 1.450245699240811e-05, + "loss": 0.0877, + "num_input_tokens_seen": 25864512, + "step": 122560 + }, + { + "epoch": 13.483498349834983, + "grad_norm": 0.011692346073687077, + "learning_rate": 1.4500278812323127e-05, + "loss": 0.0064, + "num_input_tokens_seen": 25865600, + "step": 122565 + }, + { + "epoch": 13.484048404840484, + "grad_norm": 0.19733686745166779, + "learning_rate": 1.4498100729009029e-05, + "loss": 0.0048, + "num_input_tokens_seen": 25866720, + "step": 122570 + }, + { + "epoch": 13.484598459845985, + "grad_norm": 0.17044101655483246, + "learning_rate": 1.4495922742485893e-05, + "loss": 0.0386, + "num_input_tokens_seen": 25867712, + "step": 122575 + }, + { + "epoch": 13.485148514851485, + "grad_norm": 1.1962167024612427, + "learning_rate": 1.4493744852773812e-05, + "loss": 0.0472, + "num_input_tokens_seen": 25868736, + "step": 122580 + }, + { + "epoch": 13.485698569856986, + "grad_norm": 0.04725657403469086, + "learning_rate": 1.4491567059892827e-05, + "loss": 0.0034, + "num_input_tokens_seen": 25869792, + "step": 122585 + }, + { + "epoch": 13.486248624862487, + "grad_norm": 0.021568482741713524, + "learning_rate": 1.4489389363863037e-05, + "loss": 0.0322, + "num_input_tokens_seen": 25870880, + "step": 122590 + }, + { + "epoch": 13.486798679867986, + "grad_norm": 0.016299309208989143, + "learning_rate": 1.4487211764704495e-05, + "loss": 0.0744, + "num_input_tokens_seen": 25872064, + "step": 122595 + }, + { + "epoch": 13.487348734873487, + "grad_norm": 0.05820239335298538, + "learning_rate": 1.4485034262437278e-05, + "loss": 0.0573, + "num_input_tokens_seen": 25873056, + "step": 122600 + }, + { + "epoch": 13.487898789878988, + "grad_norm": 0.030223477631807327, + "learning_rate": 1.4482856857081461e-05, + "loss": 0.0054, + "num_input_tokens_seen": 25874112, + "step": 122605 + }, + { + "epoch": 13.488448844884488, + "grad_norm": 1.9603980779647827, + "learning_rate": 1.448067954865711e-05, + "loss": 0.1484, + "num_input_tokens_seen": 25875168, + "step": 122610 + }, + { + "epoch": 13.488998899889989, + "grad_norm": 1.6340309381484985, + "learning_rate": 1.4478502337184274e-05, + "loss": 0.021, + "num_input_tokens_seen": 25876192, + "step": 122615 + }, + { + "epoch": 13.48954895489549, + "grad_norm": 0.02480948716402054, + "learning_rate": 1.4476325222683047e-05, + "loss": 0.0036, + "num_input_tokens_seen": 25877248, + "step": 122620 + }, + { + "epoch": 13.490099009900991, + "grad_norm": 0.3247028887271881, + "learning_rate": 1.4474148205173468e-05, + "loss": 0.007, + "num_input_tokens_seen": 25878336, + "step": 122625 + }, + { + "epoch": 13.49064906490649, + "grad_norm": 0.07453755289316177, + "learning_rate": 1.4471971284675614e-05, + "loss": 0.0066, + "num_input_tokens_seen": 25879360, + "step": 122630 + }, + { + "epoch": 13.491199119911991, + "grad_norm": 0.6402144432067871, + "learning_rate": 1.446979446120956e-05, + "loss": 0.018, + "num_input_tokens_seen": 25880448, + "step": 122635 + }, + { + "epoch": 13.491749174917492, + "grad_norm": 0.11403796821832657, + "learning_rate": 1.4467617734795347e-05, + "loss": 0.002, + "num_input_tokens_seen": 25881504, + "step": 122640 + }, + { + "epoch": 13.492299229922992, + "grad_norm": 0.09241770952939987, + "learning_rate": 1.4465441105453058e-05, + "loss": 0.0034, + "num_input_tokens_seen": 25882560, + "step": 122645 + }, + { + "epoch": 13.492849284928493, + "grad_norm": 0.014391794800758362, + "learning_rate": 1.4463264573202734e-05, + "loss": 0.0015, + "num_input_tokens_seen": 25883552, + "step": 122650 + }, + { + "epoch": 13.493399339933994, + "grad_norm": 0.013061074540019035, + "learning_rate": 1.4461088138064444e-05, + "loss": 0.0879, + "num_input_tokens_seen": 25884576, + "step": 122655 + }, + { + "epoch": 13.493949394939493, + "grad_norm": 0.04903942346572876, + "learning_rate": 1.4458911800058257e-05, + "loss": 0.0025, + "num_input_tokens_seen": 25885664, + "step": 122660 + }, + { + "epoch": 13.494499449944994, + "grad_norm": 0.015223701484501362, + "learning_rate": 1.445673555920421e-05, + "loss": 0.0037, + "num_input_tokens_seen": 25886656, + "step": 122665 + }, + { + "epoch": 13.495049504950495, + "grad_norm": 0.361539363861084, + "learning_rate": 1.4454559415522383e-05, + "loss": 0.0228, + "num_input_tokens_seen": 25887680, + "step": 122670 + }, + { + "epoch": 13.495599559955995, + "grad_norm": 0.019647380337119102, + "learning_rate": 1.4452383369032813e-05, + "loss": 0.0371, + "num_input_tokens_seen": 25888736, + "step": 122675 + }, + { + "epoch": 13.496149614961496, + "grad_norm": 0.10782705992460251, + "learning_rate": 1.4450207419755573e-05, + "loss": 0.1, + "num_input_tokens_seen": 25889760, + "step": 122680 + }, + { + "epoch": 13.496699669966997, + "grad_norm": 0.0225947555154562, + "learning_rate": 1.4448031567710702e-05, + "loss": 0.007, + "num_input_tokens_seen": 25890784, + "step": 122685 + }, + { + "epoch": 13.497249724972498, + "grad_norm": 0.06156136840581894, + "learning_rate": 1.4445855812918257e-05, + "loss": 0.0239, + "num_input_tokens_seen": 25891872, + "step": 122690 + }, + { + "epoch": 13.497799779977997, + "grad_norm": 0.11442206054925919, + "learning_rate": 1.444368015539831e-05, + "loss": 0.1061, + "num_input_tokens_seen": 25892960, + "step": 122695 + }, + { + "epoch": 13.498349834983498, + "grad_norm": 0.012571129947900772, + "learning_rate": 1.4441504595170882e-05, + "loss": 0.1343, + "num_input_tokens_seen": 25893984, + "step": 122700 + }, + { + "epoch": 13.498899889989, + "grad_norm": 0.018673911690711975, + "learning_rate": 1.4439329132256051e-05, + "loss": 0.0681, + "num_input_tokens_seen": 25895072, + "step": 122705 + }, + { + "epoch": 13.499449944994499, + "grad_norm": 0.03150368854403496, + "learning_rate": 1.4437153766673847e-05, + "loss": 0.0014, + "num_input_tokens_seen": 25896128, + "step": 122710 + }, + { + "epoch": 13.5, + "grad_norm": 0.05852828919887543, + "learning_rate": 1.4434978498444327e-05, + "loss": 0.0253, + "num_input_tokens_seen": 25897216, + "step": 122715 + }, + { + "epoch": 13.500550055005501, + "grad_norm": 2.663851022720337, + "learning_rate": 1.4432803327587551e-05, + "loss": 0.0911, + "num_input_tokens_seen": 25898272, + "step": 122720 + }, + { + "epoch": 13.501100110011, + "grad_norm": 0.1341477483510971, + "learning_rate": 1.4430628254123545e-05, + "loss": 0.0297, + "num_input_tokens_seen": 25899328, + "step": 122725 + }, + { + "epoch": 13.501650165016502, + "grad_norm": 0.014714355580508709, + "learning_rate": 1.4428453278072379e-05, + "loss": 0.0049, + "num_input_tokens_seen": 25900384, + "step": 122730 + }, + { + "epoch": 13.502200220022003, + "grad_norm": 0.04515550285577774, + "learning_rate": 1.4426278399454085e-05, + "loss": 0.0114, + "num_input_tokens_seen": 25901536, + "step": 122735 + }, + { + "epoch": 13.502750275027502, + "grad_norm": 0.023331135511398315, + "learning_rate": 1.4424103618288701e-05, + "loss": 0.0031, + "num_input_tokens_seen": 25902528, + "step": 122740 + }, + { + "epoch": 13.503300330033003, + "grad_norm": 0.04618019983172417, + "learning_rate": 1.4421928934596277e-05, + "loss": 0.021, + "num_input_tokens_seen": 25903616, + "step": 122745 + }, + { + "epoch": 13.503850385038504, + "grad_norm": 0.2255372554063797, + "learning_rate": 1.4419754348396858e-05, + "loss": 0.0043, + "num_input_tokens_seen": 25904640, + "step": 122750 + }, + { + "epoch": 13.504400440044005, + "grad_norm": 0.31224778294563293, + "learning_rate": 1.4417579859710498e-05, + "loss": 0.0107, + "num_input_tokens_seen": 25905696, + "step": 122755 + }, + { + "epoch": 13.504950495049505, + "grad_norm": 0.42319345474243164, + "learning_rate": 1.4415405468557225e-05, + "loss": 0.0276, + "num_input_tokens_seen": 25906752, + "step": 122760 + }, + { + "epoch": 13.505500550055006, + "grad_norm": 0.015573750250041485, + "learning_rate": 1.441323117495707e-05, + "loss": 0.0025, + "num_input_tokens_seen": 25907776, + "step": 122765 + }, + { + "epoch": 13.506050605060507, + "grad_norm": 0.02141125127673149, + "learning_rate": 1.4411056978930088e-05, + "loss": 0.001, + "num_input_tokens_seen": 25908800, + "step": 122770 + }, + { + "epoch": 13.506600660066006, + "grad_norm": 0.05671992152929306, + "learning_rate": 1.4408882880496308e-05, + "loss": 0.0143, + "num_input_tokens_seen": 25909856, + "step": 122775 + }, + { + "epoch": 13.507150715071507, + "grad_norm": 0.044127434492111206, + "learning_rate": 1.4406708879675785e-05, + "loss": 0.0893, + "num_input_tokens_seen": 25910880, + "step": 122780 + }, + { + "epoch": 13.507700770077008, + "grad_norm": 0.8437298536300659, + "learning_rate": 1.4404534976488542e-05, + "loss": 0.0416, + "num_input_tokens_seen": 25911936, + "step": 122785 + }, + { + "epoch": 13.508250825082508, + "grad_norm": 0.15953519940376282, + "learning_rate": 1.4402361170954604e-05, + "loss": 0.0114, + "num_input_tokens_seen": 25912992, + "step": 122790 + }, + { + "epoch": 13.508800880088009, + "grad_norm": 0.07682981342077255, + "learning_rate": 1.4400187463094031e-05, + "loss": 0.0024, + "num_input_tokens_seen": 25914048, + "step": 122795 + }, + { + "epoch": 13.50935093509351, + "grad_norm": 0.009475618600845337, + "learning_rate": 1.4398013852926834e-05, + "loss": 0.0071, + "num_input_tokens_seen": 25915072, + "step": 122800 + }, + { + "epoch": 13.509900990099009, + "grad_norm": 0.26694032549858093, + "learning_rate": 1.4395840340473055e-05, + "loss": 0.1357, + "num_input_tokens_seen": 25916160, + "step": 122805 + }, + { + "epoch": 13.51045104510451, + "grad_norm": 0.021527552977204323, + "learning_rate": 1.4393666925752736e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25917248, + "step": 122810 + }, + { + "epoch": 13.511001100110011, + "grad_norm": 0.013158510439097881, + "learning_rate": 1.4391493608785888e-05, + "loss": 0.0011, + "num_input_tokens_seen": 25918400, + "step": 122815 + }, + { + "epoch": 13.511551155115512, + "grad_norm": 0.015243562869727612, + "learning_rate": 1.4389320389592565e-05, + "loss": 0.0946, + "num_input_tokens_seen": 25919488, + "step": 122820 + }, + { + "epoch": 13.512101210121012, + "grad_norm": 0.1382325440645218, + "learning_rate": 1.4387147268192774e-05, + "loss": 0.0678, + "num_input_tokens_seen": 25920480, + "step": 122825 + }, + { + "epoch": 13.512651265126513, + "grad_norm": 0.03488694876432419, + "learning_rate": 1.4384974244606558e-05, + "loss": 0.017, + "num_input_tokens_seen": 25921504, + "step": 122830 + }, + { + "epoch": 13.513201320132014, + "grad_norm": 1.8333662748336792, + "learning_rate": 1.4382801318853945e-05, + "loss": 0.1109, + "num_input_tokens_seen": 25922528, + "step": 122835 + }, + { + "epoch": 13.513751375137513, + "grad_norm": 1.5845752954483032, + "learning_rate": 1.438062849095495e-05, + "loss": 0.0519, + "num_input_tokens_seen": 25923552, + "step": 122840 + }, + { + "epoch": 13.514301430143014, + "grad_norm": 0.5158067941665649, + "learning_rate": 1.4378455760929619e-05, + "loss": 0.0063, + "num_input_tokens_seen": 25924640, + "step": 122845 + }, + { + "epoch": 13.514851485148515, + "grad_norm": 0.13052873313426971, + "learning_rate": 1.4376283128797963e-05, + "loss": 0.0576, + "num_input_tokens_seen": 25925696, + "step": 122850 + }, + { + "epoch": 13.515401540154015, + "grad_norm": 0.01768212392926216, + "learning_rate": 1.4374110594579996e-05, + "loss": 0.0446, + "num_input_tokens_seen": 25926784, + "step": 122855 + }, + { + "epoch": 13.515951595159516, + "grad_norm": 0.024314554408192635, + "learning_rate": 1.4371938158295753e-05, + "loss": 0.0048, + "num_input_tokens_seen": 25927872, + "step": 122860 + }, + { + "epoch": 13.516501650165017, + "grad_norm": 0.04474617913365364, + "learning_rate": 1.4369765819965259e-05, + "loss": 0.0644, + "num_input_tokens_seen": 25928928, + "step": 122865 + }, + { + "epoch": 13.517051705170516, + "grad_norm": 0.14779381453990936, + "learning_rate": 1.436759357960854e-05, + "loss": 0.0684, + "num_input_tokens_seen": 25929888, + "step": 122870 + }, + { + "epoch": 13.517601760176017, + "grad_norm": 0.039242662489414215, + "learning_rate": 1.4365421437245608e-05, + "loss": 0.0064, + "num_input_tokens_seen": 25931040, + "step": 122875 + }, + { + "epoch": 13.518151815181518, + "grad_norm": 0.10476656258106232, + "learning_rate": 1.4363249392896477e-05, + "loss": 0.0624, + "num_input_tokens_seen": 25932096, + "step": 122880 + }, + { + "epoch": 13.51870187018702, + "grad_norm": 0.03800633177161217, + "learning_rate": 1.4361077446581166e-05, + "loss": 0.0869, + "num_input_tokens_seen": 25933216, + "step": 122885 + }, + { + "epoch": 13.519251925192519, + "grad_norm": 1.6838566064834595, + "learning_rate": 1.4358905598319706e-05, + "loss": 0.0442, + "num_input_tokens_seen": 25934240, + "step": 122890 + }, + { + "epoch": 13.51980198019802, + "grad_norm": 0.04895174130797386, + "learning_rate": 1.4356733848132114e-05, + "loss": 0.0496, + "num_input_tokens_seen": 25935264, + "step": 122895 + }, + { + "epoch": 13.520352035203521, + "grad_norm": 0.03371906280517578, + "learning_rate": 1.4354562196038402e-05, + "loss": 0.0074, + "num_input_tokens_seen": 25936288, + "step": 122900 + }, + { + "epoch": 13.52090209020902, + "grad_norm": 0.1391344666481018, + "learning_rate": 1.435239064205857e-05, + "loss": 0.0049, + "num_input_tokens_seen": 25937312, + "step": 122905 + }, + { + "epoch": 13.521452145214521, + "grad_norm": 0.026218391954898834, + "learning_rate": 1.4350219186212654e-05, + "loss": 0.0204, + "num_input_tokens_seen": 25938272, + "step": 122910 + }, + { + "epoch": 13.522002200220022, + "grad_norm": 0.055578313767910004, + "learning_rate": 1.4348047828520645e-05, + "loss": 0.002, + "num_input_tokens_seen": 25939328, + "step": 122915 + }, + { + "epoch": 13.522552255225522, + "grad_norm": 0.0305815190076828, + "learning_rate": 1.434587656900257e-05, + "loss": 0.0209, + "num_input_tokens_seen": 25940352, + "step": 122920 + }, + { + "epoch": 13.523102310231023, + "grad_norm": 0.007035900838673115, + "learning_rate": 1.4343705407678448e-05, + "loss": 0.0012, + "num_input_tokens_seen": 25941440, + "step": 122925 + }, + { + "epoch": 13.523652365236524, + "grad_norm": 0.08448680490255356, + "learning_rate": 1.4341534344568273e-05, + "loss": 0.0065, + "num_input_tokens_seen": 25942496, + "step": 122930 + }, + { + "epoch": 13.524202420242025, + "grad_norm": 0.0786690041422844, + "learning_rate": 1.4339363379692067e-05, + "loss": 0.0364, + "num_input_tokens_seen": 25943552, + "step": 122935 + }, + { + "epoch": 13.524752475247524, + "grad_norm": 0.10298381745815277, + "learning_rate": 1.4337192513069827e-05, + "loss": 0.0189, + "num_input_tokens_seen": 25944608, + "step": 122940 + }, + { + "epoch": 13.525302530253025, + "grad_norm": 0.03265446797013283, + "learning_rate": 1.4335021744721565e-05, + "loss": 0.003, + "num_input_tokens_seen": 25945664, + "step": 122945 + }, + { + "epoch": 13.525852585258527, + "grad_norm": 1.327347993850708, + "learning_rate": 1.43328510746673e-05, + "loss": 0.0382, + "num_input_tokens_seen": 25946720, + "step": 122950 + }, + { + "epoch": 13.526402640264026, + "grad_norm": 0.12246304005384445, + "learning_rate": 1.4330680502927019e-05, + "loss": 0.0435, + "num_input_tokens_seen": 25947808, + "step": 122955 + }, + { + "epoch": 13.526952695269527, + "grad_norm": 0.07949183881282806, + "learning_rate": 1.4328510029520747e-05, + "loss": 0.0042, + "num_input_tokens_seen": 25948832, + "step": 122960 + }, + { + "epoch": 13.527502750275028, + "grad_norm": 0.12473434209823608, + "learning_rate": 1.4326339654468466e-05, + "loss": 0.0981, + "num_input_tokens_seen": 25949888, + "step": 122965 + }, + { + "epoch": 13.528052805280527, + "grad_norm": 0.013530896976590157, + "learning_rate": 1.4324169377790197e-05, + "loss": 0.0012, + "num_input_tokens_seen": 25951008, + "step": 122970 + }, + { + "epoch": 13.528602860286028, + "grad_norm": 0.028051892295479774, + "learning_rate": 1.432199919950593e-05, + "loss": 0.0103, + "num_input_tokens_seen": 25952032, + "step": 122975 + }, + { + "epoch": 13.52915291529153, + "grad_norm": 0.015206288546323776, + "learning_rate": 1.4319829119635674e-05, + "loss": 0.0403, + "num_input_tokens_seen": 25953088, + "step": 122980 + }, + { + "epoch": 13.52970297029703, + "grad_norm": 0.012651611119508743, + "learning_rate": 1.4317659138199436e-05, + "loss": 0.0033, + "num_input_tokens_seen": 25954144, + "step": 122985 + }, + { + "epoch": 13.53025302530253, + "grad_norm": 0.06160072982311249, + "learning_rate": 1.431548925521721e-05, + "loss": 0.0103, + "num_input_tokens_seen": 25955200, + "step": 122990 + }, + { + "epoch": 13.530803080308031, + "grad_norm": 0.37642911076545715, + "learning_rate": 1.431331947070898e-05, + "loss": 0.0109, + "num_input_tokens_seen": 25956256, + "step": 122995 + }, + { + "epoch": 13.531353135313532, + "grad_norm": 0.00975427869707346, + "learning_rate": 1.4311149784694766e-05, + "loss": 0.0042, + "num_input_tokens_seen": 25957312, + "step": 123000 + }, + { + "epoch": 13.531903190319031, + "grad_norm": 0.27121394872665405, + "learning_rate": 1.4308980197194539e-05, + "loss": 0.0149, + "num_input_tokens_seen": 25958400, + "step": 123005 + }, + { + "epoch": 13.532453245324533, + "grad_norm": 1.5359859466552734, + "learning_rate": 1.4306810708228329e-05, + "loss": 0.0659, + "num_input_tokens_seen": 25959424, + "step": 123010 + }, + { + "epoch": 13.533003300330034, + "grad_norm": 0.02263115905225277, + "learning_rate": 1.4304641317816114e-05, + "loss": 0.0029, + "num_input_tokens_seen": 25960480, + "step": 123015 + }, + { + "epoch": 13.533553355335533, + "grad_norm": 0.30647358298301697, + "learning_rate": 1.4302472025977876e-05, + "loss": 0.0049, + "num_input_tokens_seen": 25961600, + "step": 123020 + }, + { + "epoch": 13.534103410341034, + "grad_norm": 0.06592361629009247, + "learning_rate": 1.4300302832733633e-05, + "loss": 0.0061, + "num_input_tokens_seen": 25962624, + "step": 123025 + }, + { + "epoch": 13.534653465346535, + "grad_norm": 0.030349979177117348, + "learning_rate": 1.4298133738103353e-05, + "loss": 0.005, + "num_input_tokens_seen": 25963648, + "step": 123030 + }, + { + "epoch": 13.535203520352034, + "grad_norm": 0.13373427093029022, + "learning_rate": 1.429596474210704e-05, + "loss": 0.0073, + "num_input_tokens_seen": 25964672, + "step": 123035 + }, + { + "epoch": 13.535753575357536, + "grad_norm": 0.006828392390161753, + "learning_rate": 1.4293795844764693e-05, + "loss": 0.0105, + "num_input_tokens_seen": 25965760, + "step": 123040 + }, + { + "epoch": 13.536303630363037, + "grad_norm": 0.0029843919910490513, + "learning_rate": 1.4291627046096285e-05, + "loss": 0.0058, + "num_input_tokens_seen": 25966784, + "step": 123045 + }, + { + "epoch": 13.536853685368538, + "grad_norm": 0.21141085028648376, + "learning_rate": 1.428945834612182e-05, + "loss": 0.0051, + "num_input_tokens_seen": 25967744, + "step": 123050 + }, + { + "epoch": 13.537403740374037, + "grad_norm": 0.05069917067885399, + "learning_rate": 1.4287289744861264e-05, + "loss": 0.1851, + "num_input_tokens_seen": 25968832, + "step": 123055 + }, + { + "epoch": 13.537953795379538, + "grad_norm": 0.01258427556604147, + "learning_rate": 1.4285121242334622e-05, + "loss": 0.1109, + "num_input_tokens_seen": 25969856, + "step": 123060 + }, + { + "epoch": 13.53850385038504, + "grad_norm": 0.05643947049975395, + "learning_rate": 1.4282952838561887e-05, + "loss": 0.0615, + "num_input_tokens_seen": 25970848, + "step": 123065 + }, + { + "epoch": 13.539053905390539, + "grad_norm": 0.014255174435675144, + "learning_rate": 1.4280784533563025e-05, + "loss": 0.0122, + "num_input_tokens_seen": 25971936, + "step": 123070 + }, + { + "epoch": 13.53960396039604, + "grad_norm": 0.02536974474787712, + "learning_rate": 1.4278616327358032e-05, + "loss": 0.0082, + "num_input_tokens_seen": 25972992, + "step": 123075 + }, + { + "epoch": 13.54015401540154, + "grad_norm": 0.5283911824226379, + "learning_rate": 1.4276448219966881e-05, + "loss": 0.0133, + "num_input_tokens_seen": 25974112, + "step": 123080 + }, + { + "epoch": 13.54070407040704, + "grad_norm": 0.020600110292434692, + "learning_rate": 1.4274280211409571e-05, + "loss": 0.0553, + "num_input_tokens_seen": 25975136, + "step": 123085 + }, + { + "epoch": 13.541254125412541, + "grad_norm": 0.03371019661426544, + "learning_rate": 1.4272112301706064e-05, + "loss": 0.0624, + "num_input_tokens_seen": 25976160, + "step": 123090 + }, + { + "epoch": 13.541804180418042, + "grad_norm": 1.1403112411499023, + "learning_rate": 1.426994449087635e-05, + "loss": 0.0343, + "num_input_tokens_seen": 25977216, + "step": 123095 + }, + { + "epoch": 13.542354235423542, + "grad_norm": 0.03116747923195362, + "learning_rate": 1.4267776778940418e-05, + "loss": 0.0074, + "num_input_tokens_seen": 25978240, + "step": 123100 + }, + { + "epoch": 13.542904290429043, + "grad_norm": 0.05911088362336159, + "learning_rate": 1.4265609165918229e-05, + "loss": 0.0439, + "num_input_tokens_seen": 25979328, + "step": 123105 + }, + { + "epoch": 13.543454345434544, + "grad_norm": 0.019998077303171158, + "learning_rate": 1.4263441651829778e-05, + "loss": 0.08, + "num_input_tokens_seen": 25980384, + "step": 123110 + }, + { + "epoch": 13.544004400440045, + "grad_norm": 0.06359273195266724, + "learning_rate": 1.4261274236695035e-05, + "loss": 0.0048, + "num_input_tokens_seen": 25981408, + "step": 123115 + }, + { + "epoch": 13.544554455445544, + "grad_norm": 0.05697010084986687, + "learning_rate": 1.4259106920533955e-05, + "loss": 0.0052, + "num_input_tokens_seen": 25982496, + "step": 123120 + }, + { + "epoch": 13.545104510451045, + "grad_norm": 0.8490546345710754, + "learning_rate": 1.4256939703366551e-05, + "loss": 0.1014, + "num_input_tokens_seen": 25983552, + "step": 123125 + }, + { + "epoch": 13.545654565456546, + "grad_norm": 0.029133131727576256, + "learning_rate": 1.4254772585212767e-05, + "loss": 0.0062, + "num_input_tokens_seen": 25984640, + "step": 123130 + }, + { + "epoch": 13.546204620462046, + "grad_norm": 0.11118056625127792, + "learning_rate": 1.42526055660926e-05, + "loss": 0.0013, + "num_input_tokens_seen": 25985664, + "step": 123135 + }, + { + "epoch": 13.546754675467547, + "grad_norm": 0.005181905813515186, + "learning_rate": 1.425043864602601e-05, + "loss": 0.0004, + "num_input_tokens_seen": 25986784, + "step": 123140 + }, + { + "epoch": 13.547304730473048, + "grad_norm": 0.03877662122249603, + "learning_rate": 1.424827182503296e-05, + "loss": 0.0033, + "num_input_tokens_seen": 25987840, + "step": 123145 + }, + { + "epoch": 13.547854785478547, + "grad_norm": 0.045381344854831696, + "learning_rate": 1.4246105103133428e-05, + "loss": 0.0081, + "num_input_tokens_seen": 25988896, + "step": 123150 + }, + { + "epoch": 13.548404840484048, + "grad_norm": 1.0226986408233643, + "learning_rate": 1.4243938480347385e-05, + "loss": 0.0366, + "num_input_tokens_seen": 25989888, + "step": 123155 + }, + { + "epoch": 13.54895489548955, + "grad_norm": 0.3147698938846588, + "learning_rate": 1.424177195669481e-05, + "loss": 0.0057, + "num_input_tokens_seen": 25991008, + "step": 123160 + }, + { + "epoch": 13.549504950495049, + "grad_norm": 0.028186742216348648, + "learning_rate": 1.423960553219566e-05, + "loss": 0.0059, + "num_input_tokens_seen": 25992064, + "step": 123165 + }, + { + "epoch": 13.55005500550055, + "grad_norm": 2.1228275299072266, + "learning_rate": 1.4237439206869896e-05, + "loss": 0.0321, + "num_input_tokens_seen": 25993120, + "step": 123170 + }, + { + "epoch": 13.55060506050605, + "grad_norm": 0.0668291300535202, + "learning_rate": 1.4235272980737497e-05, + "loss": 0.0917, + "num_input_tokens_seen": 25994208, + "step": 123175 + }, + { + "epoch": 13.551155115511552, + "grad_norm": 0.2641432285308838, + "learning_rate": 1.4233106853818412e-05, + "loss": 0.0038, + "num_input_tokens_seen": 25995200, + "step": 123180 + }, + { + "epoch": 13.551705170517051, + "grad_norm": 0.08787482231855392, + "learning_rate": 1.4230940826132617e-05, + "loss": 0.0015, + "num_input_tokens_seen": 25996288, + "step": 123185 + }, + { + "epoch": 13.552255225522552, + "grad_norm": 0.007452570833265781, + "learning_rate": 1.4228774897700079e-05, + "loss": 0.0112, + "num_input_tokens_seen": 25997312, + "step": 123190 + }, + { + "epoch": 13.552805280528053, + "grad_norm": 0.010082310065627098, + "learning_rate": 1.4226609068540747e-05, + "loss": 0.0145, + "num_input_tokens_seen": 25998432, + "step": 123195 + }, + { + "epoch": 13.553355335533553, + "grad_norm": 0.008406167849898338, + "learning_rate": 1.4224443338674597e-05, + "loss": 0.0633, + "num_input_tokens_seen": 25999488, + "step": 123200 + }, + { + "epoch": 13.553905390539054, + "grad_norm": 2.3064634799957275, + "learning_rate": 1.4222277708121573e-05, + "loss": 0.0787, + "num_input_tokens_seen": 26000544, + "step": 123205 + }, + { + "epoch": 13.554455445544555, + "grad_norm": 0.49078184366226196, + "learning_rate": 1.4220112176901646e-05, + "loss": 0.0526, + "num_input_tokens_seen": 26001600, + "step": 123210 + }, + { + "epoch": 13.555005500550054, + "grad_norm": 0.07941534370183945, + "learning_rate": 1.4217946745034782e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26002656, + "step": 123215 + }, + { + "epoch": 13.555555555555555, + "grad_norm": 0.15955044329166412, + "learning_rate": 1.4215781412540919e-05, + "loss": 0.0042, + "num_input_tokens_seen": 26003680, + "step": 123220 + }, + { + "epoch": 13.556105610561056, + "grad_norm": 0.22659434378147125, + "learning_rate": 1.4213616179440031e-05, + "loss": 0.0097, + "num_input_tokens_seen": 26004768, + "step": 123225 + }, + { + "epoch": 13.556655665566556, + "grad_norm": 0.08374781161546707, + "learning_rate": 1.421145104575207e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26005792, + "step": 123230 + }, + { + "epoch": 13.557205720572057, + "grad_norm": 0.6479905843734741, + "learning_rate": 1.4209286011496974e-05, + "loss": 0.0133, + "num_input_tokens_seen": 26006784, + "step": 123235 + }, + { + "epoch": 13.557755775577558, + "grad_norm": 0.217912495136261, + "learning_rate": 1.4207121076694716e-05, + "loss": 0.005, + "num_input_tokens_seen": 26007840, + "step": 123240 + }, + { + "epoch": 13.558305830583059, + "grad_norm": 0.007015028037130833, + "learning_rate": 1.4204956241365242e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26008960, + "step": 123245 + }, + { + "epoch": 13.558855885588558, + "grad_norm": 0.024303056299686432, + "learning_rate": 1.4202791505528515e-05, + "loss": 0.0022, + "num_input_tokens_seen": 26010016, + "step": 123250 + }, + { + "epoch": 13.55940594059406, + "grad_norm": 0.08359148353338242, + "learning_rate": 1.420062686920448e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26011136, + "step": 123255 + }, + { + "epoch": 13.55995599559956, + "grad_norm": 0.008541044779121876, + "learning_rate": 1.4198462332413073e-05, + "loss": 0.002, + "num_input_tokens_seen": 26012256, + "step": 123260 + }, + { + "epoch": 13.56050605060506, + "grad_norm": 0.005570517852902412, + "learning_rate": 1.4196297895174256e-05, + "loss": 0.0812, + "num_input_tokens_seen": 26013344, + "step": 123265 + }, + { + "epoch": 13.561056105610561, + "grad_norm": 0.0753386914730072, + "learning_rate": 1.4194133557507977e-05, + "loss": 0.0159, + "num_input_tokens_seen": 26014432, + "step": 123270 + }, + { + "epoch": 13.561606160616062, + "grad_norm": 0.040599722415208817, + "learning_rate": 1.4191969319434196e-05, + "loss": 0.0085, + "num_input_tokens_seen": 26015456, + "step": 123275 + }, + { + "epoch": 13.562156215621561, + "grad_norm": 0.188836470246315, + "learning_rate": 1.4189805180972848e-05, + "loss": 0.0024, + "num_input_tokens_seen": 26016512, + "step": 123280 + }, + { + "epoch": 13.562706270627062, + "grad_norm": 0.020912883803248405, + "learning_rate": 1.4187641142143865e-05, + "loss": 0.1016, + "num_input_tokens_seen": 26017536, + "step": 123285 + }, + { + "epoch": 13.563256325632564, + "grad_norm": 0.033786699175834656, + "learning_rate": 1.4185477202967218e-05, + "loss": 0.0463, + "num_input_tokens_seen": 26018592, + "step": 123290 + }, + { + "epoch": 13.563806380638063, + "grad_norm": 0.007756144739687443, + "learning_rate": 1.4183313363462824e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26019648, + "step": 123295 + }, + { + "epoch": 13.564356435643564, + "grad_norm": 0.019810913130640984, + "learning_rate": 1.4181149623650646e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26020736, + "step": 123300 + }, + { + "epoch": 13.564906490649065, + "grad_norm": 2.200448751449585, + "learning_rate": 1.4178985983550628e-05, + "loss": 0.0297, + "num_input_tokens_seen": 26021888, + "step": 123305 + }, + { + "epoch": 13.565456545654566, + "grad_norm": 0.10728571563959122, + "learning_rate": 1.4176822443182695e-05, + "loss": 0.0223, + "num_input_tokens_seen": 26022944, + "step": 123310 + }, + { + "epoch": 13.566006600660065, + "grad_norm": 0.024278918281197548, + "learning_rate": 1.4174659002566804e-05, + "loss": 0.0284, + "num_input_tokens_seen": 26024032, + "step": 123315 + }, + { + "epoch": 13.566556655665567, + "grad_norm": 0.028936199843883514, + "learning_rate": 1.4172495661722879e-05, + "loss": 0.0039, + "num_input_tokens_seen": 26025056, + "step": 123320 + }, + { + "epoch": 13.567106710671068, + "grad_norm": 0.014743941836059093, + "learning_rate": 1.4170332420670863e-05, + "loss": 0.0913, + "num_input_tokens_seen": 26026080, + "step": 123325 + }, + { + "epoch": 13.567656765676567, + "grad_norm": 1.3082075119018555, + "learning_rate": 1.4168169279430707e-05, + "loss": 0.0935, + "num_input_tokens_seen": 26027136, + "step": 123330 + }, + { + "epoch": 13.568206820682068, + "grad_norm": 0.023676371201872826, + "learning_rate": 1.4166006238022327e-05, + "loss": 0.0153, + "num_input_tokens_seen": 26028224, + "step": 123335 + }, + { + "epoch": 13.56875687568757, + "grad_norm": 0.1152656227350235, + "learning_rate": 1.4163843296465681e-05, + "loss": 0.0309, + "num_input_tokens_seen": 26029248, + "step": 123340 + }, + { + "epoch": 13.569306930693068, + "grad_norm": 0.0337003618478775, + "learning_rate": 1.4161680454780679e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26030368, + "step": 123345 + }, + { + "epoch": 13.56985698569857, + "grad_norm": 0.10867850482463837, + "learning_rate": 1.415951771298728e-05, + "loss": 0.0148, + "num_input_tokens_seen": 26031392, + "step": 123350 + }, + { + "epoch": 13.57040704070407, + "grad_norm": 0.11495503783226013, + "learning_rate": 1.4157355071105393e-05, + "loss": 0.0094, + "num_input_tokens_seen": 26032416, + "step": 123355 + }, + { + "epoch": 13.570957095709572, + "grad_norm": 0.03549809008836746, + "learning_rate": 1.4155192529154965e-05, + "loss": 0.0231, + "num_input_tokens_seen": 26033472, + "step": 123360 + }, + { + "epoch": 13.571507150715071, + "grad_norm": 0.1486770659685135, + "learning_rate": 1.4153030087155928e-05, + "loss": 0.0073, + "num_input_tokens_seen": 26034528, + "step": 123365 + }, + { + "epoch": 13.572057205720572, + "grad_norm": 0.0021410146728157997, + "learning_rate": 1.415086774512821e-05, + "loss": 0.0093, + "num_input_tokens_seen": 26035584, + "step": 123370 + }, + { + "epoch": 13.572607260726073, + "grad_norm": 0.0931730717420578, + "learning_rate": 1.414870550309173e-05, + "loss": 0.0018, + "num_input_tokens_seen": 26036608, + "step": 123375 + }, + { + "epoch": 13.573157315731573, + "grad_norm": 0.9829291105270386, + "learning_rate": 1.4146543361066422e-05, + "loss": 0.0053, + "num_input_tokens_seen": 26037696, + "step": 123380 + }, + { + "epoch": 13.573707370737074, + "grad_norm": 2.341125249862671, + "learning_rate": 1.4144381319072216e-05, + "loss": 0.1048, + "num_input_tokens_seen": 26038752, + "step": 123385 + }, + { + "epoch": 13.574257425742575, + "grad_norm": 0.023000990971922874, + "learning_rate": 1.4142219377129052e-05, + "loss": 0.0212, + "num_input_tokens_seen": 26039840, + "step": 123390 + }, + { + "epoch": 13.574807480748074, + "grad_norm": 0.6247023344039917, + "learning_rate": 1.4140057535256837e-05, + "loss": 0.009, + "num_input_tokens_seen": 26040896, + "step": 123395 + }, + { + "epoch": 13.575357535753575, + "grad_norm": 0.009549139998853207, + "learning_rate": 1.4137895793475491e-05, + "loss": 0.0005, + "num_input_tokens_seen": 26041984, + "step": 123400 + }, + { + "epoch": 13.575907590759076, + "grad_norm": 0.05642047896981239, + "learning_rate": 1.413573415180496e-05, + "loss": 0.0057, + "num_input_tokens_seen": 26043040, + "step": 123405 + }, + { + "epoch": 13.576457645764577, + "grad_norm": 0.01021883636713028, + "learning_rate": 1.4133572610265144e-05, + "loss": 0.0091, + "num_input_tokens_seen": 26044064, + "step": 123410 + }, + { + "epoch": 13.577007700770077, + "grad_norm": 0.1194392517209053, + "learning_rate": 1.4131411168875974e-05, + "loss": 0.0037, + "num_input_tokens_seen": 26045152, + "step": 123415 + }, + { + "epoch": 13.577557755775578, + "grad_norm": 0.008286423981189728, + "learning_rate": 1.4129249827657381e-05, + "loss": 0.0521, + "num_input_tokens_seen": 26046144, + "step": 123420 + }, + { + "epoch": 13.578107810781079, + "grad_norm": 0.16619500517845154, + "learning_rate": 1.4127088586629267e-05, + "loss": 0.0044, + "num_input_tokens_seen": 26047232, + "step": 123425 + }, + { + "epoch": 13.578657865786578, + "grad_norm": 2.0762040615081787, + "learning_rate": 1.412492744581157e-05, + "loss": 0.0875, + "num_input_tokens_seen": 26048352, + "step": 123430 + }, + { + "epoch": 13.57920792079208, + "grad_norm": 0.03315557911992073, + "learning_rate": 1.412276640522419e-05, + "loss": 0.0127, + "num_input_tokens_seen": 26049376, + "step": 123435 + }, + { + "epoch": 13.57975797579758, + "grad_norm": 0.0324474461376667, + "learning_rate": 1.4120605464887052e-05, + "loss": 0.1447, + "num_input_tokens_seen": 26050368, + "step": 123440 + }, + { + "epoch": 13.58030803080308, + "grad_norm": 0.01234036311507225, + "learning_rate": 1.4118444624820082e-05, + "loss": 0.0021, + "num_input_tokens_seen": 26051424, + "step": 123445 + }, + { + "epoch": 13.58085808580858, + "grad_norm": 0.055763985961675644, + "learning_rate": 1.4116283885043174e-05, + "loss": 0.0351, + "num_input_tokens_seen": 26052448, + "step": 123450 + }, + { + "epoch": 13.581408140814082, + "grad_norm": 0.08760808408260345, + "learning_rate": 1.4114123245576272e-05, + "loss": 0.164, + "num_input_tokens_seen": 26053536, + "step": 123455 + }, + { + "epoch": 13.581958195819581, + "grad_norm": 1.2279001474380493, + "learning_rate": 1.4111962706439258e-05, + "loss": 0.015, + "num_input_tokens_seen": 26054624, + "step": 123460 + }, + { + "epoch": 13.582508250825082, + "grad_norm": 0.030855117365717888, + "learning_rate": 1.4109802267652066e-05, + "loss": 0.0273, + "num_input_tokens_seen": 26055648, + "step": 123465 + }, + { + "epoch": 13.583058305830583, + "grad_norm": 0.009207837283611298, + "learning_rate": 1.4107641929234594e-05, + "loss": 0.033, + "num_input_tokens_seen": 26056704, + "step": 123470 + }, + { + "epoch": 13.583608360836084, + "grad_norm": 0.05951971560716629, + "learning_rate": 1.4105481691206763e-05, + "loss": 0.0185, + "num_input_tokens_seen": 26057760, + "step": 123475 + }, + { + "epoch": 13.584158415841584, + "grad_norm": 1.802933692932129, + "learning_rate": 1.4103321553588483e-05, + "loss": 0.0626, + "num_input_tokens_seen": 26058912, + "step": 123480 + }, + { + "epoch": 13.584708470847085, + "grad_norm": 0.07284820079803467, + "learning_rate": 1.4101161516399655e-05, + "loss": 0.0242, + "num_input_tokens_seen": 26059904, + "step": 123485 + }, + { + "epoch": 13.585258525852586, + "grad_norm": 0.11476761102676392, + "learning_rate": 1.4099001579660199e-05, + "loss": 0.0128, + "num_input_tokens_seen": 26060992, + "step": 123490 + }, + { + "epoch": 13.585808580858085, + "grad_norm": 0.3323221802711487, + "learning_rate": 1.4096841743390005e-05, + "loss": 0.0038, + "num_input_tokens_seen": 26062080, + "step": 123495 + }, + { + "epoch": 13.586358635863586, + "grad_norm": 0.1787508875131607, + "learning_rate": 1.409468200760899e-05, + "loss": 0.0068, + "num_input_tokens_seen": 26063104, + "step": 123500 + }, + { + "epoch": 13.586908690869087, + "grad_norm": 2.2631146907806396, + "learning_rate": 1.4092522372337067e-05, + "loss": 0.0203, + "num_input_tokens_seen": 26064160, + "step": 123505 + }, + { + "epoch": 13.587458745874587, + "grad_norm": 0.01619253307580948, + "learning_rate": 1.4090362837594121e-05, + "loss": 0.0106, + "num_input_tokens_seen": 26065248, + "step": 123510 + }, + { + "epoch": 13.588008800880088, + "grad_norm": 0.006300663575530052, + "learning_rate": 1.4088203403400075e-05, + "loss": 0.0159, + "num_input_tokens_seen": 26066336, + "step": 123515 + }, + { + "epoch": 13.588558855885589, + "grad_norm": 0.2787490785121918, + "learning_rate": 1.4086044069774823e-05, + "loss": 0.0832, + "num_input_tokens_seen": 26067328, + "step": 123520 + }, + { + "epoch": 13.589108910891088, + "grad_norm": 0.05324747785925865, + "learning_rate": 1.4083884836738257e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26068320, + "step": 123525 + }, + { + "epoch": 13.58965896589659, + "grad_norm": 0.08736086636781693, + "learning_rate": 1.4081725704310284e-05, + "loss": 0.0219, + "num_input_tokens_seen": 26069376, + "step": 123530 + }, + { + "epoch": 13.59020902090209, + "grad_norm": 1.0456434488296509, + "learning_rate": 1.4079566672510808e-05, + "loss": 0.0459, + "num_input_tokens_seen": 26070400, + "step": 123535 + }, + { + "epoch": 13.590759075907592, + "grad_norm": 0.05119550600647926, + "learning_rate": 1.4077407741359738e-05, + "loss": 0.0761, + "num_input_tokens_seen": 26071392, + "step": 123540 + }, + { + "epoch": 13.591309130913091, + "grad_norm": 0.025519343093037605, + "learning_rate": 1.4075248910876953e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26072416, + "step": 123545 + }, + { + "epoch": 13.591859185918592, + "grad_norm": 0.1601230949163437, + "learning_rate": 1.4073090181082354e-05, + "loss": 0.0067, + "num_input_tokens_seen": 26073472, + "step": 123550 + }, + { + "epoch": 13.592409240924093, + "grad_norm": 3.689103841781616, + "learning_rate": 1.4070931551995836e-05, + "loss": 0.0437, + "num_input_tokens_seen": 26074560, + "step": 123555 + }, + { + "epoch": 13.592959295929592, + "grad_norm": 0.010368525981903076, + "learning_rate": 1.4068773023637308e-05, + "loss": 0.0064, + "num_input_tokens_seen": 26075616, + "step": 123560 + }, + { + "epoch": 13.593509350935093, + "grad_norm": 0.7528486847877502, + "learning_rate": 1.4066614596026644e-05, + "loss": 0.0087, + "num_input_tokens_seen": 26076640, + "step": 123565 + }, + { + "epoch": 13.594059405940595, + "grad_norm": 0.05748075619339943, + "learning_rate": 1.4064456269183757e-05, + "loss": 0.0182, + "num_input_tokens_seen": 26077696, + "step": 123570 + }, + { + "epoch": 13.594609460946094, + "grad_norm": 0.11848396062850952, + "learning_rate": 1.406229804312852e-05, + "loss": 0.0105, + "num_input_tokens_seen": 26078752, + "step": 123575 + }, + { + "epoch": 13.595159515951595, + "grad_norm": 0.0329892672598362, + "learning_rate": 1.4060139917880837e-05, + "loss": 0.0038, + "num_input_tokens_seen": 26079776, + "step": 123580 + }, + { + "epoch": 13.595709570957096, + "grad_norm": 1.2233555316925049, + "learning_rate": 1.4057981893460593e-05, + "loss": 0.0267, + "num_input_tokens_seen": 26080832, + "step": 123585 + }, + { + "epoch": 13.596259625962595, + "grad_norm": 0.06781340390443802, + "learning_rate": 1.4055823969887676e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26081920, + "step": 123590 + }, + { + "epoch": 13.596809680968097, + "grad_norm": 3.336688756942749, + "learning_rate": 1.4053666147181987e-05, + "loss": 0.0713, + "num_input_tokens_seen": 26082944, + "step": 123595 + }, + { + "epoch": 13.597359735973598, + "grad_norm": 0.0035194391384720802, + "learning_rate": 1.4051508425363396e-05, + "loss": 0.0462, + "num_input_tokens_seen": 26083936, + "step": 123600 + }, + { + "epoch": 13.597909790979099, + "grad_norm": 0.026595216244459152, + "learning_rate": 1.4049350804451811e-05, + "loss": 0.0198, + "num_input_tokens_seen": 26084960, + "step": 123605 + }, + { + "epoch": 13.598459845984598, + "grad_norm": 0.04288181662559509, + "learning_rate": 1.4047193284467092e-05, + "loss": 0.0075, + "num_input_tokens_seen": 26086048, + "step": 123610 + }, + { + "epoch": 13.599009900990099, + "grad_norm": 0.058576054871082306, + "learning_rate": 1.4045035865429135e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26087040, + "step": 123615 + }, + { + "epoch": 13.5995599559956, + "grad_norm": 1.5123190879821777, + "learning_rate": 1.4042878547357841e-05, + "loss": 0.0336, + "num_input_tokens_seen": 26088128, + "step": 123620 + }, + { + "epoch": 13.6001100110011, + "grad_norm": 1.1752843856811523, + "learning_rate": 1.4040721330273062e-05, + "loss": 0.113, + "num_input_tokens_seen": 26089152, + "step": 123625 + }, + { + "epoch": 13.6006600660066, + "grad_norm": 0.027694018557667732, + "learning_rate": 1.403856421419471e-05, + "loss": 0.0366, + "num_input_tokens_seen": 26090208, + "step": 123630 + }, + { + "epoch": 13.601210121012102, + "grad_norm": 1.2263531684875488, + "learning_rate": 1.4036407199142647e-05, + "loss": 0.0075, + "num_input_tokens_seen": 26091232, + "step": 123635 + }, + { + "epoch": 13.601760176017601, + "grad_norm": 1.0649125576019287, + "learning_rate": 1.4034250285136752e-05, + "loss": 0.0834, + "num_input_tokens_seen": 26092320, + "step": 123640 + }, + { + "epoch": 13.602310231023102, + "grad_norm": 0.1423913985490799, + "learning_rate": 1.403209347219691e-05, + "loss": 0.0075, + "num_input_tokens_seen": 26093440, + "step": 123645 + }, + { + "epoch": 13.602860286028603, + "grad_norm": 4.524352550506592, + "learning_rate": 1.4029936760342999e-05, + "loss": 0.0808, + "num_input_tokens_seen": 26094528, + "step": 123650 + }, + { + "epoch": 13.603410341034103, + "grad_norm": 0.08190812170505524, + "learning_rate": 1.4027780149594907e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26095552, + "step": 123655 + }, + { + "epoch": 13.603960396039604, + "grad_norm": 0.40067189931869507, + "learning_rate": 1.4025623639972502e-05, + "loss": 0.0412, + "num_input_tokens_seen": 26096544, + "step": 123660 + }, + { + "epoch": 13.604510451045105, + "grad_norm": 0.05783018469810486, + "learning_rate": 1.4023467231495646e-05, + "loss": 0.0132, + "num_input_tokens_seen": 26097632, + "step": 123665 + }, + { + "epoch": 13.605060506050606, + "grad_norm": 0.014213393442332745, + "learning_rate": 1.4021310924184223e-05, + "loss": 0.0613, + "num_input_tokens_seen": 26098656, + "step": 123670 + }, + { + "epoch": 13.605610561056105, + "grad_norm": 0.01332093309611082, + "learning_rate": 1.4019154718058114e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26099776, + "step": 123675 + }, + { + "epoch": 13.606160616061606, + "grad_norm": 0.02478375844657421, + "learning_rate": 1.4016998613137194e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26100864, + "step": 123680 + }, + { + "epoch": 13.606710671067107, + "grad_norm": 2.0330417156219482, + "learning_rate": 1.4014842609441325e-05, + "loss": 0.0766, + "num_input_tokens_seen": 26101888, + "step": 123685 + }, + { + "epoch": 13.607260726072607, + "grad_norm": 0.2445906102657318, + "learning_rate": 1.4012686706990375e-05, + "loss": 0.0244, + "num_input_tokens_seen": 26102944, + "step": 123690 + }, + { + "epoch": 13.607810781078108, + "grad_norm": 0.21622677147388458, + "learning_rate": 1.4010530905804228e-05, + "loss": 0.0152, + "num_input_tokens_seen": 26104000, + "step": 123695 + }, + { + "epoch": 13.608360836083609, + "grad_norm": 3.5858657360076904, + "learning_rate": 1.4008375205902735e-05, + "loss": 0.1451, + "num_input_tokens_seen": 26105056, + "step": 123700 + }, + { + "epoch": 13.608910891089108, + "grad_norm": 0.023548424243927002, + "learning_rate": 1.4006219607305774e-05, + "loss": 0.017, + "num_input_tokens_seen": 26106176, + "step": 123705 + }, + { + "epoch": 13.60946094609461, + "grad_norm": 0.06332296133041382, + "learning_rate": 1.4004064110033222e-05, + "loss": 0.002, + "num_input_tokens_seen": 26107200, + "step": 123710 + }, + { + "epoch": 13.61001100110011, + "grad_norm": 0.055564746260643005, + "learning_rate": 1.4001908714104922e-05, + "loss": 0.0053, + "num_input_tokens_seen": 26108224, + "step": 123715 + }, + { + "epoch": 13.61056105610561, + "grad_norm": 0.035583265125751495, + "learning_rate": 1.3999753419540763e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26109280, + "step": 123720 + }, + { + "epoch": 13.61111111111111, + "grad_norm": 0.06958147883415222, + "learning_rate": 1.3997598226360586e-05, + "loss": 0.0055, + "num_input_tokens_seen": 26110240, + "step": 123725 + }, + { + "epoch": 13.611661166116612, + "grad_norm": 0.026745272800326347, + "learning_rate": 1.3995443134584268e-05, + "loss": 0.1379, + "num_input_tokens_seen": 26111296, + "step": 123730 + }, + { + "epoch": 13.612211221122113, + "grad_norm": 0.014711901545524597, + "learning_rate": 1.3993288144231684e-05, + "loss": 0.044, + "num_input_tokens_seen": 26112448, + "step": 123735 + }, + { + "epoch": 13.612761276127612, + "grad_norm": 0.003223478328436613, + "learning_rate": 1.3991133255322669e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26113472, + "step": 123740 + }, + { + "epoch": 13.613311331133113, + "grad_norm": 0.02526228129863739, + "learning_rate": 1.3988978467877106e-05, + "loss": 0.0074, + "num_input_tokens_seen": 26114592, + "step": 123745 + }, + { + "epoch": 13.613861386138614, + "grad_norm": 0.09690773487091064, + "learning_rate": 1.3986823781914842e-05, + "loss": 0.0048, + "num_input_tokens_seen": 26115584, + "step": 123750 + }, + { + "epoch": 13.614411441144114, + "grad_norm": 0.03015417978167534, + "learning_rate": 1.3984669197455735e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26116608, + "step": 123755 + }, + { + "epoch": 13.614961496149615, + "grad_norm": 0.32657498121261597, + "learning_rate": 1.398251471451964e-05, + "loss": 0.0093, + "num_input_tokens_seen": 26117760, + "step": 123760 + }, + { + "epoch": 13.615511551155116, + "grad_norm": 0.008303504437208176, + "learning_rate": 1.3980360333126425e-05, + "loss": 0.0156, + "num_input_tokens_seen": 26118784, + "step": 123765 + }, + { + "epoch": 13.616061606160617, + "grad_norm": 0.5245017409324646, + "learning_rate": 1.3978206053295951e-05, + "loss": 0.0467, + "num_input_tokens_seen": 26119840, + "step": 123770 + }, + { + "epoch": 13.616611661166116, + "grad_norm": 0.040555089712142944, + "learning_rate": 1.3976051875048058e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26120864, + "step": 123775 + }, + { + "epoch": 13.617161716171617, + "grad_norm": 0.03127053380012512, + "learning_rate": 1.3973897798402599e-05, + "loss": 0.0359, + "num_input_tokens_seen": 26121952, + "step": 123780 + }, + { + "epoch": 13.617711771177119, + "grad_norm": 0.0077400109730660915, + "learning_rate": 1.397174382337944e-05, + "loss": 0.0042, + "num_input_tokens_seen": 26123040, + "step": 123785 + }, + { + "epoch": 13.618261826182618, + "grad_norm": 0.03008793294429779, + "learning_rate": 1.3969589949998412e-05, + "loss": 0.0009, + "num_input_tokens_seen": 26124096, + "step": 123790 + }, + { + "epoch": 13.618811881188119, + "grad_norm": 0.017892051488161087, + "learning_rate": 1.3967436178279396e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26125152, + "step": 123795 + }, + { + "epoch": 13.61936193619362, + "grad_norm": 0.01417851448059082, + "learning_rate": 1.3965282508242227e-05, + "loss": 0.0014, + "num_input_tokens_seen": 26126208, + "step": 123800 + }, + { + "epoch": 13.61991199119912, + "grad_norm": 2.374643325805664, + "learning_rate": 1.3963128939906744e-05, + "loss": 0.1463, + "num_input_tokens_seen": 26127232, + "step": 123805 + }, + { + "epoch": 13.62046204620462, + "grad_norm": 0.43544015288352966, + "learning_rate": 1.396097547329282e-05, + "loss": 0.0625, + "num_input_tokens_seen": 26128320, + "step": 123810 + }, + { + "epoch": 13.621012101210122, + "grad_norm": 3.773355007171631, + "learning_rate": 1.3958822108420273e-05, + "loss": 0.0585, + "num_input_tokens_seen": 26129408, + "step": 123815 + }, + { + "epoch": 13.62156215621562, + "grad_norm": 0.03288428112864494, + "learning_rate": 1.3956668845308968e-05, + "loss": 0.0094, + "num_input_tokens_seen": 26130496, + "step": 123820 + }, + { + "epoch": 13.622112211221122, + "grad_norm": 0.011801742017269135, + "learning_rate": 1.3954515683978754e-05, + "loss": 0.0157, + "num_input_tokens_seen": 26131552, + "step": 123825 + }, + { + "epoch": 13.622662266226623, + "grad_norm": 0.18859682977199554, + "learning_rate": 1.3952362624449461e-05, + "loss": 0.0024, + "num_input_tokens_seen": 26132576, + "step": 123830 + }, + { + "epoch": 13.623212321232124, + "grad_norm": 0.012820371426641941, + "learning_rate": 1.3950209666740948e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26133632, + "step": 123835 + }, + { + "epoch": 13.623762376237623, + "grad_norm": 0.058433957397937775, + "learning_rate": 1.3948056810873045e-05, + "loss": 0.0273, + "num_input_tokens_seen": 26134688, + "step": 123840 + }, + { + "epoch": 13.624312431243125, + "grad_norm": 0.028468841686844826, + "learning_rate": 1.3945904056865605e-05, + "loss": 0.0166, + "num_input_tokens_seen": 26135776, + "step": 123845 + }, + { + "epoch": 13.624862486248626, + "grad_norm": 0.35525935888290405, + "learning_rate": 1.3943751404738454e-05, + "loss": 0.0134, + "num_input_tokens_seen": 26136864, + "step": 123850 + }, + { + "epoch": 13.625412541254125, + "grad_norm": 0.05785655602812767, + "learning_rate": 1.394159885451144e-05, + "loss": 0.0056, + "num_input_tokens_seen": 26137888, + "step": 123855 + }, + { + "epoch": 13.625962596259626, + "grad_norm": 0.07436251640319824, + "learning_rate": 1.3939446406204414e-05, + "loss": 0.0026, + "num_input_tokens_seen": 26138944, + "step": 123860 + }, + { + "epoch": 13.626512651265127, + "grad_norm": 1.541329026222229, + "learning_rate": 1.3937294059837194e-05, + "loss": 0.045, + "num_input_tokens_seen": 26139968, + "step": 123865 + }, + { + "epoch": 13.627062706270626, + "grad_norm": 0.005936745088547468, + "learning_rate": 1.3935141815429637e-05, + "loss": 0.0424, + "num_input_tokens_seen": 26141024, + "step": 123870 + }, + { + "epoch": 13.627612761276128, + "grad_norm": 0.06138753518462181, + "learning_rate": 1.3932989673001556e-05, + "loss": 0.0471, + "num_input_tokens_seen": 26142016, + "step": 123875 + }, + { + "epoch": 13.628162816281629, + "grad_norm": 0.023167312145233154, + "learning_rate": 1.3930837632572799e-05, + "loss": 0.0081, + "num_input_tokens_seen": 26142976, + "step": 123880 + }, + { + "epoch": 13.628712871287128, + "grad_norm": 0.0043855756521224976, + "learning_rate": 1.3928685694163212e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26144064, + "step": 123885 + }, + { + "epoch": 13.629262926292629, + "grad_norm": 2.2380118370056152, + "learning_rate": 1.3926533857792607e-05, + "loss": 0.0895, + "num_input_tokens_seen": 26145120, + "step": 123890 + }, + { + "epoch": 13.62981298129813, + "grad_norm": 0.017343562096357346, + "learning_rate": 1.3924382123480833e-05, + "loss": 0.0018, + "num_input_tokens_seen": 26146208, + "step": 123895 + }, + { + "epoch": 13.630363036303631, + "grad_norm": 0.11222352832555771, + "learning_rate": 1.3922230491247712e-05, + "loss": 0.1096, + "num_input_tokens_seen": 26147264, + "step": 123900 + }, + { + "epoch": 13.63091309130913, + "grad_norm": 0.007537432014942169, + "learning_rate": 1.3920078961113065e-05, + "loss": 0.0019, + "num_input_tokens_seen": 26148320, + "step": 123905 + }, + { + "epoch": 13.631463146314632, + "grad_norm": 0.014970621094107628, + "learning_rate": 1.3917927533096748e-05, + "loss": 0.1587, + "num_input_tokens_seen": 26149376, + "step": 123910 + }, + { + "epoch": 13.632013201320133, + "grad_norm": 0.044938646256923676, + "learning_rate": 1.3915776207218565e-05, + "loss": 0.0435, + "num_input_tokens_seen": 26150432, + "step": 123915 + }, + { + "epoch": 13.632563256325632, + "grad_norm": 0.06820040196180344, + "learning_rate": 1.3913624983498363e-05, + "loss": 0.0884, + "num_input_tokens_seen": 26151520, + "step": 123920 + }, + { + "epoch": 13.633113311331133, + "grad_norm": 0.3666684329509735, + "learning_rate": 1.3911473861955965e-05, + "loss": 0.0073, + "num_input_tokens_seen": 26152544, + "step": 123925 + }, + { + "epoch": 13.633663366336634, + "grad_norm": 2.9724509716033936, + "learning_rate": 1.3909322842611177e-05, + "loss": 0.0173, + "num_input_tokens_seen": 26153536, + "step": 123930 + }, + { + "epoch": 13.634213421342134, + "grad_norm": 0.008800112642347813, + "learning_rate": 1.3907171925483841e-05, + "loss": 0.0061, + "num_input_tokens_seen": 26154592, + "step": 123935 + }, + { + "epoch": 13.634763476347635, + "grad_norm": 0.13538934290409088, + "learning_rate": 1.3905021110593786e-05, + "loss": 0.0059, + "num_input_tokens_seen": 26155616, + "step": 123940 + }, + { + "epoch": 13.635313531353136, + "grad_norm": 0.006306246854364872, + "learning_rate": 1.390287039796082e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26156672, + "step": 123945 + }, + { + "epoch": 13.635863586358635, + "grad_norm": 0.009092793799936771, + "learning_rate": 1.3900719787604783e-05, + "loss": 0.0019, + "num_input_tokens_seen": 26157792, + "step": 123950 + }, + { + "epoch": 13.636413641364136, + "grad_norm": 0.025644488632678986, + "learning_rate": 1.3898569279545473e-05, + "loss": 0.0692, + "num_input_tokens_seen": 26158848, + "step": 123955 + }, + { + "epoch": 13.636963696369637, + "grad_norm": 0.014650820754468441, + "learning_rate": 1.3896418873802736e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26159840, + "step": 123960 + }, + { + "epoch": 13.637513751375138, + "grad_norm": 0.134572371840477, + "learning_rate": 1.3894268570396365e-05, + "loss": 0.0127, + "num_input_tokens_seen": 26160864, + "step": 123965 + }, + { + "epoch": 13.638063806380638, + "grad_norm": 0.1287684440612793, + "learning_rate": 1.3892118369346194e-05, + "loss": 0.0168, + "num_input_tokens_seen": 26161952, + "step": 123970 + }, + { + "epoch": 13.638613861386139, + "grad_norm": 0.0348961316049099, + "learning_rate": 1.3889968270672046e-05, + "loss": 0.0014, + "num_input_tokens_seen": 26163040, + "step": 123975 + }, + { + "epoch": 13.63916391639164, + "grad_norm": 4.052585124969482, + "learning_rate": 1.3887818274393722e-05, + "loss": 0.078, + "num_input_tokens_seen": 26164096, + "step": 123980 + }, + { + "epoch": 13.63971397139714, + "grad_norm": 0.011567998677492142, + "learning_rate": 1.3885668380531053e-05, + "loss": 0.017, + "num_input_tokens_seen": 26165152, + "step": 123985 + }, + { + "epoch": 13.64026402640264, + "grad_norm": 0.6797411441802979, + "learning_rate": 1.3883518589103833e-05, + "loss": 0.0193, + "num_input_tokens_seen": 26166176, + "step": 123990 + }, + { + "epoch": 13.640814081408141, + "grad_norm": 2.197593927383423, + "learning_rate": 1.388136890013189e-05, + "loss": 0.0872, + "num_input_tokens_seen": 26167328, + "step": 123995 + }, + { + "epoch": 13.64136413641364, + "grad_norm": 0.034242283552885056, + "learning_rate": 1.3879219313635048e-05, + "loss": 0.0009, + "num_input_tokens_seen": 26168352, + "step": 124000 + }, + { + "epoch": 13.641914191419142, + "grad_norm": 0.014407981187105179, + "learning_rate": 1.3877069829633093e-05, + "loss": 0.0944, + "num_input_tokens_seen": 26169408, + "step": 124005 + }, + { + "epoch": 13.642464246424643, + "grad_norm": 1.0769602060317993, + "learning_rate": 1.3874920448145857e-05, + "loss": 0.0124, + "num_input_tokens_seen": 26170432, + "step": 124010 + }, + { + "epoch": 13.643014301430142, + "grad_norm": 0.04987417161464691, + "learning_rate": 1.3872771169193142e-05, + "loss": 0.0852, + "num_input_tokens_seen": 26171520, + "step": 124015 + }, + { + "epoch": 13.643564356435643, + "grad_norm": 0.2120146006345749, + "learning_rate": 1.3870621992794744e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26172640, + "step": 124020 + }, + { + "epoch": 13.644114411441144, + "grad_norm": 0.011838747188448906, + "learning_rate": 1.3868472918970484e-05, + "loss": 0.0793, + "num_input_tokens_seen": 26173696, + "step": 124025 + }, + { + "epoch": 13.644664466446645, + "grad_norm": 4.232265472412109, + "learning_rate": 1.3866323947740168e-05, + "loss": 0.0364, + "num_input_tokens_seen": 26174752, + "step": 124030 + }, + { + "epoch": 13.645214521452145, + "grad_norm": 0.004904808010905981, + "learning_rate": 1.386417507912361e-05, + "loss": 0.0499, + "num_input_tokens_seen": 26175744, + "step": 124035 + }, + { + "epoch": 13.645764576457646, + "grad_norm": 0.00443270243704319, + "learning_rate": 1.3862026313140608e-05, + "loss": 0.0061, + "num_input_tokens_seen": 26176864, + "step": 124040 + }, + { + "epoch": 13.646314631463147, + "grad_norm": 0.007907596416771412, + "learning_rate": 1.3859877649810956e-05, + "loss": 0.1234, + "num_input_tokens_seen": 26178016, + "step": 124045 + }, + { + "epoch": 13.646864686468646, + "grad_norm": 0.030709629878401756, + "learning_rate": 1.3857729089154464e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26179072, + "step": 124050 + }, + { + "epoch": 13.647414741474147, + "grad_norm": 0.007741543930023909, + "learning_rate": 1.385558063119094e-05, + "loss": 0.0042, + "num_input_tokens_seen": 26180128, + "step": 124055 + }, + { + "epoch": 13.647964796479648, + "grad_norm": 0.006227131001651287, + "learning_rate": 1.3853432275940188e-05, + "loss": 0.0075, + "num_input_tokens_seen": 26181184, + "step": 124060 + }, + { + "epoch": 13.648514851485148, + "grad_norm": 0.18224304914474487, + "learning_rate": 1.3851284023422005e-05, + "loss": 0.0326, + "num_input_tokens_seen": 26182176, + "step": 124065 + }, + { + "epoch": 13.649064906490649, + "grad_norm": 0.3018646836280823, + "learning_rate": 1.3849135873656177e-05, + "loss": 0.0102, + "num_input_tokens_seen": 26183264, + "step": 124070 + }, + { + "epoch": 13.64961496149615, + "grad_norm": 0.03745873272418976, + "learning_rate": 1.3846987826662522e-05, + "loss": 0.0586, + "num_input_tokens_seen": 26184320, + "step": 124075 + }, + { + "epoch": 13.65016501650165, + "grad_norm": 0.06153426691889763, + "learning_rate": 1.3844839882460817e-05, + "loss": 0.0082, + "num_input_tokens_seen": 26185376, + "step": 124080 + }, + { + "epoch": 13.65071507150715, + "grad_norm": 0.006290742661803961, + "learning_rate": 1.3842692041070871e-05, + "loss": 0.0047, + "num_input_tokens_seen": 26186368, + "step": 124085 + }, + { + "epoch": 13.651265126512651, + "grad_norm": 0.010329836048185825, + "learning_rate": 1.3840544302512487e-05, + "loss": 0.0087, + "num_input_tokens_seen": 26187456, + "step": 124090 + }, + { + "epoch": 13.651815181518153, + "grad_norm": 0.04209323972463608, + "learning_rate": 1.3838396666805444e-05, + "loss": 0.01, + "num_input_tokens_seen": 26188448, + "step": 124095 + }, + { + "epoch": 13.652365236523652, + "grad_norm": 0.07264495640993118, + "learning_rate": 1.3836249133969554e-05, + "loss": 0.0478, + "num_input_tokens_seen": 26189536, + "step": 124100 + }, + { + "epoch": 13.652915291529153, + "grad_norm": 0.061751313507556915, + "learning_rate": 1.3834101704024588e-05, + "loss": 0.0594, + "num_input_tokens_seen": 26190656, + "step": 124105 + }, + { + "epoch": 13.653465346534654, + "grad_norm": 0.014312967658042908, + "learning_rate": 1.3831954376990346e-05, + "loss": 0.086, + "num_input_tokens_seen": 26191776, + "step": 124110 + }, + { + "epoch": 13.654015401540153, + "grad_norm": 0.010011821053922176, + "learning_rate": 1.3829807152886633e-05, + "loss": 0.08, + "num_input_tokens_seen": 26192832, + "step": 124115 + }, + { + "epoch": 13.654565456545654, + "grad_norm": 3.4124600887298584, + "learning_rate": 1.3827660031733219e-05, + "loss": 0.0734, + "num_input_tokens_seen": 26193856, + "step": 124120 + }, + { + "epoch": 13.655115511551156, + "grad_norm": 0.013745645992457867, + "learning_rate": 1.3825513013549912e-05, + "loss": 0.0251, + "num_input_tokens_seen": 26194912, + "step": 124125 + }, + { + "epoch": 13.655665566556655, + "grad_norm": 0.0379551537334919, + "learning_rate": 1.3823366098356487e-05, + "loss": 0.0997, + "num_input_tokens_seen": 26195968, + "step": 124130 + }, + { + "epoch": 13.656215621562156, + "grad_norm": 0.05512149631977081, + "learning_rate": 1.3821219286172727e-05, + "loss": 0.0078, + "num_input_tokens_seen": 26196960, + "step": 124135 + }, + { + "epoch": 13.656765676567657, + "grad_norm": 0.46487170457839966, + "learning_rate": 1.3819072577018422e-05, + "loss": 0.0081, + "num_input_tokens_seen": 26198048, + "step": 124140 + }, + { + "epoch": 13.657315731573158, + "grad_norm": 0.014842482283711433, + "learning_rate": 1.3816925970913359e-05, + "loss": 0.0012, + "num_input_tokens_seen": 26199072, + "step": 124145 + }, + { + "epoch": 13.657865786578657, + "grad_norm": 1.1882761716842651, + "learning_rate": 1.3814779467877333e-05, + "loss": 0.0791, + "num_input_tokens_seen": 26200192, + "step": 124150 + }, + { + "epoch": 13.658415841584159, + "grad_norm": 0.00460272328928113, + "learning_rate": 1.381263306793012e-05, + "loss": 0.0637, + "num_input_tokens_seen": 26201248, + "step": 124155 + }, + { + "epoch": 13.65896589658966, + "grad_norm": 0.8352373838424683, + "learning_rate": 1.3810486771091488e-05, + "loss": 0.01, + "num_input_tokens_seen": 26202368, + "step": 124160 + }, + { + "epoch": 13.659515951595159, + "grad_norm": 0.7682784795761108, + "learning_rate": 1.3808340577381229e-05, + "loss": 0.0125, + "num_input_tokens_seen": 26203424, + "step": 124165 + }, + { + "epoch": 13.66006600660066, + "grad_norm": 0.009931869804859161, + "learning_rate": 1.3806194486819129e-05, + "loss": 0.006, + "num_input_tokens_seen": 26204512, + "step": 124170 + }, + { + "epoch": 13.660616061606161, + "grad_norm": 2.1049275398254395, + "learning_rate": 1.3804048499424971e-05, + "loss": 0.0531, + "num_input_tokens_seen": 26205600, + "step": 124175 + }, + { + "epoch": 13.66116611661166, + "grad_norm": 0.03857802599668503, + "learning_rate": 1.3801902615218527e-05, + "loss": 0.1134, + "num_input_tokens_seen": 26206560, + "step": 124180 + }, + { + "epoch": 13.661716171617162, + "grad_norm": 0.02290601097047329, + "learning_rate": 1.3799756834219557e-05, + "loss": 0.0097, + "num_input_tokens_seen": 26207616, + "step": 124185 + }, + { + "epoch": 13.662266226622663, + "grad_norm": 0.06778721511363983, + "learning_rate": 1.3797611156447871e-05, + "loss": 0.0379, + "num_input_tokens_seen": 26208736, + "step": 124190 + }, + { + "epoch": 13.662816281628164, + "grad_norm": 0.09403432160615921, + "learning_rate": 1.3795465581923215e-05, + "loss": 0.0044, + "num_input_tokens_seen": 26209728, + "step": 124195 + }, + { + "epoch": 13.663366336633663, + "grad_norm": 0.028229767456650734, + "learning_rate": 1.3793320110665375e-05, + "loss": 0.0057, + "num_input_tokens_seen": 26210848, + "step": 124200 + }, + { + "epoch": 13.663916391639164, + "grad_norm": 2.2616331577301025, + "learning_rate": 1.3791174742694134e-05, + "loss": 0.1078, + "num_input_tokens_seen": 26211872, + "step": 124205 + }, + { + "epoch": 13.664466446644665, + "grad_norm": 2.4982049465179443, + "learning_rate": 1.3789029478029253e-05, + "loss": 0.1036, + "num_input_tokens_seen": 26212928, + "step": 124210 + }, + { + "epoch": 13.665016501650165, + "grad_norm": 0.04361322522163391, + "learning_rate": 1.3786884316690516e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26213920, + "step": 124215 + }, + { + "epoch": 13.665566556655666, + "grad_norm": 0.003092855215072632, + "learning_rate": 1.3784739258697674e-05, + "loss": 0.0061, + "num_input_tokens_seen": 26214912, + "step": 124220 + }, + { + "epoch": 13.666116611661167, + "grad_norm": 0.06394913047552109, + "learning_rate": 1.378259430407051e-05, + "loss": 0.0036, + "num_input_tokens_seen": 26215968, + "step": 124225 + }, + { + "epoch": 13.666666666666666, + "grad_norm": 0.008389786817133427, + "learning_rate": 1.3780449452828798e-05, + "loss": 0.0034, + "num_input_tokens_seen": 26216992, + "step": 124230 + }, + { + "epoch": 13.667216721672167, + "grad_norm": 0.011750208213925362, + "learning_rate": 1.3778304704992296e-05, + "loss": 0.0235, + "num_input_tokens_seen": 26218016, + "step": 124235 + }, + { + "epoch": 13.667766776677668, + "grad_norm": 0.016165290027856827, + "learning_rate": 1.3776160060580778e-05, + "loss": 0.0185, + "num_input_tokens_seen": 26219040, + "step": 124240 + }, + { + "epoch": 13.668316831683168, + "grad_norm": 0.14882104098796844, + "learning_rate": 1.3774015519614003e-05, + "loss": 0.0489, + "num_input_tokens_seen": 26220096, + "step": 124245 + }, + { + "epoch": 13.668866886688669, + "grad_norm": 0.05671907588839531, + "learning_rate": 1.3771871082111748e-05, + "loss": 0.0969, + "num_input_tokens_seen": 26221120, + "step": 124250 + }, + { + "epoch": 13.66941694169417, + "grad_norm": 0.24392619729042053, + "learning_rate": 1.3769726748093759e-05, + "loss": 0.0863, + "num_input_tokens_seen": 26222176, + "step": 124255 + }, + { + "epoch": 13.66996699669967, + "grad_norm": 0.030140750110149384, + "learning_rate": 1.3767582517579809e-05, + "loss": 0.0541, + "num_input_tokens_seen": 26223232, + "step": 124260 + }, + { + "epoch": 13.67051705170517, + "grad_norm": 1.6447460651397705, + "learning_rate": 1.376543839058967e-05, + "loss": 0.0497, + "num_input_tokens_seen": 26224256, + "step": 124265 + }, + { + "epoch": 13.671067106710671, + "grad_norm": 2.133432626724243, + "learning_rate": 1.3763294367143086e-05, + "loss": 0.1336, + "num_input_tokens_seen": 26225344, + "step": 124270 + }, + { + "epoch": 13.671617161716172, + "grad_norm": 0.014637826941907406, + "learning_rate": 1.3761150447259838e-05, + "loss": 0.0082, + "num_input_tokens_seen": 26226432, + "step": 124275 + }, + { + "epoch": 13.672167216721672, + "grad_norm": 0.5733208656311035, + "learning_rate": 1.3759006630959662e-05, + "loss": 0.0081, + "num_input_tokens_seen": 26227456, + "step": 124280 + }, + { + "epoch": 13.672717271727173, + "grad_norm": 0.368213951587677, + "learning_rate": 1.3756862918262326e-05, + "loss": 0.0657, + "num_input_tokens_seen": 26228480, + "step": 124285 + }, + { + "epoch": 13.673267326732674, + "grad_norm": 2.0737085342407227, + "learning_rate": 1.3754719309187603e-05, + "loss": 0.0597, + "num_input_tokens_seen": 26229504, + "step": 124290 + }, + { + "epoch": 13.673817381738173, + "grad_norm": 0.014912738464772701, + "learning_rate": 1.3752575803755225e-05, + "loss": 0.0022, + "num_input_tokens_seen": 26230592, + "step": 124295 + }, + { + "epoch": 13.674367436743674, + "grad_norm": 0.0308084674179554, + "learning_rate": 1.3750432401984966e-05, + "loss": 0.0134, + "num_input_tokens_seen": 26231744, + "step": 124300 + }, + { + "epoch": 13.674917491749175, + "grad_norm": 0.004320845473557711, + "learning_rate": 1.3748289103896579e-05, + "loss": 0.0018, + "num_input_tokens_seen": 26232768, + "step": 124305 + }, + { + "epoch": 13.675467546754675, + "grad_norm": 0.049788281321525574, + "learning_rate": 1.3746145909509794e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26233760, + "step": 124310 + }, + { + "epoch": 13.676017601760176, + "grad_norm": 0.01961767114698887, + "learning_rate": 1.3744002818844387e-05, + "loss": 0.1512, + "num_input_tokens_seen": 26234880, + "step": 124315 + }, + { + "epoch": 13.676567656765677, + "grad_norm": 1.215973973274231, + "learning_rate": 1.3741859831920107e-05, + "loss": 0.0589, + "num_input_tokens_seen": 26236000, + "step": 124320 + }, + { + "epoch": 13.677117711771178, + "grad_norm": 0.019802534952759743, + "learning_rate": 1.3739716948756709e-05, + "loss": 0.0019, + "num_input_tokens_seen": 26237120, + "step": 124325 + }, + { + "epoch": 13.677667766776677, + "grad_norm": 1.0041180849075317, + "learning_rate": 1.373757416937394e-05, + "loss": 0.0339, + "num_input_tokens_seen": 26238208, + "step": 124330 + }, + { + "epoch": 13.678217821782178, + "grad_norm": 0.01784166879951954, + "learning_rate": 1.3735431493791532e-05, + "loss": 0.0465, + "num_input_tokens_seen": 26239296, + "step": 124335 + }, + { + "epoch": 13.67876787678768, + "grad_norm": 0.14572250843048096, + "learning_rate": 1.3733288922029248e-05, + "loss": 0.037, + "num_input_tokens_seen": 26240352, + "step": 124340 + }, + { + "epoch": 13.679317931793179, + "grad_norm": 0.03299808129668236, + "learning_rate": 1.3731146454106844e-05, + "loss": 0.0136, + "num_input_tokens_seen": 26241440, + "step": 124345 + }, + { + "epoch": 13.67986798679868, + "grad_norm": 0.08339163661003113, + "learning_rate": 1.3729004090044045e-05, + "loss": 0.002, + "num_input_tokens_seen": 26242496, + "step": 124350 + }, + { + "epoch": 13.680418041804181, + "grad_norm": 0.016177283599972725, + "learning_rate": 1.372686182986062e-05, + "loss": 0.0037, + "num_input_tokens_seen": 26243584, + "step": 124355 + }, + { + "epoch": 13.68096809680968, + "grad_norm": 0.22375816106796265, + "learning_rate": 1.3724719673576286e-05, + "loss": 0.0325, + "num_input_tokens_seen": 26244608, + "step": 124360 + }, + { + "epoch": 13.681518151815181, + "grad_norm": 0.54301917552948, + "learning_rate": 1.3722577621210814e-05, + "loss": 0.0882, + "num_input_tokens_seen": 26245664, + "step": 124365 + }, + { + "epoch": 13.682068206820682, + "grad_norm": 0.42169126868247986, + "learning_rate": 1.3720435672783922e-05, + "loss": 0.007, + "num_input_tokens_seen": 26246784, + "step": 124370 + }, + { + "epoch": 13.682618261826182, + "grad_norm": 0.033010151237249374, + "learning_rate": 1.371829382831536e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26247840, + "step": 124375 + }, + { + "epoch": 13.683168316831683, + "grad_norm": 0.018386751413345337, + "learning_rate": 1.371615208782488e-05, + "loss": 0.0038, + "num_input_tokens_seen": 26248864, + "step": 124380 + }, + { + "epoch": 13.683718371837184, + "grad_norm": 0.009621012955904007, + "learning_rate": 1.3714010451332204e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26249952, + "step": 124385 + }, + { + "epoch": 13.684268426842685, + "grad_norm": 0.011153480038046837, + "learning_rate": 1.3711868918857085e-05, + "loss": 0.0571, + "num_input_tokens_seen": 26251008, + "step": 124390 + }, + { + "epoch": 13.684818481848184, + "grad_norm": 0.1889835149049759, + "learning_rate": 1.3709727490419246e-05, + "loss": 0.0365, + "num_input_tokens_seen": 26252032, + "step": 124395 + }, + { + "epoch": 13.685368536853685, + "grad_norm": 0.011579680256545544, + "learning_rate": 1.3707586166038433e-05, + "loss": 0.0217, + "num_input_tokens_seen": 26253088, + "step": 124400 + }, + { + "epoch": 13.685918591859187, + "grad_norm": 0.024116652086377144, + "learning_rate": 1.3705444945734386e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26254144, + "step": 124405 + }, + { + "epoch": 13.686468646864686, + "grad_norm": 0.016233470290899277, + "learning_rate": 1.3703303829526826e-05, + "loss": 0.0065, + "num_input_tokens_seen": 26255200, + "step": 124410 + }, + { + "epoch": 13.687018701870187, + "grad_norm": 0.08871801942586899, + "learning_rate": 1.3701162817435503e-05, + "loss": 0.0546, + "num_input_tokens_seen": 26256288, + "step": 124415 + }, + { + "epoch": 13.687568756875688, + "grad_norm": 0.0734291523694992, + "learning_rate": 1.3699021909480136e-05, + "loss": 0.073, + "num_input_tokens_seen": 26257376, + "step": 124420 + }, + { + "epoch": 13.688118811881187, + "grad_norm": 0.007231613155454397, + "learning_rate": 1.369688110568046e-05, + "loss": 0.1176, + "num_input_tokens_seen": 26258432, + "step": 124425 + }, + { + "epoch": 13.688668866886688, + "grad_norm": 0.4315626323223114, + "learning_rate": 1.36947404060562e-05, + "loss": 0.0082, + "num_input_tokens_seen": 26259424, + "step": 124430 + }, + { + "epoch": 13.68921892189219, + "grad_norm": 0.08398903161287308, + "learning_rate": 1.3692599810627096e-05, + "loss": 0.044, + "num_input_tokens_seen": 26260416, + "step": 124435 + }, + { + "epoch": 13.689768976897689, + "grad_norm": 0.10659436881542206, + "learning_rate": 1.3690459319412885e-05, + "loss": 0.0327, + "num_input_tokens_seen": 26261472, + "step": 124440 + }, + { + "epoch": 13.69031903190319, + "grad_norm": 1.575243592262268, + "learning_rate": 1.368831893243328e-05, + "loss": 0.0067, + "num_input_tokens_seen": 26262528, + "step": 124445 + }, + { + "epoch": 13.690869086908691, + "grad_norm": 0.028944509103894234, + "learning_rate": 1.3686178649708004e-05, + "loss": 0.0204, + "num_input_tokens_seen": 26263584, + "step": 124450 + }, + { + "epoch": 13.691419141914192, + "grad_norm": 0.015305876731872559, + "learning_rate": 1.3684038471256787e-05, + "loss": 0.0146, + "num_input_tokens_seen": 26264672, + "step": 124455 + }, + { + "epoch": 13.691969196919691, + "grad_norm": 0.06229478120803833, + "learning_rate": 1.3681898397099362e-05, + "loss": 0.0299, + "num_input_tokens_seen": 26265792, + "step": 124460 + }, + { + "epoch": 13.692519251925193, + "grad_norm": 0.03191501647233963, + "learning_rate": 1.3679758427255457e-05, + "loss": 0.0595, + "num_input_tokens_seen": 26266848, + "step": 124465 + }, + { + "epoch": 13.693069306930694, + "grad_norm": 0.16004303097724915, + "learning_rate": 1.3677618561744787e-05, + "loss": 0.053, + "num_input_tokens_seen": 26267904, + "step": 124470 + }, + { + "epoch": 13.693619361936193, + "grad_norm": 0.039671890437603, + "learning_rate": 1.3675478800587062e-05, + "loss": 0.0079, + "num_input_tokens_seen": 26268960, + "step": 124475 + }, + { + "epoch": 13.694169416941694, + "grad_norm": 0.025699814781546593, + "learning_rate": 1.3673339143802027e-05, + "loss": 0.1172, + "num_input_tokens_seen": 26270048, + "step": 124480 + }, + { + "epoch": 13.694719471947195, + "grad_norm": 0.03360653668642044, + "learning_rate": 1.3671199591409379e-05, + "loss": 0.0451, + "num_input_tokens_seen": 26271168, + "step": 124485 + }, + { + "epoch": 13.695269526952695, + "grad_norm": 0.014559989795088768, + "learning_rate": 1.366906014342885e-05, + "loss": 0.0277, + "num_input_tokens_seen": 26272256, + "step": 124490 + }, + { + "epoch": 13.695819581958196, + "grad_norm": 0.03441258892416954, + "learning_rate": 1.3666920799880162e-05, + "loss": 0.0543, + "num_input_tokens_seen": 26273312, + "step": 124495 + }, + { + "epoch": 13.696369636963697, + "grad_norm": 0.011344675906002522, + "learning_rate": 1.3664781560783021e-05, + "loss": 0.0007, + "num_input_tokens_seen": 26274400, + "step": 124500 + }, + { + "epoch": 13.696919691969196, + "grad_norm": 0.016704337671399117, + "learning_rate": 1.3662642426157157e-05, + "loss": 0.0041, + "num_input_tokens_seen": 26275392, + "step": 124505 + }, + { + "epoch": 13.697469746974697, + "grad_norm": 0.12947575747966766, + "learning_rate": 1.366050339602227e-05, + "loss": 0.0693, + "num_input_tokens_seen": 26276416, + "step": 124510 + }, + { + "epoch": 13.698019801980198, + "grad_norm": 0.14184707403182983, + "learning_rate": 1.365836447039808e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26277472, + "step": 124515 + }, + { + "epoch": 13.6985698569857, + "grad_norm": 0.06120919808745384, + "learning_rate": 1.3656225649304314e-05, + "loss": 0.0073, + "num_input_tokens_seen": 26278560, + "step": 124520 + }, + { + "epoch": 13.699119911991199, + "grad_norm": 0.1074141412973404, + "learning_rate": 1.3654086932760663e-05, + "loss": 0.0589, + "num_input_tokens_seen": 26279648, + "step": 124525 + }, + { + "epoch": 13.6996699669967, + "grad_norm": 1.7754014730453491, + "learning_rate": 1.3651948320786856e-05, + "loss": 0.0356, + "num_input_tokens_seen": 26280800, + "step": 124530 + }, + { + "epoch": 13.7002200220022, + "grad_norm": 0.5918921232223511, + "learning_rate": 1.3649809813402597e-05, + "loss": 0.0169, + "num_input_tokens_seen": 26281856, + "step": 124535 + }, + { + "epoch": 13.7007700770077, + "grad_norm": 0.9925925731658936, + "learning_rate": 1.3647671410627582e-05, + "loss": 0.011, + "num_input_tokens_seen": 26282912, + "step": 124540 + }, + { + "epoch": 13.701320132013201, + "grad_norm": 0.011781503446400166, + "learning_rate": 1.3645533112481537e-05, + "loss": 0.0047, + "num_input_tokens_seen": 26284000, + "step": 124545 + }, + { + "epoch": 13.701870187018702, + "grad_norm": 0.2764006555080414, + "learning_rate": 1.3643394918984165e-05, + "loss": 0.0259, + "num_input_tokens_seen": 26285056, + "step": 124550 + }, + { + "epoch": 13.702420242024202, + "grad_norm": 0.14866745471954346, + "learning_rate": 1.364125683015518e-05, + "loss": 0.02, + "num_input_tokens_seen": 26286176, + "step": 124555 + }, + { + "epoch": 13.702970297029703, + "grad_norm": 0.046370092779397964, + "learning_rate": 1.3639118846014282e-05, + "loss": 0.1207, + "num_input_tokens_seen": 26287232, + "step": 124560 + }, + { + "epoch": 13.703520352035204, + "grad_norm": 0.05575694143772125, + "learning_rate": 1.3636980966581162e-05, + "loss": 0.087, + "num_input_tokens_seen": 26288256, + "step": 124565 + }, + { + "epoch": 13.704070407040705, + "grad_norm": 0.03401210531592369, + "learning_rate": 1.3634843191875546e-05, + "loss": 0.0609, + "num_input_tokens_seen": 26289312, + "step": 124570 + }, + { + "epoch": 13.704620462046204, + "grad_norm": 0.03851426765322685, + "learning_rate": 1.3632705521917107e-05, + "loss": 0.0355, + "num_input_tokens_seen": 26290400, + "step": 124575 + }, + { + "epoch": 13.705170517051705, + "grad_norm": 1.411158800125122, + "learning_rate": 1.3630567956725587e-05, + "loss": 0.0584, + "num_input_tokens_seen": 26291488, + "step": 124580 + }, + { + "epoch": 13.705720572057206, + "grad_norm": 0.05611962080001831, + "learning_rate": 1.3628430496320668e-05, + "loss": 0.0133, + "num_input_tokens_seen": 26292544, + "step": 124585 + }, + { + "epoch": 13.706270627062706, + "grad_norm": 0.026203559711575508, + "learning_rate": 1.3626293140722038e-05, + "loss": 0.0054, + "num_input_tokens_seen": 26293600, + "step": 124590 + }, + { + "epoch": 13.706820682068207, + "grad_norm": 0.017703356221318245, + "learning_rate": 1.3624155889949416e-05, + "loss": 0.0778, + "num_input_tokens_seen": 26294720, + "step": 124595 + }, + { + "epoch": 13.707370737073708, + "grad_norm": 0.34011319279670715, + "learning_rate": 1.362201874402248e-05, + "loss": 0.0077, + "num_input_tokens_seen": 26295776, + "step": 124600 + }, + { + "epoch": 13.707920792079207, + "grad_norm": 0.11382045596837997, + "learning_rate": 1.361988170296094e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26296864, + "step": 124605 + }, + { + "epoch": 13.708470847084708, + "grad_norm": 0.07503370195627213, + "learning_rate": 1.3617744766784496e-05, + "loss": 0.1982, + "num_input_tokens_seen": 26297984, + "step": 124610 + }, + { + "epoch": 13.70902090209021, + "grad_norm": 0.01021253690123558, + "learning_rate": 1.3615607935512828e-05, + "loss": 0.0019, + "num_input_tokens_seen": 26299136, + "step": 124615 + }, + { + "epoch": 13.70957095709571, + "grad_norm": 0.04256720840930939, + "learning_rate": 1.3613471209165652e-05, + "loss": 0.069, + "num_input_tokens_seen": 26300160, + "step": 124620 + }, + { + "epoch": 13.71012101210121, + "grad_norm": 1.2069627046585083, + "learning_rate": 1.3611334587762634e-05, + "loss": 0.0081, + "num_input_tokens_seen": 26301184, + "step": 124625 + }, + { + "epoch": 13.710671067106711, + "grad_norm": 3.1244874000549316, + "learning_rate": 1.3609198071323492e-05, + "loss": 0.2321, + "num_input_tokens_seen": 26302240, + "step": 124630 + }, + { + "epoch": 13.711221122112212, + "grad_norm": 0.011872563511133194, + "learning_rate": 1.3607061659867892e-05, + "loss": 0.0219, + "num_input_tokens_seen": 26303328, + "step": 124635 + }, + { + "epoch": 13.711771177117711, + "grad_norm": 0.11367300152778625, + "learning_rate": 1.3604925353415543e-05, + "loss": 0.0588, + "num_input_tokens_seen": 26304480, + "step": 124640 + }, + { + "epoch": 13.712321232123212, + "grad_norm": 0.0473504401743412, + "learning_rate": 1.3602789151986137e-05, + "loss": 0.04, + "num_input_tokens_seen": 26305504, + "step": 124645 + }, + { + "epoch": 13.712871287128714, + "grad_norm": 2.8586535453796387, + "learning_rate": 1.3600653055599344e-05, + "loss": 0.0304, + "num_input_tokens_seen": 26306560, + "step": 124650 + }, + { + "epoch": 13.713421342134213, + "grad_norm": 0.2709487974643707, + "learning_rate": 1.3598517064274874e-05, + "loss": 0.0064, + "num_input_tokens_seen": 26307616, + "step": 124655 + }, + { + "epoch": 13.713971397139714, + "grad_norm": 1.789408802986145, + "learning_rate": 1.3596381178032389e-05, + "loss": 0.0449, + "num_input_tokens_seen": 26308640, + "step": 124660 + }, + { + "epoch": 13.714521452145215, + "grad_norm": 0.19695554673671722, + "learning_rate": 1.3594245396891586e-05, + "loss": 0.0183, + "num_input_tokens_seen": 26309760, + "step": 124665 + }, + { + "epoch": 13.715071507150714, + "grad_norm": 0.014544009231030941, + "learning_rate": 1.3592109720872161e-05, + "loss": 0.0665, + "num_input_tokens_seen": 26310816, + "step": 124670 + }, + { + "epoch": 13.715621562156215, + "grad_norm": 0.09342791140079498, + "learning_rate": 1.3589974149993778e-05, + "loss": 0.0251, + "num_input_tokens_seen": 26311840, + "step": 124675 + }, + { + "epoch": 13.716171617161717, + "grad_norm": 0.46361783146858215, + "learning_rate": 1.3587838684276139e-05, + "loss": 0.0061, + "num_input_tokens_seen": 26312864, + "step": 124680 + }, + { + "epoch": 13.716721672167218, + "grad_norm": 0.14870454370975494, + "learning_rate": 1.3585703323738912e-05, + "loss": 0.0042, + "num_input_tokens_seen": 26313920, + "step": 124685 + }, + { + "epoch": 13.717271727172717, + "grad_norm": 0.007816693745553493, + "learning_rate": 1.3583568068401763e-05, + "loss": 0.0476, + "num_input_tokens_seen": 26315040, + "step": 124690 + }, + { + "epoch": 13.717821782178218, + "grad_norm": 0.017340037971735, + "learning_rate": 1.3581432918284406e-05, + "loss": 0.0795, + "num_input_tokens_seen": 26316128, + "step": 124695 + }, + { + "epoch": 13.718371837183719, + "grad_norm": 0.716049075126648, + "learning_rate": 1.3579297873406493e-05, + "loss": 0.0671, + "num_input_tokens_seen": 26317248, + "step": 124700 + }, + { + "epoch": 13.718921892189218, + "grad_norm": 0.0157043244689703, + "learning_rate": 1.3577162933787724e-05, + "loss": 0.0092, + "num_input_tokens_seen": 26318336, + "step": 124705 + }, + { + "epoch": 13.71947194719472, + "grad_norm": 0.02899997867643833, + "learning_rate": 1.3575028099447761e-05, + "loss": 0.0997, + "num_input_tokens_seen": 26319392, + "step": 124710 + }, + { + "epoch": 13.72002200220022, + "grad_norm": 1.0449732542037964, + "learning_rate": 1.3572893370406272e-05, + "loss": 0.0175, + "num_input_tokens_seen": 26320384, + "step": 124715 + }, + { + "epoch": 13.72057205720572, + "grad_norm": 0.002710137516260147, + "learning_rate": 1.3570758746682944e-05, + "loss": 0.0081, + "num_input_tokens_seen": 26321504, + "step": 124720 + }, + { + "epoch": 13.721122112211221, + "grad_norm": 0.02126350812613964, + "learning_rate": 1.3568624228297454e-05, + "loss": 0.0088, + "num_input_tokens_seen": 26322496, + "step": 124725 + }, + { + "epoch": 13.721672167216722, + "grad_norm": 0.36300915479660034, + "learning_rate": 1.3566489815269462e-05, + "loss": 0.0048, + "num_input_tokens_seen": 26323520, + "step": 124730 + }, + { + "epoch": 13.722222222222221, + "grad_norm": 0.05975545570254326, + "learning_rate": 1.3564355507618657e-05, + "loss": 0.0394, + "num_input_tokens_seen": 26324608, + "step": 124735 + }, + { + "epoch": 13.722772277227723, + "grad_norm": 0.010652807541191578, + "learning_rate": 1.356222130536469e-05, + "loss": 0.0103, + "num_input_tokens_seen": 26325696, + "step": 124740 + }, + { + "epoch": 13.723322332233224, + "grad_norm": 1.7434276342391968, + "learning_rate": 1.356008720852725e-05, + "loss": 0.0615, + "num_input_tokens_seen": 26326816, + "step": 124745 + }, + { + "epoch": 13.723872387238725, + "grad_norm": 0.10122687369585037, + "learning_rate": 1.3557953217125986e-05, + "loss": 0.0335, + "num_input_tokens_seen": 26327840, + "step": 124750 + }, + { + "epoch": 13.724422442244224, + "grad_norm": 1.2408910989761353, + "learning_rate": 1.3555819331180577e-05, + "loss": 0.0111, + "num_input_tokens_seen": 26328896, + "step": 124755 + }, + { + "epoch": 13.724972497249725, + "grad_norm": 0.03555027022957802, + "learning_rate": 1.3553685550710701e-05, + "loss": 0.0064, + "num_input_tokens_seen": 26329984, + "step": 124760 + }, + { + "epoch": 13.725522552255226, + "grad_norm": 0.026822958141565323, + "learning_rate": 1.3551551875736002e-05, + "loss": 0.0039, + "num_input_tokens_seen": 26331040, + "step": 124765 + }, + { + "epoch": 13.726072607260726, + "grad_norm": 0.44829073548316956, + "learning_rate": 1.3549418306276169e-05, + "loss": 0.0066, + "num_input_tokens_seen": 26332160, + "step": 124770 + }, + { + "epoch": 13.726622662266227, + "grad_norm": 0.08876293152570724, + "learning_rate": 1.3547284842350838e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26333248, + "step": 124775 + }, + { + "epoch": 13.727172717271728, + "grad_norm": 0.12137750536203384, + "learning_rate": 1.354515148397969e-05, + "loss": 0.0187, + "num_input_tokens_seen": 26334240, + "step": 124780 + }, + { + "epoch": 13.727722772277227, + "grad_norm": 0.01808951422572136, + "learning_rate": 1.3543018231182391e-05, + "loss": 0.003, + "num_input_tokens_seen": 26335328, + "step": 124785 + }, + { + "epoch": 13.728272827282728, + "grad_norm": 0.026347601786255836, + "learning_rate": 1.3540885083978589e-05, + "loss": 0.0719, + "num_input_tokens_seen": 26336384, + "step": 124790 + }, + { + "epoch": 13.72882288228823, + "grad_norm": 0.6046893000602722, + "learning_rate": 1.3538752042387958e-05, + "loss": 0.0131, + "num_input_tokens_seen": 26337408, + "step": 124795 + }, + { + "epoch": 13.729372937293729, + "grad_norm": 0.0070387981832027435, + "learning_rate": 1.3536619106430154e-05, + "loss": 0.0229, + "num_input_tokens_seen": 26338432, + "step": 124800 + }, + { + "epoch": 13.72992299229923, + "grad_norm": 0.02581237256526947, + "learning_rate": 1.3534486276124817e-05, + "loss": 0.0872, + "num_input_tokens_seen": 26339456, + "step": 124805 + }, + { + "epoch": 13.73047304730473, + "grad_norm": 0.4228469729423523, + "learning_rate": 1.353235355149162e-05, + "loss": 0.0556, + "num_input_tokens_seen": 26340448, + "step": 124810 + }, + { + "epoch": 13.731023102310232, + "grad_norm": 0.009952560067176819, + "learning_rate": 1.3530220932550216e-05, + "loss": 0.0052, + "num_input_tokens_seen": 26341536, + "step": 124815 + }, + { + "epoch": 13.731573157315731, + "grad_norm": 0.012462250888347626, + "learning_rate": 1.3528088419320273e-05, + "loss": 0.009, + "num_input_tokens_seen": 26342528, + "step": 124820 + }, + { + "epoch": 13.732123212321232, + "grad_norm": 0.03151794523000717, + "learning_rate": 1.3525956011821434e-05, + "loss": 0.0417, + "num_input_tokens_seen": 26343616, + "step": 124825 + }, + { + "epoch": 13.732673267326733, + "grad_norm": 0.6193128824234009, + "learning_rate": 1.3523823710073344e-05, + "loss": 0.008, + "num_input_tokens_seen": 26344640, + "step": 124830 + }, + { + "epoch": 13.733223322332233, + "grad_norm": 0.025650333613157272, + "learning_rate": 1.3521691514095663e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26345696, + "step": 124835 + }, + { + "epoch": 13.733773377337734, + "grad_norm": 0.04729636386036873, + "learning_rate": 1.3519559423908043e-05, + "loss": 0.0294, + "num_input_tokens_seen": 26346720, + "step": 124840 + }, + { + "epoch": 13.734323432343235, + "grad_norm": 0.018962906673550606, + "learning_rate": 1.3517427439530144e-05, + "loss": 0.0053, + "num_input_tokens_seen": 26347840, + "step": 124845 + }, + { + "epoch": 13.734873487348734, + "grad_norm": 0.032808542251586914, + "learning_rate": 1.3515295560981609e-05, + "loss": 0.0356, + "num_input_tokens_seen": 26348896, + "step": 124850 + }, + { + "epoch": 13.735423542354235, + "grad_norm": 0.0357372984290123, + "learning_rate": 1.3513163788282075e-05, + "loss": 0.0095, + "num_input_tokens_seen": 26349952, + "step": 124855 + }, + { + "epoch": 13.735973597359736, + "grad_norm": 0.04203999787569046, + "learning_rate": 1.3511032121451203e-05, + "loss": 0.0706, + "num_input_tokens_seen": 26351008, + "step": 124860 + }, + { + "epoch": 13.736523652365236, + "grad_norm": 0.03249501436948776, + "learning_rate": 1.350890056050863e-05, + "loss": 0.0111, + "num_input_tokens_seen": 26352064, + "step": 124865 + }, + { + "epoch": 13.737073707370737, + "grad_norm": 0.05173994600772858, + "learning_rate": 1.3506769105474004e-05, + "loss": 0.0061, + "num_input_tokens_seen": 26353120, + "step": 124870 + }, + { + "epoch": 13.737623762376238, + "grad_norm": 0.02765573374927044, + "learning_rate": 1.3504637756366983e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26354112, + "step": 124875 + }, + { + "epoch": 13.738173817381739, + "grad_norm": 0.9454023241996765, + "learning_rate": 1.350250651320719e-05, + "loss": 0.0075, + "num_input_tokens_seen": 26355168, + "step": 124880 + }, + { + "epoch": 13.738723872387238, + "grad_norm": 0.04089931771159172, + "learning_rate": 1.3500375376014285e-05, + "loss": 0.0507, + "num_input_tokens_seen": 26356160, + "step": 124885 + }, + { + "epoch": 13.73927392739274, + "grad_norm": 2.6278181076049805, + "learning_rate": 1.3498244344807895e-05, + "loss": 0.0217, + "num_input_tokens_seen": 26357248, + "step": 124890 + }, + { + "epoch": 13.73982398239824, + "grad_norm": 0.7436321973800659, + "learning_rate": 1.3496113419607667e-05, + "loss": 0.0128, + "num_input_tokens_seen": 26358304, + "step": 124895 + }, + { + "epoch": 13.74037403740374, + "grad_norm": 0.05279914662241936, + "learning_rate": 1.3493982600433253e-05, + "loss": 0.0265, + "num_input_tokens_seen": 26359360, + "step": 124900 + }, + { + "epoch": 13.74092409240924, + "grad_norm": 0.0512375645339489, + "learning_rate": 1.349185188730427e-05, + "loss": 0.0049, + "num_input_tokens_seen": 26360480, + "step": 124905 + }, + { + "epoch": 13.741474147414742, + "grad_norm": 0.02140449732542038, + "learning_rate": 1.3489721280240373e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26361568, + "step": 124910 + }, + { + "epoch": 13.742024202420241, + "grad_norm": 0.018581438809633255, + "learning_rate": 1.3487590779261194e-05, + "loss": 0.1408, + "num_input_tokens_seen": 26362656, + "step": 124915 + }, + { + "epoch": 13.742574257425742, + "grad_norm": 0.02804349549114704, + "learning_rate": 1.3485460384386356e-05, + "loss": 0.0021, + "num_input_tokens_seen": 26363776, + "step": 124920 + }, + { + "epoch": 13.743124312431243, + "grad_norm": 0.010955586098134518, + "learning_rate": 1.3483330095635504e-05, + "loss": 0.0103, + "num_input_tokens_seen": 26364800, + "step": 124925 + }, + { + "epoch": 13.743674367436743, + "grad_norm": 0.02486351877450943, + "learning_rate": 1.3481199913028272e-05, + "loss": 0.1152, + "num_input_tokens_seen": 26365888, + "step": 124930 + }, + { + "epoch": 13.744224422442244, + "grad_norm": 0.017231883481144905, + "learning_rate": 1.3479069836584304e-05, + "loss": 0.0126, + "num_input_tokens_seen": 26366976, + "step": 124935 + }, + { + "epoch": 13.744774477447745, + "grad_norm": 0.01949354261159897, + "learning_rate": 1.3476939866323221e-05, + "loss": 0.0018, + "num_input_tokens_seen": 26368000, + "step": 124940 + }, + { + "epoch": 13.745324532453246, + "grad_norm": 0.02095664106309414, + "learning_rate": 1.3474810002264643e-05, + "loss": 0.0044, + "num_input_tokens_seen": 26369024, + "step": 124945 + }, + { + "epoch": 13.745874587458745, + "grad_norm": 0.012641801498830318, + "learning_rate": 1.3472680244428215e-05, + "loss": 0.0111, + "num_input_tokens_seen": 26370080, + "step": 124950 + }, + { + "epoch": 13.746424642464246, + "grad_norm": 0.5117286443710327, + "learning_rate": 1.3470550592833558e-05, + "loss": 0.0112, + "num_input_tokens_seen": 26371168, + "step": 124955 + }, + { + "epoch": 13.746974697469748, + "grad_norm": 0.02418838068842888, + "learning_rate": 1.3468421047500312e-05, + "loss": 0.1287, + "num_input_tokens_seen": 26372160, + "step": 124960 + }, + { + "epoch": 13.747524752475247, + "grad_norm": 0.02820153720676899, + "learning_rate": 1.34662916084481e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26373216, + "step": 124965 + }, + { + "epoch": 13.748074807480748, + "grad_norm": 0.8256971836090088, + "learning_rate": 1.3464162275696532e-05, + "loss": 0.0125, + "num_input_tokens_seen": 26374272, + "step": 124970 + }, + { + "epoch": 13.748624862486249, + "grad_norm": 0.1594310998916626, + "learning_rate": 1.3462033049265255e-05, + "loss": 0.0104, + "num_input_tokens_seen": 26375296, + "step": 124975 + }, + { + "epoch": 13.749174917491748, + "grad_norm": 0.04143703356385231, + "learning_rate": 1.3459903929173872e-05, + "loss": 0.0023, + "num_input_tokens_seen": 26376320, + "step": 124980 + }, + { + "epoch": 13.74972497249725, + "grad_norm": 0.013367638923227787, + "learning_rate": 1.3457774915442017e-05, + "loss": 0.0037, + "num_input_tokens_seen": 26377408, + "step": 124985 + }, + { + "epoch": 13.75027502750275, + "grad_norm": 0.06027277559041977, + "learning_rate": 1.3455646008089325e-05, + "loss": 0.0063, + "num_input_tokens_seen": 26378464, + "step": 124990 + }, + { + "epoch": 13.750825082508252, + "grad_norm": 0.052463728934526443, + "learning_rate": 1.3453517207135391e-05, + "loss": 0.0163, + "num_input_tokens_seen": 26379584, + "step": 124995 + }, + { + "epoch": 13.751375137513751, + "grad_norm": 1.0938037633895874, + "learning_rate": 1.345138851259986e-05, + "loss": 0.0891, + "num_input_tokens_seen": 26380640, + "step": 125000 + }, + { + "epoch": 13.751925192519252, + "grad_norm": 0.04392387345433235, + "learning_rate": 1.3449259924502331e-05, + "loss": 0.0014, + "num_input_tokens_seen": 26381664, + "step": 125005 + }, + { + "epoch": 13.752475247524753, + "grad_norm": 0.4072237014770508, + "learning_rate": 1.3447131442862429e-05, + "loss": 0.064, + "num_input_tokens_seen": 26382720, + "step": 125010 + }, + { + "epoch": 13.753025302530252, + "grad_norm": 0.18695279955863953, + "learning_rate": 1.3445003067699782e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26383872, + "step": 125015 + }, + { + "epoch": 13.753575357535754, + "grad_norm": 0.02252059057354927, + "learning_rate": 1.3442874799033989e-05, + "loss": 0.0055, + "num_input_tokens_seen": 26384928, + "step": 125020 + }, + { + "epoch": 13.754125412541255, + "grad_norm": 2.2174060344696045, + "learning_rate": 1.3440746636884682e-05, + "loss": 0.0523, + "num_input_tokens_seen": 26385952, + "step": 125025 + }, + { + "epoch": 13.754675467546754, + "grad_norm": 0.04547013342380524, + "learning_rate": 1.3438618581271455e-05, + "loss": 0.0024, + "num_input_tokens_seen": 26386944, + "step": 125030 + }, + { + "epoch": 13.755225522552255, + "grad_norm": 0.08191774040460587, + "learning_rate": 1.3436490632213943e-05, + "loss": 0.0083, + "num_input_tokens_seen": 26388064, + "step": 125035 + }, + { + "epoch": 13.755775577557756, + "grad_norm": 0.2597009837627411, + "learning_rate": 1.3434362789731742e-05, + "loss": 0.0051, + "num_input_tokens_seen": 26389120, + "step": 125040 + }, + { + "epoch": 13.756325632563257, + "grad_norm": 0.04146203398704529, + "learning_rate": 1.3432235053844467e-05, + "loss": 0.0038, + "num_input_tokens_seen": 26390208, + "step": 125045 + }, + { + "epoch": 13.756875687568757, + "grad_norm": 0.01267981342971325, + "learning_rate": 1.3430107424571742e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26391264, + "step": 125050 + }, + { + "epoch": 13.757425742574258, + "grad_norm": 0.8504992723464966, + "learning_rate": 1.3427979901933152e-05, + "loss": 0.0487, + "num_input_tokens_seen": 26392352, + "step": 125055 + }, + { + "epoch": 13.757975797579759, + "grad_norm": 4.5208258628845215, + "learning_rate": 1.3425852485948331e-05, + "loss": 0.0386, + "num_input_tokens_seen": 26393376, + "step": 125060 + }, + { + "epoch": 13.758525852585258, + "grad_norm": 0.02212553657591343, + "learning_rate": 1.3423725176636864e-05, + "loss": 0.008, + "num_input_tokens_seen": 26394400, + "step": 125065 + }, + { + "epoch": 13.75907590759076, + "grad_norm": 0.04086609184741974, + "learning_rate": 1.3421597974018369e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26395456, + "step": 125070 + }, + { + "epoch": 13.75962596259626, + "grad_norm": 0.02905282936990261, + "learning_rate": 1.3419470878112455e-05, + "loss": 0.0161, + "num_input_tokens_seen": 26396480, + "step": 125075 + }, + { + "epoch": 13.76017601760176, + "grad_norm": 0.009110456332564354, + "learning_rate": 1.3417343888938713e-05, + "loss": 0.1057, + "num_input_tokens_seen": 26397472, + "step": 125080 + }, + { + "epoch": 13.76072607260726, + "grad_norm": 0.08556883782148361, + "learning_rate": 1.3415217006516765e-05, + "loss": 0.0018, + "num_input_tokens_seen": 26398528, + "step": 125085 + }, + { + "epoch": 13.761276127612762, + "grad_norm": 0.10158143192529678, + "learning_rate": 1.3413090230866204e-05, + "loss": 0.0403, + "num_input_tokens_seen": 26399648, + "step": 125090 + }, + { + "epoch": 13.761826182618261, + "grad_norm": 0.11744610965251923, + "learning_rate": 1.3410963562006618e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26400704, + "step": 125095 + }, + { + "epoch": 13.762376237623762, + "grad_norm": 0.7532512545585632, + "learning_rate": 1.3408836999957619e-05, + "loss": 0.0113, + "num_input_tokens_seen": 26401728, + "step": 125100 + }, + { + "epoch": 13.762926292629263, + "grad_norm": 1.1502647399902344, + "learning_rate": 1.340671054473882e-05, + "loss": 0.0647, + "num_input_tokens_seen": 26402816, + "step": 125105 + }, + { + "epoch": 13.763476347634764, + "grad_norm": 0.0943739041686058, + "learning_rate": 1.3404584196369796e-05, + "loss": 0.015, + "num_input_tokens_seen": 26403936, + "step": 125110 + }, + { + "epoch": 13.764026402640264, + "grad_norm": 0.0628739446401596, + "learning_rate": 1.3402457954870169e-05, + "loss": 0.0793, + "num_input_tokens_seen": 26404928, + "step": 125115 + }, + { + "epoch": 13.764576457645765, + "grad_norm": 0.062204595655202866, + "learning_rate": 1.3400331820259504e-05, + "loss": 0.0068, + "num_input_tokens_seen": 26405952, + "step": 125120 + }, + { + "epoch": 13.765126512651266, + "grad_norm": 0.15977565944194794, + "learning_rate": 1.3398205792557423e-05, + "loss": 0.0284, + "num_input_tokens_seen": 26406976, + "step": 125125 + }, + { + "epoch": 13.765676567656765, + "grad_norm": 4.569759368896484, + "learning_rate": 1.3396079871783517e-05, + "loss": 0.0227, + "num_input_tokens_seen": 26408000, + "step": 125130 + }, + { + "epoch": 13.766226622662266, + "grad_norm": 0.04788666218519211, + "learning_rate": 1.3393954057957364e-05, + "loss": 0.0022, + "num_input_tokens_seen": 26409088, + "step": 125135 + }, + { + "epoch": 13.766776677667767, + "grad_norm": 0.1371721476316452, + "learning_rate": 1.3391828351098578e-05, + "loss": 0.1293, + "num_input_tokens_seen": 26410112, + "step": 125140 + }, + { + "epoch": 13.767326732673267, + "grad_norm": 0.4421272873878479, + "learning_rate": 1.3389702751226735e-05, + "loss": 0.0084, + "num_input_tokens_seen": 26411168, + "step": 125145 + }, + { + "epoch": 13.767876787678768, + "grad_norm": 0.025299666449427605, + "learning_rate": 1.3387577258361436e-05, + "loss": 0.003, + "num_input_tokens_seen": 26412160, + "step": 125150 + }, + { + "epoch": 13.768426842684269, + "grad_norm": 0.05293379724025726, + "learning_rate": 1.3385451872522256e-05, + "loss": 0.0076, + "num_input_tokens_seen": 26413184, + "step": 125155 + }, + { + "epoch": 13.768976897689768, + "grad_norm": 0.34405869245529175, + "learning_rate": 1.3383326593728794e-05, + "loss": 0.0481, + "num_input_tokens_seen": 26414176, + "step": 125160 + }, + { + "epoch": 13.76952695269527, + "grad_norm": 0.2561715841293335, + "learning_rate": 1.3381201422000645e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26415232, + "step": 125165 + }, + { + "epoch": 13.77007700770077, + "grad_norm": 0.009291760623455048, + "learning_rate": 1.337907635735738e-05, + "loss": 0.0284, + "num_input_tokens_seen": 26416320, + "step": 125170 + }, + { + "epoch": 13.770627062706271, + "grad_norm": 0.30989107489585876, + "learning_rate": 1.3376951399818596e-05, + "loss": 0.0865, + "num_input_tokens_seen": 26417440, + "step": 125175 + }, + { + "epoch": 13.77117711771177, + "grad_norm": 0.05876469239592552, + "learning_rate": 1.3374826549403868e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26418560, + "step": 125180 + }, + { + "epoch": 13.771727172717272, + "grad_norm": 1.064638376235962, + "learning_rate": 1.3372701806132787e-05, + "loss": 0.0156, + "num_input_tokens_seen": 26419616, + "step": 125185 + }, + { + "epoch": 13.772277227722773, + "grad_norm": 0.0028316115494817495, + "learning_rate": 1.337057717002494e-05, + "loss": 0.0038, + "num_input_tokens_seen": 26420704, + "step": 125190 + }, + { + "epoch": 13.772827282728272, + "grad_norm": 0.5934106707572937, + "learning_rate": 1.3368452641099893e-05, + "loss": 0.1314, + "num_input_tokens_seen": 26421696, + "step": 125195 + }, + { + "epoch": 13.773377337733773, + "grad_norm": 0.3431590795516968, + "learning_rate": 1.336632821937725e-05, + "loss": 0.0049, + "num_input_tokens_seen": 26422752, + "step": 125200 + }, + { + "epoch": 13.773927392739274, + "grad_norm": 4.71876859664917, + "learning_rate": 1.3364203904876576e-05, + "loss": 0.0904, + "num_input_tokens_seen": 26423776, + "step": 125205 + }, + { + "epoch": 13.774477447744774, + "grad_norm": 0.17888300120830536, + "learning_rate": 1.3362079697617441e-05, + "loss": 0.0108, + "num_input_tokens_seen": 26424928, + "step": 125210 + }, + { + "epoch": 13.775027502750275, + "grad_norm": 0.05447636544704437, + "learning_rate": 1.3359955597619433e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26426016, + "step": 125215 + }, + { + "epoch": 13.775577557755776, + "grad_norm": 0.01621362753212452, + "learning_rate": 1.335783160490213e-05, + "loss": 0.0161, + "num_input_tokens_seen": 26427008, + "step": 125220 + }, + { + "epoch": 13.776127612761275, + "grad_norm": 0.016836905851960182, + "learning_rate": 1.3355707719485117e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26428064, + "step": 125225 + }, + { + "epoch": 13.776677667766776, + "grad_norm": 1.6388541460037231, + "learning_rate": 1.3353583941387954e-05, + "loss": 0.0143, + "num_input_tokens_seen": 26429056, + "step": 125230 + }, + { + "epoch": 13.777227722772277, + "grad_norm": 0.006164941005408764, + "learning_rate": 1.3351460270630211e-05, + "loss": 0.0038, + "num_input_tokens_seen": 26430176, + "step": 125235 + }, + { + "epoch": 13.777777777777779, + "grad_norm": 0.02765064314007759, + "learning_rate": 1.3349336707231475e-05, + "loss": 0.0282, + "num_input_tokens_seen": 26431200, + "step": 125240 + }, + { + "epoch": 13.778327832783278, + "grad_norm": 0.20376907289028168, + "learning_rate": 1.3347213251211304e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26432256, + "step": 125245 + }, + { + "epoch": 13.778877887788779, + "grad_norm": 0.006745324935764074, + "learning_rate": 1.3345089902589291e-05, + "loss": 0.0169, + "num_input_tokens_seen": 26433216, + "step": 125250 + }, + { + "epoch": 13.77942794279428, + "grad_norm": 0.048086971044540405, + "learning_rate": 1.3342966661384992e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26434304, + "step": 125255 + }, + { + "epoch": 13.77997799779978, + "grad_norm": 0.12320706248283386, + "learning_rate": 1.3340843527617963e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26435328, + "step": 125260 + }, + { + "epoch": 13.78052805280528, + "grad_norm": 0.48854565620422363, + "learning_rate": 1.33387205013078e-05, + "loss": 0.0404, + "num_input_tokens_seen": 26436448, + "step": 125265 + }, + { + "epoch": 13.781078107810782, + "grad_norm": 0.018055777996778488, + "learning_rate": 1.333659758247404e-05, + "loss": 0.0076, + "num_input_tokens_seen": 26437504, + "step": 125270 + }, + { + "epoch": 13.781628162816281, + "grad_norm": 0.8430953025817871, + "learning_rate": 1.3334474771136262e-05, + "loss": 0.151, + "num_input_tokens_seen": 26438560, + "step": 125275 + }, + { + "epoch": 13.782178217821782, + "grad_norm": 0.10522790253162384, + "learning_rate": 1.3332352067314047e-05, + "loss": 0.0594, + "num_input_tokens_seen": 26439520, + "step": 125280 + }, + { + "epoch": 13.782728272827283, + "grad_norm": 0.014151577837765217, + "learning_rate": 1.3330229471026934e-05, + "loss": 0.0311, + "num_input_tokens_seen": 26440544, + "step": 125285 + }, + { + "epoch": 13.783278327832782, + "grad_norm": 1.7372666597366333, + "learning_rate": 1.3328106982294502e-05, + "loss": 0.1349, + "num_input_tokens_seen": 26441632, + "step": 125290 + }, + { + "epoch": 13.783828382838283, + "grad_norm": 0.08256062120199203, + "learning_rate": 1.332598460113631e-05, + "loss": 0.0019, + "num_input_tokens_seen": 26442624, + "step": 125295 + }, + { + "epoch": 13.784378437843785, + "grad_norm": 0.027568859979510307, + "learning_rate": 1.3323862327571895e-05, + "loss": 0.0034, + "num_input_tokens_seen": 26443712, + "step": 125300 + }, + { + "epoch": 13.784928492849286, + "grad_norm": 0.06901941448450089, + "learning_rate": 1.332174016162086e-05, + "loss": 0.0113, + "num_input_tokens_seen": 26444768, + "step": 125305 + }, + { + "epoch": 13.785478547854785, + "grad_norm": 0.027202000841498375, + "learning_rate": 1.3319618103302733e-05, + "loss": 0.0716, + "num_input_tokens_seen": 26445792, + "step": 125310 + }, + { + "epoch": 13.786028602860286, + "grad_norm": 0.005679582245647907, + "learning_rate": 1.3317496152637089e-05, + "loss": 0.001, + "num_input_tokens_seen": 26446848, + "step": 125315 + }, + { + "epoch": 13.786578657865787, + "grad_norm": 0.007137938868254423, + "learning_rate": 1.3315374309643475e-05, + "loss": 0.009, + "num_input_tokens_seen": 26447872, + "step": 125320 + }, + { + "epoch": 13.787128712871286, + "grad_norm": 0.002461732132360339, + "learning_rate": 1.3313252574341444e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26448960, + "step": 125325 + }, + { + "epoch": 13.787678767876788, + "grad_norm": 2.4362306594848633, + "learning_rate": 1.3311130946750556e-05, + "loss": 0.0656, + "num_input_tokens_seen": 26450016, + "step": 125330 + }, + { + "epoch": 13.788228822882289, + "grad_norm": 0.0497119277715683, + "learning_rate": 1.3309009426890362e-05, + "loss": 0.026, + "num_input_tokens_seen": 26451040, + "step": 125335 + }, + { + "epoch": 13.788778877887788, + "grad_norm": 0.06831298023462296, + "learning_rate": 1.3306888014780427e-05, + "loss": 0.0257, + "num_input_tokens_seen": 26452032, + "step": 125340 + }, + { + "epoch": 13.789328932893289, + "grad_norm": 0.02458752691745758, + "learning_rate": 1.3304766710440298e-05, + "loss": 0.0384, + "num_input_tokens_seen": 26453056, + "step": 125345 + }, + { + "epoch": 13.78987898789879, + "grad_norm": 0.18342497944831848, + "learning_rate": 1.330264551388951e-05, + "loss": 0.0078, + "num_input_tokens_seen": 26454112, + "step": 125350 + }, + { + "epoch": 13.79042904290429, + "grad_norm": 0.01596185564994812, + "learning_rate": 1.3300524425147637e-05, + "loss": 0.0015, + "num_input_tokens_seen": 26455168, + "step": 125355 + }, + { + "epoch": 13.79097909790979, + "grad_norm": 0.5429741144180298, + "learning_rate": 1.3298403444234194e-05, + "loss": 0.0082, + "num_input_tokens_seen": 26456192, + "step": 125360 + }, + { + "epoch": 13.791529152915292, + "grad_norm": 0.04869958758354187, + "learning_rate": 1.3296282571168773e-05, + "loss": 0.0143, + "num_input_tokens_seen": 26457280, + "step": 125365 + }, + { + "epoch": 13.792079207920793, + "grad_norm": 0.8304528594017029, + "learning_rate": 1.3294161805970901e-05, + "loss": 0.0368, + "num_input_tokens_seen": 26458272, + "step": 125370 + }, + { + "epoch": 13.792629262926292, + "grad_norm": 0.42049938440322876, + "learning_rate": 1.3292041148660111e-05, + "loss": 0.0661, + "num_input_tokens_seen": 26459424, + "step": 125375 + }, + { + "epoch": 13.793179317931793, + "grad_norm": 0.006582244765013456, + "learning_rate": 1.328992059925597e-05, + "loss": 0.0309, + "num_input_tokens_seen": 26460512, + "step": 125380 + }, + { + "epoch": 13.793729372937294, + "grad_norm": 0.00592983840033412, + "learning_rate": 1.3287800157778003e-05, + "loss": 0.001, + "num_input_tokens_seen": 26461600, + "step": 125385 + }, + { + "epoch": 13.794279427942794, + "grad_norm": 0.06439590454101562, + "learning_rate": 1.3285679824245762e-05, + "loss": 0.0097, + "num_input_tokens_seen": 26462688, + "step": 125390 + }, + { + "epoch": 13.794829482948295, + "grad_norm": 0.5872997641563416, + "learning_rate": 1.3283559598678796e-05, + "loss": 0.0048, + "num_input_tokens_seen": 26463744, + "step": 125395 + }, + { + "epoch": 13.795379537953796, + "grad_norm": 0.5021070241928101, + "learning_rate": 1.328143948109663e-05, + "loss": 0.0075, + "num_input_tokens_seen": 26464832, + "step": 125400 + }, + { + "epoch": 13.795929592959295, + "grad_norm": 0.020147336646914482, + "learning_rate": 1.3279319471518826e-05, + "loss": 0.0131, + "num_input_tokens_seen": 26465888, + "step": 125405 + }, + { + "epoch": 13.796479647964796, + "grad_norm": 0.4666259288787842, + "learning_rate": 1.3277199569964895e-05, + "loss": 0.0041, + "num_input_tokens_seen": 26467008, + "step": 125410 + }, + { + "epoch": 13.797029702970297, + "grad_norm": 0.004045378416776657, + "learning_rate": 1.3275079776454405e-05, + "loss": 0.068, + "num_input_tokens_seen": 26468096, + "step": 125415 + }, + { + "epoch": 13.797579757975798, + "grad_norm": 0.4746675193309784, + "learning_rate": 1.3272960091006867e-05, + "loss": 0.0168, + "num_input_tokens_seen": 26469120, + "step": 125420 + }, + { + "epoch": 13.798129812981298, + "grad_norm": 0.14381980895996094, + "learning_rate": 1.327084051364183e-05, + "loss": 0.0251, + "num_input_tokens_seen": 26470144, + "step": 125425 + }, + { + "epoch": 13.798679867986799, + "grad_norm": 0.15261544287204742, + "learning_rate": 1.3268721044378834e-05, + "loss": 0.0037, + "num_input_tokens_seen": 26471232, + "step": 125430 + }, + { + "epoch": 13.7992299229923, + "grad_norm": 1.315790057182312, + "learning_rate": 1.3266601683237401e-05, + "loss": 0.0575, + "num_input_tokens_seen": 26472320, + "step": 125435 + }, + { + "epoch": 13.7997799779978, + "grad_norm": 1.5029118061065674, + "learning_rate": 1.326448243023708e-05, + "loss": 0.1637, + "num_input_tokens_seen": 26473344, + "step": 125440 + }, + { + "epoch": 13.8003300330033, + "grad_norm": 0.0512503981590271, + "learning_rate": 1.3262363285397381e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26474432, + "step": 125445 + }, + { + "epoch": 13.800880088008801, + "grad_norm": 0.019326087087392807, + "learning_rate": 1.3260244248737846e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26475488, + "step": 125450 + }, + { + "epoch": 13.8014301430143, + "grad_norm": 0.01095760427415371, + "learning_rate": 1.325812532027802e-05, + "loss": 0.1331, + "num_input_tokens_seen": 26476544, + "step": 125455 + }, + { + "epoch": 13.801980198019802, + "grad_norm": 0.01576010324060917, + "learning_rate": 1.3256006500037408e-05, + "loss": 0.0829, + "num_input_tokens_seen": 26477600, + "step": 125460 + }, + { + "epoch": 13.802530253025303, + "grad_norm": 0.0049547781236469746, + "learning_rate": 1.3253887788035557e-05, + "loss": 0.0894, + "num_input_tokens_seen": 26478624, + "step": 125465 + }, + { + "epoch": 13.803080308030804, + "grad_norm": 0.6899165511131287, + "learning_rate": 1.3251769184291987e-05, + "loss": 0.0465, + "num_input_tokens_seen": 26479584, + "step": 125470 + }, + { + "epoch": 13.803630363036303, + "grad_norm": 0.02602735348045826, + "learning_rate": 1.3249650688826205e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26480608, + "step": 125475 + }, + { + "epoch": 13.804180418041804, + "grad_norm": 0.025082625448703766, + "learning_rate": 1.3247532301657773e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26481728, + "step": 125480 + }, + { + "epoch": 13.804730473047305, + "grad_norm": 1.5533992052078247, + "learning_rate": 1.3245414022806193e-05, + "loss": 0.17, + "num_input_tokens_seen": 26482816, + "step": 125485 + }, + { + "epoch": 13.805280528052805, + "grad_norm": 0.04374458268284798, + "learning_rate": 1.3243295852290984e-05, + "loss": 0.1243, + "num_input_tokens_seen": 26483840, + "step": 125490 + }, + { + "epoch": 13.805830583058306, + "grad_norm": 0.471079558134079, + "learning_rate": 1.3241177790131686e-05, + "loss": 0.0072, + "num_input_tokens_seen": 26484864, + "step": 125495 + }, + { + "epoch": 13.806380638063807, + "grad_norm": 0.009133524261415005, + "learning_rate": 1.32390598363478e-05, + "loss": 0.067, + "num_input_tokens_seen": 26485920, + "step": 125500 + }, + { + "epoch": 13.806930693069306, + "grad_norm": 2.685215711593628, + "learning_rate": 1.3236941990958857e-05, + "loss": 0.0334, + "num_input_tokens_seen": 26487040, + "step": 125505 + }, + { + "epoch": 13.807480748074807, + "grad_norm": 0.02053835429251194, + "learning_rate": 1.3234824253984384e-05, + "loss": 0.0026, + "num_input_tokens_seen": 26488128, + "step": 125510 + }, + { + "epoch": 13.808030803080309, + "grad_norm": 0.007081194315105677, + "learning_rate": 1.3232706625443881e-05, + "loss": 0.0588, + "num_input_tokens_seen": 26489152, + "step": 125515 + }, + { + "epoch": 13.808580858085808, + "grad_norm": 0.021591879427433014, + "learning_rate": 1.3230589105356884e-05, + "loss": 0.0041, + "num_input_tokens_seen": 26490240, + "step": 125520 + }, + { + "epoch": 13.809130913091309, + "grad_norm": 0.03168105334043503, + "learning_rate": 1.3228471693742889e-05, + "loss": 0.0037, + "num_input_tokens_seen": 26491360, + "step": 125525 + }, + { + "epoch": 13.80968096809681, + "grad_norm": 0.015995316207408905, + "learning_rate": 1.3226354390621431e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26492384, + "step": 125530 + }, + { + "epoch": 13.810231023102311, + "grad_norm": 0.056885018944740295, + "learning_rate": 1.3224237196012007e-05, + "loss": 0.1223, + "num_input_tokens_seen": 26493376, + "step": 125535 + }, + { + "epoch": 13.81078107810781, + "grad_norm": 4.481255531311035, + "learning_rate": 1.322212010993414e-05, + "loss": 0.0499, + "num_input_tokens_seen": 26494400, + "step": 125540 + }, + { + "epoch": 13.811331133113312, + "grad_norm": 0.062199629843235016, + "learning_rate": 1.3220003132407347e-05, + "loss": 0.0429, + "num_input_tokens_seen": 26495424, + "step": 125545 + }, + { + "epoch": 13.811881188118813, + "grad_norm": 0.011171258985996246, + "learning_rate": 1.3217886263451124e-05, + "loss": 0.0283, + "num_input_tokens_seen": 26496512, + "step": 125550 + }, + { + "epoch": 13.812431243124312, + "grad_norm": 0.21178123354911804, + "learning_rate": 1.3215769503084995e-05, + "loss": 0.0069, + "num_input_tokens_seen": 26497536, + "step": 125555 + }, + { + "epoch": 13.812981298129813, + "grad_norm": 0.021466800943017006, + "learning_rate": 1.3213652851328462e-05, + "loss": 0.01, + "num_input_tokens_seen": 26498560, + "step": 125560 + }, + { + "epoch": 13.813531353135314, + "grad_norm": 0.16943752765655518, + "learning_rate": 1.3211536308201031e-05, + "loss": 0.0121, + "num_input_tokens_seen": 26499584, + "step": 125565 + }, + { + "epoch": 13.814081408140813, + "grad_norm": 0.014970064163208008, + "learning_rate": 1.3209419873722223e-05, + "loss": 0.0622, + "num_input_tokens_seen": 26500640, + "step": 125570 + }, + { + "epoch": 13.814631463146315, + "grad_norm": 0.06978929787874222, + "learning_rate": 1.3207303547911525e-05, + "loss": 0.0374, + "num_input_tokens_seen": 26501696, + "step": 125575 + }, + { + "epoch": 13.815181518151816, + "grad_norm": 0.00930242519825697, + "learning_rate": 1.320518733078846e-05, + "loss": 0.0822, + "num_input_tokens_seen": 26502720, + "step": 125580 + }, + { + "epoch": 13.815731573157315, + "grad_norm": 0.054376352578401566, + "learning_rate": 1.3203071222372527e-05, + "loss": 0.0318, + "num_input_tokens_seen": 26503744, + "step": 125585 + }, + { + "epoch": 13.816281628162816, + "grad_norm": 0.020780760794878006, + "learning_rate": 1.3200955222683215e-05, + "loss": 0.1033, + "num_input_tokens_seen": 26504800, + "step": 125590 + }, + { + "epoch": 13.816831683168317, + "grad_norm": 0.4167429208755493, + "learning_rate": 1.3198839331740035e-05, + "loss": 0.0051, + "num_input_tokens_seen": 26505792, + "step": 125595 + }, + { + "epoch": 13.817381738173818, + "grad_norm": 0.06091141700744629, + "learning_rate": 1.3196723549562489e-05, + "loss": 0.106, + "num_input_tokens_seen": 26506784, + "step": 125600 + }, + { + "epoch": 13.817931793179318, + "grad_norm": 0.05559857562184334, + "learning_rate": 1.319460787617009e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26507808, + "step": 125605 + }, + { + "epoch": 13.818481848184819, + "grad_norm": 0.017792800441384315, + "learning_rate": 1.3192492311582328e-05, + "loss": 0.0067, + "num_input_tokens_seen": 26508864, + "step": 125610 + }, + { + "epoch": 13.81903190319032, + "grad_norm": 0.11587285995483398, + "learning_rate": 1.3190376855818684e-05, + "loss": 0.0989, + "num_input_tokens_seen": 26509952, + "step": 125615 + }, + { + "epoch": 13.819581958195819, + "grad_norm": 0.0515432171523571, + "learning_rate": 1.3188261508898672e-05, + "loss": 0.002, + "num_input_tokens_seen": 26511008, + "step": 125620 + }, + { + "epoch": 13.82013201320132, + "grad_norm": 0.0072257681749761105, + "learning_rate": 1.3186146270841787e-05, + "loss": 0.0161, + "num_input_tokens_seen": 26512064, + "step": 125625 + }, + { + "epoch": 13.820682068206821, + "grad_norm": 0.07571463286876678, + "learning_rate": 1.318403114166753e-05, + "loss": 0.0472, + "num_input_tokens_seen": 26513152, + "step": 125630 + }, + { + "epoch": 13.82123212321232, + "grad_norm": 0.27466800808906555, + "learning_rate": 1.318191612139539e-05, + "loss": 0.0068, + "num_input_tokens_seen": 26514144, + "step": 125635 + }, + { + "epoch": 13.821782178217822, + "grad_norm": 0.0668390542268753, + "learning_rate": 1.3179801210044852e-05, + "loss": 0.0354, + "num_input_tokens_seen": 26515200, + "step": 125640 + }, + { + "epoch": 13.822332233223323, + "grad_norm": 0.25387293100357056, + "learning_rate": 1.3177686407635417e-05, + "loss": 0.0046, + "num_input_tokens_seen": 26516224, + "step": 125645 + }, + { + "epoch": 13.822882288228822, + "grad_norm": 0.019680844619870186, + "learning_rate": 1.3175571714186569e-05, + "loss": 0.1039, + "num_input_tokens_seen": 26517312, + "step": 125650 + }, + { + "epoch": 13.823432343234323, + "grad_norm": 0.009012551046907902, + "learning_rate": 1.3173457129717798e-05, + "loss": 0.0469, + "num_input_tokens_seen": 26518368, + "step": 125655 + }, + { + "epoch": 13.823982398239824, + "grad_norm": 0.02063538320362568, + "learning_rate": 1.3171342654248611e-05, + "loss": 0.0311, + "num_input_tokens_seen": 26519424, + "step": 125660 + }, + { + "epoch": 13.824532453245325, + "grad_norm": 1.3242539167404175, + "learning_rate": 1.3169228287798474e-05, + "loss": 0.052, + "num_input_tokens_seen": 26520512, + "step": 125665 + }, + { + "epoch": 13.825082508250825, + "grad_norm": 0.03138062357902527, + "learning_rate": 1.316711403038689e-05, + "loss": 0.0301, + "num_input_tokens_seen": 26521536, + "step": 125670 + }, + { + "epoch": 13.825632563256326, + "grad_norm": 2.2929816246032715, + "learning_rate": 1.316499988203333e-05, + "loss": 0.0191, + "num_input_tokens_seen": 26522592, + "step": 125675 + }, + { + "epoch": 13.826182618261827, + "grad_norm": 0.40590569376945496, + "learning_rate": 1.3162885842757284e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26523712, + "step": 125680 + }, + { + "epoch": 13.826732673267326, + "grad_norm": 0.007982578128576279, + "learning_rate": 1.316077191257825e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26524736, + "step": 125685 + }, + { + "epoch": 13.827282728272827, + "grad_norm": 0.1026611402630806, + "learning_rate": 1.3158658091515694e-05, + "loss": 0.0056, + "num_input_tokens_seen": 26525760, + "step": 125690 + }, + { + "epoch": 13.827832783278328, + "grad_norm": 3.3474254608154297, + "learning_rate": 1.3156544379589112e-05, + "loss": 0.0766, + "num_input_tokens_seen": 26526784, + "step": 125695 + }, + { + "epoch": 13.828382838283828, + "grad_norm": 0.002190694445744157, + "learning_rate": 1.3154430776817978e-05, + "loss": 0.0394, + "num_input_tokens_seen": 26527872, + "step": 125700 + }, + { + "epoch": 13.828932893289329, + "grad_norm": 0.019449202343821526, + "learning_rate": 1.315231728322176e-05, + "loss": 0.04, + "num_input_tokens_seen": 26528928, + "step": 125705 + }, + { + "epoch": 13.82948294829483, + "grad_norm": 0.009894479997456074, + "learning_rate": 1.3150203898819948e-05, + "loss": 0.0576, + "num_input_tokens_seen": 26530016, + "step": 125710 + }, + { + "epoch": 13.83003300330033, + "grad_norm": 2.454637289047241, + "learning_rate": 1.3148090623632022e-05, + "loss": 0.015, + "num_input_tokens_seen": 26531072, + "step": 125715 + }, + { + "epoch": 13.83058305830583, + "grad_norm": 0.15839087963104248, + "learning_rate": 1.3145977457677466e-05, + "loss": 0.0015, + "num_input_tokens_seen": 26532128, + "step": 125720 + }, + { + "epoch": 13.831133113311331, + "grad_norm": 0.018216414377093315, + "learning_rate": 1.3143864400975752e-05, + "loss": 0.0061, + "num_input_tokens_seen": 26533120, + "step": 125725 + }, + { + "epoch": 13.831683168316832, + "grad_norm": 0.04831791669130325, + "learning_rate": 1.3141751453546335e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26534144, + "step": 125730 + }, + { + "epoch": 13.832233223322332, + "grad_norm": 0.028644679114222527, + "learning_rate": 1.313963861540871e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26535104, + "step": 125735 + }, + { + "epoch": 13.832783278327833, + "grad_norm": 0.0050879004411399364, + "learning_rate": 1.3137525886582341e-05, + "loss": 0.0816, + "num_input_tokens_seen": 26536224, + "step": 125740 + }, + { + "epoch": 13.833333333333334, + "grad_norm": 0.039317548274993896, + "learning_rate": 1.3135413267086715e-05, + "loss": 0.0953, + "num_input_tokens_seen": 26537280, + "step": 125745 + }, + { + "epoch": 13.833883388338833, + "grad_norm": 0.033446524292230606, + "learning_rate": 1.3133300756941292e-05, + "loss": 0.0194, + "num_input_tokens_seen": 26538336, + "step": 125750 + }, + { + "epoch": 13.834433443344334, + "grad_norm": 0.2148102968931198, + "learning_rate": 1.3131188356165533e-05, + "loss": 0.0215, + "num_input_tokens_seen": 26539456, + "step": 125755 + }, + { + "epoch": 13.834983498349835, + "grad_norm": 0.006289573851972818, + "learning_rate": 1.3129076064778922e-05, + "loss": 0.0058, + "num_input_tokens_seen": 26540544, + "step": 125760 + }, + { + "epoch": 13.835533553355335, + "grad_norm": 0.010216564871370792, + "learning_rate": 1.3126963882800913e-05, + "loss": 0.0864, + "num_input_tokens_seen": 26541568, + "step": 125765 + }, + { + "epoch": 13.836083608360836, + "grad_norm": 0.05059461295604706, + "learning_rate": 1.3124851810250982e-05, + "loss": 0.0613, + "num_input_tokens_seen": 26542592, + "step": 125770 + }, + { + "epoch": 13.836633663366337, + "grad_norm": 0.05369914323091507, + "learning_rate": 1.3122739847148602e-05, + "loss": 0.0405, + "num_input_tokens_seen": 26543680, + "step": 125775 + }, + { + "epoch": 13.837183718371836, + "grad_norm": 0.1884228140115738, + "learning_rate": 1.3120627993513218e-05, + "loss": 0.0195, + "num_input_tokens_seen": 26544800, + "step": 125780 + }, + { + "epoch": 13.837733773377337, + "grad_norm": 0.018961863592267036, + "learning_rate": 1.3118516249364321e-05, + "loss": 0.0462, + "num_input_tokens_seen": 26545888, + "step": 125785 + }, + { + "epoch": 13.838283828382838, + "grad_norm": 0.019619012251496315, + "learning_rate": 1.3116404614721345e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26546944, + "step": 125790 + }, + { + "epoch": 13.83883388338834, + "grad_norm": 0.10997028648853302, + "learning_rate": 1.3114293089603763e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26547968, + "step": 125795 + }, + { + "epoch": 13.839383938393839, + "grad_norm": 0.010489143431186676, + "learning_rate": 1.311218167403105e-05, + "loss": 0.0023, + "num_input_tokens_seen": 26549024, + "step": 125800 + }, + { + "epoch": 13.83993399339934, + "grad_norm": 0.03497106209397316, + "learning_rate": 1.3110070368022646e-05, + "loss": 0.0097, + "num_input_tokens_seen": 26550016, + "step": 125805 + }, + { + "epoch": 13.840484048404841, + "grad_norm": 0.01665336824953556, + "learning_rate": 1.3107959171598025e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26551072, + "step": 125810 + }, + { + "epoch": 13.84103410341034, + "grad_norm": 0.24908395111560822, + "learning_rate": 1.3105848084776628e-05, + "loss": 0.0127, + "num_input_tokens_seen": 26552064, + "step": 125815 + }, + { + "epoch": 13.841584158415841, + "grad_norm": 0.22987763583660126, + "learning_rate": 1.3103737107577935e-05, + "loss": 0.0024, + "num_input_tokens_seen": 26553088, + "step": 125820 + }, + { + "epoch": 13.842134213421343, + "grad_norm": 0.026443252339959145, + "learning_rate": 1.3101626240021378e-05, + "loss": 0.0053, + "num_input_tokens_seen": 26554144, + "step": 125825 + }, + { + "epoch": 13.842684268426842, + "grad_norm": 0.038917865604162216, + "learning_rate": 1.3099515482126424e-05, + "loss": 0.0098, + "num_input_tokens_seen": 26555232, + "step": 125830 + }, + { + "epoch": 13.843234323432343, + "grad_norm": 0.009204358793795109, + "learning_rate": 1.3097404833912529e-05, + "loss": 0.0126, + "num_input_tokens_seen": 26556320, + "step": 125835 + }, + { + "epoch": 13.843784378437844, + "grad_norm": 0.0029909780714660883, + "learning_rate": 1.3095294295399139e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26557376, + "step": 125840 + }, + { + "epoch": 13.844334433443345, + "grad_norm": 0.08485807478427887, + "learning_rate": 1.3093183866605712e-05, + "loss": 0.0724, + "num_input_tokens_seen": 26558496, + "step": 125845 + }, + { + "epoch": 13.844884488448844, + "grad_norm": 0.13508063554763794, + "learning_rate": 1.3091073547551691e-05, + "loss": 0.0024, + "num_input_tokens_seen": 26559584, + "step": 125850 + }, + { + "epoch": 13.845434543454346, + "grad_norm": 0.036171749234199524, + "learning_rate": 1.3088963338256533e-05, + "loss": 0.0044, + "num_input_tokens_seen": 26560640, + "step": 125855 + }, + { + "epoch": 13.845984598459847, + "grad_norm": 0.4730065166950226, + "learning_rate": 1.3086853238739688e-05, + "loss": 0.0754, + "num_input_tokens_seen": 26561760, + "step": 125860 + }, + { + "epoch": 13.846534653465346, + "grad_norm": 0.06079423800110817, + "learning_rate": 1.3084743249020595e-05, + "loss": 0.024, + "num_input_tokens_seen": 26562784, + "step": 125865 + }, + { + "epoch": 13.847084708470847, + "grad_norm": 0.019727226346731186, + "learning_rate": 1.3082633369118713e-05, + "loss": 0.0406, + "num_input_tokens_seen": 26563808, + "step": 125870 + }, + { + "epoch": 13.847634763476348, + "grad_norm": 0.15591847896575928, + "learning_rate": 1.3080523599053485e-05, + "loss": 0.0044, + "num_input_tokens_seen": 26564928, + "step": 125875 + }, + { + "epoch": 13.848184818481847, + "grad_norm": 0.032775335013866425, + "learning_rate": 1.3078413938844336e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26565952, + "step": 125880 + }, + { + "epoch": 13.848734873487349, + "grad_norm": 0.03156265616416931, + "learning_rate": 1.307630438851073e-05, + "loss": 0.0408, + "num_input_tokens_seen": 26566944, + "step": 125885 + }, + { + "epoch": 13.84928492849285, + "grad_norm": 2.0891847610473633, + "learning_rate": 1.307419494807211e-05, + "loss": 0.0926, + "num_input_tokens_seen": 26567968, + "step": 125890 + }, + { + "epoch": 13.84983498349835, + "grad_norm": 0.009691271930932999, + "learning_rate": 1.3072085617547907e-05, + "loss": 0.1321, + "num_input_tokens_seen": 26568992, + "step": 125895 + }, + { + "epoch": 13.85038503850385, + "grad_norm": 0.01664670929312706, + "learning_rate": 1.3069976396957576e-05, + "loss": 0.0052, + "num_input_tokens_seen": 26570048, + "step": 125900 + }, + { + "epoch": 13.850935093509351, + "grad_norm": 0.05144000053405762, + "learning_rate": 1.3067867286320537e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26571136, + "step": 125905 + }, + { + "epoch": 13.851485148514852, + "grad_norm": 1.644813895225525, + "learning_rate": 1.306575828565624e-05, + "loss": 0.0855, + "num_input_tokens_seen": 26572192, + "step": 125910 + }, + { + "epoch": 13.852035203520352, + "grad_norm": 0.015167297795414925, + "learning_rate": 1.3063649394984134e-05, + "loss": 0.1314, + "num_input_tokens_seen": 26573184, + "step": 125915 + }, + { + "epoch": 13.852585258525853, + "grad_norm": 0.2825928330421448, + "learning_rate": 1.3061540614323631e-05, + "loss": 0.0052, + "num_input_tokens_seen": 26574304, + "step": 125920 + }, + { + "epoch": 13.853135313531354, + "grad_norm": 0.016038227826356888, + "learning_rate": 1.3059431943694191e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26575392, + "step": 125925 + }, + { + "epoch": 13.853685368536853, + "grad_norm": 0.04297204688191414, + "learning_rate": 1.3057323383115227e-05, + "loss": 0.122, + "num_input_tokens_seen": 26576448, + "step": 125930 + }, + { + "epoch": 13.854235423542354, + "grad_norm": 0.0735512375831604, + "learning_rate": 1.305521493260619e-05, + "loss": 0.0192, + "num_input_tokens_seen": 26577440, + "step": 125935 + }, + { + "epoch": 13.854785478547855, + "grad_norm": 0.020334888249635696, + "learning_rate": 1.30531065921865e-05, + "loss": 0.1292, + "num_input_tokens_seen": 26578560, + "step": 125940 + }, + { + "epoch": 13.855335533553355, + "grad_norm": 0.03991493955254555, + "learning_rate": 1.3050998361875594e-05, + "loss": 0.0099, + "num_input_tokens_seen": 26579552, + "step": 125945 + }, + { + "epoch": 13.855885588558856, + "grad_norm": 0.038826651871204376, + "learning_rate": 1.304889024169291e-05, + "loss": 0.0359, + "num_input_tokens_seen": 26580608, + "step": 125950 + }, + { + "epoch": 13.856435643564357, + "grad_norm": 0.9961110949516296, + "learning_rate": 1.304678223165786e-05, + "loss": 0.0947, + "num_input_tokens_seen": 26581632, + "step": 125955 + }, + { + "epoch": 13.856985698569858, + "grad_norm": 0.034974921494722366, + "learning_rate": 1.3044674331789891e-05, + "loss": 0.0046, + "num_input_tokens_seen": 26582720, + "step": 125960 + }, + { + "epoch": 13.857535753575357, + "grad_norm": 0.07870907336473465, + "learning_rate": 1.3042566542108415e-05, + "loss": 0.1147, + "num_input_tokens_seen": 26583744, + "step": 125965 + }, + { + "epoch": 13.858085808580858, + "grad_norm": 0.020211277529597282, + "learning_rate": 1.3040458862632865e-05, + "loss": 0.0026, + "num_input_tokens_seen": 26584832, + "step": 125970 + }, + { + "epoch": 13.85863586358636, + "grad_norm": 5.204082012176514, + "learning_rate": 1.3038351293382675e-05, + "loss": 0.1467, + "num_input_tokens_seen": 26585856, + "step": 125975 + }, + { + "epoch": 13.859185918591859, + "grad_norm": 0.036553118377923965, + "learning_rate": 1.3036243834377253e-05, + "loss": 0.0023, + "num_input_tokens_seen": 26586880, + "step": 125980 + }, + { + "epoch": 13.85973597359736, + "grad_norm": 1.771146535873413, + "learning_rate": 1.303413648563604e-05, + "loss": 0.0154, + "num_input_tokens_seen": 26587904, + "step": 125985 + }, + { + "epoch": 13.86028602860286, + "grad_norm": 0.056661371141672134, + "learning_rate": 1.3032029247178452e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26588928, + "step": 125990 + }, + { + "epoch": 13.86083608360836, + "grad_norm": 0.00696568563580513, + "learning_rate": 1.3029922119023893e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26589984, + "step": 125995 + }, + { + "epoch": 13.861386138613861, + "grad_norm": 0.01310319546610117, + "learning_rate": 1.3027815101191801e-05, + "loss": 0.0074, + "num_input_tokens_seen": 26591040, + "step": 126000 + }, + { + "epoch": 13.861936193619362, + "grad_norm": 1.096858263015747, + "learning_rate": 1.3025708193701592e-05, + "loss": 0.1302, + "num_input_tokens_seen": 26592128, + "step": 126005 + }, + { + "epoch": 13.862486248624862, + "grad_norm": 3.0368857383728027, + "learning_rate": 1.3023601396572694e-05, + "loss": 0.0699, + "num_input_tokens_seen": 26593184, + "step": 126010 + }, + { + "epoch": 13.863036303630363, + "grad_norm": 0.004676789976656437, + "learning_rate": 1.3021494709824514e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26594208, + "step": 126015 + }, + { + "epoch": 13.863586358635864, + "grad_norm": 0.02302596904337406, + "learning_rate": 1.3019388133476463e-05, + "loss": 0.0066, + "num_input_tokens_seen": 26595232, + "step": 126020 + }, + { + "epoch": 13.864136413641365, + "grad_norm": 0.1259714961051941, + "learning_rate": 1.301728166754796e-05, + "loss": 0.0115, + "num_input_tokens_seen": 26596288, + "step": 126025 + }, + { + "epoch": 13.864686468646864, + "grad_norm": 0.42867156863212585, + "learning_rate": 1.301517531205842e-05, + "loss": 0.0178, + "num_input_tokens_seen": 26597376, + "step": 126030 + }, + { + "epoch": 13.865236523652365, + "grad_norm": 0.01471613347530365, + "learning_rate": 1.3013069067027273e-05, + "loss": 0.0042, + "num_input_tokens_seen": 26598432, + "step": 126035 + }, + { + "epoch": 13.865786578657866, + "grad_norm": 0.18264059722423553, + "learning_rate": 1.3010962932473913e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26599488, + "step": 126040 + }, + { + "epoch": 13.866336633663366, + "grad_norm": 0.07553831487894058, + "learning_rate": 1.3008856908417747e-05, + "loss": 0.004, + "num_input_tokens_seen": 26600480, + "step": 126045 + }, + { + "epoch": 13.866886688668867, + "grad_norm": 1.4667903184890747, + "learning_rate": 1.3006750994878203e-05, + "loss": 0.0378, + "num_input_tokens_seen": 26601536, + "step": 126050 + }, + { + "epoch": 13.867436743674368, + "grad_norm": 0.03496193885803223, + "learning_rate": 1.3004645191874668e-05, + "loss": 0.0036, + "num_input_tokens_seen": 26602592, + "step": 126055 + }, + { + "epoch": 13.867986798679867, + "grad_norm": 0.00814379658550024, + "learning_rate": 1.3002539499426563e-05, + "loss": 0.055, + "num_input_tokens_seen": 26603616, + "step": 126060 + }, + { + "epoch": 13.868536853685368, + "grad_norm": 0.14060135185718536, + "learning_rate": 1.3000433917553306e-05, + "loss": 0.0021, + "num_input_tokens_seen": 26604672, + "step": 126065 + }, + { + "epoch": 13.86908690869087, + "grad_norm": 0.027821935713291168, + "learning_rate": 1.299832844627428e-05, + "loss": 0.0371, + "num_input_tokens_seen": 26605696, + "step": 126070 + }, + { + "epoch": 13.869636963696369, + "grad_norm": 0.006537338253110647, + "learning_rate": 1.299622308560891e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26606816, + "step": 126075 + }, + { + "epoch": 13.87018701870187, + "grad_norm": 0.016564514487981796, + "learning_rate": 1.2994117835576598e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26607840, + "step": 126080 + }, + { + "epoch": 13.870737073707371, + "grad_norm": 1.585057258605957, + "learning_rate": 1.2992012696196715e-05, + "loss": 0.0727, + "num_input_tokens_seen": 26608896, + "step": 126085 + }, + { + "epoch": 13.871287128712872, + "grad_norm": 0.01027577556669712, + "learning_rate": 1.2989907667488713e-05, + "loss": 0.0233, + "num_input_tokens_seen": 26609920, + "step": 126090 + }, + { + "epoch": 13.871837183718371, + "grad_norm": 0.5685874223709106, + "learning_rate": 1.2987802749471955e-05, + "loss": 0.0047, + "num_input_tokens_seen": 26610944, + "step": 126095 + }, + { + "epoch": 13.872387238723872, + "grad_norm": 0.10331438481807709, + "learning_rate": 1.2985697942165869e-05, + "loss": 0.0298, + "num_input_tokens_seen": 26612000, + "step": 126100 + }, + { + "epoch": 13.872937293729374, + "grad_norm": 0.014594071544706821, + "learning_rate": 1.2983593245589837e-05, + "loss": 0.0092, + "num_input_tokens_seen": 26613056, + "step": 126105 + }, + { + "epoch": 13.873487348734873, + "grad_norm": 0.019756503403186798, + "learning_rate": 1.2981488659763253e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26614048, + "step": 126110 + }, + { + "epoch": 13.874037403740374, + "grad_norm": 0.07176924496889114, + "learning_rate": 1.2979384184705518e-05, + "loss": 0.0667, + "num_input_tokens_seen": 26615136, + "step": 126115 + }, + { + "epoch": 13.874587458745875, + "grad_norm": 0.6128235459327698, + "learning_rate": 1.2977279820436034e-05, + "loss": 0.0309, + "num_input_tokens_seen": 26616224, + "step": 126120 + }, + { + "epoch": 13.875137513751374, + "grad_norm": 1.7950632572174072, + "learning_rate": 1.2975175566974201e-05, + "loss": 0.0438, + "num_input_tokens_seen": 26617280, + "step": 126125 + }, + { + "epoch": 13.875687568756875, + "grad_norm": 0.013068482279777527, + "learning_rate": 1.2973071424339404e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26618368, + "step": 126130 + }, + { + "epoch": 13.876237623762377, + "grad_norm": 0.03539760038256645, + "learning_rate": 1.2970967392551026e-05, + "loss": 0.0075, + "num_input_tokens_seen": 26619424, + "step": 126135 + }, + { + "epoch": 13.876787678767876, + "grad_norm": 1.4943346977233887, + "learning_rate": 1.2968863471628481e-05, + "loss": 0.0748, + "num_input_tokens_seen": 26620480, + "step": 126140 + }, + { + "epoch": 13.877337733773377, + "grad_norm": 0.024264050647616386, + "learning_rate": 1.296675966159113e-05, + "loss": 0.0038, + "num_input_tokens_seen": 26621504, + "step": 126145 + }, + { + "epoch": 13.877887788778878, + "grad_norm": 0.06109326332807541, + "learning_rate": 1.29646559624584e-05, + "loss": 0.0039, + "num_input_tokens_seen": 26622560, + "step": 126150 + }, + { + "epoch": 13.87843784378438, + "grad_norm": 0.018435021862387657, + "learning_rate": 1.2962552374249658e-05, + "loss": 0.0733, + "num_input_tokens_seen": 26623616, + "step": 126155 + }, + { + "epoch": 13.878987898789878, + "grad_norm": 1.8133594989776611, + "learning_rate": 1.2960448896984286e-05, + "loss": 0.0653, + "num_input_tokens_seen": 26624672, + "step": 126160 + }, + { + "epoch": 13.87953795379538, + "grad_norm": 0.009329304099082947, + "learning_rate": 1.2958345530681692e-05, + "loss": 0.0019, + "num_input_tokens_seen": 26625728, + "step": 126165 + }, + { + "epoch": 13.88008800880088, + "grad_norm": 0.0561211071908474, + "learning_rate": 1.2956242275361236e-05, + "loss": 0.0052, + "num_input_tokens_seen": 26626784, + "step": 126170 + }, + { + "epoch": 13.88063806380638, + "grad_norm": 0.0734938234090805, + "learning_rate": 1.2954139131042318e-05, + "loss": 0.0104, + "num_input_tokens_seen": 26627840, + "step": 126175 + }, + { + "epoch": 13.881188118811881, + "grad_norm": 0.008769775740802288, + "learning_rate": 1.2952036097744327e-05, + "loss": 0.0023, + "num_input_tokens_seen": 26628896, + "step": 126180 + }, + { + "epoch": 13.881738173817382, + "grad_norm": 0.111594557762146, + "learning_rate": 1.2949933175486629e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26629952, + "step": 126185 + }, + { + "epoch": 13.882288228822881, + "grad_norm": 0.009595793671905994, + "learning_rate": 1.2947830364288627e-05, + "loss": 0.0637, + "num_input_tokens_seen": 26631008, + "step": 126190 + }, + { + "epoch": 13.882838283828383, + "grad_norm": 1.2554601430892944, + "learning_rate": 1.2945727664169677e-05, + "loss": 0.0466, + "num_input_tokens_seen": 26632128, + "step": 126195 + }, + { + "epoch": 13.883388338833884, + "grad_norm": 3.6980297565460205, + "learning_rate": 1.2943625075149179e-05, + "loss": 0.1794, + "num_input_tokens_seen": 26633216, + "step": 126200 + }, + { + "epoch": 13.883938393839383, + "grad_norm": 0.017285704612731934, + "learning_rate": 1.2941522597246497e-05, + "loss": 0.0021, + "num_input_tokens_seen": 26634336, + "step": 126205 + }, + { + "epoch": 13.884488448844884, + "grad_norm": 0.9170121550559998, + "learning_rate": 1.2939420230481014e-05, + "loss": 0.0063, + "num_input_tokens_seen": 26635360, + "step": 126210 + }, + { + "epoch": 13.885038503850385, + "grad_norm": 1.5614452362060547, + "learning_rate": 1.293731797487212e-05, + "loss": 0.0443, + "num_input_tokens_seen": 26636416, + "step": 126215 + }, + { + "epoch": 13.885588558855886, + "grad_norm": 0.014977974817156792, + "learning_rate": 1.2935215830439166e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26637440, + "step": 126220 + }, + { + "epoch": 13.886138613861386, + "grad_norm": 0.5752890110015869, + "learning_rate": 1.2933113797201545e-05, + "loss": 0.0515, + "num_input_tokens_seen": 26638432, + "step": 126225 + }, + { + "epoch": 13.886688668866887, + "grad_norm": 0.3135966956615448, + "learning_rate": 1.2931011875178618e-05, + "loss": 0.0314, + "num_input_tokens_seen": 26639488, + "step": 126230 + }, + { + "epoch": 13.887238723872388, + "grad_norm": 0.02334052324295044, + "learning_rate": 1.2928910064389762e-05, + "loss": 0.0172, + "num_input_tokens_seen": 26640512, + "step": 126235 + }, + { + "epoch": 13.887788778877887, + "grad_norm": 0.06450715661048889, + "learning_rate": 1.292680836485436e-05, + "loss": 0.078, + "num_input_tokens_seen": 26641600, + "step": 126240 + }, + { + "epoch": 13.888338833883388, + "grad_norm": 0.12438558787107468, + "learning_rate": 1.2924706776591761e-05, + "loss": 0.0077, + "num_input_tokens_seen": 26642656, + "step": 126245 + }, + { + "epoch": 13.88888888888889, + "grad_norm": 0.01013268157839775, + "learning_rate": 1.2922605299621357e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26643680, + "step": 126250 + }, + { + "epoch": 13.88943894389439, + "grad_norm": 0.022341763600707054, + "learning_rate": 1.2920503933962503e-05, + "loss": 0.0095, + "num_input_tokens_seen": 26644672, + "step": 126255 + }, + { + "epoch": 13.88998899889989, + "grad_norm": 0.015606092289090157, + "learning_rate": 1.291840267963455e-05, + "loss": 0.0048, + "num_input_tokens_seen": 26645792, + "step": 126260 + }, + { + "epoch": 13.89053905390539, + "grad_norm": 2.260239362716675, + "learning_rate": 1.29163015366569e-05, + "loss": 0.0562, + "num_input_tokens_seen": 26646816, + "step": 126265 + }, + { + "epoch": 13.891089108910892, + "grad_norm": 0.08115344494581223, + "learning_rate": 1.2914200505048901e-05, + "loss": 0.021, + "num_input_tokens_seen": 26647904, + "step": 126270 + }, + { + "epoch": 13.891639163916391, + "grad_norm": 0.05904100090265274, + "learning_rate": 1.2912099584829906e-05, + "loss": 0.0119, + "num_input_tokens_seen": 26648960, + "step": 126275 + }, + { + "epoch": 13.892189218921892, + "grad_norm": 1.227927803993225, + "learning_rate": 1.29099987760193e-05, + "loss": 0.1094, + "num_input_tokens_seen": 26650016, + "step": 126280 + }, + { + "epoch": 13.892739273927393, + "grad_norm": 0.7409862875938416, + "learning_rate": 1.2907898078636426e-05, + "loss": 0.0146, + "num_input_tokens_seen": 26651040, + "step": 126285 + }, + { + "epoch": 13.893289328932893, + "grad_norm": 0.012289872393012047, + "learning_rate": 1.290579749270065e-05, + "loss": 0.0022, + "num_input_tokens_seen": 26652160, + "step": 126290 + }, + { + "epoch": 13.893839383938394, + "grad_norm": 0.053158991038799286, + "learning_rate": 1.2903697018231347e-05, + "loss": 0.0447, + "num_input_tokens_seen": 26653184, + "step": 126295 + }, + { + "epoch": 13.894389438943895, + "grad_norm": 0.02848566696047783, + "learning_rate": 1.2901596655247852e-05, + "loss": 0.0007, + "num_input_tokens_seen": 26654272, + "step": 126300 + }, + { + "epoch": 13.894939493949394, + "grad_norm": 0.1176762655377388, + "learning_rate": 1.2899496403769545e-05, + "loss": 0.0316, + "num_input_tokens_seen": 26655360, + "step": 126305 + }, + { + "epoch": 13.895489548954895, + "grad_norm": 0.010164182633161545, + "learning_rate": 1.2897396263815764e-05, + "loss": 0.0052, + "num_input_tokens_seen": 26656448, + "step": 126310 + }, + { + "epoch": 13.896039603960396, + "grad_norm": 2.4299018383026123, + "learning_rate": 1.2895296235405885e-05, + "loss": 0.1169, + "num_input_tokens_seen": 26657536, + "step": 126315 + }, + { + "epoch": 13.896589658965897, + "grad_norm": 0.03234424069523811, + "learning_rate": 1.2893196318559242e-05, + "loss": 0.0325, + "num_input_tokens_seen": 26658560, + "step": 126320 + }, + { + "epoch": 13.897139713971397, + "grad_norm": 0.10505614429712296, + "learning_rate": 1.28910965132952e-05, + "loss": 0.0134, + "num_input_tokens_seen": 26659648, + "step": 126325 + }, + { + "epoch": 13.897689768976898, + "grad_norm": 0.05033623054623604, + "learning_rate": 1.2888996819633121e-05, + "loss": 0.0176, + "num_input_tokens_seen": 26660768, + "step": 126330 + }, + { + "epoch": 13.898239823982399, + "grad_norm": 0.02307484671473503, + "learning_rate": 1.2886897237592337e-05, + "loss": 0.0036, + "num_input_tokens_seen": 26661824, + "step": 126335 + }, + { + "epoch": 13.898789878987898, + "grad_norm": 0.01705673150718212, + "learning_rate": 1.2884797767192217e-05, + "loss": 0.0087, + "num_input_tokens_seen": 26662880, + "step": 126340 + }, + { + "epoch": 13.8993399339934, + "grad_norm": 0.10078249871730804, + "learning_rate": 1.2882698408452093e-05, + "loss": 0.003, + "num_input_tokens_seen": 26663968, + "step": 126345 + }, + { + "epoch": 13.8998899889989, + "grad_norm": 0.27071690559387207, + "learning_rate": 1.2880599161391324e-05, + "loss": 0.0285, + "num_input_tokens_seen": 26665056, + "step": 126350 + }, + { + "epoch": 13.9004400440044, + "grad_norm": 0.03673100471496582, + "learning_rate": 1.2878500026029267e-05, + "loss": 0.0049, + "num_input_tokens_seen": 26666144, + "step": 126355 + }, + { + "epoch": 13.900990099009901, + "grad_norm": 0.010039779357612133, + "learning_rate": 1.2876401002385247e-05, + "loss": 0.0549, + "num_input_tokens_seen": 26667200, + "step": 126360 + }, + { + "epoch": 13.901540154015402, + "grad_norm": 0.004219458904117346, + "learning_rate": 1.2874302090478635e-05, + "loss": 0.0025, + "num_input_tokens_seen": 26668224, + "step": 126365 + }, + { + "epoch": 13.902090209020901, + "grad_norm": 0.06510716676712036, + "learning_rate": 1.287220329032876e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26669280, + "step": 126370 + }, + { + "epoch": 13.902640264026402, + "grad_norm": 0.4219801425933838, + "learning_rate": 1.2870104601954958e-05, + "loss": 0.0101, + "num_input_tokens_seen": 26670368, + "step": 126375 + }, + { + "epoch": 13.903190319031903, + "grad_norm": 0.027643263339996338, + "learning_rate": 1.2868006025376581e-05, + "loss": 0.003, + "num_input_tokens_seen": 26671456, + "step": 126380 + }, + { + "epoch": 13.903740374037405, + "grad_norm": 0.08871329575777054, + "learning_rate": 1.2865907560612972e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26672576, + "step": 126385 + }, + { + "epoch": 13.904290429042904, + "grad_norm": 0.015203721821308136, + "learning_rate": 1.286380920768348e-05, + "loss": 0.0982, + "num_input_tokens_seen": 26673696, + "step": 126390 + }, + { + "epoch": 13.904840484048405, + "grad_norm": 0.010844804346561432, + "learning_rate": 1.2861710966607431e-05, + "loss": 0.0054, + "num_input_tokens_seen": 26674784, + "step": 126395 + }, + { + "epoch": 13.905390539053906, + "grad_norm": 0.009383388794958591, + "learning_rate": 1.2859612837404164e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26675840, + "step": 126400 + }, + { + "epoch": 13.905940594059405, + "grad_norm": 0.01504601165652275, + "learning_rate": 1.2857514820093019e-05, + "loss": 0.0705, + "num_input_tokens_seen": 26676928, + "step": 126405 + }, + { + "epoch": 13.906490649064907, + "grad_norm": 0.04016527906060219, + "learning_rate": 1.285541691469333e-05, + "loss": 0.0012, + "num_input_tokens_seen": 26677984, + "step": 126410 + }, + { + "epoch": 13.907040704070408, + "grad_norm": 0.011125599965453148, + "learning_rate": 1.285331912122445e-05, + "loss": 0.0021, + "num_input_tokens_seen": 26679008, + "step": 126415 + }, + { + "epoch": 13.907590759075907, + "grad_norm": 0.20376694202423096, + "learning_rate": 1.2851221439705696e-05, + "loss": 0.0154, + "num_input_tokens_seen": 26680032, + "step": 126420 + }, + { + "epoch": 13.908140814081408, + "grad_norm": 0.04501976817846298, + "learning_rate": 1.2849123870156393e-05, + "loss": 0.0069, + "num_input_tokens_seen": 26681152, + "step": 126425 + }, + { + "epoch": 13.908690869086909, + "grad_norm": 0.08376172930002213, + "learning_rate": 1.2847026412595897e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26682272, + "step": 126430 + }, + { + "epoch": 13.909240924092408, + "grad_norm": 0.005687380209565163, + "learning_rate": 1.2844929067043516e-05, + "loss": 0.0069, + "num_input_tokens_seen": 26683360, + "step": 126435 + }, + { + "epoch": 13.90979097909791, + "grad_norm": 0.0067048328928649426, + "learning_rate": 1.2842831833518592e-05, + "loss": 0.0014, + "num_input_tokens_seen": 26684384, + "step": 126440 + }, + { + "epoch": 13.91034103410341, + "grad_norm": 0.1418040692806244, + "learning_rate": 1.2840734712040465e-05, + "loss": 0.0056, + "num_input_tokens_seen": 26685408, + "step": 126445 + }, + { + "epoch": 13.910891089108912, + "grad_norm": 0.01595788449048996, + "learning_rate": 1.283863770262844e-05, + "loss": 0.0137, + "num_input_tokens_seen": 26686432, + "step": 126450 + }, + { + "epoch": 13.911441144114411, + "grad_norm": 0.011752861551940441, + "learning_rate": 1.2836540805301866e-05, + "loss": 0.0374, + "num_input_tokens_seen": 26687456, + "step": 126455 + }, + { + "epoch": 13.911991199119912, + "grad_norm": 0.007191996555775404, + "learning_rate": 1.2834444020080047e-05, + "loss": 0.0487, + "num_input_tokens_seen": 26688480, + "step": 126460 + }, + { + "epoch": 13.912541254125413, + "grad_norm": 0.017579711973667145, + "learning_rate": 1.2832347346982321e-05, + "loss": 0.0014, + "num_input_tokens_seen": 26689568, + "step": 126465 + }, + { + "epoch": 13.913091309130913, + "grad_norm": 0.06544329971075058, + "learning_rate": 1.2830250786028023e-05, + "loss": 0.002, + "num_input_tokens_seen": 26690592, + "step": 126470 + }, + { + "epoch": 13.913641364136414, + "grad_norm": 1.702398657798767, + "learning_rate": 1.282815433723645e-05, + "loss": 0.0425, + "num_input_tokens_seen": 26691616, + "step": 126475 + }, + { + "epoch": 13.914191419141915, + "grad_norm": 0.0033183633349835873, + "learning_rate": 1.2826058000626951e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26692704, + "step": 126480 + }, + { + "epoch": 13.914741474147414, + "grad_norm": 0.085211381316185, + "learning_rate": 1.2823961776218832e-05, + "loss": 0.0036, + "num_input_tokens_seen": 26693792, + "step": 126485 + }, + { + "epoch": 13.915291529152915, + "grad_norm": 0.03233912214636803, + "learning_rate": 1.2821865664031407e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26694816, + "step": 126490 + }, + { + "epoch": 13.915841584158416, + "grad_norm": 0.0018994794227182865, + "learning_rate": 1.2819769664083999e-05, + "loss": 0.0817, + "num_input_tokens_seen": 26695904, + "step": 126495 + }, + { + "epoch": 13.916391639163916, + "grad_norm": 0.16675293445587158, + "learning_rate": 1.2817673776395933e-05, + "loss": 0.0122, + "num_input_tokens_seen": 26696928, + "step": 126500 + }, + { + "epoch": 13.916941694169417, + "grad_norm": 0.005564022809267044, + "learning_rate": 1.2815578000986529e-05, + "loss": 0.0051, + "num_input_tokens_seen": 26697952, + "step": 126505 + }, + { + "epoch": 13.917491749174918, + "grad_norm": 0.01497775036841631, + "learning_rate": 1.28134823378751e-05, + "loss": 0.0307, + "num_input_tokens_seen": 26698976, + "step": 126510 + }, + { + "epoch": 13.918041804180419, + "grad_norm": 0.2489689290523529, + "learning_rate": 1.2811386787080942e-05, + "loss": 0.0052, + "num_input_tokens_seen": 26700000, + "step": 126515 + }, + { + "epoch": 13.918591859185918, + "grad_norm": 0.02591070719063282, + "learning_rate": 1.2809291348623387e-05, + "loss": 0.0079, + "num_input_tokens_seen": 26701120, + "step": 126520 + }, + { + "epoch": 13.91914191419142, + "grad_norm": 0.3601834774017334, + "learning_rate": 1.280719602252174e-05, + "loss": 0.0492, + "num_input_tokens_seen": 26702176, + "step": 126525 + }, + { + "epoch": 13.91969196919692, + "grad_norm": 0.06859418004751205, + "learning_rate": 1.280510080879533e-05, + "loss": 0.0211, + "num_input_tokens_seen": 26703200, + "step": 126530 + }, + { + "epoch": 13.92024202420242, + "grad_norm": 1.862744927406311, + "learning_rate": 1.2803005707463454e-05, + "loss": 0.0357, + "num_input_tokens_seen": 26704288, + "step": 126535 + }, + { + "epoch": 13.92079207920792, + "grad_norm": 0.9469918012619019, + "learning_rate": 1.2800910718545412e-05, + "loss": 0.0651, + "num_input_tokens_seen": 26705344, + "step": 126540 + }, + { + "epoch": 13.921342134213422, + "grad_norm": 0.01716184802353382, + "learning_rate": 1.279881584206053e-05, + "loss": 0.0052, + "num_input_tokens_seen": 26706432, + "step": 126545 + }, + { + "epoch": 13.921892189218921, + "grad_norm": 0.013552162796258926, + "learning_rate": 1.2796721078028095e-05, + "loss": 0.0237, + "num_input_tokens_seen": 26707488, + "step": 126550 + }, + { + "epoch": 13.922442244224422, + "grad_norm": 0.0957925021648407, + "learning_rate": 1.2794626426467432e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26708544, + "step": 126555 + }, + { + "epoch": 13.922992299229923, + "grad_norm": 0.03684796392917633, + "learning_rate": 1.2792531887397846e-05, + "loss": 0.0762, + "num_input_tokens_seen": 26709600, + "step": 126560 + }, + { + "epoch": 13.923542354235423, + "grad_norm": 0.005980220157653093, + "learning_rate": 1.279043746083863e-05, + "loss": 0.0718, + "num_input_tokens_seen": 26710656, + "step": 126565 + }, + { + "epoch": 13.924092409240924, + "grad_norm": 0.06041954457759857, + "learning_rate": 1.27883431468091e-05, + "loss": 0.0037, + "num_input_tokens_seen": 26711680, + "step": 126570 + }, + { + "epoch": 13.924642464246425, + "grad_norm": 0.005701772402971983, + "learning_rate": 1.2786248945328543e-05, + "loss": 0.0007, + "num_input_tokens_seen": 26712768, + "step": 126575 + }, + { + "epoch": 13.925192519251926, + "grad_norm": 0.03786270320415497, + "learning_rate": 1.278415485641627e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26713760, + "step": 126580 + }, + { + "epoch": 13.925742574257425, + "grad_norm": 0.018674926832318306, + "learning_rate": 1.278206088009159e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26714816, + "step": 126585 + }, + { + "epoch": 13.926292629262926, + "grad_norm": 0.09097419679164886, + "learning_rate": 1.2779967016373778e-05, + "loss": 0.0068, + "num_input_tokens_seen": 26715840, + "step": 126590 + }, + { + "epoch": 13.926842684268427, + "grad_norm": 0.054217711091041565, + "learning_rate": 1.2777873265282159e-05, + "loss": 0.0737, + "num_input_tokens_seen": 26716896, + "step": 126595 + }, + { + "epoch": 13.927392739273927, + "grad_norm": 0.11171313375234604, + "learning_rate": 1.2775779626836009e-05, + "loss": 0.1216, + "num_input_tokens_seen": 26717984, + "step": 126600 + }, + { + "epoch": 13.927942794279428, + "grad_norm": 0.08050142228603363, + "learning_rate": 1.2773686101054642e-05, + "loss": 0.002, + "num_input_tokens_seen": 26719072, + "step": 126605 + }, + { + "epoch": 13.928492849284929, + "grad_norm": 0.0015100192977115512, + "learning_rate": 1.2771592687957329e-05, + "loss": 0.0085, + "num_input_tokens_seen": 26720160, + "step": 126610 + }, + { + "epoch": 13.929042904290428, + "grad_norm": 0.13411355018615723, + "learning_rate": 1.2769499387563384e-05, + "loss": 0.0075, + "num_input_tokens_seen": 26721216, + "step": 126615 + }, + { + "epoch": 13.92959295929593, + "grad_norm": 0.006706532556563616, + "learning_rate": 1.2767406199892102e-05, + "loss": 0.013, + "num_input_tokens_seen": 26722336, + "step": 126620 + }, + { + "epoch": 13.93014301430143, + "grad_norm": 0.659547746181488, + "learning_rate": 1.2765313124962757e-05, + "loss": 0.0775, + "num_input_tokens_seen": 26723392, + "step": 126625 + }, + { + "epoch": 13.930693069306932, + "grad_norm": 0.10275458544492722, + "learning_rate": 1.2763220162794658e-05, + "loss": 0.0139, + "num_input_tokens_seen": 26724416, + "step": 126630 + }, + { + "epoch": 13.93124312431243, + "grad_norm": 0.1181541234254837, + "learning_rate": 1.276112731340708e-05, + "loss": 0.0428, + "num_input_tokens_seen": 26725440, + "step": 126635 + }, + { + "epoch": 13.931793179317932, + "grad_norm": 1.4933137893676758, + "learning_rate": 1.275903457681932e-05, + "loss": 0.0342, + "num_input_tokens_seen": 26726496, + "step": 126640 + }, + { + "epoch": 13.932343234323433, + "grad_norm": 0.06724724918603897, + "learning_rate": 1.2756941953050671e-05, + "loss": 0.0015, + "num_input_tokens_seen": 26727520, + "step": 126645 + }, + { + "epoch": 13.932893289328932, + "grad_norm": 0.01768522709608078, + "learning_rate": 1.2754849442120417e-05, + "loss": 0.0011, + "num_input_tokens_seen": 26728512, + "step": 126650 + }, + { + "epoch": 13.933443344334433, + "grad_norm": 0.002505590906366706, + "learning_rate": 1.2752757044047827e-05, + "loss": 0.0011, + "num_input_tokens_seen": 26729632, + "step": 126655 + }, + { + "epoch": 13.933993399339935, + "grad_norm": 0.13451893627643585, + "learning_rate": 1.275066475885221e-05, + "loss": 0.105, + "num_input_tokens_seen": 26730688, + "step": 126660 + }, + { + "epoch": 13.934543454345434, + "grad_norm": 1.7268786430358887, + "learning_rate": 1.2748572586552827e-05, + "loss": 0.1203, + "num_input_tokens_seen": 26731776, + "step": 126665 + }, + { + "epoch": 13.935093509350935, + "grad_norm": 0.06737973541021347, + "learning_rate": 1.2746480527168975e-05, + "loss": 0.0017, + "num_input_tokens_seen": 26732800, + "step": 126670 + }, + { + "epoch": 13.935643564356436, + "grad_norm": 0.071163609623909, + "learning_rate": 1.2744388580719937e-05, + "loss": 0.0041, + "num_input_tokens_seen": 26733824, + "step": 126675 + }, + { + "epoch": 13.936193619361937, + "grad_norm": 0.030249763280153275, + "learning_rate": 1.274229674722498e-05, + "loss": 0.0328, + "num_input_tokens_seen": 26734880, + "step": 126680 + }, + { + "epoch": 13.936743674367436, + "grad_norm": 0.0030586125794798136, + "learning_rate": 1.2740205026703406e-05, + "loss": 0.0095, + "num_input_tokens_seen": 26735936, + "step": 126685 + }, + { + "epoch": 13.937293729372938, + "grad_norm": 0.003917882684618235, + "learning_rate": 1.2738113419174464e-05, + "loss": 0.0555, + "num_input_tokens_seen": 26736960, + "step": 126690 + }, + { + "epoch": 13.937843784378439, + "grad_norm": 0.03318368270993233, + "learning_rate": 1.273602192465745e-05, + "loss": 0.029, + "num_input_tokens_seen": 26737920, + "step": 126695 + }, + { + "epoch": 13.938393839383938, + "grad_norm": 0.0124931326135993, + "learning_rate": 1.2733930543171647e-05, + "loss": 0.0284, + "num_input_tokens_seen": 26738976, + "step": 126700 + }, + { + "epoch": 13.938943894389439, + "grad_norm": 0.007768173702061176, + "learning_rate": 1.273183927473631e-05, + "loss": 0.0632, + "num_input_tokens_seen": 26740000, + "step": 126705 + }, + { + "epoch": 13.93949394939494, + "grad_norm": 0.16714900732040405, + "learning_rate": 1.2729748119370733e-05, + "loss": 0.0065, + "num_input_tokens_seen": 26741088, + "step": 126710 + }, + { + "epoch": 13.94004400440044, + "grad_norm": 0.8071889877319336, + "learning_rate": 1.2727657077094168e-05, + "loss": 0.0131, + "num_input_tokens_seen": 26742112, + "step": 126715 + }, + { + "epoch": 13.94059405940594, + "grad_norm": 0.12720070779323578, + "learning_rate": 1.2725566147925911e-05, + "loss": 0.0022, + "num_input_tokens_seen": 26743168, + "step": 126720 + }, + { + "epoch": 13.941144114411442, + "grad_norm": 0.015551849268376827, + "learning_rate": 1.2723475331885213e-05, + "loss": 0.0128, + "num_input_tokens_seen": 26744160, + "step": 126725 + }, + { + "epoch": 13.941694169416941, + "grad_norm": 0.022053182125091553, + "learning_rate": 1.272138462899135e-05, + "loss": 0.0014, + "num_input_tokens_seen": 26745216, + "step": 126730 + }, + { + "epoch": 13.942244224422442, + "grad_norm": 0.005146441515535116, + "learning_rate": 1.2719294039263601e-05, + "loss": 0.0509, + "num_input_tokens_seen": 26746304, + "step": 126735 + }, + { + "epoch": 13.942794279427943, + "grad_norm": 0.022552940994501114, + "learning_rate": 1.271720356272122e-05, + "loss": 0.01, + "num_input_tokens_seen": 26747360, + "step": 126740 + }, + { + "epoch": 13.943344334433444, + "grad_norm": 3.0160257816314697, + "learning_rate": 1.2715113199383489e-05, + "loss": 0.1179, + "num_input_tokens_seen": 26748416, + "step": 126745 + }, + { + "epoch": 13.943894389438944, + "grad_norm": 0.012328792363405228, + "learning_rate": 1.2713022949269651e-05, + "loss": 0.0009, + "num_input_tokens_seen": 26749440, + "step": 126750 + }, + { + "epoch": 13.944444444444445, + "grad_norm": 0.08793166279792786, + "learning_rate": 1.2710932812398988e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26750496, + "step": 126755 + }, + { + "epoch": 13.944994499449946, + "grad_norm": 0.09906076639890671, + "learning_rate": 1.270884278879077e-05, + "loss": 0.003, + "num_input_tokens_seen": 26751520, + "step": 126760 + }, + { + "epoch": 13.945544554455445, + "grad_norm": 0.0270460806787014, + "learning_rate": 1.270675287846424e-05, + "loss": 0.0012, + "num_input_tokens_seen": 26752544, + "step": 126765 + }, + { + "epoch": 13.946094609460946, + "grad_norm": 0.06822384893894196, + "learning_rate": 1.2704663081438678e-05, + "loss": 0.0046, + "num_input_tokens_seen": 26753568, + "step": 126770 + }, + { + "epoch": 13.946644664466447, + "grad_norm": 0.7006605267524719, + "learning_rate": 1.2702573397733338e-05, + "loss": 0.0055, + "num_input_tokens_seen": 26754560, + "step": 126775 + }, + { + "epoch": 13.947194719471947, + "grad_norm": 0.01914721541106701, + "learning_rate": 1.2700483827367468e-05, + "loss": 0.1009, + "num_input_tokens_seen": 26755712, + "step": 126780 + }, + { + "epoch": 13.947744774477448, + "grad_norm": 0.014770098961889744, + "learning_rate": 1.2698394370360339e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26756768, + "step": 126785 + }, + { + "epoch": 13.948294829482949, + "grad_norm": 1.8367112874984741, + "learning_rate": 1.2696305026731204e-05, + "loss": 0.0621, + "num_input_tokens_seen": 26757824, + "step": 126790 + }, + { + "epoch": 13.948844884488448, + "grad_norm": 0.018804343417286873, + "learning_rate": 1.2694215796499331e-05, + "loss": 0.0008, + "num_input_tokens_seen": 26758912, + "step": 126795 + }, + { + "epoch": 13.94939493949395, + "grad_norm": 0.6640046834945679, + "learning_rate": 1.2692126679683966e-05, + "loss": 0.0273, + "num_input_tokens_seen": 26759936, + "step": 126800 + }, + { + "epoch": 13.94994499449945, + "grad_norm": 0.048077914863824844, + "learning_rate": 1.2690037676304356e-05, + "loss": 0.0685, + "num_input_tokens_seen": 26760960, + "step": 126805 + }, + { + "epoch": 13.950495049504951, + "grad_norm": 0.0691220685839653, + "learning_rate": 1.2687948786379761e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26762048, + "step": 126810 + }, + { + "epoch": 13.95104510451045, + "grad_norm": 0.02839779667556286, + "learning_rate": 1.2685860009929432e-05, + "loss": 0.009, + "num_input_tokens_seen": 26763104, + "step": 126815 + }, + { + "epoch": 13.951595159515952, + "grad_norm": 0.11774562299251556, + "learning_rate": 1.2683771346972633e-05, + "loss": 0.0084, + "num_input_tokens_seen": 26764128, + "step": 126820 + }, + { + "epoch": 13.952145214521453, + "grad_norm": 0.1336764544248581, + "learning_rate": 1.2681682797528605e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26765184, + "step": 126825 + }, + { + "epoch": 13.952695269526952, + "grad_norm": 0.014463192783296108, + "learning_rate": 1.2679594361616585e-05, + "loss": 0.018, + "num_input_tokens_seen": 26766208, + "step": 126830 + }, + { + "epoch": 13.953245324532453, + "grad_norm": 0.10557834059000015, + "learning_rate": 1.2677506039255843e-05, + "loss": 0.0194, + "num_input_tokens_seen": 26767328, + "step": 126835 + }, + { + "epoch": 13.953795379537954, + "grad_norm": 0.01032004039734602, + "learning_rate": 1.2675417830465602e-05, + "loss": 0.2018, + "num_input_tokens_seen": 26768416, + "step": 126840 + }, + { + "epoch": 13.954345434543454, + "grad_norm": 0.019216613844037056, + "learning_rate": 1.2673329735265124e-05, + "loss": 0.0987, + "num_input_tokens_seen": 26769472, + "step": 126845 + }, + { + "epoch": 13.954895489548955, + "grad_norm": 0.04544687271118164, + "learning_rate": 1.2671241753673657e-05, + "loss": 0.0754, + "num_input_tokens_seen": 26770496, + "step": 126850 + }, + { + "epoch": 13.955445544554456, + "grad_norm": 0.23882052302360535, + "learning_rate": 1.2669153885710434e-05, + "loss": 0.013, + "num_input_tokens_seen": 26771584, + "step": 126855 + }, + { + "epoch": 13.955995599559955, + "grad_norm": 0.005586514249444008, + "learning_rate": 1.2667066131394712e-05, + "loss": 0.0492, + "num_input_tokens_seen": 26772576, + "step": 126860 + }, + { + "epoch": 13.956545654565456, + "grad_norm": 0.025728760287165642, + "learning_rate": 1.266497849074572e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26773568, + "step": 126865 + }, + { + "epoch": 13.957095709570957, + "grad_norm": 2.348782539367676, + "learning_rate": 1.2662890963782686e-05, + "loss": 0.1703, + "num_input_tokens_seen": 26774560, + "step": 126870 + }, + { + "epoch": 13.957645764576458, + "grad_norm": 2.9407458305358887, + "learning_rate": 1.2660803550524882e-05, + "loss": 0.0902, + "num_input_tokens_seen": 26775616, + "step": 126875 + }, + { + "epoch": 13.958195819581958, + "grad_norm": 0.041528839617967606, + "learning_rate": 1.2658716250991526e-05, + "loss": 0.0041, + "num_input_tokens_seen": 26776640, + "step": 126880 + }, + { + "epoch": 13.958745874587459, + "grad_norm": 0.10659946501255035, + "learning_rate": 1.2656629065201863e-05, + "loss": 0.0261, + "num_input_tokens_seen": 26777664, + "step": 126885 + }, + { + "epoch": 13.95929592959296, + "grad_norm": 0.03068658709526062, + "learning_rate": 1.2654541993175134e-05, + "loss": 0.0927, + "num_input_tokens_seen": 26778656, + "step": 126890 + }, + { + "epoch": 13.95984598459846, + "grad_norm": 0.6124083995819092, + "learning_rate": 1.2652455034930554e-05, + "loss": 0.1109, + "num_input_tokens_seen": 26779744, + "step": 126895 + }, + { + "epoch": 13.96039603960396, + "grad_norm": 0.016268180683255196, + "learning_rate": 1.2650368190487372e-05, + "loss": 0.0099, + "num_input_tokens_seen": 26780832, + "step": 126900 + }, + { + "epoch": 13.960946094609461, + "grad_norm": 0.012504100799560547, + "learning_rate": 1.264828145986482e-05, + "loss": 0.0014, + "num_input_tokens_seen": 26781856, + "step": 126905 + }, + { + "epoch": 13.96149614961496, + "grad_norm": 0.029668232426047325, + "learning_rate": 1.2646194843082143e-05, + "loss": 0.0047, + "num_input_tokens_seen": 26782912, + "step": 126910 + }, + { + "epoch": 13.962046204620462, + "grad_norm": 0.1571343094110489, + "learning_rate": 1.2644108340158555e-05, + "loss": 0.0073, + "num_input_tokens_seen": 26783936, + "step": 126915 + }, + { + "epoch": 13.962596259625963, + "grad_norm": 0.1746620237827301, + "learning_rate": 1.2642021951113287e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26785024, + "step": 126920 + }, + { + "epoch": 13.963146314631462, + "grad_norm": 0.0612829215824604, + "learning_rate": 1.263993567596558e-05, + "loss": 0.112, + "num_input_tokens_seen": 26786080, + "step": 126925 + }, + { + "epoch": 13.963696369636963, + "grad_norm": 0.050359129905700684, + "learning_rate": 1.2637849514734638e-05, + "loss": 0.0015, + "num_input_tokens_seen": 26787136, + "step": 126930 + }, + { + "epoch": 13.964246424642464, + "grad_norm": 0.08040095865726471, + "learning_rate": 1.263576346743972e-05, + "loss": 0.0048, + "num_input_tokens_seen": 26788224, + "step": 126935 + }, + { + "epoch": 13.964796479647966, + "grad_norm": 1.8399931192398071, + "learning_rate": 1.2633677534100041e-05, + "loss": 0.144, + "num_input_tokens_seen": 26789312, + "step": 126940 + }, + { + "epoch": 13.965346534653465, + "grad_norm": 0.7524603009223938, + "learning_rate": 1.2631591714734812e-05, + "loss": 0.0662, + "num_input_tokens_seen": 26790400, + "step": 126945 + }, + { + "epoch": 13.965896589658966, + "grad_norm": 0.05975163355469704, + "learning_rate": 1.2629506009363276e-05, + "loss": 0.0043, + "num_input_tokens_seen": 26791456, + "step": 126950 + }, + { + "epoch": 13.966446644664467, + "grad_norm": 0.32137179374694824, + "learning_rate": 1.2627420418004637e-05, + "loss": 0.0066, + "num_input_tokens_seen": 26792480, + "step": 126955 + }, + { + "epoch": 13.966996699669966, + "grad_norm": 0.0432986319065094, + "learning_rate": 1.2625334940678129e-05, + "loss": 0.0542, + "num_input_tokens_seen": 26793632, + "step": 126960 + }, + { + "epoch": 13.967546754675467, + "grad_norm": 0.01013341173529625, + "learning_rate": 1.2623249577402979e-05, + "loss": 0.0515, + "num_input_tokens_seen": 26794688, + "step": 126965 + }, + { + "epoch": 13.968096809680969, + "grad_norm": 0.041452132165431976, + "learning_rate": 1.2621164328198388e-05, + "loss": 0.0179, + "num_input_tokens_seen": 26795776, + "step": 126970 + }, + { + "epoch": 13.968646864686468, + "grad_norm": 0.33283576369285583, + "learning_rate": 1.2619079193083596e-05, + "loss": 0.0678, + "num_input_tokens_seen": 26796832, + "step": 126975 + }, + { + "epoch": 13.969196919691969, + "grad_norm": 0.020005658268928528, + "learning_rate": 1.2616994172077801e-05, + "loss": 0.0054, + "num_input_tokens_seen": 26797920, + "step": 126980 + }, + { + "epoch": 13.96974697469747, + "grad_norm": 0.035860639065504074, + "learning_rate": 1.2614909265200239e-05, + "loss": 0.0474, + "num_input_tokens_seen": 26798976, + "step": 126985 + }, + { + "epoch": 13.97029702970297, + "grad_norm": 0.09518013149499893, + "learning_rate": 1.2612824472470105e-05, + "loss": 0.0024, + "num_input_tokens_seen": 26800064, + "step": 126990 + }, + { + "epoch": 13.97084708470847, + "grad_norm": 0.021641183644533157, + "learning_rate": 1.2610739793906623e-05, + "loss": 0.0026, + "num_input_tokens_seen": 26801216, + "step": 126995 + }, + { + "epoch": 13.971397139713972, + "grad_norm": 0.0420815534889698, + "learning_rate": 1.260865522952902e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26802272, + "step": 127000 + }, + { + "epoch": 13.971947194719473, + "grad_norm": 0.11716798692941666, + "learning_rate": 1.2606570779356484e-05, + "loss": 0.0313, + "num_input_tokens_seen": 26803392, + "step": 127005 + }, + { + "epoch": 13.972497249724972, + "grad_norm": 0.015970857813954353, + "learning_rate": 1.2604486443408247e-05, + "loss": 0.0443, + "num_input_tokens_seen": 26804544, + "step": 127010 + }, + { + "epoch": 13.973047304730473, + "grad_norm": 0.1562996804714203, + "learning_rate": 1.2602402221703502e-05, + "loss": 0.0128, + "num_input_tokens_seen": 26805600, + "step": 127015 + }, + { + "epoch": 13.973597359735974, + "grad_norm": 0.0069977892562747, + "learning_rate": 1.2600318114261466e-05, + "loss": 0.0045, + "num_input_tokens_seen": 26806592, + "step": 127020 + }, + { + "epoch": 13.974147414741473, + "grad_norm": 0.02900761552155018, + "learning_rate": 1.259823412110136e-05, + "loss": 0.0057, + "num_input_tokens_seen": 26807648, + "step": 127025 + }, + { + "epoch": 13.974697469746975, + "grad_norm": 0.025613976642489433, + "learning_rate": 1.2596150242242378e-05, + "loss": 0.0054, + "num_input_tokens_seen": 26808640, + "step": 127030 + }, + { + "epoch": 13.975247524752476, + "grad_norm": 0.07001517713069916, + "learning_rate": 1.2594066477703715e-05, + "loss": 0.0021, + "num_input_tokens_seen": 26809664, + "step": 127035 + }, + { + "epoch": 13.975797579757975, + "grad_norm": 0.056572917848825455, + "learning_rate": 1.25919828275046e-05, + "loss": 0.0119, + "num_input_tokens_seen": 26810720, + "step": 127040 + }, + { + "epoch": 13.976347634763476, + "grad_norm": 0.008845902048051357, + "learning_rate": 1.2589899291664206e-05, + "loss": 0.0024, + "num_input_tokens_seen": 26811776, + "step": 127045 + }, + { + "epoch": 13.976897689768977, + "grad_norm": 0.10504492372274399, + "learning_rate": 1.2587815870201774e-05, + "loss": 0.005, + "num_input_tokens_seen": 26812800, + "step": 127050 + }, + { + "epoch": 13.977447744774478, + "grad_norm": 0.008990871720016003, + "learning_rate": 1.2585732563136487e-05, + "loss": 0.0014, + "num_input_tokens_seen": 26813888, + "step": 127055 + }, + { + "epoch": 13.977997799779978, + "grad_norm": 0.05418064817786217, + "learning_rate": 1.2583649370487537e-05, + "loss": 0.0015, + "num_input_tokens_seen": 26814944, + "step": 127060 + }, + { + "epoch": 13.978547854785479, + "grad_norm": 0.031049424782395363, + "learning_rate": 1.2581566292274143e-05, + "loss": 0.0021, + "num_input_tokens_seen": 26815968, + "step": 127065 + }, + { + "epoch": 13.97909790979098, + "grad_norm": 0.012418064288794994, + "learning_rate": 1.2579483328515485e-05, + "loss": 0.0577, + "num_input_tokens_seen": 26816960, + "step": 127070 + }, + { + "epoch": 13.979647964796479, + "grad_norm": 0.007906626909971237, + "learning_rate": 1.2577400479230766e-05, + "loss": 0.0023, + "num_input_tokens_seen": 26818016, + "step": 127075 + }, + { + "epoch": 13.98019801980198, + "grad_norm": 0.05074991658329964, + "learning_rate": 1.25753177444392e-05, + "loss": 0.0223, + "num_input_tokens_seen": 26819008, + "step": 127080 + }, + { + "epoch": 13.980748074807481, + "grad_norm": 0.02336478792130947, + "learning_rate": 1.2573235124159954e-05, + "loss": 0.002, + "num_input_tokens_seen": 26820064, + "step": 127085 + }, + { + "epoch": 13.98129812981298, + "grad_norm": 0.4463579058647156, + "learning_rate": 1.2571152618412249e-05, + "loss": 0.0753, + "num_input_tokens_seen": 26821184, + "step": 127090 + }, + { + "epoch": 13.981848184818482, + "grad_norm": 0.018325068056583405, + "learning_rate": 1.2569070227215258e-05, + "loss": 0.005, + "num_input_tokens_seen": 26822176, + "step": 127095 + }, + { + "epoch": 13.982398239823983, + "grad_norm": 0.050345528870821, + "learning_rate": 1.2566987950588188e-05, + "loss": 0.0109, + "num_input_tokens_seen": 26823264, + "step": 127100 + }, + { + "epoch": 13.982948294829484, + "grad_norm": 2.1394729614257812, + "learning_rate": 1.2564905788550219e-05, + "loss": 0.1422, + "num_input_tokens_seen": 26824256, + "step": 127105 + }, + { + "epoch": 13.983498349834983, + "grad_norm": 0.007974964566528797, + "learning_rate": 1.2562823741120541e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26825344, + "step": 127110 + }, + { + "epoch": 13.984048404840484, + "grad_norm": 0.015966225415468216, + "learning_rate": 1.256074180831836e-05, + "loss": 0.004, + "num_input_tokens_seen": 26826400, + "step": 127115 + }, + { + "epoch": 13.984598459845985, + "grad_norm": 0.029232658445835114, + "learning_rate": 1.2558659990162847e-05, + "loss": 0.0039, + "num_input_tokens_seen": 26827520, + "step": 127120 + }, + { + "epoch": 13.985148514851485, + "grad_norm": 0.2683647871017456, + "learning_rate": 1.2556578286673198e-05, + "loss": 0.007, + "num_input_tokens_seen": 26828480, + "step": 127125 + }, + { + "epoch": 13.985698569856986, + "grad_norm": 0.007267920300364494, + "learning_rate": 1.2554496697868592e-05, + "loss": 0.0366, + "num_input_tokens_seen": 26829568, + "step": 127130 + }, + { + "epoch": 13.986248624862487, + "grad_norm": 0.042193811386823654, + "learning_rate": 1.2552415223768215e-05, + "loss": 0.0052, + "num_input_tokens_seen": 26830592, + "step": 127135 + }, + { + "epoch": 13.986798679867986, + "grad_norm": 0.02575412392616272, + "learning_rate": 1.2550333864391267e-05, + "loss": 0.0012, + "num_input_tokens_seen": 26831648, + "step": 127140 + }, + { + "epoch": 13.987348734873487, + "grad_norm": 0.04132111370563507, + "learning_rate": 1.2548252619756906e-05, + "loss": 0.0034, + "num_input_tokens_seen": 26832736, + "step": 127145 + }, + { + "epoch": 13.987898789878988, + "grad_norm": 1.226434350013733, + "learning_rate": 1.2546171489884333e-05, + "loss": 0.0358, + "num_input_tokens_seen": 26833792, + "step": 127150 + }, + { + "epoch": 13.988448844884488, + "grad_norm": 0.012491490691900253, + "learning_rate": 1.2544090474792724e-05, + "loss": 0.0029, + "num_input_tokens_seen": 26834880, + "step": 127155 + }, + { + "epoch": 13.988998899889989, + "grad_norm": 0.01403071079403162, + "learning_rate": 1.2542009574501246e-05, + "loss": 0.0309, + "num_input_tokens_seen": 26835936, + "step": 127160 + }, + { + "epoch": 13.98954895489549, + "grad_norm": 0.0505196638405323, + "learning_rate": 1.2539928789029088e-05, + "loss": 0.0186, + "num_input_tokens_seen": 26837024, + "step": 127165 + }, + { + "epoch": 13.990099009900991, + "grad_norm": 0.04076697677373886, + "learning_rate": 1.253784811839543e-05, + "loss": 0.0135, + "num_input_tokens_seen": 26838144, + "step": 127170 + }, + { + "epoch": 13.99064906490649, + "grad_norm": 0.20651067793369293, + "learning_rate": 1.2535767562619449e-05, + "loss": 0.0576, + "num_input_tokens_seen": 26839200, + "step": 127175 + }, + { + "epoch": 13.991199119911991, + "grad_norm": 0.023506777361035347, + "learning_rate": 1.2533687121720323e-05, + "loss": 0.0011, + "num_input_tokens_seen": 26840288, + "step": 127180 + }, + { + "epoch": 13.991749174917492, + "grad_norm": 2.9429986476898193, + "learning_rate": 1.253160679571721e-05, + "loss": 0.0923, + "num_input_tokens_seen": 26841312, + "step": 127185 + }, + { + "epoch": 13.992299229922992, + "grad_norm": 0.005595281254500151, + "learning_rate": 1.2529526584629291e-05, + "loss": 0.1781, + "num_input_tokens_seen": 26842304, + "step": 127190 + }, + { + "epoch": 13.992849284928493, + "grad_norm": 0.06050257012248039, + "learning_rate": 1.2527446488475748e-05, + "loss": 0.0438, + "num_input_tokens_seen": 26843392, + "step": 127195 + }, + { + "epoch": 13.993399339933994, + "grad_norm": 0.006446303334087133, + "learning_rate": 1.252536650727575e-05, + "loss": 0.0011, + "num_input_tokens_seen": 26844448, + "step": 127200 + }, + { + "epoch": 13.993949394939493, + "grad_norm": 0.059163838624954224, + "learning_rate": 1.2523286641048466e-05, + "loss": 0.0077, + "num_input_tokens_seen": 26845568, + "step": 127205 + }, + { + "epoch": 13.994499449944994, + "grad_norm": 0.022607900202274323, + "learning_rate": 1.252120688981305e-05, + "loss": 0.0037, + "num_input_tokens_seen": 26846656, + "step": 127210 + }, + { + "epoch": 13.995049504950495, + "grad_norm": 1.292412519454956, + "learning_rate": 1.2519127253588692e-05, + "loss": 0.0182, + "num_input_tokens_seen": 26847744, + "step": 127215 + }, + { + "epoch": 13.995599559955995, + "grad_norm": 0.03612508252263069, + "learning_rate": 1.2517047732394543e-05, + "loss": 0.0043, + "num_input_tokens_seen": 26848832, + "step": 127220 + }, + { + "epoch": 13.996149614961496, + "grad_norm": 0.23070189356803894, + "learning_rate": 1.2514968326249772e-05, + "loss": 0.1004, + "num_input_tokens_seen": 26849920, + "step": 127225 + }, + { + "epoch": 13.996699669966997, + "grad_norm": 0.2236722856760025, + "learning_rate": 1.2512889035173558e-05, + "loss": 0.0085, + "num_input_tokens_seen": 26850976, + "step": 127230 + }, + { + "epoch": 13.997249724972498, + "grad_norm": 0.08163437992334366, + "learning_rate": 1.2510809859185043e-05, + "loss": 0.0148, + "num_input_tokens_seen": 26851968, + "step": 127235 + }, + { + "epoch": 13.997799779977997, + "grad_norm": 0.3900880813598633, + "learning_rate": 1.2508730798303411e-05, + "loss": 0.0417, + "num_input_tokens_seen": 26852960, + "step": 127240 + }, + { + "epoch": 13.998349834983498, + "grad_norm": 0.041677191853523254, + "learning_rate": 1.2506651852547804e-05, + "loss": 0.0073, + "num_input_tokens_seen": 26853984, + "step": 127245 + }, + { + "epoch": 13.998899889989, + "grad_norm": 0.014314040541648865, + "learning_rate": 1.2504573021937394e-05, + "loss": 0.149, + "num_input_tokens_seen": 26855104, + "step": 127250 + }, + { + "epoch": 13.999449944994499, + "grad_norm": 0.13144885003566742, + "learning_rate": 1.2502494306491345e-05, + "loss": 0.0161, + "num_input_tokens_seen": 26856160, + "step": 127255 + }, + { + "epoch": 14.0, + "grad_norm": 0.003266705898568034, + "learning_rate": 1.25004157062288e-05, + "loss": 0.0553, + "num_input_tokens_seen": 26857088, + "step": 127260 + }, + { + "epoch": 14.0, + "eval_loss": 0.07163964956998825, + "eval_runtime": 37.0011, + "eval_samples_per_second": 109.186, + "eval_steps_per_second": 27.297, + "num_input_tokens_seen": 26857088, + "step": 127260 + }, + { + "epoch": 14.000550055005501, + "grad_norm": 0.049773652106523514, + "learning_rate": 1.2498337221168935e-05, + "loss": 0.0517, + "num_input_tokens_seen": 26858208, + "step": 127265 + }, + { + "epoch": 14.001100110011, + "grad_norm": 1.8118990659713745, + "learning_rate": 1.2496258851330897e-05, + "loss": 0.0822, + "num_input_tokens_seen": 26859232, + "step": 127270 + }, + { + "epoch": 14.001650165016502, + "grad_norm": 0.02005581185221672, + "learning_rate": 1.2494180596733832e-05, + "loss": 0.002, + "num_input_tokens_seen": 26860288, + "step": 127275 + }, + { + "epoch": 14.002200220022003, + "grad_norm": 0.010245898738503456, + "learning_rate": 1.2492102457396903e-05, + "loss": 0.0051, + "num_input_tokens_seen": 26861312, + "step": 127280 + }, + { + "epoch": 14.002750275027502, + "grad_norm": 0.1115678995847702, + "learning_rate": 1.2490024433339265e-05, + "loss": 0.0368, + "num_input_tokens_seen": 26862400, + "step": 127285 + }, + { + "epoch": 14.003300330033003, + "grad_norm": 1.3046725988388062, + "learning_rate": 1.248794652458008e-05, + "loss": 0.1252, + "num_input_tokens_seen": 26863456, + "step": 127290 + }, + { + "epoch": 14.003850385038504, + "grad_norm": 0.023028414696455002, + "learning_rate": 1.2485868731138483e-05, + "loss": 0.1081, + "num_input_tokens_seen": 26864512, + "step": 127295 + }, + { + "epoch": 14.004400440044005, + "grad_norm": 0.013137212954461575, + "learning_rate": 1.2483791053033625e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26865600, + "step": 127300 + }, + { + "epoch": 14.004950495049505, + "grad_norm": 0.06138589233160019, + "learning_rate": 1.2481713490284655e-05, + "loss": 0.004, + "num_input_tokens_seen": 26866592, + "step": 127305 + }, + { + "epoch": 14.005500550055006, + "grad_norm": 2.0929148197174072, + "learning_rate": 1.2479636042910728e-05, + "loss": 0.0767, + "num_input_tokens_seen": 26867680, + "step": 127310 + }, + { + "epoch": 14.006050605060507, + "grad_norm": 0.04513144493103027, + "learning_rate": 1.2477558710930994e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26868736, + "step": 127315 + }, + { + "epoch": 14.006600660066006, + "grad_norm": 0.006824552081525326, + "learning_rate": 1.2475481494364597e-05, + "loss": 0.0024, + "num_input_tokens_seen": 26869792, + "step": 127320 + }, + { + "epoch": 14.007150715071507, + "grad_norm": 1.0711199045181274, + "learning_rate": 1.2473404393230664e-05, + "loss": 0.1199, + "num_input_tokens_seen": 26870944, + "step": 127325 + }, + { + "epoch": 14.007700770077008, + "grad_norm": 0.019699206575751305, + "learning_rate": 1.2471327407548362e-05, + "loss": 0.0481, + "num_input_tokens_seen": 26871968, + "step": 127330 + }, + { + "epoch": 14.008250825082508, + "grad_norm": 0.09839879721403122, + "learning_rate": 1.2469250537336813e-05, + "loss": 0.003, + "num_input_tokens_seen": 26872960, + "step": 127335 + }, + { + "epoch": 14.008800880088009, + "grad_norm": 0.03687338903546333, + "learning_rate": 1.2467173782615171e-05, + "loss": 0.0043, + "num_input_tokens_seen": 26873920, + "step": 127340 + }, + { + "epoch": 14.00935093509351, + "grad_norm": 1.320408582687378, + "learning_rate": 1.2465097143402582e-05, + "loss": 0.0964, + "num_input_tokens_seen": 26875040, + "step": 127345 + }, + { + "epoch": 14.009900990099009, + "grad_norm": 0.02611633948981762, + "learning_rate": 1.2463020619718172e-05, + "loss": 0.027, + "num_input_tokens_seen": 26876032, + "step": 127350 + }, + { + "epoch": 14.01045104510451, + "grad_norm": 1.6172817945480347, + "learning_rate": 1.2460944211581092e-05, + "loss": 0.0369, + "num_input_tokens_seen": 26876992, + "step": 127355 + }, + { + "epoch": 14.011001100110011, + "grad_norm": 0.009628145955502987, + "learning_rate": 1.2458867919010461e-05, + "loss": 0.0132, + "num_input_tokens_seen": 26877984, + "step": 127360 + }, + { + "epoch": 14.011551155115512, + "grad_norm": 0.041154589504003525, + "learning_rate": 1.2456791742025429e-05, + "loss": 0.0113, + "num_input_tokens_seen": 26879008, + "step": 127365 + }, + { + "epoch": 14.012101210121012, + "grad_norm": 0.05411781743168831, + "learning_rate": 1.2454715680645138e-05, + "loss": 0.0077, + "num_input_tokens_seen": 26880064, + "step": 127370 + }, + { + "epoch": 14.012651265126513, + "grad_norm": 0.06450781971216202, + "learning_rate": 1.2452639734888702e-05, + "loss": 0.0337, + "num_input_tokens_seen": 26881120, + "step": 127375 + }, + { + "epoch": 14.013201320132014, + "grad_norm": 1.254870057106018, + "learning_rate": 1.2450563904775276e-05, + "loss": 0.0523, + "num_input_tokens_seen": 26882176, + "step": 127380 + }, + { + "epoch": 14.013751375137513, + "grad_norm": 0.10307884961366653, + "learning_rate": 1.244848819032397e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26883200, + "step": 127385 + }, + { + "epoch": 14.014301430143014, + "grad_norm": 0.1691661924123764, + "learning_rate": 1.2446412591553935e-05, + "loss": 0.0313, + "num_input_tokens_seen": 26884256, + "step": 127390 + }, + { + "epoch": 14.014851485148515, + "grad_norm": 0.029464326798915863, + "learning_rate": 1.2444337108484283e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26885312, + "step": 127395 + }, + { + "epoch": 14.015401540154015, + "grad_norm": 0.07213851809501648, + "learning_rate": 1.244226174113415e-05, + "loss": 0.0395, + "num_input_tokens_seen": 26886336, + "step": 127400 + }, + { + "epoch": 14.015951595159516, + "grad_norm": 0.011420903727412224, + "learning_rate": 1.2440186489522676e-05, + "loss": 0.11, + "num_input_tokens_seen": 26887392, + "step": 127405 + }, + { + "epoch": 14.016501650165017, + "grad_norm": 0.0698750764131546, + "learning_rate": 1.2438111353668973e-05, + "loss": 0.0026, + "num_input_tokens_seen": 26888384, + "step": 127410 + }, + { + "epoch": 14.017051705170518, + "grad_norm": 1.6497031450271606, + "learning_rate": 1.2436036333592164e-05, + "loss": 0.0782, + "num_input_tokens_seen": 26889440, + "step": 127415 + }, + { + "epoch": 14.017601760176017, + "grad_norm": 0.029395362362265587, + "learning_rate": 1.2433961429311378e-05, + "loss": 0.044, + "num_input_tokens_seen": 26890496, + "step": 127420 + }, + { + "epoch": 14.018151815181518, + "grad_norm": 0.022522730752825737, + "learning_rate": 1.2431886640845738e-05, + "loss": 0.0921, + "num_input_tokens_seen": 26891520, + "step": 127425 + }, + { + "epoch": 14.01870187018702, + "grad_norm": 0.48902246356010437, + "learning_rate": 1.2429811968214377e-05, + "loss": 0.0068, + "num_input_tokens_seen": 26892544, + "step": 127430 + }, + { + "epoch": 14.019251925192519, + "grad_norm": 0.06887945532798767, + "learning_rate": 1.242773741143641e-05, + "loss": 0.0106, + "num_input_tokens_seen": 26893664, + "step": 127435 + }, + { + "epoch": 14.01980198019802, + "grad_norm": 1.9752227067947388, + "learning_rate": 1.2425662970530947e-05, + "loss": 0.0569, + "num_input_tokens_seen": 26894720, + "step": 127440 + }, + { + "epoch": 14.020352035203521, + "grad_norm": 0.09067824482917786, + "learning_rate": 1.242358864551712e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26895712, + "step": 127445 + }, + { + "epoch": 14.02090209020902, + "grad_norm": 0.04467726871371269, + "learning_rate": 1.2421514436414034e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26896736, + "step": 127450 + }, + { + "epoch": 14.021452145214521, + "grad_norm": 0.48359012603759766, + "learning_rate": 1.2419440343240813e-05, + "loss": 0.0229, + "num_input_tokens_seen": 26897792, + "step": 127455 + }, + { + "epoch": 14.022002200220022, + "grad_norm": 0.014929370954632759, + "learning_rate": 1.2417366366016583e-05, + "loss": 0.003, + "num_input_tokens_seen": 26898880, + "step": 127460 + }, + { + "epoch": 14.022552255225522, + "grad_norm": 0.03683706000447273, + "learning_rate": 1.241529250476044e-05, + "loss": 0.161, + "num_input_tokens_seen": 26899936, + "step": 127465 + }, + { + "epoch": 14.023102310231023, + "grad_norm": 0.44626104831695557, + "learning_rate": 1.2413218759491519e-05, + "loss": 0.1456, + "num_input_tokens_seen": 26900992, + "step": 127470 + }, + { + "epoch": 14.023652365236524, + "grad_norm": 0.14499585330486298, + "learning_rate": 1.2411145130228907e-05, + "loss": 0.0057, + "num_input_tokens_seen": 26902112, + "step": 127475 + }, + { + "epoch": 14.024202420242025, + "grad_norm": 1.7587823867797852, + "learning_rate": 1.2409071616991733e-05, + "loss": 0.074, + "num_input_tokens_seen": 26903168, + "step": 127480 + }, + { + "epoch": 14.024752475247524, + "grad_norm": 0.034086067229509354, + "learning_rate": 1.2406998219799113e-05, + "loss": 0.1068, + "num_input_tokens_seen": 26904224, + "step": 127485 + }, + { + "epoch": 14.025302530253025, + "grad_norm": 0.048761360347270966, + "learning_rate": 1.240492493867014e-05, + "loss": 0.0423, + "num_input_tokens_seen": 26905344, + "step": 127490 + }, + { + "epoch": 14.025852585258527, + "grad_norm": 0.03981197252869606, + "learning_rate": 1.2402851773623936e-05, + "loss": 0.0054, + "num_input_tokens_seen": 26906400, + "step": 127495 + }, + { + "epoch": 14.026402640264026, + "grad_norm": 0.007170111406594515, + "learning_rate": 1.2400778724679595e-05, + "loss": 0.0047, + "num_input_tokens_seen": 26907520, + "step": 127500 + }, + { + "epoch": 14.026952695269527, + "grad_norm": 0.025208137929439545, + "learning_rate": 1.2398705791856241e-05, + "loss": 0.0121, + "num_input_tokens_seen": 26908544, + "step": 127505 + }, + { + "epoch": 14.027502750275028, + "grad_norm": 0.012760418467223644, + "learning_rate": 1.2396632975172959e-05, + "loss": 0.0102, + "num_input_tokens_seen": 26909568, + "step": 127510 + }, + { + "epoch": 14.028052805280527, + "grad_norm": 0.00838131457567215, + "learning_rate": 1.2394560274648862e-05, + "loss": 0.0193, + "num_input_tokens_seen": 26910560, + "step": 127515 + }, + { + "epoch": 14.028602860286028, + "grad_norm": 0.01600298285484314, + "learning_rate": 1.2392487690303065e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26911552, + "step": 127520 + }, + { + "epoch": 14.02915291529153, + "grad_norm": 0.018792295828461647, + "learning_rate": 1.239041522215465e-05, + "loss": 0.0012, + "num_input_tokens_seen": 26912512, + "step": 127525 + }, + { + "epoch": 14.029702970297029, + "grad_norm": 0.4456583261489868, + "learning_rate": 1.2388342870222735e-05, + "loss": 0.0043, + "num_input_tokens_seen": 26913568, + "step": 127530 + }, + { + "epoch": 14.03025302530253, + "grad_norm": 0.037283528596162796, + "learning_rate": 1.2386270634526404e-05, + "loss": 0.1191, + "num_input_tokens_seen": 26914528, + "step": 127535 + }, + { + "epoch": 14.030803080308031, + "grad_norm": 0.007285637315362692, + "learning_rate": 1.2384198515084764e-05, + "loss": 0.0035, + "num_input_tokens_seen": 26915552, + "step": 127540 + }, + { + "epoch": 14.031353135313532, + "grad_norm": 0.04432940483093262, + "learning_rate": 1.2382126511916922e-05, + "loss": 0.0019, + "num_input_tokens_seen": 26916544, + "step": 127545 + }, + { + "epoch": 14.031903190319031, + "grad_norm": 0.05831887573003769, + "learning_rate": 1.2380054625041954e-05, + "loss": 0.0054, + "num_input_tokens_seen": 26917632, + "step": 127550 + }, + { + "epoch": 14.032453245324533, + "grad_norm": 0.00804541539400816, + "learning_rate": 1.2377982854478978e-05, + "loss": 0.0115, + "num_input_tokens_seen": 26918720, + "step": 127555 + }, + { + "epoch": 14.033003300330034, + "grad_norm": 3.0166637897491455, + "learning_rate": 1.2375911200247079e-05, + "loss": 0.0796, + "num_input_tokens_seen": 26919776, + "step": 127560 + }, + { + "epoch": 14.033553355335533, + "grad_norm": 0.013774573802947998, + "learning_rate": 1.2373839662365336e-05, + "loss": 0.0462, + "num_input_tokens_seen": 26920768, + "step": 127565 + }, + { + "epoch": 14.034103410341034, + "grad_norm": 1.0319393873214722, + "learning_rate": 1.2371768240852858e-05, + "loss": 0.007, + "num_input_tokens_seen": 26921856, + "step": 127570 + }, + { + "epoch": 14.034653465346535, + "grad_norm": 3.667281150817871, + "learning_rate": 1.2369696935728729e-05, + "loss": 0.0289, + "num_input_tokens_seen": 26922976, + "step": 127575 + }, + { + "epoch": 14.035203520352034, + "grad_norm": 1.418670415878296, + "learning_rate": 1.2367625747012054e-05, + "loss": 0.0449, + "num_input_tokens_seen": 26924032, + "step": 127580 + }, + { + "epoch": 14.035753575357536, + "grad_norm": 0.024120574817061424, + "learning_rate": 1.236555467472191e-05, + "loss": 0.0228, + "num_input_tokens_seen": 26925024, + "step": 127585 + }, + { + "epoch": 14.036303630363037, + "grad_norm": 0.431564062833786, + "learning_rate": 1.2363483718877377e-05, + "loss": 0.0065, + "num_input_tokens_seen": 26926048, + "step": 127590 + }, + { + "epoch": 14.036853685368538, + "grad_norm": 0.12783104181289673, + "learning_rate": 1.236141287949755e-05, + "loss": 0.0028, + "num_input_tokens_seen": 26927040, + "step": 127595 + }, + { + "epoch": 14.037403740374037, + "grad_norm": 1.764221429824829, + "learning_rate": 1.2359342156601517e-05, + "loss": 0.0814, + "num_input_tokens_seen": 26928128, + "step": 127600 + }, + { + "epoch": 14.037953795379538, + "grad_norm": 0.008861154317855835, + "learning_rate": 1.235727155020837e-05, + "loss": 0.1182, + "num_input_tokens_seen": 26929152, + "step": 127605 + }, + { + "epoch": 14.03850385038504, + "grad_norm": 0.038669392466545105, + "learning_rate": 1.2355201060337187e-05, + "loss": 0.0922, + "num_input_tokens_seen": 26930240, + "step": 127610 + }, + { + "epoch": 14.039053905390539, + "grad_norm": 0.9316443204879761, + "learning_rate": 1.2353130687007037e-05, + "loss": 0.0066, + "num_input_tokens_seen": 26931296, + "step": 127615 + }, + { + "epoch": 14.03960396039604, + "grad_norm": 0.07030661404132843, + "learning_rate": 1.2351060430237024e-05, + "loss": 0.0092, + "num_input_tokens_seen": 26932352, + "step": 127620 + }, + { + "epoch": 14.04015401540154, + "grad_norm": 1.1485315561294556, + "learning_rate": 1.2348990290046206e-05, + "loss": 0.0172, + "num_input_tokens_seen": 26933376, + "step": 127625 + }, + { + "epoch": 14.04070407040704, + "grad_norm": 1.3538744449615479, + "learning_rate": 1.2346920266453673e-05, + "loss": 0.0357, + "num_input_tokens_seen": 26934432, + "step": 127630 + }, + { + "epoch": 14.041254125412541, + "grad_norm": 0.08167186379432678, + "learning_rate": 1.2344850359478515e-05, + "loss": 0.1033, + "num_input_tokens_seen": 26935456, + "step": 127635 + }, + { + "epoch": 14.041804180418042, + "grad_norm": 0.37555792927742004, + "learning_rate": 1.2342780569139793e-05, + "loss": 0.0167, + "num_input_tokens_seen": 26936544, + "step": 127640 + }, + { + "epoch": 14.042354235423542, + "grad_norm": 0.05602937191724777, + "learning_rate": 1.2340710895456595e-05, + "loss": 0.0018, + "num_input_tokens_seen": 26937600, + "step": 127645 + }, + { + "epoch": 14.042904290429043, + "grad_norm": 0.43407735228538513, + "learning_rate": 1.233864133844799e-05, + "loss": 0.0031, + "num_input_tokens_seen": 26938624, + "step": 127650 + }, + { + "epoch": 14.043454345434544, + "grad_norm": 0.010372117161750793, + "learning_rate": 1.2336571898133034e-05, + "loss": 0.0105, + "num_input_tokens_seen": 26939616, + "step": 127655 + }, + { + "epoch": 14.044004400440045, + "grad_norm": 0.3102652132511139, + "learning_rate": 1.2334502574530838e-05, + "loss": 0.006, + "num_input_tokens_seen": 26940672, + "step": 127660 + }, + { + "epoch": 14.044554455445544, + "grad_norm": 0.03998725116252899, + "learning_rate": 1.2332433367660442e-05, + "loss": 0.008, + "num_input_tokens_seen": 26941728, + "step": 127665 + }, + { + "epoch": 14.045104510451045, + "grad_norm": 0.08530602604150772, + "learning_rate": 1.233036427754094e-05, + "loss": 0.005, + "num_input_tokens_seen": 26942784, + "step": 127670 + }, + { + "epoch": 14.045654565456546, + "grad_norm": 0.20739659667015076, + "learning_rate": 1.2328295304191393e-05, + "loss": 0.0076, + "num_input_tokens_seen": 26943840, + "step": 127675 + }, + { + "epoch": 14.046204620462046, + "grad_norm": 0.09817123413085938, + "learning_rate": 1.2326226447630856e-05, + "loss": 0.0328, + "num_input_tokens_seen": 26944896, + "step": 127680 + }, + { + "epoch": 14.046754675467547, + "grad_norm": 0.004814154468476772, + "learning_rate": 1.232415770787841e-05, + "loss": 0.01, + "num_input_tokens_seen": 26945984, + "step": 127685 + }, + { + "epoch": 14.047304730473048, + "grad_norm": 1.7960665225982666, + "learning_rate": 1.2322089084953117e-05, + "loss": 0.0406, + "num_input_tokens_seen": 26946976, + "step": 127690 + }, + { + "epoch": 14.047854785478547, + "grad_norm": 1.579721212387085, + "learning_rate": 1.2320020578874058e-05, + "loss": 0.073, + "num_input_tokens_seen": 26948000, + "step": 127695 + }, + { + "epoch": 14.048404840484048, + "grad_norm": 0.12035278975963593, + "learning_rate": 1.2317952189660284e-05, + "loss": 0.0034, + "num_input_tokens_seen": 26949024, + "step": 127700 + }, + { + "epoch": 14.04895489548955, + "grad_norm": 0.014613312669098377, + "learning_rate": 1.2315883917330848e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26950080, + "step": 127705 + }, + { + "epoch": 14.049504950495049, + "grad_norm": 0.004520548973232508, + "learning_rate": 1.2313815761904835e-05, + "loss": 0.002, + "num_input_tokens_seen": 26951136, + "step": 127710 + }, + { + "epoch": 14.05005500550055, + "grad_norm": 0.8607008457183838, + "learning_rate": 1.2311747723401277e-05, + "loss": 0.0495, + "num_input_tokens_seen": 26952256, + "step": 127715 + }, + { + "epoch": 14.05060506050605, + "grad_norm": 1.1567953824996948, + "learning_rate": 1.2309679801839266e-05, + "loss": 0.0115, + "num_input_tokens_seen": 26953312, + "step": 127720 + }, + { + "epoch": 14.051155115511552, + "grad_norm": 0.06268125027418137, + "learning_rate": 1.230761199723785e-05, + "loss": 0.0143, + "num_input_tokens_seen": 26954368, + "step": 127725 + }, + { + "epoch": 14.051705170517051, + "grad_norm": 0.05909866467118263, + "learning_rate": 1.2305544309616077e-05, + "loss": 0.005, + "num_input_tokens_seen": 26955424, + "step": 127730 + }, + { + "epoch": 14.052255225522552, + "grad_norm": 0.073470838367939, + "learning_rate": 1.2303476738993017e-05, + "loss": 0.0109, + "num_input_tokens_seen": 26956448, + "step": 127735 + }, + { + "epoch": 14.052805280528053, + "grad_norm": 0.11239027976989746, + "learning_rate": 1.2301409285387711e-05, + "loss": 0.0997, + "num_input_tokens_seen": 26957504, + "step": 127740 + }, + { + "epoch": 14.053355335533553, + "grad_norm": 0.0731443464756012, + "learning_rate": 1.2299341948819221e-05, + "loss": 0.0043, + "num_input_tokens_seen": 26958528, + "step": 127745 + }, + { + "epoch": 14.053905390539054, + "grad_norm": 0.015688525512814522, + "learning_rate": 1.229727472930661e-05, + "loss": 0.0022, + "num_input_tokens_seen": 26959616, + "step": 127750 + }, + { + "epoch": 14.054455445544555, + "grad_norm": 1.1702247858047485, + "learning_rate": 1.2295207626868916e-05, + "loss": 0.0276, + "num_input_tokens_seen": 26960704, + "step": 127755 + }, + { + "epoch": 14.055005500550054, + "grad_norm": 2.2394351959228516, + "learning_rate": 1.2293140641525206e-05, + "loss": 0.0554, + "num_input_tokens_seen": 26961792, + "step": 127760 + }, + { + "epoch": 14.055555555555555, + "grad_norm": 0.25341296195983887, + "learning_rate": 1.2291073773294512e-05, + "loss": 0.0079, + "num_input_tokens_seen": 26962848, + "step": 127765 + }, + { + "epoch": 14.056105610561056, + "grad_norm": 0.021608680486679077, + "learning_rate": 1.22890070221959e-05, + "loss": 0.0021, + "num_input_tokens_seen": 26963840, + "step": 127770 + }, + { + "epoch": 14.056655665566556, + "grad_norm": 0.11885429173707962, + "learning_rate": 1.2286940388248402e-05, + "loss": 0.003, + "num_input_tokens_seen": 26964832, + "step": 127775 + }, + { + "epoch": 14.057205720572057, + "grad_norm": 0.0072768148966133595, + "learning_rate": 1.2284873871471075e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26965888, + "step": 127780 + }, + { + "epoch": 14.057755775577558, + "grad_norm": 0.03677256405353546, + "learning_rate": 1.2282807471882976e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26966944, + "step": 127785 + }, + { + "epoch": 14.058305830583059, + "grad_norm": 0.04648115858435631, + "learning_rate": 1.2280741189503125e-05, + "loss": 0.0537, + "num_input_tokens_seen": 26968000, + "step": 127790 + }, + { + "epoch": 14.058855885588558, + "grad_norm": 0.2583465576171875, + "learning_rate": 1.2278675024350591e-05, + "loss": 0.0109, + "num_input_tokens_seen": 26969056, + "step": 127795 + }, + { + "epoch": 14.05940594059406, + "grad_norm": 0.5311135649681091, + "learning_rate": 1.2276608976444395e-05, + "loss": 0.0084, + "num_input_tokens_seen": 26970112, + "step": 127800 + }, + { + "epoch": 14.05995599559956, + "grad_norm": 0.0041960133239626884, + "learning_rate": 1.2274543045803588e-05, + "loss": 0.0016, + "num_input_tokens_seen": 26971136, + "step": 127805 + }, + { + "epoch": 14.06050605060506, + "grad_norm": 3.1681363582611084, + "learning_rate": 1.2272477232447222e-05, + "loss": 0.2352, + "num_input_tokens_seen": 26972160, + "step": 127810 + }, + { + "epoch": 14.061056105610561, + "grad_norm": 2.718607187271118, + "learning_rate": 1.2270411536394324e-05, + "loss": 0.1142, + "num_input_tokens_seen": 26973248, + "step": 127815 + }, + { + "epoch": 14.061606160616062, + "grad_norm": 0.010897264815866947, + "learning_rate": 1.2268345957663926e-05, + "loss": 0.0011, + "num_input_tokens_seen": 26974304, + "step": 127820 + }, + { + "epoch": 14.062156215621561, + "grad_norm": 0.5868134498596191, + "learning_rate": 1.2266280496275084e-05, + "loss": 0.0117, + "num_input_tokens_seen": 26975264, + "step": 127825 + }, + { + "epoch": 14.062706270627062, + "grad_norm": 0.04087473452091217, + "learning_rate": 1.2264215152246816e-05, + "loss": 0.0168, + "num_input_tokens_seen": 26976288, + "step": 127830 + }, + { + "epoch": 14.063256325632564, + "grad_norm": 1.8420871496200562, + "learning_rate": 1.2262149925598165e-05, + "loss": 0.0893, + "num_input_tokens_seen": 26977408, + "step": 127835 + }, + { + "epoch": 14.063806380638065, + "grad_norm": 3.7239151000976562, + "learning_rate": 1.2260084816348177e-05, + "loss": 0.0266, + "num_input_tokens_seen": 26978464, + "step": 127840 + }, + { + "epoch": 14.064356435643564, + "grad_norm": 0.009683397598564625, + "learning_rate": 1.2258019824515863e-05, + "loss": 0.0164, + "num_input_tokens_seen": 26979488, + "step": 127845 + }, + { + "epoch": 14.064906490649065, + "grad_norm": 0.2426908314228058, + "learning_rate": 1.2255954950120274e-05, + "loss": 0.0053, + "num_input_tokens_seen": 26980544, + "step": 127850 + }, + { + "epoch": 14.065456545654566, + "grad_norm": 0.024181896820664406, + "learning_rate": 1.2253890193180425e-05, + "loss": 0.0032, + "num_input_tokens_seen": 26981664, + "step": 127855 + }, + { + "epoch": 14.066006600660065, + "grad_norm": 0.11640658974647522, + "learning_rate": 1.2251825553715355e-05, + "loss": 0.0062, + "num_input_tokens_seen": 26982720, + "step": 127860 + }, + { + "epoch": 14.066556655665567, + "grad_norm": 0.06335534900426865, + "learning_rate": 1.22497610317441e-05, + "loss": 0.0409, + "num_input_tokens_seen": 26983776, + "step": 127865 + }, + { + "epoch": 14.067106710671068, + "grad_norm": 0.018143663182854652, + "learning_rate": 1.2247696627285669e-05, + "loss": 0.0027, + "num_input_tokens_seen": 26984832, + "step": 127870 + }, + { + "epoch": 14.067656765676567, + "grad_norm": 0.02414637990295887, + "learning_rate": 1.2245632340359108e-05, + "loss": 0.0059, + "num_input_tokens_seen": 26985856, + "step": 127875 + }, + { + "epoch": 14.068206820682068, + "grad_norm": 0.0425051674246788, + "learning_rate": 1.2243568170983428e-05, + "loss": 0.0048, + "num_input_tokens_seen": 26986976, + "step": 127880 + }, + { + "epoch": 14.06875687568757, + "grad_norm": 0.007340725976973772, + "learning_rate": 1.2241504119177665e-05, + "loss": 0.0366, + "num_input_tokens_seen": 26988000, + "step": 127885 + }, + { + "epoch": 14.069306930693068, + "grad_norm": 0.016246020793914795, + "learning_rate": 1.223944018496083e-05, + "loss": 0.0121, + "num_input_tokens_seen": 26989056, + "step": 127890 + }, + { + "epoch": 14.06985698569857, + "grad_norm": 0.004011169075965881, + "learning_rate": 1.2237376368351949e-05, + "loss": 0.0013, + "num_input_tokens_seen": 26990144, + "step": 127895 + }, + { + "epoch": 14.07040704070407, + "grad_norm": 0.09560481458902359, + "learning_rate": 1.2235312669370056e-05, + "loss": 0.0034, + "num_input_tokens_seen": 26991200, + "step": 127900 + }, + { + "epoch": 14.070957095709572, + "grad_norm": 3.014329671859741, + "learning_rate": 1.2233249088034151e-05, + "loss": 0.0381, + "num_input_tokens_seen": 26992224, + "step": 127905 + }, + { + "epoch": 14.071507150715071, + "grad_norm": 0.17197133600711823, + "learning_rate": 1.2231185624363272e-05, + "loss": 0.0253, + "num_input_tokens_seen": 26993184, + "step": 127910 + }, + { + "epoch": 14.072057205720572, + "grad_norm": 0.34171921014785767, + "learning_rate": 1.222912227837642e-05, + "loss": 0.2294, + "num_input_tokens_seen": 26994208, + "step": 127915 + }, + { + "epoch": 14.072607260726073, + "grad_norm": 0.5100273489952087, + "learning_rate": 1.222705905009262e-05, + "loss": 0.0099, + "num_input_tokens_seen": 26995328, + "step": 127920 + }, + { + "epoch": 14.073157315731573, + "grad_norm": 0.04256803169846535, + "learning_rate": 1.2224995939530897e-05, + "loss": 0.0243, + "num_input_tokens_seen": 26996384, + "step": 127925 + }, + { + "epoch": 14.073707370737074, + "grad_norm": 1.0474188327789307, + "learning_rate": 1.2222932946710247e-05, + "loss": 0.0073, + "num_input_tokens_seen": 26997472, + "step": 127930 + }, + { + "epoch": 14.074257425742575, + "grad_norm": 0.010887492448091507, + "learning_rate": 1.2220870071649704e-05, + "loss": 0.004, + "num_input_tokens_seen": 26998464, + "step": 127935 + }, + { + "epoch": 14.074807480748074, + "grad_norm": 0.025941194966435432, + "learning_rate": 1.2218807314368267e-05, + "loss": 0.0012, + "num_input_tokens_seen": 26999520, + "step": 127940 + }, + { + "epoch": 14.075357535753575, + "grad_norm": 0.024748319759964943, + "learning_rate": 1.2216744674884942e-05, + "loss": 0.0047, + "num_input_tokens_seen": 27000608, + "step": 127945 + }, + { + "epoch": 14.075907590759076, + "grad_norm": 0.03203056380152702, + "learning_rate": 1.2214682153218745e-05, + "loss": 0.0078, + "num_input_tokens_seen": 27001696, + "step": 127950 + }, + { + "epoch": 14.076457645764576, + "grad_norm": 0.04722007364034653, + "learning_rate": 1.2212619749388691e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27002752, + "step": 127955 + }, + { + "epoch": 14.077007700770077, + "grad_norm": 0.02736014872789383, + "learning_rate": 1.221055746341379e-05, + "loss": 0.088, + "num_input_tokens_seen": 27003776, + "step": 127960 + }, + { + "epoch": 14.077557755775578, + "grad_norm": 0.034306660294532776, + "learning_rate": 1.2208495295313046e-05, + "loss": 0.0254, + "num_input_tokens_seen": 27004864, + "step": 127965 + }, + { + "epoch": 14.078107810781079, + "grad_norm": 0.06704957038164139, + "learning_rate": 1.2206433245105456e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27005920, + "step": 127970 + }, + { + "epoch": 14.078657865786578, + "grad_norm": 2.0215179920196533, + "learning_rate": 1.2204371312810028e-05, + "loss": 0.0676, + "num_input_tokens_seen": 27007008, + "step": 127975 + }, + { + "epoch": 14.07920792079208, + "grad_norm": 0.012172900140285492, + "learning_rate": 1.2202309498445772e-05, + "loss": 0.0107, + "num_input_tokens_seen": 27008096, + "step": 127980 + }, + { + "epoch": 14.07975797579758, + "grad_norm": 2.816376209259033, + "learning_rate": 1.2200247802031697e-05, + "loss": 0.0384, + "num_input_tokens_seen": 27009120, + "step": 127985 + }, + { + "epoch": 14.08030803080308, + "grad_norm": 0.036294810473918915, + "learning_rate": 1.2198186223586797e-05, + "loss": 0.0123, + "num_input_tokens_seen": 27010240, + "step": 127990 + }, + { + "epoch": 14.08085808580858, + "grad_norm": 0.008544456213712692, + "learning_rate": 1.2196124763130062e-05, + "loss": 0.0326, + "num_input_tokens_seen": 27011328, + "step": 127995 + }, + { + "epoch": 14.081408140814082, + "grad_norm": 0.0655752420425415, + "learning_rate": 1.219406342068051e-05, + "loss": 0.0022, + "num_input_tokens_seen": 27012320, + "step": 128000 + }, + { + "epoch": 14.081958195819581, + "grad_norm": 0.01744694449007511, + "learning_rate": 1.219200219625712e-05, + "loss": 0.039, + "num_input_tokens_seen": 27013408, + "step": 128005 + }, + { + "epoch": 14.082508250825082, + "grad_norm": 0.061307020485401154, + "learning_rate": 1.21899410898789e-05, + "loss": 0.0322, + "num_input_tokens_seen": 27014400, + "step": 128010 + }, + { + "epoch": 14.083058305830583, + "grad_norm": 0.034103136509656906, + "learning_rate": 1.2187880101564859e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27015424, + "step": 128015 + }, + { + "epoch": 14.083608360836084, + "grad_norm": 0.01036479789763689, + "learning_rate": 1.2185819231333968e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27016512, + "step": 128020 + }, + { + "epoch": 14.084158415841584, + "grad_norm": 0.002529238350689411, + "learning_rate": 1.218375847920524e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27017600, + "step": 128025 + }, + { + "epoch": 14.084708470847085, + "grad_norm": 0.33735546469688416, + "learning_rate": 1.2181697845197651e-05, + "loss": 0.0058, + "num_input_tokens_seen": 27018592, + "step": 128030 + }, + { + "epoch": 14.085258525852586, + "grad_norm": 0.6207086443901062, + "learning_rate": 1.2179637329330204e-05, + "loss": 0.0065, + "num_input_tokens_seen": 27019648, + "step": 128035 + }, + { + "epoch": 14.085808580858085, + "grad_norm": 0.02200869284570217, + "learning_rate": 1.2177576931621895e-05, + "loss": 0.0306, + "num_input_tokens_seen": 27020640, + "step": 128040 + }, + { + "epoch": 14.086358635863586, + "grad_norm": 0.033853646367788315, + "learning_rate": 1.21755166520917e-05, + "loss": 0.0031, + "num_input_tokens_seen": 27021600, + "step": 128045 + }, + { + "epoch": 14.086908690869087, + "grad_norm": 0.06027887389063835, + "learning_rate": 1.2173456490758622e-05, + "loss": 0.0009, + "num_input_tokens_seen": 27022688, + "step": 128050 + }, + { + "epoch": 14.087458745874587, + "grad_norm": 0.029194951057434082, + "learning_rate": 1.2171396447641641e-05, + "loss": 0.007, + "num_input_tokens_seen": 27023712, + "step": 128055 + }, + { + "epoch": 14.088008800880088, + "grad_norm": 0.007003103848546743, + "learning_rate": 1.2169336522759733e-05, + "loss": 0.1096, + "num_input_tokens_seen": 27024832, + "step": 128060 + }, + { + "epoch": 14.088558855885589, + "grad_norm": 0.007966465316712856, + "learning_rate": 1.2167276716131895e-05, + "loss": 0.0941, + "num_input_tokens_seen": 27025952, + "step": 128065 + }, + { + "epoch": 14.089108910891088, + "grad_norm": 0.013124370016157627, + "learning_rate": 1.2165217027777109e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27027008, + "step": 128070 + }, + { + "epoch": 14.08965896589659, + "grad_norm": 0.05429534241557121, + "learning_rate": 1.216315745771437e-05, + "loss": 0.0019, + "num_input_tokens_seen": 27028000, + "step": 128075 + }, + { + "epoch": 14.09020902090209, + "grad_norm": 0.2781766951084137, + "learning_rate": 1.2161098005962648e-05, + "loss": 0.0416, + "num_input_tokens_seen": 27029024, + "step": 128080 + }, + { + "epoch": 14.090759075907592, + "grad_norm": 0.02773219160735607, + "learning_rate": 1.2159038672540915e-05, + "loss": 0.002, + "num_input_tokens_seen": 27030080, + "step": 128085 + }, + { + "epoch": 14.091309130913091, + "grad_norm": 0.12961935997009277, + "learning_rate": 1.215697945746816e-05, + "loss": 0.0235, + "num_input_tokens_seen": 27031136, + "step": 128090 + }, + { + "epoch": 14.091859185918592, + "grad_norm": 1.4079947471618652, + "learning_rate": 1.2154920360763364e-05, + "loss": 0.0983, + "num_input_tokens_seen": 27032256, + "step": 128095 + }, + { + "epoch": 14.092409240924093, + "grad_norm": 0.15083016455173492, + "learning_rate": 1.2152861382445513e-05, + "loss": 0.035, + "num_input_tokens_seen": 27033248, + "step": 128100 + }, + { + "epoch": 14.092959295929592, + "grad_norm": 3.714601993560791, + "learning_rate": 1.2150802522533573e-05, + "loss": 0.0321, + "num_input_tokens_seen": 27034336, + "step": 128105 + }, + { + "epoch": 14.093509350935093, + "grad_norm": 0.04822995141148567, + "learning_rate": 1.2148743781046513e-05, + "loss": 0.0698, + "num_input_tokens_seen": 27035360, + "step": 128110 + }, + { + "epoch": 14.094059405940595, + "grad_norm": 0.9248501062393188, + "learning_rate": 1.2146685158003321e-05, + "loss": 0.0057, + "num_input_tokens_seen": 27036352, + "step": 128115 + }, + { + "epoch": 14.094609460946094, + "grad_norm": 0.037489455193281174, + "learning_rate": 1.2144626653422955e-05, + "loss": 0.2122, + "num_input_tokens_seen": 27037408, + "step": 128120 + }, + { + "epoch": 14.095159515951595, + "grad_norm": 0.11699977517127991, + "learning_rate": 1.21425682673244e-05, + "loss": 0.0034, + "num_input_tokens_seen": 27038464, + "step": 128125 + }, + { + "epoch": 14.095709570957096, + "grad_norm": 0.5193140506744385, + "learning_rate": 1.2140509999726633e-05, + "loss": 0.0069, + "num_input_tokens_seen": 27039552, + "step": 128130 + }, + { + "epoch": 14.096259625962595, + "grad_norm": 0.931421160697937, + "learning_rate": 1.2138451850648602e-05, + "loss": 0.0155, + "num_input_tokens_seen": 27040608, + "step": 128135 + }, + { + "epoch": 14.096809680968097, + "grad_norm": 0.033540740609169006, + "learning_rate": 1.2136393820109299e-05, + "loss": 0.0425, + "num_input_tokens_seen": 27041632, + "step": 128140 + }, + { + "epoch": 14.097359735973598, + "grad_norm": 0.04169609397649765, + "learning_rate": 1.2134335908127673e-05, + "loss": 0.0118, + "num_input_tokens_seen": 27042656, + "step": 128145 + }, + { + "epoch": 14.097909790979099, + "grad_norm": 1.6635044813156128, + "learning_rate": 1.21322781147227e-05, + "loss": 0.0621, + "num_input_tokens_seen": 27043744, + "step": 128150 + }, + { + "epoch": 14.098459845984598, + "grad_norm": 0.054034166038036346, + "learning_rate": 1.2130220439913354e-05, + "loss": 0.0083, + "num_input_tokens_seen": 27044800, + "step": 128155 + }, + { + "epoch": 14.099009900990099, + "grad_norm": 5.18467378616333, + "learning_rate": 1.212816288371858e-05, + "loss": 0.1267, + "num_input_tokens_seen": 27045856, + "step": 128160 + }, + { + "epoch": 14.0995599559956, + "grad_norm": 0.014526857994496822, + "learning_rate": 1.2126105446157362e-05, + "loss": 0.0105, + "num_input_tokens_seen": 27046880, + "step": 128165 + }, + { + "epoch": 14.1001100110011, + "grad_norm": 0.019254164770245552, + "learning_rate": 1.2124048127248644e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27047936, + "step": 128170 + }, + { + "epoch": 14.1006600660066, + "grad_norm": 0.0335719920694828, + "learning_rate": 1.2121990927011407e-05, + "loss": 0.0019, + "num_input_tokens_seen": 27049024, + "step": 128175 + }, + { + "epoch": 14.101210121012102, + "grad_norm": 0.006833972875028849, + "learning_rate": 1.2119933845464587e-05, + "loss": 0.0007, + "num_input_tokens_seen": 27050080, + "step": 128180 + }, + { + "epoch": 14.101760176017601, + "grad_norm": 0.12678007781505585, + "learning_rate": 1.2117876882627161e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27051104, + "step": 128185 + }, + { + "epoch": 14.102310231023102, + "grad_norm": 0.011334393173456192, + "learning_rate": 1.2115820038518088e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27052192, + "step": 128190 + }, + { + "epoch": 14.102860286028603, + "grad_norm": 0.043876852840185165, + "learning_rate": 1.211376331315632e-05, + "loss": 0.0031, + "num_input_tokens_seen": 27053248, + "step": 128195 + }, + { + "epoch": 14.103410341034103, + "grad_norm": 0.0181033443659544, + "learning_rate": 1.2111706706560803e-05, + "loss": 0.0198, + "num_input_tokens_seen": 27054368, + "step": 128200 + }, + { + "epoch": 14.103960396039604, + "grad_norm": 0.011583533138036728, + "learning_rate": 1.21096502187505e-05, + "loss": 0.0092, + "num_input_tokens_seen": 27055424, + "step": 128205 + }, + { + "epoch": 14.104510451045105, + "grad_norm": 0.28851303458213806, + "learning_rate": 1.2107593849744367e-05, + "loss": 0.0039, + "num_input_tokens_seen": 27056512, + "step": 128210 + }, + { + "epoch": 14.105060506050606, + "grad_norm": 0.011894975788891315, + "learning_rate": 1.2105537599561364e-05, + "loss": 0.0035, + "num_input_tokens_seen": 27057536, + "step": 128215 + }, + { + "epoch": 14.105610561056105, + "grad_norm": 0.42678654193878174, + "learning_rate": 1.2103481468220435e-05, + "loss": 0.0077, + "num_input_tokens_seen": 27058624, + "step": 128220 + }, + { + "epoch": 14.106160616061606, + "grad_norm": 0.15995623171329498, + "learning_rate": 1.2101425455740516e-05, + "loss": 0.0104, + "num_input_tokens_seen": 27059648, + "step": 128225 + }, + { + "epoch": 14.106710671067107, + "grad_norm": 0.04352259263396263, + "learning_rate": 1.2099369562140581e-05, + "loss": 0.0114, + "num_input_tokens_seen": 27060768, + "step": 128230 + }, + { + "epoch": 14.107260726072607, + "grad_norm": 0.007339013274759054, + "learning_rate": 1.2097313787439557e-05, + "loss": 0.0195, + "num_input_tokens_seen": 27061792, + "step": 128235 + }, + { + "epoch": 14.107810781078108, + "grad_norm": 1.5763968229293823, + "learning_rate": 1.20952581316564e-05, + "loss": 0.0219, + "num_input_tokens_seen": 27062848, + "step": 128240 + }, + { + "epoch": 14.108360836083609, + "grad_norm": 0.6144962906837463, + "learning_rate": 1.2093202594810066e-05, + "loss": 0.061, + "num_input_tokens_seen": 27063904, + "step": 128245 + }, + { + "epoch": 14.108910891089108, + "grad_norm": 0.4634261727333069, + "learning_rate": 1.2091147176919482e-05, + "loss": 0.0133, + "num_input_tokens_seen": 27065088, + "step": 128250 + }, + { + "epoch": 14.10946094609461, + "grad_norm": 0.032753556966781616, + "learning_rate": 1.2089091878003609e-05, + "loss": 0.0733, + "num_input_tokens_seen": 27066144, + "step": 128255 + }, + { + "epoch": 14.11001100110011, + "grad_norm": 2.5249640941619873, + "learning_rate": 1.208703669808137e-05, + "loss": 0.0353, + "num_input_tokens_seen": 27067168, + "step": 128260 + }, + { + "epoch": 14.110561056105611, + "grad_norm": 0.016420258209109306, + "learning_rate": 1.2084981637171719e-05, + "loss": 0.0512, + "num_input_tokens_seen": 27068256, + "step": 128265 + }, + { + "epoch": 14.11111111111111, + "grad_norm": 0.019100219011306763, + "learning_rate": 1.2082926695293605e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27069312, + "step": 128270 + }, + { + "epoch": 14.111661166116612, + "grad_norm": 0.0024901984725147486, + "learning_rate": 1.2080871872465945e-05, + "loss": 0.06, + "num_input_tokens_seen": 27070304, + "step": 128275 + }, + { + "epoch": 14.112211221122113, + "grad_norm": 0.019049670547246933, + "learning_rate": 1.2078817168707704e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27071360, + "step": 128280 + }, + { + "epoch": 14.112761276127612, + "grad_norm": 0.2590100169181824, + "learning_rate": 1.2076762584037793e-05, + "loss": 0.0088, + "num_input_tokens_seen": 27072448, + "step": 128285 + }, + { + "epoch": 14.113311331133113, + "grad_norm": 0.006732570938766003, + "learning_rate": 1.207470811847517e-05, + "loss": 0.0007, + "num_input_tokens_seen": 27073536, + "step": 128290 + }, + { + "epoch": 14.113861386138614, + "grad_norm": 0.030772481113672256, + "learning_rate": 1.207265377203875e-05, + "loss": 0.0243, + "num_input_tokens_seen": 27074624, + "step": 128295 + }, + { + "epoch": 14.114411441144114, + "grad_norm": 0.018993502482771873, + "learning_rate": 1.2070599544747476e-05, + "loss": 0.055, + "num_input_tokens_seen": 27075616, + "step": 128300 + }, + { + "epoch": 14.114961496149615, + "grad_norm": 0.007212578784674406, + "learning_rate": 1.2068545436620293e-05, + "loss": 0.001, + "num_input_tokens_seen": 27076640, + "step": 128305 + }, + { + "epoch": 14.115511551155116, + "grad_norm": 1.2180957794189453, + "learning_rate": 1.2066491447676112e-05, + "loss": 0.006, + "num_input_tokens_seen": 27077632, + "step": 128310 + }, + { + "epoch": 14.116061606160615, + "grad_norm": 0.14079070091247559, + "learning_rate": 1.2064437577933882e-05, + "loss": 0.1007, + "num_input_tokens_seen": 27078720, + "step": 128315 + }, + { + "epoch": 14.116611661166116, + "grad_norm": 0.23880375921726227, + "learning_rate": 1.2062383827412513e-05, + "loss": 0.02, + "num_input_tokens_seen": 27079808, + "step": 128320 + }, + { + "epoch": 14.117161716171617, + "grad_norm": 0.011686422862112522, + "learning_rate": 1.2060330196130947e-05, + "loss": 0.0077, + "num_input_tokens_seen": 27080864, + "step": 128325 + }, + { + "epoch": 14.117711771177119, + "grad_norm": 0.012680401094257832, + "learning_rate": 1.2058276684108117e-05, + "loss": 0.006, + "num_input_tokens_seen": 27081952, + "step": 128330 + }, + { + "epoch": 14.118261826182618, + "grad_norm": 0.007085134275257587, + "learning_rate": 1.2056223291362931e-05, + "loss": 0.0109, + "num_input_tokens_seen": 27082976, + "step": 128335 + }, + { + "epoch": 14.118811881188119, + "grad_norm": 0.02268906868994236, + "learning_rate": 1.2054170017914332e-05, + "loss": 0.0014, + "num_input_tokens_seen": 27084032, + "step": 128340 + }, + { + "epoch": 14.11936193619362, + "grad_norm": 0.007026880048215389, + "learning_rate": 1.2052116863781238e-05, + "loss": 0.0154, + "num_input_tokens_seen": 27085120, + "step": 128345 + }, + { + "epoch": 14.11991199119912, + "grad_norm": 2.2310400009155273, + "learning_rate": 1.205006382898256e-05, + "loss": 0.1888, + "num_input_tokens_seen": 27086208, + "step": 128350 + }, + { + "epoch": 14.12046204620462, + "grad_norm": 0.29433178901672363, + "learning_rate": 1.204801091353723e-05, + "loss": 0.0576, + "num_input_tokens_seen": 27087232, + "step": 128355 + }, + { + "epoch": 14.121012101210122, + "grad_norm": 0.4599769115447998, + "learning_rate": 1.2045958117464167e-05, + "loss": 0.0112, + "num_input_tokens_seen": 27088256, + "step": 128360 + }, + { + "epoch": 14.12156215621562, + "grad_norm": 1.9013476371765137, + "learning_rate": 1.2043905440782302e-05, + "loss": 0.0196, + "num_input_tokens_seen": 27089280, + "step": 128365 + }, + { + "epoch": 14.122112211221122, + "grad_norm": 0.03746198117733002, + "learning_rate": 1.2041852883510546e-05, + "loss": 0.001, + "num_input_tokens_seen": 27090368, + "step": 128370 + }, + { + "epoch": 14.122662266226623, + "grad_norm": 0.005896704737097025, + "learning_rate": 1.2039800445667801e-05, + "loss": 0.001, + "num_input_tokens_seen": 27091360, + "step": 128375 + }, + { + "epoch": 14.123212321232122, + "grad_norm": 0.025105973705649376, + "learning_rate": 1.2037748127272998e-05, + "loss": 0.0011, + "num_input_tokens_seen": 27092416, + "step": 128380 + }, + { + "epoch": 14.123762376237623, + "grad_norm": 0.3536946773529053, + "learning_rate": 1.2035695928345061e-05, + "loss": 0.0051, + "num_input_tokens_seen": 27093472, + "step": 128385 + }, + { + "epoch": 14.124312431243125, + "grad_norm": 0.5264992117881775, + "learning_rate": 1.2033643848902885e-05, + "loss": 0.0112, + "num_input_tokens_seen": 27094592, + "step": 128390 + }, + { + "epoch": 14.124862486248626, + "grad_norm": 0.009145300835371017, + "learning_rate": 1.2031591888965399e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27095616, + "step": 128395 + }, + { + "epoch": 14.125412541254125, + "grad_norm": 0.009253084659576416, + "learning_rate": 1.20295400485515e-05, + "loss": 0.0909, + "num_input_tokens_seen": 27096704, + "step": 128400 + }, + { + "epoch": 14.125962596259626, + "grad_norm": 0.0479777529835701, + "learning_rate": 1.2027488327680112e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27097792, + "step": 128405 + }, + { + "epoch": 14.126512651265127, + "grad_norm": 0.012553063221275806, + "learning_rate": 1.2025436726370134e-05, + "loss": 0.0487, + "num_input_tokens_seen": 27098816, + "step": 128410 + }, + { + "epoch": 14.127062706270626, + "grad_norm": 0.0331365130841732, + "learning_rate": 1.2023385244640477e-05, + "loss": 0.0125, + "num_input_tokens_seen": 27099872, + "step": 128415 + }, + { + "epoch": 14.127612761276128, + "grad_norm": 0.02433476597070694, + "learning_rate": 1.2021333882510063e-05, + "loss": 0.0429, + "num_input_tokens_seen": 27100896, + "step": 128420 + }, + { + "epoch": 14.128162816281629, + "grad_norm": 0.05379285290837288, + "learning_rate": 1.2019282639997777e-05, + "loss": 0.01, + "num_input_tokens_seen": 27102016, + "step": 128425 + }, + { + "epoch": 14.128712871287128, + "grad_norm": 0.946023166179657, + "learning_rate": 1.2017231517122543e-05, + "loss": 0.1345, + "num_input_tokens_seen": 27103072, + "step": 128430 + }, + { + "epoch": 14.129262926292629, + "grad_norm": 0.046211127191782, + "learning_rate": 1.2015180513903256e-05, + "loss": 0.0501, + "num_input_tokens_seen": 27104160, + "step": 128435 + }, + { + "epoch": 14.12981298129813, + "grad_norm": 0.0026977520901709795, + "learning_rate": 1.2013129630358802e-05, + "loss": 0.0014, + "num_input_tokens_seen": 27105184, + "step": 128440 + }, + { + "epoch": 14.130363036303631, + "grad_norm": 0.01793019287288189, + "learning_rate": 1.201107886650812e-05, + "loss": 0.0783, + "num_input_tokens_seen": 27106272, + "step": 128445 + }, + { + "epoch": 14.13091309130913, + "grad_norm": 0.021864090114831924, + "learning_rate": 1.200902822237008e-05, + "loss": 0.0262, + "num_input_tokens_seen": 27107360, + "step": 128450 + }, + { + "epoch": 14.131463146314632, + "grad_norm": 3.0899691581726074, + "learning_rate": 1.2006977697963603e-05, + "loss": 0.1153, + "num_input_tokens_seen": 27108480, + "step": 128455 + }, + { + "epoch": 14.132013201320133, + "grad_norm": 2.499786138534546, + "learning_rate": 1.200492729330758e-05, + "loss": 0.1316, + "num_input_tokens_seen": 27109472, + "step": 128460 + }, + { + "epoch": 14.132563256325632, + "grad_norm": 0.24599309265613556, + "learning_rate": 1.2002877008420895e-05, + "loss": 0.0071, + "num_input_tokens_seen": 27110528, + "step": 128465 + }, + { + "epoch": 14.133113311331133, + "grad_norm": 0.012604444287717342, + "learning_rate": 1.2000826843322458e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27111552, + "step": 128470 + }, + { + "epoch": 14.133663366336634, + "grad_norm": 0.385189026594162, + "learning_rate": 1.1998776798031158e-05, + "loss": 0.0206, + "num_input_tokens_seen": 27112544, + "step": 128475 + }, + { + "epoch": 14.134213421342134, + "grad_norm": 0.029649006202816963, + "learning_rate": 1.1996726872565906e-05, + "loss": 0.0189, + "num_input_tokens_seen": 27113632, + "step": 128480 + }, + { + "epoch": 14.134763476347635, + "grad_norm": 0.07218965142965317, + "learning_rate": 1.1994677066945584e-05, + "loss": 0.0059, + "num_input_tokens_seen": 27114752, + "step": 128485 + }, + { + "epoch": 14.135313531353136, + "grad_norm": 0.029969897121191025, + "learning_rate": 1.1992627381189073e-05, + "loss": 0.0064, + "num_input_tokens_seen": 27115840, + "step": 128490 + }, + { + "epoch": 14.135863586358635, + "grad_norm": 0.01364404708147049, + "learning_rate": 1.1990577815315282e-05, + "loss": 0.0931, + "num_input_tokens_seen": 27116864, + "step": 128495 + }, + { + "epoch": 14.136413641364136, + "grad_norm": 0.014972567558288574, + "learning_rate": 1.1988528369343074e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27117920, + "step": 128500 + }, + { + "epoch": 14.136963696369637, + "grad_norm": 0.27501121163368225, + "learning_rate": 1.1986479043291373e-05, + "loss": 0.0157, + "num_input_tokens_seen": 27119040, + "step": 128505 + }, + { + "epoch": 14.137513751375138, + "grad_norm": 0.006140085402876139, + "learning_rate": 1.1984429837179053e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27120096, + "step": 128510 + }, + { + "epoch": 14.138063806380638, + "grad_norm": 1.49181067943573, + "learning_rate": 1.1982380751024991e-05, + "loss": 0.1894, + "num_input_tokens_seen": 27121152, + "step": 128515 + }, + { + "epoch": 14.138613861386139, + "grad_norm": 2.706124782562256, + "learning_rate": 1.1980331784848084e-05, + "loss": 0.063, + "num_input_tokens_seen": 27122240, + "step": 128520 + }, + { + "epoch": 14.13916391639164, + "grad_norm": 0.05967895686626434, + "learning_rate": 1.1978282938667204e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27123232, + "step": 128525 + }, + { + "epoch": 14.13971397139714, + "grad_norm": 2.291966438293457, + "learning_rate": 1.1976234212501241e-05, + "loss": 0.0131, + "num_input_tokens_seen": 27124256, + "step": 128530 + }, + { + "epoch": 14.14026402640264, + "grad_norm": 0.060019414871931076, + "learning_rate": 1.197418560636909e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27125248, + "step": 128535 + }, + { + "epoch": 14.140814081408141, + "grad_norm": 0.03311997652053833, + "learning_rate": 1.1972137120289607e-05, + "loss": 0.0136, + "num_input_tokens_seen": 27126336, + "step": 128540 + }, + { + "epoch": 14.14136413641364, + "grad_norm": 0.03542887791991234, + "learning_rate": 1.1970088754281694e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27127392, + "step": 128545 + }, + { + "epoch": 14.141914191419142, + "grad_norm": 0.04877682030200958, + "learning_rate": 1.1968040508364214e-05, + "loss": 0.0928, + "num_input_tokens_seen": 27128448, + "step": 128550 + }, + { + "epoch": 14.142464246424643, + "grad_norm": 0.0075333802960813046, + "learning_rate": 1.196599238255606e-05, + "loss": 0.0328, + "num_input_tokens_seen": 27129504, + "step": 128555 + }, + { + "epoch": 14.143014301430142, + "grad_norm": 0.05227026715874672, + "learning_rate": 1.1963944376876088e-05, + "loss": 0.0139, + "num_input_tokens_seen": 27130528, + "step": 128560 + }, + { + "epoch": 14.143564356435643, + "grad_norm": 0.09211918711662292, + "learning_rate": 1.1961896491343188e-05, + "loss": 0.0121, + "num_input_tokens_seen": 27131616, + "step": 128565 + }, + { + "epoch": 14.144114411441144, + "grad_norm": 0.004895404912531376, + "learning_rate": 1.1959848725976242e-05, + "loss": 0.096, + "num_input_tokens_seen": 27132640, + "step": 128570 + }, + { + "epoch": 14.144664466446645, + "grad_norm": 0.015237958170473576, + "learning_rate": 1.195780108079411e-05, + "loss": 0.0042, + "num_input_tokens_seen": 27133728, + "step": 128575 + }, + { + "epoch": 14.145214521452145, + "grad_norm": 1.9488807916641235, + "learning_rate": 1.195575355581566e-05, + "loss": 0.0369, + "num_input_tokens_seen": 27134784, + "step": 128580 + }, + { + "epoch": 14.145764576457646, + "grad_norm": 0.046529293060302734, + "learning_rate": 1.1953706151059768e-05, + "loss": 0.0045, + "num_input_tokens_seen": 27135776, + "step": 128585 + }, + { + "epoch": 14.146314631463147, + "grad_norm": 0.18310612440109253, + "learning_rate": 1.195165886654531e-05, + "loss": 0.0656, + "num_input_tokens_seen": 27136832, + "step": 128590 + }, + { + "epoch": 14.146864686468646, + "grad_norm": 0.04316579923033714, + "learning_rate": 1.1949611702291156e-05, + "loss": 0.0331, + "num_input_tokens_seen": 27137888, + "step": 128595 + }, + { + "epoch": 14.147414741474147, + "grad_norm": 0.07734780758619308, + "learning_rate": 1.194756465831617e-05, + "loss": 0.0047, + "num_input_tokens_seen": 27138944, + "step": 128600 + }, + { + "epoch": 14.147964796479648, + "grad_norm": 0.10230222344398499, + "learning_rate": 1.1945517734639208e-05, + "loss": 0.0049, + "num_input_tokens_seen": 27139968, + "step": 128605 + }, + { + "epoch": 14.148514851485148, + "grad_norm": 0.014604332856833935, + "learning_rate": 1.1943470931279157e-05, + "loss": 0.0108, + "num_input_tokens_seen": 27141024, + "step": 128610 + }, + { + "epoch": 14.149064906490649, + "grad_norm": 0.08282092213630676, + "learning_rate": 1.1941424248254857e-05, + "loss": 0.0568, + "num_input_tokens_seen": 27142080, + "step": 128615 + }, + { + "epoch": 14.14961496149615, + "grad_norm": 0.038974106311798096, + "learning_rate": 1.1939377685585184e-05, + "loss": 0.0886, + "num_input_tokens_seen": 27143136, + "step": 128620 + }, + { + "epoch": 14.150165016501651, + "grad_norm": 0.030433766543865204, + "learning_rate": 1.193733124328901e-05, + "loss": 0.054, + "num_input_tokens_seen": 27144224, + "step": 128625 + }, + { + "epoch": 14.15071507150715, + "grad_norm": 1.2244153022766113, + "learning_rate": 1.1935284921385176e-05, + "loss": 0.0356, + "num_input_tokens_seen": 27145280, + "step": 128630 + }, + { + "epoch": 14.151265126512651, + "grad_norm": 0.5219004154205322, + "learning_rate": 1.1933238719892562e-05, + "loss": 0.0127, + "num_input_tokens_seen": 27146304, + "step": 128635 + }, + { + "epoch": 14.151815181518153, + "grad_norm": 0.00797155499458313, + "learning_rate": 1.1931192638830008e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27147392, + "step": 128640 + }, + { + "epoch": 14.152365236523652, + "grad_norm": 0.9521945714950562, + "learning_rate": 1.1929146678216377e-05, + "loss": 0.01, + "num_input_tokens_seen": 27148384, + "step": 128645 + }, + { + "epoch": 14.152915291529153, + "grad_norm": 0.4687485098838806, + "learning_rate": 1.192710083807054e-05, + "loss": 0.0086, + "num_input_tokens_seen": 27149344, + "step": 128650 + }, + { + "epoch": 14.153465346534654, + "grad_norm": 0.20809847116470337, + "learning_rate": 1.1925055118411336e-05, + "loss": 0.0082, + "num_input_tokens_seen": 27150432, + "step": 128655 + }, + { + "epoch": 14.154015401540153, + "grad_norm": 1.5100526809692383, + "learning_rate": 1.192300951925763e-05, + "loss": 0.1297, + "num_input_tokens_seen": 27151520, + "step": 128660 + }, + { + "epoch": 14.154565456545654, + "grad_norm": 0.0837687999010086, + "learning_rate": 1.192096404062826e-05, + "loss": 0.0328, + "num_input_tokens_seen": 27152576, + "step": 128665 + }, + { + "epoch": 14.155115511551156, + "grad_norm": 0.03735318407416344, + "learning_rate": 1.1918918682542101e-05, + "loss": 0.0195, + "num_input_tokens_seen": 27153664, + "step": 128670 + }, + { + "epoch": 14.155665566556655, + "grad_norm": 0.16082219779491425, + "learning_rate": 1.1916873445017982e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27154816, + "step": 128675 + }, + { + "epoch": 14.156215621562156, + "grad_norm": 0.030416321009397507, + "learning_rate": 1.1914828328074763e-05, + "loss": 0.2577, + "num_input_tokens_seen": 27155904, + "step": 128680 + }, + { + "epoch": 14.156765676567657, + "grad_norm": 0.23490171134471893, + "learning_rate": 1.1912783331731298e-05, + "loss": 0.0079, + "num_input_tokens_seen": 27156928, + "step": 128685 + }, + { + "epoch": 14.157315731573158, + "grad_norm": 0.023376815021038055, + "learning_rate": 1.1910738456006423e-05, + "loss": 0.0116, + "num_input_tokens_seen": 27157952, + "step": 128690 + }, + { + "epoch": 14.157865786578657, + "grad_norm": 0.1541920006275177, + "learning_rate": 1.1908693700918999e-05, + "loss": 0.028, + "num_input_tokens_seen": 27158944, + "step": 128695 + }, + { + "epoch": 14.158415841584159, + "grad_norm": 0.017404058948159218, + "learning_rate": 1.1906649066487854e-05, + "loss": 0.0025, + "num_input_tokens_seen": 27160000, + "step": 128700 + }, + { + "epoch": 14.15896589658966, + "grad_norm": 0.26229777932167053, + "learning_rate": 1.1904604552731843e-05, + "loss": 0.0053, + "num_input_tokens_seen": 27161088, + "step": 128705 + }, + { + "epoch": 14.159515951595159, + "grad_norm": 0.05188896507024765, + "learning_rate": 1.1902560159669814e-05, + "loss": 0.0086, + "num_input_tokens_seen": 27162112, + "step": 128710 + }, + { + "epoch": 14.16006600660066, + "grad_norm": 0.03367815911769867, + "learning_rate": 1.1900515887320599e-05, + "loss": 0.0089, + "num_input_tokens_seen": 27163200, + "step": 128715 + }, + { + "epoch": 14.160616061606161, + "grad_norm": 0.01389265339821577, + "learning_rate": 1.1898471735703052e-05, + "loss": 0.0739, + "num_input_tokens_seen": 27164224, + "step": 128720 + }, + { + "epoch": 14.16116611661166, + "grad_norm": 0.017407674342393875, + "learning_rate": 1.1896427704836e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27165344, + "step": 128725 + }, + { + "epoch": 14.161716171617162, + "grad_norm": 0.01726483181118965, + "learning_rate": 1.189438379473828e-05, + "loss": 0.0061, + "num_input_tokens_seen": 27166368, + "step": 128730 + }, + { + "epoch": 14.162266226622663, + "grad_norm": 0.028743335977196693, + "learning_rate": 1.1892340005428735e-05, + "loss": 0.0058, + "num_input_tokens_seen": 27167424, + "step": 128735 + }, + { + "epoch": 14.162816281628162, + "grad_norm": 5.1928911209106445, + "learning_rate": 1.1890296336926202e-05, + "loss": 0.1403, + "num_input_tokens_seen": 27168544, + "step": 128740 + }, + { + "epoch": 14.163366336633663, + "grad_norm": 0.06064820662140846, + "learning_rate": 1.1888252789249526e-05, + "loss": 0.0029, + "num_input_tokens_seen": 27169632, + "step": 128745 + }, + { + "epoch": 14.163916391639164, + "grad_norm": 0.010803301818668842, + "learning_rate": 1.1886209362417535e-05, + "loss": 0.0046, + "num_input_tokens_seen": 27170624, + "step": 128750 + }, + { + "epoch": 14.164466446644665, + "grad_norm": 1.9683341979980469, + "learning_rate": 1.1884166056449048e-05, + "loss": 0.079, + "num_input_tokens_seen": 27171648, + "step": 128755 + }, + { + "epoch": 14.165016501650165, + "grad_norm": 0.012330528348684311, + "learning_rate": 1.1882122871362911e-05, + "loss": 0.0479, + "num_input_tokens_seen": 27172640, + "step": 128760 + }, + { + "epoch": 14.165566556655666, + "grad_norm": 0.028092484921216965, + "learning_rate": 1.1880079807177961e-05, + "loss": 0.0144, + "num_input_tokens_seen": 27173728, + "step": 128765 + }, + { + "epoch": 14.166116611661167, + "grad_norm": 0.06513094902038574, + "learning_rate": 1.1878036863913008e-05, + "loss": 0.0039, + "num_input_tokens_seen": 27174720, + "step": 128770 + }, + { + "epoch": 14.166666666666666, + "grad_norm": 2.9375298023223877, + "learning_rate": 1.1875994041586905e-05, + "loss": 0.012, + "num_input_tokens_seen": 27175840, + "step": 128775 + }, + { + "epoch": 14.167216721672167, + "grad_norm": 0.01170995831489563, + "learning_rate": 1.1873951340218459e-05, + "loss": 0.0792, + "num_input_tokens_seen": 27176896, + "step": 128780 + }, + { + "epoch": 14.167766776677668, + "grad_norm": 0.5559548139572144, + "learning_rate": 1.1871908759826512e-05, + "loss": 0.0812, + "num_input_tokens_seen": 27177888, + "step": 128785 + }, + { + "epoch": 14.168316831683168, + "grad_norm": 0.040774039924144745, + "learning_rate": 1.1869866300429874e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27178912, + "step": 128790 + }, + { + "epoch": 14.168866886688669, + "grad_norm": 0.006101343315094709, + "learning_rate": 1.1867823962047378e-05, + "loss": 0.0354, + "num_input_tokens_seen": 27179936, + "step": 128795 + }, + { + "epoch": 14.16941694169417, + "grad_norm": 2.3144962787628174, + "learning_rate": 1.1865781744697858e-05, + "loss": 0.062, + "num_input_tokens_seen": 27181120, + "step": 128800 + }, + { + "epoch": 14.16996699669967, + "grad_norm": 0.07462958991527557, + "learning_rate": 1.1863739648400112e-05, + "loss": 0.003, + "num_input_tokens_seen": 27182176, + "step": 128805 + }, + { + "epoch": 14.17051705170517, + "grad_norm": 0.060730960220098495, + "learning_rate": 1.1861697673172986e-05, + "loss": 0.0029, + "num_input_tokens_seen": 27183200, + "step": 128810 + }, + { + "epoch": 14.171067106710671, + "grad_norm": 1.3933995962142944, + "learning_rate": 1.1859655819035279e-05, + "loss": 0.0726, + "num_input_tokens_seen": 27184320, + "step": 128815 + }, + { + "epoch": 14.171617161716172, + "grad_norm": 0.04609999060630798, + "learning_rate": 1.185761408600582e-05, + "loss": 0.0582, + "num_input_tokens_seen": 27185376, + "step": 128820 + }, + { + "epoch": 14.172167216721672, + "grad_norm": 0.052015338093042374, + "learning_rate": 1.1855572474103433e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27186464, + "step": 128825 + }, + { + "epoch": 14.172717271727173, + "grad_norm": 0.008326933719217777, + "learning_rate": 1.1853530983346916e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27187520, + "step": 128830 + }, + { + "epoch": 14.173267326732674, + "grad_norm": 0.05839899182319641, + "learning_rate": 1.185148961375511e-05, + "loss": 0.0648, + "num_input_tokens_seen": 27188672, + "step": 128835 + }, + { + "epoch": 14.173817381738173, + "grad_norm": 0.33166441321372986, + "learning_rate": 1.1849448365346813e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27189696, + "step": 128840 + }, + { + "epoch": 14.174367436743674, + "grad_norm": 0.08853689581155777, + "learning_rate": 1.1847407238140828e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27190784, + "step": 128845 + }, + { + "epoch": 14.174917491749175, + "grad_norm": 0.05507967248558998, + "learning_rate": 1.1845366232155983e-05, + "loss": 0.0394, + "num_input_tokens_seen": 27191744, + "step": 128850 + }, + { + "epoch": 14.175467546754675, + "grad_norm": 0.0359179750084877, + "learning_rate": 1.1843325347411082e-05, + "loss": 0.0049, + "num_input_tokens_seen": 27192832, + "step": 128855 + }, + { + "epoch": 14.176017601760176, + "grad_norm": 0.23415416479110718, + "learning_rate": 1.184128458392495e-05, + "loss": 0.1626, + "num_input_tokens_seen": 27193856, + "step": 128860 + }, + { + "epoch": 14.176567656765677, + "grad_norm": 0.017992692068219185, + "learning_rate": 1.1839243941716383e-05, + "loss": 0.0063, + "num_input_tokens_seen": 27194912, + "step": 128865 + }, + { + "epoch": 14.177117711771178, + "grad_norm": 0.013633709400892258, + "learning_rate": 1.183720342080418e-05, + "loss": 0.0022, + "num_input_tokens_seen": 27195936, + "step": 128870 + }, + { + "epoch": 14.177667766776677, + "grad_norm": 0.05507785081863403, + "learning_rate": 1.1835163021207157e-05, + "loss": 0.0028, + "num_input_tokens_seen": 27197088, + "step": 128875 + }, + { + "epoch": 14.178217821782178, + "grad_norm": 0.8447260856628418, + "learning_rate": 1.183312274294412e-05, + "loss": 0.0087, + "num_input_tokens_seen": 27198176, + "step": 128880 + }, + { + "epoch": 14.17876787678768, + "grad_norm": 0.02089504897594452, + "learning_rate": 1.1831082586033882e-05, + "loss": 0.0117, + "num_input_tokens_seen": 27199264, + "step": 128885 + }, + { + "epoch": 14.179317931793179, + "grad_norm": 0.04528655484318733, + "learning_rate": 1.1829042550495233e-05, + "loss": 0.0009, + "num_input_tokens_seen": 27200288, + "step": 128890 + }, + { + "epoch": 14.17986798679868, + "grad_norm": 0.1859896332025528, + "learning_rate": 1.1827002636346974e-05, + "loss": 0.076, + "num_input_tokens_seen": 27201344, + "step": 128895 + }, + { + "epoch": 14.180418041804181, + "grad_norm": 0.01723637804389, + "learning_rate": 1.1824962843607921e-05, + "loss": 0.0056, + "num_input_tokens_seen": 27202368, + "step": 128900 + }, + { + "epoch": 14.18096809680968, + "grad_norm": 0.01545809954404831, + "learning_rate": 1.1822923172296848e-05, + "loss": 0.0033, + "num_input_tokens_seen": 27203392, + "step": 128905 + }, + { + "epoch": 14.181518151815181, + "grad_norm": 1.1416378021240234, + "learning_rate": 1.1820883622432574e-05, + "loss": 0.0112, + "num_input_tokens_seen": 27204416, + "step": 128910 + }, + { + "epoch": 14.182068206820682, + "grad_norm": 0.38508492708206177, + "learning_rate": 1.1818844194033901e-05, + "loss": 0.0038, + "num_input_tokens_seen": 27205504, + "step": 128915 + }, + { + "epoch": 14.182618261826182, + "grad_norm": 0.7925436496734619, + "learning_rate": 1.1816804887119604e-05, + "loss": 0.0717, + "num_input_tokens_seen": 27206560, + "step": 128920 + }, + { + "epoch": 14.183168316831683, + "grad_norm": 0.03292934596538544, + "learning_rate": 1.1814765701708502e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27207648, + "step": 128925 + }, + { + "epoch": 14.183718371837184, + "grad_norm": 0.017746947705745697, + "learning_rate": 1.181272663781937e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27208704, + "step": 128930 + }, + { + "epoch": 14.184268426842685, + "grad_norm": 0.009539778344333172, + "learning_rate": 1.1810687695471004e-05, + "loss": 0.0063, + "num_input_tokens_seen": 27209824, + "step": 128935 + }, + { + "epoch": 14.184818481848184, + "grad_norm": 0.3420461118221283, + "learning_rate": 1.1808648874682212e-05, + "loss": 0.0491, + "num_input_tokens_seen": 27210880, + "step": 128940 + }, + { + "epoch": 14.185368536853685, + "grad_norm": 0.04551989585161209, + "learning_rate": 1.1806610175471766e-05, + "loss": 0.0219, + "num_input_tokens_seen": 27211968, + "step": 128945 + }, + { + "epoch": 14.185918591859187, + "grad_norm": 0.013817716389894485, + "learning_rate": 1.1804571597858471e-05, + "loss": 0.0033, + "num_input_tokens_seen": 27213056, + "step": 128950 + }, + { + "epoch": 14.186468646864686, + "grad_norm": 1.4222750663757324, + "learning_rate": 1.1802533141861108e-05, + "loss": 0.0731, + "num_input_tokens_seen": 27214112, + "step": 128955 + }, + { + "epoch": 14.187018701870187, + "grad_norm": 1.3516597747802734, + "learning_rate": 1.1800494807498455e-05, + "loss": 0.0092, + "num_input_tokens_seen": 27215104, + "step": 128960 + }, + { + "epoch": 14.187568756875688, + "grad_norm": 0.14022725820541382, + "learning_rate": 1.1798456594789306e-05, + "loss": 0.0061, + "num_input_tokens_seen": 27216160, + "step": 128965 + }, + { + "epoch": 14.188118811881187, + "grad_norm": 0.42992478609085083, + "learning_rate": 1.179641850375245e-05, + "loss": 0.0072, + "num_input_tokens_seen": 27217280, + "step": 128970 + }, + { + "epoch": 14.188668866886688, + "grad_norm": 0.2930159866809845, + "learning_rate": 1.1794380534406677e-05, + "loss": 0.0305, + "num_input_tokens_seen": 27218272, + "step": 128975 + }, + { + "epoch": 14.18921892189219, + "grad_norm": 0.02872692048549652, + "learning_rate": 1.1792342686770763e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27219328, + "step": 128980 + }, + { + "epoch": 14.189768976897689, + "grad_norm": 0.5246127247810364, + "learning_rate": 1.179030496086348e-05, + "loss": 0.0053, + "num_input_tokens_seen": 27220384, + "step": 128985 + }, + { + "epoch": 14.19031903190319, + "grad_norm": 0.04736243188381195, + "learning_rate": 1.1788267356703616e-05, + "loss": 0.0153, + "num_input_tokens_seen": 27221376, + "step": 128990 + }, + { + "epoch": 14.190869086908691, + "grad_norm": 0.05292923003435135, + "learning_rate": 1.1786229874309951e-05, + "loss": 0.0794, + "num_input_tokens_seen": 27222368, + "step": 128995 + }, + { + "epoch": 14.191419141914192, + "grad_norm": 0.0591539666056633, + "learning_rate": 1.1784192513701279e-05, + "loss": 0.0277, + "num_input_tokens_seen": 27223360, + "step": 129000 + }, + { + "epoch": 14.191969196919691, + "grad_norm": 0.016297858208417892, + "learning_rate": 1.1782155274896358e-05, + "loss": 0.0612, + "num_input_tokens_seen": 27224448, + "step": 129005 + }, + { + "epoch": 14.192519251925193, + "grad_norm": 0.024430975317955017, + "learning_rate": 1.178011815791396e-05, + "loss": 0.0605, + "num_input_tokens_seen": 27225472, + "step": 129010 + }, + { + "epoch": 14.193069306930694, + "grad_norm": 0.1545984297990799, + "learning_rate": 1.1778081162772883e-05, + "loss": 0.006, + "num_input_tokens_seen": 27226496, + "step": 129015 + }, + { + "epoch": 14.193619361936193, + "grad_norm": 0.03945233300328255, + "learning_rate": 1.1776044289491874e-05, + "loss": 0.004, + "num_input_tokens_seen": 27227584, + "step": 129020 + }, + { + "epoch": 14.194169416941694, + "grad_norm": 0.11580120027065277, + "learning_rate": 1.1774007538089721e-05, + "loss": 0.0035, + "num_input_tokens_seen": 27228736, + "step": 129025 + }, + { + "epoch": 14.194719471947195, + "grad_norm": 0.14848561584949493, + "learning_rate": 1.1771970908585203e-05, + "loss": 0.0102, + "num_input_tokens_seen": 27229760, + "step": 129030 + }, + { + "epoch": 14.195269526952695, + "grad_norm": 0.013494561426341534, + "learning_rate": 1.1769934400997073e-05, + "loss": 0.0162, + "num_input_tokens_seen": 27230816, + "step": 129035 + }, + { + "epoch": 14.195819581958196, + "grad_norm": 0.1023697555065155, + "learning_rate": 1.176789801534412e-05, + "loss": 0.0045, + "num_input_tokens_seen": 27231872, + "step": 129040 + }, + { + "epoch": 14.196369636963697, + "grad_norm": 0.1556624323129654, + "learning_rate": 1.1765861751645088e-05, + "loss": 0.0131, + "num_input_tokens_seen": 27232928, + "step": 129045 + }, + { + "epoch": 14.196919691969198, + "grad_norm": 0.047009408473968506, + "learning_rate": 1.176382560991876e-05, + "loss": 0.0025, + "num_input_tokens_seen": 27234016, + "step": 129050 + }, + { + "epoch": 14.197469746974697, + "grad_norm": 0.02126404456794262, + "learning_rate": 1.176178959018391e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27235040, + "step": 129055 + }, + { + "epoch": 14.198019801980198, + "grad_norm": 0.11668895184993744, + "learning_rate": 1.1759753692459286e-05, + "loss": 0.0177, + "num_input_tokens_seen": 27236064, + "step": 129060 + }, + { + "epoch": 14.1985698569857, + "grad_norm": 0.22217318415641785, + "learning_rate": 1.1757717916763663e-05, + "loss": 0.014, + "num_input_tokens_seen": 27237088, + "step": 129065 + }, + { + "epoch": 14.199119911991199, + "grad_norm": 0.2446339875459671, + "learning_rate": 1.1755682263115795e-05, + "loss": 0.003, + "num_input_tokens_seen": 27238112, + "step": 129070 + }, + { + "epoch": 14.1996699669967, + "grad_norm": 0.004522038158029318, + "learning_rate": 1.1753646731534455e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27239104, + "step": 129075 + }, + { + "epoch": 14.2002200220022, + "grad_norm": 0.2424781769514084, + "learning_rate": 1.1751611322038389e-05, + "loss": 0.0084, + "num_input_tokens_seen": 27240128, + "step": 129080 + }, + { + "epoch": 14.2007700770077, + "grad_norm": 0.010574910789728165, + "learning_rate": 1.1749576034646363e-05, + "loss": 0.0814, + "num_input_tokens_seen": 27241184, + "step": 129085 + }, + { + "epoch": 14.201320132013201, + "grad_norm": 0.621362566947937, + "learning_rate": 1.1747540869377146e-05, + "loss": 0.0406, + "num_input_tokens_seen": 27242208, + "step": 129090 + }, + { + "epoch": 14.201870187018702, + "grad_norm": 0.02963152900338173, + "learning_rate": 1.1745505826249476e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27243264, + "step": 129095 + }, + { + "epoch": 14.202420242024202, + "grad_norm": 0.5846498608589172, + "learning_rate": 1.174347090528213e-05, + "loss": 0.0572, + "num_input_tokens_seen": 27244352, + "step": 129100 + }, + { + "epoch": 14.202970297029703, + "grad_norm": 0.20979374647140503, + "learning_rate": 1.1741436106493842e-05, + "loss": 0.0124, + "num_input_tokens_seen": 27245408, + "step": 129105 + }, + { + "epoch": 14.203520352035204, + "grad_norm": 0.6563933491706848, + "learning_rate": 1.1739401429903374e-05, + "loss": 0.0082, + "num_input_tokens_seen": 27246464, + "step": 129110 + }, + { + "epoch": 14.204070407040705, + "grad_norm": 0.0053391167894005775, + "learning_rate": 1.1737366875529488e-05, + "loss": 0.0196, + "num_input_tokens_seen": 27247520, + "step": 129115 + }, + { + "epoch": 14.204620462046204, + "grad_norm": 0.01664707437157631, + "learning_rate": 1.173533244339092e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27248544, + "step": 129120 + }, + { + "epoch": 14.205170517051705, + "grad_norm": 0.25305965542793274, + "learning_rate": 1.1733298133506437e-05, + "loss": 0.0882, + "num_input_tokens_seen": 27249600, + "step": 129125 + }, + { + "epoch": 14.205720572057206, + "grad_norm": 0.08374730497598648, + "learning_rate": 1.173126394589478e-05, + "loss": 0.1527, + "num_input_tokens_seen": 27250688, + "step": 129130 + }, + { + "epoch": 14.206270627062706, + "grad_norm": 0.22749517858028412, + "learning_rate": 1.1729229880574685e-05, + "loss": 0.0043, + "num_input_tokens_seen": 27251744, + "step": 129135 + }, + { + "epoch": 14.206820682068207, + "grad_norm": 3.718397617340088, + "learning_rate": 1.1727195937564911e-05, + "loss": 0.1482, + "num_input_tokens_seen": 27252832, + "step": 129140 + }, + { + "epoch": 14.207370737073708, + "grad_norm": 1.8981002569198608, + "learning_rate": 1.1725162116884203e-05, + "loss": 0.0804, + "num_input_tokens_seen": 27253824, + "step": 129145 + }, + { + "epoch": 14.207920792079207, + "grad_norm": 0.010833581909537315, + "learning_rate": 1.1723128418551315e-05, + "loss": 0.0055, + "num_input_tokens_seen": 27254816, + "step": 129150 + }, + { + "epoch": 14.208470847084708, + "grad_norm": 1.5878138542175293, + "learning_rate": 1.1721094842584985e-05, + "loss": 0.0799, + "num_input_tokens_seen": 27255840, + "step": 129155 + }, + { + "epoch": 14.20902090209021, + "grad_norm": 0.01281706616282463, + "learning_rate": 1.1719061389003938e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27256896, + "step": 129160 + }, + { + "epoch": 14.209570957095709, + "grad_norm": 0.010136970318853855, + "learning_rate": 1.1717028057826932e-05, + "loss": 0.094, + "num_input_tokens_seen": 27257888, + "step": 129165 + }, + { + "epoch": 14.21012101210121, + "grad_norm": 0.05550255626440048, + "learning_rate": 1.171499484907271e-05, + "loss": 0.0521, + "num_input_tokens_seen": 27258944, + "step": 129170 + }, + { + "epoch": 14.210671067106711, + "grad_norm": 0.014197039417922497, + "learning_rate": 1.1712961762759999e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27259904, + "step": 129175 + }, + { + "epoch": 14.211221122112212, + "grad_norm": 4.1929097175598145, + "learning_rate": 1.1710928798907556e-05, + "loss": 0.0478, + "num_input_tokens_seen": 27260992, + "step": 129180 + }, + { + "epoch": 14.211771177117711, + "grad_norm": 0.0648784339427948, + "learning_rate": 1.1708895957534094e-05, + "loss": 0.0029, + "num_input_tokens_seen": 27262080, + "step": 129185 + }, + { + "epoch": 14.212321232123212, + "grad_norm": 0.03988896310329437, + "learning_rate": 1.1706863238658369e-05, + "loss": 0.127, + "num_input_tokens_seen": 27263136, + "step": 129190 + }, + { + "epoch": 14.212871287128714, + "grad_norm": 0.38551902770996094, + "learning_rate": 1.1704830642299098e-05, + "loss": 0.0069, + "num_input_tokens_seen": 27264160, + "step": 129195 + }, + { + "epoch": 14.213421342134213, + "grad_norm": 0.014639410190284252, + "learning_rate": 1.1702798168475021e-05, + "loss": 0.0234, + "num_input_tokens_seen": 27265248, + "step": 129200 + }, + { + "epoch": 14.213971397139714, + "grad_norm": 0.2966633141040802, + "learning_rate": 1.1700765817204884e-05, + "loss": 0.0065, + "num_input_tokens_seen": 27266272, + "step": 129205 + }, + { + "epoch": 14.214521452145215, + "grad_norm": 0.006570503115653992, + "learning_rate": 1.16987335885074e-05, + "loss": 0.0011, + "num_input_tokens_seen": 27267360, + "step": 129210 + }, + { + "epoch": 14.215071507150714, + "grad_norm": 0.09579227864742279, + "learning_rate": 1.1696701482401312e-05, + "loss": 0.0019, + "num_input_tokens_seen": 27268352, + "step": 129215 + }, + { + "epoch": 14.215621562156215, + "grad_norm": 0.02420484460890293, + "learning_rate": 1.1694669498905345e-05, + "loss": 0.012, + "num_input_tokens_seen": 27269408, + "step": 129220 + }, + { + "epoch": 14.216171617161717, + "grad_norm": 0.0031976921018213034, + "learning_rate": 1.1692637638038203e-05, + "loss": 0.0051, + "num_input_tokens_seen": 27270400, + "step": 129225 + }, + { + "epoch": 14.216721672167218, + "grad_norm": 0.01521332748234272, + "learning_rate": 1.1690605899818655e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27271456, + "step": 129230 + }, + { + "epoch": 14.217271727172717, + "grad_norm": 1.4958505630493164, + "learning_rate": 1.1688574284265396e-05, + "loss": 0.0092, + "num_input_tokens_seen": 27272512, + "step": 129235 + }, + { + "epoch": 14.217821782178218, + "grad_norm": 0.02514842338860035, + "learning_rate": 1.1686542791397171e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27273600, + "step": 129240 + }, + { + "epoch": 14.218371837183719, + "grad_norm": 0.030032360926270485, + "learning_rate": 1.1684511421232691e-05, + "loss": 0.0022, + "num_input_tokens_seen": 27274656, + "step": 129245 + }, + { + "epoch": 14.218921892189218, + "grad_norm": 0.011482383124530315, + "learning_rate": 1.1682480173790672e-05, + "loss": 0.0893, + "num_input_tokens_seen": 27275712, + "step": 129250 + }, + { + "epoch": 14.21947194719472, + "grad_norm": 0.10042844712734222, + "learning_rate": 1.1680449049089842e-05, + "loss": 0.0039, + "num_input_tokens_seen": 27276800, + "step": 129255 + }, + { + "epoch": 14.22002200220022, + "grad_norm": 0.006460716016590595, + "learning_rate": 1.1678418047148923e-05, + "loss": 0.0038, + "num_input_tokens_seen": 27277920, + "step": 129260 + }, + { + "epoch": 14.22057205720572, + "grad_norm": 0.03839772567152977, + "learning_rate": 1.167638716798664e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27279008, + "step": 129265 + }, + { + "epoch": 14.221122112211221, + "grad_norm": 0.24559779465198517, + "learning_rate": 1.1674356411621704e-05, + "loss": 0.0045, + "num_input_tokens_seen": 27280096, + "step": 129270 + }, + { + "epoch": 14.221672167216722, + "grad_norm": 0.020756816491484642, + "learning_rate": 1.167232577807282e-05, + "loss": 0.1827, + "num_input_tokens_seen": 27281184, + "step": 129275 + }, + { + "epoch": 14.222222222222221, + "grad_norm": 0.15023663640022278, + "learning_rate": 1.1670295267358727e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27282208, + "step": 129280 + }, + { + "epoch": 14.222772277227723, + "grad_norm": 1.6736533641815186, + "learning_rate": 1.1668264879498105e-05, + "loss": 0.075, + "num_input_tokens_seen": 27283296, + "step": 129285 + }, + { + "epoch": 14.223322332233224, + "grad_norm": 0.042581651359796524, + "learning_rate": 1.1666234614509708e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27284384, + "step": 129290 + }, + { + "epoch": 14.223872387238725, + "grad_norm": 0.021142857149243355, + "learning_rate": 1.1664204472412227e-05, + "loss": 0.001, + "num_input_tokens_seen": 27285376, + "step": 129295 + }, + { + "epoch": 14.224422442244224, + "grad_norm": 0.004816921427845955, + "learning_rate": 1.1662174453224367e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27286464, + "step": 129300 + }, + { + "epoch": 14.224972497249725, + "grad_norm": 0.019170889630913734, + "learning_rate": 1.1660144556964853e-05, + "loss": 0.1667, + "num_input_tokens_seen": 27287552, + "step": 129305 + }, + { + "epoch": 14.225522552255226, + "grad_norm": 0.010616620071232319, + "learning_rate": 1.1658114783652376e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27288608, + "step": 129310 + }, + { + "epoch": 14.226072607260726, + "grad_norm": 0.012073417194187641, + "learning_rate": 1.1656085133305653e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27289664, + "step": 129315 + }, + { + "epoch": 14.226622662266227, + "grad_norm": 0.005312861874699593, + "learning_rate": 1.16540556059434e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27290752, + "step": 129320 + }, + { + "epoch": 14.227172717271728, + "grad_norm": 0.05058727785944939, + "learning_rate": 1.1652026201584305e-05, + "loss": 0.004, + "num_input_tokens_seen": 27291840, + "step": 129325 + }, + { + "epoch": 14.227722772277227, + "grad_norm": 0.09442248195409775, + "learning_rate": 1.1649996920247086e-05, + "loss": 0.0884, + "num_input_tokens_seen": 27292896, + "step": 129330 + }, + { + "epoch": 14.228272827282728, + "grad_norm": 0.021046746522188187, + "learning_rate": 1.1647967761950431e-05, + "loss": 0.0062, + "num_input_tokens_seen": 27293888, + "step": 129335 + }, + { + "epoch": 14.22882288228823, + "grad_norm": 4.154267311096191, + "learning_rate": 1.164593872671306e-05, + "loss": 0.0665, + "num_input_tokens_seen": 27295008, + "step": 129340 + }, + { + "epoch": 14.229372937293729, + "grad_norm": 0.053658660501241684, + "learning_rate": 1.1643909814553652e-05, + "loss": 0.0722, + "num_input_tokens_seen": 27296000, + "step": 129345 + }, + { + "epoch": 14.22992299229923, + "grad_norm": 0.258023738861084, + "learning_rate": 1.1641881025490922e-05, + "loss": 0.0031, + "num_input_tokens_seen": 27297024, + "step": 129350 + }, + { + "epoch": 14.23047304730473, + "grad_norm": 0.002300201216712594, + "learning_rate": 1.163985235954357e-05, + "loss": 0.0006, + "num_input_tokens_seen": 27298080, + "step": 129355 + }, + { + "epoch": 14.231023102310232, + "grad_norm": 0.004489366430789232, + "learning_rate": 1.1637823816730293e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27299104, + "step": 129360 + }, + { + "epoch": 14.231573157315731, + "grad_norm": 0.02236386574804783, + "learning_rate": 1.1635795397069772e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27300160, + "step": 129365 + }, + { + "epoch": 14.232123212321232, + "grad_norm": 0.013566595502197742, + "learning_rate": 1.1633767100580709e-05, + "loss": 0.001, + "num_input_tokens_seen": 27301248, + "step": 129370 + }, + { + "epoch": 14.232673267326733, + "grad_norm": 0.050480347126722336, + "learning_rate": 1.1631738927281802e-05, + "loss": 0.0969, + "num_input_tokens_seen": 27302272, + "step": 129375 + }, + { + "epoch": 14.233223322332233, + "grad_norm": 0.05551097169518471, + "learning_rate": 1.1629710877191752e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27303328, + "step": 129380 + }, + { + "epoch": 14.233773377337734, + "grad_norm": 0.5013087391853333, + "learning_rate": 1.1627682950329242e-05, + "loss": 0.0156, + "num_input_tokens_seen": 27304384, + "step": 129385 + }, + { + "epoch": 14.234323432343235, + "grad_norm": 0.03808395192027092, + "learning_rate": 1.1625655146712952e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27305440, + "step": 129390 + }, + { + "epoch": 14.234873487348734, + "grad_norm": 0.007792860269546509, + "learning_rate": 1.162362746636159e-05, + "loss": 0.0478, + "num_input_tokens_seen": 27306464, + "step": 129395 + }, + { + "epoch": 14.235423542354235, + "grad_norm": 0.015839340165257454, + "learning_rate": 1.1621599909293822e-05, + "loss": 0.0182, + "num_input_tokens_seen": 27307456, + "step": 129400 + }, + { + "epoch": 14.235973597359736, + "grad_norm": 0.02490662969648838, + "learning_rate": 1.1619572475528354e-05, + "loss": 0.0053, + "num_input_tokens_seen": 27308544, + "step": 129405 + }, + { + "epoch": 14.236523652365236, + "grad_norm": 0.20287853479385376, + "learning_rate": 1.1617545165083874e-05, + "loss": 0.0496, + "num_input_tokens_seen": 27309568, + "step": 129410 + }, + { + "epoch": 14.237073707370737, + "grad_norm": 0.004528282675892115, + "learning_rate": 1.1615517977979046e-05, + "loss": 0.002, + "num_input_tokens_seen": 27310592, + "step": 129415 + }, + { + "epoch": 14.237623762376238, + "grad_norm": 0.01172021683305502, + "learning_rate": 1.1613490914232578e-05, + "loss": 0.0364, + "num_input_tokens_seen": 27311616, + "step": 129420 + }, + { + "epoch": 14.238173817381739, + "grad_norm": 0.019933952018618584, + "learning_rate": 1.1611463973863132e-05, + "loss": 0.0242, + "num_input_tokens_seen": 27312736, + "step": 129425 + }, + { + "epoch": 14.238723872387238, + "grad_norm": 0.05068963021039963, + "learning_rate": 1.1609437156889396e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27313824, + "step": 129430 + }, + { + "epoch": 14.23927392739274, + "grad_norm": 0.00688830716535449, + "learning_rate": 1.1607410463330065e-05, + "loss": 0.0093, + "num_input_tokens_seen": 27314880, + "step": 129435 + }, + { + "epoch": 14.23982398239824, + "grad_norm": 0.11453837901353836, + "learning_rate": 1.1605383893203793e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27315936, + "step": 129440 + }, + { + "epoch": 14.24037403740374, + "grad_norm": 0.08700159937143326, + "learning_rate": 1.1603357446529282e-05, + "loss": 0.1001, + "num_input_tokens_seen": 27316928, + "step": 129445 + }, + { + "epoch": 14.24092409240924, + "grad_norm": 0.034671179950237274, + "learning_rate": 1.1601331123325185e-05, + "loss": 0.0065, + "num_input_tokens_seen": 27317952, + "step": 129450 + }, + { + "epoch": 14.241474147414742, + "grad_norm": 0.648689329624176, + "learning_rate": 1.15993049236102e-05, + "loss": 0.0113, + "num_input_tokens_seen": 27319008, + "step": 129455 + }, + { + "epoch": 14.242024202420241, + "grad_norm": 0.4578336179256439, + "learning_rate": 1.1597278847402983e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27319968, + "step": 129460 + }, + { + "epoch": 14.242574257425742, + "grad_norm": 0.013380956836044788, + "learning_rate": 1.1595252894722214e-05, + "loss": 0.005, + "num_input_tokens_seen": 27321120, + "step": 129465 + }, + { + "epoch": 14.243124312431243, + "grad_norm": 0.03291504830121994, + "learning_rate": 1.159322706558658e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27322144, + "step": 129470 + }, + { + "epoch": 14.243674367436745, + "grad_norm": 0.0018496867269277573, + "learning_rate": 1.1591201360014725e-05, + "loss": 0.0007, + "num_input_tokens_seen": 27323200, + "step": 129475 + }, + { + "epoch": 14.244224422442244, + "grad_norm": 0.014878914691507816, + "learning_rate": 1.1589175778025343e-05, + "loss": 0.0838, + "num_input_tokens_seen": 27324192, + "step": 129480 + }, + { + "epoch": 14.244774477447745, + "grad_norm": 0.010463141836225986, + "learning_rate": 1.1587150319637086e-05, + "loss": 0.0799, + "num_input_tokens_seen": 27325280, + "step": 129485 + }, + { + "epoch": 14.245324532453246, + "grad_norm": 0.04006011784076691, + "learning_rate": 1.1585124984868626e-05, + "loss": 0.0315, + "num_input_tokens_seen": 27326432, + "step": 129490 + }, + { + "epoch": 14.245874587458745, + "grad_norm": 0.03118104301393032, + "learning_rate": 1.1583099773738642e-05, + "loss": 0.0389, + "num_input_tokens_seen": 27327424, + "step": 129495 + }, + { + "epoch": 14.246424642464246, + "grad_norm": 0.04652518779039383, + "learning_rate": 1.1581074686265781e-05, + "loss": 0.0729, + "num_input_tokens_seen": 27328512, + "step": 129500 + }, + { + "epoch": 14.246974697469748, + "grad_norm": 0.24714785814285278, + "learning_rate": 1.1579049722468724e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27329568, + "step": 129505 + }, + { + "epoch": 14.247524752475247, + "grad_norm": 0.058961618691682816, + "learning_rate": 1.1577024882366125e-05, + "loss": 0.0665, + "num_input_tokens_seen": 27330624, + "step": 129510 + }, + { + "epoch": 14.248074807480748, + "grad_norm": 0.06008824706077576, + "learning_rate": 1.1575000165976638e-05, + "loss": 0.0363, + "num_input_tokens_seen": 27331712, + "step": 129515 + }, + { + "epoch": 14.248624862486249, + "grad_norm": 0.012657051905989647, + "learning_rate": 1.1572975573318932e-05, + "loss": 0.0088, + "num_input_tokens_seen": 27332768, + "step": 129520 + }, + { + "epoch": 14.249174917491748, + "grad_norm": 0.012721612118184566, + "learning_rate": 1.1570951104411667e-05, + "loss": 0.1857, + "num_input_tokens_seen": 27333856, + "step": 129525 + }, + { + "epoch": 14.24972497249725, + "grad_norm": 1.9848469495773315, + "learning_rate": 1.1568926759273513e-05, + "loss": 0.0429, + "num_input_tokens_seen": 27334912, + "step": 129530 + }, + { + "epoch": 14.25027502750275, + "grad_norm": 0.7971080541610718, + "learning_rate": 1.1566902537923116e-05, + "loss": 0.1083, + "num_input_tokens_seen": 27335936, + "step": 129535 + }, + { + "epoch": 14.250825082508252, + "grad_norm": 0.0265261922031641, + "learning_rate": 1.156487844037912e-05, + "loss": 0.0846, + "num_input_tokens_seen": 27336992, + "step": 129540 + }, + { + "epoch": 14.251375137513751, + "grad_norm": 0.02268890291452408, + "learning_rate": 1.1562854466660197e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27338048, + "step": 129545 + }, + { + "epoch": 14.251925192519252, + "grad_norm": 0.01831650547683239, + "learning_rate": 1.1560830616785004e-05, + "loss": 0.0126, + "num_input_tokens_seen": 27339040, + "step": 129550 + }, + { + "epoch": 14.252475247524753, + "grad_norm": 0.6711680293083191, + "learning_rate": 1.1558806890772175e-05, + "loss": 0.0065, + "num_input_tokens_seen": 27340128, + "step": 129555 + }, + { + "epoch": 14.253025302530252, + "grad_norm": 0.0409165620803833, + "learning_rate": 1.1556783288640386e-05, + "loss": 0.0351, + "num_input_tokens_seen": 27341120, + "step": 129560 + }, + { + "epoch": 14.253575357535754, + "grad_norm": 0.6984430551528931, + "learning_rate": 1.1554759810408264e-05, + "loss": 0.0145, + "num_input_tokens_seen": 27342144, + "step": 129565 + }, + { + "epoch": 14.254125412541255, + "grad_norm": 0.06833726912736893, + "learning_rate": 1.1552736456094476e-05, + "loss": 0.0149, + "num_input_tokens_seen": 27343200, + "step": 129570 + }, + { + "epoch": 14.254675467546754, + "grad_norm": 0.02905355393886566, + "learning_rate": 1.1550713225717658e-05, + "loss": 0.0265, + "num_input_tokens_seen": 27344224, + "step": 129575 + }, + { + "epoch": 14.255225522552255, + "grad_norm": 0.05848934128880501, + "learning_rate": 1.154869011929646e-05, + "loss": 0.0034, + "num_input_tokens_seen": 27345280, + "step": 129580 + }, + { + "epoch": 14.255775577557756, + "grad_norm": 0.16643142700195312, + "learning_rate": 1.1546667136849542e-05, + "loss": 0.0078, + "num_input_tokens_seen": 27346432, + "step": 129585 + }, + { + "epoch": 14.256325632563255, + "grad_norm": 0.01394888199865818, + "learning_rate": 1.1544644278395528e-05, + "loss": 0.0493, + "num_input_tokens_seen": 27347456, + "step": 129590 + }, + { + "epoch": 14.256875687568757, + "grad_norm": 0.011922691948711872, + "learning_rate": 1.1542621543953077e-05, + "loss": 0.0011, + "num_input_tokens_seen": 27348640, + "step": 129595 + }, + { + "epoch": 14.257425742574258, + "grad_norm": 0.030468393117189407, + "learning_rate": 1.1540598933540822e-05, + "loss": 0.0155, + "num_input_tokens_seen": 27349728, + "step": 129600 + }, + { + "epoch": 14.257975797579759, + "grad_norm": 4.006891250610352, + "learning_rate": 1.1538576447177405e-05, + "loss": 0.0261, + "num_input_tokens_seen": 27350752, + "step": 129605 + }, + { + "epoch": 14.258525852585258, + "grad_norm": 0.019872233271598816, + "learning_rate": 1.153655408488148e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27351840, + "step": 129610 + }, + { + "epoch": 14.25907590759076, + "grad_norm": 0.005909370724111795, + "learning_rate": 1.1534531846671665e-05, + "loss": 0.0019, + "num_input_tokens_seen": 27352896, + "step": 129615 + }, + { + "epoch": 14.25962596259626, + "grad_norm": 0.07544095069169998, + "learning_rate": 1.1532509732566619e-05, + "loss": 0.001, + "num_input_tokens_seen": 27353984, + "step": 129620 + }, + { + "epoch": 14.26017601760176, + "grad_norm": 0.005987656302750111, + "learning_rate": 1.1530487742584967e-05, + "loss": 0.0006, + "num_input_tokens_seen": 27355040, + "step": 129625 + }, + { + "epoch": 14.26072607260726, + "grad_norm": 2.7248294353485107, + "learning_rate": 1.1528465876745337e-05, + "loss": 0.1004, + "num_input_tokens_seen": 27356064, + "step": 129630 + }, + { + "epoch": 14.261276127612762, + "grad_norm": 0.01567390188574791, + "learning_rate": 1.1526444135066372e-05, + "loss": 0.0268, + "num_input_tokens_seen": 27357184, + "step": 129635 + }, + { + "epoch": 14.261826182618261, + "grad_norm": 0.49770060181617737, + "learning_rate": 1.1524422517566707e-05, + "loss": 0.0108, + "num_input_tokens_seen": 27358240, + "step": 129640 + }, + { + "epoch": 14.262376237623762, + "grad_norm": 0.12147451192140579, + "learning_rate": 1.1522401024264984e-05, + "loss": 0.0043, + "num_input_tokens_seen": 27359328, + "step": 129645 + }, + { + "epoch": 14.262926292629263, + "grad_norm": 0.050466734915971756, + "learning_rate": 1.1520379655179825e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27360384, + "step": 129650 + }, + { + "epoch": 14.263476347634764, + "grad_norm": 0.014609161764383316, + "learning_rate": 1.1518358410329846e-05, + "loss": 0.1388, + "num_input_tokens_seen": 27361472, + "step": 129655 + }, + { + "epoch": 14.264026402640264, + "grad_norm": 0.058536745607852936, + "learning_rate": 1.151633728973369e-05, + "loss": 0.002, + "num_input_tokens_seen": 27362496, + "step": 129660 + }, + { + "epoch": 14.264576457645765, + "grad_norm": 0.01020438689738512, + "learning_rate": 1.1514316293409983e-05, + "loss": 0.001, + "num_input_tokens_seen": 27363520, + "step": 129665 + }, + { + "epoch": 14.265126512651266, + "grad_norm": 0.01968807354569435, + "learning_rate": 1.151229542137736e-05, + "loss": 0.0009, + "num_input_tokens_seen": 27364608, + "step": 129670 + }, + { + "epoch": 14.265676567656765, + "grad_norm": 0.9461125135421753, + "learning_rate": 1.1510274673654438e-05, + "loss": 0.0099, + "num_input_tokens_seen": 27365600, + "step": 129675 + }, + { + "epoch": 14.266226622662266, + "grad_norm": 0.06392540782690048, + "learning_rate": 1.1508254050259834e-05, + "loss": 0.0055, + "num_input_tokens_seen": 27366656, + "step": 129680 + }, + { + "epoch": 14.266776677667767, + "grad_norm": 0.024760330095887184, + "learning_rate": 1.1506233551212186e-05, + "loss": 0.0392, + "num_input_tokens_seen": 27367680, + "step": 129685 + }, + { + "epoch": 14.267326732673267, + "grad_norm": 0.36210209131240845, + "learning_rate": 1.1504213176530099e-05, + "loss": 0.004, + "num_input_tokens_seen": 27368800, + "step": 129690 + }, + { + "epoch": 14.267876787678768, + "grad_norm": 0.11271829903125763, + "learning_rate": 1.1502192926232203e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27369888, + "step": 129695 + }, + { + "epoch": 14.268426842684269, + "grad_norm": 0.8590571284294128, + "learning_rate": 1.1500172800337128e-05, + "loss": 0.0877, + "num_input_tokens_seen": 27370944, + "step": 129700 + }, + { + "epoch": 14.268976897689768, + "grad_norm": 0.02128857560455799, + "learning_rate": 1.1498152798863474e-05, + "loss": 0.0046, + "num_input_tokens_seen": 27371936, + "step": 129705 + }, + { + "epoch": 14.26952695269527, + "grad_norm": 0.006447190418839455, + "learning_rate": 1.1496132921829874e-05, + "loss": 0.0061, + "num_input_tokens_seen": 27372992, + "step": 129710 + }, + { + "epoch": 14.27007700770077, + "grad_norm": 0.016631148755550385, + "learning_rate": 1.1494113169254931e-05, + "loss": 0.0277, + "num_input_tokens_seen": 27373984, + "step": 129715 + }, + { + "epoch": 14.270627062706271, + "grad_norm": 0.002882674802094698, + "learning_rate": 1.1492093541157265e-05, + "loss": 0.0353, + "num_input_tokens_seen": 27375040, + "step": 129720 + }, + { + "epoch": 14.27117711771177, + "grad_norm": 0.023854197934269905, + "learning_rate": 1.1490074037555499e-05, + "loss": 0.1105, + "num_input_tokens_seen": 27376032, + "step": 129725 + }, + { + "epoch": 14.271727172717272, + "grad_norm": 0.21074648201465607, + "learning_rate": 1.1488054658468228e-05, + "loss": 0.0056, + "num_input_tokens_seen": 27377152, + "step": 129730 + }, + { + "epoch": 14.272277227722773, + "grad_norm": 0.15620431303977966, + "learning_rate": 1.1486035403914085e-05, + "loss": 0.0104, + "num_input_tokens_seen": 27378176, + "step": 129735 + }, + { + "epoch": 14.272827282728272, + "grad_norm": 0.1029430478811264, + "learning_rate": 1.1484016273911669e-05, + "loss": 0.0042, + "num_input_tokens_seen": 27379232, + "step": 129740 + }, + { + "epoch": 14.273377337733773, + "grad_norm": 0.028627494350075722, + "learning_rate": 1.1481997268479578e-05, + "loss": 0.0346, + "num_input_tokens_seen": 27380288, + "step": 129745 + }, + { + "epoch": 14.273927392739274, + "grad_norm": 0.0296783410012722, + "learning_rate": 1.1479978387636434e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27381344, + "step": 129750 + }, + { + "epoch": 14.274477447744774, + "grad_norm": 0.012989350594580173, + "learning_rate": 1.147795963140084e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27382400, + "step": 129755 + }, + { + "epoch": 14.275027502750275, + "grad_norm": 3.626375198364258, + "learning_rate": 1.1475940999791413e-05, + "loss": 0.0499, + "num_input_tokens_seen": 27383424, + "step": 129760 + }, + { + "epoch": 14.275577557755776, + "grad_norm": 0.013980436138808727, + "learning_rate": 1.147392249282675e-05, + "loss": 0.0052, + "num_input_tokens_seen": 27384448, + "step": 129765 + }, + { + "epoch": 14.276127612761275, + "grad_norm": 0.01027559395879507, + "learning_rate": 1.1471904110525442e-05, + "loss": 0.0862, + "num_input_tokens_seen": 27385440, + "step": 129770 + }, + { + "epoch": 14.276677667766776, + "grad_norm": 0.3905324935913086, + "learning_rate": 1.1469885852906102e-05, + "loss": 0.0119, + "num_input_tokens_seen": 27386560, + "step": 129775 + }, + { + "epoch": 14.277227722772277, + "grad_norm": 0.11849288642406464, + "learning_rate": 1.1467867719987333e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27387616, + "step": 129780 + }, + { + "epoch": 14.277777777777779, + "grad_norm": 0.022918660193681717, + "learning_rate": 1.1465849711787744e-05, + "loss": 0.0506, + "num_input_tokens_seen": 27388704, + "step": 129785 + }, + { + "epoch": 14.278327832783278, + "grad_norm": 3.6656789779663086, + "learning_rate": 1.1463831828325921e-05, + "loss": 0.043, + "num_input_tokens_seen": 27389792, + "step": 129790 + }, + { + "epoch": 14.278877887788779, + "grad_norm": 0.010061751119792461, + "learning_rate": 1.1461814069620458e-05, + "loss": 0.0061, + "num_input_tokens_seen": 27390880, + "step": 129795 + }, + { + "epoch": 14.27942794279428, + "grad_norm": 0.07823111861944199, + "learning_rate": 1.1459796435689966e-05, + "loss": 0.063, + "num_input_tokens_seen": 27391936, + "step": 129800 + }, + { + "epoch": 14.27997799779978, + "grad_norm": 0.02728613279759884, + "learning_rate": 1.1457778926553026e-05, + "loss": 0.0064, + "num_input_tokens_seen": 27393024, + "step": 129805 + }, + { + "epoch": 14.28052805280528, + "grad_norm": 0.07963941991329193, + "learning_rate": 1.1455761542228236e-05, + "loss": 0.016, + "num_input_tokens_seen": 27394080, + "step": 129810 + }, + { + "epoch": 14.281078107810782, + "grad_norm": 0.2603183686733246, + "learning_rate": 1.1453744282734204e-05, + "loss": 0.1159, + "num_input_tokens_seen": 27395072, + "step": 129815 + }, + { + "epoch": 14.281628162816281, + "grad_norm": 0.0058570499531924725, + "learning_rate": 1.1451727148089502e-05, + "loss": 0.0039, + "num_input_tokens_seen": 27396128, + "step": 129820 + }, + { + "epoch": 14.282178217821782, + "grad_norm": 0.02461082488298416, + "learning_rate": 1.1449710138312739e-05, + "loss": 0.0046, + "num_input_tokens_seen": 27397184, + "step": 129825 + }, + { + "epoch": 14.282728272827283, + "grad_norm": 0.01474383007735014, + "learning_rate": 1.1447693253422484e-05, + "loss": 0.0183, + "num_input_tokens_seen": 27398240, + "step": 129830 + }, + { + "epoch": 14.283278327832782, + "grad_norm": 0.38939759135246277, + "learning_rate": 1.1445676493437341e-05, + "loss": 0.0055, + "num_input_tokens_seen": 27399296, + "step": 129835 + }, + { + "epoch": 14.283828382838283, + "grad_norm": 1.640649437904358, + "learning_rate": 1.1443659858375899e-05, + "loss": 0.0364, + "num_input_tokens_seen": 27400480, + "step": 129840 + }, + { + "epoch": 14.284378437843785, + "grad_norm": 0.01792934164404869, + "learning_rate": 1.1441643348256734e-05, + "loss": 0.0631, + "num_input_tokens_seen": 27401536, + "step": 129845 + }, + { + "epoch": 14.284928492849286, + "grad_norm": 0.08521737903356552, + "learning_rate": 1.1439626963098442e-05, + "loss": 0.0049, + "num_input_tokens_seen": 27402624, + "step": 129850 + }, + { + "epoch": 14.285478547854785, + "grad_norm": 0.0324343666434288, + "learning_rate": 1.1437610702919596e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27403712, + "step": 129855 + }, + { + "epoch": 14.286028602860286, + "grad_norm": 0.015633542090654373, + "learning_rate": 1.1435594567738791e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27404832, + "step": 129860 + }, + { + "epoch": 14.286578657865787, + "grad_norm": 0.006743628531694412, + "learning_rate": 1.1433578557574593e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27405888, + "step": 129865 + }, + { + "epoch": 14.287128712871286, + "grad_norm": 0.021263444796204567, + "learning_rate": 1.1431562672445593e-05, + "loss": 0.0886, + "num_input_tokens_seen": 27406944, + "step": 129870 + }, + { + "epoch": 14.287678767876788, + "grad_norm": 0.03758486360311508, + "learning_rate": 1.1429546912370376e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27407968, + "step": 129875 + }, + { + "epoch": 14.288228822882289, + "grad_norm": 3.2665491104125977, + "learning_rate": 1.1427531277367503e-05, + "loss": 0.0834, + "num_input_tokens_seen": 27409024, + "step": 129880 + }, + { + "epoch": 14.288778877887788, + "grad_norm": 0.5269120931625366, + "learning_rate": 1.1425515767455574e-05, + "loss": 0.0042, + "num_input_tokens_seen": 27410144, + "step": 129885 + }, + { + "epoch": 14.289328932893289, + "grad_norm": 0.03218044713139534, + "learning_rate": 1.1423500382653151e-05, + "loss": 0.0249, + "num_input_tokens_seen": 27411200, + "step": 129890 + }, + { + "epoch": 14.28987898789879, + "grad_norm": 1.2696597576141357, + "learning_rate": 1.1421485122978792e-05, + "loss": 0.0629, + "num_input_tokens_seen": 27412256, + "step": 129895 + }, + { + "epoch": 14.290429042904291, + "grad_norm": 0.024709951132535934, + "learning_rate": 1.141946998845111e-05, + "loss": 0.0246, + "num_input_tokens_seen": 27413248, + "step": 129900 + }, + { + "epoch": 14.29097909790979, + "grad_norm": 0.07249895483255386, + "learning_rate": 1.1417454979088643e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27414304, + "step": 129905 + }, + { + "epoch": 14.291529152915292, + "grad_norm": 0.018579985946416855, + "learning_rate": 1.1415440094909988e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27415392, + "step": 129910 + }, + { + "epoch": 14.292079207920793, + "grad_norm": 0.17122307419776917, + "learning_rate": 1.1413425335933702e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27416416, + "step": 129915 + }, + { + "epoch": 14.292629262926292, + "grad_norm": 0.14415033161640167, + "learning_rate": 1.1411410702178346e-05, + "loss": 0.0085, + "num_input_tokens_seen": 27417504, + "step": 129920 + }, + { + "epoch": 14.293179317931793, + "grad_norm": 0.06045408546924591, + "learning_rate": 1.1409396193662498e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27418528, + "step": 129925 + }, + { + "epoch": 14.293729372937294, + "grad_norm": 0.11248759180307388, + "learning_rate": 1.1407381810404732e-05, + "loss": 0.0104, + "num_input_tokens_seen": 27419648, + "step": 129930 + }, + { + "epoch": 14.294279427942794, + "grad_norm": 0.20276014506816864, + "learning_rate": 1.1405367552423596e-05, + "loss": 0.0034, + "num_input_tokens_seen": 27420704, + "step": 129935 + }, + { + "epoch": 14.294829482948295, + "grad_norm": 0.9509342312812805, + "learning_rate": 1.1403353419737675e-05, + "loss": 0.0126, + "num_input_tokens_seen": 27421824, + "step": 129940 + }, + { + "epoch": 14.295379537953796, + "grad_norm": 0.037221092730760574, + "learning_rate": 1.140133941236551e-05, + "loss": 0.0022, + "num_input_tokens_seen": 27422976, + "step": 129945 + }, + { + "epoch": 14.295929592959295, + "grad_norm": 0.018926935270428658, + "learning_rate": 1.1399325530325678e-05, + "loss": 0.0643, + "num_input_tokens_seen": 27424000, + "step": 129950 + }, + { + "epoch": 14.296479647964796, + "grad_norm": 0.11517360806465149, + "learning_rate": 1.139731177363674e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27425024, + "step": 129955 + }, + { + "epoch": 14.297029702970297, + "grad_norm": 0.034124985337257385, + "learning_rate": 1.1395298142317248e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27426080, + "step": 129960 + }, + { + "epoch": 14.297579757975798, + "grad_norm": 0.18532998859882355, + "learning_rate": 1.139328463638577e-05, + "loss": 0.0099, + "num_input_tokens_seen": 27427200, + "step": 129965 + }, + { + "epoch": 14.298129812981298, + "grad_norm": 2.617558479309082, + "learning_rate": 1.1391271255860853e-05, + "loss": 0.0449, + "num_input_tokens_seen": 27428288, + "step": 129970 + }, + { + "epoch": 14.298679867986799, + "grad_norm": 2.049403429031372, + "learning_rate": 1.1389258000761066e-05, + "loss": 0.0489, + "num_input_tokens_seen": 27429376, + "step": 129975 + }, + { + "epoch": 14.2992299229923, + "grad_norm": 0.005302551202476025, + "learning_rate": 1.1387244871104946e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27430464, + "step": 129980 + }, + { + "epoch": 14.2997799779978, + "grad_norm": 0.022811446338891983, + "learning_rate": 1.1385231866911061e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27431584, + "step": 129985 + }, + { + "epoch": 14.3003300330033, + "grad_norm": 0.046185169368982315, + "learning_rate": 1.138321898819797e-05, + "loss": 0.0505, + "num_input_tokens_seen": 27432608, + "step": 129990 + }, + { + "epoch": 14.300880088008801, + "grad_norm": 1.068656086921692, + "learning_rate": 1.1381206234984204e-05, + "loss": 0.2808, + "num_input_tokens_seen": 27433632, + "step": 129995 + }, + { + "epoch": 14.3014301430143, + "grad_norm": 0.02345147170126438, + "learning_rate": 1.1379193607288336e-05, + "loss": 0.0046, + "num_input_tokens_seen": 27434656, + "step": 130000 + }, + { + "epoch": 14.301980198019802, + "grad_norm": 0.007358849048614502, + "learning_rate": 1.1377181105128903e-05, + "loss": 0.008, + "num_input_tokens_seen": 27435712, + "step": 130005 + }, + { + "epoch": 14.302530253025303, + "grad_norm": 1.9095473289489746, + "learning_rate": 1.1375168728524439e-05, + "loss": 0.0395, + "num_input_tokens_seen": 27436704, + "step": 130010 + }, + { + "epoch": 14.303080308030804, + "grad_norm": 0.8147926330566406, + "learning_rate": 1.1373156477493524e-05, + "loss": 0.0114, + "num_input_tokens_seen": 27437728, + "step": 130015 + }, + { + "epoch": 14.303630363036303, + "grad_norm": 0.2766634225845337, + "learning_rate": 1.1371144352054678e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27438784, + "step": 130020 + }, + { + "epoch": 14.304180418041804, + "grad_norm": 0.018349386751651764, + "learning_rate": 1.1369132352226464e-05, + "loss": 0.0964, + "num_input_tokens_seen": 27439840, + "step": 130025 + }, + { + "epoch": 14.304730473047305, + "grad_norm": 2.3105480670928955, + "learning_rate": 1.1367120478027416e-05, + "loss": 0.1104, + "num_input_tokens_seen": 27440864, + "step": 130030 + }, + { + "epoch": 14.305280528052805, + "grad_norm": 0.03217384219169617, + "learning_rate": 1.1365108729476068e-05, + "loss": 0.0014, + "num_input_tokens_seen": 27441888, + "step": 130035 + }, + { + "epoch": 14.305830583058306, + "grad_norm": 0.04022323712706566, + "learning_rate": 1.136309710659097e-05, + "loss": 0.016, + "num_input_tokens_seen": 27442944, + "step": 130040 + }, + { + "epoch": 14.306380638063807, + "grad_norm": 0.02440205216407776, + "learning_rate": 1.1361085609390662e-05, + "loss": 0.0025, + "num_input_tokens_seen": 27444000, + "step": 130045 + }, + { + "epoch": 14.306930693069306, + "grad_norm": 0.17325431108474731, + "learning_rate": 1.1359074237893694e-05, + "loss": 0.0816, + "num_input_tokens_seen": 27445056, + "step": 130050 + }, + { + "epoch": 14.307480748074807, + "grad_norm": 0.4576801359653473, + "learning_rate": 1.1357062992118591e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27446176, + "step": 130055 + }, + { + "epoch": 14.308030803080309, + "grad_norm": 0.02478647418320179, + "learning_rate": 1.1355051872083886e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27447232, + "step": 130060 + }, + { + "epoch": 14.308580858085808, + "grad_norm": 0.03685392811894417, + "learning_rate": 1.1353040877808127e-05, + "loss": 0.002, + "num_input_tokens_seen": 27448256, + "step": 130065 + }, + { + "epoch": 14.309130913091309, + "grad_norm": 0.012461940757930279, + "learning_rate": 1.1351030009309824e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27449376, + "step": 130070 + }, + { + "epoch": 14.30968096809681, + "grad_norm": 0.034548211842775345, + "learning_rate": 1.1349019266607543e-05, + "loss": 0.0661, + "num_input_tokens_seen": 27450496, + "step": 130075 + }, + { + "epoch": 14.310231023102311, + "grad_norm": 3.4380784034729004, + "learning_rate": 1.1347008649719807e-05, + "loss": 0.1071, + "num_input_tokens_seen": 27451488, + "step": 130080 + }, + { + "epoch": 14.31078107810781, + "grad_norm": 0.19204501807689667, + "learning_rate": 1.134499815866513e-05, + "loss": 0.003, + "num_input_tokens_seen": 27452512, + "step": 130085 + }, + { + "epoch": 14.311331133113312, + "grad_norm": 0.049858737736940384, + "learning_rate": 1.134298779346206e-05, + "loss": 0.0529, + "num_input_tokens_seen": 27453600, + "step": 130090 + }, + { + "epoch": 14.311881188118813, + "grad_norm": 0.1898716241121292, + "learning_rate": 1.1340977554129109e-05, + "loss": 0.0188, + "num_input_tokens_seen": 27454688, + "step": 130095 + }, + { + "epoch": 14.312431243124312, + "grad_norm": 0.011455516330897808, + "learning_rate": 1.1338967440684814e-05, + "loss": 0.0028, + "num_input_tokens_seen": 27455744, + "step": 130100 + }, + { + "epoch": 14.312981298129813, + "grad_norm": 0.02714937925338745, + "learning_rate": 1.1336957453147711e-05, + "loss": 0.1043, + "num_input_tokens_seen": 27456800, + "step": 130105 + }, + { + "epoch": 14.313531353135314, + "grad_norm": 0.016352728009223938, + "learning_rate": 1.1334947591536305e-05, + "loss": 0.0197, + "num_input_tokens_seen": 27457760, + "step": 130110 + }, + { + "epoch": 14.314081408140813, + "grad_norm": 0.04432721436023712, + "learning_rate": 1.1332937855869142e-05, + "loss": 0.0229, + "num_input_tokens_seen": 27458752, + "step": 130115 + }, + { + "epoch": 14.314631463146315, + "grad_norm": 0.020486555993556976, + "learning_rate": 1.1330928246164729e-05, + "loss": 0.0311, + "num_input_tokens_seen": 27459808, + "step": 130120 + }, + { + "epoch": 14.315181518151816, + "grad_norm": 0.14066612720489502, + "learning_rate": 1.1328918762441581e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27460864, + "step": 130125 + }, + { + "epoch": 14.315731573157315, + "grad_norm": 0.04038260504603386, + "learning_rate": 1.1326909404718234e-05, + "loss": 0.0089, + "num_input_tokens_seen": 27461952, + "step": 130130 + }, + { + "epoch": 14.316281628162816, + "grad_norm": 0.018332013860344887, + "learning_rate": 1.1324900173013197e-05, + "loss": 0.0052, + "num_input_tokens_seen": 27462976, + "step": 130135 + }, + { + "epoch": 14.316831683168317, + "grad_norm": 0.04619354382157326, + "learning_rate": 1.1322891067345006e-05, + "loss": 0.0558, + "num_input_tokens_seen": 27464032, + "step": 130140 + }, + { + "epoch": 14.317381738173818, + "grad_norm": 0.02375875972211361, + "learning_rate": 1.1320882087732162e-05, + "loss": 0.0022, + "num_input_tokens_seen": 27465088, + "step": 130145 + }, + { + "epoch": 14.317931793179318, + "grad_norm": 0.01695285737514496, + "learning_rate": 1.1318873234193179e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27466176, + "step": 130150 + }, + { + "epoch": 14.318481848184819, + "grad_norm": 0.05439770966768265, + "learning_rate": 1.1316864506746571e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27467264, + "step": 130155 + }, + { + "epoch": 14.31903190319032, + "grad_norm": 0.02506212145090103, + "learning_rate": 1.1314855905410862e-05, + "loss": 0.0051, + "num_input_tokens_seen": 27468288, + "step": 130160 + }, + { + "epoch": 14.319581958195819, + "grad_norm": 0.021144114434719086, + "learning_rate": 1.1312847430204566e-05, + "loss": 0.0175, + "num_input_tokens_seen": 27469280, + "step": 130165 + }, + { + "epoch": 14.32013201320132, + "grad_norm": 0.05361330136656761, + "learning_rate": 1.1310839081146189e-05, + "loss": 0.057, + "num_input_tokens_seen": 27470368, + "step": 130170 + }, + { + "epoch": 14.320682068206821, + "grad_norm": 0.007880184799432755, + "learning_rate": 1.1308830858254228e-05, + "loss": 0.051, + "num_input_tokens_seen": 27471488, + "step": 130175 + }, + { + "epoch": 14.32123212321232, + "grad_norm": 0.3111788332462311, + "learning_rate": 1.1306822761547214e-05, + "loss": 0.0075, + "num_input_tokens_seen": 27472608, + "step": 130180 + }, + { + "epoch": 14.321782178217822, + "grad_norm": 0.009986608289182186, + "learning_rate": 1.1304814791043633e-05, + "loss": 0.0938, + "num_input_tokens_seen": 27473664, + "step": 130185 + }, + { + "epoch": 14.322332233223323, + "grad_norm": 0.005161338485777378, + "learning_rate": 1.1302806946762004e-05, + "loss": 0.0171, + "num_input_tokens_seen": 27474752, + "step": 130190 + }, + { + "epoch": 14.322882288228822, + "grad_norm": 0.061483047902584076, + "learning_rate": 1.130079922872084e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27475840, + "step": 130195 + }, + { + "epoch": 14.323432343234323, + "grad_norm": 0.14710023999214172, + "learning_rate": 1.1298791636938627e-05, + "loss": 0.0344, + "num_input_tokens_seen": 27476896, + "step": 130200 + }, + { + "epoch": 14.323982398239824, + "grad_norm": 0.058175791054964066, + "learning_rate": 1.1296784171433888e-05, + "loss": 0.0006, + "num_input_tokens_seen": 27478016, + "step": 130205 + }, + { + "epoch": 14.324532453245325, + "grad_norm": 0.30639421939849854, + "learning_rate": 1.1294776832225102e-05, + "loss": 0.0252, + "num_input_tokens_seen": 27479104, + "step": 130210 + }, + { + "epoch": 14.325082508250825, + "grad_norm": 0.11048613488674164, + "learning_rate": 1.1292769619330782e-05, + "loss": 0.0315, + "num_input_tokens_seen": 27480192, + "step": 130215 + }, + { + "epoch": 14.325632563256326, + "grad_norm": 3.62754487991333, + "learning_rate": 1.1290762532769436e-05, + "loss": 0.1298, + "num_input_tokens_seen": 27481216, + "step": 130220 + }, + { + "epoch": 14.326182618261827, + "grad_norm": 0.061687592417001724, + "learning_rate": 1.1288755572559545e-05, + "loss": 0.0065, + "num_input_tokens_seen": 27482368, + "step": 130225 + }, + { + "epoch": 14.326732673267326, + "grad_norm": 0.8454068303108215, + "learning_rate": 1.1286748738719623e-05, + "loss": 0.0728, + "num_input_tokens_seen": 27483424, + "step": 130230 + }, + { + "epoch": 14.327282728272827, + "grad_norm": 0.026015473529696465, + "learning_rate": 1.128474203126815e-05, + "loss": 0.1199, + "num_input_tokens_seen": 27484480, + "step": 130235 + }, + { + "epoch": 14.327832783278328, + "grad_norm": 0.006912867072969675, + "learning_rate": 1.1282735450223636e-05, + "loss": 0.0629, + "num_input_tokens_seen": 27485536, + "step": 130240 + }, + { + "epoch": 14.328382838283828, + "grad_norm": 0.27136048674583435, + "learning_rate": 1.128072899560456e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27486624, + "step": 130245 + }, + { + "epoch": 14.328932893289329, + "grad_norm": 0.1460942178964615, + "learning_rate": 1.127872266742942e-05, + "loss": 0.1081, + "num_input_tokens_seen": 27487648, + "step": 130250 + }, + { + "epoch": 14.32948294829483, + "grad_norm": 1.0197926759719849, + "learning_rate": 1.127671646571672e-05, + "loss": 0.0106, + "num_input_tokens_seen": 27488704, + "step": 130255 + }, + { + "epoch": 14.33003300330033, + "grad_norm": 0.03633682429790497, + "learning_rate": 1.1274710390484928e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27489792, + "step": 130260 + }, + { + "epoch": 14.33058305830583, + "grad_norm": 0.012790661305189133, + "learning_rate": 1.1272704441752551e-05, + "loss": 0.0055, + "num_input_tokens_seen": 27490880, + "step": 130265 + }, + { + "epoch": 14.331133113311331, + "grad_norm": 0.00985762756317854, + "learning_rate": 1.1270698619538065e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27491968, + "step": 130270 + }, + { + "epoch": 14.331683168316832, + "grad_norm": 0.04806233197450638, + "learning_rate": 1.1268692923859961e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27493056, + "step": 130275 + }, + { + "epoch": 14.332233223322332, + "grad_norm": 0.060731060802936554, + "learning_rate": 1.1266687354736735e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27494144, + "step": 130280 + }, + { + "epoch": 14.332783278327833, + "grad_norm": 0.09006427228450775, + "learning_rate": 1.1264681912186852e-05, + "loss": 0.0429, + "num_input_tokens_seen": 27495232, + "step": 130285 + }, + { + "epoch": 14.333333333333334, + "grad_norm": 0.009566489607095718, + "learning_rate": 1.1262676596228813e-05, + "loss": 0.089, + "num_input_tokens_seen": 27496320, + "step": 130290 + }, + { + "epoch": 14.333883388338833, + "grad_norm": 0.021460331976413727, + "learning_rate": 1.1260671406881093e-05, + "loss": 0.0073, + "num_input_tokens_seen": 27497312, + "step": 130295 + }, + { + "epoch": 14.334433443344334, + "grad_norm": 4.789574146270752, + "learning_rate": 1.1258666344162163e-05, + "loss": 0.1227, + "num_input_tokens_seen": 27498400, + "step": 130300 + }, + { + "epoch": 14.334983498349835, + "grad_norm": 0.06824765354394913, + "learning_rate": 1.1256661408090513e-05, + "loss": 0.0564, + "num_input_tokens_seen": 27499488, + "step": 130305 + }, + { + "epoch": 14.335533553355335, + "grad_norm": 0.22342120110988617, + "learning_rate": 1.1254656598684626e-05, + "loss": 0.0143, + "num_input_tokens_seen": 27500512, + "step": 130310 + }, + { + "epoch": 14.336083608360836, + "grad_norm": 0.04547138139605522, + "learning_rate": 1.1252651915962966e-05, + "loss": 0.0046, + "num_input_tokens_seen": 27501536, + "step": 130315 + }, + { + "epoch": 14.336633663366337, + "grad_norm": 0.042724672704935074, + "learning_rate": 1.1250647359944027e-05, + "loss": 0.0053, + "num_input_tokens_seen": 27502624, + "step": 130320 + }, + { + "epoch": 14.337183718371838, + "grad_norm": 0.009709128178656101, + "learning_rate": 1.1248642930646264e-05, + "loss": 0.0038, + "num_input_tokens_seen": 27503616, + "step": 130325 + }, + { + "epoch": 14.337733773377337, + "grad_norm": 0.01452453713864088, + "learning_rate": 1.124663862808816e-05, + "loss": 0.0029, + "num_input_tokens_seen": 27504608, + "step": 130330 + }, + { + "epoch": 14.338283828382838, + "grad_norm": 0.34016138315200806, + "learning_rate": 1.1244634452288196e-05, + "loss": 0.0051, + "num_input_tokens_seen": 27505664, + "step": 130335 + }, + { + "epoch": 14.33883388338834, + "grad_norm": 2.10148549079895, + "learning_rate": 1.1242630403264829e-05, + "loss": 0.1373, + "num_input_tokens_seen": 27506752, + "step": 130340 + }, + { + "epoch": 14.339383938393839, + "grad_norm": 1.4321430921554565, + "learning_rate": 1.1240626481036545e-05, + "loss": 0.0724, + "num_input_tokens_seen": 27507840, + "step": 130345 + }, + { + "epoch": 14.33993399339934, + "grad_norm": 2.623107671737671, + "learning_rate": 1.1238622685621794e-05, + "loss": 0.0276, + "num_input_tokens_seen": 27508864, + "step": 130350 + }, + { + "epoch": 14.340484048404841, + "grad_norm": 0.014977118000388145, + "learning_rate": 1.1236619017039066e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27509952, + "step": 130355 + }, + { + "epoch": 14.34103410341034, + "grad_norm": 0.00675663398578763, + "learning_rate": 1.1234615475306804e-05, + "loss": 0.0047, + "num_input_tokens_seen": 27511040, + "step": 130360 + }, + { + "epoch": 14.341584158415841, + "grad_norm": 0.010910918936133385, + "learning_rate": 1.1232612060443488e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27512128, + "step": 130365 + }, + { + "epoch": 14.342134213421343, + "grad_norm": 1.7880258560180664, + "learning_rate": 1.1230608772467587e-05, + "loss": 0.127, + "num_input_tokens_seen": 27513184, + "step": 130370 + }, + { + "epoch": 14.342684268426842, + "grad_norm": 0.01761956512928009, + "learning_rate": 1.122860561139755e-05, + "loss": 0.0173, + "num_input_tokens_seen": 27514208, + "step": 130375 + }, + { + "epoch": 14.343234323432343, + "grad_norm": 0.09717835485935211, + "learning_rate": 1.1226602577251858e-05, + "loss": 0.0298, + "num_input_tokens_seen": 27515296, + "step": 130380 + }, + { + "epoch": 14.343784378437844, + "grad_norm": 0.04724181815981865, + "learning_rate": 1.122459967004895e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27516352, + "step": 130385 + }, + { + "epoch": 14.344334433443345, + "grad_norm": 0.047234151512384415, + "learning_rate": 1.1222596889807297e-05, + "loss": 0.0489, + "num_input_tokens_seen": 27517376, + "step": 130390 + }, + { + "epoch": 14.344884488448844, + "grad_norm": 0.20474453270435333, + "learning_rate": 1.1220594236545364e-05, + "loss": 0.0395, + "num_input_tokens_seen": 27518400, + "step": 130395 + }, + { + "epoch": 14.345434543454346, + "grad_norm": 0.10238045454025269, + "learning_rate": 1.1218591710281596e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27519488, + "step": 130400 + }, + { + "epoch": 14.345984598459847, + "grad_norm": 0.13862073421478271, + "learning_rate": 1.1216589311034464e-05, + "loss": 0.0412, + "num_input_tokens_seen": 27520544, + "step": 130405 + }, + { + "epoch": 14.346534653465346, + "grad_norm": 0.08299267292022705, + "learning_rate": 1.1214587038822414e-05, + "loss": 0.1144, + "num_input_tokens_seen": 27521536, + "step": 130410 + }, + { + "epoch": 14.347084708470847, + "grad_norm": 1.7868525981903076, + "learning_rate": 1.1212584893663891e-05, + "loss": 0.1667, + "num_input_tokens_seen": 27522592, + "step": 130415 + }, + { + "epoch": 14.347634763476348, + "grad_norm": 0.032994505017995834, + "learning_rate": 1.1210582875577358e-05, + "loss": 0.0014, + "num_input_tokens_seen": 27523584, + "step": 130420 + }, + { + "epoch": 14.348184818481847, + "grad_norm": 0.6467168927192688, + "learning_rate": 1.1208580984581262e-05, + "loss": 0.0042, + "num_input_tokens_seen": 27524640, + "step": 130425 + }, + { + "epoch": 14.348734873487349, + "grad_norm": 0.4936743974685669, + "learning_rate": 1.1206579220694071e-05, + "loss": 0.0263, + "num_input_tokens_seen": 27525664, + "step": 130430 + }, + { + "epoch": 14.34928492849285, + "grad_norm": 0.792893648147583, + "learning_rate": 1.120457758393422e-05, + "loss": 0.0417, + "num_input_tokens_seen": 27526720, + "step": 130435 + }, + { + "epoch": 14.34983498349835, + "grad_norm": 0.0665220096707344, + "learning_rate": 1.1202576074320147e-05, + "loss": 0.0062, + "num_input_tokens_seen": 27527776, + "step": 130440 + }, + { + "epoch": 14.35038503850385, + "grad_norm": 0.1281730830669403, + "learning_rate": 1.120057469187031e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27528832, + "step": 130445 + }, + { + "epoch": 14.350935093509351, + "grad_norm": 0.02482527680695057, + "learning_rate": 1.1198573436603154e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27529856, + "step": 130450 + }, + { + "epoch": 14.351485148514852, + "grad_norm": 0.0038367784582078457, + "learning_rate": 1.1196572308537137e-05, + "loss": 0.0137, + "num_input_tokens_seen": 27530880, + "step": 130455 + }, + { + "epoch": 14.352035203520352, + "grad_norm": 0.34358206391334534, + "learning_rate": 1.1194571307690685e-05, + "loss": 0.0199, + "num_input_tokens_seen": 27531936, + "step": 130460 + }, + { + "epoch": 14.352585258525853, + "grad_norm": 0.0647466629743576, + "learning_rate": 1.1192570434082239e-05, + "loss": 0.0033, + "num_input_tokens_seen": 27533024, + "step": 130465 + }, + { + "epoch": 14.353135313531354, + "grad_norm": 3.857821464538574, + "learning_rate": 1.119056968773025e-05, + "loss": 0.0188, + "num_input_tokens_seen": 27534016, + "step": 130470 + }, + { + "epoch": 14.353685368536853, + "grad_norm": 0.12290018796920776, + "learning_rate": 1.1188569068653152e-05, + "loss": 0.0679, + "num_input_tokens_seen": 27535104, + "step": 130475 + }, + { + "epoch": 14.354235423542354, + "grad_norm": 0.3365606665611267, + "learning_rate": 1.118656857686938e-05, + "loss": 0.0062, + "num_input_tokens_seen": 27536192, + "step": 130480 + }, + { + "epoch": 14.354785478547855, + "grad_norm": 0.03535430505871773, + "learning_rate": 1.1184568212397386e-05, + "loss": 0.0447, + "num_input_tokens_seen": 27537216, + "step": 130485 + }, + { + "epoch": 14.355335533553355, + "grad_norm": 0.10571128875017166, + "learning_rate": 1.1182567975255592e-05, + "loss": 0.0045, + "num_input_tokens_seen": 27538304, + "step": 130490 + }, + { + "epoch": 14.355885588558856, + "grad_norm": 0.7565444111824036, + "learning_rate": 1.1180567865462447e-05, + "loss": 0.0194, + "num_input_tokens_seen": 27539456, + "step": 130495 + }, + { + "epoch": 14.356435643564357, + "grad_norm": 0.038210801780223846, + "learning_rate": 1.1178567883036364e-05, + "loss": 0.0096, + "num_input_tokens_seen": 27540544, + "step": 130500 + }, + { + "epoch": 14.356985698569858, + "grad_norm": 0.17700813710689545, + "learning_rate": 1.117656802799579e-05, + "loss": 0.0047, + "num_input_tokens_seen": 27541600, + "step": 130505 + }, + { + "epoch": 14.357535753575357, + "grad_norm": 0.005708036012947559, + "learning_rate": 1.1174568300359168e-05, + "loss": 0.0028, + "num_input_tokens_seen": 27542720, + "step": 130510 + }, + { + "epoch": 14.358085808580858, + "grad_norm": 0.015291880816221237, + "learning_rate": 1.1172568700144903e-05, + "loss": 0.0043, + "num_input_tokens_seen": 27543840, + "step": 130515 + }, + { + "epoch": 14.35863586358636, + "grad_norm": 0.0068084257654845715, + "learning_rate": 1.1170569227371449e-05, + "loss": 0.0025, + "num_input_tokens_seen": 27544928, + "step": 130520 + }, + { + "epoch": 14.359185918591859, + "grad_norm": 1.0631979703903198, + "learning_rate": 1.116856988205722e-05, + "loss": 0.0167, + "num_input_tokens_seen": 27546048, + "step": 130525 + }, + { + "epoch": 14.35973597359736, + "grad_norm": 0.0757886990904808, + "learning_rate": 1.1166570664220639e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27547072, + "step": 130530 + }, + { + "epoch": 14.36028602860286, + "grad_norm": 0.0153639642521739, + "learning_rate": 1.1164571573880137e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27548192, + "step": 130535 + }, + { + "epoch": 14.36083608360836, + "grad_norm": 0.30474480986595154, + "learning_rate": 1.116257261105414e-05, + "loss": 0.003, + "num_input_tokens_seen": 27549184, + "step": 130540 + }, + { + "epoch": 14.361386138613861, + "grad_norm": 2.324605703353882, + "learning_rate": 1.116057377576108e-05, + "loss": 0.0557, + "num_input_tokens_seen": 27550240, + "step": 130545 + }, + { + "epoch": 14.361936193619362, + "grad_norm": 3.7553045749664307, + "learning_rate": 1.115857506801937e-05, + "loss": 0.052, + "num_input_tokens_seen": 27551296, + "step": 130550 + }, + { + "epoch": 14.362486248624862, + "grad_norm": 1.35519278049469, + "learning_rate": 1.1156576487847423e-05, + "loss": 0.1638, + "num_input_tokens_seen": 27552320, + "step": 130555 + }, + { + "epoch": 14.363036303630363, + "grad_norm": 0.011214442551136017, + "learning_rate": 1.1154578035263669e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27553408, + "step": 130560 + }, + { + "epoch": 14.363586358635864, + "grad_norm": 0.021975314244627953, + "learning_rate": 1.1152579710286527e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27554432, + "step": 130565 + }, + { + "epoch": 14.364136413641365, + "grad_norm": 0.5453773140907288, + "learning_rate": 1.1150581512934419e-05, + "loss": 0.0269, + "num_input_tokens_seen": 27555520, + "step": 130570 + }, + { + "epoch": 14.364686468646864, + "grad_norm": 0.09257366508245468, + "learning_rate": 1.1148583443225755e-05, + "loss": 0.0393, + "num_input_tokens_seen": 27556576, + "step": 130575 + }, + { + "epoch": 14.365236523652365, + "grad_norm": 0.0385407991707325, + "learning_rate": 1.1146585501178944e-05, + "loss": 0.004, + "num_input_tokens_seen": 27557600, + "step": 130580 + }, + { + "epoch": 14.365786578657866, + "grad_norm": 0.036180831491947174, + "learning_rate": 1.1144587686812416e-05, + "loss": 0.0891, + "num_input_tokens_seen": 27558656, + "step": 130585 + }, + { + "epoch": 14.366336633663366, + "grad_norm": 2.5719053745269775, + "learning_rate": 1.1142590000144565e-05, + "loss": 0.1734, + "num_input_tokens_seen": 27559712, + "step": 130590 + }, + { + "epoch": 14.366886688668867, + "grad_norm": 0.012022258713841438, + "learning_rate": 1.1140592441193815e-05, + "loss": 0.0056, + "num_input_tokens_seen": 27560800, + "step": 130595 + }, + { + "epoch": 14.367436743674368, + "grad_norm": 0.08350289613008499, + "learning_rate": 1.113859500997858e-05, + "loss": 0.0019, + "num_input_tokens_seen": 27561792, + "step": 130600 + }, + { + "epoch": 14.367986798679867, + "grad_norm": 0.0019358732970431447, + "learning_rate": 1.1136597706517255e-05, + "loss": 0.0085, + "num_input_tokens_seen": 27562816, + "step": 130605 + }, + { + "epoch": 14.368536853685368, + "grad_norm": 0.07172267884016037, + "learning_rate": 1.1134600530828265e-05, + "loss": 0.0381, + "num_input_tokens_seen": 27563872, + "step": 130610 + }, + { + "epoch": 14.36908690869087, + "grad_norm": 1.055338978767395, + "learning_rate": 1.1132603482930001e-05, + "loss": 0.0944, + "num_input_tokens_seen": 27564992, + "step": 130615 + }, + { + "epoch": 14.369636963696369, + "grad_norm": 0.1609012484550476, + "learning_rate": 1.1130606562840876e-05, + "loss": 0.1012, + "num_input_tokens_seen": 27566048, + "step": 130620 + }, + { + "epoch": 14.37018701870187, + "grad_norm": 0.29044729471206665, + "learning_rate": 1.1128609770579305e-05, + "loss": 0.0074, + "num_input_tokens_seen": 27567104, + "step": 130625 + }, + { + "epoch": 14.370737073707371, + "grad_norm": 0.03232819586992264, + "learning_rate": 1.1126613106163672e-05, + "loss": 0.0328, + "num_input_tokens_seen": 27568128, + "step": 130630 + }, + { + "epoch": 14.371287128712872, + "grad_norm": 0.01190542709082365, + "learning_rate": 1.1124616569612399e-05, + "loss": 0.0009, + "num_input_tokens_seen": 27569152, + "step": 130635 + }, + { + "epoch": 14.371837183718371, + "grad_norm": 1.709163784980774, + "learning_rate": 1.1122620160943867e-05, + "loss": 0.0178, + "num_input_tokens_seen": 27570208, + "step": 130640 + }, + { + "epoch": 14.372387238723872, + "grad_norm": 0.044246673583984375, + "learning_rate": 1.1120623880176496e-05, + "loss": 0.0183, + "num_input_tokens_seen": 27571264, + "step": 130645 + }, + { + "epoch": 14.372937293729374, + "grad_norm": 0.18918024003505707, + "learning_rate": 1.1118627727328662e-05, + "loss": 0.003, + "num_input_tokens_seen": 27572288, + "step": 130650 + }, + { + "epoch": 14.373487348734873, + "grad_norm": 0.18402732908725739, + "learning_rate": 1.1116631702418778e-05, + "loss": 0.0051, + "num_input_tokens_seen": 27573376, + "step": 130655 + }, + { + "epoch": 14.374037403740374, + "grad_norm": 0.20800432562828064, + "learning_rate": 1.1114635805465245e-05, + "loss": 0.1328, + "num_input_tokens_seen": 27574464, + "step": 130660 + }, + { + "epoch": 14.374587458745875, + "grad_norm": 0.1381668895483017, + "learning_rate": 1.1112640036486446e-05, + "loss": 0.0067, + "num_input_tokens_seen": 27575520, + "step": 130665 + }, + { + "epoch": 14.375137513751374, + "grad_norm": 0.14875493943691254, + "learning_rate": 1.1110644395500785e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27576576, + "step": 130670 + }, + { + "epoch": 14.375687568756875, + "grad_norm": 0.052273523062467575, + "learning_rate": 1.110864888252665e-05, + "loss": 0.0359, + "num_input_tokens_seen": 27577600, + "step": 130675 + }, + { + "epoch": 14.376237623762377, + "grad_norm": 2.1423423290252686, + "learning_rate": 1.1106653497582417e-05, + "loss": 0.0364, + "num_input_tokens_seen": 27578688, + "step": 130680 + }, + { + "epoch": 14.376787678767876, + "grad_norm": 0.07540032267570496, + "learning_rate": 1.110465824068651e-05, + "loss": 0.0373, + "num_input_tokens_seen": 27579776, + "step": 130685 + }, + { + "epoch": 14.377337733773377, + "grad_norm": 0.24481786787509918, + "learning_rate": 1.11026631118573e-05, + "loss": 0.0108, + "num_input_tokens_seen": 27580768, + "step": 130690 + }, + { + "epoch": 14.377887788778878, + "grad_norm": 0.10747560113668442, + "learning_rate": 1.1100668111113166e-05, + "loss": 0.0781, + "num_input_tokens_seen": 27581824, + "step": 130695 + }, + { + "epoch": 14.37843784378438, + "grad_norm": 0.0429050549864769, + "learning_rate": 1.1098673238472513e-05, + "loss": 0.0035, + "num_input_tokens_seen": 27582848, + "step": 130700 + }, + { + "epoch": 14.378987898789878, + "grad_norm": 0.01686183176934719, + "learning_rate": 1.1096678493953707e-05, + "loss": 0.0019, + "num_input_tokens_seen": 27583840, + "step": 130705 + }, + { + "epoch": 14.37953795379538, + "grad_norm": 0.17085328698158264, + "learning_rate": 1.1094683877575149e-05, + "loss": 0.0782, + "num_input_tokens_seen": 27584864, + "step": 130710 + }, + { + "epoch": 14.38008800880088, + "grad_norm": 0.08063940703868866, + "learning_rate": 1.1092689389355223e-05, + "loss": 0.0189, + "num_input_tokens_seen": 27585920, + "step": 130715 + }, + { + "epoch": 14.38063806380638, + "grad_norm": 0.08156893402338028, + "learning_rate": 1.1090695029312299e-05, + "loss": 0.0327, + "num_input_tokens_seen": 27586976, + "step": 130720 + }, + { + "epoch": 14.381188118811881, + "grad_norm": 0.19183339178562164, + "learning_rate": 1.1088700797464772e-05, + "loss": 0.0904, + "num_input_tokens_seen": 27588032, + "step": 130725 + }, + { + "epoch": 14.381738173817382, + "grad_norm": 0.028623413294553757, + "learning_rate": 1.1086706693831003e-05, + "loss": 0.02, + "num_input_tokens_seen": 27589056, + "step": 130730 + }, + { + "epoch": 14.382288228822881, + "grad_norm": 0.08630499243736267, + "learning_rate": 1.1084712718429385e-05, + "loss": 0.01, + "num_input_tokens_seen": 27590048, + "step": 130735 + }, + { + "epoch": 14.382838283828383, + "grad_norm": 1.3513386249542236, + "learning_rate": 1.10827188712783e-05, + "loss": 0.0193, + "num_input_tokens_seen": 27591136, + "step": 130740 + }, + { + "epoch": 14.383388338833884, + "grad_norm": 0.02688094601035118, + "learning_rate": 1.108072515239611e-05, + "loss": 0.006, + "num_input_tokens_seen": 27592128, + "step": 130745 + }, + { + "epoch": 14.383938393839385, + "grad_norm": 0.3257489502429962, + "learning_rate": 1.1078731561801203e-05, + "loss": 0.0064, + "num_input_tokens_seen": 27593152, + "step": 130750 + }, + { + "epoch": 14.384488448844884, + "grad_norm": 0.005758095532655716, + "learning_rate": 1.107673809951194e-05, + "loss": 0.0153, + "num_input_tokens_seen": 27594176, + "step": 130755 + }, + { + "epoch": 14.385038503850385, + "grad_norm": 0.025129372254014015, + "learning_rate": 1.1074744765546714e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27595200, + "step": 130760 + }, + { + "epoch": 14.385588558855886, + "grad_norm": 0.03053436055779457, + "learning_rate": 1.1072751559923868e-05, + "loss": 0.0009, + "num_input_tokens_seen": 27596256, + "step": 130765 + }, + { + "epoch": 14.386138613861386, + "grad_norm": 0.02521321177482605, + "learning_rate": 1.1070758482661792e-05, + "loss": 0.0102, + "num_input_tokens_seen": 27597344, + "step": 130770 + }, + { + "epoch": 14.386688668866887, + "grad_norm": 0.004086192697286606, + "learning_rate": 1.1068765533778858e-05, + "loss": 0.072, + "num_input_tokens_seen": 27598432, + "step": 130775 + }, + { + "epoch": 14.387238723872388, + "grad_norm": 0.7319605350494385, + "learning_rate": 1.1066772713293418e-05, + "loss": 0.0279, + "num_input_tokens_seen": 27599424, + "step": 130780 + }, + { + "epoch": 14.387788778877887, + "grad_norm": 1.2182562351226807, + "learning_rate": 1.1064780021223859e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27600480, + "step": 130785 + }, + { + "epoch": 14.388338833883388, + "grad_norm": 0.2881034314632416, + "learning_rate": 1.1062787457588536e-05, + "loss": 0.0033, + "num_input_tokens_seen": 27601504, + "step": 130790 + }, + { + "epoch": 14.38888888888889, + "grad_norm": 0.011768513359129429, + "learning_rate": 1.1060795022405792e-05, + "loss": 0.0502, + "num_input_tokens_seen": 27602496, + "step": 130795 + }, + { + "epoch": 14.389438943894389, + "grad_norm": 0.03905494511127472, + "learning_rate": 1.1058802715694033e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27603616, + "step": 130800 + }, + { + "epoch": 14.38998899889989, + "grad_norm": 0.04279734194278717, + "learning_rate": 1.105681053747159e-05, + "loss": 0.0028, + "num_input_tokens_seen": 27604672, + "step": 130805 + }, + { + "epoch": 14.39053905390539, + "grad_norm": 0.02382841520011425, + "learning_rate": 1.105481848775684e-05, + "loss": 0.0276, + "num_input_tokens_seen": 27605728, + "step": 130810 + }, + { + "epoch": 14.391089108910892, + "grad_norm": 0.16318176686763763, + "learning_rate": 1.1052826566568143e-05, + "loss": 0.0103, + "num_input_tokens_seen": 27606816, + "step": 130815 + }, + { + "epoch": 14.391639163916391, + "grad_norm": 0.06682950258255005, + "learning_rate": 1.1050834773923835e-05, + "loss": 0.003, + "num_input_tokens_seen": 27607904, + "step": 130820 + }, + { + "epoch": 14.392189218921892, + "grad_norm": 0.026158230379223824, + "learning_rate": 1.1048843109842292e-05, + "loss": 0.0262, + "num_input_tokens_seen": 27608928, + "step": 130825 + }, + { + "epoch": 14.392739273927393, + "grad_norm": 0.09581063687801361, + "learning_rate": 1.1046851574341868e-05, + "loss": 0.1049, + "num_input_tokens_seen": 27609952, + "step": 130830 + }, + { + "epoch": 14.393289328932893, + "grad_norm": 0.08159670233726501, + "learning_rate": 1.1044860167440926e-05, + "loss": 0.0049, + "num_input_tokens_seen": 27610944, + "step": 130835 + }, + { + "epoch": 14.393839383938394, + "grad_norm": 0.015140837989747524, + "learning_rate": 1.1042868889157811e-05, + "loss": 0.0315, + "num_input_tokens_seen": 27611968, + "step": 130840 + }, + { + "epoch": 14.394389438943895, + "grad_norm": 0.28719136118888855, + "learning_rate": 1.1040877739510868e-05, + "loss": 0.0178, + "num_input_tokens_seen": 27613024, + "step": 130845 + }, + { + "epoch": 14.394939493949394, + "grad_norm": 0.021670246496796608, + "learning_rate": 1.1038886718518468e-05, + "loss": 0.0148, + "num_input_tokens_seen": 27614112, + "step": 130850 + }, + { + "epoch": 14.395489548954895, + "grad_norm": 0.03893790766596794, + "learning_rate": 1.1036895826198929e-05, + "loss": 0.0115, + "num_input_tokens_seen": 27615168, + "step": 130855 + }, + { + "epoch": 14.396039603960396, + "grad_norm": 0.02919410727918148, + "learning_rate": 1.103490506257064e-05, + "loss": 0.0075, + "num_input_tokens_seen": 27616224, + "step": 130860 + }, + { + "epoch": 14.396589658965897, + "grad_norm": 0.5164393782615662, + "learning_rate": 1.103291442765193e-05, + "loss": 0.0215, + "num_input_tokens_seen": 27617312, + "step": 130865 + }, + { + "epoch": 14.397139713971397, + "grad_norm": 0.5129583477973938, + "learning_rate": 1.1030923921461137e-05, + "loss": 0.0046, + "num_input_tokens_seen": 27618336, + "step": 130870 + }, + { + "epoch": 14.397689768976898, + "grad_norm": 0.008972112089395523, + "learning_rate": 1.1028933544016623e-05, + "loss": 0.1055, + "num_input_tokens_seen": 27619360, + "step": 130875 + }, + { + "epoch": 14.398239823982399, + "grad_norm": 0.6911659240722656, + "learning_rate": 1.1026943295336715e-05, + "loss": 0.0442, + "num_input_tokens_seen": 27620448, + "step": 130880 + }, + { + "epoch": 14.398789878987898, + "grad_norm": 0.08614882826805115, + "learning_rate": 1.1024953175439768e-05, + "loss": 0.0423, + "num_input_tokens_seen": 27621440, + "step": 130885 + }, + { + "epoch": 14.3993399339934, + "grad_norm": 0.010630601085722446, + "learning_rate": 1.1022963184344129e-05, + "loss": 0.009, + "num_input_tokens_seen": 27622528, + "step": 130890 + }, + { + "epoch": 14.3998899889989, + "grad_norm": 0.0076812272891402245, + "learning_rate": 1.102097332206812e-05, + "loss": 0.0662, + "num_input_tokens_seen": 27623552, + "step": 130895 + }, + { + "epoch": 14.4004400440044, + "grad_norm": 3.2307987213134766, + "learning_rate": 1.1018983588630104e-05, + "loss": 0.148, + "num_input_tokens_seen": 27624640, + "step": 130900 + }, + { + "epoch": 14.400990099009901, + "grad_norm": 0.008132151328027248, + "learning_rate": 1.101699398404841e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27625664, + "step": 130905 + }, + { + "epoch": 14.401540154015402, + "grad_norm": 2.045701503753662, + "learning_rate": 1.1015004508341361e-05, + "loss": 0.1002, + "num_input_tokens_seen": 27626688, + "step": 130910 + }, + { + "epoch": 14.402090209020901, + "grad_norm": 0.01619936153292656, + "learning_rate": 1.1013015161527305e-05, + "loss": 0.1134, + "num_input_tokens_seen": 27627712, + "step": 130915 + }, + { + "epoch": 14.402640264026402, + "grad_norm": 0.4272644817829132, + "learning_rate": 1.1011025943624575e-05, + "loss": 0.0122, + "num_input_tokens_seen": 27628768, + "step": 130920 + }, + { + "epoch": 14.403190319031903, + "grad_norm": 0.0026484804693609476, + "learning_rate": 1.100903685465152e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27629792, + "step": 130925 + }, + { + "epoch": 14.403740374037405, + "grad_norm": 0.006408711429685354, + "learning_rate": 1.1007047894626455e-05, + "loss": 0.004, + "num_input_tokens_seen": 27630848, + "step": 130930 + }, + { + "epoch": 14.404290429042904, + "grad_norm": 0.026723375543951988, + "learning_rate": 1.1005059063567707e-05, + "loss": 0.0713, + "num_input_tokens_seen": 27631872, + "step": 130935 + }, + { + "epoch": 14.404840484048405, + "grad_norm": 0.010069698095321655, + "learning_rate": 1.1003070361493614e-05, + "loss": 0.1084, + "num_input_tokens_seen": 27632928, + "step": 130940 + }, + { + "epoch": 14.405390539053906, + "grad_norm": 0.313303142786026, + "learning_rate": 1.1001081788422504e-05, + "loss": 0.0062, + "num_input_tokens_seen": 27633888, + "step": 130945 + }, + { + "epoch": 14.405940594059405, + "grad_norm": 0.6379001140594482, + "learning_rate": 1.0999093344372716e-05, + "loss": 0.0672, + "num_input_tokens_seen": 27634912, + "step": 130950 + }, + { + "epoch": 14.406490649064907, + "grad_norm": 0.019449109211564064, + "learning_rate": 1.0997105029362567e-05, + "loss": 0.0773, + "num_input_tokens_seen": 27635968, + "step": 130955 + }, + { + "epoch": 14.407040704070408, + "grad_norm": 0.010408961214125156, + "learning_rate": 1.0995116843410372e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27636992, + "step": 130960 + }, + { + "epoch": 14.407590759075907, + "grad_norm": 0.5258015394210815, + "learning_rate": 1.0993128786534474e-05, + "loss": 0.008, + "num_input_tokens_seen": 27637952, + "step": 130965 + }, + { + "epoch": 14.408140814081408, + "grad_norm": 1.7011440992355347, + "learning_rate": 1.0991140858753176e-05, + "loss": 0.0819, + "num_input_tokens_seen": 27639040, + "step": 130970 + }, + { + "epoch": 14.408690869086909, + "grad_norm": 0.21188093721866608, + "learning_rate": 1.0989153060084814e-05, + "loss": 0.0131, + "num_input_tokens_seen": 27640096, + "step": 130975 + }, + { + "epoch": 14.409240924092408, + "grad_norm": 0.02150804176926613, + "learning_rate": 1.0987165390547713e-05, + "loss": 0.0248, + "num_input_tokens_seen": 27641152, + "step": 130980 + }, + { + "epoch": 14.40979097909791, + "grad_norm": 0.13849137723445892, + "learning_rate": 1.0985177850160174e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27642176, + "step": 130985 + }, + { + "epoch": 14.41034103410341, + "grad_norm": 0.22526417672634125, + "learning_rate": 1.0983190438940538e-05, + "loss": 0.0073, + "num_input_tokens_seen": 27643296, + "step": 130990 + }, + { + "epoch": 14.410891089108912, + "grad_norm": 9.046173095703125, + "learning_rate": 1.0981203156907096e-05, + "loss": 0.0202, + "num_input_tokens_seen": 27644384, + "step": 130995 + }, + { + "epoch": 14.411441144114411, + "grad_norm": 1.6695796251296997, + "learning_rate": 1.0979216004078183e-05, + "loss": 0.0262, + "num_input_tokens_seen": 27645408, + "step": 131000 + }, + { + "epoch": 14.411991199119912, + "grad_norm": 0.014817187562584877, + "learning_rate": 1.0977228980472114e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27646528, + "step": 131005 + }, + { + "epoch": 14.412541254125413, + "grad_norm": 0.08973374962806702, + "learning_rate": 1.0975242086107187e-05, + "loss": 0.0623, + "num_input_tokens_seen": 27647520, + "step": 131010 + }, + { + "epoch": 14.413091309130913, + "grad_norm": 1.80584716796875, + "learning_rate": 1.0973255321001735e-05, + "loss": 0.1223, + "num_input_tokens_seen": 27648608, + "step": 131015 + }, + { + "epoch": 14.413641364136414, + "grad_norm": 0.03548888862133026, + "learning_rate": 1.0971268685174048e-05, + "loss": 0.001, + "num_input_tokens_seen": 27649600, + "step": 131020 + }, + { + "epoch": 14.414191419141915, + "grad_norm": 0.017864610999822617, + "learning_rate": 1.0969282178642456e-05, + "loss": 0.0833, + "num_input_tokens_seen": 27650656, + "step": 131025 + }, + { + "epoch": 14.414741474147414, + "grad_norm": 0.06458929181098938, + "learning_rate": 1.0967295801425248e-05, + "loss": 0.037, + "num_input_tokens_seen": 27651680, + "step": 131030 + }, + { + "epoch": 14.415291529152915, + "grad_norm": 1.3154011964797974, + "learning_rate": 1.096530955354074e-05, + "loss": 0.0206, + "num_input_tokens_seen": 27652736, + "step": 131035 + }, + { + "epoch": 14.415841584158416, + "grad_norm": 0.014261496253311634, + "learning_rate": 1.096332343500725e-05, + "loss": 0.078, + "num_input_tokens_seen": 27653792, + "step": 131040 + }, + { + "epoch": 14.416391639163916, + "grad_norm": 0.6146656274795532, + "learning_rate": 1.0961337445843062e-05, + "loss": 0.0064, + "num_input_tokens_seen": 27654816, + "step": 131045 + }, + { + "epoch": 14.416941694169417, + "grad_norm": 0.5545271039009094, + "learning_rate": 1.0959351586066499e-05, + "loss": 0.0261, + "num_input_tokens_seen": 27655872, + "step": 131050 + }, + { + "epoch": 14.417491749174918, + "grad_norm": 0.03372564539313316, + "learning_rate": 1.0957365855695847e-05, + "loss": 0.0072, + "num_input_tokens_seen": 27656928, + "step": 131055 + }, + { + "epoch": 14.418041804180419, + "grad_norm": 0.004844787996262312, + "learning_rate": 1.0955380254749417e-05, + "loss": 0.028, + "num_input_tokens_seen": 27657952, + "step": 131060 + }, + { + "epoch": 14.418591859185918, + "grad_norm": 0.01011961791664362, + "learning_rate": 1.0953394783245516e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27659072, + "step": 131065 + }, + { + "epoch": 14.41914191419142, + "grad_norm": 1.3596417903900146, + "learning_rate": 1.0951409441202426e-05, + "loss": 0.0103, + "num_input_tokens_seen": 27660160, + "step": 131070 + }, + { + "epoch": 14.41969196919692, + "grad_norm": 0.18211722373962402, + "learning_rate": 1.0949424228638461e-05, + "loss": 0.0499, + "num_input_tokens_seen": 27661216, + "step": 131075 + }, + { + "epoch": 14.42024202420242, + "grad_norm": 0.040942735970020294, + "learning_rate": 1.0947439145571914e-05, + "loss": 0.0127, + "num_input_tokens_seen": 27662176, + "step": 131080 + }, + { + "epoch": 14.42079207920792, + "grad_norm": 2.441025495529175, + "learning_rate": 1.0945454192021064e-05, + "loss": 0.0657, + "num_input_tokens_seen": 27663232, + "step": 131085 + }, + { + "epoch": 14.421342134213422, + "grad_norm": 0.008834844455122948, + "learning_rate": 1.0943469368004223e-05, + "loss": 0.001, + "num_input_tokens_seen": 27664288, + "step": 131090 + }, + { + "epoch": 14.421892189218921, + "grad_norm": 0.02189972810447216, + "learning_rate": 1.0941484673539686e-05, + "loss": 0.0123, + "num_input_tokens_seen": 27665408, + "step": 131095 + }, + { + "epoch": 14.422442244224422, + "grad_norm": 0.014172851108014584, + "learning_rate": 1.0939500108645734e-05, + "loss": 0.0198, + "num_input_tokens_seen": 27666496, + "step": 131100 + }, + { + "epoch": 14.422992299229923, + "grad_norm": 0.027957728132605553, + "learning_rate": 1.0937515673340667e-05, + "loss": 0.0035, + "num_input_tokens_seen": 27667584, + "step": 131105 + }, + { + "epoch": 14.423542354235423, + "grad_norm": 0.7256855368614197, + "learning_rate": 1.0935531367642762e-05, + "loss": 0.0114, + "num_input_tokens_seen": 27668640, + "step": 131110 + }, + { + "epoch": 14.424092409240924, + "grad_norm": 0.9435961842536926, + "learning_rate": 1.0933547191570318e-05, + "loss": 0.0061, + "num_input_tokens_seen": 27669760, + "step": 131115 + }, + { + "epoch": 14.424642464246425, + "grad_norm": 1.6601309776306152, + "learning_rate": 1.0931563145141627e-05, + "loss": 0.0546, + "num_input_tokens_seen": 27670752, + "step": 131120 + }, + { + "epoch": 14.425192519251926, + "grad_norm": 0.01244032010436058, + "learning_rate": 1.0929579228374959e-05, + "loss": 0.0043, + "num_input_tokens_seen": 27671840, + "step": 131125 + }, + { + "epoch": 14.425742574257425, + "grad_norm": 0.013337560929358006, + "learning_rate": 1.0927595441288619e-05, + "loss": 0.1421, + "num_input_tokens_seen": 27672928, + "step": 131130 + }, + { + "epoch": 14.426292629262926, + "grad_norm": 0.7852559089660645, + "learning_rate": 1.0925611783900865e-05, + "loss": 0.0288, + "num_input_tokens_seen": 27674016, + "step": 131135 + }, + { + "epoch": 14.426842684268427, + "grad_norm": 0.16599157452583313, + "learning_rate": 1.0923628256230006e-05, + "loss": 0.018, + "num_input_tokens_seen": 27675072, + "step": 131140 + }, + { + "epoch": 14.427392739273927, + "grad_norm": 0.04961154982447624, + "learning_rate": 1.0921644858294302e-05, + "loss": 0.0472, + "num_input_tokens_seen": 27676096, + "step": 131145 + }, + { + "epoch": 14.427942794279428, + "grad_norm": 0.05791082605719566, + "learning_rate": 1.091966159011204e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27677152, + "step": 131150 + }, + { + "epoch": 14.428492849284929, + "grad_norm": 0.03988814726471901, + "learning_rate": 1.0917678451701512e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27678272, + "step": 131155 + }, + { + "epoch": 14.429042904290428, + "grad_norm": 0.1658851057291031, + "learning_rate": 1.0915695443080976e-05, + "loss": 0.1396, + "num_input_tokens_seen": 27679328, + "step": 131160 + }, + { + "epoch": 14.42959295929593, + "grad_norm": 0.11734143644571304, + "learning_rate": 1.0913712564268724e-05, + "loss": 0.1842, + "num_input_tokens_seen": 27680384, + "step": 131165 + }, + { + "epoch": 14.43014301430143, + "grad_norm": 0.1386163830757141, + "learning_rate": 1.0911729815283015e-05, + "loss": 0.0422, + "num_input_tokens_seen": 27681440, + "step": 131170 + }, + { + "epoch": 14.430693069306932, + "grad_norm": 0.10089694708585739, + "learning_rate": 1.0909747196142131e-05, + "loss": 0.0073, + "num_input_tokens_seen": 27682528, + "step": 131175 + }, + { + "epoch": 14.43124312431243, + "grad_norm": 0.0384083054959774, + "learning_rate": 1.0907764706864357e-05, + "loss": 0.0481, + "num_input_tokens_seen": 27683584, + "step": 131180 + }, + { + "epoch": 14.431793179317932, + "grad_norm": 0.017815008759498596, + "learning_rate": 1.0905782347467944e-05, + "loss": 0.0417, + "num_input_tokens_seen": 27684608, + "step": 131185 + }, + { + "epoch": 14.432343234323433, + "grad_norm": 0.05693625658750534, + "learning_rate": 1.090380011797118e-05, + "loss": 0.1444, + "num_input_tokens_seen": 27685696, + "step": 131190 + }, + { + "epoch": 14.432893289328932, + "grad_norm": 0.12298166006803513, + "learning_rate": 1.090181801839233e-05, + "loss": 0.0073, + "num_input_tokens_seen": 27686688, + "step": 131195 + }, + { + "epoch": 14.433443344334433, + "grad_norm": 0.06123385578393936, + "learning_rate": 1.0899836048749645e-05, + "loss": 0.0017, + "num_input_tokens_seen": 27687776, + "step": 131200 + }, + { + "epoch": 14.433993399339935, + "grad_norm": 0.06640637665987015, + "learning_rate": 1.0897854209061406e-05, + "loss": 0.0033, + "num_input_tokens_seen": 27688832, + "step": 131205 + }, + { + "epoch": 14.434543454345434, + "grad_norm": 0.004181183874607086, + "learning_rate": 1.089587249934588e-05, + "loss": 0.0209, + "num_input_tokens_seen": 27689920, + "step": 131210 + }, + { + "epoch": 14.435093509350935, + "grad_norm": 0.5291531085968018, + "learning_rate": 1.0893890919621335e-05, + "loss": 0.0054, + "num_input_tokens_seen": 27690976, + "step": 131215 + }, + { + "epoch": 14.435643564356436, + "grad_norm": 0.25820255279541016, + "learning_rate": 1.0891909469906032e-05, + "loss": 0.0538, + "num_input_tokens_seen": 27692000, + "step": 131220 + }, + { + "epoch": 14.436193619361935, + "grad_norm": 0.006176529917865992, + "learning_rate": 1.088992815021822e-05, + "loss": 0.0397, + "num_input_tokens_seen": 27693120, + "step": 131225 + }, + { + "epoch": 14.436743674367436, + "grad_norm": 0.11640923470258713, + "learning_rate": 1.0887946960576168e-05, + "loss": 0.0056, + "num_input_tokens_seen": 27694176, + "step": 131230 + }, + { + "epoch": 14.437293729372938, + "grad_norm": 0.10512356460094452, + "learning_rate": 1.0885965900998138e-05, + "loss": 0.0022, + "num_input_tokens_seen": 27695232, + "step": 131235 + }, + { + "epoch": 14.437843784378439, + "grad_norm": 3.0715208053588867, + "learning_rate": 1.0883984971502398e-05, + "loss": 0.0091, + "num_input_tokens_seen": 27696288, + "step": 131240 + }, + { + "epoch": 14.438393839383938, + "grad_norm": 0.03544662147760391, + "learning_rate": 1.0882004172107194e-05, + "loss": 0.0755, + "num_input_tokens_seen": 27697344, + "step": 131245 + }, + { + "epoch": 14.438943894389439, + "grad_norm": 0.010663226246833801, + "learning_rate": 1.0880023502830772e-05, + "loss": 0.0497, + "num_input_tokens_seen": 27698400, + "step": 131250 + }, + { + "epoch": 14.43949394939494, + "grad_norm": 0.04982074350118637, + "learning_rate": 1.087804296369141e-05, + "loss": 0.0117, + "num_input_tokens_seen": 27699456, + "step": 131255 + }, + { + "epoch": 14.44004400440044, + "grad_norm": 0.020600276067852974, + "learning_rate": 1.0876062554707339e-05, + "loss": 0.0278, + "num_input_tokens_seen": 27700576, + "step": 131260 + }, + { + "epoch": 14.44059405940594, + "grad_norm": 0.5162415504455566, + "learning_rate": 1.087408227589682e-05, + "loss": 0.0095, + "num_input_tokens_seen": 27701568, + "step": 131265 + }, + { + "epoch": 14.441144114411442, + "grad_norm": 0.013424600474536419, + "learning_rate": 1.0872102127278123e-05, + "loss": 0.0053, + "num_input_tokens_seen": 27702656, + "step": 131270 + }, + { + "epoch": 14.441694169416941, + "grad_norm": 0.09017351269721985, + "learning_rate": 1.0870122108869466e-05, + "loss": 0.0125, + "num_input_tokens_seen": 27703744, + "step": 131275 + }, + { + "epoch": 14.442244224422442, + "grad_norm": 0.03450101986527443, + "learning_rate": 1.0868142220689124e-05, + "loss": 0.0541, + "num_input_tokens_seen": 27704832, + "step": 131280 + }, + { + "epoch": 14.442794279427943, + "grad_norm": 0.11214909702539444, + "learning_rate": 1.0866162462755325e-05, + "loss": 0.0795, + "num_input_tokens_seen": 27705920, + "step": 131285 + }, + { + "epoch": 14.443344334433444, + "grad_norm": 0.016527866944670677, + "learning_rate": 1.0864182835086323e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27706976, + "step": 131290 + }, + { + "epoch": 14.443894389438944, + "grad_norm": 1.9001386165618896, + "learning_rate": 1.0862203337700375e-05, + "loss": 0.1682, + "num_input_tokens_seen": 27707968, + "step": 131295 + }, + { + "epoch": 14.444444444444445, + "grad_norm": 0.01101511251181364, + "learning_rate": 1.0860223970615708e-05, + "loss": 0.1252, + "num_input_tokens_seen": 27709024, + "step": 131300 + }, + { + "epoch": 14.444994499449946, + "grad_norm": 0.8777021765708923, + "learning_rate": 1.0858244733850578e-05, + "loss": 0.011, + "num_input_tokens_seen": 27710080, + "step": 131305 + }, + { + "epoch": 14.445544554455445, + "grad_norm": 3.607171058654785, + "learning_rate": 1.085626562742322e-05, + "loss": 0.0752, + "num_input_tokens_seen": 27711168, + "step": 131310 + }, + { + "epoch": 14.446094609460946, + "grad_norm": 0.019173800945281982, + "learning_rate": 1.0854286651351867e-05, + "loss": 0.0702, + "num_input_tokens_seen": 27712224, + "step": 131315 + }, + { + "epoch": 14.446644664466447, + "grad_norm": 0.00544061791151762, + "learning_rate": 1.0852307805654766e-05, + "loss": 0.048, + "num_input_tokens_seen": 27713280, + "step": 131320 + }, + { + "epoch": 14.447194719471947, + "grad_norm": 0.13304442167282104, + "learning_rate": 1.0850329090350156e-05, + "loss": 0.0066, + "num_input_tokens_seen": 27714368, + "step": 131325 + }, + { + "epoch": 14.447744774477448, + "grad_norm": 0.008471479639410973, + "learning_rate": 1.0848350505456282e-05, + "loss": 0.1592, + "num_input_tokens_seen": 27715424, + "step": 131330 + }, + { + "epoch": 14.448294829482949, + "grad_norm": 1.3461732864379883, + "learning_rate": 1.0846372050991372e-05, + "loss": 0.0406, + "num_input_tokens_seen": 27716416, + "step": 131335 + }, + { + "epoch": 14.448844884488448, + "grad_norm": 0.08552948385477066, + "learning_rate": 1.0844393726973645e-05, + "loss": 0.0502, + "num_input_tokens_seen": 27717408, + "step": 131340 + }, + { + "epoch": 14.44939493949395, + "grad_norm": 0.04133811593055725, + "learning_rate": 1.0842415533421354e-05, + "loss": 0.0056, + "num_input_tokens_seen": 27718368, + "step": 131345 + }, + { + "epoch": 14.44994499449945, + "grad_norm": 0.02304782159626484, + "learning_rate": 1.0840437470352723e-05, + "loss": 0.0047, + "num_input_tokens_seen": 27719392, + "step": 131350 + }, + { + "epoch": 14.450495049504951, + "grad_norm": 0.03910288214683533, + "learning_rate": 1.0838459537785995e-05, + "loss": 0.0119, + "num_input_tokens_seen": 27720448, + "step": 131355 + }, + { + "epoch": 14.45104510451045, + "grad_norm": 0.09116929024457932, + "learning_rate": 1.083648173573939e-05, + "loss": 0.0963, + "num_input_tokens_seen": 27721536, + "step": 131360 + }, + { + "epoch": 14.451595159515952, + "grad_norm": 1.3396724462509155, + "learning_rate": 1.0834504064231127e-05, + "loss": 0.1327, + "num_input_tokens_seen": 27722688, + "step": 131365 + }, + { + "epoch": 14.452145214521453, + "grad_norm": 0.20176969468593597, + "learning_rate": 1.0832526523279455e-05, + "loss": 0.0596, + "num_input_tokens_seen": 27723744, + "step": 131370 + }, + { + "epoch": 14.452695269526952, + "grad_norm": 0.06723655760288239, + "learning_rate": 1.0830549112902575e-05, + "loss": 0.002, + "num_input_tokens_seen": 27724832, + "step": 131375 + }, + { + "epoch": 14.453245324532453, + "grad_norm": 0.4426150918006897, + "learning_rate": 1.0828571833118726e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27725920, + "step": 131380 + }, + { + "epoch": 14.453795379537954, + "grad_norm": 0.05846283957362175, + "learning_rate": 1.082659468394614e-05, + "loss": 0.0126, + "num_input_tokens_seen": 27726976, + "step": 131385 + }, + { + "epoch": 14.454345434543454, + "grad_norm": 0.20596176385879517, + "learning_rate": 1.082461766540302e-05, + "loss": 0.0108, + "num_input_tokens_seen": 27727968, + "step": 131390 + }, + { + "epoch": 14.454895489548955, + "grad_norm": 0.02366500161588192, + "learning_rate": 1.0822640777507603e-05, + "loss": 0.0081, + "num_input_tokens_seen": 27728928, + "step": 131395 + }, + { + "epoch": 14.455445544554456, + "grad_norm": 0.04395447298884392, + "learning_rate": 1.0820664020278099e-05, + "loss": 0.0111, + "num_input_tokens_seen": 27730016, + "step": 131400 + }, + { + "epoch": 14.455995599559955, + "grad_norm": 0.4017828702926636, + "learning_rate": 1.0818687393732726e-05, + "loss": 0.0077, + "num_input_tokens_seen": 27731072, + "step": 131405 + }, + { + "epoch": 14.456545654565456, + "grad_norm": 2.2636239528656006, + "learning_rate": 1.081671089788972e-05, + "loss": 0.1013, + "num_input_tokens_seen": 27732160, + "step": 131410 + }, + { + "epoch": 14.457095709570957, + "grad_norm": 0.0774514302611351, + "learning_rate": 1.0814734532767274e-05, + "loss": 0.0039, + "num_input_tokens_seen": 27733216, + "step": 131415 + }, + { + "epoch": 14.457645764576458, + "grad_norm": 0.13750509917736053, + "learning_rate": 1.0812758298383621e-05, + "loss": 0.0095, + "num_input_tokens_seen": 27734304, + "step": 131420 + }, + { + "epoch": 14.458195819581958, + "grad_norm": 1.9427410364151, + "learning_rate": 1.0810782194756958e-05, + "loss": 0.0666, + "num_input_tokens_seen": 27735392, + "step": 131425 + }, + { + "epoch": 14.458745874587459, + "grad_norm": 0.14311744272708893, + "learning_rate": 1.0808806221905516e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27736416, + "step": 131430 + }, + { + "epoch": 14.45929592959296, + "grad_norm": 1.0565155744552612, + "learning_rate": 1.0806830379847488e-05, + "loss": 0.0132, + "num_input_tokens_seen": 27737504, + "step": 131435 + }, + { + "epoch": 14.45984598459846, + "grad_norm": 1.6255499124526978, + "learning_rate": 1.0804854668601094e-05, + "loss": 0.1254, + "num_input_tokens_seen": 27738624, + "step": 131440 + }, + { + "epoch": 14.46039603960396, + "grad_norm": 0.16488289833068848, + "learning_rate": 1.0802879088184553e-05, + "loss": 0.0067, + "num_input_tokens_seen": 27739680, + "step": 131445 + }, + { + "epoch": 14.460946094609461, + "grad_norm": 0.12694120407104492, + "learning_rate": 1.0800903638616052e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27740800, + "step": 131450 + }, + { + "epoch": 14.46149614961496, + "grad_norm": 0.017566420137882233, + "learning_rate": 1.079892831991382e-05, + "loss": 0.0042, + "num_input_tokens_seen": 27741888, + "step": 131455 + }, + { + "epoch": 14.462046204620462, + "grad_norm": 0.07599986344575882, + "learning_rate": 1.079695313209605e-05, + "loss": 0.0328, + "num_input_tokens_seen": 27743040, + "step": 131460 + }, + { + "epoch": 14.462596259625963, + "grad_norm": 2.1220004558563232, + "learning_rate": 1.079497807518093e-05, + "loss": 0.1501, + "num_input_tokens_seen": 27744160, + "step": 131465 + }, + { + "epoch": 14.463146314631462, + "grad_norm": 0.03485710173845291, + "learning_rate": 1.0793003149186698e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27745184, + "step": 131470 + }, + { + "epoch": 14.463696369636963, + "grad_norm": 0.6983029842376709, + "learning_rate": 1.0791028354131542e-05, + "loss": 0.013, + "num_input_tokens_seen": 27746176, + "step": 131475 + }, + { + "epoch": 14.464246424642464, + "grad_norm": 0.08807334303855896, + "learning_rate": 1.0789053690033645e-05, + "loss": 0.0038, + "num_input_tokens_seen": 27747296, + "step": 131480 + }, + { + "epoch": 14.464796479647966, + "grad_norm": 1.8174595832824707, + "learning_rate": 1.0787079156911234e-05, + "loss": 0.102, + "num_input_tokens_seen": 27748384, + "step": 131485 + }, + { + "epoch": 14.465346534653465, + "grad_norm": 0.013848716393113136, + "learning_rate": 1.0785104754782484e-05, + "loss": 0.0048, + "num_input_tokens_seen": 27749504, + "step": 131490 + }, + { + "epoch": 14.465896589658966, + "grad_norm": 0.026409978047013283, + "learning_rate": 1.0783130483665605e-05, + "loss": 0.1304, + "num_input_tokens_seen": 27750624, + "step": 131495 + }, + { + "epoch": 14.466446644664467, + "grad_norm": 0.06507077813148499, + "learning_rate": 1.0781156343578797e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27751648, + "step": 131500 + }, + { + "epoch": 14.466996699669966, + "grad_norm": 0.01702524721622467, + "learning_rate": 1.077918233454024e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27752736, + "step": 131505 + }, + { + "epoch": 14.467546754675467, + "grad_norm": 0.2808849513530731, + "learning_rate": 1.0777208456568144e-05, + "loss": 0.0077, + "num_input_tokens_seen": 27753824, + "step": 131510 + }, + { + "epoch": 14.468096809680969, + "grad_norm": 0.046448759734630585, + "learning_rate": 1.0775234709680684e-05, + "loss": 0.0434, + "num_input_tokens_seen": 27754848, + "step": 131515 + }, + { + "epoch": 14.468646864686468, + "grad_norm": 0.05559738725423813, + "learning_rate": 1.0773261093896061e-05, + "loss": 0.0065, + "num_input_tokens_seen": 27755936, + "step": 131520 + }, + { + "epoch": 14.469196919691969, + "grad_norm": 0.016004519537091255, + "learning_rate": 1.0771287609232472e-05, + "loss": 0.1113, + "num_input_tokens_seen": 27757056, + "step": 131525 + }, + { + "epoch": 14.46974697469747, + "grad_norm": 0.01787746325135231, + "learning_rate": 1.0769314255708088e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27758112, + "step": 131530 + }, + { + "epoch": 14.47029702970297, + "grad_norm": 0.5464525818824768, + "learning_rate": 1.0767341033341116e-05, + "loss": 0.0426, + "num_input_tokens_seen": 27759200, + "step": 131535 + }, + { + "epoch": 14.47084708470847, + "grad_norm": 0.02473350614309311, + "learning_rate": 1.0765367942149723e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27760256, + "step": 131540 + }, + { + "epoch": 14.471397139713972, + "grad_norm": 0.1314643770456314, + "learning_rate": 1.0763394982152112e-05, + "loss": 0.0422, + "num_input_tokens_seen": 27761312, + "step": 131545 + }, + { + "epoch": 14.471947194719473, + "grad_norm": 0.017097515985369682, + "learning_rate": 1.0761422153366449e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27762400, + "step": 131550 + }, + { + "epoch": 14.472497249724972, + "grad_norm": 0.014012282714247704, + "learning_rate": 1.0759449455810928e-05, + "loss": 0.0136, + "num_input_tokens_seen": 27763456, + "step": 131555 + }, + { + "epoch": 14.473047304730473, + "grad_norm": 0.7042785882949829, + "learning_rate": 1.0757476889503734e-05, + "loss": 0.016, + "num_input_tokens_seen": 27764512, + "step": 131560 + }, + { + "epoch": 14.473597359735974, + "grad_norm": 0.01056602131575346, + "learning_rate": 1.075550445446303e-05, + "loss": 0.0172, + "num_input_tokens_seen": 27765536, + "step": 131565 + }, + { + "epoch": 14.474147414741473, + "grad_norm": 0.026393728330731392, + "learning_rate": 1.0753532150707021e-05, + "loss": 0.1217, + "num_input_tokens_seen": 27766592, + "step": 131570 + }, + { + "epoch": 14.474697469746975, + "grad_norm": 0.10248123854398727, + "learning_rate": 1.0751559978253867e-05, + "loss": 0.0227, + "num_input_tokens_seen": 27767616, + "step": 131575 + }, + { + "epoch": 14.475247524752476, + "grad_norm": 1.01675546169281, + "learning_rate": 1.074958793712173e-05, + "loss": 0.0144, + "num_input_tokens_seen": 27768672, + "step": 131580 + }, + { + "epoch": 14.475797579757975, + "grad_norm": 0.15046364068984985, + "learning_rate": 1.0747616027328822e-05, + "loss": 0.083, + "num_input_tokens_seen": 27769728, + "step": 131585 + }, + { + "epoch": 14.476347634763476, + "grad_norm": 0.02984924055635929, + "learning_rate": 1.0745644248893286e-05, + "loss": 0.057, + "num_input_tokens_seen": 27770784, + "step": 131590 + }, + { + "epoch": 14.476897689768977, + "grad_norm": 0.0202097836881876, + "learning_rate": 1.0743672601833318e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27771808, + "step": 131595 + }, + { + "epoch": 14.477447744774478, + "grad_norm": 0.10722542554140091, + "learning_rate": 1.074170108616708e-05, + "loss": 0.0031, + "num_input_tokens_seen": 27772864, + "step": 131600 + }, + { + "epoch": 14.477997799779978, + "grad_norm": 0.013564740307629108, + "learning_rate": 1.0739729701912731e-05, + "loss": 0.0224, + "num_input_tokens_seen": 27773888, + "step": 131605 + }, + { + "epoch": 14.478547854785479, + "grad_norm": 0.07878084480762482, + "learning_rate": 1.073775844908845e-05, + "loss": 0.0049, + "num_input_tokens_seen": 27774912, + "step": 131610 + }, + { + "epoch": 14.47909790979098, + "grad_norm": 1.860344409942627, + "learning_rate": 1.0735787327712406e-05, + "loss": 0.0725, + "num_input_tokens_seen": 27776000, + "step": 131615 + }, + { + "epoch": 14.479647964796479, + "grad_norm": 0.06413793563842773, + "learning_rate": 1.0733816337802776e-05, + "loss": 0.01, + "num_input_tokens_seen": 27777056, + "step": 131620 + }, + { + "epoch": 14.48019801980198, + "grad_norm": 0.003183621447533369, + "learning_rate": 1.0731845479377715e-05, + "loss": 0.0073, + "num_input_tokens_seen": 27778144, + "step": 131625 + }, + { + "epoch": 14.480748074807481, + "grad_norm": 0.12646685540676117, + "learning_rate": 1.0729874752455377e-05, + "loss": 0.0048, + "num_input_tokens_seen": 27779200, + "step": 131630 + }, + { + "epoch": 14.48129812981298, + "grad_norm": 3.048128604888916, + "learning_rate": 1.0727904157053947e-05, + "loss": 0.1084, + "num_input_tokens_seen": 27780256, + "step": 131635 + }, + { + "epoch": 14.481848184818482, + "grad_norm": 0.11929068714380264, + "learning_rate": 1.0725933693191556e-05, + "loss": 0.0335, + "num_input_tokens_seen": 27781312, + "step": 131640 + }, + { + "epoch": 14.482398239823983, + "grad_norm": 0.02246805839240551, + "learning_rate": 1.0723963360886405e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27782336, + "step": 131645 + }, + { + "epoch": 14.482948294829482, + "grad_norm": 1.001944661140442, + "learning_rate": 1.0721993160156631e-05, + "loss": 0.0121, + "num_input_tokens_seen": 27783392, + "step": 131650 + }, + { + "epoch": 14.483498349834983, + "grad_norm": 0.04280257970094681, + "learning_rate": 1.0720023091020387e-05, + "loss": 0.0046, + "num_input_tokens_seen": 27784480, + "step": 131655 + }, + { + "epoch": 14.484048404840484, + "grad_norm": 0.029348015785217285, + "learning_rate": 1.0718053153495846e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27785536, + "step": 131660 + }, + { + "epoch": 14.484598459845985, + "grad_norm": 0.07082507014274597, + "learning_rate": 1.071608334760115e-05, + "loss": 0.0148, + "num_input_tokens_seen": 27786656, + "step": 131665 + }, + { + "epoch": 14.485148514851485, + "grad_norm": 1.559510350227356, + "learning_rate": 1.0714113673354456e-05, + "loss": 0.0645, + "num_input_tokens_seen": 27787616, + "step": 131670 + }, + { + "epoch": 14.485698569856986, + "grad_norm": 0.009555632248520851, + "learning_rate": 1.0712144130773932e-05, + "loss": 0.1003, + "num_input_tokens_seen": 27788736, + "step": 131675 + }, + { + "epoch": 14.486248624862487, + "grad_norm": 0.03149161487817764, + "learning_rate": 1.0710174719877711e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27789824, + "step": 131680 + }, + { + "epoch": 14.486798679867986, + "grad_norm": 0.00885417778044939, + "learning_rate": 1.070820544068396e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27790880, + "step": 131685 + }, + { + "epoch": 14.487348734873487, + "grad_norm": 0.26000481843948364, + "learning_rate": 1.0706236293210822e-05, + "loss": 0.0105, + "num_input_tokens_seen": 27791968, + "step": 131690 + }, + { + "epoch": 14.487898789878988, + "grad_norm": 0.17735087871551514, + "learning_rate": 1.0704267277476438e-05, + "loss": 0.0404, + "num_input_tokens_seen": 27793024, + "step": 131695 + }, + { + "epoch": 14.488448844884488, + "grad_norm": 0.617900013923645, + "learning_rate": 1.0702298393498961e-05, + "loss": 0.0321, + "num_input_tokens_seen": 27794080, + "step": 131700 + }, + { + "epoch": 14.488998899889989, + "grad_norm": 0.04730228707194328, + "learning_rate": 1.0700329641296541e-05, + "loss": 0.0075, + "num_input_tokens_seen": 27795104, + "step": 131705 + }, + { + "epoch": 14.48954895489549, + "grad_norm": 0.16779671609401703, + "learning_rate": 1.0698361020887328e-05, + "loss": 0.0524, + "num_input_tokens_seen": 27796256, + "step": 131710 + }, + { + "epoch": 14.490099009900991, + "grad_norm": 0.026953933760523796, + "learning_rate": 1.069639253228946e-05, + "loss": 0.0391, + "num_input_tokens_seen": 27797344, + "step": 131715 + }, + { + "epoch": 14.49064906490649, + "grad_norm": 2.361300468444824, + "learning_rate": 1.0694424175521067e-05, + "loss": 0.1874, + "num_input_tokens_seen": 27798432, + "step": 131720 + }, + { + "epoch": 14.491199119911991, + "grad_norm": 0.0411534309387207, + "learning_rate": 1.0692455950600306e-05, + "loss": 0.0143, + "num_input_tokens_seen": 27799552, + "step": 131725 + }, + { + "epoch": 14.491749174917492, + "grad_norm": 0.014789990149438381, + "learning_rate": 1.0690487857545309e-05, + "loss": 0.015, + "num_input_tokens_seen": 27800576, + "step": 131730 + }, + { + "epoch": 14.492299229922992, + "grad_norm": 0.25251084566116333, + "learning_rate": 1.068851989637423e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27801632, + "step": 131735 + }, + { + "epoch": 14.492849284928493, + "grad_norm": 0.01787063293159008, + "learning_rate": 1.0686552067105197e-05, + "loss": 0.001, + "num_input_tokens_seen": 27802752, + "step": 131740 + }, + { + "epoch": 14.493399339933994, + "grad_norm": 0.013327005319297314, + "learning_rate": 1.0684584369756335e-05, + "loss": 0.0943, + "num_input_tokens_seen": 27803808, + "step": 131745 + }, + { + "epoch": 14.493949394939493, + "grad_norm": 0.170673668384552, + "learning_rate": 1.0682616804345799e-05, + "loss": 0.0423, + "num_input_tokens_seen": 27804864, + "step": 131750 + }, + { + "epoch": 14.494499449944994, + "grad_norm": 0.04076186195015907, + "learning_rate": 1.0680649370891704e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27805952, + "step": 131755 + }, + { + "epoch": 14.495049504950495, + "grad_norm": 0.09479103982448578, + "learning_rate": 1.0678682069412194e-05, + "loss": 0.0046, + "num_input_tokens_seen": 27806944, + "step": 131760 + }, + { + "epoch": 14.495599559955995, + "grad_norm": 0.04413784667849541, + "learning_rate": 1.0676714899925409e-05, + "loss": 0.0078, + "num_input_tokens_seen": 27808000, + "step": 131765 + }, + { + "epoch": 14.496149614961496, + "grad_norm": 0.02439272589981556, + "learning_rate": 1.0674747862449458e-05, + "loss": 0.0129, + "num_input_tokens_seen": 27809024, + "step": 131770 + }, + { + "epoch": 14.496699669966997, + "grad_norm": 0.033908650279045105, + "learning_rate": 1.0672780957002494e-05, + "loss": 0.0314, + "num_input_tokens_seen": 27810016, + "step": 131775 + }, + { + "epoch": 14.497249724972498, + "grad_norm": 0.0706874430179596, + "learning_rate": 1.067081418360262e-05, + "loss": 0.0182, + "num_input_tokens_seen": 27811040, + "step": 131780 + }, + { + "epoch": 14.497799779977997, + "grad_norm": 0.007593590300530195, + "learning_rate": 1.066884754226798e-05, + "loss": 0.0101, + "num_input_tokens_seen": 27812160, + "step": 131785 + }, + { + "epoch": 14.498349834983498, + "grad_norm": 0.16618315875530243, + "learning_rate": 1.0666881033016701e-05, + "loss": 0.002, + "num_input_tokens_seen": 27813152, + "step": 131790 + }, + { + "epoch": 14.498899889989, + "grad_norm": 0.16827932000160217, + "learning_rate": 1.0664914655866895e-05, + "loss": 0.0043, + "num_input_tokens_seen": 27814240, + "step": 131795 + }, + { + "epoch": 14.499449944994499, + "grad_norm": 0.03403959050774574, + "learning_rate": 1.0662948410836701e-05, + "loss": 0.0877, + "num_input_tokens_seen": 27815232, + "step": 131800 + }, + { + "epoch": 14.5, + "grad_norm": 0.01696123369038105, + "learning_rate": 1.0660982297944221e-05, + "loss": 0.0486, + "num_input_tokens_seen": 27816384, + "step": 131805 + }, + { + "epoch": 14.500550055005501, + "grad_norm": 0.046144112944602966, + "learning_rate": 1.0659016317207599e-05, + "loss": 0.0357, + "num_input_tokens_seen": 27817472, + "step": 131810 + }, + { + "epoch": 14.501100110011, + "grad_norm": 0.013241183012723923, + "learning_rate": 1.065705046864493e-05, + "loss": 0.045, + "num_input_tokens_seen": 27818528, + "step": 131815 + }, + { + "epoch": 14.501650165016502, + "grad_norm": 0.37138330936431885, + "learning_rate": 1.0655084752274347e-05, + "loss": 0.0055, + "num_input_tokens_seen": 27819552, + "step": 131820 + }, + { + "epoch": 14.502200220022003, + "grad_norm": 0.03759751096367836, + "learning_rate": 1.065311916811397e-05, + "loss": 0.0011, + "num_input_tokens_seen": 27820608, + "step": 131825 + }, + { + "epoch": 14.502750275027502, + "grad_norm": 0.09380189329385757, + "learning_rate": 1.0651153716181902e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27821632, + "step": 131830 + }, + { + "epoch": 14.503300330033003, + "grad_norm": 0.013211680576205254, + "learning_rate": 1.0649188396496277e-05, + "loss": 0.026, + "num_input_tokens_seen": 27822720, + "step": 131835 + }, + { + "epoch": 14.503850385038504, + "grad_norm": 0.01357566099613905, + "learning_rate": 1.0647223209075184e-05, + "loss": 0.0196, + "num_input_tokens_seen": 27823776, + "step": 131840 + }, + { + "epoch": 14.504400440044005, + "grad_norm": 0.15398351848125458, + "learning_rate": 1.0645258153936747e-05, + "loss": 0.084, + "num_input_tokens_seen": 27824800, + "step": 131845 + }, + { + "epoch": 14.504950495049505, + "grad_norm": 0.02264932170510292, + "learning_rate": 1.0643293231099088e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27825792, + "step": 131850 + }, + { + "epoch": 14.505500550055006, + "grad_norm": 0.0036038425751030445, + "learning_rate": 1.0641328440580306e-05, + "loss": 0.0269, + "num_input_tokens_seen": 27826784, + "step": 131855 + }, + { + "epoch": 14.506050605060507, + "grad_norm": 0.031003721058368683, + "learning_rate": 1.0639363782398501e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27827808, + "step": 131860 + }, + { + "epoch": 14.506600660066006, + "grad_norm": 0.07513908296823502, + "learning_rate": 1.0637399256571797e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27828896, + "step": 131865 + }, + { + "epoch": 14.507150715071507, + "grad_norm": 0.036728136241436005, + "learning_rate": 1.0635434863118285e-05, + "loss": 0.0101, + "num_input_tokens_seen": 27830048, + "step": 131870 + }, + { + "epoch": 14.507700770077008, + "grad_norm": 0.007889015600085258, + "learning_rate": 1.0633470602056076e-05, + "loss": 0.0135, + "num_input_tokens_seen": 27831136, + "step": 131875 + }, + { + "epoch": 14.508250825082508, + "grad_norm": 0.08200865238904953, + "learning_rate": 1.0631506473403285e-05, + "loss": 0.0516, + "num_input_tokens_seen": 27832160, + "step": 131880 + }, + { + "epoch": 14.508800880088009, + "grad_norm": 0.014185191132128239, + "learning_rate": 1.0629542477177992e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27833216, + "step": 131885 + }, + { + "epoch": 14.50935093509351, + "grad_norm": 0.01180263888090849, + "learning_rate": 1.0627578613398322e-05, + "loss": 0.0553, + "num_input_tokens_seen": 27834240, + "step": 131890 + }, + { + "epoch": 14.509900990099009, + "grad_norm": 0.01770719699561596, + "learning_rate": 1.0625614882082354e-05, + "loss": 0.001, + "num_input_tokens_seen": 27835232, + "step": 131895 + }, + { + "epoch": 14.51045104510451, + "grad_norm": 0.12338755279779434, + "learning_rate": 1.0623651283248198e-05, + "loss": 0.0048, + "num_input_tokens_seen": 27836256, + "step": 131900 + }, + { + "epoch": 14.511001100110011, + "grad_norm": 1.4359701871871948, + "learning_rate": 1.0621687816913956e-05, + "loss": 0.1249, + "num_input_tokens_seen": 27837280, + "step": 131905 + }, + { + "epoch": 14.511551155115512, + "grad_norm": 0.2618594169616699, + "learning_rate": 1.061972448309771e-05, + "loss": 0.0696, + "num_input_tokens_seen": 27838368, + "step": 131910 + }, + { + "epoch": 14.512101210121012, + "grad_norm": 0.03493855148553848, + "learning_rate": 1.061776128181757e-05, + "loss": 0.004, + "num_input_tokens_seen": 27839456, + "step": 131915 + }, + { + "epoch": 14.512651265126513, + "grad_norm": 1.4711452722549438, + "learning_rate": 1.0615798213091616e-05, + "loss": 0.0199, + "num_input_tokens_seen": 27840448, + "step": 131920 + }, + { + "epoch": 14.513201320132014, + "grad_norm": 0.04167741537094116, + "learning_rate": 1.0613835276937958e-05, + "loss": 0.0468, + "num_input_tokens_seen": 27841504, + "step": 131925 + }, + { + "epoch": 14.513751375137513, + "grad_norm": 1.442983865737915, + "learning_rate": 1.0611872473374665e-05, + "loss": 0.0277, + "num_input_tokens_seen": 27842528, + "step": 131930 + }, + { + "epoch": 14.514301430143014, + "grad_norm": 0.061581436544656754, + "learning_rate": 1.0609909802419843e-05, + "loss": 0.0738, + "num_input_tokens_seen": 27843584, + "step": 131935 + }, + { + "epoch": 14.514851485148515, + "grad_norm": 0.028468362987041473, + "learning_rate": 1.0607947264091584e-05, + "loss": 0.0055, + "num_input_tokens_seen": 27844640, + "step": 131940 + }, + { + "epoch": 14.515401540154015, + "grad_norm": 0.006125201936811209, + "learning_rate": 1.0605984858407959e-05, + "loss": 0.003, + "num_input_tokens_seen": 27845664, + "step": 131945 + }, + { + "epoch": 14.515951595159516, + "grad_norm": 4.002589702606201, + "learning_rate": 1.0604022585387074e-05, + "loss": 0.1442, + "num_input_tokens_seen": 27846688, + "step": 131950 + }, + { + "epoch": 14.516501650165017, + "grad_norm": 0.13449053466320038, + "learning_rate": 1.0602060445047e-05, + "loss": 0.0068, + "num_input_tokens_seen": 27847776, + "step": 131955 + }, + { + "epoch": 14.517051705170516, + "grad_norm": 0.024869946762919426, + "learning_rate": 1.060009843740582e-05, + "loss": 0.008, + "num_input_tokens_seen": 27848832, + "step": 131960 + }, + { + "epoch": 14.517601760176017, + "grad_norm": 0.07904112339019775, + "learning_rate": 1.0598136562481637e-05, + "loss": 0.0121, + "num_input_tokens_seen": 27849952, + "step": 131965 + }, + { + "epoch": 14.518151815181518, + "grad_norm": 0.03834934160113335, + "learning_rate": 1.0596174820292506e-05, + "loss": 0.0085, + "num_input_tokens_seen": 27851008, + "step": 131970 + }, + { + "epoch": 14.51870187018702, + "grad_norm": 0.009327548556029797, + "learning_rate": 1.0594213210856533e-05, + "loss": 0.0031, + "num_input_tokens_seen": 27852064, + "step": 131975 + }, + { + "epoch": 14.519251925192519, + "grad_norm": 0.0051565770991146564, + "learning_rate": 1.059225173419178e-05, + "loss": 0.004, + "num_input_tokens_seen": 27853152, + "step": 131980 + }, + { + "epoch": 14.51980198019802, + "grad_norm": 0.018392950296401978, + "learning_rate": 1.0590290390316321e-05, + "loss": 0.0118, + "num_input_tokens_seen": 27854208, + "step": 131985 + }, + { + "epoch": 14.520352035203521, + "grad_norm": 4.2028303146362305, + "learning_rate": 1.0588329179248244e-05, + "loss": 0.0539, + "num_input_tokens_seen": 27855296, + "step": 131990 + }, + { + "epoch": 14.52090209020902, + "grad_norm": 0.10526997596025467, + "learning_rate": 1.058636810100562e-05, + "loss": 0.047, + "num_input_tokens_seen": 27856384, + "step": 131995 + }, + { + "epoch": 14.521452145214521, + "grad_norm": 0.014556598849594593, + "learning_rate": 1.0584407155606533e-05, + "loss": 0.0465, + "num_input_tokens_seen": 27857376, + "step": 132000 + }, + { + "epoch": 14.522002200220022, + "grad_norm": 0.07531983405351639, + "learning_rate": 1.058244634306905e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27858432, + "step": 132005 + }, + { + "epoch": 14.522552255225522, + "grad_norm": 0.05518967658281326, + "learning_rate": 1.058048566341123e-05, + "loss": 0.0325, + "num_input_tokens_seen": 27859520, + "step": 132010 + }, + { + "epoch": 14.523102310231023, + "grad_norm": 0.02208852395415306, + "learning_rate": 1.0578525116651154e-05, + "loss": 0.0047, + "num_input_tokens_seen": 27860608, + "step": 132015 + }, + { + "epoch": 14.523652365236524, + "grad_norm": 0.044925615191459656, + "learning_rate": 1.0576564702806893e-05, + "loss": 0.0069, + "num_input_tokens_seen": 27861632, + "step": 132020 + }, + { + "epoch": 14.524202420242025, + "grad_norm": 0.02726476825773716, + "learning_rate": 1.0574604421896523e-05, + "loss": 0.0025, + "num_input_tokens_seen": 27862720, + "step": 132025 + }, + { + "epoch": 14.524752475247524, + "grad_norm": 0.205349862575531, + "learning_rate": 1.05726442739381e-05, + "loss": 0.0047, + "num_input_tokens_seen": 27863744, + "step": 132030 + }, + { + "epoch": 14.525302530253025, + "grad_norm": 0.15194928646087646, + "learning_rate": 1.0570684258949686e-05, + "loss": 0.0295, + "num_input_tokens_seen": 27864864, + "step": 132035 + }, + { + "epoch": 14.525852585258527, + "grad_norm": 0.020787134766578674, + "learning_rate": 1.056872437694936e-05, + "loss": 0.0072, + "num_input_tokens_seen": 27865984, + "step": 132040 + }, + { + "epoch": 14.526402640264026, + "grad_norm": 0.01746116764843464, + "learning_rate": 1.0566764627955167e-05, + "loss": 0.05, + "num_input_tokens_seen": 27867040, + "step": 132045 + }, + { + "epoch": 14.526952695269527, + "grad_norm": 0.2815297544002533, + "learning_rate": 1.0564805011985177e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27868096, + "step": 132050 + }, + { + "epoch": 14.527502750275028, + "grad_norm": 1.312827706336975, + "learning_rate": 1.0562845529057463e-05, + "loss": 0.0575, + "num_input_tokens_seen": 27869088, + "step": 132055 + }, + { + "epoch": 14.528052805280527, + "grad_norm": 0.009463761933147907, + "learning_rate": 1.0560886179190066e-05, + "loss": 0.0007, + "num_input_tokens_seen": 27870176, + "step": 132060 + }, + { + "epoch": 14.528602860286028, + "grad_norm": 0.11728151887655258, + "learning_rate": 1.0558926962401058e-05, + "loss": 0.0022, + "num_input_tokens_seen": 27871232, + "step": 132065 + }, + { + "epoch": 14.52915291529153, + "grad_norm": 0.012654256075620651, + "learning_rate": 1.0556967878708485e-05, + "loss": 0.004, + "num_input_tokens_seen": 27872224, + "step": 132070 + }, + { + "epoch": 14.52970297029703, + "grad_norm": 0.09624184668064117, + "learning_rate": 1.0555008928130407e-05, + "loss": 0.0047, + "num_input_tokens_seen": 27873248, + "step": 132075 + }, + { + "epoch": 14.53025302530253, + "grad_norm": 0.02178259566426277, + "learning_rate": 1.0553050110684893e-05, + "loss": 0.028, + "num_input_tokens_seen": 27874304, + "step": 132080 + }, + { + "epoch": 14.530803080308031, + "grad_norm": 0.021534349769353867, + "learning_rate": 1.055109142638997e-05, + "loss": 0.0271, + "num_input_tokens_seen": 27875328, + "step": 132085 + }, + { + "epoch": 14.531353135313532, + "grad_norm": 0.052934132516384125, + "learning_rate": 1.0549132875263715e-05, + "loss": 0.0119, + "num_input_tokens_seen": 27876320, + "step": 132090 + }, + { + "epoch": 14.531903190319031, + "grad_norm": 0.007007064297795296, + "learning_rate": 1.054717445732417e-05, + "loss": 0.0051, + "num_input_tokens_seen": 27877376, + "step": 132095 + }, + { + "epoch": 14.532453245324533, + "grad_norm": 0.7467523217201233, + "learning_rate": 1.054521617258937e-05, + "loss": 0.0152, + "num_input_tokens_seen": 27878400, + "step": 132100 + }, + { + "epoch": 14.533003300330034, + "grad_norm": 0.015355645678937435, + "learning_rate": 1.054325802107738e-05, + "loss": 0.0339, + "num_input_tokens_seen": 27879488, + "step": 132105 + }, + { + "epoch": 14.533553355335533, + "grad_norm": 1.528557300567627, + "learning_rate": 1.0541300002806242e-05, + "loss": 0.0187, + "num_input_tokens_seen": 27880480, + "step": 132110 + }, + { + "epoch": 14.534103410341034, + "grad_norm": 0.03899330273270607, + "learning_rate": 1.0539342117794013e-05, + "loss": 0.0017, + "num_input_tokens_seen": 27881568, + "step": 132115 + }, + { + "epoch": 14.534653465346535, + "grad_norm": 0.420767605304718, + "learning_rate": 1.0537384366058729e-05, + "loss": 0.0091, + "num_input_tokens_seen": 27882688, + "step": 132120 + }, + { + "epoch": 14.535203520352034, + "grad_norm": 0.02793276123702526, + "learning_rate": 1.0535426747618424e-05, + "loss": 0.0616, + "num_input_tokens_seen": 27883744, + "step": 132125 + }, + { + "epoch": 14.535753575357536, + "grad_norm": 0.0056624142453074455, + "learning_rate": 1.0533469262491152e-05, + "loss": 0.0115, + "num_input_tokens_seen": 27884768, + "step": 132130 + }, + { + "epoch": 14.536303630363037, + "grad_norm": 0.0044333431869745255, + "learning_rate": 1.053151191069495e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27885856, + "step": 132135 + }, + { + "epoch": 14.536853685368538, + "grad_norm": 0.4561605751514435, + "learning_rate": 1.052955469224787e-05, + "loss": 0.0108, + "num_input_tokens_seen": 27887008, + "step": 132140 + }, + { + "epoch": 14.537403740374037, + "grad_norm": 0.005308938678354025, + "learning_rate": 1.0527597607167939e-05, + "loss": 0.059, + "num_input_tokens_seen": 27888128, + "step": 132145 + }, + { + "epoch": 14.537953795379538, + "grad_norm": 0.09124299883842468, + "learning_rate": 1.052564065547319e-05, + "loss": 0.0055, + "num_input_tokens_seen": 27889184, + "step": 132150 + }, + { + "epoch": 14.53850385038504, + "grad_norm": 0.05652276799082756, + "learning_rate": 1.0523683837181672e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27890208, + "step": 132155 + }, + { + "epoch": 14.539053905390539, + "grad_norm": 0.7134369015693665, + "learning_rate": 1.0521727152311406e-05, + "loss": 0.0536, + "num_input_tokens_seen": 27891360, + "step": 132160 + }, + { + "epoch": 14.53960396039604, + "grad_norm": 7.360246658325195, + "learning_rate": 1.0519770600880433e-05, + "loss": 0.052, + "num_input_tokens_seen": 27892416, + "step": 132165 + }, + { + "epoch": 14.54015401540154, + "grad_norm": 0.008139344863593578, + "learning_rate": 1.0517814182906797e-05, + "loss": 0.0055, + "num_input_tokens_seen": 27893600, + "step": 132170 + }, + { + "epoch": 14.54070407040704, + "grad_norm": 0.7781285047531128, + "learning_rate": 1.0515857898408507e-05, + "loss": 0.166, + "num_input_tokens_seen": 27894624, + "step": 132175 + }, + { + "epoch": 14.541254125412541, + "grad_norm": 0.007320834789425135, + "learning_rate": 1.0513901747403615e-05, + "loss": 0.0125, + "num_input_tokens_seen": 27895744, + "step": 132180 + }, + { + "epoch": 14.541804180418042, + "grad_norm": 0.019200937822461128, + "learning_rate": 1.051194572991013e-05, + "loss": 0.0172, + "num_input_tokens_seen": 27896864, + "step": 132185 + }, + { + "epoch": 14.542354235423542, + "grad_norm": 0.11935172230005264, + "learning_rate": 1.0509989845946088e-05, + "loss": 0.0378, + "num_input_tokens_seen": 27897888, + "step": 132190 + }, + { + "epoch": 14.542904290429043, + "grad_norm": 0.00843055360019207, + "learning_rate": 1.0508034095529529e-05, + "loss": 0.0048, + "num_input_tokens_seen": 27898944, + "step": 132195 + }, + { + "epoch": 14.543454345434544, + "grad_norm": 0.029780631884932518, + "learning_rate": 1.0506078478678452e-05, + "loss": 0.1154, + "num_input_tokens_seen": 27900000, + "step": 132200 + }, + { + "epoch": 14.544004400440045, + "grad_norm": 0.00301681412383914, + "learning_rate": 1.0504122995410909e-05, + "loss": 0.0654, + "num_input_tokens_seen": 27901088, + "step": 132205 + }, + { + "epoch": 14.544554455445544, + "grad_norm": 0.1381458342075348, + "learning_rate": 1.0502167645744895e-05, + "loss": 0.0513, + "num_input_tokens_seen": 27902144, + "step": 132210 + }, + { + "epoch": 14.545104510451045, + "grad_norm": 0.20219478011131287, + "learning_rate": 1.0500212429698455e-05, + "loss": 0.007, + "num_input_tokens_seen": 27903232, + "step": 132215 + }, + { + "epoch": 14.545654565456546, + "grad_norm": 0.005964680574834347, + "learning_rate": 1.0498257347289592e-05, + "loss": 0.0052, + "num_input_tokens_seen": 27904320, + "step": 132220 + }, + { + "epoch": 14.546204620462046, + "grad_norm": 0.05295511335134506, + "learning_rate": 1.0496302398536334e-05, + "loss": 0.0793, + "num_input_tokens_seen": 27905408, + "step": 132225 + }, + { + "epoch": 14.546754675467547, + "grad_norm": 0.049957044422626495, + "learning_rate": 1.0494347583456703e-05, + "loss": 0.0829, + "num_input_tokens_seen": 27906400, + "step": 132230 + }, + { + "epoch": 14.547304730473048, + "grad_norm": 0.0779881477355957, + "learning_rate": 1.049239290206871e-05, + "loss": 0.0049, + "num_input_tokens_seen": 27907456, + "step": 132235 + }, + { + "epoch": 14.547854785478547, + "grad_norm": 0.03404160961508751, + "learning_rate": 1.0490438354390363e-05, + "loss": 0.0413, + "num_input_tokens_seen": 27908480, + "step": 132240 + }, + { + "epoch": 14.548404840484048, + "grad_norm": 1.6489672660827637, + "learning_rate": 1.0488483940439692e-05, + "loss": 0.0494, + "num_input_tokens_seen": 27909664, + "step": 132245 + }, + { + "epoch": 14.54895489548955, + "grad_norm": 0.005319063551723957, + "learning_rate": 1.048652966023468e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27910752, + "step": 132250 + }, + { + "epoch": 14.549504950495049, + "grad_norm": 0.03568904474377632, + "learning_rate": 1.0484575513793382e-05, + "loss": 0.0497, + "num_input_tokens_seen": 27911776, + "step": 132255 + }, + { + "epoch": 14.55005500550055, + "grad_norm": 0.029629606753587723, + "learning_rate": 1.0482621501133785e-05, + "loss": 0.002, + "num_input_tokens_seen": 27912800, + "step": 132260 + }, + { + "epoch": 14.55060506050605, + "grad_norm": 0.003491121344268322, + "learning_rate": 1.048066762227389e-05, + "loss": 0.0495, + "num_input_tokens_seen": 27913952, + "step": 132265 + }, + { + "epoch": 14.551155115511552, + "grad_norm": 0.8367383480072021, + "learning_rate": 1.0478713877231724e-05, + "loss": 0.0086, + "num_input_tokens_seen": 27915040, + "step": 132270 + }, + { + "epoch": 14.551705170517051, + "grad_norm": 1.3673148155212402, + "learning_rate": 1.0476760266025276e-05, + "loss": 0.0268, + "num_input_tokens_seen": 27916064, + "step": 132275 + }, + { + "epoch": 14.552255225522552, + "grad_norm": 0.049237243831157684, + "learning_rate": 1.0474806788672558e-05, + "loss": 0.0791, + "num_input_tokens_seen": 27917120, + "step": 132280 + }, + { + "epoch": 14.552805280528053, + "grad_norm": 0.2983643710613251, + "learning_rate": 1.0472853445191585e-05, + "loss": 0.0098, + "num_input_tokens_seen": 27918144, + "step": 132285 + }, + { + "epoch": 14.553355335533553, + "grad_norm": 1.151203989982605, + "learning_rate": 1.0470900235600342e-05, + "loss": 0.0174, + "num_input_tokens_seen": 27919232, + "step": 132290 + }, + { + "epoch": 14.553905390539054, + "grad_norm": 0.017060590907931328, + "learning_rate": 1.046894715991685e-05, + "loss": 0.004, + "num_input_tokens_seen": 27920320, + "step": 132295 + }, + { + "epoch": 14.554455445544555, + "grad_norm": 0.018183209002017975, + "learning_rate": 1.046699421815909e-05, + "loss": 0.049, + "num_input_tokens_seen": 27921344, + "step": 132300 + }, + { + "epoch": 14.555005500550054, + "grad_norm": 0.032161321491003036, + "learning_rate": 1.0465041410345072e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27922368, + "step": 132305 + }, + { + "epoch": 14.555555555555555, + "grad_norm": 0.1259613186120987, + "learning_rate": 1.0463088736492798e-05, + "loss": 0.0037, + "num_input_tokens_seen": 27923424, + "step": 132310 + }, + { + "epoch": 14.556105610561056, + "grad_norm": 3.965090036392212, + "learning_rate": 1.0461136196620256e-05, + "loss": 0.3168, + "num_input_tokens_seen": 27924448, + "step": 132315 + }, + { + "epoch": 14.556655665566556, + "grad_norm": 0.0415547713637352, + "learning_rate": 1.045918379074545e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27925504, + "step": 132320 + }, + { + "epoch": 14.557205720572057, + "grad_norm": 0.269837886095047, + "learning_rate": 1.0457231518886363e-05, + "loss": 0.0196, + "num_input_tokens_seen": 27926560, + "step": 132325 + }, + { + "epoch": 14.557755775577558, + "grad_norm": 0.02374718338251114, + "learning_rate": 1.0455279381061001e-05, + "loss": 0.0066, + "num_input_tokens_seen": 27927616, + "step": 132330 + }, + { + "epoch": 14.558305830583059, + "grad_norm": 0.10321434587240219, + "learning_rate": 1.0453327377287342e-05, + "loss": 0.0662, + "num_input_tokens_seen": 27928736, + "step": 132335 + }, + { + "epoch": 14.558855885588558, + "grad_norm": 0.34465956687927246, + "learning_rate": 1.0451375507583385e-05, + "loss": 0.0857, + "num_input_tokens_seen": 27929888, + "step": 132340 + }, + { + "epoch": 14.55940594059406, + "grad_norm": 0.015776434913277626, + "learning_rate": 1.0449423771967125e-05, + "loss": 0.0067, + "num_input_tokens_seen": 27930976, + "step": 132345 + }, + { + "epoch": 14.55995599559956, + "grad_norm": 0.013470395468175411, + "learning_rate": 1.044747217045654e-05, + "loss": 0.0485, + "num_input_tokens_seen": 27932032, + "step": 132350 + }, + { + "epoch": 14.56050605060506, + "grad_norm": 0.025948969647288322, + "learning_rate": 1.0445520703069628e-05, + "loss": 0.0042, + "num_input_tokens_seen": 27933024, + "step": 132355 + }, + { + "epoch": 14.561056105610561, + "grad_norm": 0.9889805912971497, + "learning_rate": 1.0443569369824366e-05, + "loss": 0.012, + "num_input_tokens_seen": 27934080, + "step": 132360 + }, + { + "epoch": 14.561606160616062, + "grad_norm": 1.6362473964691162, + "learning_rate": 1.0441618170738723e-05, + "loss": 0.0871, + "num_input_tokens_seen": 27935168, + "step": 132365 + }, + { + "epoch": 14.562156215621561, + "grad_norm": 0.07010155916213989, + "learning_rate": 1.043966710583072e-05, + "loss": 0.0028, + "num_input_tokens_seen": 27936224, + "step": 132370 + }, + { + "epoch": 14.562706270627062, + "grad_norm": 0.030642887577414513, + "learning_rate": 1.0437716175118306e-05, + "loss": 0.0025, + "num_input_tokens_seen": 27937344, + "step": 132375 + }, + { + "epoch": 14.563256325632564, + "grad_norm": 0.09340636432170868, + "learning_rate": 1.0435765378619486e-05, + "loss": 0.0957, + "num_input_tokens_seen": 27938400, + "step": 132380 + }, + { + "epoch": 14.563806380638063, + "grad_norm": 0.17471539974212646, + "learning_rate": 1.0433814716352228e-05, + "loss": 0.0142, + "num_input_tokens_seen": 27939424, + "step": 132385 + }, + { + "epoch": 14.564356435643564, + "grad_norm": 3.0555455684661865, + "learning_rate": 1.0431864188334497e-05, + "loss": 0.0942, + "num_input_tokens_seen": 27940448, + "step": 132390 + }, + { + "epoch": 14.564906490649065, + "grad_norm": 0.016373934224247932, + "learning_rate": 1.042991379458429e-05, + "loss": 0.0097, + "num_input_tokens_seen": 27941536, + "step": 132395 + }, + { + "epoch": 14.565456545654566, + "grad_norm": 0.05734991282224655, + "learning_rate": 1.042796353511957e-05, + "loss": 0.0028, + "num_input_tokens_seen": 27942624, + "step": 132400 + }, + { + "epoch": 14.566006600660065, + "grad_norm": 1.8348820209503174, + "learning_rate": 1.042601340995833e-05, + "loss": 0.0735, + "num_input_tokens_seen": 27943680, + "step": 132405 + }, + { + "epoch": 14.566556655665567, + "grad_norm": 0.6803194880485535, + "learning_rate": 1.0424063419118532e-05, + "loss": 0.0631, + "num_input_tokens_seen": 27944672, + "step": 132410 + }, + { + "epoch": 14.567106710671068, + "grad_norm": 0.06877203285694122, + "learning_rate": 1.0422113562618135e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27945792, + "step": 132415 + }, + { + "epoch": 14.567656765676567, + "grad_norm": 0.00995455589145422, + "learning_rate": 1.0420163840475133e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27946816, + "step": 132420 + }, + { + "epoch": 14.568206820682068, + "grad_norm": 0.8467687964439392, + "learning_rate": 1.0418214252707467e-05, + "loss": 0.0218, + "num_input_tokens_seen": 27947808, + "step": 132425 + }, + { + "epoch": 14.56875687568757, + "grad_norm": 0.02781095914542675, + "learning_rate": 1.0416264799333143e-05, + "loss": 0.0069, + "num_input_tokens_seen": 27948832, + "step": 132430 + }, + { + "epoch": 14.569306930693068, + "grad_norm": 0.0546472929418087, + "learning_rate": 1.0414315480370105e-05, + "loss": 0.0036, + "num_input_tokens_seen": 27949888, + "step": 132435 + }, + { + "epoch": 14.56985698569857, + "grad_norm": 0.20366516709327698, + "learning_rate": 1.0412366295836315e-05, + "loss": 0.0053, + "num_input_tokens_seen": 27950944, + "step": 132440 + }, + { + "epoch": 14.57040704070407, + "grad_norm": 0.20272502303123474, + "learning_rate": 1.0410417245749751e-05, + "loss": 0.0173, + "num_input_tokens_seen": 27952032, + "step": 132445 + }, + { + "epoch": 14.570957095709572, + "grad_norm": 0.012209479697048664, + "learning_rate": 1.0408468330128365e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27953120, + "step": 132450 + }, + { + "epoch": 14.571507150715071, + "grad_norm": 0.030469078570604324, + "learning_rate": 1.0406519548990123e-05, + "loss": 0.0012, + "num_input_tokens_seen": 27954272, + "step": 132455 + }, + { + "epoch": 14.572057205720572, + "grad_norm": 1.101408839225769, + "learning_rate": 1.0404570902352998e-05, + "loss": 0.0462, + "num_input_tokens_seen": 27955264, + "step": 132460 + }, + { + "epoch": 14.572607260726073, + "grad_norm": 0.05038181692361832, + "learning_rate": 1.040262239023493e-05, + "loss": 0.0085, + "num_input_tokens_seen": 27956320, + "step": 132465 + }, + { + "epoch": 14.573157315731573, + "grad_norm": 0.01971021108329296, + "learning_rate": 1.0400674012653897e-05, + "loss": 0.0014, + "num_input_tokens_seen": 27957344, + "step": 132470 + }, + { + "epoch": 14.573707370737074, + "grad_norm": 0.30650991201400757, + "learning_rate": 1.0398725769627845e-05, + "loss": 0.0902, + "num_input_tokens_seen": 27958400, + "step": 132475 + }, + { + "epoch": 14.574257425742575, + "grad_norm": 0.5992584824562073, + "learning_rate": 1.0396777661174722e-05, + "loss": 0.0236, + "num_input_tokens_seen": 27959520, + "step": 132480 + }, + { + "epoch": 14.574807480748074, + "grad_norm": 0.015035890974104404, + "learning_rate": 1.0394829687312491e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27960640, + "step": 132485 + }, + { + "epoch": 14.575357535753575, + "grad_norm": 0.1718577891588211, + "learning_rate": 1.0392881848059107e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27961760, + "step": 132490 + }, + { + "epoch": 14.575907590759076, + "grad_norm": 0.012138298712670803, + "learning_rate": 1.0390934143432535e-05, + "loss": 0.12, + "num_input_tokens_seen": 27962816, + "step": 132495 + }, + { + "epoch": 14.576457645764577, + "grad_norm": 0.007092937361449003, + "learning_rate": 1.0388986573450707e-05, + "loss": 0.117, + "num_input_tokens_seen": 27963840, + "step": 132500 + }, + { + "epoch": 14.577007700770077, + "grad_norm": 0.027986349537968636, + "learning_rate": 1.0387039138131574e-05, + "loss": 0.0011, + "num_input_tokens_seen": 27964928, + "step": 132505 + }, + { + "epoch": 14.577557755775578, + "grad_norm": 0.25286561250686646, + "learning_rate": 1.0385091837493085e-05, + "loss": 0.0042, + "num_input_tokens_seen": 27965920, + "step": 132510 + }, + { + "epoch": 14.578107810781079, + "grad_norm": 0.4975471794605255, + "learning_rate": 1.0383144671553197e-05, + "loss": 0.0028, + "num_input_tokens_seen": 27966912, + "step": 132515 + }, + { + "epoch": 14.578657865786578, + "grad_norm": 0.04067756235599518, + "learning_rate": 1.0381197640329857e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27967936, + "step": 132520 + }, + { + "epoch": 14.57920792079208, + "grad_norm": 0.030138125643134117, + "learning_rate": 1.0379250743841e-05, + "loss": 0.009, + "num_input_tokens_seen": 27968992, + "step": 132525 + }, + { + "epoch": 14.57975797579758, + "grad_norm": 0.05855593457818031, + "learning_rate": 1.037730398210457e-05, + "loss": 0.0138, + "num_input_tokens_seen": 27970048, + "step": 132530 + }, + { + "epoch": 14.58030803080308, + "grad_norm": 0.06229911744594574, + "learning_rate": 1.0375357355138517e-05, + "loss": 0.0044, + "num_input_tokens_seen": 27971104, + "step": 132535 + }, + { + "epoch": 14.58085808580858, + "grad_norm": 0.08921603858470917, + "learning_rate": 1.0373410862960772e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27972160, + "step": 132540 + }, + { + "epoch": 14.581408140814082, + "grad_norm": 0.022153956815600395, + "learning_rate": 1.037146450558928e-05, + "loss": 0.0349, + "num_input_tokens_seen": 27973280, + "step": 132545 + }, + { + "epoch": 14.581958195819581, + "grad_norm": 1.829643726348877, + "learning_rate": 1.0369518283041987e-05, + "loss": 0.043, + "num_input_tokens_seen": 27974304, + "step": 132550 + }, + { + "epoch": 14.582508250825082, + "grad_norm": 0.056806083768606186, + "learning_rate": 1.0367572195336816e-05, + "loss": 0.1127, + "num_input_tokens_seen": 27975328, + "step": 132555 + }, + { + "epoch": 14.583058305830583, + "grad_norm": 0.030164705589413643, + "learning_rate": 1.0365626242491722e-05, + "loss": 0.029, + "num_input_tokens_seen": 27976352, + "step": 132560 + }, + { + "epoch": 14.583608360836084, + "grad_norm": 0.029076874256134033, + "learning_rate": 1.0363680424524616e-05, + "loss": 0.0011, + "num_input_tokens_seen": 27977376, + "step": 132565 + }, + { + "epoch": 14.584158415841584, + "grad_norm": 0.017759738489985466, + "learning_rate": 1.0361734741453447e-05, + "loss": 0.0081, + "num_input_tokens_seen": 27978464, + "step": 132570 + }, + { + "epoch": 14.584708470847085, + "grad_norm": 2.4609508514404297, + "learning_rate": 1.0359789193296151e-05, + "loss": 0.0351, + "num_input_tokens_seen": 27979488, + "step": 132575 + }, + { + "epoch": 14.585258525852586, + "grad_norm": 1.0200375318527222, + "learning_rate": 1.0357843780070648e-05, + "loss": 0.0742, + "num_input_tokens_seen": 27980512, + "step": 132580 + }, + { + "epoch": 14.585808580858085, + "grad_norm": 0.22136616706848145, + "learning_rate": 1.0355898501794878e-05, + "loss": 0.0082, + "num_input_tokens_seen": 27981568, + "step": 132585 + }, + { + "epoch": 14.586358635863586, + "grad_norm": 0.13667231798171997, + "learning_rate": 1.0353953358486757e-05, + "loss": 0.0916, + "num_input_tokens_seen": 27982688, + "step": 132590 + }, + { + "epoch": 14.586908690869087, + "grad_norm": 0.04169846698641777, + "learning_rate": 1.035200835016423e-05, + "loss": 0.0016, + "num_input_tokens_seen": 27983776, + "step": 132595 + }, + { + "epoch": 14.587458745874587, + "grad_norm": 0.02366785891354084, + "learning_rate": 1.0350063476845204e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27984832, + "step": 132600 + }, + { + "epoch": 14.588008800880088, + "grad_norm": 0.08249279856681824, + "learning_rate": 1.0348118738547611e-05, + "loss": 0.004, + "num_input_tokens_seen": 27985888, + "step": 132605 + }, + { + "epoch": 14.588558855885589, + "grad_norm": 0.024792727082967758, + "learning_rate": 1.0346174135289388e-05, + "loss": 0.0181, + "num_input_tokens_seen": 27987040, + "step": 132610 + }, + { + "epoch": 14.589108910891088, + "grad_norm": 0.5872111916542053, + "learning_rate": 1.0344229667088437e-05, + "loss": 0.0326, + "num_input_tokens_seen": 27988128, + "step": 132615 + }, + { + "epoch": 14.58965896589659, + "grad_norm": 0.016955185681581497, + "learning_rate": 1.0342285333962697e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27989184, + "step": 132620 + }, + { + "epoch": 14.59020902090209, + "grad_norm": 0.013692520558834076, + "learning_rate": 1.034034113593007e-05, + "loss": 0.0333, + "num_input_tokens_seen": 27990240, + "step": 132625 + }, + { + "epoch": 14.590759075907592, + "grad_norm": 0.20090968906879425, + "learning_rate": 1.0338397073008488e-05, + "loss": 0.0128, + "num_input_tokens_seen": 27991264, + "step": 132630 + }, + { + "epoch": 14.591309130913091, + "grad_norm": 0.053070344030857086, + "learning_rate": 1.0336453145215869e-05, + "loss": 0.0028, + "num_input_tokens_seen": 27992352, + "step": 132635 + }, + { + "epoch": 14.591859185918592, + "grad_norm": 0.016604837030172348, + "learning_rate": 1.033450935257013e-05, + "loss": 0.0158, + "num_input_tokens_seen": 27993440, + "step": 132640 + }, + { + "epoch": 14.592409240924093, + "grad_norm": 0.2084970623254776, + "learning_rate": 1.033256569508917e-05, + "loss": 0.0276, + "num_input_tokens_seen": 27994496, + "step": 132645 + }, + { + "epoch": 14.592959295929592, + "grad_norm": 0.04119604825973511, + "learning_rate": 1.033062217279092e-05, + "loss": 0.0383, + "num_input_tokens_seen": 27995584, + "step": 132650 + }, + { + "epoch": 14.593509350935093, + "grad_norm": 0.010964118875563145, + "learning_rate": 1.0328678785693279e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27996576, + "step": 132655 + }, + { + "epoch": 14.594059405940595, + "grad_norm": 0.04085502028465271, + "learning_rate": 1.0326735533814166e-05, + "loss": 0.0022, + "num_input_tokens_seen": 27997696, + "step": 132660 + }, + { + "epoch": 14.594609460946094, + "grad_norm": 0.11131095141172409, + "learning_rate": 1.0324792417171502e-05, + "loss": 0.0025, + "num_input_tokens_seen": 27998720, + "step": 132665 + }, + { + "epoch": 14.595159515951595, + "grad_norm": 0.008510000072419643, + "learning_rate": 1.0322849435783174e-05, + "loss": 0.0076, + "num_input_tokens_seen": 27999776, + "step": 132670 + }, + { + "epoch": 14.595709570957096, + "grad_norm": 2.95931339263916, + "learning_rate": 1.0320906589667109e-05, + "loss": 0.0914, + "num_input_tokens_seen": 28000832, + "step": 132675 + }, + { + "epoch": 14.596259625962595, + "grad_norm": 0.013710115104913712, + "learning_rate": 1.0318963878841192e-05, + "loss": 0.0162, + "num_input_tokens_seen": 28001888, + "step": 132680 + }, + { + "epoch": 14.596809680968097, + "grad_norm": 0.013406965881586075, + "learning_rate": 1.0317021303323346e-05, + "loss": 0.0588, + "num_input_tokens_seen": 28002944, + "step": 132685 + }, + { + "epoch": 14.597359735973598, + "grad_norm": 0.03709704428911209, + "learning_rate": 1.0315078863131475e-05, + "loss": 0.0017, + "num_input_tokens_seen": 28004000, + "step": 132690 + }, + { + "epoch": 14.597909790979099, + "grad_norm": 0.005407686345279217, + "learning_rate": 1.0313136558283466e-05, + "loss": 0.0038, + "num_input_tokens_seen": 28005088, + "step": 132695 + }, + { + "epoch": 14.598459845984598, + "grad_norm": 0.03596198186278343, + "learning_rate": 1.0311194388797241e-05, + "loss": 0.0283, + "num_input_tokens_seen": 28006176, + "step": 132700 + }, + { + "epoch": 14.599009900990099, + "grad_norm": 0.11424289643764496, + "learning_rate": 1.0309252354690676e-05, + "loss": 0.0016, + "num_input_tokens_seen": 28007328, + "step": 132705 + }, + { + "epoch": 14.5995599559956, + "grad_norm": 0.2697034180164337, + "learning_rate": 1.0307310455981694e-05, + "loss": 0.0716, + "num_input_tokens_seen": 28008448, + "step": 132710 + }, + { + "epoch": 14.6001100110011, + "grad_norm": 0.024613134562969208, + "learning_rate": 1.0305368692688174e-05, + "loss": 0.0072, + "num_input_tokens_seen": 28009472, + "step": 132715 + }, + { + "epoch": 14.6006600660066, + "grad_norm": 1.640714168548584, + "learning_rate": 1.030342706482802e-05, + "loss": 0.0576, + "num_input_tokens_seen": 28010464, + "step": 132720 + }, + { + "epoch": 14.601210121012102, + "grad_norm": 0.026567867025732994, + "learning_rate": 1.0301485572419132e-05, + "loss": 0.0014, + "num_input_tokens_seen": 28011488, + "step": 132725 + }, + { + "epoch": 14.601760176017601, + "grad_norm": 0.1132664903998375, + "learning_rate": 1.0299544215479392e-05, + "loss": 0.006, + "num_input_tokens_seen": 28012576, + "step": 132730 + }, + { + "epoch": 14.602310231023102, + "grad_norm": 0.02633284591138363, + "learning_rate": 1.0297602994026706e-05, + "loss": 0.002, + "num_input_tokens_seen": 28013664, + "step": 132735 + }, + { + "epoch": 14.602860286028603, + "grad_norm": 0.06580676883459091, + "learning_rate": 1.0295661908078953e-05, + "loss": 0.0018, + "num_input_tokens_seen": 28014720, + "step": 132740 + }, + { + "epoch": 14.603410341034103, + "grad_norm": 0.04862898960709572, + "learning_rate": 1.0293720957654025e-05, + "loss": 0.0806, + "num_input_tokens_seen": 28015712, + "step": 132745 + }, + { + "epoch": 14.603960396039604, + "grad_norm": 0.086769700050354, + "learning_rate": 1.0291780142769825e-05, + "loss": 0.0281, + "num_input_tokens_seen": 28016800, + "step": 132750 + }, + { + "epoch": 14.604510451045105, + "grad_norm": 2.5812795162200928, + "learning_rate": 1.0289839463444217e-05, + "loss": 0.0159, + "num_input_tokens_seen": 28017856, + "step": 132755 + }, + { + "epoch": 14.605060506050606, + "grad_norm": 0.009662503376603127, + "learning_rate": 1.0287898919695113e-05, + "loss": 0.0254, + "num_input_tokens_seen": 28018976, + "step": 132760 + }, + { + "epoch": 14.605610561056105, + "grad_norm": 0.2711222171783447, + "learning_rate": 1.0285958511540383e-05, + "loss": 0.0735, + "num_input_tokens_seen": 28020064, + "step": 132765 + }, + { + "epoch": 14.606160616061606, + "grad_norm": 0.8862748146057129, + "learning_rate": 1.0284018238997906e-05, + "loss": 0.0127, + "num_input_tokens_seen": 28021056, + "step": 132770 + }, + { + "epoch": 14.606710671067107, + "grad_norm": 3.007848024368286, + "learning_rate": 1.028207810208557e-05, + "loss": 0.1031, + "num_input_tokens_seen": 28022112, + "step": 132775 + }, + { + "epoch": 14.607260726072607, + "grad_norm": 1.7857095003128052, + "learning_rate": 1.028013810082126e-05, + "loss": 0.094, + "num_input_tokens_seen": 28023136, + "step": 132780 + }, + { + "epoch": 14.607810781078108, + "grad_norm": 0.245334655046463, + "learning_rate": 1.0278198235222861e-05, + "loss": 0.0166, + "num_input_tokens_seen": 28024128, + "step": 132785 + }, + { + "epoch": 14.608360836083609, + "grad_norm": 2.1724703311920166, + "learning_rate": 1.0276258505308245e-05, + "loss": 0.0606, + "num_input_tokens_seen": 28025184, + "step": 132790 + }, + { + "epoch": 14.608910891089108, + "grad_norm": 1.7239373922348022, + "learning_rate": 1.0274318911095279e-05, + "loss": 0.0921, + "num_input_tokens_seen": 28026272, + "step": 132795 + }, + { + "epoch": 14.60946094609461, + "grad_norm": 0.0628051906824112, + "learning_rate": 1.027237945260185e-05, + "loss": 0.002, + "num_input_tokens_seen": 28027328, + "step": 132800 + }, + { + "epoch": 14.61001100110011, + "grad_norm": 0.057782966643571854, + "learning_rate": 1.0270440129845835e-05, + "loss": 0.0017, + "num_input_tokens_seen": 28028352, + "step": 132805 + }, + { + "epoch": 14.61056105610561, + "grad_norm": 0.728524386882782, + "learning_rate": 1.0268500942845111e-05, + "loss": 0.0127, + "num_input_tokens_seen": 28029408, + "step": 132810 + }, + { + "epoch": 14.61111111111111, + "grad_norm": 1.2305911779403687, + "learning_rate": 1.0266561891617545e-05, + "loss": 0.0372, + "num_input_tokens_seen": 28030400, + "step": 132815 + }, + { + "epoch": 14.611661166116612, + "grad_norm": 0.012079562060534954, + "learning_rate": 1.0264622976181001e-05, + "loss": 0.0021, + "num_input_tokens_seen": 28031456, + "step": 132820 + }, + { + "epoch": 14.612211221122113, + "grad_norm": 0.008140690624713898, + "learning_rate": 1.0262684196553366e-05, + "loss": 0.0062, + "num_input_tokens_seen": 28032512, + "step": 132825 + }, + { + "epoch": 14.612761276127612, + "grad_norm": 0.022515205666422844, + "learning_rate": 1.0260745552752488e-05, + "loss": 0.0923, + "num_input_tokens_seen": 28033504, + "step": 132830 + }, + { + "epoch": 14.613311331133113, + "grad_norm": 0.01219237968325615, + "learning_rate": 1.0258807044796249e-05, + "loss": 0.125, + "num_input_tokens_seen": 28034528, + "step": 132835 + }, + { + "epoch": 14.613861386138614, + "grad_norm": 0.14582808315753937, + "learning_rate": 1.0256868672702515e-05, + "loss": 0.0585, + "num_input_tokens_seen": 28035616, + "step": 132840 + }, + { + "epoch": 14.614411441144114, + "grad_norm": 0.04056309163570404, + "learning_rate": 1.0254930436489147e-05, + "loss": 0.004, + "num_input_tokens_seen": 28036672, + "step": 132845 + }, + { + "epoch": 14.614961496149615, + "grad_norm": 0.005774840712547302, + "learning_rate": 1.0252992336174014e-05, + "loss": 0.0028, + "num_input_tokens_seen": 28037728, + "step": 132850 + }, + { + "epoch": 14.615511551155116, + "grad_norm": 0.014952760189771652, + "learning_rate": 1.0251054371774965e-05, + "loss": 0.0028, + "num_input_tokens_seen": 28038784, + "step": 132855 + }, + { + "epoch": 14.616061606160617, + "grad_norm": 0.025049323216080666, + "learning_rate": 1.024911654330987e-05, + "loss": 0.0151, + "num_input_tokens_seen": 28039840, + "step": 132860 + }, + { + "epoch": 14.616611661166116, + "grad_norm": 0.00826435349881649, + "learning_rate": 1.0247178850796602e-05, + "loss": 0.0045, + "num_input_tokens_seen": 28040896, + "step": 132865 + }, + { + "epoch": 14.617161716171617, + "grad_norm": 0.3886053264141083, + "learning_rate": 1.0245241294252993e-05, + "loss": 0.1172, + "num_input_tokens_seen": 28041888, + "step": 132870 + }, + { + "epoch": 14.617711771177119, + "grad_norm": 0.07952426373958588, + "learning_rate": 1.024330387369693e-05, + "loss": 0.0028, + "num_input_tokens_seen": 28042912, + "step": 132875 + }, + { + "epoch": 14.618261826182618, + "grad_norm": 0.43517646193504333, + "learning_rate": 1.024136658914625e-05, + "loss": 0.0108, + "num_input_tokens_seen": 28044000, + "step": 132880 + }, + { + "epoch": 14.618811881188119, + "grad_norm": 0.04811905696988106, + "learning_rate": 1.0239429440618806e-05, + "loss": 0.1242, + "num_input_tokens_seen": 28045088, + "step": 132885 + }, + { + "epoch": 14.61936193619362, + "grad_norm": 0.17481252551078796, + "learning_rate": 1.0237492428132459e-05, + "loss": 0.1278, + "num_input_tokens_seen": 28046080, + "step": 132890 + }, + { + "epoch": 14.61991199119912, + "grad_norm": 0.013062075711786747, + "learning_rate": 1.0235555551705057e-05, + "loss": 0.0062, + "num_input_tokens_seen": 28047136, + "step": 132895 + }, + { + "epoch": 14.62046204620462, + "grad_norm": 0.024216866120696068, + "learning_rate": 1.0233618811354465e-05, + "loss": 0.1041, + "num_input_tokens_seen": 28048224, + "step": 132900 + }, + { + "epoch": 14.621012101210122, + "grad_norm": 0.057916175574064255, + "learning_rate": 1.0231682207098525e-05, + "loss": 0.0015, + "num_input_tokens_seen": 28049280, + "step": 132905 + }, + { + "epoch": 14.62156215621562, + "grad_norm": 0.006079527549445629, + "learning_rate": 1.0229745738955071e-05, + "loss": 0.0021, + "num_input_tokens_seen": 28050336, + "step": 132910 + }, + { + "epoch": 14.622112211221122, + "grad_norm": 0.0060623870231211185, + "learning_rate": 1.0227809406941965e-05, + "loss": 0.0064, + "num_input_tokens_seen": 28051424, + "step": 132915 + }, + { + "epoch": 14.622662266226623, + "grad_norm": 0.04571658745408058, + "learning_rate": 1.0225873211077052e-05, + "loss": 0.0781, + "num_input_tokens_seen": 28052480, + "step": 132920 + }, + { + "epoch": 14.623212321232124, + "grad_norm": 0.008292547427117825, + "learning_rate": 1.0223937151378184e-05, + "loss": 0.0649, + "num_input_tokens_seen": 28053504, + "step": 132925 + }, + { + "epoch": 14.623762376237623, + "grad_norm": 0.035299595445394516, + "learning_rate": 1.0222001227863199e-05, + "loss": 0.0165, + "num_input_tokens_seen": 28054528, + "step": 132930 + }, + { + "epoch": 14.624312431243125, + "grad_norm": 0.033216170966625214, + "learning_rate": 1.0220065440549925e-05, + "loss": 0.0061, + "num_input_tokens_seen": 28055584, + "step": 132935 + }, + { + "epoch": 14.624862486248626, + "grad_norm": 0.99891197681427, + "learning_rate": 1.0218129789456227e-05, + "loss": 0.071, + "num_input_tokens_seen": 28056704, + "step": 132940 + }, + { + "epoch": 14.625412541254125, + "grad_norm": 0.4709778428077698, + "learning_rate": 1.0216194274599924e-05, + "loss": 0.1087, + "num_input_tokens_seen": 28057792, + "step": 132945 + }, + { + "epoch": 14.625962596259626, + "grad_norm": 0.5035399198532104, + "learning_rate": 1.0214258895998865e-05, + "loss": 0.0072, + "num_input_tokens_seen": 28058848, + "step": 132950 + }, + { + "epoch": 14.626512651265127, + "grad_norm": 0.7095303535461426, + "learning_rate": 1.0212323653670897e-05, + "loss": 0.0102, + "num_input_tokens_seen": 28059904, + "step": 132955 + }, + { + "epoch": 14.627062706270626, + "grad_norm": 0.012412721291184425, + "learning_rate": 1.0210388547633836e-05, + "loss": 0.0019, + "num_input_tokens_seen": 28060928, + "step": 132960 + }, + { + "epoch": 14.627612761276128, + "grad_norm": 0.0557388961315155, + "learning_rate": 1.0208453577905535e-05, + "loss": 0.0209, + "num_input_tokens_seen": 28062016, + "step": 132965 + }, + { + "epoch": 14.628162816281629, + "grad_norm": 0.00833494309335947, + "learning_rate": 1.0206518744503812e-05, + "loss": 0.0037, + "num_input_tokens_seen": 28063008, + "step": 132970 + }, + { + "epoch": 14.628712871287128, + "grad_norm": 0.0019845901988446712, + "learning_rate": 1.0204584047446505e-05, + "loss": 0.0366, + "num_input_tokens_seen": 28064032, + "step": 132975 + }, + { + "epoch": 14.629262926292629, + "grad_norm": 0.2415955513715744, + "learning_rate": 1.020264948675146e-05, + "loss": 0.1001, + "num_input_tokens_seen": 28065088, + "step": 132980 + }, + { + "epoch": 14.62981298129813, + "grad_norm": 0.050571076571941376, + "learning_rate": 1.0200715062436484e-05, + "loss": 0.0026, + "num_input_tokens_seen": 28066176, + "step": 132985 + }, + { + "epoch": 14.630363036303631, + "grad_norm": 1.7543128728866577, + "learning_rate": 1.0198780774519422e-05, + "loss": 0.0314, + "num_input_tokens_seen": 28067264, + "step": 132990 + }, + { + "epoch": 14.63091309130913, + "grad_norm": 0.036417532712221146, + "learning_rate": 1.0196846623018089e-05, + "loss": 0.0251, + "num_input_tokens_seen": 28068352, + "step": 132995 + }, + { + "epoch": 14.631463146314632, + "grad_norm": 0.07331860065460205, + "learning_rate": 1.0194912607950324e-05, + "loss": 0.056, + "num_input_tokens_seen": 28069504, + "step": 133000 + }, + { + "epoch": 14.632013201320133, + "grad_norm": 0.06523861736059189, + "learning_rate": 1.019297872933394e-05, + "loss": 0.0067, + "num_input_tokens_seen": 28070528, + "step": 133005 + }, + { + "epoch": 14.632563256325632, + "grad_norm": 0.06049716845154762, + "learning_rate": 1.0191044987186762e-05, + "loss": 0.0014, + "num_input_tokens_seen": 28071552, + "step": 133010 + }, + { + "epoch": 14.633113311331133, + "grad_norm": 0.28012391924858093, + "learning_rate": 1.0189111381526628e-05, + "loss": 0.0422, + "num_input_tokens_seen": 28072640, + "step": 133015 + }, + { + "epoch": 14.633663366336634, + "grad_norm": 0.03347951918840408, + "learning_rate": 1.0187177912371348e-05, + "loss": 0.0034, + "num_input_tokens_seen": 28073728, + "step": 133020 + }, + { + "epoch": 14.634213421342134, + "grad_norm": 0.04871243238449097, + "learning_rate": 1.0185244579738732e-05, + "loss": 0.0115, + "num_input_tokens_seen": 28074848, + "step": 133025 + }, + { + "epoch": 14.634763476347635, + "grad_norm": 0.03150555118918419, + "learning_rate": 1.0183311383646616e-05, + "loss": 0.0011, + "num_input_tokens_seen": 28075968, + "step": 133030 + }, + { + "epoch": 14.635313531353136, + "grad_norm": 1.264099359512329, + "learning_rate": 1.0181378324112789e-05, + "loss": 0.05, + "num_input_tokens_seen": 28077024, + "step": 133035 + }, + { + "epoch": 14.635863586358635, + "grad_norm": 0.15204544365406036, + "learning_rate": 1.0179445401155108e-05, + "loss": 0.0777, + "num_input_tokens_seen": 28078112, + "step": 133040 + }, + { + "epoch": 14.636413641364136, + "grad_norm": 0.11590515822172165, + "learning_rate": 1.0177512614791368e-05, + "loss": 0.0026, + "num_input_tokens_seen": 28079168, + "step": 133045 + }, + { + "epoch": 14.636963696369637, + "grad_norm": 0.6396220326423645, + "learning_rate": 1.0175579965039369e-05, + "loss": 0.0586, + "num_input_tokens_seen": 28080256, + "step": 133050 + }, + { + "epoch": 14.637513751375138, + "grad_norm": 0.04506482556462288, + "learning_rate": 1.0173647451916948e-05, + "loss": 0.0806, + "num_input_tokens_seen": 28081344, + "step": 133055 + }, + { + "epoch": 14.638063806380638, + "grad_norm": 0.061888352036476135, + "learning_rate": 1.0171715075441895e-05, + "loss": 0.0817, + "num_input_tokens_seen": 28082400, + "step": 133060 + }, + { + "epoch": 14.638613861386139, + "grad_norm": 0.014662982895970345, + "learning_rate": 1.016978283563203e-05, + "loss": 0.0009, + "num_input_tokens_seen": 28083520, + "step": 133065 + }, + { + "epoch": 14.63916391639164, + "grad_norm": 0.007366367615759373, + "learning_rate": 1.0167850732505166e-05, + "loss": 0.0006, + "num_input_tokens_seen": 28084544, + "step": 133070 + }, + { + "epoch": 14.63971397139714, + "grad_norm": 0.910416305065155, + "learning_rate": 1.01659187660791e-05, + "loss": 0.0157, + "num_input_tokens_seen": 28085600, + "step": 133075 + }, + { + "epoch": 14.64026402640264, + "grad_norm": 0.6057975888252258, + "learning_rate": 1.0163986936371648e-05, + "loss": 0.0071, + "num_input_tokens_seen": 28086656, + "step": 133080 + }, + { + "epoch": 14.640814081408141, + "grad_norm": 0.1621093451976776, + "learning_rate": 1.01620552434006e-05, + "loss": 0.004, + "num_input_tokens_seen": 28087680, + "step": 133085 + }, + { + "epoch": 14.64136413641364, + "grad_norm": 0.04210090637207031, + "learning_rate": 1.016012368718377e-05, + "loss": 0.002, + "num_input_tokens_seen": 28088768, + "step": 133090 + }, + { + "epoch": 14.641914191419142, + "grad_norm": 0.044769659638404846, + "learning_rate": 1.015819226773897e-05, + "loss": 0.0535, + "num_input_tokens_seen": 28089856, + "step": 133095 + }, + { + "epoch": 14.642464246424643, + "grad_norm": 0.012127420864999294, + "learning_rate": 1.015626098508398e-05, + "loss": 0.0037, + "num_input_tokens_seen": 28090912, + "step": 133100 + }, + { + "epoch": 14.643014301430142, + "grad_norm": 0.04167241230607033, + "learning_rate": 1.0154329839236617e-05, + "loss": 0.0015, + "num_input_tokens_seen": 28091936, + "step": 133105 + }, + { + "epoch": 14.643564356435643, + "grad_norm": 0.25660794973373413, + "learning_rate": 1.0152398830214665e-05, + "loss": 0.0026, + "num_input_tokens_seen": 28092928, + "step": 133110 + }, + { + "epoch": 14.644114411441144, + "grad_norm": 3.423112392425537, + "learning_rate": 1.0150467958035936e-05, + "loss": 0.1379, + "num_input_tokens_seen": 28094016, + "step": 133115 + }, + { + "epoch": 14.644664466446645, + "grad_norm": 0.10193916410207748, + "learning_rate": 1.014853722271821e-05, + "loss": 0.0454, + "num_input_tokens_seen": 28095008, + "step": 133120 + }, + { + "epoch": 14.645214521452145, + "grad_norm": 0.026898857206106186, + "learning_rate": 1.014660662427929e-05, + "loss": 0.0982, + "num_input_tokens_seen": 28096032, + "step": 133125 + }, + { + "epoch": 14.645764576457646, + "grad_norm": 0.01239487249404192, + "learning_rate": 1.014467616273698e-05, + "loss": 0.0686, + "num_input_tokens_seen": 28097088, + "step": 133130 + }, + { + "epoch": 14.646314631463147, + "grad_norm": 0.03822918236255646, + "learning_rate": 1.014274583810905e-05, + "loss": 0.0554, + "num_input_tokens_seen": 28098144, + "step": 133135 + }, + { + "epoch": 14.646864686468646, + "grad_norm": 0.0293556097894907, + "learning_rate": 1.0140815650413312e-05, + "loss": 0.0176, + "num_input_tokens_seen": 28099232, + "step": 133140 + }, + { + "epoch": 14.647414741474147, + "grad_norm": 0.008503129705786705, + "learning_rate": 1.0138885599667547e-05, + "loss": 0.0092, + "num_input_tokens_seen": 28100352, + "step": 133145 + }, + { + "epoch": 14.647964796479648, + "grad_norm": 1.4999489784240723, + "learning_rate": 1.0136955685889523e-05, + "loss": 0.0831, + "num_input_tokens_seen": 28101408, + "step": 133150 + }, + { + "epoch": 14.648514851485148, + "grad_norm": 0.18907971680164337, + "learning_rate": 1.0135025909097068e-05, + "loss": 0.005, + "num_input_tokens_seen": 28102464, + "step": 133155 + }, + { + "epoch": 14.649064906490649, + "grad_norm": 0.01377902738749981, + "learning_rate": 1.0133096269307934e-05, + "loss": 0.1132, + "num_input_tokens_seen": 28103456, + "step": 133160 + }, + { + "epoch": 14.64961496149615, + "grad_norm": 0.04607654735445976, + "learning_rate": 1.0131166766539926e-05, + "loss": 0.0468, + "num_input_tokens_seen": 28104448, + "step": 133165 + }, + { + "epoch": 14.65016501650165, + "grad_norm": 0.08292384445667267, + "learning_rate": 1.0129237400810821e-05, + "loss": 0.0057, + "num_input_tokens_seen": 28105472, + "step": 133170 + }, + { + "epoch": 14.65071507150715, + "grad_norm": 0.1506170779466629, + "learning_rate": 1.0127308172138392e-05, + "loss": 0.0208, + "num_input_tokens_seen": 28106464, + "step": 133175 + }, + { + "epoch": 14.651265126512651, + "grad_norm": 0.03255374729633331, + "learning_rate": 1.0125379080540423e-05, + "loss": 0.1007, + "num_input_tokens_seen": 28107520, + "step": 133180 + }, + { + "epoch": 14.651815181518153, + "grad_norm": 0.08636873960494995, + "learning_rate": 1.01234501260347e-05, + "loss": 0.1025, + "num_input_tokens_seen": 28108704, + "step": 133185 + }, + { + "epoch": 14.652365236523652, + "grad_norm": 0.014752413146197796, + "learning_rate": 1.0121521308639006e-05, + "loss": 0.0088, + "num_input_tokens_seen": 28109792, + "step": 133190 + }, + { + "epoch": 14.652915291529153, + "grad_norm": 0.022631850093603134, + "learning_rate": 1.0119592628371113e-05, + "loss": 0.0069, + "num_input_tokens_seen": 28110848, + "step": 133195 + }, + { + "epoch": 14.653465346534654, + "grad_norm": 1.8396955728530884, + "learning_rate": 1.0117664085248785e-05, + "loss": 0.0392, + "num_input_tokens_seen": 28111936, + "step": 133200 + }, + { + "epoch": 14.654015401540153, + "grad_norm": 0.025873912498354912, + "learning_rate": 1.0115735679289811e-05, + "loss": 0.0952, + "num_input_tokens_seen": 28112992, + "step": 133205 + }, + { + "epoch": 14.654565456545654, + "grad_norm": 0.038835637271404266, + "learning_rate": 1.0113807410511955e-05, + "loss": 0.0258, + "num_input_tokens_seen": 28114016, + "step": 133210 + }, + { + "epoch": 14.655115511551156, + "grad_norm": 1.1494532823562622, + "learning_rate": 1.0111879278932989e-05, + "loss": 0.0165, + "num_input_tokens_seen": 28115104, + "step": 133215 + }, + { + "epoch": 14.655665566556655, + "grad_norm": 0.01943175308406353, + "learning_rate": 1.01099512845707e-05, + "loss": 0.0457, + "num_input_tokens_seen": 28116160, + "step": 133220 + }, + { + "epoch": 14.656215621562156, + "grad_norm": 0.03589154779911041, + "learning_rate": 1.0108023427442837e-05, + "loss": 0.0111, + "num_input_tokens_seen": 28117184, + "step": 133225 + }, + { + "epoch": 14.656765676567657, + "grad_norm": 0.04878374561667442, + "learning_rate": 1.0106095707567181e-05, + "loss": 0.0093, + "num_input_tokens_seen": 28118272, + "step": 133230 + }, + { + "epoch": 14.657315731573158, + "grad_norm": 0.010103591717779636, + "learning_rate": 1.0104168124961488e-05, + "loss": 0.0105, + "num_input_tokens_seen": 28119360, + "step": 133235 + }, + { + "epoch": 14.657865786578657, + "grad_norm": 0.3244166970252991, + "learning_rate": 1.0102240679643529e-05, + "loss": 0.0301, + "num_input_tokens_seen": 28120448, + "step": 133240 + }, + { + "epoch": 14.658415841584159, + "grad_norm": 1.391217589378357, + "learning_rate": 1.0100313371631077e-05, + "loss": 0.0126, + "num_input_tokens_seen": 28121536, + "step": 133245 + }, + { + "epoch": 14.65896589658966, + "grad_norm": 1.7071747779846191, + "learning_rate": 1.0098386200941878e-05, + "loss": 0.0824, + "num_input_tokens_seen": 28122592, + "step": 133250 + }, + { + "epoch": 14.659515951595159, + "grad_norm": 0.0587281659245491, + "learning_rate": 1.0096459167593714e-05, + "loss": 0.0095, + "num_input_tokens_seen": 28123648, + "step": 133255 + }, + { + "epoch": 14.66006600660066, + "grad_norm": 0.7826296091079712, + "learning_rate": 1.0094532271604334e-05, + "loss": 0.0197, + "num_input_tokens_seen": 28124800, + "step": 133260 + }, + { + "epoch": 14.660616061606161, + "grad_norm": 0.0485733263194561, + "learning_rate": 1.0092605512991487e-05, + "loss": 0.0045, + "num_input_tokens_seen": 28125856, + "step": 133265 + }, + { + "epoch": 14.66116611661166, + "grad_norm": 0.033958856016397476, + "learning_rate": 1.0090678891772943e-05, + "loss": 0.0349, + "num_input_tokens_seen": 28126912, + "step": 133270 + }, + { + "epoch": 14.661716171617162, + "grad_norm": 0.05384200066328049, + "learning_rate": 1.0088752407966456e-05, + "loss": 0.0824, + "num_input_tokens_seen": 28127968, + "step": 133275 + }, + { + "epoch": 14.662266226622663, + "grad_norm": 0.05776546150445938, + "learning_rate": 1.0086826061589794e-05, + "loss": 0.0034, + "num_input_tokens_seen": 28129056, + "step": 133280 + }, + { + "epoch": 14.662816281628164, + "grad_norm": 0.06029457226395607, + "learning_rate": 1.0084899852660701e-05, + "loss": 0.0473, + "num_input_tokens_seen": 28130080, + "step": 133285 + }, + { + "epoch": 14.663366336633663, + "grad_norm": 0.06521666795015335, + "learning_rate": 1.0082973781196916e-05, + "loss": 0.0027, + "num_input_tokens_seen": 28131104, + "step": 133290 + }, + { + "epoch": 14.663916391639164, + "grad_norm": 0.054447073489427567, + "learning_rate": 1.0081047847216207e-05, + "loss": 0.0156, + "num_input_tokens_seen": 28132256, + "step": 133295 + }, + { + "epoch": 14.664466446644665, + "grad_norm": 0.015277066268026829, + "learning_rate": 1.007912205073632e-05, + "loss": 0.0016, + "num_input_tokens_seen": 28133312, + "step": 133300 + }, + { + "epoch": 14.665016501650165, + "grad_norm": 2.496171712875366, + "learning_rate": 1.0077196391775015e-05, + "loss": 0.0514, + "num_input_tokens_seen": 28134368, + "step": 133305 + }, + { + "epoch": 14.665566556655666, + "grad_norm": 0.030321665108203888, + "learning_rate": 1.007527087035003e-05, + "loss": 0.0621, + "num_input_tokens_seen": 28135392, + "step": 133310 + }, + { + "epoch": 14.666116611661167, + "grad_norm": 0.0707344263792038, + "learning_rate": 1.0073345486479103e-05, + "loss": 0.0631, + "num_input_tokens_seen": 28136480, + "step": 133315 + }, + { + "epoch": 14.666666666666666, + "grad_norm": 0.045623891055583954, + "learning_rate": 1.0071420240179997e-05, + "loss": 0.0209, + "num_input_tokens_seen": 28137440, + "step": 133320 + }, + { + "epoch": 14.667216721672167, + "grad_norm": 0.007717299275100231, + "learning_rate": 1.0069495131470439e-05, + "loss": 0.0015, + "num_input_tokens_seen": 28138592, + "step": 133325 + }, + { + "epoch": 14.667766776677668, + "grad_norm": 0.026858912780880928, + "learning_rate": 1.0067570160368178e-05, + "loss": 0.0041, + "num_input_tokens_seen": 28139584, + "step": 133330 + }, + { + "epoch": 14.668316831683168, + "grad_norm": 0.06776923686265945, + "learning_rate": 1.0065645326890969e-05, + "loss": 0.0017, + "num_input_tokens_seen": 28140704, + "step": 133335 + }, + { + "epoch": 14.668866886688669, + "grad_norm": 0.005371679086238146, + "learning_rate": 1.0063720631056528e-05, + "loss": 0.0015, + "num_input_tokens_seen": 28141792, + "step": 133340 + }, + { + "epoch": 14.66941694169417, + "grad_norm": 0.009801226668059826, + "learning_rate": 1.0061796072882618e-05, + "loss": 0.0026, + "num_input_tokens_seen": 28142880, + "step": 133345 + }, + { + "epoch": 14.66996699669967, + "grad_norm": 0.03309933841228485, + "learning_rate": 1.0059871652386957e-05, + "loss": 0.0045, + "num_input_tokens_seen": 28143904, + "step": 133350 + }, + { + "epoch": 14.67051705170517, + "grad_norm": 0.001806372543796897, + "learning_rate": 1.005794736958729e-05, + "loss": 0.0214, + "num_input_tokens_seen": 28145024, + "step": 133355 + }, + { + "epoch": 14.671067106710671, + "grad_norm": 0.0382038913667202, + "learning_rate": 1.0056023224501363e-05, + "loss": 0.0721, + "num_input_tokens_seen": 28146080, + "step": 133360 + }, + { + "epoch": 14.671617161716172, + "grad_norm": 0.05825207382440567, + "learning_rate": 1.0054099217146887e-05, + "loss": 0.0086, + "num_input_tokens_seen": 28147072, + "step": 133365 + }, + { + "epoch": 14.672167216721672, + "grad_norm": 0.12089572846889496, + "learning_rate": 1.0052175347541618e-05, + "loss": 0.0018, + "num_input_tokens_seen": 28148064, + "step": 133370 + }, + { + "epoch": 14.672717271727173, + "grad_norm": 0.649372398853302, + "learning_rate": 1.0050251615703266e-05, + "loss": 0.0127, + "num_input_tokens_seen": 28149152, + "step": 133375 + }, + { + "epoch": 14.673267326732674, + "grad_norm": 0.04479179158806801, + "learning_rate": 1.0048328021649578e-05, + "loss": 0.0022, + "num_input_tokens_seen": 28150240, + "step": 133380 + }, + { + "epoch": 14.673817381738173, + "grad_norm": 0.029452160000801086, + "learning_rate": 1.004640456539827e-05, + "loss": 0.002, + "num_input_tokens_seen": 28151264, + "step": 133385 + }, + { + "epoch": 14.674367436743674, + "grad_norm": 0.09428524225950241, + "learning_rate": 1.0044481246967075e-05, + "loss": 0.0066, + "num_input_tokens_seen": 28152288, + "step": 133390 + }, + { + "epoch": 14.674917491749175, + "grad_norm": 2.4131875038146973, + "learning_rate": 1.004255806637373e-05, + "loss": 0.0778, + "num_input_tokens_seen": 28153376, + "step": 133395 + }, + { + "epoch": 14.675467546754675, + "grad_norm": 0.017624113708734512, + "learning_rate": 1.004063502363595e-05, + "loss": 0.0017, + "num_input_tokens_seen": 28154496, + "step": 133400 + }, + { + "epoch": 14.676017601760176, + "grad_norm": 0.10038602352142334, + "learning_rate": 1.0038712118771448e-05, + "loss": 0.0035, + "num_input_tokens_seen": 28155584, + "step": 133405 + }, + { + "epoch": 14.676567656765677, + "grad_norm": 1.201232671737671, + "learning_rate": 1.0036789351797957e-05, + "loss": 0.0332, + "num_input_tokens_seen": 28156608, + "step": 133410 + }, + { + "epoch": 14.677117711771178, + "grad_norm": 0.02525145933032036, + "learning_rate": 1.00348667227332e-05, + "loss": 0.0273, + "num_input_tokens_seen": 28157664, + "step": 133415 + }, + { + "epoch": 14.677667766776677, + "grad_norm": 0.6861608624458313, + "learning_rate": 1.0032944231594904e-05, + "loss": 0.0554, + "num_input_tokens_seen": 28158720, + "step": 133420 + }, + { + "epoch": 14.678217821782178, + "grad_norm": 0.010229615494608879, + "learning_rate": 1.0031021878400781e-05, + "loss": 0.0044, + "num_input_tokens_seen": 28159776, + "step": 133425 + }, + { + "epoch": 14.67876787678768, + "grad_norm": 0.37032535672187805, + "learning_rate": 1.0029099663168535e-05, + "loss": 0.0122, + "num_input_tokens_seen": 28160896, + "step": 133430 + }, + { + "epoch": 14.679317931793179, + "grad_norm": 0.018746113404631615, + "learning_rate": 1.0027177585915903e-05, + "loss": 0.0867, + "num_input_tokens_seen": 28161888, + "step": 133435 + }, + { + "epoch": 14.67986798679868, + "grad_norm": 0.03344133123755455, + "learning_rate": 1.0025255646660584e-05, + "loss": 0.0067, + "num_input_tokens_seen": 28162912, + "step": 133440 + }, + { + "epoch": 14.680418041804181, + "grad_norm": 0.01771615445613861, + "learning_rate": 1.0023333845420293e-05, + "loss": 0.0039, + "num_input_tokens_seen": 28164000, + "step": 133445 + }, + { + "epoch": 14.68096809680968, + "grad_norm": 0.030542591586709023, + "learning_rate": 1.0021412182212762e-05, + "loss": 0.002, + "num_input_tokens_seen": 28165056, + "step": 133450 + }, + { + "epoch": 14.681518151815181, + "grad_norm": 0.011559533886611462, + "learning_rate": 1.0019490657055675e-05, + "loss": 0.0109, + "num_input_tokens_seen": 28166112, + "step": 133455 + }, + { + "epoch": 14.682068206820682, + "grad_norm": 0.17624370753765106, + "learning_rate": 1.0017569269966765e-05, + "loss": 0.0318, + "num_input_tokens_seen": 28167104, + "step": 133460 + }, + { + "epoch": 14.682618261826182, + "grad_norm": 0.00959976390004158, + "learning_rate": 1.0015648020963719e-05, + "loss": 0.0247, + "num_input_tokens_seen": 28168224, + "step": 133465 + }, + { + "epoch": 14.683168316831683, + "grad_norm": 2.740352153778076, + "learning_rate": 1.0013726910064255e-05, + "loss": 0.1112, + "num_input_tokens_seen": 28169248, + "step": 133470 + }, + { + "epoch": 14.683718371837184, + "grad_norm": 0.00930397491902113, + "learning_rate": 1.001180593728609e-05, + "loss": 0.0144, + "num_input_tokens_seen": 28170368, + "step": 133475 + }, + { + "epoch": 14.684268426842685, + "grad_norm": 0.5587307214736938, + "learning_rate": 1.000988510264691e-05, + "loss": 0.0072, + "num_input_tokens_seen": 28171424, + "step": 133480 + }, + { + "epoch": 14.684818481848184, + "grad_norm": 0.02323366515338421, + "learning_rate": 1.000796440616443e-05, + "loss": 0.053, + "num_input_tokens_seen": 28172384, + "step": 133485 + }, + { + "epoch": 14.685368536853685, + "grad_norm": 0.008707383647561073, + "learning_rate": 1.0006043847856344e-05, + "loss": 0.0095, + "num_input_tokens_seen": 28173408, + "step": 133490 + }, + { + "epoch": 14.685918591859187, + "grad_norm": 0.04206572473049164, + "learning_rate": 1.0004123427740364e-05, + "loss": 0.0106, + "num_input_tokens_seen": 28174528, + "step": 133495 + }, + { + "epoch": 14.686468646864686, + "grad_norm": 0.9536268711090088, + "learning_rate": 1.0002203145834175e-05, + "loss": 0.0876, + "num_input_tokens_seen": 28175520, + "step": 133500 + }, + { + "epoch": 14.687018701870187, + "grad_norm": 0.010523378849029541, + "learning_rate": 1.000028300215548e-05, + "loss": 0.0104, + "num_input_tokens_seen": 28176576, + "step": 133505 + }, + { + "epoch": 14.687568756875688, + "grad_norm": 0.0636952668428421, + "learning_rate": 9.998362996721989e-06, + "loss": 0.0016, + "num_input_tokens_seen": 28177696, + "step": 133510 + }, + { + "epoch": 14.688118811881187, + "grad_norm": 0.3509669005870819, + "learning_rate": 9.996443129551381e-06, + "loss": 0.0032, + "num_input_tokens_seen": 28178784, + "step": 133515 + }, + { + "epoch": 14.688668866886688, + "grad_norm": 0.012878136709332466, + "learning_rate": 9.994523400661363e-06, + "loss": 0.1207, + "num_input_tokens_seen": 28179808, + "step": 133520 + }, + { + "epoch": 14.68921892189219, + "grad_norm": 0.0017607114277780056, + "learning_rate": 9.992603810069615e-06, + "loss": 0.0073, + "num_input_tokens_seen": 28180928, + "step": 133525 + }, + { + "epoch": 14.689768976897689, + "grad_norm": 0.032385122030973434, + "learning_rate": 9.990684357793836e-06, + "loss": 0.0213, + "num_input_tokens_seen": 28181984, + "step": 133530 + }, + { + "epoch": 14.69031903190319, + "grad_norm": 0.9026927351951599, + "learning_rate": 9.988765043851728e-06, + "loss": 0.0166, + "num_input_tokens_seen": 28183104, + "step": 133535 + }, + { + "epoch": 14.690869086908691, + "grad_norm": 0.01065028551965952, + "learning_rate": 9.986845868260957e-06, + "loss": 0.001, + "num_input_tokens_seen": 28184192, + "step": 133540 + }, + { + "epoch": 14.691419141914192, + "grad_norm": 0.8999691009521484, + "learning_rate": 9.984926831039237e-06, + "loss": 0.0313, + "num_input_tokens_seen": 28185216, + "step": 133545 + }, + { + "epoch": 14.691969196919691, + "grad_norm": 0.06227932497859001, + "learning_rate": 9.983007932204238e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28186240, + "step": 133550 + }, + { + "epoch": 14.692519251925193, + "grad_norm": 0.007423673756420612, + "learning_rate": 9.981089171773642e-06, + "loss": 0.0023, + "num_input_tokens_seen": 28187264, + "step": 133555 + }, + { + "epoch": 14.693069306930694, + "grad_norm": 0.01477449107915163, + "learning_rate": 9.97917054976514e-06, + "loss": 0.0088, + "num_input_tokens_seen": 28188320, + "step": 133560 + }, + { + "epoch": 14.693619361936193, + "grad_norm": 0.005038486327975988, + "learning_rate": 9.977252066196416e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28189472, + "step": 133565 + }, + { + "epoch": 14.694169416941694, + "grad_norm": 0.06446564942598343, + "learning_rate": 9.97533372108516e-06, + "loss": 0.005, + "num_input_tokens_seen": 28190560, + "step": 133570 + }, + { + "epoch": 14.694719471947195, + "grad_norm": 3.8849353790283203, + "learning_rate": 9.973415514449041e-06, + "loss": 0.0327, + "num_input_tokens_seen": 28191552, + "step": 133575 + }, + { + "epoch": 14.695269526952695, + "grad_norm": 0.0575711652636528, + "learning_rate": 9.971497446305734e-06, + "loss": 0.0015, + "num_input_tokens_seen": 28192640, + "step": 133580 + }, + { + "epoch": 14.695819581958196, + "grad_norm": 0.1034640371799469, + "learning_rate": 9.969579516672923e-06, + "loss": 0.01, + "num_input_tokens_seen": 28193728, + "step": 133585 + }, + { + "epoch": 14.696369636963697, + "grad_norm": 0.017250489443540573, + "learning_rate": 9.967661725568292e-06, + "loss": 0.0675, + "num_input_tokens_seen": 28194752, + "step": 133590 + }, + { + "epoch": 14.696919691969196, + "grad_norm": 0.009068213403224945, + "learning_rate": 9.965744073009502e-06, + "loss": 0.0078, + "num_input_tokens_seen": 28195840, + "step": 133595 + }, + { + "epoch": 14.697469746974697, + "grad_norm": 3.557938575744629, + "learning_rate": 9.963826559014248e-06, + "loss": 0.0913, + "num_input_tokens_seen": 28196864, + "step": 133600 + }, + { + "epoch": 14.698019801980198, + "grad_norm": 0.012725882232189178, + "learning_rate": 9.961909183600176e-06, + "loss": 0.0784, + "num_input_tokens_seen": 28197952, + "step": 133605 + }, + { + "epoch": 14.6985698569857, + "grad_norm": 0.1487584412097931, + "learning_rate": 9.95999194678498e-06, + "loss": 0.0318, + "num_input_tokens_seen": 28199008, + "step": 133610 + }, + { + "epoch": 14.699119911991199, + "grad_norm": 0.10775360465049744, + "learning_rate": 9.958074848586315e-06, + "loss": 0.0072, + "num_input_tokens_seen": 28200032, + "step": 133615 + }, + { + "epoch": 14.6996699669967, + "grad_norm": 4.052789688110352, + "learning_rate": 9.956157889021855e-06, + "loss": 0.016, + "num_input_tokens_seen": 28201152, + "step": 133620 + }, + { + "epoch": 14.7002200220022, + "grad_norm": 1.5872576236724854, + "learning_rate": 9.954241068109276e-06, + "loss": 0.0093, + "num_input_tokens_seen": 28202208, + "step": 133625 + }, + { + "epoch": 14.7007700770077, + "grad_norm": 0.034671850502491, + "learning_rate": 9.952324385866232e-06, + "loss": 0.0172, + "num_input_tokens_seen": 28203168, + "step": 133630 + }, + { + "epoch": 14.701320132013201, + "grad_norm": 0.009301742538809776, + "learning_rate": 9.950407842310403e-06, + "loss": 0.0047, + "num_input_tokens_seen": 28204224, + "step": 133635 + }, + { + "epoch": 14.701870187018702, + "grad_norm": 0.025952894240617752, + "learning_rate": 9.94849143745943e-06, + "loss": 0.1298, + "num_input_tokens_seen": 28205216, + "step": 133640 + }, + { + "epoch": 14.702420242024202, + "grad_norm": 0.04697166010737419, + "learning_rate": 9.946575171330993e-06, + "loss": 0.0561, + "num_input_tokens_seen": 28206304, + "step": 133645 + }, + { + "epoch": 14.702970297029703, + "grad_norm": 0.0604221411049366, + "learning_rate": 9.944659043942756e-06, + "loss": 0.0273, + "num_input_tokens_seen": 28207392, + "step": 133650 + }, + { + "epoch": 14.703520352035204, + "grad_norm": 1.4299919605255127, + "learning_rate": 9.942743055312364e-06, + "loss": 0.021, + "num_input_tokens_seen": 28208416, + "step": 133655 + }, + { + "epoch": 14.704070407040705, + "grad_norm": 0.010575052350759506, + "learning_rate": 9.940827205457493e-06, + "loss": 0.0263, + "num_input_tokens_seen": 28209472, + "step": 133660 + }, + { + "epoch": 14.704620462046204, + "grad_norm": 3.299173355102539, + "learning_rate": 9.938911494395792e-06, + "loss": 0.1292, + "num_input_tokens_seen": 28210592, + "step": 133665 + }, + { + "epoch": 14.705170517051705, + "grad_norm": 0.015604863874614239, + "learning_rate": 9.936995922144906e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28211616, + "step": 133670 + }, + { + "epoch": 14.705720572057206, + "grad_norm": 0.009765645489096642, + "learning_rate": 9.935080488722504e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28212672, + "step": 133675 + }, + { + "epoch": 14.706270627062706, + "grad_norm": 0.013912052847445011, + "learning_rate": 9.933165194146236e-06, + "loss": 0.0019, + "num_input_tokens_seen": 28213728, + "step": 133680 + }, + { + "epoch": 14.706820682068207, + "grad_norm": 0.3028756082057953, + "learning_rate": 9.93125003843376e-06, + "loss": 0.0098, + "num_input_tokens_seen": 28214752, + "step": 133685 + }, + { + "epoch": 14.707370737073708, + "grad_norm": 0.1389322727918625, + "learning_rate": 9.929335021602724e-06, + "loss": 0.0033, + "num_input_tokens_seen": 28215808, + "step": 133690 + }, + { + "epoch": 14.707920792079207, + "grad_norm": 0.02009674720466137, + "learning_rate": 9.927420143670766e-06, + "loss": 0.0033, + "num_input_tokens_seen": 28216864, + "step": 133695 + }, + { + "epoch": 14.708470847084708, + "grad_norm": 0.00345843518152833, + "learning_rate": 9.925505404655544e-06, + "loss": 0.0008, + "num_input_tokens_seen": 28217920, + "step": 133700 + }, + { + "epoch": 14.70902090209021, + "grad_norm": 0.18581990897655487, + "learning_rate": 9.923590804574706e-06, + "loss": 0.0739, + "num_input_tokens_seen": 28218912, + "step": 133705 + }, + { + "epoch": 14.70957095709571, + "grad_norm": 0.16089677810668945, + "learning_rate": 9.921676343445905e-06, + "loss": 0.0028, + "num_input_tokens_seen": 28220000, + "step": 133710 + }, + { + "epoch": 14.71012101210121, + "grad_norm": 0.006244942080229521, + "learning_rate": 9.919762021286777e-06, + "loss": 0.1026, + "num_input_tokens_seen": 28221120, + "step": 133715 + }, + { + "epoch": 14.710671067106711, + "grad_norm": 2.651259422302246, + "learning_rate": 9.91784783811496e-06, + "loss": 0.0626, + "num_input_tokens_seen": 28222208, + "step": 133720 + }, + { + "epoch": 14.711221122112212, + "grad_norm": 0.023918574675917625, + "learning_rate": 9.91593379394811e-06, + "loss": 0.0045, + "num_input_tokens_seen": 28223264, + "step": 133725 + }, + { + "epoch": 14.711771177117711, + "grad_norm": 0.009966283105313778, + "learning_rate": 9.914019888803852e-06, + "loss": 0.0055, + "num_input_tokens_seen": 28224288, + "step": 133730 + }, + { + "epoch": 14.712321232123212, + "grad_norm": 0.03267093747854233, + "learning_rate": 9.912106122699832e-06, + "loss": 0.0022, + "num_input_tokens_seen": 28225312, + "step": 133735 + }, + { + "epoch": 14.712871287128714, + "grad_norm": 0.015462814830243587, + "learning_rate": 9.9101924956537e-06, + "loss": 0.003, + "num_input_tokens_seen": 28226368, + "step": 133740 + }, + { + "epoch": 14.713421342134213, + "grad_norm": 0.018116585910320282, + "learning_rate": 9.908279007683074e-06, + "loss": 0.0016, + "num_input_tokens_seen": 28227392, + "step": 133745 + }, + { + "epoch": 14.713971397139714, + "grad_norm": 0.09839033335447311, + "learning_rate": 9.906365658805607e-06, + "loss": 0.0057, + "num_input_tokens_seen": 28228416, + "step": 133750 + }, + { + "epoch": 14.714521452145215, + "grad_norm": 0.009953828528523445, + "learning_rate": 9.904452449038918e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28229504, + "step": 133755 + }, + { + "epoch": 14.715071507150714, + "grad_norm": 2.314893960952759, + "learning_rate": 9.902539378400647e-06, + "loss": 0.0696, + "num_input_tokens_seen": 28230592, + "step": 133760 + }, + { + "epoch": 14.715621562156215, + "grad_norm": 0.109479159116745, + "learning_rate": 9.900626446908431e-06, + "loss": 0.0099, + "num_input_tokens_seen": 28231744, + "step": 133765 + }, + { + "epoch": 14.716171617161717, + "grad_norm": 0.06859633326530457, + "learning_rate": 9.898713654579892e-06, + "loss": 0.0377, + "num_input_tokens_seen": 28232736, + "step": 133770 + }, + { + "epoch": 14.716721672167218, + "grad_norm": 0.021258534863591194, + "learning_rate": 9.896801001432667e-06, + "loss": 0.0089, + "num_input_tokens_seen": 28233792, + "step": 133775 + }, + { + "epoch": 14.717271727172717, + "grad_norm": 0.049740005284547806, + "learning_rate": 9.894888487484386e-06, + "loss": 0.0047, + "num_input_tokens_seen": 28234816, + "step": 133780 + }, + { + "epoch": 14.717821782178218, + "grad_norm": 0.008291934616863728, + "learning_rate": 9.892976112752656e-06, + "loss": 0.1011, + "num_input_tokens_seen": 28235776, + "step": 133785 + }, + { + "epoch": 14.718371837183719, + "grad_norm": 0.020019622519612312, + "learning_rate": 9.891063877255116e-06, + "loss": 0.0467, + "num_input_tokens_seen": 28236832, + "step": 133790 + }, + { + "epoch": 14.718921892189218, + "grad_norm": 0.0076540871523320675, + "learning_rate": 9.889151781009393e-06, + "loss": 0.0029, + "num_input_tokens_seen": 28237888, + "step": 133795 + }, + { + "epoch": 14.71947194719472, + "grad_norm": 0.05064795911312103, + "learning_rate": 9.887239824033115e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28238976, + "step": 133800 + }, + { + "epoch": 14.72002200220022, + "grad_norm": 0.008992968127131462, + "learning_rate": 9.885328006343894e-06, + "loss": 0.0242, + "num_input_tokens_seen": 28239968, + "step": 133805 + }, + { + "epoch": 14.72057205720572, + "grad_norm": 0.05945756658911705, + "learning_rate": 9.883416327959346e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28240960, + "step": 133810 + }, + { + "epoch": 14.721122112211221, + "grad_norm": 0.005570156965404749, + "learning_rate": 9.881504788897103e-06, + "loss": 0.0079, + "num_input_tokens_seen": 28242112, + "step": 133815 + }, + { + "epoch": 14.721672167216722, + "grad_norm": 0.044492948800325394, + "learning_rate": 9.87959338917476e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28243136, + "step": 133820 + }, + { + "epoch": 14.722222222222221, + "grad_norm": 0.005503543186932802, + "learning_rate": 9.877682128809965e-06, + "loss": 0.0025, + "num_input_tokens_seen": 28244160, + "step": 133825 + }, + { + "epoch": 14.722772277227723, + "grad_norm": 0.027067724615335464, + "learning_rate": 9.875771007820317e-06, + "loss": 0.0013, + "num_input_tokens_seen": 28245216, + "step": 133830 + }, + { + "epoch": 14.723322332233224, + "grad_norm": 0.06548184156417847, + "learning_rate": 9.873860026223423e-06, + "loss": 0.0061, + "num_input_tokens_seen": 28246272, + "step": 133835 + }, + { + "epoch": 14.723872387238725, + "grad_norm": 0.5643356442451477, + "learning_rate": 9.87194918403691e-06, + "loss": 0.0563, + "num_input_tokens_seen": 28247360, + "step": 133840 + }, + { + "epoch": 14.724422442244224, + "grad_norm": 0.6038884520530701, + "learning_rate": 9.870038481278377e-06, + "loss": 0.0153, + "num_input_tokens_seen": 28248416, + "step": 133845 + }, + { + "epoch": 14.724972497249725, + "grad_norm": 0.006132534239441156, + "learning_rate": 9.868127917965436e-06, + "loss": 0.0318, + "num_input_tokens_seen": 28249408, + "step": 133850 + }, + { + "epoch": 14.725522552255226, + "grad_norm": 0.019012819975614548, + "learning_rate": 9.86621749411571e-06, + "loss": 0.0696, + "num_input_tokens_seen": 28250496, + "step": 133855 + }, + { + "epoch": 14.726072607260726, + "grad_norm": 2.4638311862945557, + "learning_rate": 9.864307209746785e-06, + "loss": 0.141, + "num_input_tokens_seen": 28251520, + "step": 133860 + }, + { + "epoch": 14.726622662266227, + "grad_norm": 0.1592070609331131, + "learning_rate": 9.862397064876285e-06, + "loss": 0.0028, + "num_input_tokens_seen": 28252576, + "step": 133865 + }, + { + "epoch": 14.727172717271728, + "grad_norm": 0.3444586992263794, + "learning_rate": 9.8604870595218e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28253664, + "step": 133870 + }, + { + "epoch": 14.727722772277227, + "grad_norm": 2.7607083320617676, + "learning_rate": 9.858577193700952e-06, + "loss": 0.0905, + "num_input_tokens_seen": 28254720, + "step": 133875 + }, + { + "epoch": 14.728272827282728, + "grad_norm": 0.0352310873568058, + "learning_rate": 9.856667467431322e-06, + "loss": 0.0018, + "num_input_tokens_seen": 28255776, + "step": 133880 + }, + { + "epoch": 14.72882288228823, + "grad_norm": 0.3035467863082886, + "learning_rate": 9.854757880730522e-06, + "loss": 0.0742, + "num_input_tokens_seen": 28256832, + "step": 133885 + }, + { + "epoch": 14.729372937293729, + "grad_norm": 1.6830230951309204, + "learning_rate": 9.85284843361616e-06, + "loss": 0.0397, + "num_input_tokens_seen": 28257888, + "step": 133890 + }, + { + "epoch": 14.72992299229923, + "grad_norm": 0.027411244809627533, + "learning_rate": 9.850939126105815e-06, + "loss": 0.0044, + "num_input_tokens_seen": 28258944, + "step": 133895 + }, + { + "epoch": 14.73047304730473, + "grad_norm": 0.06404906511306763, + "learning_rate": 9.849029958217107e-06, + "loss": 0.039, + "num_input_tokens_seen": 28260032, + "step": 133900 + }, + { + "epoch": 14.731023102310232, + "grad_norm": 0.015829328447580338, + "learning_rate": 9.847120929967607e-06, + "loss": 0.0367, + "num_input_tokens_seen": 28261152, + "step": 133905 + }, + { + "epoch": 14.731573157315731, + "grad_norm": 0.29534897208213806, + "learning_rate": 9.845212041374927e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28262208, + "step": 133910 + }, + { + "epoch": 14.732123212321232, + "grad_norm": 0.029402432963252068, + "learning_rate": 9.843303292456659e-06, + "loss": 0.002, + "num_input_tokens_seen": 28263264, + "step": 133915 + }, + { + "epoch": 14.732673267326733, + "grad_norm": 0.7900384664535522, + "learning_rate": 9.841394683230388e-06, + "loss": 0.2093, + "num_input_tokens_seen": 28264320, + "step": 133920 + }, + { + "epoch": 14.733223322332233, + "grad_norm": 0.03153722360730171, + "learning_rate": 9.839486213713714e-06, + "loss": 0.0633, + "num_input_tokens_seen": 28265376, + "step": 133925 + }, + { + "epoch": 14.733773377337734, + "grad_norm": 0.015820616856217384, + "learning_rate": 9.837577883924221e-06, + "loss": 0.0252, + "num_input_tokens_seen": 28266432, + "step": 133930 + }, + { + "epoch": 14.734323432343235, + "grad_norm": 0.2763756513595581, + "learning_rate": 9.835669693879482e-06, + "loss": 0.0045, + "num_input_tokens_seen": 28267456, + "step": 133935 + }, + { + "epoch": 14.734873487348734, + "grad_norm": 0.03370387852191925, + "learning_rate": 9.833761643597115e-06, + "loss": 0.0981, + "num_input_tokens_seen": 28268512, + "step": 133940 + }, + { + "epoch": 14.735423542354235, + "grad_norm": 0.0081192497164011, + "learning_rate": 9.83185373309468e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28269536, + "step": 133945 + }, + { + "epoch": 14.735973597359736, + "grad_norm": 0.0031747668981552124, + "learning_rate": 9.829945962389778e-06, + "loss": 0.0102, + "num_input_tokens_seen": 28270592, + "step": 133950 + }, + { + "epoch": 14.736523652365236, + "grad_norm": 0.13496406376361847, + "learning_rate": 9.828038331499987e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28271616, + "step": 133955 + }, + { + "epoch": 14.737073707370737, + "grad_norm": 1.9318305253982544, + "learning_rate": 9.826130840442876e-06, + "loss": 0.0736, + "num_input_tokens_seen": 28272704, + "step": 133960 + }, + { + "epoch": 14.737623762376238, + "grad_norm": 0.09523527324199677, + "learning_rate": 9.82422348923604e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28273792, + "step": 133965 + }, + { + "epoch": 14.738173817381739, + "grad_norm": 0.06372958421707153, + "learning_rate": 9.82231627789706e-06, + "loss": 0.0037, + "num_input_tokens_seen": 28274848, + "step": 133970 + }, + { + "epoch": 14.738723872387238, + "grad_norm": 1.2708618640899658, + "learning_rate": 9.820409206443498e-06, + "loss": 0.0155, + "num_input_tokens_seen": 28275936, + "step": 133975 + }, + { + "epoch": 14.73927392739274, + "grad_norm": 0.012731221504509449, + "learning_rate": 9.818502274892949e-06, + "loss": 0.0019, + "num_input_tokens_seen": 28276960, + "step": 133980 + }, + { + "epoch": 14.73982398239824, + "grad_norm": 0.011032904498279095, + "learning_rate": 9.816595483262974e-06, + "loss": 0.1075, + "num_input_tokens_seen": 28277920, + "step": 133985 + }, + { + "epoch": 14.74037403740374, + "grad_norm": 0.050900887697935104, + "learning_rate": 9.814688831571159e-06, + "loss": 0.0159, + "num_input_tokens_seen": 28278976, + "step": 133990 + }, + { + "epoch": 14.74092409240924, + "grad_norm": 0.07145927101373672, + "learning_rate": 9.812782319835063e-06, + "loss": 0.0376, + "num_input_tokens_seen": 28280000, + "step": 133995 + }, + { + "epoch": 14.741474147414742, + "grad_norm": 0.018436891958117485, + "learning_rate": 9.810875948072265e-06, + "loss": 0.001, + "num_input_tokens_seen": 28281024, + "step": 134000 + }, + { + "epoch": 14.742024202420241, + "grad_norm": 0.0035689068026840687, + "learning_rate": 9.808969716300343e-06, + "loss": 0.0182, + "num_input_tokens_seen": 28282080, + "step": 134005 + }, + { + "epoch": 14.742574257425742, + "grad_norm": 2.4722251892089844, + "learning_rate": 9.807063624536847e-06, + "loss": 0.0638, + "num_input_tokens_seen": 28283072, + "step": 134010 + }, + { + "epoch": 14.743124312431243, + "grad_norm": 0.019820181652903557, + "learning_rate": 9.805157672799367e-06, + "loss": 0.002, + "num_input_tokens_seen": 28284032, + "step": 134015 + }, + { + "epoch": 14.743674367436743, + "grad_norm": 0.019586365669965744, + "learning_rate": 9.803251861105447e-06, + "loss": 0.0735, + "num_input_tokens_seen": 28285024, + "step": 134020 + }, + { + "epoch": 14.744224422442244, + "grad_norm": 0.007965253666043282, + "learning_rate": 9.801346189472666e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28286144, + "step": 134025 + }, + { + "epoch": 14.744774477447745, + "grad_norm": 2.81972074508667, + "learning_rate": 9.799440657918587e-06, + "loss": 0.095, + "num_input_tokens_seen": 28287200, + "step": 134030 + }, + { + "epoch": 14.745324532453246, + "grad_norm": 0.019216032698750496, + "learning_rate": 9.797535266460766e-06, + "loss": 0.002, + "num_input_tokens_seen": 28288224, + "step": 134035 + }, + { + "epoch": 14.745874587458745, + "grad_norm": 0.4656031131744385, + "learning_rate": 9.795630015116775e-06, + "loss": 0.1108, + "num_input_tokens_seen": 28289280, + "step": 134040 + }, + { + "epoch": 14.746424642464246, + "grad_norm": 0.17535053193569183, + "learning_rate": 9.793724903904164e-06, + "loss": 0.1123, + "num_input_tokens_seen": 28290304, + "step": 134045 + }, + { + "epoch": 14.746974697469748, + "grad_norm": 0.03228973597288132, + "learning_rate": 9.791819932840487e-06, + "loss": 0.0089, + "num_input_tokens_seen": 28291328, + "step": 134050 + }, + { + "epoch": 14.747524752475247, + "grad_norm": 0.8051905632019043, + "learning_rate": 9.789915101943309e-06, + "loss": 0.08, + "num_input_tokens_seen": 28292384, + "step": 134055 + }, + { + "epoch": 14.748074807480748, + "grad_norm": 0.017061639577150345, + "learning_rate": 9.788010411230184e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28293472, + "step": 134060 + }, + { + "epoch": 14.748624862486249, + "grad_norm": 0.16973625123500824, + "learning_rate": 9.786105860718675e-06, + "loss": 0.0266, + "num_input_tokens_seen": 28294496, + "step": 134065 + }, + { + "epoch": 14.749174917491748, + "grad_norm": 0.2433072328567505, + "learning_rate": 9.78420145042633e-06, + "loss": 0.003, + "num_input_tokens_seen": 28295552, + "step": 134070 + }, + { + "epoch": 14.74972497249725, + "grad_norm": 0.0033815132919698954, + "learning_rate": 9.782297180370689e-06, + "loss": 0.0323, + "num_input_tokens_seen": 28296608, + "step": 134075 + }, + { + "epoch": 14.75027502750275, + "grad_norm": 0.012036743573844433, + "learning_rate": 9.780393050569315e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28297568, + "step": 134080 + }, + { + "epoch": 14.750825082508252, + "grad_norm": 0.04487147182226181, + "learning_rate": 9.778489061039753e-06, + "loss": 0.0054, + "num_input_tokens_seen": 28298624, + "step": 134085 + }, + { + "epoch": 14.751375137513751, + "grad_norm": 0.025362631306052208, + "learning_rate": 9.776585211799563e-06, + "loss": 0.1306, + "num_input_tokens_seen": 28299712, + "step": 134090 + }, + { + "epoch": 14.751925192519252, + "grad_norm": 0.0732516422867775, + "learning_rate": 9.77468150286628e-06, + "loss": 0.0442, + "num_input_tokens_seen": 28300736, + "step": 134095 + }, + { + "epoch": 14.752475247524753, + "grad_norm": 0.05657421797513962, + "learning_rate": 9.772777934257447e-06, + "loss": 0.0055, + "num_input_tokens_seen": 28301792, + "step": 134100 + }, + { + "epoch": 14.753025302530252, + "grad_norm": 0.13614422082901, + "learning_rate": 9.770874505990619e-06, + "loss": 0.0683, + "num_input_tokens_seen": 28302848, + "step": 134105 + }, + { + "epoch": 14.753575357535754, + "grad_norm": 0.644819974899292, + "learning_rate": 9.768971218083325e-06, + "loss": 0.1533, + "num_input_tokens_seen": 28303904, + "step": 134110 + }, + { + "epoch": 14.754125412541255, + "grad_norm": 0.2089531123638153, + "learning_rate": 9.767068070553116e-06, + "loss": 0.0053, + "num_input_tokens_seen": 28304928, + "step": 134115 + }, + { + "epoch": 14.754675467546754, + "grad_norm": 0.16669932007789612, + "learning_rate": 9.765165063417537e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28305920, + "step": 134120 + }, + { + "epoch": 14.755225522552255, + "grad_norm": 0.05578852444887161, + "learning_rate": 9.763262196694111e-06, + "loss": 0.061, + "num_input_tokens_seen": 28306976, + "step": 134125 + }, + { + "epoch": 14.755775577557756, + "grad_norm": 0.01849646121263504, + "learning_rate": 9.7613594704004e-06, + "loss": 0.0029, + "num_input_tokens_seen": 28308064, + "step": 134130 + }, + { + "epoch": 14.756325632563257, + "grad_norm": 0.24644845724105835, + "learning_rate": 9.759456884553913e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28309088, + "step": 134135 + }, + { + "epoch": 14.756875687568757, + "grad_norm": 0.5885047316551208, + "learning_rate": 9.757554439172203e-06, + "loss": 0.007, + "num_input_tokens_seen": 28310176, + "step": 134140 + }, + { + "epoch": 14.757425742574258, + "grad_norm": 0.06417372822761536, + "learning_rate": 9.755652134272809e-06, + "loss": 0.059, + "num_input_tokens_seen": 28311200, + "step": 134145 + }, + { + "epoch": 14.757975797579759, + "grad_norm": 0.016139551997184753, + "learning_rate": 9.753749969873242e-06, + "loss": 0.056, + "num_input_tokens_seen": 28312224, + "step": 134150 + }, + { + "epoch": 14.758525852585258, + "grad_norm": 0.07887112349271774, + "learning_rate": 9.751847945991055e-06, + "loss": 0.0022, + "num_input_tokens_seen": 28313280, + "step": 134155 + }, + { + "epoch": 14.75907590759076, + "grad_norm": 0.008261101320385933, + "learning_rate": 9.74994606264376e-06, + "loss": 0.0472, + "num_input_tokens_seen": 28314304, + "step": 134160 + }, + { + "epoch": 14.75962596259626, + "grad_norm": 0.012645133771002293, + "learning_rate": 9.748044319848903e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28315296, + "step": 134165 + }, + { + "epoch": 14.76017601760176, + "grad_norm": 3.520453929901123, + "learning_rate": 9.746142717623997e-06, + "loss": 0.0626, + "num_input_tokens_seen": 28316384, + "step": 134170 + }, + { + "epoch": 14.76072607260726, + "grad_norm": 0.04044586420059204, + "learning_rate": 9.744241255986572e-06, + "loss": 0.0055, + "num_input_tokens_seen": 28317408, + "step": 134175 + }, + { + "epoch": 14.761276127612762, + "grad_norm": 0.5388176441192627, + "learning_rate": 9.742339934954164e-06, + "loss": 0.0417, + "num_input_tokens_seen": 28318432, + "step": 134180 + }, + { + "epoch": 14.761826182618261, + "grad_norm": 0.10292263329029083, + "learning_rate": 9.740438754544288e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28319488, + "step": 134185 + }, + { + "epoch": 14.762376237623762, + "grad_norm": 0.015211805701255798, + "learning_rate": 9.738537714774454e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28320544, + "step": 134190 + }, + { + "epoch": 14.762926292629263, + "grad_norm": 0.7181902527809143, + "learning_rate": 9.736636815662197e-06, + "loss": 0.0438, + "num_input_tokens_seen": 28321728, + "step": 134195 + }, + { + "epoch": 14.763476347634764, + "grad_norm": 0.032392896711826324, + "learning_rate": 9.734736057225036e-06, + "loss": 0.0085, + "num_input_tokens_seen": 28322784, + "step": 134200 + }, + { + "epoch": 14.764026402640264, + "grad_norm": 0.02834131196141243, + "learning_rate": 9.732835439480492e-06, + "loss": 0.0037, + "num_input_tokens_seen": 28323744, + "step": 134205 + }, + { + "epoch": 14.764576457645765, + "grad_norm": 0.241450697183609, + "learning_rate": 9.73093496244608e-06, + "loss": 0.0037, + "num_input_tokens_seen": 28324800, + "step": 134210 + }, + { + "epoch": 14.765126512651266, + "grad_norm": 0.05998606979846954, + "learning_rate": 9.729034626139309e-06, + "loss": 0.0786, + "num_input_tokens_seen": 28325888, + "step": 134215 + }, + { + "epoch": 14.765676567656765, + "grad_norm": 0.6841714382171631, + "learning_rate": 9.727134430577703e-06, + "loss": 0.0116, + "num_input_tokens_seen": 28326944, + "step": 134220 + }, + { + "epoch": 14.766226622662266, + "grad_norm": 0.018871722742915154, + "learning_rate": 9.72523437577876e-06, + "loss": 0.0051, + "num_input_tokens_seen": 28328000, + "step": 134225 + }, + { + "epoch": 14.766776677667767, + "grad_norm": 0.008280863985419273, + "learning_rate": 9.723334461760006e-06, + "loss": 0.002, + "num_input_tokens_seen": 28329056, + "step": 134230 + }, + { + "epoch": 14.767326732673267, + "grad_norm": 0.026531901210546494, + "learning_rate": 9.721434688538955e-06, + "loss": 0.0046, + "num_input_tokens_seen": 28330112, + "step": 134235 + }, + { + "epoch": 14.767876787678768, + "grad_norm": 0.002523222705349326, + "learning_rate": 9.7195350561331e-06, + "loss": 0.0157, + "num_input_tokens_seen": 28331136, + "step": 134240 + }, + { + "epoch": 14.768426842684269, + "grad_norm": 0.020581528544425964, + "learning_rate": 9.717635564559966e-06, + "loss": 0.0017, + "num_input_tokens_seen": 28332160, + "step": 134245 + }, + { + "epoch": 14.768976897689768, + "grad_norm": 0.013097128830850124, + "learning_rate": 9.715736213837046e-06, + "loss": 0.0436, + "num_input_tokens_seen": 28333216, + "step": 134250 + }, + { + "epoch": 14.76952695269527, + "grad_norm": 1.0631182193756104, + "learning_rate": 9.713837003981849e-06, + "loss": 0.0952, + "num_input_tokens_seen": 28334240, + "step": 134255 + }, + { + "epoch": 14.77007700770077, + "grad_norm": 0.19499318301677704, + "learning_rate": 9.711937935011888e-06, + "loss": 0.0793, + "num_input_tokens_seen": 28335264, + "step": 134260 + }, + { + "epoch": 14.770627062706271, + "grad_norm": 0.01917671598494053, + "learning_rate": 9.710039006944654e-06, + "loss": 0.0054, + "num_input_tokens_seen": 28336352, + "step": 134265 + }, + { + "epoch": 14.77117711771177, + "grad_norm": 0.38057294487953186, + "learning_rate": 9.708140219797663e-06, + "loss": 0.0424, + "num_input_tokens_seen": 28337408, + "step": 134270 + }, + { + "epoch": 14.771727172717272, + "grad_norm": 0.030967911705374718, + "learning_rate": 9.706241573588393e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28338496, + "step": 134275 + }, + { + "epoch": 14.772277227722773, + "grad_norm": 0.038751740008592606, + "learning_rate": 9.704343068334368e-06, + "loss": 0.0891, + "num_input_tokens_seen": 28339552, + "step": 134280 + }, + { + "epoch": 14.772827282728272, + "grad_norm": 0.10493854433298111, + "learning_rate": 9.702444704053063e-06, + "loss": 0.0261, + "num_input_tokens_seen": 28340576, + "step": 134285 + }, + { + "epoch": 14.773377337733773, + "grad_norm": 0.08400697261095047, + "learning_rate": 9.700546480761985e-06, + "loss": 0.0393, + "num_input_tokens_seen": 28341632, + "step": 134290 + }, + { + "epoch": 14.773927392739274, + "grad_norm": 0.5056151747703552, + "learning_rate": 9.698648398478637e-06, + "loss": 0.0058, + "num_input_tokens_seen": 28342816, + "step": 134295 + }, + { + "epoch": 14.774477447744774, + "grad_norm": 0.05099537968635559, + "learning_rate": 9.696750457220497e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28343808, + "step": 134300 + }, + { + "epoch": 14.775027502750275, + "grad_norm": 0.1730688512325287, + "learning_rate": 9.694852657005071e-06, + "loss": 0.003, + "num_input_tokens_seen": 28344864, + "step": 134305 + }, + { + "epoch": 14.775577557755776, + "grad_norm": 0.5641416311264038, + "learning_rate": 9.692954997849838e-06, + "loss": 0.0053, + "num_input_tokens_seen": 28345952, + "step": 134310 + }, + { + "epoch": 14.776127612761275, + "grad_norm": 0.00755304703488946, + "learning_rate": 9.691057479772292e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28347008, + "step": 134315 + }, + { + "epoch": 14.776677667766776, + "grad_norm": 0.0582122765481472, + "learning_rate": 9.689160102789933e-06, + "loss": 0.006, + "num_input_tokens_seen": 28348096, + "step": 134320 + }, + { + "epoch": 14.777227722772277, + "grad_norm": 0.0532243549823761, + "learning_rate": 9.68726286692023e-06, + "loss": 0.0127, + "num_input_tokens_seen": 28349120, + "step": 134325 + }, + { + "epoch": 14.777777777777779, + "grad_norm": 0.4001016914844513, + "learning_rate": 9.685365772180684e-06, + "loss": 0.0044, + "num_input_tokens_seen": 28350176, + "step": 134330 + }, + { + "epoch": 14.778327832783278, + "grad_norm": 0.00396556593477726, + "learning_rate": 9.683468818588775e-06, + "loss": 0.0636, + "num_input_tokens_seen": 28351296, + "step": 134335 + }, + { + "epoch": 14.778877887788779, + "grad_norm": 0.05545904114842415, + "learning_rate": 9.681572006161976e-06, + "loss": 0.0014, + "num_input_tokens_seen": 28352352, + "step": 134340 + }, + { + "epoch": 14.77942794279428, + "grad_norm": 0.25502851605415344, + "learning_rate": 9.679675334917776e-06, + "loss": 0.1232, + "num_input_tokens_seen": 28353440, + "step": 134345 + }, + { + "epoch": 14.77997799779978, + "grad_norm": 0.24091050028800964, + "learning_rate": 9.677778804873658e-06, + "loss": 0.0039, + "num_input_tokens_seen": 28354496, + "step": 134350 + }, + { + "epoch": 14.78052805280528, + "grad_norm": 0.003745291382074356, + "learning_rate": 9.675882416047107e-06, + "loss": 0.0015, + "num_input_tokens_seen": 28355616, + "step": 134355 + }, + { + "epoch": 14.781078107810782, + "grad_norm": 0.16485056281089783, + "learning_rate": 9.673986168455598e-06, + "loss": 0.0557, + "num_input_tokens_seen": 28356672, + "step": 134360 + }, + { + "epoch": 14.781628162816281, + "grad_norm": 0.37479841709136963, + "learning_rate": 9.672090062116593e-06, + "loss": 0.0039, + "num_input_tokens_seen": 28357792, + "step": 134365 + }, + { + "epoch": 14.782178217821782, + "grad_norm": 0.01772763580083847, + "learning_rate": 9.67019409704758e-06, + "loss": 0.005, + "num_input_tokens_seen": 28358880, + "step": 134370 + }, + { + "epoch": 14.782728272827283, + "grad_norm": 0.019247030839323997, + "learning_rate": 9.66829827326604e-06, + "loss": 0.0044, + "num_input_tokens_seen": 28359904, + "step": 134375 + }, + { + "epoch": 14.783278327832782, + "grad_norm": 0.3185054361820221, + "learning_rate": 9.666402590789431e-06, + "loss": 0.0194, + "num_input_tokens_seen": 28360992, + "step": 134380 + }, + { + "epoch": 14.783828382838283, + "grad_norm": 2.201326847076416, + "learning_rate": 9.664507049635237e-06, + "loss": 0.0465, + "num_input_tokens_seen": 28361984, + "step": 134385 + }, + { + "epoch": 14.784378437843785, + "grad_norm": 0.039467208087444305, + "learning_rate": 9.662611649820915e-06, + "loss": 0.003, + "num_input_tokens_seen": 28362944, + "step": 134390 + }, + { + "epoch": 14.784928492849286, + "grad_norm": 2.79533314704895, + "learning_rate": 9.660716391363953e-06, + "loss": 0.0947, + "num_input_tokens_seen": 28363936, + "step": 134395 + }, + { + "epoch": 14.785478547854785, + "grad_norm": 0.005961306858807802, + "learning_rate": 9.658821274281798e-06, + "loss": 0.0086, + "num_input_tokens_seen": 28364960, + "step": 134400 + }, + { + "epoch": 14.786028602860286, + "grad_norm": 0.04993681237101555, + "learning_rate": 9.656926298591926e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28366016, + "step": 134405 + }, + { + "epoch": 14.786578657865787, + "grad_norm": 0.68498295545578, + "learning_rate": 9.65503146431181e-06, + "loss": 0.0355, + "num_input_tokens_seen": 28367072, + "step": 134410 + }, + { + "epoch": 14.787128712871286, + "grad_norm": 0.012232405133545399, + "learning_rate": 9.653136771458895e-06, + "loss": 0.0048, + "num_input_tokens_seen": 28368128, + "step": 134415 + }, + { + "epoch": 14.787678767876788, + "grad_norm": 0.04613010585308075, + "learning_rate": 9.651242220050666e-06, + "loss": 0.0014, + "num_input_tokens_seen": 28369216, + "step": 134420 + }, + { + "epoch": 14.788228822882289, + "grad_norm": 0.006009348668158054, + "learning_rate": 9.649347810104562e-06, + "loss": 0.0015, + "num_input_tokens_seen": 28370336, + "step": 134425 + }, + { + "epoch": 14.788778877887788, + "grad_norm": 0.03662487491965294, + "learning_rate": 9.647453541638054e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28371328, + "step": 134430 + }, + { + "epoch": 14.789328932893289, + "grad_norm": 0.08478468656539917, + "learning_rate": 9.645559414668609e-06, + "loss": 0.047, + "num_input_tokens_seen": 28372352, + "step": 134435 + }, + { + "epoch": 14.78987898789879, + "grad_norm": 0.019678983837366104, + "learning_rate": 9.643665429213666e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28373376, + "step": 134440 + }, + { + "epoch": 14.79042904290429, + "grad_norm": 0.3531002998352051, + "learning_rate": 9.641771585290699e-06, + "loss": 0.0045, + "num_input_tokens_seen": 28374496, + "step": 134445 + }, + { + "epoch": 14.79097909790979, + "grad_norm": 0.04250076413154602, + "learning_rate": 9.639877882917154e-06, + "loss": 0.0297, + "num_input_tokens_seen": 28375456, + "step": 134450 + }, + { + "epoch": 14.791529152915292, + "grad_norm": 0.013252489268779755, + "learning_rate": 9.637984322110475e-06, + "loss": 0.031, + "num_input_tokens_seen": 28376448, + "step": 134455 + }, + { + "epoch": 14.792079207920793, + "grad_norm": 0.33915191888809204, + "learning_rate": 9.636090902888121e-06, + "loss": 0.0469, + "num_input_tokens_seen": 28377472, + "step": 134460 + }, + { + "epoch": 14.792629262926292, + "grad_norm": 0.012702166102826595, + "learning_rate": 9.634197625267547e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28378528, + "step": 134465 + }, + { + "epoch": 14.793179317931793, + "grad_norm": 0.047504719346761703, + "learning_rate": 9.632304489266209e-06, + "loss": 0.1325, + "num_input_tokens_seen": 28379552, + "step": 134470 + }, + { + "epoch": 14.793729372937294, + "grad_norm": 0.03698543831706047, + "learning_rate": 9.630411494901544e-06, + "loss": 0.0018, + "num_input_tokens_seen": 28380544, + "step": 134475 + }, + { + "epoch": 14.794279427942794, + "grad_norm": 0.960358738899231, + "learning_rate": 9.628518642190996e-06, + "loss": 0.0129, + "num_input_tokens_seen": 28381632, + "step": 134480 + }, + { + "epoch": 14.794829482948295, + "grad_norm": 0.027261747047305107, + "learning_rate": 9.626625931152013e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28382720, + "step": 134485 + }, + { + "epoch": 14.795379537953796, + "grad_norm": 1.9028379917144775, + "learning_rate": 9.624733361802044e-06, + "loss": 0.0754, + "num_input_tokens_seen": 28383712, + "step": 134490 + }, + { + "epoch": 14.795929592959295, + "grad_norm": 0.10506801307201385, + "learning_rate": 9.62284093415854e-06, + "loss": 0.0046, + "num_input_tokens_seen": 28384736, + "step": 134495 + }, + { + "epoch": 14.796479647964796, + "grad_norm": 0.14061152935028076, + "learning_rate": 9.620948648238928e-06, + "loss": 0.0255, + "num_input_tokens_seen": 28385760, + "step": 134500 + }, + { + "epoch": 14.797029702970297, + "grad_norm": 0.032497696578502655, + "learning_rate": 9.619056504060647e-06, + "loss": 0.0013, + "num_input_tokens_seen": 28386816, + "step": 134505 + }, + { + "epoch": 14.797579757975798, + "grad_norm": 0.008741654455661774, + "learning_rate": 9.61716450164115e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28387904, + "step": 134510 + }, + { + "epoch": 14.798129812981298, + "grad_norm": 0.008907482028007507, + "learning_rate": 9.615272640997856e-06, + "loss": 0.0419, + "num_input_tokens_seen": 28388896, + "step": 134515 + }, + { + "epoch": 14.798679867986799, + "grad_norm": 0.11125198751688004, + "learning_rate": 9.613380922148212e-06, + "loss": 0.0083, + "num_input_tokens_seen": 28389952, + "step": 134520 + }, + { + "epoch": 14.7992299229923, + "grad_norm": 0.13585928082466125, + "learning_rate": 9.611489345109664e-06, + "loss": 0.0049, + "num_input_tokens_seen": 28390976, + "step": 134525 + }, + { + "epoch": 14.7997799779978, + "grad_norm": 0.01708090864121914, + "learning_rate": 9.609597909899623e-06, + "loss": 0.004, + "num_input_tokens_seen": 28392160, + "step": 134530 + }, + { + "epoch": 14.8003300330033, + "grad_norm": 6.033710479736328, + "learning_rate": 9.607706616535544e-06, + "loss": 0.1283, + "num_input_tokens_seen": 28393184, + "step": 134535 + }, + { + "epoch": 14.800880088008801, + "grad_norm": 0.026429681107401848, + "learning_rate": 9.605815465034837e-06, + "loss": 0.001, + "num_input_tokens_seen": 28394208, + "step": 134540 + }, + { + "epoch": 14.8014301430143, + "grad_norm": 2.307403087615967, + "learning_rate": 9.603924455414942e-06, + "loss": 0.0302, + "num_input_tokens_seen": 28395264, + "step": 134545 + }, + { + "epoch": 14.801980198019802, + "grad_norm": 3.6204893589019775, + "learning_rate": 9.6020335876933e-06, + "loss": 0.1578, + "num_input_tokens_seen": 28396320, + "step": 134550 + }, + { + "epoch": 14.802530253025303, + "grad_norm": 0.321399062871933, + "learning_rate": 9.600142861887313e-06, + "loss": 0.0218, + "num_input_tokens_seen": 28397376, + "step": 134555 + }, + { + "epoch": 14.803080308030804, + "grad_norm": 1.590975284576416, + "learning_rate": 9.598252278014427e-06, + "loss": 0.046, + "num_input_tokens_seen": 28398400, + "step": 134560 + }, + { + "epoch": 14.803630363036303, + "grad_norm": 0.12010108679533005, + "learning_rate": 9.596361836092063e-06, + "loss": 0.0054, + "num_input_tokens_seen": 28399424, + "step": 134565 + }, + { + "epoch": 14.804180418041804, + "grad_norm": 1.297563910484314, + "learning_rate": 9.59447153613763e-06, + "loss": 0.1904, + "num_input_tokens_seen": 28400480, + "step": 134570 + }, + { + "epoch": 14.804730473047305, + "grad_norm": 0.10135666280984879, + "learning_rate": 9.59258137816856e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28401472, + "step": 134575 + }, + { + "epoch": 14.805280528052805, + "grad_norm": 1.1303763389587402, + "learning_rate": 9.590691362202276e-06, + "loss": 0.0474, + "num_input_tokens_seen": 28402560, + "step": 134580 + }, + { + "epoch": 14.805830583058306, + "grad_norm": 0.006101507693529129, + "learning_rate": 9.588801488256203e-06, + "loss": 0.002, + "num_input_tokens_seen": 28403584, + "step": 134585 + }, + { + "epoch": 14.806380638063807, + "grad_norm": 0.17887340486049652, + "learning_rate": 9.586911756347753e-06, + "loss": 0.0064, + "num_input_tokens_seen": 28404608, + "step": 134590 + }, + { + "epoch": 14.806930693069306, + "grad_norm": 1.3398610353469849, + "learning_rate": 9.585022166494332e-06, + "loss": 0.0508, + "num_input_tokens_seen": 28405632, + "step": 134595 + }, + { + "epoch": 14.807480748074807, + "grad_norm": 0.05559590831398964, + "learning_rate": 9.583132718713372e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28406720, + "step": 134600 + }, + { + "epoch": 14.808030803080309, + "grad_norm": 0.0381532721221447, + "learning_rate": 9.581243413022264e-06, + "loss": 0.002, + "num_input_tokens_seen": 28407712, + "step": 134605 + }, + { + "epoch": 14.808580858085808, + "grad_norm": 0.33038777112960815, + "learning_rate": 9.579354249438454e-06, + "loss": 0.0055, + "num_input_tokens_seen": 28408704, + "step": 134610 + }, + { + "epoch": 14.809130913091309, + "grad_norm": 0.07639887183904648, + "learning_rate": 9.577465227979338e-06, + "loss": 0.0076, + "num_input_tokens_seen": 28409696, + "step": 134615 + }, + { + "epoch": 14.80968096809681, + "grad_norm": 1.6924771070480347, + "learning_rate": 9.575576348662313e-06, + "loss": 0.0784, + "num_input_tokens_seen": 28410816, + "step": 134620 + }, + { + "epoch": 14.810231023102311, + "grad_norm": 0.012752888724207878, + "learning_rate": 9.573687611504812e-06, + "loss": 0.0485, + "num_input_tokens_seen": 28411872, + "step": 134625 + }, + { + "epoch": 14.81078107810781, + "grad_norm": 0.013304822146892548, + "learning_rate": 9.57179901652422e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28412896, + "step": 134630 + }, + { + "epoch": 14.811331133113312, + "grad_norm": 0.09021815657615662, + "learning_rate": 9.569910563737949e-06, + "loss": 0.0123, + "num_input_tokens_seen": 28413984, + "step": 134635 + }, + { + "epoch": 14.811881188118813, + "grad_norm": 0.021642133593559265, + "learning_rate": 9.568022253163422e-06, + "loss": 0.012, + "num_input_tokens_seen": 28415040, + "step": 134640 + }, + { + "epoch": 14.812431243124312, + "grad_norm": 0.5447090268135071, + "learning_rate": 9.566134084818018e-06, + "loss": 0.0096, + "num_input_tokens_seen": 28416032, + "step": 134645 + }, + { + "epoch": 14.812981298129813, + "grad_norm": 0.055656373500823975, + "learning_rate": 9.56424605871916e-06, + "loss": 0.0276, + "num_input_tokens_seen": 28417088, + "step": 134650 + }, + { + "epoch": 14.813531353135314, + "grad_norm": 0.666616678237915, + "learning_rate": 9.562358174884228e-06, + "loss": 0.0152, + "num_input_tokens_seen": 28418176, + "step": 134655 + }, + { + "epoch": 14.814081408140813, + "grad_norm": 0.020226627588272095, + "learning_rate": 9.560470433330645e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28419168, + "step": 134660 + }, + { + "epoch": 14.814631463146315, + "grad_norm": 0.007903119549155235, + "learning_rate": 9.558582834075785e-06, + "loss": 0.0068, + "num_input_tokens_seen": 28420160, + "step": 134665 + }, + { + "epoch": 14.815181518151816, + "grad_norm": 0.07682755589485168, + "learning_rate": 9.556695377137062e-06, + "loss": 0.002, + "num_input_tokens_seen": 28421216, + "step": 134670 + }, + { + "epoch": 14.815731573157315, + "grad_norm": 0.015485410578548908, + "learning_rate": 9.554808062531873e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28422336, + "step": 134675 + }, + { + "epoch": 14.816281628162816, + "grad_norm": 2.7857253551483154, + "learning_rate": 9.5529208902776e-06, + "loss": 0.0299, + "num_input_tokens_seen": 28423424, + "step": 134680 + }, + { + "epoch": 14.816831683168317, + "grad_norm": 0.05082188919186592, + "learning_rate": 9.55103386039165e-06, + "loss": 0.0442, + "num_input_tokens_seen": 28424512, + "step": 134685 + }, + { + "epoch": 14.817381738173818, + "grad_norm": 0.08408145606517792, + "learning_rate": 9.549146972891398e-06, + "loss": 0.0032, + "num_input_tokens_seen": 28425536, + "step": 134690 + }, + { + "epoch": 14.817931793179318, + "grad_norm": 0.01961064711213112, + "learning_rate": 9.547260227794247e-06, + "loss": 0.0069, + "num_input_tokens_seen": 28426624, + "step": 134695 + }, + { + "epoch": 14.818481848184819, + "grad_norm": 1.25093412399292, + "learning_rate": 9.54537362511759e-06, + "loss": 0.0669, + "num_input_tokens_seen": 28427648, + "step": 134700 + }, + { + "epoch": 14.81903190319032, + "grad_norm": 0.0068001458421349525, + "learning_rate": 9.543487164878802e-06, + "loss": 0.1006, + "num_input_tokens_seen": 28428704, + "step": 134705 + }, + { + "epoch": 14.819581958195819, + "grad_norm": 0.01858665980398655, + "learning_rate": 9.541600847095284e-06, + "loss": 0.0077, + "num_input_tokens_seen": 28429824, + "step": 134710 + }, + { + "epoch": 14.82013201320132, + "grad_norm": 0.009059444069862366, + "learning_rate": 9.539714671784413e-06, + "loss": 0.0894, + "num_input_tokens_seen": 28430880, + "step": 134715 + }, + { + "epoch": 14.820682068206821, + "grad_norm": 0.035832975059747696, + "learning_rate": 9.537828638963556e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28431968, + "step": 134720 + }, + { + "epoch": 14.82123212321232, + "grad_norm": 0.01248915959149599, + "learning_rate": 9.53594274865013e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28433024, + "step": 134725 + }, + { + "epoch": 14.821782178217822, + "grad_norm": 0.016229718923568726, + "learning_rate": 9.53405700086149e-06, + "loss": 0.0016, + "num_input_tokens_seen": 28434080, + "step": 134730 + }, + { + "epoch": 14.822332233223323, + "grad_norm": 0.7241396903991699, + "learning_rate": 9.532171395615036e-06, + "loss": 0.0503, + "num_input_tokens_seen": 28435168, + "step": 134735 + }, + { + "epoch": 14.822882288228822, + "grad_norm": 0.04948832467198372, + "learning_rate": 9.530285932928134e-06, + "loss": 0.0076, + "num_input_tokens_seen": 28436224, + "step": 134740 + }, + { + "epoch": 14.823432343234323, + "grad_norm": 0.27644678950309753, + "learning_rate": 9.528400612818156e-06, + "loss": 0.0199, + "num_input_tokens_seen": 28437312, + "step": 134745 + }, + { + "epoch": 14.823982398239824, + "grad_norm": 0.22890079021453857, + "learning_rate": 9.526515435302485e-06, + "loss": 0.0752, + "num_input_tokens_seen": 28438400, + "step": 134750 + }, + { + "epoch": 14.824532453245325, + "grad_norm": 0.006078374572098255, + "learning_rate": 9.524630400398504e-06, + "loss": 0.0043, + "num_input_tokens_seen": 28439488, + "step": 134755 + }, + { + "epoch": 14.825082508250825, + "grad_norm": 0.1900070458650589, + "learning_rate": 9.522745508123573e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28440576, + "step": 134760 + }, + { + "epoch": 14.825632563256326, + "grad_norm": 1.6211470365524292, + "learning_rate": 9.520860758495076e-06, + "loss": 0.0525, + "num_input_tokens_seen": 28441632, + "step": 134765 + }, + { + "epoch": 14.826182618261827, + "grad_norm": 0.045930616557598114, + "learning_rate": 9.51897615153037e-06, + "loss": 0.0544, + "num_input_tokens_seen": 28442720, + "step": 134770 + }, + { + "epoch": 14.826732673267326, + "grad_norm": 0.015591560862958431, + "learning_rate": 9.517091687246842e-06, + "loss": 0.0114, + "num_input_tokens_seen": 28443744, + "step": 134775 + }, + { + "epoch": 14.827282728272827, + "grad_norm": 0.024202262982726097, + "learning_rate": 9.51520736566184e-06, + "loss": 0.0133, + "num_input_tokens_seen": 28444800, + "step": 134780 + }, + { + "epoch": 14.827832783278328, + "grad_norm": 0.04332336410880089, + "learning_rate": 9.513323186792744e-06, + "loss": 0.009, + "num_input_tokens_seen": 28445856, + "step": 134785 + }, + { + "epoch": 14.828382838283828, + "grad_norm": 0.18885837495326996, + "learning_rate": 9.511439150656922e-06, + "loss": 0.002, + "num_input_tokens_seen": 28446912, + "step": 134790 + }, + { + "epoch": 14.828932893289329, + "grad_norm": 0.03950580954551697, + "learning_rate": 9.509555257271728e-06, + "loss": 0.0067, + "num_input_tokens_seen": 28447936, + "step": 134795 + }, + { + "epoch": 14.82948294829483, + "grad_norm": 0.0067748879082500935, + "learning_rate": 9.507671506654536e-06, + "loss": 0.0318, + "num_input_tokens_seen": 28449024, + "step": 134800 + }, + { + "epoch": 14.83003300330033, + "grad_norm": 2.114103078842163, + "learning_rate": 9.505787898822696e-06, + "loss": 0.1027, + "num_input_tokens_seen": 28450048, + "step": 134805 + }, + { + "epoch": 14.83058305830583, + "grad_norm": 0.034101858735084534, + "learning_rate": 9.503904433793573e-06, + "loss": 0.0014, + "num_input_tokens_seen": 28451104, + "step": 134810 + }, + { + "epoch": 14.831133113311331, + "grad_norm": 0.8436809182167053, + "learning_rate": 9.502021111584533e-06, + "loss": 0.0153, + "num_input_tokens_seen": 28452128, + "step": 134815 + }, + { + "epoch": 14.831683168316832, + "grad_norm": 0.15637008845806122, + "learning_rate": 9.500137932212922e-06, + "loss": 0.0232, + "num_input_tokens_seen": 28453184, + "step": 134820 + }, + { + "epoch": 14.832233223322332, + "grad_norm": 0.10791696608066559, + "learning_rate": 9.49825489569611e-06, + "loss": 0.0523, + "num_input_tokens_seen": 28454304, + "step": 134825 + }, + { + "epoch": 14.832783278327833, + "grad_norm": 0.22361740469932556, + "learning_rate": 9.496372002051446e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28455360, + "step": 134830 + }, + { + "epoch": 14.833333333333334, + "grad_norm": 0.047128476202487946, + "learning_rate": 9.494489251296271e-06, + "loss": 0.0022, + "num_input_tokens_seen": 28456352, + "step": 134835 + }, + { + "epoch": 14.833883388338833, + "grad_norm": 0.030450379475951195, + "learning_rate": 9.49260664344795e-06, + "loss": 0.0204, + "num_input_tokens_seen": 28457440, + "step": 134840 + }, + { + "epoch": 14.834433443344334, + "grad_norm": 3.735504388809204, + "learning_rate": 9.49072417852383e-06, + "loss": 0.0796, + "num_input_tokens_seen": 28458432, + "step": 134845 + }, + { + "epoch": 14.834983498349835, + "grad_norm": 0.36248132586479187, + "learning_rate": 9.488841856541272e-06, + "loss": 0.003, + "num_input_tokens_seen": 28459552, + "step": 134850 + }, + { + "epoch": 14.835533553355335, + "grad_norm": 0.12376982718706131, + "learning_rate": 9.486959677517618e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28460608, + "step": 134855 + }, + { + "epoch": 14.836083608360836, + "grad_norm": 0.008450839668512344, + "learning_rate": 9.485077641470203e-06, + "loss": 0.048, + "num_input_tokens_seen": 28461696, + "step": 134860 + }, + { + "epoch": 14.836633663366337, + "grad_norm": 0.023017188534140587, + "learning_rate": 9.483195748416381e-06, + "loss": 0.0106, + "num_input_tokens_seen": 28462784, + "step": 134865 + }, + { + "epoch": 14.837183718371836, + "grad_norm": 0.05194098502397537, + "learning_rate": 9.4813139983735e-06, + "loss": 0.0906, + "num_input_tokens_seen": 28463872, + "step": 134870 + }, + { + "epoch": 14.837733773377337, + "grad_norm": 0.9189838171005249, + "learning_rate": 9.479432391358909e-06, + "loss": 0.0499, + "num_input_tokens_seen": 28464992, + "step": 134875 + }, + { + "epoch": 14.838283828382838, + "grad_norm": 0.19656214118003845, + "learning_rate": 9.477550927389942e-06, + "loss": 0.1041, + "num_input_tokens_seen": 28466048, + "step": 134880 + }, + { + "epoch": 14.83883388338834, + "grad_norm": 0.045308806002140045, + "learning_rate": 9.475669606483933e-06, + "loss": 0.0155, + "num_input_tokens_seen": 28467168, + "step": 134885 + }, + { + "epoch": 14.839383938393839, + "grad_norm": 0.060912180691957474, + "learning_rate": 9.473788428658234e-06, + "loss": 0.0049, + "num_input_tokens_seen": 28468224, + "step": 134890 + }, + { + "epoch": 14.83993399339934, + "grad_norm": 0.022710595279932022, + "learning_rate": 9.471907393930171e-06, + "loss": 0.0053, + "num_input_tokens_seen": 28469280, + "step": 134895 + }, + { + "epoch": 14.840484048404841, + "grad_norm": 0.12474192678928375, + "learning_rate": 9.470026502317084e-06, + "loss": 0.006, + "num_input_tokens_seen": 28470336, + "step": 134900 + }, + { + "epoch": 14.84103410341034, + "grad_norm": 0.5285745859146118, + "learning_rate": 9.468145753836319e-06, + "loss": 0.0072, + "num_input_tokens_seen": 28471360, + "step": 134905 + }, + { + "epoch": 14.841584158415841, + "grad_norm": 1.2490901947021484, + "learning_rate": 9.466265148505196e-06, + "loss": 0.01, + "num_input_tokens_seen": 28472384, + "step": 134910 + }, + { + "epoch": 14.842134213421343, + "grad_norm": 0.5641657114028931, + "learning_rate": 9.46438468634106e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28473408, + "step": 134915 + }, + { + "epoch": 14.842684268426842, + "grad_norm": 0.005805505905300379, + "learning_rate": 9.462504367361227e-06, + "loss": 0.0013, + "num_input_tokens_seen": 28474464, + "step": 134920 + }, + { + "epoch": 14.843234323432343, + "grad_norm": 0.054251786321401596, + "learning_rate": 9.460624191583036e-06, + "loss": 0.018, + "num_input_tokens_seen": 28475520, + "step": 134925 + }, + { + "epoch": 14.843784378437844, + "grad_norm": 0.048247095197439194, + "learning_rate": 9.458744159023824e-06, + "loss": 0.0896, + "num_input_tokens_seen": 28476544, + "step": 134930 + }, + { + "epoch": 14.844334433443345, + "grad_norm": 0.14778804779052734, + "learning_rate": 9.456864269700899e-06, + "loss": 0.0414, + "num_input_tokens_seen": 28477536, + "step": 134935 + }, + { + "epoch": 14.844884488448844, + "grad_norm": 0.017858486622571945, + "learning_rate": 9.45498452363161e-06, + "loss": 0.0526, + "num_input_tokens_seen": 28478656, + "step": 134940 + }, + { + "epoch": 14.845434543454346, + "grad_norm": 0.2215149998664856, + "learning_rate": 9.453104920833266e-06, + "loss": 0.0877, + "num_input_tokens_seen": 28479744, + "step": 134945 + }, + { + "epoch": 14.845984598459847, + "grad_norm": 0.028252920135855675, + "learning_rate": 9.451225461323188e-06, + "loss": 0.001, + "num_input_tokens_seen": 28480832, + "step": 134950 + }, + { + "epoch": 14.846534653465346, + "grad_norm": 2.6815011501312256, + "learning_rate": 9.449346145118704e-06, + "loss": 0.0761, + "num_input_tokens_seen": 28481920, + "step": 134955 + }, + { + "epoch": 14.847084708470847, + "grad_norm": 0.1679019033908844, + "learning_rate": 9.447466972237132e-06, + "loss": 0.062, + "num_input_tokens_seen": 28483008, + "step": 134960 + }, + { + "epoch": 14.847634763476348, + "grad_norm": 0.0075704073533415794, + "learning_rate": 9.445587942695806e-06, + "loss": 0.047, + "num_input_tokens_seen": 28484128, + "step": 134965 + }, + { + "epoch": 14.848184818481847, + "grad_norm": 0.02053126133978367, + "learning_rate": 9.44370905651203e-06, + "loss": 0.0067, + "num_input_tokens_seen": 28485184, + "step": 134970 + }, + { + "epoch": 14.848734873487349, + "grad_norm": 0.026769110932946205, + "learning_rate": 9.441830313703113e-06, + "loss": 0.026, + "num_input_tokens_seen": 28486208, + "step": 134975 + }, + { + "epoch": 14.84928492849285, + "grad_norm": 2.8757028579711914, + "learning_rate": 9.439951714286383e-06, + "loss": 0.0946, + "num_input_tokens_seen": 28487264, + "step": 134980 + }, + { + "epoch": 14.84983498349835, + "grad_norm": 0.07289808243513107, + "learning_rate": 9.438073258279151e-06, + "loss": 0.0017, + "num_input_tokens_seen": 28488288, + "step": 134985 + }, + { + "epoch": 14.85038503850385, + "grad_norm": 0.025274943560361862, + "learning_rate": 9.436194945698736e-06, + "loss": 0.0391, + "num_input_tokens_seen": 28489248, + "step": 134990 + }, + { + "epoch": 14.850935093509351, + "grad_norm": 0.03118116594851017, + "learning_rate": 9.43431677656245e-06, + "loss": 0.0812, + "num_input_tokens_seen": 28490304, + "step": 134995 + }, + { + "epoch": 14.851485148514852, + "grad_norm": 0.19826707243919373, + "learning_rate": 9.432438750887584e-06, + "loss": 0.0349, + "num_input_tokens_seen": 28491360, + "step": 135000 + }, + { + "epoch": 14.852035203520352, + "grad_norm": 0.033483702689409256, + "learning_rate": 9.430560868691468e-06, + "loss": 0.1142, + "num_input_tokens_seen": 28492416, + "step": 135005 + }, + { + "epoch": 14.852585258525853, + "grad_norm": 0.009524354711174965, + "learning_rate": 9.428683129991395e-06, + "loss": 0.0091, + "num_input_tokens_seen": 28493504, + "step": 135010 + }, + { + "epoch": 14.853135313531354, + "grad_norm": 0.3858366310596466, + "learning_rate": 9.426805534804678e-06, + "loss": 0.1254, + "num_input_tokens_seen": 28494656, + "step": 135015 + }, + { + "epoch": 14.853685368536853, + "grad_norm": 0.015642857179045677, + "learning_rate": 9.424928083148629e-06, + "loss": 0.0563, + "num_input_tokens_seen": 28495744, + "step": 135020 + }, + { + "epoch": 14.854235423542354, + "grad_norm": 0.005371910985559225, + "learning_rate": 9.423050775040537e-06, + "loss": 0.0819, + "num_input_tokens_seen": 28496832, + "step": 135025 + }, + { + "epoch": 14.854785478547855, + "grad_norm": 0.5863136053085327, + "learning_rate": 9.42117361049772e-06, + "loss": 0.0393, + "num_input_tokens_seen": 28497888, + "step": 135030 + }, + { + "epoch": 14.855335533553355, + "grad_norm": 0.019936952739953995, + "learning_rate": 9.419296589537461e-06, + "loss": 0.0121, + "num_input_tokens_seen": 28498944, + "step": 135035 + }, + { + "epoch": 14.855885588558856, + "grad_norm": 0.13357916474342346, + "learning_rate": 9.41741971217707e-06, + "loss": 0.0021, + "num_input_tokens_seen": 28499968, + "step": 135040 + }, + { + "epoch": 14.856435643564357, + "grad_norm": 0.020643046125769615, + "learning_rate": 9.415542978433851e-06, + "loss": 0.004, + "num_input_tokens_seen": 28501024, + "step": 135045 + }, + { + "epoch": 14.856985698569858, + "grad_norm": 0.010653827339410782, + "learning_rate": 9.413666388325087e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28502112, + "step": 135050 + }, + { + "epoch": 14.857535753575357, + "grad_norm": 6.091559410095215, + "learning_rate": 9.411789941868088e-06, + "loss": 0.0594, + "num_input_tokens_seen": 28503168, + "step": 135055 + }, + { + "epoch": 14.858085808580858, + "grad_norm": 1.7770227193832397, + "learning_rate": 9.409913639080135e-06, + "loss": 0.065, + "num_input_tokens_seen": 28504224, + "step": 135060 + }, + { + "epoch": 14.85863586358636, + "grad_norm": 0.07825268805027008, + "learning_rate": 9.408037479978534e-06, + "loss": 0.0037, + "num_input_tokens_seen": 28505312, + "step": 135065 + }, + { + "epoch": 14.859185918591859, + "grad_norm": 0.007489589508622885, + "learning_rate": 9.406161464580563e-06, + "loss": 0.0029, + "num_input_tokens_seen": 28506400, + "step": 135070 + }, + { + "epoch": 14.85973597359736, + "grad_norm": 0.6367296576499939, + "learning_rate": 9.404285592903517e-06, + "loss": 0.0094, + "num_input_tokens_seen": 28507424, + "step": 135075 + }, + { + "epoch": 14.86028602860286, + "grad_norm": 3.1156296730041504, + "learning_rate": 9.402409864964695e-06, + "loss": 0.0315, + "num_input_tokens_seen": 28508416, + "step": 135080 + }, + { + "epoch": 14.86083608360836, + "grad_norm": 0.07322607934474945, + "learning_rate": 9.40053428078137e-06, + "loss": 0.0019, + "num_input_tokens_seen": 28509472, + "step": 135085 + }, + { + "epoch": 14.861386138613861, + "grad_norm": 0.05429638922214508, + "learning_rate": 9.398658840370844e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28510496, + "step": 135090 + }, + { + "epoch": 14.861936193619362, + "grad_norm": 0.07032449543476105, + "learning_rate": 9.396783543750384e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28511616, + "step": 135095 + }, + { + "epoch": 14.862486248624862, + "grad_norm": 1.366246223449707, + "learning_rate": 9.394908390937282e-06, + "loss": 0.0313, + "num_input_tokens_seen": 28512672, + "step": 135100 + }, + { + "epoch": 14.863036303630363, + "grad_norm": 0.00292785931378603, + "learning_rate": 9.393033381948831e-06, + "loss": 0.2129, + "num_input_tokens_seen": 28513696, + "step": 135105 + }, + { + "epoch": 14.863586358635864, + "grad_norm": 0.0042138490825891495, + "learning_rate": 9.391158516802295e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28514752, + "step": 135110 + }, + { + "epoch": 14.864136413641365, + "grad_norm": 1.699600100517273, + "learning_rate": 9.389283795514967e-06, + "loss": 0.0193, + "num_input_tokens_seen": 28515936, + "step": 135115 + }, + { + "epoch": 14.864686468646864, + "grad_norm": 0.02203618735074997, + "learning_rate": 9.38740921810412e-06, + "loss": 0.0573, + "num_input_tokens_seen": 28516992, + "step": 135120 + }, + { + "epoch": 14.865236523652365, + "grad_norm": 0.007700526621192694, + "learning_rate": 9.385534784587025e-06, + "loss": 0.0023, + "num_input_tokens_seen": 28518016, + "step": 135125 + }, + { + "epoch": 14.865786578657866, + "grad_norm": 0.007558898534625769, + "learning_rate": 9.383660494980962e-06, + "loss": 0.0107, + "num_input_tokens_seen": 28519072, + "step": 135130 + }, + { + "epoch": 14.866336633663366, + "grad_norm": 0.017482660710811615, + "learning_rate": 9.381786349303212e-06, + "loss": 0.0558, + "num_input_tokens_seen": 28520128, + "step": 135135 + }, + { + "epoch": 14.866886688668867, + "grad_norm": 0.24968905746936798, + "learning_rate": 9.379912347571038e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28521216, + "step": 135140 + }, + { + "epoch": 14.867436743674368, + "grad_norm": 0.02979844994843006, + "learning_rate": 9.378038489801724e-06, + "loss": 0.1033, + "num_input_tokens_seen": 28522336, + "step": 135145 + }, + { + "epoch": 14.867986798679867, + "grad_norm": 0.018490230664610863, + "learning_rate": 9.376164776012524e-06, + "loss": 0.0664, + "num_input_tokens_seen": 28523360, + "step": 135150 + }, + { + "epoch": 14.868536853685368, + "grad_norm": 0.050897639244794846, + "learning_rate": 9.374291206220718e-06, + "loss": 0.0029, + "num_input_tokens_seen": 28524416, + "step": 135155 + }, + { + "epoch": 14.86908690869087, + "grad_norm": 0.06656377017498016, + "learning_rate": 9.37241778044358e-06, + "loss": 0.0103, + "num_input_tokens_seen": 28525504, + "step": 135160 + }, + { + "epoch": 14.869636963696369, + "grad_norm": 0.031562477350234985, + "learning_rate": 9.370544498698359e-06, + "loss": 0.0057, + "num_input_tokens_seen": 28526528, + "step": 135165 + }, + { + "epoch": 14.87018701870187, + "grad_norm": 2.4805896282196045, + "learning_rate": 9.368671361002338e-06, + "loss": 0.2002, + "num_input_tokens_seen": 28527616, + "step": 135170 + }, + { + "epoch": 14.870737073707371, + "grad_norm": 1.4338918924331665, + "learning_rate": 9.366798367372764e-06, + "loss": 0.012, + "num_input_tokens_seen": 28528640, + "step": 135175 + }, + { + "epoch": 14.871287128712872, + "grad_norm": 2.6580119132995605, + "learning_rate": 9.364925517826917e-06, + "loss": 0.0659, + "num_input_tokens_seen": 28529696, + "step": 135180 + }, + { + "epoch": 14.871837183718371, + "grad_norm": 0.06827875226736069, + "learning_rate": 9.36305281238204e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28530720, + "step": 135185 + }, + { + "epoch": 14.872387238723872, + "grad_norm": 0.022730056196451187, + "learning_rate": 9.361180251055402e-06, + "loss": 0.002, + "num_input_tokens_seen": 28531776, + "step": 135190 + }, + { + "epoch": 14.872937293729374, + "grad_norm": 0.1474473923444748, + "learning_rate": 9.35930783386427e-06, + "loss": 0.019, + "num_input_tokens_seen": 28532800, + "step": 135195 + }, + { + "epoch": 14.873487348734873, + "grad_norm": 0.17761360108852386, + "learning_rate": 9.357435560825883e-06, + "loss": 0.0066, + "num_input_tokens_seen": 28533824, + "step": 135200 + }, + { + "epoch": 14.874037403740374, + "grad_norm": 3.163635730743408, + "learning_rate": 9.355563431957517e-06, + "loss": 0.0338, + "num_input_tokens_seen": 28534912, + "step": 135205 + }, + { + "epoch": 14.874587458745875, + "grad_norm": 0.04256192222237587, + "learning_rate": 9.353691447276409e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28536000, + "step": 135210 + }, + { + "epoch": 14.875137513751374, + "grad_norm": 0.7826997637748718, + "learning_rate": 9.351819606799814e-06, + "loss": 0.0086, + "num_input_tokens_seen": 28537088, + "step": 135215 + }, + { + "epoch": 14.875687568756875, + "grad_norm": 1.8872153759002686, + "learning_rate": 9.349947910545001e-06, + "loss": 0.0738, + "num_input_tokens_seen": 28538176, + "step": 135220 + }, + { + "epoch": 14.876237623762377, + "grad_norm": 0.0342671237885952, + "learning_rate": 9.348076358529198e-06, + "loss": 0.003, + "num_input_tokens_seen": 28539232, + "step": 135225 + }, + { + "epoch": 14.876787678767876, + "grad_norm": 0.046678368002176285, + "learning_rate": 9.346204950769673e-06, + "loss": 0.0029, + "num_input_tokens_seen": 28540256, + "step": 135230 + }, + { + "epoch": 14.877337733773377, + "grad_norm": 1.5118064880371094, + "learning_rate": 9.344333687283668e-06, + "loss": 0.0252, + "num_input_tokens_seen": 28541376, + "step": 135235 + }, + { + "epoch": 14.877887788778878, + "grad_norm": 0.03702834993600845, + "learning_rate": 9.342462568088416e-06, + "loss": 0.0055, + "num_input_tokens_seen": 28542432, + "step": 135240 + }, + { + "epoch": 14.87843784378438, + "grad_norm": 0.018224909901618958, + "learning_rate": 9.340591593201175e-06, + "loss": 0.0015, + "num_input_tokens_seen": 28543488, + "step": 135245 + }, + { + "epoch": 14.878987898789878, + "grad_norm": 0.051652297377586365, + "learning_rate": 9.33872076263919e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28544608, + "step": 135250 + }, + { + "epoch": 14.87953795379538, + "grad_norm": 0.013943159021437168, + "learning_rate": 9.336850076419704e-06, + "loss": 0.066, + "num_input_tokens_seen": 28545696, + "step": 135255 + }, + { + "epoch": 14.88008800880088, + "grad_norm": 0.03631392493844032, + "learning_rate": 9.334979534559957e-06, + "loss": 0.091, + "num_input_tokens_seen": 28546752, + "step": 135260 + }, + { + "epoch": 14.88063806380638, + "grad_norm": 0.009882154874503613, + "learning_rate": 9.333109137077178e-06, + "loss": 0.0554, + "num_input_tokens_seen": 28547744, + "step": 135265 + }, + { + "epoch": 14.881188118811881, + "grad_norm": 0.22530026733875275, + "learning_rate": 9.331238883988616e-06, + "loss": 0.0582, + "num_input_tokens_seen": 28548768, + "step": 135270 + }, + { + "epoch": 14.881738173817382, + "grad_norm": 1.9852268695831299, + "learning_rate": 9.329368775311503e-06, + "loss": 0.1058, + "num_input_tokens_seen": 28549856, + "step": 135275 + }, + { + "epoch": 14.882288228822881, + "grad_norm": 0.10741667449474335, + "learning_rate": 9.32749881106309e-06, + "loss": 0.0178, + "num_input_tokens_seen": 28550976, + "step": 135280 + }, + { + "epoch": 14.882838283828383, + "grad_norm": 0.022508710622787476, + "learning_rate": 9.325628991260602e-06, + "loss": 0.0097, + "num_input_tokens_seen": 28552032, + "step": 135285 + }, + { + "epoch": 14.883388338833884, + "grad_norm": 2.846116304397583, + "learning_rate": 9.323759315921258e-06, + "loss": 0.1113, + "num_input_tokens_seen": 28553088, + "step": 135290 + }, + { + "epoch": 14.883938393839383, + "grad_norm": 0.5565772652626038, + "learning_rate": 9.321889785062312e-06, + "loss": 0.004, + "num_input_tokens_seen": 28554208, + "step": 135295 + }, + { + "epoch": 14.884488448844884, + "grad_norm": 0.005145155359059572, + "learning_rate": 9.320020398700977e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28555232, + "step": 135300 + }, + { + "epoch": 14.885038503850385, + "grad_norm": 0.07493823766708374, + "learning_rate": 9.318151156854487e-06, + "loss": 0.0365, + "num_input_tokens_seen": 28556352, + "step": 135305 + }, + { + "epoch": 14.885588558855886, + "grad_norm": 1.2322601079940796, + "learning_rate": 9.316282059540085e-06, + "loss": 0.1481, + "num_input_tokens_seen": 28557408, + "step": 135310 + }, + { + "epoch": 14.886138613861386, + "grad_norm": 0.2681088149547577, + "learning_rate": 9.314413106774975e-06, + "loss": 0.0453, + "num_input_tokens_seen": 28558432, + "step": 135315 + }, + { + "epoch": 14.886688668866887, + "grad_norm": 0.02786741591989994, + "learning_rate": 9.3125442985764e-06, + "loss": 0.0091, + "num_input_tokens_seen": 28559488, + "step": 135320 + }, + { + "epoch": 14.887238723872388, + "grad_norm": 2.095466136932373, + "learning_rate": 9.31067563496158e-06, + "loss": 0.0827, + "num_input_tokens_seen": 28560480, + "step": 135325 + }, + { + "epoch": 14.887788778877887, + "grad_norm": 1.7906455993652344, + "learning_rate": 9.308807115947715e-06, + "loss": 0.0684, + "num_input_tokens_seen": 28561536, + "step": 135330 + }, + { + "epoch": 14.888338833883388, + "grad_norm": 1.8316274881362915, + "learning_rate": 9.306938741552063e-06, + "loss": 0.03, + "num_input_tokens_seen": 28562624, + "step": 135335 + }, + { + "epoch": 14.88888888888889, + "grad_norm": 0.02134726755321026, + "learning_rate": 9.305070511791816e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28563712, + "step": 135340 + }, + { + "epoch": 14.88943894389439, + "grad_norm": 0.07552114874124527, + "learning_rate": 9.303202426684216e-06, + "loss": 0.0326, + "num_input_tokens_seen": 28564736, + "step": 135345 + }, + { + "epoch": 14.88998899889989, + "grad_norm": 0.11723954975605011, + "learning_rate": 9.301334486246463e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28565760, + "step": 135350 + }, + { + "epoch": 14.89053905390539, + "grad_norm": 2.3884356021881104, + "learning_rate": 9.29946669049577e-06, + "loss": 0.1045, + "num_input_tokens_seen": 28566816, + "step": 135355 + }, + { + "epoch": 14.891089108910892, + "grad_norm": 0.01635941118001938, + "learning_rate": 9.29759903944936e-06, + "loss": 0.0332, + "num_input_tokens_seen": 28567936, + "step": 135360 + }, + { + "epoch": 14.891639163916391, + "grad_norm": 0.0102480323985219, + "learning_rate": 9.295731533124444e-06, + "loss": 0.0253, + "num_input_tokens_seen": 28568960, + "step": 135365 + }, + { + "epoch": 14.892189218921892, + "grad_norm": 0.04788690432906151, + "learning_rate": 9.293864171538242e-06, + "loss": 0.0047, + "num_input_tokens_seen": 28569984, + "step": 135370 + }, + { + "epoch": 14.892739273927393, + "grad_norm": 0.18273043632507324, + "learning_rate": 9.291996954707958e-06, + "loss": 0.0046, + "num_input_tokens_seen": 28570976, + "step": 135375 + }, + { + "epoch": 14.893289328932893, + "grad_norm": 0.010898313485085964, + "learning_rate": 9.290129882650791e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28572000, + "step": 135380 + }, + { + "epoch": 14.893839383938394, + "grad_norm": 0.13838323950767517, + "learning_rate": 9.28826295538397e-06, + "loss": 0.0084, + "num_input_tokens_seen": 28573088, + "step": 135385 + }, + { + "epoch": 14.894389438943895, + "grad_norm": 0.08511539548635483, + "learning_rate": 9.286396172924672e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28574144, + "step": 135390 + }, + { + "epoch": 14.894939493949394, + "grad_norm": 3.8786299228668213, + "learning_rate": 9.284529535290137e-06, + "loss": 0.0426, + "num_input_tokens_seen": 28575104, + "step": 135395 + }, + { + "epoch": 14.895489548954895, + "grad_norm": 0.1443406194448471, + "learning_rate": 9.282663042497553e-06, + "loss": 0.0439, + "num_input_tokens_seen": 28576224, + "step": 135400 + }, + { + "epoch": 14.896039603960396, + "grad_norm": 0.02440936677157879, + "learning_rate": 9.280796694564111e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28577216, + "step": 135405 + }, + { + "epoch": 14.896589658965897, + "grad_norm": 1.647602915763855, + "learning_rate": 9.278930491507031e-06, + "loss": 0.0283, + "num_input_tokens_seen": 28578304, + "step": 135410 + }, + { + "epoch": 14.897139713971397, + "grad_norm": 1.75662362575531, + "learning_rate": 9.277064433343502e-06, + "loss": 0.0294, + "num_input_tokens_seen": 28579264, + "step": 135415 + }, + { + "epoch": 14.897689768976898, + "grad_norm": 0.025011178106069565, + "learning_rate": 9.27519852009072e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28580320, + "step": 135420 + }, + { + "epoch": 14.898239823982399, + "grad_norm": 1.611901044845581, + "learning_rate": 9.273332751765899e-06, + "loss": 0.0084, + "num_input_tokens_seen": 28581344, + "step": 135425 + }, + { + "epoch": 14.898789878987898, + "grad_norm": 1.8734216690063477, + "learning_rate": 9.271467128386213e-06, + "loss": 0.0123, + "num_input_tokens_seen": 28582464, + "step": 135430 + }, + { + "epoch": 14.8993399339934, + "grad_norm": 0.035834360867738724, + "learning_rate": 9.269601649968878e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28583520, + "step": 135435 + }, + { + "epoch": 14.8998899889989, + "grad_norm": 1.0190616846084595, + "learning_rate": 9.267736316531067e-06, + "loss": 0.0083, + "num_input_tokens_seen": 28584544, + "step": 135440 + }, + { + "epoch": 14.9004400440044, + "grad_norm": 0.47880056500434875, + "learning_rate": 9.265871128089989e-06, + "loss": 0.0081, + "num_input_tokens_seen": 28585600, + "step": 135445 + }, + { + "epoch": 14.900990099009901, + "grad_norm": 2.3418259620666504, + "learning_rate": 9.264006084662821e-06, + "loss": 0.0496, + "num_input_tokens_seen": 28586624, + "step": 135450 + }, + { + "epoch": 14.901540154015402, + "grad_norm": 0.031225698068737984, + "learning_rate": 9.262141186266756e-06, + "loss": 0.0138, + "num_input_tokens_seen": 28587648, + "step": 135455 + }, + { + "epoch": 14.902090209020901, + "grad_norm": 0.018838126212358475, + "learning_rate": 9.260276432918993e-06, + "loss": 0.0167, + "num_input_tokens_seen": 28588736, + "step": 135460 + }, + { + "epoch": 14.902640264026402, + "grad_norm": 0.02984931506216526, + "learning_rate": 9.258411824636701e-06, + "loss": 0.0111, + "num_input_tokens_seen": 28589760, + "step": 135465 + }, + { + "epoch": 14.903190319031903, + "grad_norm": 4.05570650100708, + "learning_rate": 9.256547361437082e-06, + "loss": 0.0692, + "num_input_tokens_seen": 28590784, + "step": 135470 + }, + { + "epoch": 14.903740374037405, + "grad_norm": 0.021220305934548378, + "learning_rate": 9.2546830433373e-06, + "loss": 0.0458, + "num_input_tokens_seen": 28591840, + "step": 135475 + }, + { + "epoch": 14.904290429042904, + "grad_norm": 0.005608230829238892, + "learning_rate": 9.252818870354554e-06, + "loss": 0.0393, + "num_input_tokens_seen": 28592832, + "step": 135480 + }, + { + "epoch": 14.904840484048405, + "grad_norm": 0.07151266932487488, + "learning_rate": 9.250954842506026e-06, + "loss": 0.0025, + "num_input_tokens_seen": 28593792, + "step": 135485 + }, + { + "epoch": 14.905390539053906, + "grad_norm": 0.020563101395964622, + "learning_rate": 9.249090959808881e-06, + "loss": 0.0043, + "num_input_tokens_seen": 28594880, + "step": 135490 + }, + { + "epoch": 14.905940594059405, + "grad_norm": 3.3852527141571045, + "learning_rate": 9.24722722228032e-06, + "loss": 0.0197, + "num_input_tokens_seen": 28595872, + "step": 135495 + }, + { + "epoch": 14.906490649064907, + "grad_norm": 0.08026695996522903, + "learning_rate": 9.2453636299375e-06, + "loss": 0.0019, + "num_input_tokens_seen": 28596864, + "step": 135500 + }, + { + "epoch": 14.907040704070408, + "grad_norm": 0.00657885055989027, + "learning_rate": 9.24350018279759e-06, + "loss": 0.002, + "num_input_tokens_seen": 28597824, + "step": 135505 + }, + { + "epoch": 14.907590759075907, + "grad_norm": 0.006198568269610405, + "learning_rate": 9.241636880877797e-06, + "loss": 0.0021, + "num_input_tokens_seen": 28598880, + "step": 135510 + }, + { + "epoch": 14.908140814081408, + "grad_norm": 0.06977599114179611, + "learning_rate": 9.239773724195272e-06, + "loss": 0.0625, + "num_input_tokens_seen": 28599904, + "step": 135515 + }, + { + "epoch": 14.908690869086909, + "grad_norm": 0.213098406791687, + "learning_rate": 9.23791071276718e-06, + "loss": 0.0331, + "num_input_tokens_seen": 28600992, + "step": 135520 + }, + { + "epoch": 14.909240924092408, + "grad_norm": 0.05812736228108406, + "learning_rate": 9.236047846610716e-06, + "loss": 0.0706, + "num_input_tokens_seen": 28602048, + "step": 135525 + }, + { + "epoch": 14.90979097909791, + "grad_norm": 0.053905487060546875, + "learning_rate": 9.234185125743023e-06, + "loss": 0.0126, + "num_input_tokens_seen": 28603168, + "step": 135530 + }, + { + "epoch": 14.91034103410341, + "grad_norm": 0.1829836666584015, + "learning_rate": 9.23232255018128e-06, + "loss": 0.0221, + "num_input_tokens_seen": 28604320, + "step": 135535 + }, + { + "epoch": 14.910891089108912, + "grad_norm": 0.5592558979988098, + "learning_rate": 9.230460119942664e-06, + "loss": 0.0097, + "num_input_tokens_seen": 28605440, + "step": 135540 + }, + { + "epoch": 14.911441144114411, + "grad_norm": 0.2784270644187927, + "learning_rate": 9.22859783504432e-06, + "loss": 0.0244, + "num_input_tokens_seen": 28606464, + "step": 135545 + }, + { + "epoch": 14.911991199119912, + "grad_norm": 0.05943021923303604, + "learning_rate": 9.22673569550343e-06, + "loss": 0.0561, + "num_input_tokens_seen": 28607584, + "step": 135550 + }, + { + "epoch": 14.912541254125413, + "grad_norm": 0.15954206883907318, + "learning_rate": 9.224873701337141e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28608640, + "step": 135555 + }, + { + "epoch": 14.913091309130913, + "grad_norm": 0.07747966796159744, + "learning_rate": 9.22301185256263e-06, + "loss": 0.0676, + "num_input_tokens_seen": 28609696, + "step": 135560 + }, + { + "epoch": 14.913641364136414, + "grad_norm": 0.01079083513468504, + "learning_rate": 9.221150149197038e-06, + "loss": 0.0535, + "num_input_tokens_seen": 28610688, + "step": 135565 + }, + { + "epoch": 14.914191419141915, + "grad_norm": 0.042861711233854294, + "learning_rate": 9.219288591257536e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28611712, + "step": 135570 + }, + { + "epoch": 14.914741474147414, + "grad_norm": 0.1367328017950058, + "learning_rate": 9.217427178761287e-06, + "loss": 0.0078, + "num_input_tokens_seen": 28612736, + "step": 135575 + }, + { + "epoch": 14.915291529152915, + "grad_norm": 0.02468530274927616, + "learning_rate": 9.215565911725429e-06, + "loss": 0.0546, + "num_input_tokens_seen": 28613888, + "step": 135580 + }, + { + "epoch": 14.915841584158416, + "grad_norm": 0.005157250910997391, + "learning_rate": 9.213704790167135e-06, + "loss": 0.0006, + "num_input_tokens_seen": 28614944, + "step": 135585 + }, + { + "epoch": 14.916391639163916, + "grad_norm": 0.021342480555176735, + "learning_rate": 9.21184381410354e-06, + "loss": 0.0738, + "num_input_tokens_seen": 28616064, + "step": 135590 + }, + { + "epoch": 14.916941694169417, + "grad_norm": 0.008742145262658596, + "learning_rate": 9.209982983551804e-06, + "loss": 0.0067, + "num_input_tokens_seen": 28617184, + "step": 135595 + }, + { + "epoch": 14.917491749174918, + "grad_norm": 0.025426462292671204, + "learning_rate": 9.208122298529087e-06, + "loss": 0.0018, + "num_input_tokens_seen": 28618304, + "step": 135600 + }, + { + "epoch": 14.918041804180419, + "grad_norm": 0.008314333856105804, + "learning_rate": 9.206261759052523e-06, + "loss": 0.0404, + "num_input_tokens_seen": 28619424, + "step": 135605 + }, + { + "epoch": 14.918591859185918, + "grad_norm": 0.04706871137022972, + "learning_rate": 9.20440136513927e-06, + "loss": 0.048, + "num_input_tokens_seen": 28620448, + "step": 135610 + }, + { + "epoch": 14.91914191419142, + "grad_norm": 0.025254134088754654, + "learning_rate": 9.202541116806473e-06, + "loss": 0.0466, + "num_input_tokens_seen": 28621408, + "step": 135615 + }, + { + "epoch": 14.91969196919692, + "grad_norm": 0.03357452154159546, + "learning_rate": 9.200681014071267e-06, + "loss": 0.0054, + "num_input_tokens_seen": 28622432, + "step": 135620 + }, + { + "epoch": 14.92024202420242, + "grad_norm": 0.018577737733721733, + "learning_rate": 9.198821056950799e-06, + "loss": 0.0013, + "num_input_tokens_seen": 28623456, + "step": 135625 + }, + { + "epoch": 14.92079207920792, + "grad_norm": 0.026052139699459076, + "learning_rate": 9.196961245462218e-06, + "loss": 0.0882, + "num_input_tokens_seen": 28624448, + "step": 135630 + }, + { + "epoch": 14.921342134213422, + "grad_norm": 0.061845723539590836, + "learning_rate": 9.19510157962267e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28625472, + "step": 135635 + }, + { + "epoch": 14.921892189218921, + "grad_norm": 0.015060827136039734, + "learning_rate": 9.193242059449285e-06, + "loss": 0.0151, + "num_input_tokens_seen": 28626464, + "step": 135640 + }, + { + "epoch": 14.922442244224422, + "grad_norm": 4.759767055511475, + "learning_rate": 9.191382684959199e-06, + "loss": 0.1109, + "num_input_tokens_seen": 28627488, + "step": 135645 + }, + { + "epoch": 14.922992299229923, + "grad_norm": 4.576531410217285, + "learning_rate": 9.18952345616955e-06, + "loss": 0.033, + "num_input_tokens_seen": 28628544, + "step": 135650 + }, + { + "epoch": 14.923542354235423, + "grad_norm": 0.11686166375875473, + "learning_rate": 9.187664373097476e-06, + "loss": 0.0076, + "num_input_tokens_seen": 28629568, + "step": 135655 + }, + { + "epoch": 14.924092409240924, + "grad_norm": 0.1675979644060135, + "learning_rate": 9.185805435760123e-06, + "loss": 0.0022, + "num_input_tokens_seen": 28630592, + "step": 135660 + }, + { + "epoch": 14.924642464246425, + "grad_norm": 1.628260612487793, + "learning_rate": 9.18394664417461e-06, + "loss": 0.1272, + "num_input_tokens_seen": 28631680, + "step": 135665 + }, + { + "epoch": 14.925192519251926, + "grad_norm": 0.008520485833287239, + "learning_rate": 9.182087998358062e-06, + "loss": 0.0028, + "num_input_tokens_seen": 28632768, + "step": 135670 + }, + { + "epoch": 14.925742574257425, + "grad_norm": 0.023924555629491806, + "learning_rate": 9.180229498327633e-06, + "loss": 0.0196, + "num_input_tokens_seen": 28633792, + "step": 135675 + }, + { + "epoch": 14.926292629262926, + "grad_norm": 0.1139354482293129, + "learning_rate": 9.178371144100426e-06, + "loss": 0.003, + "num_input_tokens_seen": 28634784, + "step": 135680 + }, + { + "epoch": 14.926842684268427, + "grad_norm": 0.011961527168750763, + "learning_rate": 9.176512935693579e-06, + "loss": 0.0973, + "num_input_tokens_seen": 28635872, + "step": 135685 + }, + { + "epoch": 14.927392739273927, + "grad_norm": 0.5779435038566589, + "learning_rate": 9.174654873124228e-06, + "loss": 0.0117, + "num_input_tokens_seen": 28636896, + "step": 135690 + }, + { + "epoch": 14.927942794279428, + "grad_norm": 0.06941019743680954, + "learning_rate": 9.172796956409482e-06, + "loss": 0.0486, + "num_input_tokens_seen": 28637920, + "step": 135695 + }, + { + "epoch": 14.928492849284929, + "grad_norm": 0.009165933355689049, + "learning_rate": 9.17093918556648e-06, + "loss": 0.0015, + "num_input_tokens_seen": 28639008, + "step": 135700 + }, + { + "epoch": 14.929042904290428, + "grad_norm": 0.7711551785469055, + "learning_rate": 9.169081560612327e-06, + "loss": 0.0675, + "num_input_tokens_seen": 28640064, + "step": 135705 + }, + { + "epoch": 14.92959295929593, + "grad_norm": 1.7336803674697876, + "learning_rate": 9.167224081564155e-06, + "loss": 0.029, + "num_input_tokens_seen": 28641120, + "step": 135710 + }, + { + "epoch": 14.93014301430143, + "grad_norm": 0.02030300721526146, + "learning_rate": 9.165366748439089e-06, + "loss": 0.0085, + "num_input_tokens_seen": 28642144, + "step": 135715 + }, + { + "epoch": 14.930693069306932, + "grad_norm": 0.022443585097789764, + "learning_rate": 9.16350956125423e-06, + "loss": 0.0457, + "num_input_tokens_seen": 28643136, + "step": 135720 + }, + { + "epoch": 14.93124312431243, + "grad_norm": 0.11157455295324326, + "learning_rate": 9.161652520026712e-06, + "loss": 0.0044, + "num_input_tokens_seen": 28644224, + "step": 135725 + }, + { + "epoch": 14.931793179317932, + "grad_norm": 0.05402630940079689, + "learning_rate": 9.159795624773643e-06, + "loss": 0.0017, + "num_input_tokens_seen": 28645312, + "step": 135730 + }, + { + "epoch": 14.932343234323433, + "grad_norm": 2.423842430114746, + "learning_rate": 9.157938875512131e-06, + "loss": 0.11, + "num_input_tokens_seen": 28646368, + "step": 135735 + }, + { + "epoch": 14.932893289328932, + "grad_norm": 0.11413241177797318, + "learning_rate": 9.156082272259292e-06, + "loss": 0.0162, + "num_input_tokens_seen": 28647424, + "step": 135740 + }, + { + "epoch": 14.933443344334433, + "grad_norm": 0.043706346303224564, + "learning_rate": 9.154225815032242e-06, + "loss": 0.0051, + "num_input_tokens_seen": 28648448, + "step": 135745 + }, + { + "epoch": 14.933993399339935, + "grad_norm": 0.06307608634233475, + "learning_rate": 9.152369503848097e-06, + "loss": 0.002, + "num_input_tokens_seen": 28649440, + "step": 135750 + }, + { + "epoch": 14.934543454345434, + "grad_norm": 0.06628836691379547, + "learning_rate": 9.150513338723956e-06, + "loss": 0.008, + "num_input_tokens_seen": 28650560, + "step": 135755 + }, + { + "epoch": 14.935093509350935, + "grad_norm": 0.010239491239190102, + "learning_rate": 9.14865731967692e-06, + "loss": 0.0056, + "num_input_tokens_seen": 28651616, + "step": 135760 + }, + { + "epoch": 14.935643564356436, + "grad_norm": 0.012044036760926247, + "learning_rate": 9.146801446724104e-06, + "loss": 0.0063, + "num_input_tokens_seen": 28652704, + "step": 135765 + }, + { + "epoch": 14.936193619361937, + "grad_norm": 0.015721317380666733, + "learning_rate": 9.144945719882613e-06, + "loss": 0.0054, + "num_input_tokens_seen": 28653728, + "step": 135770 + }, + { + "epoch": 14.936743674367436, + "grad_norm": 2.836672306060791, + "learning_rate": 9.143090139169558e-06, + "loss": 0.0289, + "num_input_tokens_seen": 28654720, + "step": 135775 + }, + { + "epoch": 14.937293729372938, + "grad_norm": 0.01769259385764599, + "learning_rate": 9.14123470460203e-06, + "loss": 0.0016, + "num_input_tokens_seen": 28655840, + "step": 135780 + }, + { + "epoch": 14.937843784378439, + "grad_norm": 0.014772390946745872, + "learning_rate": 9.139379416197125e-06, + "loss": 0.0238, + "num_input_tokens_seen": 28656928, + "step": 135785 + }, + { + "epoch": 14.938393839383938, + "grad_norm": 0.05053121969103813, + "learning_rate": 9.137524273971956e-06, + "loss": 0.0033, + "num_input_tokens_seen": 28657984, + "step": 135790 + }, + { + "epoch": 14.938943894389439, + "grad_norm": 0.00245430925861001, + "learning_rate": 9.135669277943609e-06, + "loss": 0.0015, + "num_input_tokens_seen": 28659040, + "step": 135795 + }, + { + "epoch": 14.93949394939494, + "grad_norm": 0.31935468316078186, + "learning_rate": 9.133814428129183e-06, + "loss": 0.0066, + "num_input_tokens_seen": 28660160, + "step": 135800 + }, + { + "epoch": 14.94004400440044, + "grad_norm": 0.23457489907741547, + "learning_rate": 9.131959724545786e-06, + "loss": 0.0104, + "num_input_tokens_seen": 28661216, + "step": 135805 + }, + { + "epoch": 14.94059405940594, + "grad_norm": 0.09639639407396317, + "learning_rate": 9.130105167210492e-06, + "loss": 0.0021, + "num_input_tokens_seen": 28662208, + "step": 135810 + }, + { + "epoch": 14.941144114411442, + "grad_norm": 0.012576880864799023, + "learning_rate": 9.128250756140414e-06, + "loss": 0.0585, + "num_input_tokens_seen": 28663328, + "step": 135815 + }, + { + "epoch": 14.941694169416941, + "grad_norm": 0.045101944357156754, + "learning_rate": 9.126396491352624e-06, + "loss": 0.0829, + "num_input_tokens_seen": 28664384, + "step": 135820 + }, + { + "epoch": 14.942244224422442, + "grad_norm": 0.06914827972650528, + "learning_rate": 9.12454237286422e-06, + "loss": 0.0408, + "num_input_tokens_seen": 28665472, + "step": 135825 + }, + { + "epoch": 14.942794279427943, + "grad_norm": 0.0103859668597579, + "learning_rate": 9.1226884006923e-06, + "loss": 0.0021, + "num_input_tokens_seen": 28666496, + "step": 135830 + }, + { + "epoch": 14.943344334433444, + "grad_norm": 1.8325121402740479, + "learning_rate": 9.120834574853935e-06, + "loss": 0.0261, + "num_input_tokens_seen": 28667616, + "step": 135835 + }, + { + "epoch": 14.943894389438944, + "grad_norm": 0.02498689480125904, + "learning_rate": 9.118980895366225e-06, + "loss": 0.001, + "num_input_tokens_seen": 28668672, + "step": 135840 + }, + { + "epoch": 14.944444444444445, + "grad_norm": 0.07831604033708572, + "learning_rate": 9.117127362246239e-06, + "loss": 0.0077, + "num_input_tokens_seen": 28669792, + "step": 135845 + }, + { + "epoch": 14.944994499449946, + "grad_norm": 0.09896375983953476, + "learning_rate": 9.115273975511076e-06, + "loss": 0.002, + "num_input_tokens_seen": 28670784, + "step": 135850 + }, + { + "epoch": 14.945544554455445, + "grad_norm": 0.00875595398247242, + "learning_rate": 9.113420735177804e-06, + "loss": 0.0013, + "num_input_tokens_seen": 28671872, + "step": 135855 + }, + { + "epoch": 14.946094609460946, + "grad_norm": 0.5489925742149353, + "learning_rate": 9.11156764126351e-06, + "loss": 0.0051, + "num_input_tokens_seen": 28672928, + "step": 135860 + }, + { + "epoch": 14.946644664466447, + "grad_norm": 0.9996310472488403, + "learning_rate": 9.109714693785282e-06, + "loss": 0.0096, + "num_input_tokens_seen": 28674016, + "step": 135865 + }, + { + "epoch": 14.947194719471947, + "grad_norm": 0.006275269202888012, + "learning_rate": 9.107861892760178e-06, + "loss": 0.0666, + "num_input_tokens_seen": 28675040, + "step": 135870 + }, + { + "epoch": 14.947744774477448, + "grad_norm": 0.16744396090507507, + "learning_rate": 9.106009238205296e-06, + "loss": 0.0064, + "num_input_tokens_seen": 28676096, + "step": 135875 + }, + { + "epoch": 14.948294829482949, + "grad_norm": 0.020295249298214912, + "learning_rate": 9.104156730137692e-06, + "loss": 0.0008, + "num_input_tokens_seen": 28677056, + "step": 135880 + }, + { + "epoch": 14.948844884488448, + "grad_norm": 0.4196853041648865, + "learning_rate": 9.102304368574446e-06, + "loss": 0.005, + "num_input_tokens_seen": 28678080, + "step": 135885 + }, + { + "epoch": 14.94939493949395, + "grad_norm": 0.0711301937699318, + "learning_rate": 9.10045215353264e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28679168, + "step": 135890 + }, + { + "epoch": 14.94994499449945, + "grad_norm": 3.354231119155884, + "learning_rate": 9.09860008502933e-06, + "loss": 0.0451, + "num_input_tokens_seen": 28680192, + "step": 135895 + }, + { + "epoch": 14.950495049504951, + "grad_norm": 0.005032324697822332, + "learning_rate": 9.096748163081603e-06, + "loss": 0.0017, + "num_input_tokens_seen": 28681184, + "step": 135900 + }, + { + "epoch": 14.95104510451045, + "grad_norm": 1.6717127561569214, + "learning_rate": 9.094896387706515e-06, + "loss": 0.036, + "num_input_tokens_seen": 28682272, + "step": 135905 + }, + { + "epoch": 14.951595159515952, + "grad_norm": 0.0414985716342926, + "learning_rate": 9.093044758921127e-06, + "loss": 0.0043, + "num_input_tokens_seen": 28683264, + "step": 135910 + }, + { + "epoch": 14.952145214521453, + "grad_norm": 1.667089819908142, + "learning_rate": 9.091193276742513e-06, + "loss": 0.0691, + "num_input_tokens_seen": 28684288, + "step": 135915 + }, + { + "epoch": 14.952695269526952, + "grad_norm": 0.018217751756310463, + "learning_rate": 9.089341941187742e-06, + "loss": 0.016, + "num_input_tokens_seen": 28685344, + "step": 135920 + }, + { + "epoch": 14.953245324532453, + "grad_norm": 0.05523696169257164, + "learning_rate": 9.087490752273867e-06, + "loss": 0.0059, + "num_input_tokens_seen": 28686368, + "step": 135925 + }, + { + "epoch": 14.953795379537954, + "grad_norm": 0.031326375901699066, + "learning_rate": 9.085639710017963e-06, + "loss": 0.1295, + "num_input_tokens_seen": 28687424, + "step": 135930 + }, + { + "epoch": 14.954345434543454, + "grad_norm": 0.04136247932910919, + "learning_rate": 9.08378881443707e-06, + "loss": 0.0012, + "num_input_tokens_seen": 28688448, + "step": 135935 + }, + { + "epoch": 14.954895489548955, + "grad_norm": 0.02367902174592018, + "learning_rate": 9.081938065548259e-06, + "loss": 0.067, + "num_input_tokens_seen": 28689568, + "step": 135940 + }, + { + "epoch": 14.955445544554456, + "grad_norm": 0.008331991732120514, + "learning_rate": 9.080087463368595e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28690624, + "step": 135945 + }, + { + "epoch": 14.955995599559955, + "grad_norm": 0.01952613890171051, + "learning_rate": 9.07823700791512e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28691616, + "step": 135950 + }, + { + "epoch": 14.956545654565456, + "grad_norm": 0.08702550083398819, + "learning_rate": 9.0763866992049e-06, + "loss": 0.002, + "num_input_tokens_seen": 28692704, + "step": 135955 + }, + { + "epoch": 14.957095709570957, + "grad_norm": 0.16268761456012726, + "learning_rate": 9.074536537254975e-06, + "loss": 0.0028, + "num_input_tokens_seen": 28693760, + "step": 135960 + }, + { + "epoch": 14.957645764576458, + "grad_norm": 0.007180969696491957, + "learning_rate": 9.072686522082413e-06, + "loss": 0.037, + "num_input_tokens_seen": 28694880, + "step": 135965 + }, + { + "epoch": 14.958195819581958, + "grad_norm": 0.2731024920940399, + "learning_rate": 9.070836653704248e-06, + "loss": 0.0533, + "num_input_tokens_seen": 28695936, + "step": 135970 + }, + { + "epoch": 14.958745874587459, + "grad_norm": 0.06039218232035637, + "learning_rate": 9.068986932137542e-06, + "loss": 0.0986, + "num_input_tokens_seen": 28696992, + "step": 135975 + }, + { + "epoch": 14.95929592959296, + "grad_norm": 3.064598560333252, + "learning_rate": 9.067137357399341e-06, + "loss": 0.1077, + "num_input_tokens_seen": 28698016, + "step": 135980 + }, + { + "epoch": 14.95984598459846, + "grad_norm": 0.05293326452374458, + "learning_rate": 9.065287929506686e-06, + "loss": 0.0039, + "num_input_tokens_seen": 28699040, + "step": 135985 + }, + { + "epoch": 14.96039603960396, + "grad_norm": 0.005968694109469652, + "learning_rate": 9.063438648476633e-06, + "loss": 0.0668, + "num_input_tokens_seen": 28700064, + "step": 135990 + }, + { + "epoch": 14.960946094609461, + "grad_norm": 0.7944592237472534, + "learning_rate": 9.06158951432621e-06, + "loss": 0.0348, + "num_input_tokens_seen": 28701152, + "step": 135995 + }, + { + "epoch": 14.96149614961496, + "grad_norm": 0.014865164645016193, + "learning_rate": 9.059740527072469e-06, + "loss": 0.0072, + "num_input_tokens_seen": 28702208, + "step": 136000 + }, + { + "epoch": 14.962046204620462, + "grad_norm": 0.13702833652496338, + "learning_rate": 9.057891686732459e-06, + "loss": 0.0335, + "num_input_tokens_seen": 28703328, + "step": 136005 + }, + { + "epoch": 14.962596259625963, + "grad_norm": 0.008800308220088482, + "learning_rate": 9.056042993323202e-06, + "loss": 0.0023, + "num_input_tokens_seen": 28704384, + "step": 136010 + }, + { + "epoch": 14.963146314631462, + "grad_norm": 0.01588364690542221, + "learning_rate": 9.054194446861754e-06, + "loss": 0.0498, + "num_input_tokens_seen": 28705440, + "step": 136015 + }, + { + "epoch": 14.963696369636963, + "grad_norm": 0.06281344592571259, + "learning_rate": 9.052346047365148e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28706528, + "step": 136020 + }, + { + "epoch": 14.964246424642464, + "grad_norm": 0.028179630637168884, + "learning_rate": 9.050497794850404e-06, + "loss": 0.0791, + "num_input_tokens_seen": 28707488, + "step": 136025 + }, + { + "epoch": 14.964796479647966, + "grad_norm": 0.028641436249017715, + "learning_rate": 9.048649689334571e-06, + "loss": 0.1651, + "num_input_tokens_seen": 28708512, + "step": 136030 + }, + { + "epoch": 14.965346534653465, + "grad_norm": 0.003308633342385292, + "learning_rate": 9.046801730834678e-06, + "loss": 0.0075, + "num_input_tokens_seen": 28709568, + "step": 136035 + }, + { + "epoch": 14.965896589658966, + "grad_norm": 0.0022984750103205442, + "learning_rate": 9.044953919367769e-06, + "loss": 0.0051, + "num_input_tokens_seen": 28710624, + "step": 136040 + }, + { + "epoch": 14.966446644664467, + "grad_norm": 0.006338176317512989, + "learning_rate": 9.043106254950862e-06, + "loss": 0.0007, + "num_input_tokens_seen": 28711712, + "step": 136045 + }, + { + "epoch": 14.966996699669966, + "grad_norm": 0.1582474559545517, + "learning_rate": 9.041258737600977e-06, + "loss": 0.0688, + "num_input_tokens_seen": 28712736, + "step": 136050 + }, + { + "epoch": 14.967546754675467, + "grad_norm": 0.5019440054893494, + "learning_rate": 9.039411367335157e-06, + "loss": 0.0282, + "num_input_tokens_seen": 28713824, + "step": 136055 + }, + { + "epoch": 14.968096809680969, + "grad_norm": 0.015592455863952637, + "learning_rate": 9.037564144170421e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28714912, + "step": 136060 + }, + { + "epoch": 14.968646864686468, + "grad_norm": 0.004367165267467499, + "learning_rate": 9.035717068123806e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28715936, + "step": 136065 + }, + { + "epoch": 14.969196919691969, + "grad_norm": 0.011778109706938267, + "learning_rate": 9.033870139212324e-06, + "loss": 0.0014, + "num_input_tokens_seen": 28716992, + "step": 136070 + }, + { + "epoch": 14.96974697469747, + "grad_norm": 1.43180513381958, + "learning_rate": 9.032023357452995e-06, + "loss": 0.0666, + "num_input_tokens_seen": 28718080, + "step": 136075 + }, + { + "epoch": 14.97029702970297, + "grad_norm": 0.13391928374767303, + "learning_rate": 9.030176722862852e-06, + "loss": 0.0522, + "num_input_tokens_seen": 28719072, + "step": 136080 + }, + { + "epoch": 14.97084708470847, + "grad_norm": 0.034673698246479034, + "learning_rate": 9.028330235458898e-06, + "loss": 0.0108, + "num_input_tokens_seen": 28720192, + "step": 136085 + }, + { + "epoch": 14.971397139713972, + "grad_norm": 0.08004053682088852, + "learning_rate": 9.026483895258159e-06, + "loss": 0.0334, + "num_input_tokens_seen": 28721248, + "step": 136090 + }, + { + "epoch": 14.971947194719473, + "grad_norm": 0.03448718413710594, + "learning_rate": 9.024637702277664e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28722272, + "step": 136095 + }, + { + "epoch": 14.972497249724972, + "grad_norm": 0.007438465021550655, + "learning_rate": 9.022791656534407e-06, + "loss": 0.002, + "num_input_tokens_seen": 28723328, + "step": 136100 + }, + { + "epoch": 14.973047304730473, + "grad_norm": 0.02079644985496998, + "learning_rate": 9.020945758045424e-06, + "loss": 0.0048, + "num_input_tokens_seen": 28724416, + "step": 136105 + }, + { + "epoch": 14.973597359735974, + "grad_norm": 0.005639323964715004, + "learning_rate": 9.019100006827714e-06, + "loss": 0.0521, + "num_input_tokens_seen": 28725472, + "step": 136110 + }, + { + "epoch": 14.974147414741473, + "grad_norm": 0.053843602538108826, + "learning_rate": 9.017254402898274e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28726560, + "step": 136115 + }, + { + "epoch": 14.974697469746975, + "grad_norm": 0.3021577000617981, + "learning_rate": 9.015408946274148e-06, + "loss": 0.0068, + "num_input_tokens_seen": 28727584, + "step": 136120 + }, + { + "epoch": 14.975247524752476, + "grad_norm": 0.027920525521039963, + "learning_rate": 9.013563636972319e-06, + "loss": 0.0134, + "num_input_tokens_seen": 28728576, + "step": 136125 + }, + { + "epoch": 14.975797579757975, + "grad_norm": 0.05268818885087967, + "learning_rate": 9.011718475009812e-06, + "loss": 0.1542, + "num_input_tokens_seen": 28729664, + "step": 136130 + }, + { + "epoch": 14.976347634763476, + "grad_norm": 0.4264681339263916, + "learning_rate": 9.009873460403626e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28730720, + "step": 136135 + }, + { + "epoch": 14.976897689768977, + "grad_norm": 0.013935445807874203, + "learning_rate": 9.00802859317075e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28731712, + "step": 136140 + }, + { + "epoch": 14.977447744774478, + "grad_norm": 0.09157479554414749, + "learning_rate": 9.006183873328206e-06, + "loss": 0.1272, + "num_input_tokens_seen": 28732768, + "step": 136145 + }, + { + "epoch": 14.977997799779978, + "grad_norm": 0.04546623304486275, + "learning_rate": 9.004339300892986e-06, + "loss": 0.0925, + "num_input_tokens_seen": 28733792, + "step": 136150 + }, + { + "epoch": 14.978547854785479, + "grad_norm": 0.12272610515356064, + "learning_rate": 9.002494875882106e-06, + "loss": 0.0101, + "num_input_tokens_seen": 28734816, + "step": 136155 + }, + { + "epoch": 14.97909790979098, + "grad_norm": 0.4333813786506653, + "learning_rate": 9.000650598312554e-06, + "loss": 0.0482, + "num_input_tokens_seen": 28735808, + "step": 136160 + }, + { + "epoch": 14.979647964796479, + "grad_norm": 0.03966999053955078, + "learning_rate": 8.998806468201321e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28736896, + "step": 136165 + }, + { + "epoch": 14.98019801980198, + "grad_norm": 0.013379506766796112, + "learning_rate": 8.99696248556542e-06, + "loss": 0.0821, + "num_input_tokens_seen": 28737920, + "step": 136170 + }, + { + "epoch": 14.980748074807481, + "grad_norm": 0.039919015020132065, + "learning_rate": 8.995118650421817e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28738976, + "step": 136175 + }, + { + "epoch": 14.98129812981298, + "grad_norm": 1.6759006977081299, + "learning_rate": 8.993274962787543e-06, + "loss": 0.0326, + "num_input_tokens_seen": 28740032, + "step": 136180 + }, + { + "epoch": 14.981848184818482, + "grad_norm": 0.10550026595592499, + "learning_rate": 8.991431422679573e-06, + "loss": 0.0611, + "num_input_tokens_seen": 28741088, + "step": 136185 + }, + { + "epoch": 14.982398239823983, + "grad_norm": 0.03904648870229721, + "learning_rate": 8.98958803011489e-06, + "loss": 0.0017, + "num_input_tokens_seen": 28742176, + "step": 136190 + }, + { + "epoch": 14.982948294829484, + "grad_norm": 0.11311739683151245, + "learning_rate": 8.9877447851105e-06, + "loss": 0.0039, + "num_input_tokens_seen": 28743168, + "step": 136195 + }, + { + "epoch": 14.983498349834983, + "grad_norm": 1.8562943935394287, + "learning_rate": 8.985901687683374e-06, + "loss": 0.0375, + "num_input_tokens_seen": 28744224, + "step": 136200 + }, + { + "epoch": 14.984048404840484, + "grad_norm": 0.052085720002651215, + "learning_rate": 8.984058737850506e-06, + "loss": 0.0173, + "num_input_tokens_seen": 28745280, + "step": 136205 + }, + { + "epoch": 14.984598459845985, + "grad_norm": 0.22454342246055603, + "learning_rate": 8.982215935628897e-06, + "loss": 0.1429, + "num_input_tokens_seen": 28746304, + "step": 136210 + }, + { + "epoch": 14.985148514851485, + "grad_norm": 0.07254097610712051, + "learning_rate": 8.980373281035504e-06, + "loss": 0.005, + "num_input_tokens_seen": 28747392, + "step": 136215 + }, + { + "epoch": 14.985698569856986, + "grad_norm": 0.023290859535336494, + "learning_rate": 8.978530774087335e-06, + "loss": 0.0507, + "num_input_tokens_seen": 28748448, + "step": 136220 + }, + { + "epoch": 14.986248624862487, + "grad_norm": 1.5537725687026978, + "learning_rate": 8.976688414801349e-06, + "loss": 0.0399, + "num_input_tokens_seen": 28749568, + "step": 136225 + }, + { + "epoch": 14.986798679867986, + "grad_norm": 0.021395517513155937, + "learning_rate": 8.974846203194547e-06, + "loss": 0.1159, + "num_input_tokens_seen": 28750624, + "step": 136230 + }, + { + "epoch": 14.987348734873487, + "grad_norm": 2.803995370864868, + "learning_rate": 8.973004139283892e-06, + "loss": 0.0873, + "num_input_tokens_seen": 28751648, + "step": 136235 + }, + { + "epoch": 14.987898789878988, + "grad_norm": 0.07796210795640945, + "learning_rate": 8.971162223086363e-06, + "loss": 0.0977, + "num_input_tokens_seen": 28752736, + "step": 136240 + }, + { + "epoch": 14.988448844884488, + "grad_norm": 0.031331755220890045, + "learning_rate": 8.969320454618951e-06, + "loss": 0.0109, + "num_input_tokens_seen": 28753792, + "step": 136245 + }, + { + "epoch": 14.988998899889989, + "grad_norm": 0.059915415942668915, + "learning_rate": 8.967478833898612e-06, + "loss": 0.0118, + "num_input_tokens_seen": 28754880, + "step": 136250 + }, + { + "epoch": 14.98954895489549, + "grad_norm": 0.0747634768486023, + "learning_rate": 8.965637360942335e-06, + "loss": 0.0015, + "num_input_tokens_seen": 28755936, + "step": 136255 + }, + { + "epoch": 14.990099009900991, + "grad_norm": 0.05992542952299118, + "learning_rate": 8.963796035767077e-06, + "loss": 0.0028, + "num_input_tokens_seen": 28757024, + "step": 136260 + }, + { + "epoch": 14.99064906490649, + "grad_norm": 0.3521992266178131, + "learning_rate": 8.961954858389815e-06, + "loss": 0.0051, + "num_input_tokens_seen": 28758080, + "step": 136265 + }, + { + "epoch": 14.991199119911991, + "grad_norm": 0.038506221026182175, + "learning_rate": 8.960113828827529e-06, + "loss": 0.0084, + "num_input_tokens_seen": 28759136, + "step": 136270 + }, + { + "epoch": 14.991749174917492, + "grad_norm": 0.015318602323532104, + "learning_rate": 8.958272947097168e-06, + "loss": 0.0072, + "num_input_tokens_seen": 28760224, + "step": 136275 + }, + { + "epoch": 14.992299229922992, + "grad_norm": 0.020063208416104317, + "learning_rate": 8.956432213215715e-06, + "loss": 0.0838, + "num_input_tokens_seen": 28761280, + "step": 136280 + }, + { + "epoch": 14.992849284928493, + "grad_norm": 1.7001487016677856, + "learning_rate": 8.95459162720013e-06, + "loss": 0.0949, + "num_input_tokens_seen": 28762368, + "step": 136285 + }, + { + "epoch": 14.993399339933994, + "grad_norm": 0.07712623476982117, + "learning_rate": 8.952751189067355e-06, + "loss": 0.0114, + "num_input_tokens_seen": 28763456, + "step": 136290 + }, + { + "epoch": 14.993949394939493, + "grad_norm": 1.0201518535614014, + "learning_rate": 8.950910898834392e-06, + "loss": 0.0798, + "num_input_tokens_seen": 28764544, + "step": 136295 + }, + { + "epoch": 14.994499449944994, + "grad_norm": 0.08757666498422623, + "learning_rate": 8.94907075651818e-06, + "loss": 0.0021, + "num_input_tokens_seen": 28765600, + "step": 136300 + }, + { + "epoch": 14.995049504950495, + "grad_norm": 0.007269629277288914, + "learning_rate": 8.947230762135672e-06, + "loss": 0.0009, + "num_input_tokens_seen": 28766592, + "step": 136305 + }, + { + "epoch": 14.995599559955995, + "grad_norm": 1.55271577835083, + "learning_rate": 8.945390915703845e-06, + "loss": 0.0229, + "num_input_tokens_seen": 28767584, + "step": 136310 + }, + { + "epoch": 14.996149614961496, + "grad_norm": 2.0651795864105225, + "learning_rate": 8.943551217239637e-06, + "loss": 0.0568, + "num_input_tokens_seen": 28768608, + "step": 136315 + }, + { + "epoch": 14.996699669966997, + "grad_norm": 0.0991252139210701, + "learning_rate": 8.941711666760014e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28769664, + "step": 136320 + }, + { + "epoch": 14.997249724972498, + "grad_norm": 0.05270492285490036, + "learning_rate": 8.939872264281937e-06, + "loss": 0.0025, + "num_input_tokens_seen": 28770752, + "step": 136325 + }, + { + "epoch": 14.997799779977997, + "grad_norm": 0.07687690854072571, + "learning_rate": 8.938033009822342e-06, + "loss": 0.0043, + "num_input_tokens_seen": 28771776, + "step": 136330 + }, + { + "epoch": 14.998349834983498, + "grad_norm": 0.021479323506355286, + "learning_rate": 8.9361939033982e-06, + "loss": 0.0103, + "num_input_tokens_seen": 28772768, + "step": 136335 + }, + { + "epoch": 14.998899889989, + "grad_norm": 0.19651219248771667, + "learning_rate": 8.934354945026437e-06, + "loss": 0.0241, + "num_input_tokens_seen": 28773888, + "step": 136340 + }, + { + "epoch": 14.999449944994499, + "grad_norm": 0.0290157962590456, + "learning_rate": 8.932516134724028e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28774880, + "step": 136345 + }, + { + "epoch": 15.0, + "grad_norm": 0.029646126553416252, + "learning_rate": 8.930677472507898e-06, + "loss": 0.0197, + "num_input_tokens_seen": 28775840, + "step": 136350 + }, + { + "epoch": 15.0, + "eval_loss": 0.07325445860624313, + "eval_runtime": 37.0111, + "eval_samples_per_second": 109.156, + "eval_steps_per_second": 27.289, + "num_input_tokens_seen": 28775840, + "step": 136350 + }, + { + "epoch": 15.000550055005501, + "grad_norm": 1.9577964544296265, + "learning_rate": 8.928838958395005e-06, + "loss": 0.0133, + "num_input_tokens_seen": 28776896, + "step": 136355 + }, + { + "epoch": 15.001100110011, + "grad_norm": 0.0412089042365551, + "learning_rate": 8.927000592402298e-06, + "loss": 0.0076, + "num_input_tokens_seen": 28777952, + "step": 136360 + }, + { + "epoch": 15.001650165016502, + "grad_norm": 0.00899057649075985, + "learning_rate": 8.925162374546705e-06, + "loss": 0.044, + "num_input_tokens_seen": 28778912, + "step": 136365 + }, + { + "epoch": 15.002200220022003, + "grad_norm": 0.11804023385047913, + "learning_rate": 8.923324304845188e-06, + "loss": 0.0018, + "num_input_tokens_seen": 28780000, + "step": 136370 + }, + { + "epoch": 15.002750275027502, + "grad_norm": 0.005234569311141968, + "learning_rate": 8.921486383314668e-06, + "loss": 0.0495, + "num_input_tokens_seen": 28781088, + "step": 136375 + }, + { + "epoch": 15.003300330033003, + "grad_norm": 0.00793320219963789, + "learning_rate": 8.91964860997209e-06, + "loss": 0.04, + "num_input_tokens_seen": 28782144, + "step": 136380 + }, + { + "epoch": 15.003850385038504, + "grad_norm": 0.052150629460811615, + "learning_rate": 8.917810984834405e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28783200, + "step": 136385 + }, + { + "epoch": 15.004400440044005, + "grad_norm": 0.05122637748718262, + "learning_rate": 8.91597350791853e-06, + "loss": 0.0478, + "num_input_tokens_seen": 28784288, + "step": 136390 + }, + { + "epoch": 15.004950495049505, + "grad_norm": 0.4077633321285248, + "learning_rate": 8.914136179241416e-06, + "loss": 0.037, + "num_input_tokens_seen": 28785344, + "step": 136395 + }, + { + "epoch": 15.005500550055006, + "grad_norm": 0.029744533821940422, + "learning_rate": 8.912298998819993e-06, + "loss": 0.0044, + "num_input_tokens_seen": 28786400, + "step": 136400 + }, + { + "epoch": 15.006050605060507, + "grad_norm": 0.04419837146997452, + "learning_rate": 8.91046196667118e-06, + "loss": 0.007, + "num_input_tokens_seen": 28787488, + "step": 136405 + }, + { + "epoch": 15.006600660066006, + "grad_norm": 0.04517812281847, + "learning_rate": 8.908625082811919e-06, + "loss": 0.0088, + "num_input_tokens_seen": 28788480, + "step": 136410 + }, + { + "epoch": 15.007150715071507, + "grad_norm": 0.6863852739334106, + "learning_rate": 8.90678834725914e-06, + "loss": 0.007, + "num_input_tokens_seen": 28789600, + "step": 136415 + }, + { + "epoch": 15.007700770077008, + "grad_norm": 0.017312007024884224, + "learning_rate": 8.904951760029781e-06, + "loss": 0.0037, + "num_input_tokens_seen": 28790592, + "step": 136420 + }, + { + "epoch": 15.008250825082508, + "grad_norm": 0.7700185775756836, + "learning_rate": 8.903115321140757e-06, + "loss": 0.0053, + "num_input_tokens_seen": 28791584, + "step": 136425 + }, + { + "epoch": 15.008800880088009, + "grad_norm": 0.15613244473934174, + "learning_rate": 8.901279030608984e-06, + "loss": 0.0051, + "num_input_tokens_seen": 28792704, + "step": 136430 + }, + { + "epoch": 15.00935093509351, + "grad_norm": 0.143956258893013, + "learning_rate": 8.899442888451401e-06, + "loss": 0.1257, + "num_input_tokens_seen": 28793728, + "step": 136435 + }, + { + "epoch": 15.009900990099009, + "grad_norm": 0.02389637939631939, + "learning_rate": 8.897606894684923e-06, + "loss": 0.0987, + "num_input_tokens_seen": 28794720, + "step": 136440 + }, + { + "epoch": 15.01045104510451, + "grad_norm": 0.542974054813385, + "learning_rate": 8.895771049326487e-06, + "loss": 0.0056, + "num_input_tokens_seen": 28795776, + "step": 136445 + }, + { + "epoch": 15.011001100110011, + "grad_norm": 0.009445926174521446, + "learning_rate": 8.893935352393001e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28796864, + "step": 136450 + }, + { + "epoch": 15.011551155115512, + "grad_norm": 0.025664832442998886, + "learning_rate": 8.892099803901377e-06, + "loss": 0.0396, + "num_input_tokens_seen": 28797888, + "step": 136455 + }, + { + "epoch": 15.012101210121012, + "grad_norm": 0.01213796902447939, + "learning_rate": 8.890264403868545e-06, + "loss": 0.0183, + "num_input_tokens_seen": 28799008, + "step": 136460 + }, + { + "epoch": 15.012651265126513, + "grad_norm": 0.4379136860370636, + "learning_rate": 8.888429152311412e-06, + "loss": 0.0058, + "num_input_tokens_seen": 28800096, + "step": 136465 + }, + { + "epoch": 15.013201320132014, + "grad_norm": 0.01755598932504654, + "learning_rate": 8.886594049246893e-06, + "loss": 0.0037, + "num_input_tokens_seen": 28801152, + "step": 136470 + }, + { + "epoch": 15.013751375137513, + "grad_norm": 0.008659831248223782, + "learning_rate": 8.884759094691913e-06, + "loss": 0.0051, + "num_input_tokens_seen": 28802176, + "step": 136475 + }, + { + "epoch": 15.014301430143014, + "grad_norm": 0.0411585308611393, + "learning_rate": 8.882924288663368e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28803232, + "step": 136480 + }, + { + "epoch": 15.014851485148515, + "grad_norm": 0.15385399758815765, + "learning_rate": 8.881089631178185e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28804288, + "step": 136485 + }, + { + "epoch": 15.015401540154015, + "grad_norm": 0.019228888675570488, + "learning_rate": 8.879255122253255e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28805344, + "step": 136490 + }, + { + "epoch": 15.015951595159516, + "grad_norm": 0.055333152413368225, + "learning_rate": 8.877420761905498e-06, + "loss": 0.1273, + "num_input_tokens_seen": 28806336, + "step": 136495 + }, + { + "epoch": 15.016501650165017, + "grad_norm": 0.17643436789512634, + "learning_rate": 8.875586550151821e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28807424, + "step": 136500 + }, + { + "epoch": 15.017051705170518, + "grad_norm": 1.2547060251235962, + "learning_rate": 8.873752487009121e-06, + "loss": 0.0185, + "num_input_tokens_seen": 28808416, + "step": 136505 + }, + { + "epoch": 15.017601760176017, + "grad_norm": 0.005505451932549477, + "learning_rate": 8.871918572494314e-06, + "loss": 0.0017, + "num_input_tokens_seen": 28809504, + "step": 136510 + }, + { + "epoch": 15.018151815181518, + "grad_norm": 0.7136451601982117, + "learning_rate": 8.870084806624293e-06, + "loss": 0.1168, + "num_input_tokens_seen": 28810528, + "step": 136515 + }, + { + "epoch": 15.01870187018702, + "grad_norm": 0.04336782172322273, + "learning_rate": 8.868251189415953e-06, + "loss": 0.0014, + "num_input_tokens_seen": 28811520, + "step": 136520 + }, + { + "epoch": 15.019251925192519, + "grad_norm": 0.027069246396422386, + "learning_rate": 8.8664177208862e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28812672, + "step": 136525 + }, + { + "epoch": 15.01980198019802, + "grad_norm": 0.01717054285109043, + "learning_rate": 8.864584401051934e-06, + "loss": 0.0151, + "num_input_tokens_seen": 28813664, + "step": 136530 + }, + { + "epoch": 15.020352035203521, + "grad_norm": 0.01184281799942255, + "learning_rate": 8.86275122993006e-06, + "loss": 0.0621, + "num_input_tokens_seen": 28814656, + "step": 136535 + }, + { + "epoch": 15.02090209020902, + "grad_norm": 0.016154872253537178, + "learning_rate": 8.860918207537464e-06, + "loss": 0.0019, + "num_input_tokens_seen": 28815712, + "step": 136540 + }, + { + "epoch": 15.021452145214521, + "grad_norm": 0.12859229743480682, + "learning_rate": 8.859085333891032e-06, + "loss": 0.0054, + "num_input_tokens_seen": 28816832, + "step": 136545 + }, + { + "epoch": 15.022002200220022, + "grad_norm": 0.5744978785514832, + "learning_rate": 8.857252609007666e-06, + "loss": 0.0096, + "num_input_tokens_seen": 28817856, + "step": 136550 + }, + { + "epoch": 15.022552255225522, + "grad_norm": 0.10047025233507156, + "learning_rate": 8.855420032904255e-06, + "loss": 0.0048, + "num_input_tokens_seen": 28818880, + "step": 136555 + }, + { + "epoch": 15.023102310231023, + "grad_norm": 0.01677185855805874, + "learning_rate": 8.853587605597702e-06, + "loss": 0.0102, + "num_input_tokens_seen": 28819968, + "step": 136560 + }, + { + "epoch": 15.023652365236524, + "grad_norm": 0.014124894514679909, + "learning_rate": 8.851755327104882e-06, + "loss": 0.0043, + "num_input_tokens_seen": 28820928, + "step": 136565 + }, + { + "epoch": 15.024202420242025, + "grad_norm": 0.056792907416820526, + "learning_rate": 8.849923197442675e-06, + "loss": 0.0505, + "num_input_tokens_seen": 28821920, + "step": 136570 + }, + { + "epoch": 15.024752475247524, + "grad_norm": 0.050678882747888565, + "learning_rate": 8.848091216627988e-06, + "loss": 0.0358, + "num_input_tokens_seen": 28822944, + "step": 136575 + }, + { + "epoch": 15.025302530253025, + "grad_norm": 0.39970681071281433, + "learning_rate": 8.846259384677682e-06, + "loss": 0.0051, + "num_input_tokens_seen": 28823968, + "step": 136580 + }, + { + "epoch": 15.025852585258527, + "grad_norm": 0.10438935458660126, + "learning_rate": 8.844427701608654e-06, + "loss": 0.0022, + "num_input_tokens_seen": 28825024, + "step": 136585 + }, + { + "epoch": 15.026402640264026, + "grad_norm": 0.008233638480305672, + "learning_rate": 8.842596167437792e-06, + "loss": 0.0088, + "num_input_tokens_seen": 28826080, + "step": 136590 + }, + { + "epoch": 15.026952695269527, + "grad_norm": 0.0029905152041465044, + "learning_rate": 8.84076478218196e-06, + "loss": 0.0125, + "num_input_tokens_seen": 28827136, + "step": 136595 + }, + { + "epoch": 15.027502750275028, + "grad_norm": 0.03402545675635338, + "learning_rate": 8.838933545858053e-06, + "loss": 0.059, + "num_input_tokens_seen": 28828224, + "step": 136600 + }, + { + "epoch": 15.028052805280527, + "grad_norm": 0.12711623311042786, + "learning_rate": 8.837102458482931e-06, + "loss": 0.0052, + "num_input_tokens_seen": 28829248, + "step": 136605 + }, + { + "epoch": 15.028602860286028, + "grad_norm": 0.06066502630710602, + "learning_rate": 8.83527152007348e-06, + "loss": 0.017, + "num_input_tokens_seen": 28830240, + "step": 136610 + }, + { + "epoch": 15.02915291529153, + "grad_norm": 0.006182474084198475, + "learning_rate": 8.833440730646585e-06, + "loss": 0.0062, + "num_input_tokens_seen": 28831328, + "step": 136615 + }, + { + "epoch": 15.029702970297029, + "grad_norm": 1.8243643045425415, + "learning_rate": 8.8316100902191e-06, + "loss": 0.0453, + "num_input_tokens_seen": 28832384, + "step": 136620 + }, + { + "epoch": 15.03025302530253, + "grad_norm": 0.025873536244034767, + "learning_rate": 8.829779598807913e-06, + "loss": 0.0128, + "num_input_tokens_seen": 28833408, + "step": 136625 + }, + { + "epoch": 15.030803080308031, + "grad_norm": 2.1338398456573486, + "learning_rate": 8.827949256429882e-06, + "loss": 0.134, + "num_input_tokens_seen": 28834464, + "step": 136630 + }, + { + "epoch": 15.031353135313532, + "grad_norm": 0.06324143707752228, + "learning_rate": 8.826119063101893e-06, + "loss": 0.0012, + "num_input_tokens_seen": 28835488, + "step": 136635 + }, + { + "epoch": 15.031903190319031, + "grad_norm": 3.607703924179077, + "learning_rate": 8.824289018840792e-06, + "loss": 0.0749, + "num_input_tokens_seen": 28836576, + "step": 136640 + }, + { + "epoch": 15.032453245324533, + "grad_norm": 0.046375613659620285, + "learning_rate": 8.82245912366346e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28837664, + "step": 136645 + }, + { + "epoch": 15.033003300330034, + "grad_norm": 0.00917845405638218, + "learning_rate": 8.820629377586765e-06, + "loss": 0.0044, + "num_input_tokens_seen": 28838720, + "step": 136650 + }, + { + "epoch": 15.033553355335533, + "grad_norm": 0.06328332424163818, + "learning_rate": 8.818799780627556e-06, + "loss": 0.1636, + "num_input_tokens_seen": 28839776, + "step": 136655 + }, + { + "epoch": 15.034103410341034, + "grad_norm": 0.0881892666220665, + "learning_rate": 8.816970332802718e-06, + "loss": 0.0033, + "num_input_tokens_seen": 28840800, + "step": 136660 + }, + { + "epoch": 15.034653465346535, + "grad_norm": 0.022441305220127106, + "learning_rate": 8.815141034129087e-06, + "loss": 0.0163, + "num_input_tokens_seen": 28841888, + "step": 136665 + }, + { + "epoch": 15.035203520352034, + "grad_norm": 3.3631131649017334, + "learning_rate": 8.813311884623535e-06, + "loss": 0.0669, + "num_input_tokens_seen": 28842944, + "step": 136670 + }, + { + "epoch": 15.035753575357536, + "grad_norm": 0.6292247772216797, + "learning_rate": 8.81148288430293e-06, + "loss": 0.0212, + "num_input_tokens_seen": 28844000, + "step": 136675 + }, + { + "epoch": 15.036303630363037, + "grad_norm": 1.1332347393035889, + "learning_rate": 8.80965403318412e-06, + "loss": 0.0424, + "num_input_tokens_seen": 28845024, + "step": 136680 + }, + { + "epoch": 15.036853685368538, + "grad_norm": 0.03850870579481125, + "learning_rate": 8.807825331283951e-06, + "loss": 0.0012, + "num_input_tokens_seen": 28846112, + "step": 136685 + }, + { + "epoch": 15.037403740374037, + "grad_norm": 0.03622988983988762, + "learning_rate": 8.805996778619293e-06, + "loss": 0.0032, + "num_input_tokens_seen": 28847104, + "step": 136690 + }, + { + "epoch": 15.037953795379538, + "grad_norm": 0.5202054381370544, + "learning_rate": 8.804168375206983e-06, + "loss": 0.0104, + "num_input_tokens_seen": 28848160, + "step": 136695 + }, + { + "epoch": 15.03850385038504, + "grad_norm": 3.8318231105804443, + "learning_rate": 8.802340121063885e-06, + "loss": 0.0259, + "num_input_tokens_seen": 28849152, + "step": 136700 + }, + { + "epoch": 15.039053905390539, + "grad_norm": 3.6765801906585693, + "learning_rate": 8.800512016206852e-06, + "loss": 0.0667, + "num_input_tokens_seen": 28850272, + "step": 136705 + }, + { + "epoch": 15.03960396039604, + "grad_norm": 0.23795291781425476, + "learning_rate": 8.798684060652718e-06, + "loss": 0.029, + "num_input_tokens_seen": 28851296, + "step": 136710 + }, + { + "epoch": 15.04015401540154, + "grad_norm": 2.1737749576568604, + "learning_rate": 8.796856254418348e-06, + "loss": 0.0622, + "num_input_tokens_seen": 28852320, + "step": 136715 + }, + { + "epoch": 15.04070407040704, + "grad_norm": 0.34227436780929565, + "learning_rate": 8.79502859752057e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28853376, + "step": 136720 + }, + { + "epoch": 15.041254125412541, + "grad_norm": 0.129657581448555, + "learning_rate": 8.793201089976236e-06, + "loss": 0.0565, + "num_input_tokens_seen": 28854400, + "step": 136725 + }, + { + "epoch": 15.041804180418042, + "grad_norm": 0.023152804002165794, + "learning_rate": 8.791373731802202e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28855424, + "step": 136730 + }, + { + "epoch": 15.042354235423542, + "grad_norm": 0.09974955767393112, + "learning_rate": 8.789546523015288e-06, + "loss": 0.0017, + "num_input_tokens_seen": 28856480, + "step": 136735 + }, + { + "epoch": 15.042904290429043, + "grad_norm": 1.582686424255371, + "learning_rate": 8.787719463632355e-06, + "loss": 0.1008, + "num_input_tokens_seen": 28857568, + "step": 136740 + }, + { + "epoch": 15.043454345434544, + "grad_norm": 0.7717307806015015, + "learning_rate": 8.785892553670225e-06, + "loss": 0.0075, + "num_input_tokens_seen": 28858688, + "step": 136745 + }, + { + "epoch": 15.044004400440045, + "grad_norm": 0.03400025889277458, + "learning_rate": 8.784065793145748e-06, + "loss": 0.0013, + "num_input_tokens_seen": 28859712, + "step": 136750 + }, + { + "epoch": 15.044554455445544, + "grad_norm": 0.030470632016658783, + "learning_rate": 8.78223918207575e-06, + "loss": 0.0259, + "num_input_tokens_seen": 28860800, + "step": 136755 + }, + { + "epoch": 15.045104510451045, + "grad_norm": 0.07741586863994598, + "learning_rate": 8.780412720477068e-06, + "loss": 0.0436, + "num_input_tokens_seen": 28861824, + "step": 136760 + }, + { + "epoch": 15.045654565456546, + "grad_norm": 0.017759185284376144, + "learning_rate": 8.778586408366552e-06, + "loss": 0.0084, + "num_input_tokens_seen": 28862880, + "step": 136765 + }, + { + "epoch": 15.046204620462046, + "grad_norm": 0.008680485188961029, + "learning_rate": 8.77676024576101e-06, + "loss": 0.0053, + "num_input_tokens_seen": 28863936, + "step": 136770 + }, + { + "epoch": 15.046754675467547, + "grad_norm": 0.02065538614988327, + "learning_rate": 8.774934232677292e-06, + "loss": 0.043, + "num_input_tokens_seen": 28865056, + "step": 136775 + }, + { + "epoch": 15.047304730473048, + "grad_norm": 1.405772089958191, + "learning_rate": 8.773108369132213e-06, + "loss": 0.0437, + "num_input_tokens_seen": 28866048, + "step": 136780 + }, + { + "epoch": 15.047854785478547, + "grad_norm": 0.049052149057388306, + "learning_rate": 8.771282655142606e-06, + "loss": 0.0069, + "num_input_tokens_seen": 28867072, + "step": 136785 + }, + { + "epoch": 15.048404840484048, + "grad_norm": 0.9339203834533691, + "learning_rate": 8.769457090725311e-06, + "loss": 0.1026, + "num_input_tokens_seen": 28868128, + "step": 136790 + }, + { + "epoch": 15.04895489548955, + "grad_norm": 0.7302775382995605, + "learning_rate": 8.767631675897132e-06, + "loss": 0.0712, + "num_input_tokens_seen": 28869152, + "step": 136795 + }, + { + "epoch": 15.049504950495049, + "grad_norm": 0.025412892922759056, + "learning_rate": 8.76580641067491e-06, + "loss": 0.0028, + "num_input_tokens_seen": 28870240, + "step": 136800 + }, + { + "epoch": 15.05005500550055, + "grad_norm": 0.4469072222709656, + "learning_rate": 8.76398129507546e-06, + "loss": 0.0336, + "num_input_tokens_seen": 28871264, + "step": 136805 + }, + { + "epoch": 15.05060506050605, + "grad_norm": 0.008627469651401043, + "learning_rate": 8.762156329115597e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28872288, + "step": 136810 + }, + { + "epoch": 15.051155115511552, + "grad_norm": 0.5295925140380859, + "learning_rate": 8.760331512812148e-06, + "loss": 0.0781, + "num_input_tokens_seen": 28873280, + "step": 136815 + }, + { + "epoch": 15.051705170517051, + "grad_norm": 0.4153941571712494, + "learning_rate": 8.75850684618193e-06, + "loss": 0.0088, + "num_input_tokens_seen": 28874304, + "step": 136820 + }, + { + "epoch": 15.052255225522552, + "grad_norm": 0.03339581936597824, + "learning_rate": 8.756682329241767e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28875328, + "step": 136825 + }, + { + "epoch": 15.052805280528053, + "grad_norm": 0.010609190911054611, + "learning_rate": 8.754857962008473e-06, + "loss": 0.0056, + "num_input_tokens_seen": 28876448, + "step": 136830 + }, + { + "epoch": 15.053355335533553, + "grad_norm": 0.6993365287780762, + "learning_rate": 8.753033744498846e-06, + "loss": 0.0214, + "num_input_tokens_seen": 28877504, + "step": 136835 + }, + { + "epoch": 15.053905390539054, + "grad_norm": 0.029764216393232346, + "learning_rate": 8.751209676729712e-06, + "loss": 0.1478, + "num_input_tokens_seen": 28878624, + "step": 136840 + }, + { + "epoch": 15.054455445544555, + "grad_norm": 0.08585590124130249, + "learning_rate": 8.749385758717882e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28879648, + "step": 136845 + }, + { + "epoch": 15.055005500550054, + "grad_norm": 0.057182375341653824, + "learning_rate": 8.747561990480175e-06, + "loss": 0.004, + "num_input_tokens_seen": 28880672, + "step": 136850 + }, + { + "epoch": 15.055555555555555, + "grad_norm": 0.007493680343031883, + "learning_rate": 8.745738372033388e-06, + "loss": 0.0035, + "num_input_tokens_seen": 28881728, + "step": 136855 + }, + { + "epoch": 15.056105610561056, + "grad_norm": 0.03394922986626625, + "learning_rate": 8.743914903394324e-06, + "loss": 0.0062, + "num_input_tokens_seen": 28882816, + "step": 136860 + }, + { + "epoch": 15.056655665566556, + "grad_norm": 0.582901656627655, + "learning_rate": 8.742091584579803e-06, + "loss": 0.008, + "num_input_tokens_seen": 28883904, + "step": 136865 + }, + { + "epoch": 15.057205720572057, + "grad_norm": 1.6563760042190552, + "learning_rate": 8.740268415606615e-06, + "loss": 0.0351, + "num_input_tokens_seen": 28885024, + "step": 136870 + }, + { + "epoch": 15.057755775577558, + "grad_norm": 0.028802137821912766, + "learning_rate": 8.738445396491571e-06, + "loss": 0.0102, + "num_input_tokens_seen": 28886080, + "step": 136875 + }, + { + "epoch": 15.058305830583059, + "grad_norm": 0.15199242532253265, + "learning_rate": 8.736622527251481e-06, + "loss": 0.0264, + "num_input_tokens_seen": 28887136, + "step": 136880 + }, + { + "epoch": 15.058855885588558, + "grad_norm": 0.02984583005309105, + "learning_rate": 8.734799807903129e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28888256, + "step": 136885 + }, + { + "epoch": 15.05940594059406, + "grad_norm": 0.07174104452133179, + "learning_rate": 8.73297723846333e-06, + "loss": 0.1101, + "num_input_tokens_seen": 28889344, + "step": 136890 + }, + { + "epoch": 15.05995599559956, + "grad_norm": 0.06075878441333771, + "learning_rate": 8.731154818948875e-06, + "loss": 0.0023, + "num_input_tokens_seen": 28890432, + "step": 136895 + }, + { + "epoch": 15.06050605060506, + "grad_norm": 0.09845767915248871, + "learning_rate": 8.729332549376542e-06, + "loss": 0.0093, + "num_input_tokens_seen": 28891488, + "step": 136900 + }, + { + "epoch": 15.061056105610561, + "grad_norm": 0.014314509928226471, + "learning_rate": 8.727510429763162e-06, + "loss": 0.1048, + "num_input_tokens_seen": 28892544, + "step": 136905 + }, + { + "epoch": 15.061606160616062, + "grad_norm": 0.37421754002571106, + "learning_rate": 8.725688460125501e-06, + "loss": 0.1324, + "num_input_tokens_seen": 28893632, + "step": 136910 + }, + { + "epoch": 15.062156215621561, + "grad_norm": 0.20707058906555176, + "learning_rate": 8.723866640480366e-06, + "loss": 0.0069, + "num_input_tokens_seen": 28894720, + "step": 136915 + }, + { + "epoch": 15.062706270627062, + "grad_norm": 1.2686223983764648, + "learning_rate": 8.722044970844545e-06, + "loss": 0.0087, + "num_input_tokens_seen": 28895776, + "step": 136920 + }, + { + "epoch": 15.063256325632564, + "grad_norm": 0.31364426016807556, + "learning_rate": 8.720223451234817e-06, + "loss": 0.0327, + "num_input_tokens_seen": 28896864, + "step": 136925 + }, + { + "epoch": 15.063806380638065, + "grad_norm": 0.002294007921591401, + "learning_rate": 8.718402081667976e-06, + "loss": 0.001, + "num_input_tokens_seen": 28897888, + "step": 136930 + }, + { + "epoch": 15.064356435643564, + "grad_norm": 0.028248634189367294, + "learning_rate": 8.716580862160809e-06, + "loss": 0.009, + "num_input_tokens_seen": 28898912, + "step": 136935 + }, + { + "epoch": 15.064906490649065, + "grad_norm": 0.010893238708376884, + "learning_rate": 8.714759792730112e-06, + "loss": 0.002, + "num_input_tokens_seen": 28900000, + "step": 136940 + }, + { + "epoch": 15.065456545654566, + "grad_norm": 0.008000294677913189, + "learning_rate": 8.71293887339266e-06, + "loss": 0.1118, + "num_input_tokens_seen": 28901088, + "step": 136945 + }, + { + "epoch": 15.066006600660065, + "grad_norm": 0.015326540917158127, + "learning_rate": 8.711118104165228e-06, + "loss": 0.011, + "num_input_tokens_seen": 28902144, + "step": 136950 + }, + { + "epoch": 15.066556655665567, + "grad_norm": 0.037848230451345444, + "learning_rate": 8.70929748506461e-06, + "loss": 0.0174, + "num_input_tokens_seen": 28903136, + "step": 136955 + }, + { + "epoch": 15.067106710671068, + "grad_norm": 0.005336351227015257, + "learning_rate": 8.707477016107566e-06, + "loss": 0.0024, + "num_input_tokens_seen": 28904256, + "step": 136960 + }, + { + "epoch": 15.067656765676567, + "grad_norm": 0.047015491873025894, + "learning_rate": 8.705656697310904e-06, + "loss": 0.0049, + "num_input_tokens_seen": 28905312, + "step": 136965 + }, + { + "epoch": 15.068206820682068, + "grad_norm": 0.00615065498277545, + "learning_rate": 8.703836528691384e-06, + "loss": 0.0023, + "num_input_tokens_seen": 28906368, + "step": 136970 + }, + { + "epoch": 15.06875687568757, + "grad_norm": 0.05954447016119957, + "learning_rate": 8.702016510265776e-06, + "loss": 0.0905, + "num_input_tokens_seen": 28907424, + "step": 136975 + }, + { + "epoch": 15.069306930693068, + "grad_norm": 0.05982106924057007, + "learning_rate": 8.70019664205087e-06, + "loss": 0.0084, + "num_input_tokens_seen": 28908416, + "step": 136980 + }, + { + "epoch": 15.06985698569857, + "grad_norm": 4.591664791107178, + "learning_rate": 8.698376924063423e-06, + "loss": 0.0717, + "num_input_tokens_seen": 28909408, + "step": 136985 + }, + { + "epoch": 15.07040704070407, + "grad_norm": 0.019473062828183174, + "learning_rate": 8.696557356320211e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28910496, + "step": 136990 + }, + { + "epoch": 15.070957095709572, + "grad_norm": 0.013134286738932133, + "learning_rate": 8.694737938838016e-06, + "loss": 0.1047, + "num_input_tokens_seen": 28911552, + "step": 136995 + }, + { + "epoch": 15.071507150715071, + "grad_norm": 0.005594285205006599, + "learning_rate": 8.692918671633588e-06, + "loss": 0.0328, + "num_input_tokens_seen": 28912576, + "step": 137000 + }, + { + "epoch": 15.072057205720572, + "grad_norm": 2.074500560760498, + "learning_rate": 8.691099554723715e-06, + "loss": 0.0748, + "num_input_tokens_seen": 28913600, + "step": 137005 + }, + { + "epoch": 15.072607260726073, + "grad_norm": 0.1729373037815094, + "learning_rate": 8.68928058812514e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28914688, + "step": 137010 + }, + { + "epoch": 15.073157315731573, + "grad_norm": 1.2710075378417969, + "learning_rate": 8.68746177185465e-06, + "loss": 0.0588, + "num_input_tokens_seen": 28915712, + "step": 137015 + }, + { + "epoch": 15.073707370737074, + "grad_norm": 0.6028501391410828, + "learning_rate": 8.685643105928986e-06, + "loss": 0.0044, + "num_input_tokens_seen": 28916800, + "step": 137020 + }, + { + "epoch": 15.074257425742575, + "grad_norm": 3.9775936603546143, + "learning_rate": 8.683824590364926e-06, + "loss": 0.0552, + "num_input_tokens_seen": 28917888, + "step": 137025 + }, + { + "epoch": 15.074807480748074, + "grad_norm": 0.0844900831580162, + "learning_rate": 8.682006225179229e-06, + "loss": 0.0545, + "num_input_tokens_seen": 28918976, + "step": 137030 + }, + { + "epoch": 15.075357535753575, + "grad_norm": 1.8741141557693481, + "learning_rate": 8.680188010388643e-06, + "loss": 0.0461, + "num_input_tokens_seen": 28920032, + "step": 137035 + }, + { + "epoch": 15.075907590759076, + "grad_norm": 0.08340948820114136, + "learning_rate": 8.678369946009943e-06, + "loss": 0.0018, + "num_input_tokens_seen": 28921088, + "step": 137040 + }, + { + "epoch": 15.076457645764576, + "grad_norm": 1.5857411623001099, + "learning_rate": 8.676552032059868e-06, + "loss": 0.0229, + "num_input_tokens_seen": 28922176, + "step": 137045 + }, + { + "epoch": 15.077007700770077, + "grad_norm": 0.03935983404517174, + "learning_rate": 8.674734268555179e-06, + "loss": 0.0401, + "num_input_tokens_seen": 28923296, + "step": 137050 + }, + { + "epoch": 15.077557755775578, + "grad_norm": 0.5067785382270813, + "learning_rate": 8.672916655512637e-06, + "loss": 0.0124, + "num_input_tokens_seen": 28924320, + "step": 137055 + }, + { + "epoch": 15.078107810781079, + "grad_norm": 0.03126613423228264, + "learning_rate": 8.671099192948989e-06, + "loss": 0.0817, + "num_input_tokens_seen": 28925344, + "step": 137060 + }, + { + "epoch": 15.078657865786578, + "grad_norm": 0.07326901704072952, + "learning_rate": 8.669281880880978e-06, + "loss": 0.037, + "num_input_tokens_seen": 28926400, + "step": 137065 + }, + { + "epoch": 15.07920792079208, + "grad_norm": 0.27397075295448303, + "learning_rate": 8.667464719325366e-06, + "loss": 0.0045, + "num_input_tokens_seen": 28927424, + "step": 137070 + }, + { + "epoch": 15.07975797579758, + "grad_norm": 0.03489266335964203, + "learning_rate": 8.665647708298877e-06, + "loss": 0.0702, + "num_input_tokens_seen": 28928544, + "step": 137075 + }, + { + "epoch": 15.08030803080308, + "grad_norm": 0.1976776421070099, + "learning_rate": 8.663830847818295e-06, + "loss": 0.0107, + "num_input_tokens_seen": 28929568, + "step": 137080 + }, + { + "epoch": 15.08085808580858, + "grad_norm": 0.12591435015201569, + "learning_rate": 8.662014137900343e-06, + "loss": 0.0034, + "num_input_tokens_seen": 28930688, + "step": 137085 + }, + { + "epoch": 15.081408140814082, + "grad_norm": 0.02675376459956169, + "learning_rate": 8.660197578561757e-06, + "loss": 0.0135, + "num_input_tokens_seen": 28931808, + "step": 137090 + }, + { + "epoch": 15.081958195819581, + "grad_norm": 0.04552670195698738, + "learning_rate": 8.658381169819299e-06, + "loss": 0.0017, + "num_input_tokens_seen": 28932768, + "step": 137095 + }, + { + "epoch": 15.082508250825082, + "grad_norm": 0.007945769466459751, + "learning_rate": 8.656564911689694e-06, + "loss": 0.0026, + "num_input_tokens_seen": 28933856, + "step": 137100 + }, + { + "epoch": 15.083058305830583, + "grad_norm": 0.09822914749383926, + "learning_rate": 8.654748804189685e-06, + "loss": 0.1105, + "num_input_tokens_seen": 28934912, + "step": 137105 + }, + { + "epoch": 15.083608360836084, + "grad_norm": 0.07462556660175323, + "learning_rate": 8.652932847336023e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28936000, + "step": 137110 + }, + { + "epoch": 15.084158415841584, + "grad_norm": 0.10474714636802673, + "learning_rate": 8.651117041145426e-06, + "loss": 0.0097, + "num_input_tokens_seen": 28937088, + "step": 137115 + }, + { + "epoch": 15.084708470847085, + "grad_norm": 0.2560470998287201, + "learning_rate": 8.649301385634648e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28938144, + "step": 137120 + }, + { + "epoch": 15.085258525852586, + "grad_norm": 0.0067342733964324, + "learning_rate": 8.647485880820402e-06, + "loss": 0.0122, + "num_input_tokens_seen": 28939168, + "step": 137125 + }, + { + "epoch": 15.085808580858085, + "grad_norm": 0.18244662880897522, + "learning_rate": 8.645670526719443e-06, + "loss": 0.0038, + "num_input_tokens_seen": 28940288, + "step": 137130 + }, + { + "epoch": 15.086358635863586, + "grad_norm": 0.10425490885972977, + "learning_rate": 8.643855323348482e-06, + "loss": 0.0027, + "num_input_tokens_seen": 28941408, + "step": 137135 + }, + { + "epoch": 15.086908690869087, + "grad_norm": 0.03747482970356941, + "learning_rate": 8.642040270724258e-06, + "loss": 0.0756, + "num_input_tokens_seen": 28942496, + "step": 137140 + }, + { + "epoch": 15.087458745874587, + "grad_norm": 0.07763449102640152, + "learning_rate": 8.640225368863508e-06, + "loss": 0.0404, + "num_input_tokens_seen": 28943520, + "step": 137145 + }, + { + "epoch": 15.088008800880088, + "grad_norm": 0.027325691655278206, + "learning_rate": 8.63841061778294e-06, + "loss": 0.0743, + "num_input_tokens_seen": 28944544, + "step": 137150 + }, + { + "epoch": 15.088558855885589, + "grad_norm": 1.6398494243621826, + "learning_rate": 8.636596017499301e-06, + "loss": 0.0753, + "num_input_tokens_seen": 28945568, + "step": 137155 + }, + { + "epoch": 15.089108910891088, + "grad_norm": 0.09682527184486389, + "learning_rate": 8.634781568029299e-06, + "loss": 0.0078, + "num_input_tokens_seen": 28946624, + "step": 137160 + }, + { + "epoch": 15.08965896589659, + "grad_norm": 0.0064353966154158115, + "learning_rate": 8.632967269389657e-06, + "loss": 0.0507, + "num_input_tokens_seen": 28947712, + "step": 137165 + }, + { + "epoch": 15.09020902090209, + "grad_norm": 1.8581815958023071, + "learning_rate": 8.631153121597113e-06, + "loss": 0.142, + "num_input_tokens_seen": 28948800, + "step": 137170 + }, + { + "epoch": 15.090759075907592, + "grad_norm": 0.03499570116400719, + "learning_rate": 8.629339124668368e-06, + "loss": 0.0337, + "num_input_tokens_seen": 28949888, + "step": 137175 + }, + { + "epoch": 15.091309130913091, + "grad_norm": 0.0936988815665245, + "learning_rate": 8.627525278620158e-06, + "loss": 0.0216, + "num_input_tokens_seen": 28951008, + "step": 137180 + }, + { + "epoch": 15.091859185918592, + "grad_norm": 0.2989426553249359, + "learning_rate": 8.62571158346919e-06, + "loss": 0.0047, + "num_input_tokens_seen": 28952096, + "step": 137185 + }, + { + "epoch": 15.092409240924093, + "grad_norm": 0.0463024266064167, + "learning_rate": 8.623898039232172e-06, + "loss": 0.0286, + "num_input_tokens_seen": 28953184, + "step": 137190 + }, + { + "epoch": 15.092959295929592, + "grad_norm": 0.024105502292513847, + "learning_rate": 8.622084645925829e-06, + "loss": 0.0683, + "num_input_tokens_seen": 28954240, + "step": 137195 + }, + { + "epoch": 15.093509350935093, + "grad_norm": 0.6687297224998474, + "learning_rate": 8.620271403566871e-06, + "loss": 0.0099, + "num_input_tokens_seen": 28955264, + "step": 137200 + }, + { + "epoch": 15.094059405940595, + "grad_norm": 0.04001560062170029, + "learning_rate": 8.618458312172023e-06, + "loss": 0.0025, + "num_input_tokens_seen": 28956320, + "step": 137205 + }, + { + "epoch": 15.094609460946094, + "grad_norm": 0.02064700797200203, + "learning_rate": 8.616645371757985e-06, + "loss": 0.1008, + "num_input_tokens_seen": 28957408, + "step": 137210 + }, + { + "epoch": 15.095159515951595, + "grad_norm": 0.0021832678467035294, + "learning_rate": 8.61483258234145e-06, + "loss": 0.0036, + "num_input_tokens_seen": 28958496, + "step": 137215 + }, + { + "epoch": 15.095709570957096, + "grad_norm": 0.016681095585227013, + "learning_rate": 8.613019943939146e-06, + "loss": 0.0029, + "num_input_tokens_seen": 28959520, + "step": 137220 + }, + { + "epoch": 15.096259625962595, + "grad_norm": 3.8835737705230713, + "learning_rate": 8.611207456567774e-06, + "loss": 0.0619, + "num_input_tokens_seen": 28960544, + "step": 137225 + }, + { + "epoch": 15.096809680968097, + "grad_norm": 0.006794173736125231, + "learning_rate": 8.609395120244046e-06, + "loss": 0.0686, + "num_input_tokens_seen": 28961600, + "step": 137230 + }, + { + "epoch": 15.097359735973598, + "grad_norm": 0.00589201133698225, + "learning_rate": 8.607582934984659e-06, + "loss": 0.0951, + "num_input_tokens_seen": 28962688, + "step": 137235 + }, + { + "epoch": 15.097909790979099, + "grad_norm": 0.02525252103805542, + "learning_rate": 8.605770900806306e-06, + "loss": 0.0053, + "num_input_tokens_seen": 28963744, + "step": 137240 + }, + { + "epoch": 15.098459845984598, + "grad_norm": 0.03404594957828522, + "learning_rate": 8.603959017725701e-06, + "loss": 0.008, + "num_input_tokens_seen": 28964832, + "step": 137245 + }, + { + "epoch": 15.099009900990099, + "grad_norm": 0.026615919545292854, + "learning_rate": 8.602147285759535e-06, + "loss": 0.0018, + "num_input_tokens_seen": 28965888, + "step": 137250 + }, + { + "epoch": 15.0995599559956, + "grad_norm": 0.11641839891672134, + "learning_rate": 8.600335704924506e-06, + "loss": 0.004, + "num_input_tokens_seen": 28966976, + "step": 137255 + }, + { + "epoch": 15.1001100110011, + "grad_norm": 0.021590525284409523, + "learning_rate": 8.598524275237322e-06, + "loss": 0.0295, + "num_input_tokens_seen": 28968032, + "step": 137260 + }, + { + "epoch": 15.1006600660066, + "grad_norm": 0.01523022260516882, + "learning_rate": 8.596712996714662e-06, + "loss": 0.0023, + "num_input_tokens_seen": 28969120, + "step": 137265 + }, + { + "epoch": 15.101210121012102, + "grad_norm": 0.015549881383776665, + "learning_rate": 8.594901869373234e-06, + "loss": 0.005, + "num_input_tokens_seen": 28970208, + "step": 137270 + }, + { + "epoch": 15.101760176017601, + "grad_norm": 1.2283358573913574, + "learning_rate": 8.593090893229716e-06, + "loss": 0.0202, + "num_input_tokens_seen": 28971264, + "step": 137275 + }, + { + "epoch": 15.102310231023102, + "grad_norm": 0.02662770263850689, + "learning_rate": 8.591280068300809e-06, + "loss": 0.1006, + "num_input_tokens_seen": 28972320, + "step": 137280 + }, + { + "epoch": 15.102860286028603, + "grad_norm": 0.05550026148557663, + "learning_rate": 8.589469394603202e-06, + "loss": 0.0219, + "num_input_tokens_seen": 28973440, + "step": 137285 + }, + { + "epoch": 15.103410341034103, + "grad_norm": 0.27966904640197754, + "learning_rate": 8.587658872153579e-06, + "loss": 0.0084, + "num_input_tokens_seen": 28974496, + "step": 137290 + }, + { + "epoch": 15.103960396039604, + "grad_norm": 0.2322726547718048, + "learning_rate": 8.585848500968636e-06, + "loss": 0.0135, + "num_input_tokens_seen": 28975520, + "step": 137295 + }, + { + "epoch": 15.104510451045105, + "grad_norm": 0.039287373423576355, + "learning_rate": 8.58403828106505e-06, + "loss": 0.0831, + "num_input_tokens_seen": 28976512, + "step": 137300 + }, + { + "epoch": 15.105060506050606, + "grad_norm": 0.969963550567627, + "learning_rate": 8.582228212459494e-06, + "loss": 0.0139, + "num_input_tokens_seen": 28977472, + "step": 137305 + }, + { + "epoch": 15.105610561056105, + "grad_norm": 1.0941908359527588, + "learning_rate": 8.580418295168669e-06, + "loss": 0.0137, + "num_input_tokens_seen": 28978560, + "step": 137310 + }, + { + "epoch": 15.106160616061606, + "grad_norm": 0.003001301782205701, + "learning_rate": 8.578608529209244e-06, + "loss": 0.0619, + "num_input_tokens_seen": 28979552, + "step": 137315 + }, + { + "epoch": 15.106710671067107, + "grad_norm": 0.033160410821437836, + "learning_rate": 8.576798914597916e-06, + "loss": 0.0018, + "num_input_tokens_seen": 28980544, + "step": 137320 + }, + { + "epoch": 15.107260726072607, + "grad_norm": 0.018107842653989792, + "learning_rate": 8.574989451351354e-06, + "loss": 0.0087, + "num_input_tokens_seen": 28981600, + "step": 137325 + }, + { + "epoch": 15.107810781078108, + "grad_norm": 0.028629060834646225, + "learning_rate": 8.57318013948622e-06, + "loss": 0.0149, + "num_input_tokens_seen": 28982624, + "step": 137330 + }, + { + "epoch": 15.108360836083609, + "grad_norm": 0.009014473296701908, + "learning_rate": 8.571370979019205e-06, + "loss": 0.0219, + "num_input_tokens_seen": 28983648, + "step": 137335 + }, + { + "epoch": 15.108910891089108, + "grad_norm": 0.005335877649486065, + "learning_rate": 8.56956196996698e-06, + "loss": 0.0682, + "num_input_tokens_seen": 28984704, + "step": 137340 + }, + { + "epoch": 15.10946094609461, + "grad_norm": 0.10286115109920502, + "learning_rate": 8.567753112346224e-06, + "loss": 0.1735, + "num_input_tokens_seen": 28985792, + "step": 137345 + }, + { + "epoch": 15.11001100110011, + "grad_norm": 0.027905916795134544, + "learning_rate": 8.565944406173607e-06, + "loss": 0.0104, + "num_input_tokens_seen": 28986880, + "step": 137350 + }, + { + "epoch": 15.110561056105611, + "grad_norm": 0.7695012092590332, + "learning_rate": 8.564135851465781e-06, + "loss": 0.0685, + "num_input_tokens_seen": 28987904, + "step": 137355 + }, + { + "epoch": 15.11111111111111, + "grad_norm": 0.08118141442537308, + "learning_rate": 8.56232744823944e-06, + "loss": 0.0076, + "num_input_tokens_seen": 28989024, + "step": 137360 + }, + { + "epoch": 15.111661166116612, + "grad_norm": 2.4751501083374023, + "learning_rate": 8.560519196511233e-06, + "loss": 0.0557, + "num_input_tokens_seen": 28990144, + "step": 137365 + }, + { + "epoch": 15.112211221122113, + "grad_norm": 0.3392435312271118, + "learning_rate": 8.558711096297827e-06, + "loss": 0.0042, + "num_input_tokens_seen": 28991200, + "step": 137370 + }, + { + "epoch": 15.112761276127612, + "grad_norm": 0.012858315370976925, + "learning_rate": 8.556903147615902e-06, + "loss": 0.0011, + "num_input_tokens_seen": 28992256, + "step": 137375 + }, + { + "epoch": 15.113311331133113, + "grad_norm": 0.18541285395622253, + "learning_rate": 8.555095350482103e-06, + "loss": 0.0315, + "num_input_tokens_seen": 28993376, + "step": 137380 + }, + { + "epoch": 15.113861386138614, + "grad_norm": 0.5270071625709534, + "learning_rate": 8.553287704913107e-06, + "loss": 0.0944, + "num_input_tokens_seen": 28994432, + "step": 137385 + }, + { + "epoch": 15.114411441144114, + "grad_norm": 0.008831880055367947, + "learning_rate": 8.551480210925557e-06, + "loss": 0.0022, + "num_input_tokens_seen": 28995456, + "step": 137390 + }, + { + "epoch": 15.114961496149615, + "grad_norm": 0.023423409089446068, + "learning_rate": 8.549672868536124e-06, + "loss": 0.0196, + "num_input_tokens_seen": 28996512, + "step": 137395 + }, + { + "epoch": 15.115511551155116, + "grad_norm": 0.11220183223485947, + "learning_rate": 8.547865677761469e-06, + "loss": 0.1243, + "num_input_tokens_seen": 28997568, + "step": 137400 + }, + { + "epoch": 15.116061606160615, + "grad_norm": 0.02819433994591236, + "learning_rate": 8.546058638618235e-06, + "loss": 0.0082, + "num_input_tokens_seen": 28998592, + "step": 137405 + }, + { + "epoch": 15.116611661166116, + "grad_norm": 0.2330145239830017, + "learning_rate": 8.54425175112309e-06, + "loss": 0.0031, + "num_input_tokens_seen": 28999680, + "step": 137410 + }, + { + "epoch": 15.117161716171617, + "grad_norm": 0.020245380699634552, + "learning_rate": 8.542445015292671e-06, + "loss": 0.005, + "num_input_tokens_seen": 29000736, + "step": 137415 + }, + { + "epoch": 15.117711771177119, + "grad_norm": 0.08464103937149048, + "learning_rate": 8.540638431143646e-06, + "loss": 0.1275, + "num_input_tokens_seen": 29001760, + "step": 137420 + }, + { + "epoch": 15.118261826182618, + "grad_norm": 0.018475621938705444, + "learning_rate": 8.538831998692653e-06, + "loss": 0.0101, + "num_input_tokens_seen": 29002848, + "step": 137425 + }, + { + "epoch": 15.118811881188119, + "grad_norm": 1.8708821535110474, + "learning_rate": 8.537025717956349e-06, + "loss": 0.0615, + "num_input_tokens_seen": 29003872, + "step": 137430 + }, + { + "epoch": 15.11936193619362, + "grad_norm": 0.022070592269301414, + "learning_rate": 8.535219588951384e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29004896, + "step": 137435 + }, + { + "epoch": 15.11991199119912, + "grad_norm": 0.004770530387759209, + "learning_rate": 8.533413611694392e-06, + "loss": 0.0107, + "num_input_tokens_seen": 29005984, + "step": 137440 + }, + { + "epoch": 15.12046204620462, + "grad_norm": 0.008191940374672413, + "learning_rate": 8.531607786202033e-06, + "loss": 0.0071, + "num_input_tokens_seen": 29006976, + "step": 137445 + }, + { + "epoch": 15.121012101210122, + "grad_norm": 0.14665140211582184, + "learning_rate": 8.529802112490934e-06, + "loss": 0.0283, + "num_input_tokens_seen": 29008064, + "step": 137450 + }, + { + "epoch": 15.12156215621562, + "grad_norm": 0.35384485125541687, + "learning_rate": 8.527996590577747e-06, + "loss": 0.0048, + "num_input_tokens_seen": 29009120, + "step": 137455 + }, + { + "epoch": 15.122112211221122, + "grad_norm": 0.08752112090587616, + "learning_rate": 8.52619122047912e-06, + "loss": 0.0381, + "num_input_tokens_seen": 29010080, + "step": 137460 + }, + { + "epoch": 15.122662266226623, + "grad_norm": 0.014179364778101444, + "learning_rate": 8.524386002211685e-06, + "loss": 0.0264, + "num_input_tokens_seen": 29011200, + "step": 137465 + }, + { + "epoch": 15.123212321232122, + "grad_norm": 0.35154908895492554, + "learning_rate": 8.522580935792065e-06, + "loss": 0.0205, + "num_input_tokens_seen": 29012256, + "step": 137470 + }, + { + "epoch": 15.123762376237623, + "grad_norm": 0.06527609378099442, + "learning_rate": 8.520776021236923e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29013312, + "step": 137475 + }, + { + "epoch": 15.124312431243125, + "grad_norm": 0.005391643848270178, + "learning_rate": 8.518971258562872e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29014368, + "step": 137480 + }, + { + "epoch": 15.124862486248626, + "grad_norm": 0.07281611114740372, + "learning_rate": 8.517166647786553e-06, + "loss": 0.002, + "num_input_tokens_seen": 29015520, + "step": 137485 + }, + { + "epoch": 15.125412541254125, + "grad_norm": 0.11298729479312897, + "learning_rate": 8.515362188924608e-06, + "loss": 0.0263, + "num_input_tokens_seen": 29016576, + "step": 137490 + }, + { + "epoch": 15.125962596259626, + "grad_norm": 1.3977774381637573, + "learning_rate": 8.513557881993653e-06, + "loss": 0.017, + "num_input_tokens_seen": 29017632, + "step": 137495 + }, + { + "epoch": 15.126512651265127, + "grad_norm": 0.509686291217804, + "learning_rate": 8.51175372701033e-06, + "loss": 0.0039, + "num_input_tokens_seen": 29018720, + "step": 137500 + }, + { + "epoch": 15.127062706270626, + "grad_norm": 0.026481512933969498, + "learning_rate": 8.509949723991253e-06, + "loss": 0.0415, + "num_input_tokens_seen": 29019744, + "step": 137505 + }, + { + "epoch": 15.127612761276128, + "grad_norm": 0.10976015031337738, + "learning_rate": 8.50814587295306e-06, + "loss": 0.0026, + "num_input_tokens_seen": 29020800, + "step": 137510 + }, + { + "epoch": 15.128162816281629, + "grad_norm": 0.20250795781612396, + "learning_rate": 8.50634217391238e-06, + "loss": 0.0597, + "num_input_tokens_seen": 29021856, + "step": 137515 + }, + { + "epoch": 15.128712871287128, + "grad_norm": 0.32345569133758545, + "learning_rate": 8.504538626885818e-06, + "loss": 0.0056, + "num_input_tokens_seen": 29022944, + "step": 137520 + }, + { + "epoch": 15.129262926292629, + "grad_norm": 0.03799113631248474, + "learning_rate": 8.50273523189002e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29024064, + "step": 137525 + }, + { + "epoch": 15.12981298129813, + "grad_norm": 0.05268248915672302, + "learning_rate": 8.500931988941584e-06, + "loss": 0.0615, + "num_input_tokens_seen": 29025152, + "step": 137530 + }, + { + "epoch": 15.130363036303631, + "grad_norm": 0.05369865894317627, + "learning_rate": 8.499128898057151e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29026208, + "step": 137535 + }, + { + "epoch": 15.13091309130913, + "grad_norm": 2.6561200618743896, + "learning_rate": 8.497325959253321e-06, + "loss": 0.0317, + "num_input_tokens_seen": 29027264, + "step": 137540 + }, + { + "epoch": 15.131463146314632, + "grad_norm": 0.02425713837146759, + "learning_rate": 8.495523172546722e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29028256, + "step": 137545 + }, + { + "epoch": 15.132013201320133, + "grad_norm": 0.028068356215953827, + "learning_rate": 8.49372053795397e-06, + "loss": 0.0767, + "num_input_tokens_seen": 29029280, + "step": 137550 + }, + { + "epoch": 15.132563256325632, + "grad_norm": 0.025753861293196678, + "learning_rate": 8.491918055491668e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29030304, + "step": 137555 + }, + { + "epoch": 15.133113311331133, + "grad_norm": 7.937704563140869, + "learning_rate": 8.490115725176446e-06, + "loss": 0.0707, + "num_input_tokens_seen": 29031360, + "step": 137560 + }, + { + "epoch": 15.133663366336634, + "grad_norm": 1.4638491868972778, + "learning_rate": 8.488313547024892e-06, + "loss": 0.0943, + "num_input_tokens_seen": 29032448, + "step": 137565 + }, + { + "epoch": 15.134213421342134, + "grad_norm": 1.3882144689559937, + "learning_rate": 8.486511521053633e-06, + "loss": 0.0066, + "num_input_tokens_seen": 29033472, + "step": 137570 + }, + { + "epoch": 15.134763476347635, + "grad_norm": 0.12842914462089539, + "learning_rate": 8.484709647279281e-06, + "loss": 0.0022, + "num_input_tokens_seen": 29034496, + "step": 137575 + }, + { + "epoch": 15.135313531353136, + "grad_norm": 0.14534056186676025, + "learning_rate": 8.482907925718427e-06, + "loss": 0.0357, + "num_input_tokens_seen": 29035456, + "step": 137580 + }, + { + "epoch": 15.135863586358635, + "grad_norm": 0.39104291796684265, + "learning_rate": 8.481106356387694e-06, + "loss": 0.0028, + "num_input_tokens_seen": 29036512, + "step": 137585 + }, + { + "epoch": 15.136413641364136, + "grad_norm": 0.007773836608976126, + "learning_rate": 8.479304939303676e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29037536, + "step": 137590 + }, + { + "epoch": 15.136963696369637, + "grad_norm": 0.023579252883791924, + "learning_rate": 8.47750367448297e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29038656, + "step": 137595 + }, + { + "epoch": 15.137513751375138, + "grad_norm": 0.049630992114543915, + "learning_rate": 8.475702561942184e-06, + "loss": 0.0048, + "num_input_tokens_seen": 29039680, + "step": 137600 + }, + { + "epoch": 15.138063806380638, + "grad_norm": 0.08380749821662903, + "learning_rate": 8.473901601697918e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29040768, + "step": 137605 + }, + { + "epoch": 15.138613861386139, + "grad_norm": 0.00402890145778656, + "learning_rate": 8.472100793766777e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29041792, + "step": 137610 + }, + { + "epoch": 15.13916391639164, + "grad_norm": 0.06661723554134369, + "learning_rate": 8.470300138165354e-06, + "loss": 0.0398, + "num_input_tokens_seen": 29042848, + "step": 137615 + }, + { + "epoch": 15.13971397139714, + "grad_norm": 1.0199692249298096, + "learning_rate": 8.468499634910235e-06, + "loss": 0.0257, + "num_input_tokens_seen": 29043904, + "step": 137620 + }, + { + "epoch": 15.14026402640264, + "grad_norm": 1.5763823986053467, + "learning_rate": 8.466699284018023e-06, + "loss": 0.0714, + "num_input_tokens_seen": 29044896, + "step": 137625 + }, + { + "epoch": 15.140814081408141, + "grad_norm": 0.16437940299510956, + "learning_rate": 8.46489908550531e-06, + "loss": 0.0043, + "num_input_tokens_seen": 29046016, + "step": 137630 + }, + { + "epoch": 15.14136413641364, + "grad_norm": 0.09118334949016571, + "learning_rate": 8.463099039388695e-06, + "loss": 0.0315, + "num_input_tokens_seen": 29047040, + "step": 137635 + }, + { + "epoch": 15.141914191419142, + "grad_norm": 0.020497538149356842, + "learning_rate": 8.461299145684761e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29048096, + "step": 137640 + }, + { + "epoch": 15.142464246424643, + "grad_norm": 0.9730647206306458, + "learning_rate": 8.45949940441009e-06, + "loss": 0.0125, + "num_input_tokens_seen": 29049152, + "step": 137645 + }, + { + "epoch": 15.143014301430142, + "grad_norm": 0.11284872144460678, + "learning_rate": 8.457699815581283e-06, + "loss": 0.0038, + "num_input_tokens_seen": 29050176, + "step": 137650 + }, + { + "epoch": 15.143564356435643, + "grad_norm": 0.018943309783935547, + "learning_rate": 8.455900379214909e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29051200, + "step": 137655 + }, + { + "epoch": 15.144114411441144, + "grad_norm": 0.022695446386933327, + "learning_rate": 8.454101095327565e-06, + "loss": 0.0667, + "num_input_tokens_seen": 29052256, + "step": 137660 + }, + { + "epoch": 15.144664466446645, + "grad_norm": 0.0049606808461248875, + "learning_rate": 8.45230196393584e-06, + "loss": 0.0107, + "num_input_tokens_seen": 29053312, + "step": 137665 + }, + { + "epoch": 15.145214521452145, + "grad_norm": 1.8377437591552734, + "learning_rate": 8.450502985056299e-06, + "loss": 0.0464, + "num_input_tokens_seen": 29054400, + "step": 137670 + }, + { + "epoch": 15.145764576457646, + "grad_norm": 0.07556430995464325, + "learning_rate": 8.44870415870554e-06, + "loss": 0.0076, + "num_input_tokens_seen": 29055456, + "step": 137675 + }, + { + "epoch": 15.146314631463147, + "grad_norm": 1.2967121601104736, + "learning_rate": 8.44690548490013e-06, + "loss": 0.0227, + "num_input_tokens_seen": 29056512, + "step": 137680 + }, + { + "epoch": 15.146864686468646, + "grad_norm": 0.06335490196943283, + "learning_rate": 8.445106963656634e-06, + "loss": 0.0356, + "num_input_tokens_seen": 29057568, + "step": 137685 + }, + { + "epoch": 15.147414741474147, + "grad_norm": 0.15518517792224884, + "learning_rate": 8.443308594991661e-06, + "loss": 0.0123, + "num_input_tokens_seen": 29058592, + "step": 137690 + }, + { + "epoch": 15.147964796479648, + "grad_norm": 0.034710854291915894, + "learning_rate": 8.441510378921757e-06, + "loss": 0.0057, + "num_input_tokens_seen": 29059648, + "step": 137695 + }, + { + "epoch": 15.148514851485148, + "grad_norm": 0.02712062932550907, + "learning_rate": 8.439712315463519e-06, + "loss": 0.0269, + "num_input_tokens_seen": 29060736, + "step": 137700 + }, + { + "epoch": 15.149064906490649, + "grad_norm": 0.4335329234600067, + "learning_rate": 8.437914404633501e-06, + "loss": 0.0715, + "num_input_tokens_seen": 29061792, + "step": 137705 + }, + { + "epoch": 15.14961496149615, + "grad_norm": 0.07459349930286407, + "learning_rate": 8.436116646448275e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29062848, + "step": 137710 + }, + { + "epoch": 15.150165016501651, + "grad_norm": 0.009249004535377026, + "learning_rate": 8.43431904092441e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29063872, + "step": 137715 + }, + { + "epoch": 15.15071507150715, + "grad_norm": 0.05796569585800171, + "learning_rate": 8.432521588078479e-06, + "loss": 0.0026, + "num_input_tokens_seen": 29064960, + "step": 137720 + }, + { + "epoch": 15.151265126512651, + "grad_norm": 0.006679212674498558, + "learning_rate": 8.430724287927056e-06, + "loss": 0.0036, + "num_input_tokens_seen": 29065984, + "step": 137725 + }, + { + "epoch": 15.151815181518153, + "grad_norm": 0.018899977207183838, + "learning_rate": 8.428927140486695e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29067072, + "step": 137730 + }, + { + "epoch": 15.152365236523652, + "grad_norm": 0.07163440436124802, + "learning_rate": 8.427130145773953e-06, + "loss": 0.0122, + "num_input_tokens_seen": 29068032, + "step": 137735 + }, + { + "epoch": 15.152915291529153, + "grad_norm": 0.0188862644135952, + "learning_rate": 8.425333303805411e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29069152, + "step": 137740 + }, + { + "epoch": 15.153465346534654, + "grad_norm": 0.008768915198743343, + "learning_rate": 8.4235366145976e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29070272, + "step": 137745 + }, + { + "epoch": 15.154015401540153, + "grad_norm": 0.013078202493488789, + "learning_rate": 8.421740078167111e-06, + "loss": 0.0831, + "num_input_tokens_seen": 29071328, + "step": 137750 + }, + { + "epoch": 15.154565456545654, + "grad_norm": 2.9565422534942627, + "learning_rate": 8.419943694530494e-06, + "loss": 0.0924, + "num_input_tokens_seen": 29072352, + "step": 137755 + }, + { + "epoch": 15.155115511551156, + "grad_norm": 0.6522651314735413, + "learning_rate": 8.418147463704287e-06, + "loss": 0.1126, + "num_input_tokens_seen": 29073440, + "step": 137760 + }, + { + "epoch": 15.155665566556655, + "grad_norm": 0.1417413353919983, + "learning_rate": 8.41635138570507e-06, + "loss": 0.0139, + "num_input_tokens_seen": 29074528, + "step": 137765 + }, + { + "epoch": 15.156215621562156, + "grad_norm": 0.1010979413986206, + "learning_rate": 8.414555460549375e-06, + "loss": 0.0051, + "num_input_tokens_seen": 29075616, + "step": 137770 + }, + { + "epoch": 15.156765676567657, + "grad_norm": 0.008122983388602734, + "learning_rate": 8.412759688253763e-06, + "loss": 0.157, + "num_input_tokens_seen": 29076640, + "step": 137775 + }, + { + "epoch": 15.157315731573158, + "grad_norm": 0.013616321608424187, + "learning_rate": 8.410964068834796e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29077664, + "step": 137780 + }, + { + "epoch": 15.157865786578657, + "grad_norm": 0.18436887860298157, + "learning_rate": 8.409168602309003e-06, + "loss": 0.1111, + "num_input_tokens_seen": 29078720, + "step": 137785 + }, + { + "epoch": 15.158415841584159, + "grad_norm": 0.029543563723564148, + "learning_rate": 8.407373288692952e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29079776, + "step": 137790 + }, + { + "epoch": 15.15896589658966, + "grad_norm": 0.3319084942340851, + "learning_rate": 8.40557812800317e-06, + "loss": 0.0054, + "num_input_tokens_seen": 29080832, + "step": 137795 + }, + { + "epoch": 15.159515951595159, + "grad_norm": 2.314405679702759, + "learning_rate": 8.40378312025622e-06, + "loss": 0.0146, + "num_input_tokens_seen": 29081952, + "step": 137800 + }, + { + "epoch": 15.16006600660066, + "grad_norm": 0.027918055653572083, + "learning_rate": 8.40198826546863e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29083040, + "step": 137805 + }, + { + "epoch": 15.160616061606161, + "grad_norm": 0.028214355930685997, + "learning_rate": 8.400193563656947e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29084064, + "step": 137810 + }, + { + "epoch": 15.16116611661166, + "grad_norm": 0.062087398022413254, + "learning_rate": 8.398399014837727e-06, + "loss": 0.006, + "num_input_tokens_seen": 29085088, + "step": 137815 + }, + { + "epoch": 15.161716171617162, + "grad_norm": 2.800583839416504, + "learning_rate": 8.396604619027485e-06, + "loss": 0.1268, + "num_input_tokens_seen": 29086112, + "step": 137820 + }, + { + "epoch": 15.162266226622663, + "grad_norm": 0.5659613013267517, + "learning_rate": 8.39481037624278e-06, + "loss": 0.006, + "num_input_tokens_seen": 29087168, + "step": 137825 + }, + { + "epoch": 15.162816281628162, + "grad_norm": 0.0028932325076311827, + "learning_rate": 8.393016286500132e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29088192, + "step": 137830 + }, + { + "epoch": 15.163366336633663, + "grad_norm": 0.05744260549545288, + "learning_rate": 8.391222349816086e-06, + "loss": 0.0259, + "num_input_tokens_seen": 29089184, + "step": 137835 + }, + { + "epoch": 15.163916391639164, + "grad_norm": 0.891782820224762, + "learning_rate": 8.389428566207181e-06, + "loss": 0.0074, + "num_input_tokens_seen": 29090176, + "step": 137840 + }, + { + "epoch": 15.164466446644665, + "grad_norm": 0.012967953458428383, + "learning_rate": 8.387634935689942e-06, + "loss": 0.0604, + "num_input_tokens_seen": 29091200, + "step": 137845 + }, + { + "epoch": 15.165016501650165, + "grad_norm": 0.027765333652496338, + "learning_rate": 8.38584145828089e-06, + "loss": 0.0314, + "num_input_tokens_seen": 29092224, + "step": 137850 + }, + { + "epoch": 15.165566556655666, + "grad_norm": 0.9963408708572388, + "learning_rate": 8.384048133996578e-06, + "loss": 0.0138, + "num_input_tokens_seen": 29093216, + "step": 137855 + }, + { + "epoch": 15.166116611661167, + "grad_norm": 0.013669576495885849, + "learning_rate": 8.382254962853509e-06, + "loss": 0.058, + "num_input_tokens_seen": 29094272, + "step": 137860 + }, + { + "epoch": 15.166666666666666, + "grad_norm": 0.018166987225413322, + "learning_rate": 8.380461944868223e-06, + "loss": 0.0486, + "num_input_tokens_seen": 29095392, + "step": 137865 + }, + { + "epoch": 15.167216721672167, + "grad_norm": 0.07149826735258102, + "learning_rate": 8.378669080057253e-06, + "loss": 0.003, + "num_input_tokens_seen": 29096416, + "step": 137870 + }, + { + "epoch": 15.167766776677668, + "grad_norm": 0.007578811142593622, + "learning_rate": 8.376876368437106e-06, + "loss": 0.0029, + "num_input_tokens_seen": 29097504, + "step": 137875 + }, + { + "epoch": 15.168316831683168, + "grad_norm": 2.040395736694336, + "learning_rate": 8.375083810024323e-06, + "loss": 0.0333, + "num_input_tokens_seen": 29098496, + "step": 137880 + }, + { + "epoch": 15.168866886688669, + "grad_norm": 0.07116009294986725, + "learning_rate": 8.373291404835405e-06, + "loss": 0.0053, + "num_input_tokens_seen": 29099520, + "step": 137885 + }, + { + "epoch": 15.16941694169417, + "grad_norm": 0.03118080645799637, + "learning_rate": 8.37149915288688e-06, + "loss": 0.0037, + "num_input_tokens_seen": 29100608, + "step": 137890 + }, + { + "epoch": 15.16996699669967, + "grad_norm": 0.07314877212047577, + "learning_rate": 8.36970705419528e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29101632, + "step": 137895 + }, + { + "epoch": 15.17051705170517, + "grad_norm": 2.200899124145508, + "learning_rate": 8.3679151087771e-06, + "loss": 0.0242, + "num_input_tokens_seen": 29102656, + "step": 137900 + }, + { + "epoch": 15.171067106710671, + "grad_norm": 0.45157861709594727, + "learning_rate": 8.366123316648875e-06, + "loss": 0.0112, + "num_input_tokens_seen": 29103648, + "step": 137905 + }, + { + "epoch": 15.171617161716172, + "grad_norm": 0.09853300452232361, + "learning_rate": 8.3643316778271e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29104640, + "step": 137910 + }, + { + "epoch": 15.172167216721672, + "grad_norm": 0.00308560230769217, + "learning_rate": 8.362540192328308e-06, + "loss": 0.0033, + "num_input_tokens_seen": 29105696, + "step": 137915 + }, + { + "epoch": 15.172717271727173, + "grad_norm": 0.025649921968579292, + "learning_rate": 8.360748860168987e-06, + "loss": 0.0445, + "num_input_tokens_seen": 29106784, + "step": 137920 + }, + { + "epoch": 15.173267326732674, + "grad_norm": 0.11483034491539001, + "learning_rate": 8.358957681365664e-06, + "loss": 0.0161, + "num_input_tokens_seen": 29107808, + "step": 137925 + }, + { + "epoch": 15.173817381738173, + "grad_norm": 0.03277549520134926, + "learning_rate": 8.357166655934847e-06, + "loss": 0.006, + "num_input_tokens_seen": 29108800, + "step": 137930 + }, + { + "epoch": 15.174367436743674, + "grad_norm": 0.01045642327517271, + "learning_rate": 8.355375783893036e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29109792, + "step": 137935 + }, + { + "epoch": 15.174917491749175, + "grad_norm": 0.33097344636917114, + "learning_rate": 8.353585065256744e-06, + "loss": 0.0029, + "num_input_tokens_seen": 29110784, + "step": 137940 + }, + { + "epoch": 15.175467546754675, + "grad_norm": 0.010663582943379879, + "learning_rate": 8.351794500042462e-06, + "loss": 0.0142, + "num_input_tokens_seen": 29111872, + "step": 137945 + }, + { + "epoch": 15.176017601760176, + "grad_norm": 0.44583287835121155, + "learning_rate": 8.350004088266702e-06, + "loss": 0.0053, + "num_input_tokens_seen": 29112928, + "step": 137950 + }, + { + "epoch": 15.176567656765677, + "grad_norm": 0.7325696349143982, + "learning_rate": 8.348213829945975e-06, + "loss": 0.0082, + "num_input_tokens_seen": 29114016, + "step": 137955 + }, + { + "epoch": 15.177117711771178, + "grad_norm": 0.0058209579437971115, + "learning_rate": 8.34642372509676e-06, + "loss": 0.1029, + "num_input_tokens_seen": 29115040, + "step": 137960 + }, + { + "epoch": 15.177667766776677, + "grad_norm": 0.007051095832139254, + "learning_rate": 8.344633773735578e-06, + "loss": 0.0023, + "num_input_tokens_seen": 29116128, + "step": 137965 + }, + { + "epoch": 15.178217821782178, + "grad_norm": 0.24757763743400574, + "learning_rate": 8.342843975878912e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29117216, + "step": 137970 + }, + { + "epoch": 15.17876787678768, + "grad_norm": 0.007883846759796143, + "learning_rate": 8.341054331543252e-06, + "loss": 0.0068, + "num_input_tokens_seen": 29118240, + "step": 137975 + }, + { + "epoch": 15.179317931793179, + "grad_norm": 0.004625687375664711, + "learning_rate": 8.339264840745101e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29119264, + "step": 137980 + }, + { + "epoch": 15.17986798679868, + "grad_norm": 0.05157145857810974, + "learning_rate": 8.337475503500952e-06, + "loss": 0.092, + "num_input_tokens_seen": 29120256, + "step": 137985 + }, + { + "epoch": 15.180418041804181, + "grad_norm": 0.006115838885307312, + "learning_rate": 8.335686319827305e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29121280, + "step": 137990 + }, + { + "epoch": 15.18096809680968, + "grad_norm": 0.003398788860067725, + "learning_rate": 8.333897289740639e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29122336, + "step": 137995 + }, + { + "epoch": 15.181518151815181, + "grad_norm": 1.3854717016220093, + "learning_rate": 8.332108413257435e-06, + "loss": 0.0097, + "num_input_tokens_seen": 29123360, + "step": 138000 + }, + { + "epoch": 15.182068206820682, + "grad_norm": 0.003859013319015503, + "learning_rate": 8.330319690394193e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29124480, + "step": 138005 + }, + { + "epoch": 15.182618261826182, + "grad_norm": 0.2688295245170593, + "learning_rate": 8.328531121167394e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29125472, + "step": 138010 + }, + { + "epoch": 15.183168316831683, + "grad_norm": 0.0771985799074173, + "learning_rate": 8.32674270559353e-06, + "loss": 0.0051, + "num_input_tokens_seen": 29126560, + "step": 138015 + }, + { + "epoch": 15.183718371837184, + "grad_norm": 0.04140692949295044, + "learning_rate": 8.324954443689082e-06, + "loss": 0.0351, + "num_input_tokens_seen": 29127584, + "step": 138020 + }, + { + "epoch": 15.184268426842685, + "grad_norm": 0.024280408397316933, + "learning_rate": 8.323166335470515e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29128672, + "step": 138025 + }, + { + "epoch": 15.184818481848184, + "grad_norm": 0.8552475571632385, + "learning_rate": 8.321378380954331e-06, + "loss": 0.0063, + "num_input_tokens_seen": 29129728, + "step": 138030 + }, + { + "epoch": 15.185368536853685, + "grad_norm": 0.03808954730629921, + "learning_rate": 8.319590580156994e-06, + "loss": 0.0939, + "num_input_tokens_seen": 29130880, + "step": 138035 + }, + { + "epoch": 15.185918591859187, + "grad_norm": 0.015142909251153469, + "learning_rate": 8.317802933094983e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29131936, + "step": 138040 + }, + { + "epoch": 15.186468646864686, + "grad_norm": 0.005699295550584793, + "learning_rate": 8.316015439784789e-06, + "loss": 0.0035, + "num_input_tokens_seen": 29132992, + "step": 138045 + }, + { + "epoch": 15.187018701870187, + "grad_norm": 0.15449588000774384, + "learning_rate": 8.314228100242864e-06, + "loss": 0.0734, + "num_input_tokens_seen": 29134112, + "step": 138050 + }, + { + "epoch": 15.187568756875688, + "grad_norm": 0.09307719022035599, + "learning_rate": 8.3124409144857e-06, + "loss": 0.0279, + "num_input_tokens_seen": 29135168, + "step": 138055 + }, + { + "epoch": 15.188118811881187, + "grad_norm": 0.0029494811315089464, + "learning_rate": 8.310653882529753e-06, + "loss": 0.001, + "num_input_tokens_seen": 29136192, + "step": 138060 + }, + { + "epoch": 15.188668866886688, + "grad_norm": 1.2670153379440308, + "learning_rate": 8.308867004391502e-06, + "loss": 0.0058, + "num_input_tokens_seen": 29137312, + "step": 138065 + }, + { + "epoch": 15.18921892189219, + "grad_norm": 0.0199362114071846, + "learning_rate": 8.307080280087423e-06, + "loss": 0.074, + "num_input_tokens_seen": 29138368, + "step": 138070 + }, + { + "epoch": 15.189768976897689, + "grad_norm": 0.02839401178061962, + "learning_rate": 8.305293709633965e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29139456, + "step": 138075 + }, + { + "epoch": 15.19031903190319, + "grad_norm": 0.0055047390051186085, + "learning_rate": 8.303507293047613e-06, + "loss": 0.078, + "num_input_tokens_seen": 29140480, + "step": 138080 + }, + { + "epoch": 15.190869086908691, + "grad_norm": 0.01459323801100254, + "learning_rate": 8.301721030344823e-06, + "loss": 0.0327, + "num_input_tokens_seen": 29141472, + "step": 138085 + }, + { + "epoch": 15.191419141914192, + "grad_norm": 0.007636490277945995, + "learning_rate": 8.29993492154205e-06, + "loss": 0.1257, + "num_input_tokens_seen": 29142560, + "step": 138090 + }, + { + "epoch": 15.191969196919691, + "grad_norm": 0.8761721849441528, + "learning_rate": 8.29814896665576e-06, + "loss": 0.0052, + "num_input_tokens_seen": 29143584, + "step": 138095 + }, + { + "epoch": 15.192519251925193, + "grad_norm": 0.01552582811564207, + "learning_rate": 8.29636316570242e-06, + "loss": 0.0321, + "num_input_tokens_seen": 29144640, + "step": 138100 + }, + { + "epoch": 15.193069306930694, + "grad_norm": 0.2091124802827835, + "learning_rate": 8.294577518698493e-06, + "loss": 0.0116, + "num_input_tokens_seen": 29145728, + "step": 138105 + }, + { + "epoch": 15.193619361936193, + "grad_norm": 0.03388401120901108, + "learning_rate": 8.292792025660429e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29146688, + "step": 138110 + }, + { + "epoch": 15.194169416941694, + "grad_norm": 0.02858424186706543, + "learning_rate": 8.291006686604677e-06, + "loss": 0.0559, + "num_input_tokens_seen": 29147680, + "step": 138115 + }, + { + "epoch": 15.194719471947195, + "grad_norm": 0.027668969705700874, + "learning_rate": 8.289221501547698e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29148736, + "step": 138120 + }, + { + "epoch": 15.195269526952695, + "grad_norm": 0.08340191096067429, + "learning_rate": 8.287436470505943e-06, + "loss": 0.0066, + "num_input_tokens_seen": 29149824, + "step": 138125 + }, + { + "epoch": 15.195819581958196, + "grad_norm": 0.0788566991686821, + "learning_rate": 8.285651593495878e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29150880, + "step": 138130 + }, + { + "epoch": 15.196369636963697, + "grad_norm": 0.14595066010951996, + "learning_rate": 8.28386687053394e-06, + "loss": 0.0075, + "num_input_tokens_seen": 29151968, + "step": 138135 + }, + { + "epoch": 15.196919691969198, + "grad_norm": 0.12689253687858582, + "learning_rate": 8.282082301636573e-06, + "loss": 0.0278, + "num_input_tokens_seen": 29153024, + "step": 138140 + }, + { + "epoch": 15.197469746974697, + "grad_norm": 0.012070336379110813, + "learning_rate": 8.280297886820237e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29154144, + "step": 138145 + }, + { + "epoch": 15.198019801980198, + "grad_norm": 0.019340621307492256, + "learning_rate": 8.278513626101367e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29155168, + "step": 138150 + }, + { + "epoch": 15.1985698569857, + "grad_norm": 0.012132521718740463, + "learning_rate": 8.276729519496412e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29156224, + "step": 138155 + }, + { + "epoch": 15.199119911991199, + "grad_norm": 0.02411588653922081, + "learning_rate": 8.274945567021828e-06, + "loss": 0.0047, + "num_input_tokens_seen": 29157184, + "step": 138160 + }, + { + "epoch": 15.1996699669967, + "grad_norm": 0.04027939215302467, + "learning_rate": 8.273161768694032e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29158208, + "step": 138165 + }, + { + "epoch": 15.2002200220022, + "grad_norm": 0.008086156100034714, + "learning_rate": 8.271378124529488e-06, + "loss": 0.0399, + "num_input_tokens_seen": 29159232, + "step": 138170 + }, + { + "epoch": 15.2007700770077, + "grad_norm": 0.006737208925187588, + "learning_rate": 8.269594634544617e-06, + "loss": 0.0568, + "num_input_tokens_seen": 29160224, + "step": 138175 + }, + { + "epoch": 15.201320132013201, + "grad_norm": 1.925087571144104, + "learning_rate": 8.267811298755865e-06, + "loss": 0.0842, + "num_input_tokens_seen": 29161280, + "step": 138180 + }, + { + "epoch": 15.201870187018702, + "grad_norm": 0.015909343957901, + "learning_rate": 8.266028117179673e-06, + "loss": 0.0403, + "num_input_tokens_seen": 29162336, + "step": 138185 + }, + { + "epoch": 15.202420242024202, + "grad_norm": 0.6885631084442139, + "learning_rate": 8.264245089832461e-06, + "loss": 0.0145, + "num_input_tokens_seen": 29163424, + "step": 138190 + }, + { + "epoch": 15.202970297029703, + "grad_norm": 0.1449880599975586, + "learning_rate": 8.262462216730685e-06, + "loss": 0.0317, + "num_input_tokens_seen": 29164512, + "step": 138195 + }, + { + "epoch": 15.203520352035204, + "grad_norm": 0.02244599722325802, + "learning_rate": 8.26067949789075e-06, + "loss": 0.0456, + "num_input_tokens_seen": 29165536, + "step": 138200 + }, + { + "epoch": 15.204070407040705, + "grad_norm": 0.10863005369901657, + "learning_rate": 8.25889693332911e-06, + "loss": 0.1654, + "num_input_tokens_seen": 29166592, + "step": 138205 + }, + { + "epoch": 15.204620462046204, + "grad_norm": 0.024929553270339966, + "learning_rate": 8.257114523062177e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29167616, + "step": 138210 + }, + { + "epoch": 15.205170517051705, + "grad_norm": 0.014814277179539204, + "learning_rate": 8.255332267106383e-06, + "loss": 0.001, + "num_input_tokens_seen": 29168672, + "step": 138215 + }, + { + "epoch": 15.205720572057206, + "grad_norm": 0.010778789408504963, + "learning_rate": 8.253550165478166e-06, + "loss": 0.1518, + "num_input_tokens_seen": 29169728, + "step": 138220 + }, + { + "epoch": 15.206270627062706, + "grad_norm": 0.009981226176023483, + "learning_rate": 8.25176821819394e-06, + "loss": 0.0066, + "num_input_tokens_seen": 29170784, + "step": 138225 + }, + { + "epoch": 15.206820682068207, + "grad_norm": 0.03883765637874603, + "learning_rate": 8.249986425270124e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29171904, + "step": 138230 + }, + { + "epoch": 15.207370737073708, + "grad_norm": 1.0490632057189941, + "learning_rate": 8.248204786723144e-06, + "loss": 0.0106, + "num_input_tokens_seen": 29172896, + "step": 138235 + }, + { + "epoch": 15.207920792079207, + "grad_norm": 0.010294727981090546, + "learning_rate": 8.246423302569423e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29173984, + "step": 138240 + }, + { + "epoch": 15.208470847084708, + "grad_norm": 0.01625189743936062, + "learning_rate": 8.244641972825384e-06, + "loss": 0.0991, + "num_input_tokens_seen": 29175008, + "step": 138245 + }, + { + "epoch": 15.20902090209021, + "grad_norm": 0.47876280546188354, + "learning_rate": 8.242860797507445e-06, + "loss": 0.0057, + "num_input_tokens_seen": 29176064, + "step": 138250 + }, + { + "epoch": 15.209570957095709, + "grad_norm": 0.05586381256580353, + "learning_rate": 8.241079776632008e-06, + "loss": 0.0053, + "num_input_tokens_seen": 29177120, + "step": 138255 + }, + { + "epoch": 15.21012101210121, + "grad_norm": 0.004252316430211067, + "learning_rate": 8.239298910215502e-06, + "loss": 0.0428, + "num_input_tokens_seen": 29178208, + "step": 138260 + }, + { + "epoch": 15.210671067106711, + "grad_norm": 0.22243572771549225, + "learning_rate": 8.23751819827433e-06, + "loss": 0.0136, + "num_input_tokens_seen": 29179264, + "step": 138265 + }, + { + "epoch": 15.211221122112212, + "grad_norm": 0.014630359597504139, + "learning_rate": 8.235737640824908e-06, + "loss": 0.026, + "num_input_tokens_seen": 29180288, + "step": 138270 + }, + { + "epoch": 15.211771177117711, + "grad_norm": 0.03513917699456215, + "learning_rate": 8.233957237883657e-06, + "loss": 0.0033, + "num_input_tokens_seen": 29181344, + "step": 138275 + }, + { + "epoch": 15.212321232123212, + "grad_norm": 0.029122143983840942, + "learning_rate": 8.232176989466968e-06, + "loss": 0.1918, + "num_input_tokens_seen": 29182432, + "step": 138280 + }, + { + "epoch": 15.212871287128714, + "grad_norm": 0.08487056195735931, + "learning_rate": 8.230396895591267e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29183488, + "step": 138285 + }, + { + "epoch": 15.213421342134213, + "grad_norm": 0.056837648153305054, + "learning_rate": 8.228616956272942e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29184512, + "step": 138290 + }, + { + "epoch": 15.213971397139714, + "grad_norm": 0.15905626118183136, + "learning_rate": 8.226837171528407e-06, + "loss": 0.0204, + "num_input_tokens_seen": 29185600, + "step": 138295 + }, + { + "epoch": 15.214521452145215, + "grad_norm": 0.029817108064889908, + "learning_rate": 8.225057541374074e-06, + "loss": 0.059, + "num_input_tokens_seen": 29186656, + "step": 138300 + }, + { + "epoch": 15.215071507150714, + "grad_norm": 0.0026316028088331223, + "learning_rate": 8.223278065826326e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29187712, + "step": 138305 + }, + { + "epoch": 15.215621562156215, + "grad_norm": 0.6801297664642334, + "learning_rate": 8.221498744901584e-06, + "loss": 0.0083, + "num_input_tokens_seen": 29188704, + "step": 138310 + }, + { + "epoch": 15.216171617161717, + "grad_norm": 3.3689677715301514, + "learning_rate": 8.219719578616228e-06, + "loss": 0.3053, + "num_input_tokens_seen": 29189760, + "step": 138315 + }, + { + "epoch": 15.216721672167218, + "grad_norm": 0.02378876507282257, + "learning_rate": 8.217940566986673e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29190784, + "step": 138320 + }, + { + "epoch": 15.217271727172717, + "grad_norm": 0.3544043302536011, + "learning_rate": 8.2161617100293e-06, + "loss": 0.0218, + "num_input_tokens_seen": 29191840, + "step": 138325 + }, + { + "epoch": 15.217821782178218, + "grad_norm": 0.02314455807209015, + "learning_rate": 8.21438300776051e-06, + "loss": 0.0167, + "num_input_tokens_seen": 29192928, + "step": 138330 + }, + { + "epoch": 15.218371837183719, + "grad_norm": 0.013526800088584423, + "learning_rate": 8.212604460196704e-06, + "loss": 0.0071, + "num_input_tokens_seen": 29194016, + "step": 138335 + }, + { + "epoch": 15.218921892189218, + "grad_norm": 0.01919485442340374, + "learning_rate": 8.21082606735426e-06, + "loss": 0.0405, + "num_input_tokens_seen": 29195040, + "step": 138340 + }, + { + "epoch": 15.21947194719472, + "grad_norm": 0.010253243148326874, + "learning_rate": 8.209047829249586e-06, + "loss": 0.0017, + "num_input_tokens_seen": 29196064, + "step": 138345 + }, + { + "epoch": 15.22002200220022, + "grad_norm": 0.028591489419341087, + "learning_rate": 8.207269745899048e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29197056, + "step": 138350 + }, + { + "epoch": 15.22057205720572, + "grad_norm": 0.014591176062822342, + "learning_rate": 8.205491817319052e-06, + "loss": 0.0057, + "num_input_tokens_seen": 29198112, + "step": 138355 + }, + { + "epoch": 15.221122112211221, + "grad_norm": 0.007389707025140524, + "learning_rate": 8.203714043525986e-06, + "loss": 0.0104, + "num_input_tokens_seen": 29199104, + "step": 138360 + }, + { + "epoch": 15.221672167216722, + "grad_norm": 0.11001792550086975, + "learning_rate": 8.201936424536219e-06, + "loss": 0.0025, + "num_input_tokens_seen": 29200192, + "step": 138365 + }, + { + "epoch": 15.222222222222221, + "grad_norm": 1.2123432159423828, + "learning_rate": 8.200158960366152e-06, + "loss": 0.0082, + "num_input_tokens_seen": 29201184, + "step": 138370 + }, + { + "epoch": 15.222772277227723, + "grad_norm": 0.2965910732746124, + "learning_rate": 8.198381651032158e-06, + "loss": 0.0057, + "num_input_tokens_seen": 29202240, + "step": 138375 + }, + { + "epoch": 15.223322332233224, + "grad_norm": 0.1185789629817009, + "learning_rate": 8.196604496550612e-06, + "loss": 0.0036, + "num_input_tokens_seen": 29203328, + "step": 138380 + }, + { + "epoch": 15.223872387238725, + "grad_norm": 0.04526596516370773, + "learning_rate": 8.194827496937896e-06, + "loss": 0.0038, + "num_input_tokens_seen": 29204352, + "step": 138385 + }, + { + "epoch": 15.224422442244224, + "grad_norm": 0.2406357079744339, + "learning_rate": 8.193050652210393e-06, + "loss": 0.0056, + "num_input_tokens_seen": 29205344, + "step": 138390 + }, + { + "epoch": 15.224972497249725, + "grad_norm": 0.06359715014696121, + "learning_rate": 8.191273962384486e-06, + "loss": 0.0062, + "num_input_tokens_seen": 29206400, + "step": 138395 + }, + { + "epoch": 15.225522552255226, + "grad_norm": 0.020408926531672478, + "learning_rate": 8.18949742747654e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29207424, + "step": 138400 + }, + { + "epoch": 15.226072607260726, + "grad_norm": 0.12233363837003708, + "learning_rate": 8.187721047502923e-06, + "loss": 0.0119, + "num_input_tokens_seen": 29208512, + "step": 138405 + }, + { + "epoch": 15.226622662266227, + "grad_norm": 1.6274174451828003, + "learning_rate": 8.185944822480013e-06, + "loss": 0.0445, + "num_input_tokens_seen": 29209504, + "step": 138410 + }, + { + "epoch": 15.227172717271728, + "grad_norm": 0.1825486570596695, + "learning_rate": 8.18416875242419e-06, + "loss": 0.002, + "num_input_tokens_seen": 29210560, + "step": 138415 + }, + { + "epoch": 15.227722772277227, + "grad_norm": 0.05764865130186081, + "learning_rate": 8.182392837351804e-06, + "loss": 0.0064, + "num_input_tokens_seen": 29211616, + "step": 138420 + }, + { + "epoch": 15.228272827282728, + "grad_norm": 0.03686605021357536, + "learning_rate": 8.180617077279245e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29212672, + "step": 138425 + }, + { + "epoch": 15.22882288228823, + "grad_norm": 2.407668352127075, + "learning_rate": 8.178841472222856e-06, + "loss": 0.0446, + "num_input_tokens_seen": 29213760, + "step": 138430 + }, + { + "epoch": 15.229372937293729, + "grad_norm": 0.20264048874378204, + "learning_rate": 8.177066022199026e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29214848, + "step": 138435 + }, + { + "epoch": 15.22992299229923, + "grad_norm": 0.03097439557313919, + "learning_rate": 8.175290727224095e-06, + "loss": 0.051, + "num_input_tokens_seen": 29215840, + "step": 138440 + }, + { + "epoch": 15.23047304730473, + "grad_norm": 2.6421895027160645, + "learning_rate": 8.173515587314438e-06, + "loss": 0.0368, + "num_input_tokens_seen": 29216928, + "step": 138445 + }, + { + "epoch": 15.231023102310232, + "grad_norm": 1.7683827877044678, + "learning_rate": 8.171740602486419e-06, + "loss": 0.0694, + "num_input_tokens_seen": 29217984, + "step": 138450 + }, + { + "epoch": 15.231573157315731, + "grad_norm": 2.8301076889038086, + "learning_rate": 8.169965772756385e-06, + "loss": 0.0897, + "num_input_tokens_seen": 29219072, + "step": 138455 + }, + { + "epoch": 15.232123212321232, + "grad_norm": 0.5054491758346558, + "learning_rate": 8.16819109814071e-06, + "loss": 0.0055, + "num_input_tokens_seen": 29220128, + "step": 138460 + }, + { + "epoch": 15.232673267326733, + "grad_norm": 0.21628186106681824, + "learning_rate": 8.166416578655739e-06, + "loss": 0.0396, + "num_input_tokens_seen": 29221248, + "step": 138465 + }, + { + "epoch": 15.233223322332233, + "grad_norm": 1.221248984336853, + "learning_rate": 8.164642214317814e-06, + "loss": 0.1361, + "num_input_tokens_seen": 29222304, + "step": 138470 + }, + { + "epoch": 15.233773377337734, + "grad_norm": 0.005339917726814747, + "learning_rate": 8.162868005143318e-06, + "loss": 0.0023, + "num_input_tokens_seen": 29223360, + "step": 138475 + }, + { + "epoch": 15.234323432343235, + "grad_norm": 2.713611125946045, + "learning_rate": 8.161093951148582e-06, + "loss": 0.1248, + "num_input_tokens_seen": 29224416, + "step": 138480 + }, + { + "epoch": 15.234873487348734, + "grad_norm": 0.03721780702471733, + "learning_rate": 8.15932005234997e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29225504, + "step": 138485 + }, + { + "epoch": 15.235423542354235, + "grad_norm": 0.01938590407371521, + "learning_rate": 8.157546308763825e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29226656, + "step": 138490 + }, + { + "epoch": 15.235973597359736, + "grad_norm": 0.05000777915120125, + "learning_rate": 8.155772720406484e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29227776, + "step": 138495 + }, + { + "epoch": 15.236523652365236, + "grad_norm": 0.09082164615392685, + "learning_rate": 8.153999287294307e-06, + "loss": 0.0028, + "num_input_tokens_seen": 29228832, + "step": 138500 + }, + { + "epoch": 15.237073707370737, + "grad_norm": 0.15528658032417297, + "learning_rate": 8.15222600944363e-06, + "loss": 0.0064, + "num_input_tokens_seen": 29229920, + "step": 138505 + }, + { + "epoch": 15.237623762376238, + "grad_norm": 0.06365691870450974, + "learning_rate": 8.150452886870812e-06, + "loss": 0.0551, + "num_input_tokens_seen": 29230976, + "step": 138510 + }, + { + "epoch": 15.238173817381739, + "grad_norm": 0.003933160565793514, + "learning_rate": 8.148679919592186e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29231968, + "step": 138515 + }, + { + "epoch": 15.238723872387238, + "grad_norm": 0.009442687965929508, + "learning_rate": 8.146907107624083e-06, + "loss": 0.0702, + "num_input_tokens_seen": 29232960, + "step": 138520 + }, + { + "epoch": 15.23927392739274, + "grad_norm": 0.27778616547584534, + "learning_rate": 8.145134450982858e-06, + "loss": 0.0047, + "num_input_tokens_seen": 29233984, + "step": 138525 + }, + { + "epoch": 15.23982398239824, + "grad_norm": 0.00493256188929081, + "learning_rate": 8.143361949684825e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29235040, + "step": 138530 + }, + { + "epoch": 15.24037403740374, + "grad_norm": 1.3715591430664062, + "learning_rate": 8.14158960374635e-06, + "loss": 0.0096, + "num_input_tokens_seen": 29236096, + "step": 138535 + }, + { + "epoch": 15.24092409240924, + "grad_norm": 0.10656893253326416, + "learning_rate": 8.139817413183757e-06, + "loss": 0.0257, + "num_input_tokens_seen": 29237152, + "step": 138540 + }, + { + "epoch": 15.241474147414742, + "grad_norm": 0.02973003312945366, + "learning_rate": 8.138045378013364e-06, + "loss": 0.0043, + "num_input_tokens_seen": 29238208, + "step": 138545 + }, + { + "epoch": 15.242024202420241, + "grad_norm": 3.2671308517456055, + "learning_rate": 8.136273498251529e-06, + "loss": 0.0778, + "num_input_tokens_seen": 29239264, + "step": 138550 + }, + { + "epoch": 15.242574257425742, + "grad_norm": 0.00386921432800591, + "learning_rate": 8.134501773914557e-06, + "loss": 0.1066, + "num_input_tokens_seen": 29240320, + "step": 138555 + }, + { + "epoch": 15.243124312431243, + "grad_norm": 0.8488212823867798, + "learning_rate": 8.132730205018793e-06, + "loss": 0.023, + "num_input_tokens_seen": 29241408, + "step": 138560 + }, + { + "epoch": 15.243674367436745, + "grad_norm": 0.04663631692528725, + "learning_rate": 8.130958791580567e-06, + "loss": 0.0538, + "num_input_tokens_seen": 29242432, + "step": 138565 + }, + { + "epoch": 15.244224422442244, + "grad_norm": 0.012118509039282799, + "learning_rate": 8.129187533616192e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29243552, + "step": 138570 + }, + { + "epoch": 15.244774477447745, + "grad_norm": 0.011077402159571648, + "learning_rate": 8.127416431142006e-06, + "loss": 0.0099, + "num_input_tokens_seen": 29244608, + "step": 138575 + }, + { + "epoch": 15.245324532453246, + "grad_norm": 0.011441133916378021, + "learning_rate": 8.125645484174321e-06, + "loss": 0.0477, + "num_input_tokens_seen": 29245728, + "step": 138580 + }, + { + "epoch": 15.245874587458745, + "grad_norm": 0.04316839948296547, + "learning_rate": 8.123874692729471e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29246688, + "step": 138585 + }, + { + "epoch": 15.246424642464246, + "grad_norm": 0.02613123506307602, + "learning_rate": 8.122104056823765e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29247712, + "step": 138590 + }, + { + "epoch": 15.246974697469748, + "grad_norm": 0.05968351289629936, + "learning_rate": 8.120333576473523e-06, + "loss": 0.0131, + "num_input_tokens_seen": 29248800, + "step": 138595 + }, + { + "epoch": 15.247524752475247, + "grad_norm": 0.01769699715077877, + "learning_rate": 8.118563251695077e-06, + "loss": 0.0288, + "num_input_tokens_seen": 29249856, + "step": 138600 + }, + { + "epoch": 15.248074807480748, + "grad_norm": 0.002820845227688551, + "learning_rate": 8.116793082504733e-06, + "loss": 0.004, + "num_input_tokens_seen": 29250880, + "step": 138605 + }, + { + "epoch": 15.248624862486249, + "grad_norm": 0.12940341234207153, + "learning_rate": 8.115023068918797e-06, + "loss": 0.0198, + "num_input_tokens_seen": 29251936, + "step": 138610 + }, + { + "epoch": 15.249174917491748, + "grad_norm": 0.03267320618033409, + "learning_rate": 8.113253210953592e-06, + "loss": 0.0065, + "num_input_tokens_seen": 29252960, + "step": 138615 + }, + { + "epoch": 15.24972497249725, + "grad_norm": 0.039836473762989044, + "learning_rate": 8.111483508625428e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29254048, + "step": 138620 + }, + { + "epoch": 15.25027502750275, + "grad_norm": 1.6269431114196777, + "learning_rate": 8.109713961950627e-06, + "loss": 0.0063, + "num_input_tokens_seen": 29255072, + "step": 138625 + }, + { + "epoch": 15.250825082508252, + "grad_norm": 0.5087607502937317, + "learning_rate": 8.107944570945487e-06, + "loss": 0.004, + "num_input_tokens_seen": 29256128, + "step": 138630 + }, + { + "epoch": 15.251375137513751, + "grad_norm": 0.0024625754449516535, + "learning_rate": 8.106175335626304e-06, + "loss": 0.0404, + "num_input_tokens_seen": 29257152, + "step": 138635 + }, + { + "epoch": 15.251925192519252, + "grad_norm": 0.026052061468362808, + "learning_rate": 8.10440625600941e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29258208, + "step": 138640 + }, + { + "epoch": 15.252475247524753, + "grad_norm": 0.19169428944587708, + "learning_rate": 8.102637332111085e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29259264, + "step": 138645 + }, + { + "epoch": 15.253025302530252, + "grad_norm": 0.5125167965888977, + "learning_rate": 8.100868563947645e-06, + "loss": 0.1117, + "num_input_tokens_seen": 29260320, + "step": 138650 + }, + { + "epoch": 15.253575357535754, + "grad_norm": 0.009059274569153786, + "learning_rate": 8.099099951535397e-06, + "loss": 0.009, + "num_input_tokens_seen": 29261440, + "step": 138655 + }, + { + "epoch": 15.254125412541255, + "grad_norm": 0.016462666913866997, + "learning_rate": 8.097331494890629e-06, + "loss": 0.0093, + "num_input_tokens_seen": 29262560, + "step": 138660 + }, + { + "epoch": 15.254675467546754, + "grad_norm": 0.041162122040987015, + "learning_rate": 8.09556319402965e-06, + "loss": 0.0441, + "num_input_tokens_seen": 29263552, + "step": 138665 + }, + { + "epoch": 15.255225522552255, + "grad_norm": 0.0523335225880146, + "learning_rate": 8.09379504896875e-06, + "loss": 0.0022, + "num_input_tokens_seen": 29264640, + "step": 138670 + }, + { + "epoch": 15.255775577557756, + "grad_norm": 0.012977365404367447, + "learning_rate": 8.092027059724227e-06, + "loss": 0.0192, + "num_input_tokens_seen": 29265728, + "step": 138675 + }, + { + "epoch": 15.256325632563255, + "grad_norm": 0.08887704461812973, + "learning_rate": 8.090259226312386e-06, + "loss": 0.0223, + "num_input_tokens_seen": 29266720, + "step": 138680 + }, + { + "epoch": 15.256875687568757, + "grad_norm": 0.022051885724067688, + "learning_rate": 8.088491548749503e-06, + "loss": 0.0689, + "num_input_tokens_seen": 29267776, + "step": 138685 + }, + { + "epoch": 15.257425742574258, + "grad_norm": 0.1455751359462738, + "learning_rate": 8.086724027051887e-06, + "loss": 0.0658, + "num_input_tokens_seen": 29268832, + "step": 138690 + }, + { + "epoch": 15.257975797579759, + "grad_norm": 0.057846784591674805, + "learning_rate": 8.084956661235809e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29269952, + "step": 138695 + }, + { + "epoch": 15.258525852585258, + "grad_norm": 0.1651347130537033, + "learning_rate": 8.08318945131758e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29271008, + "step": 138700 + }, + { + "epoch": 15.25907590759076, + "grad_norm": 0.03306133300065994, + "learning_rate": 8.081422397313467e-06, + "loss": 0.0992, + "num_input_tokens_seen": 29272032, + "step": 138705 + }, + { + "epoch": 15.25962596259626, + "grad_norm": 0.48485004901885986, + "learning_rate": 8.079655499239766e-06, + "loss": 0.0156, + "num_input_tokens_seen": 29273120, + "step": 138710 + }, + { + "epoch": 15.26017601760176, + "grad_norm": 0.006784210912883282, + "learning_rate": 8.07788875711277e-06, + "loss": 0.001, + "num_input_tokens_seen": 29274144, + "step": 138715 + }, + { + "epoch": 15.26072607260726, + "grad_norm": 1.946243166923523, + "learning_rate": 8.076122170948744e-06, + "loss": 0.0171, + "num_input_tokens_seen": 29275232, + "step": 138720 + }, + { + "epoch": 15.261276127612762, + "grad_norm": 0.055115051567554474, + "learning_rate": 8.074355740763986e-06, + "loss": 0.0146, + "num_input_tokens_seen": 29276288, + "step": 138725 + }, + { + "epoch": 15.261826182618261, + "grad_norm": 0.01229211874306202, + "learning_rate": 8.072589466574764e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29277344, + "step": 138730 + }, + { + "epoch": 15.262376237623762, + "grad_norm": 0.040207888931035995, + "learning_rate": 8.07082334839736e-06, + "loss": 0.0026, + "num_input_tokens_seen": 29278400, + "step": 138735 + }, + { + "epoch": 15.262926292629263, + "grad_norm": 0.005667516961693764, + "learning_rate": 8.069057386248064e-06, + "loss": 0.0036, + "num_input_tokens_seen": 29279456, + "step": 138740 + }, + { + "epoch": 15.263476347634764, + "grad_norm": 0.03669988736510277, + "learning_rate": 8.067291580143132e-06, + "loss": 0.0043, + "num_input_tokens_seen": 29280448, + "step": 138745 + }, + { + "epoch": 15.264026402640264, + "grad_norm": 0.06949889659881592, + "learning_rate": 8.065525930098857e-06, + "loss": 0.0337, + "num_input_tokens_seen": 29281440, + "step": 138750 + }, + { + "epoch": 15.264576457645765, + "grad_norm": 2.694118022918701, + "learning_rate": 8.063760436131504e-06, + "loss": 0.0732, + "num_input_tokens_seen": 29282432, + "step": 138755 + }, + { + "epoch": 15.265126512651266, + "grad_norm": 0.00907987356185913, + "learning_rate": 8.061995098257336e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29283488, + "step": 138760 + }, + { + "epoch": 15.265676567656765, + "grad_norm": 0.03115818277001381, + "learning_rate": 8.060229916492631e-06, + "loss": 0.0248, + "num_input_tokens_seen": 29284544, + "step": 138765 + }, + { + "epoch": 15.266226622662266, + "grad_norm": 0.000895751581992954, + "learning_rate": 8.058464890853659e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29285632, + "step": 138770 + }, + { + "epoch": 15.266776677667767, + "grad_norm": 0.03845367580652237, + "learning_rate": 8.056700021356694e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29286688, + "step": 138775 + }, + { + "epoch": 15.267326732673267, + "grad_norm": 0.012577764689922333, + "learning_rate": 8.054935308017997e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29287776, + "step": 138780 + }, + { + "epoch": 15.267876787678768, + "grad_norm": 0.03433440625667572, + "learning_rate": 8.053170750853819e-06, + "loss": 0.0571, + "num_input_tokens_seen": 29288864, + "step": 138785 + }, + { + "epoch": 15.268426842684269, + "grad_norm": 0.014130642637610435, + "learning_rate": 8.051406349880435e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29289984, + "step": 138790 + }, + { + "epoch": 15.268976897689768, + "grad_norm": 0.034532591700553894, + "learning_rate": 8.049642105114117e-06, + "loss": 0.1005, + "num_input_tokens_seen": 29290976, + "step": 138795 + }, + { + "epoch": 15.26952695269527, + "grad_norm": 3.9768033027648926, + "learning_rate": 8.047878016571103e-06, + "loss": 0.0713, + "num_input_tokens_seen": 29292064, + "step": 138800 + }, + { + "epoch": 15.27007700770077, + "grad_norm": 0.016326788812875748, + "learning_rate": 8.046114084267673e-06, + "loss": 0.0005, + "num_input_tokens_seen": 29293184, + "step": 138805 + }, + { + "epoch": 15.270627062706271, + "grad_norm": 0.023631105199456215, + "learning_rate": 8.044350308220064e-06, + "loss": 0.001, + "num_input_tokens_seen": 29294208, + "step": 138810 + }, + { + "epoch": 15.27117711771177, + "grad_norm": 0.006210504099726677, + "learning_rate": 8.04258668844455e-06, + "loss": 0.0224, + "num_input_tokens_seen": 29295264, + "step": 138815 + }, + { + "epoch": 15.271727172717272, + "grad_norm": 2.223480224609375, + "learning_rate": 8.040823224957372e-06, + "loss": 0.0786, + "num_input_tokens_seen": 29296352, + "step": 138820 + }, + { + "epoch": 15.272277227722773, + "grad_norm": 0.010530022904276848, + "learning_rate": 8.039059917774788e-06, + "loss": 0.0065, + "num_input_tokens_seen": 29297440, + "step": 138825 + }, + { + "epoch": 15.272827282728272, + "grad_norm": 0.010193593800067902, + "learning_rate": 8.037296766913057e-06, + "loss": 0.1446, + "num_input_tokens_seen": 29298528, + "step": 138830 + }, + { + "epoch": 15.273377337733773, + "grad_norm": 0.042519327253103256, + "learning_rate": 8.035533772388414e-06, + "loss": 0.0892, + "num_input_tokens_seen": 29299616, + "step": 138835 + }, + { + "epoch": 15.273927392739274, + "grad_norm": 2.4984827041625977, + "learning_rate": 8.033770934217124e-06, + "loss": 0.0248, + "num_input_tokens_seen": 29300704, + "step": 138840 + }, + { + "epoch": 15.274477447744774, + "grad_norm": 0.005123734939843416, + "learning_rate": 8.032008252415419e-06, + "loss": 0.0164, + "num_input_tokens_seen": 29301792, + "step": 138845 + }, + { + "epoch": 15.275027502750275, + "grad_norm": 0.2232099324464798, + "learning_rate": 8.030245726999552e-06, + "loss": 0.0371, + "num_input_tokens_seen": 29302784, + "step": 138850 + }, + { + "epoch": 15.275577557755776, + "grad_norm": 0.24115684628486633, + "learning_rate": 8.028483357985774e-06, + "loss": 0.0257, + "num_input_tokens_seen": 29303904, + "step": 138855 + }, + { + "epoch": 15.276127612761275, + "grad_norm": 2.5000839233398438, + "learning_rate": 8.026721145390315e-06, + "loss": 0.1612, + "num_input_tokens_seen": 29304992, + "step": 138860 + }, + { + "epoch": 15.276677667766776, + "grad_norm": 4.101102352142334, + "learning_rate": 8.024959089229429e-06, + "loss": 0.1929, + "num_input_tokens_seen": 29305984, + "step": 138865 + }, + { + "epoch": 15.277227722772277, + "grad_norm": 1.7246463298797607, + "learning_rate": 8.02319718951935e-06, + "loss": 0.0159, + "num_input_tokens_seen": 29307072, + "step": 138870 + }, + { + "epoch": 15.277777777777779, + "grad_norm": 0.017864150926470757, + "learning_rate": 8.021435446276306e-06, + "loss": 0.0023, + "num_input_tokens_seen": 29308032, + "step": 138875 + }, + { + "epoch": 15.278327832783278, + "grad_norm": 0.03695174679160118, + "learning_rate": 8.01967385951655e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29309056, + "step": 138880 + }, + { + "epoch": 15.278877887788779, + "grad_norm": 0.0043762498535215855, + "learning_rate": 8.017912429256307e-06, + "loss": 0.0542, + "num_input_tokens_seen": 29310144, + "step": 138885 + }, + { + "epoch": 15.27942794279428, + "grad_norm": 0.1368684321641922, + "learning_rate": 8.016151155511826e-06, + "loss": 0.0076, + "num_input_tokens_seen": 29311200, + "step": 138890 + }, + { + "epoch": 15.27997799779978, + "grad_norm": 0.003283256432041526, + "learning_rate": 8.01439003829933e-06, + "loss": 0.0038, + "num_input_tokens_seen": 29312288, + "step": 138895 + }, + { + "epoch": 15.28052805280528, + "grad_norm": 0.017103953287005424, + "learning_rate": 8.012629077635041e-06, + "loss": 0.0111, + "num_input_tokens_seen": 29313312, + "step": 138900 + }, + { + "epoch": 15.281078107810782, + "grad_norm": 0.04685753583908081, + "learning_rate": 8.010868273535201e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29314336, + "step": 138905 + }, + { + "epoch": 15.281628162816281, + "grad_norm": 0.02916833758354187, + "learning_rate": 8.009107626016037e-06, + "loss": 0.0028, + "num_input_tokens_seen": 29315392, + "step": 138910 + }, + { + "epoch": 15.282178217821782, + "grad_norm": 0.020796576514840126, + "learning_rate": 8.007347135093782e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29316448, + "step": 138915 + }, + { + "epoch": 15.282728272827283, + "grad_norm": 1.5212922096252441, + "learning_rate": 8.005586800784654e-06, + "loss": 0.2068, + "num_input_tokens_seen": 29317472, + "step": 138920 + }, + { + "epoch": 15.283278327832782, + "grad_norm": 0.017353424802422523, + "learning_rate": 8.003826623104869e-06, + "loss": 0.016, + "num_input_tokens_seen": 29318496, + "step": 138925 + }, + { + "epoch": 15.283828382838283, + "grad_norm": 0.10909616202116013, + "learning_rate": 8.002066602070668e-06, + "loss": 0.0172, + "num_input_tokens_seen": 29319520, + "step": 138930 + }, + { + "epoch": 15.284378437843785, + "grad_norm": 0.005249450448900461, + "learning_rate": 8.000306737698251e-06, + "loss": 0.0041, + "num_input_tokens_seen": 29320608, + "step": 138935 + }, + { + "epoch": 15.284928492849286, + "grad_norm": 0.0020371470600366592, + "learning_rate": 7.998547030003853e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29321632, + "step": 138940 + }, + { + "epoch": 15.285478547854785, + "grad_norm": 0.07087457925081253, + "learning_rate": 7.996787479003698e-06, + "loss": 0.0312, + "num_input_tokens_seen": 29322656, + "step": 138945 + }, + { + "epoch": 15.286028602860286, + "grad_norm": 0.024174537509679794, + "learning_rate": 7.995028084713981e-06, + "loss": 0.116, + "num_input_tokens_seen": 29323680, + "step": 138950 + }, + { + "epoch": 15.286578657865787, + "grad_norm": 3.790236473083496, + "learning_rate": 7.993268847150945e-06, + "loss": 0.0321, + "num_input_tokens_seen": 29324736, + "step": 138955 + }, + { + "epoch": 15.287128712871286, + "grad_norm": 0.009982836432754993, + "learning_rate": 7.991509766330776e-06, + "loss": 0.0032, + "num_input_tokens_seen": 29325760, + "step": 138960 + }, + { + "epoch": 15.287678767876788, + "grad_norm": 0.01668327860534191, + "learning_rate": 7.989750842269702e-06, + "loss": 0.0025, + "num_input_tokens_seen": 29326784, + "step": 138965 + }, + { + "epoch": 15.288228822882289, + "grad_norm": 2.630990743637085, + "learning_rate": 7.98799207498394e-06, + "loss": 0.0999, + "num_input_tokens_seen": 29327840, + "step": 138970 + }, + { + "epoch": 15.288778877887788, + "grad_norm": 0.007261313498020172, + "learning_rate": 7.986233464489683e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29328864, + "step": 138975 + }, + { + "epoch": 15.289328932893289, + "grad_norm": 0.027402985841035843, + "learning_rate": 7.984475010803158e-06, + "loss": 0.0055, + "num_input_tokens_seen": 29329920, + "step": 138980 + }, + { + "epoch": 15.28987898789879, + "grad_norm": 0.030224032700061798, + "learning_rate": 7.982716713940552e-06, + "loss": 0.0064, + "num_input_tokens_seen": 29331008, + "step": 138985 + }, + { + "epoch": 15.290429042904291, + "grad_norm": 0.06735019385814667, + "learning_rate": 7.98095857391809e-06, + "loss": 0.0277, + "num_input_tokens_seen": 29332128, + "step": 138990 + }, + { + "epoch": 15.29097909790979, + "grad_norm": 0.11168399453163147, + "learning_rate": 7.97920059075196e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29333184, + "step": 138995 + }, + { + "epoch": 15.291529152915292, + "grad_norm": 0.00894887000322342, + "learning_rate": 7.977442764458367e-06, + "loss": 0.0143, + "num_input_tokens_seen": 29334208, + "step": 139000 + }, + { + "epoch": 15.292079207920793, + "grad_norm": 0.006360330618917942, + "learning_rate": 7.975685095053525e-06, + "loss": 0.0033, + "num_input_tokens_seen": 29335200, + "step": 139005 + }, + { + "epoch": 15.292629262926292, + "grad_norm": 0.008939089253544807, + "learning_rate": 7.973927582553625e-06, + "loss": 0.0097, + "num_input_tokens_seen": 29336256, + "step": 139010 + }, + { + "epoch": 15.293179317931793, + "grad_norm": 0.08504778891801834, + "learning_rate": 7.972170226974856e-06, + "loss": 0.0025, + "num_input_tokens_seen": 29337280, + "step": 139015 + }, + { + "epoch": 15.293729372937294, + "grad_norm": 0.19837237894535065, + "learning_rate": 7.970413028333423e-06, + "loss": 0.003, + "num_input_tokens_seen": 29338272, + "step": 139020 + }, + { + "epoch": 15.294279427942794, + "grad_norm": 0.01746867224574089, + "learning_rate": 7.968655986645523e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29339264, + "step": 139025 + }, + { + "epoch": 15.294829482948295, + "grad_norm": 1.2180354595184326, + "learning_rate": 7.966899101927356e-06, + "loss": 0.0945, + "num_input_tokens_seen": 29340320, + "step": 139030 + }, + { + "epoch": 15.295379537953796, + "grad_norm": 0.10419508069753647, + "learning_rate": 7.965142374195106e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29341376, + "step": 139035 + }, + { + "epoch": 15.295929592959295, + "grad_norm": 0.17293986678123474, + "learning_rate": 7.963385803464957e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29342464, + "step": 139040 + }, + { + "epoch": 15.296479647964796, + "grad_norm": 0.06451662629842758, + "learning_rate": 7.961629389753115e-06, + "loss": 0.004, + "num_input_tokens_seen": 29343552, + "step": 139045 + }, + { + "epoch": 15.297029702970297, + "grad_norm": 0.2529256343841553, + "learning_rate": 7.959873133075748e-06, + "loss": 0.0053, + "num_input_tokens_seen": 29344576, + "step": 139050 + }, + { + "epoch": 15.297579757975798, + "grad_norm": 0.398652046918869, + "learning_rate": 7.958117033449056e-06, + "loss": 0.1884, + "num_input_tokens_seen": 29345600, + "step": 139055 + }, + { + "epoch": 15.298129812981298, + "grad_norm": 0.025703594088554382, + "learning_rate": 7.956361090889231e-06, + "loss": 0.0904, + "num_input_tokens_seen": 29346592, + "step": 139060 + }, + { + "epoch": 15.298679867986799, + "grad_norm": 0.0360996313393116, + "learning_rate": 7.954605305412437e-06, + "loss": 0.0261, + "num_input_tokens_seen": 29347680, + "step": 139065 + }, + { + "epoch": 15.2992299229923, + "grad_norm": 3.1073801517486572, + "learning_rate": 7.952849677034877e-06, + "loss": 0.1249, + "num_input_tokens_seen": 29348800, + "step": 139070 + }, + { + "epoch": 15.2997799779978, + "grad_norm": 1.310543179512024, + "learning_rate": 7.95109420577271e-06, + "loss": 0.0474, + "num_input_tokens_seen": 29349824, + "step": 139075 + }, + { + "epoch": 15.3003300330033, + "grad_norm": 0.1206541433930397, + "learning_rate": 7.94933889164213e-06, + "loss": 0.0161, + "num_input_tokens_seen": 29350912, + "step": 139080 + }, + { + "epoch": 15.300880088008801, + "grad_norm": 0.06014484539628029, + "learning_rate": 7.947583734659319e-06, + "loss": 0.001, + "num_input_tokens_seen": 29351936, + "step": 139085 + }, + { + "epoch": 15.3014301430143, + "grad_norm": 0.02596113085746765, + "learning_rate": 7.945828734840438e-06, + "loss": 0.0093, + "num_input_tokens_seen": 29352992, + "step": 139090 + }, + { + "epoch": 15.301980198019802, + "grad_norm": 0.10765431076288223, + "learning_rate": 7.944073892201679e-06, + "loss": 0.0338, + "num_input_tokens_seen": 29354016, + "step": 139095 + }, + { + "epoch": 15.302530253025303, + "grad_norm": 0.02191299758851528, + "learning_rate": 7.942319206759196e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29355072, + "step": 139100 + }, + { + "epoch": 15.303080308030804, + "grad_norm": 0.016152964904904366, + "learning_rate": 7.940564678529183e-06, + "loss": 0.0332, + "num_input_tokens_seen": 29356160, + "step": 139105 + }, + { + "epoch": 15.303630363036303, + "grad_norm": 0.018255695700645447, + "learning_rate": 7.938810307527788e-06, + "loss": 0.0461, + "num_input_tokens_seen": 29357120, + "step": 139110 + }, + { + "epoch": 15.304180418041804, + "grad_norm": 1.1747260093688965, + "learning_rate": 7.937056093771194e-06, + "loss": 0.0171, + "num_input_tokens_seen": 29358144, + "step": 139115 + }, + { + "epoch": 15.304730473047305, + "grad_norm": 0.5637889504432678, + "learning_rate": 7.935302037275574e-06, + "loss": 0.0073, + "num_input_tokens_seen": 29359296, + "step": 139120 + }, + { + "epoch": 15.305280528052805, + "grad_norm": 0.005205994937568903, + "learning_rate": 7.933548138057081e-06, + "loss": 0.0174, + "num_input_tokens_seen": 29360352, + "step": 139125 + }, + { + "epoch": 15.305830583058306, + "grad_norm": 0.01209341175854206, + "learning_rate": 7.931794396131891e-06, + "loss": 0.0831, + "num_input_tokens_seen": 29361344, + "step": 139130 + }, + { + "epoch": 15.306380638063807, + "grad_norm": 0.22156769037246704, + "learning_rate": 7.930040811516154e-06, + "loss": 0.0129, + "num_input_tokens_seen": 29362464, + "step": 139135 + }, + { + "epoch": 15.306930693069306, + "grad_norm": 0.007091546431183815, + "learning_rate": 7.92828738422604e-06, + "loss": 0.0054, + "num_input_tokens_seen": 29363520, + "step": 139140 + }, + { + "epoch": 15.307480748074807, + "grad_norm": 0.005829480476677418, + "learning_rate": 7.926534114277717e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29364544, + "step": 139145 + }, + { + "epoch": 15.308030803080309, + "grad_norm": 0.05128460004925728, + "learning_rate": 7.924781001687329e-06, + "loss": 0.0052, + "num_input_tokens_seen": 29365664, + "step": 139150 + }, + { + "epoch": 15.308580858085808, + "grad_norm": 0.019031085073947906, + "learning_rate": 7.923028046471052e-06, + "loss": 0.028, + "num_input_tokens_seen": 29366752, + "step": 139155 + }, + { + "epoch": 15.309130913091309, + "grad_norm": 0.06409385055303574, + "learning_rate": 7.921275248645027e-06, + "loss": 0.005, + "num_input_tokens_seen": 29367776, + "step": 139160 + }, + { + "epoch": 15.30968096809681, + "grad_norm": 0.008573172613978386, + "learning_rate": 7.919522608225406e-06, + "loss": 0.0025, + "num_input_tokens_seen": 29368864, + "step": 139165 + }, + { + "epoch": 15.310231023102311, + "grad_norm": 0.04815882444381714, + "learning_rate": 7.917770125228346e-06, + "loss": 0.0151, + "num_input_tokens_seen": 29369920, + "step": 139170 + }, + { + "epoch": 15.31078107810781, + "grad_norm": 0.18325266242027283, + "learning_rate": 7.916017799670006e-06, + "loss": 0.0224, + "num_input_tokens_seen": 29370976, + "step": 139175 + }, + { + "epoch": 15.311331133113312, + "grad_norm": 0.01394837535917759, + "learning_rate": 7.91426563156654e-06, + "loss": 0.0142, + "num_input_tokens_seen": 29372000, + "step": 139180 + }, + { + "epoch": 15.311881188118813, + "grad_norm": 0.06809645891189575, + "learning_rate": 7.91251362093409e-06, + "loss": 0.0066, + "num_input_tokens_seen": 29373024, + "step": 139185 + }, + { + "epoch": 15.312431243124312, + "grad_norm": 5.786816596984863, + "learning_rate": 7.91076176778879e-06, + "loss": 0.0819, + "num_input_tokens_seen": 29374016, + "step": 139190 + }, + { + "epoch": 15.312981298129813, + "grad_norm": 0.0871812254190445, + "learning_rate": 7.909010072146803e-06, + "loss": 0.026, + "num_input_tokens_seen": 29375040, + "step": 139195 + }, + { + "epoch": 15.313531353135314, + "grad_norm": 0.030034996569156647, + "learning_rate": 7.907258534024276e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29376096, + "step": 139200 + }, + { + "epoch": 15.314081408140813, + "grad_norm": 0.0057994043454527855, + "learning_rate": 7.905507153437337e-06, + "loss": 0.0423, + "num_input_tokens_seen": 29377152, + "step": 139205 + }, + { + "epoch": 15.314631463146315, + "grad_norm": 0.26569685339927673, + "learning_rate": 7.903755930402144e-06, + "loss": 0.0528, + "num_input_tokens_seen": 29378112, + "step": 139210 + }, + { + "epoch": 15.315181518151816, + "grad_norm": 0.13408203423023224, + "learning_rate": 7.90200486493482e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29379136, + "step": 139215 + }, + { + "epoch": 15.315731573157315, + "grad_norm": 0.05091014876961708, + "learning_rate": 7.90025395705152e-06, + "loss": 0.0407, + "num_input_tokens_seen": 29380160, + "step": 139220 + }, + { + "epoch": 15.316281628162816, + "grad_norm": 1.779828667640686, + "learning_rate": 7.898503206768368e-06, + "loss": 0.0303, + "num_input_tokens_seen": 29381216, + "step": 139225 + }, + { + "epoch": 15.316831683168317, + "grad_norm": 0.022237123921513557, + "learning_rate": 7.896752614101508e-06, + "loss": 0.0025, + "num_input_tokens_seen": 29382240, + "step": 139230 + }, + { + "epoch": 15.317381738173818, + "grad_norm": 0.009133148938417435, + "learning_rate": 7.895002179067077e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29383296, + "step": 139235 + }, + { + "epoch": 15.317931793179318, + "grad_norm": 1.7536076307296753, + "learning_rate": 7.893251901681194e-06, + "loss": 0.0216, + "num_input_tokens_seen": 29384384, + "step": 139240 + }, + { + "epoch": 15.318481848184819, + "grad_norm": 0.01188444159924984, + "learning_rate": 7.891501781960009e-06, + "loss": 0.0076, + "num_input_tokens_seen": 29385472, + "step": 139245 + }, + { + "epoch": 15.31903190319032, + "grad_norm": 0.2249612808227539, + "learning_rate": 7.889751819919644e-06, + "loss": 0.006, + "num_input_tokens_seen": 29386528, + "step": 139250 + }, + { + "epoch": 15.319581958195819, + "grad_norm": 2.3494391441345215, + "learning_rate": 7.88800201557621e-06, + "loss": 0.0564, + "num_input_tokens_seen": 29387584, + "step": 139255 + }, + { + "epoch": 15.32013201320132, + "grad_norm": 0.0316472128033638, + "learning_rate": 7.886252368945866e-06, + "loss": 0.0058, + "num_input_tokens_seen": 29388640, + "step": 139260 + }, + { + "epoch": 15.320682068206821, + "grad_norm": 0.08988583832979202, + "learning_rate": 7.88450288004471e-06, + "loss": 0.0427, + "num_input_tokens_seen": 29389728, + "step": 139265 + }, + { + "epoch": 15.32123212321232, + "grad_norm": 0.03111477941274643, + "learning_rate": 7.882753548888891e-06, + "loss": 0.0339, + "num_input_tokens_seen": 29390848, + "step": 139270 + }, + { + "epoch": 15.321782178217822, + "grad_norm": 0.15147079527378082, + "learning_rate": 7.881004375494516e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29391904, + "step": 139275 + }, + { + "epoch": 15.322332233223323, + "grad_norm": 0.009146510623395443, + "learning_rate": 7.879255359877705e-06, + "loss": 0.0039, + "num_input_tokens_seen": 29392928, + "step": 139280 + }, + { + "epoch": 15.322882288228822, + "grad_norm": 0.07790946960449219, + "learning_rate": 7.877506502054576e-06, + "loss": 0.0058, + "num_input_tokens_seen": 29393952, + "step": 139285 + }, + { + "epoch": 15.323432343234323, + "grad_norm": 0.009599183686077595, + "learning_rate": 7.875757802041258e-06, + "loss": 0.0004, + "num_input_tokens_seen": 29394944, + "step": 139290 + }, + { + "epoch": 15.323982398239824, + "grad_norm": 0.010050593875348568, + "learning_rate": 7.874009259853871e-06, + "loss": 0.014, + "num_input_tokens_seen": 29396000, + "step": 139295 + }, + { + "epoch": 15.324532453245325, + "grad_norm": 0.0391283854842186, + "learning_rate": 7.872260875508523e-06, + "loss": 0.0035, + "num_input_tokens_seen": 29397024, + "step": 139300 + }, + { + "epoch": 15.325082508250825, + "grad_norm": 0.027664905413985252, + "learning_rate": 7.87051264902132e-06, + "loss": 0.0313, + "num_input_tokens_seen": 29398080, + "step": 139305 + }, + { + "epoch": 15.325632563256326, + "grad_norm": 0.006801900919526815, + "learning_rate": 7.868764580408388e-06, + "loss": 0.0285, + "num_input_tokens_seen": 29399168, + "step": 139310 + }, + { + "epoch": 15.326182618261827, + "grad_norm": 0.08827120065689087, + "learning_rate": 7.867016669685817e-06, + "loss": 0.0449, + "num_input_tokens_seen": 29400224, + "step": 139315 + }, + { + "epoch": 15.326732673267326, + "grad_norm": 0.009108652360737324, + "learning_rate": 7.86526891686975e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29401344, + "step": 139320 + }, + { + "epoch": 15.327282728272827, + "grad_norm": 0.07513882964849472, + "learning_rate": 7.863521321976276e-06, + "loss": 0.0125, + "num_input_tokens_seen": 29402432, + "step": 139325 + }, + { + "epoch": 15.327832783278328, + "grad_norm": 0.1226062923669815, + "learning_rate": 7.861773885021495e-06, + "loss": 0.0193, + "num_input_tokens_seen": 29403520, + "step": 139330 + }, + { + "epoch": 15.328382838283828, + "grad_norm": 0.011369423940777779, + "learning_rate": 7.860026606021528e-06, + "loss": 0.0075, + "num_input_tokens_seen": 29404576, + "step": 139335 + }, + { + "epoch": 15.328932893289329, + "grad_norm": 0.20927952229976654, + "learning_rate": 7.858279484992465e-06, + "loss": 0.0055, + "num_input_tokens_seen": 29405664, + "step": 139340 + }, + { + "epoch": 15.32948294829483, + "grad_norm": 0.004686607513576746, + "learning_rate": 7.856532521950414e-06, + "loss": 0.0049, + "num_input_tokens_seen": 29406688, + "step": 139345 + }, + { + "epoch": 15.33003300330033, + "grad_norm": 0.005744342226535082, + "learning_rate": 7.854785716911484e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29407808, + "step": 139350 + }, + { + "epoch": 15.33058305830583, + "grad_norm": 4.770918846130371, + "learning_rate": 7.853039069891757e-06, + "loss": 0.1086, + "num_input_tokens_seen": 29408896, + "step": 139355 + }, + { + "epoch": 15.331133113311331, + "grad_norm": 2.658883810043335, + "learning_rate": 7.851292580907352e-06, + "loss": 0.0739, + "num_input_tokens_seen": 29409952, + "step": 139360 + }, + { + "epoch": 15.331683168316832, + "grad_norm": 0.23086094856262207, + "learning_rate": 7.849546249974347e-06, + "loss": 0.0068, + "num_input_tokens_seen": 29411008, + "step": 139365 + }, + { + "epoch": 15.332233223322332, + "grad_norm": 0.0085914246737957, + "learning_rate": 7.84780007710885e-06, + "loss": 0.0437, + "num_input_tokens_seen": 29412064, + "step": 139370 + }, + { + "epoch": 15.332783278327833, + "grad_norm": 0.037481699138879776, + "learning_rate": 7.846054062326944e-06, + "loss": 0.0542, + "num_input_tokens_seen": 29413120, + "step": 139375 + }, + { + "epoch": 15.333333333333334, + "grad_norm": 0.09298353642225266, + "learning_rate": 7.844308205644723e-06, + "loss": 0.1059, + "num_input_tokens_seen": 29414176, + "step": 139380 + }, + { + "epoch": 15.333883388338833, + "grad_norm": 0.010603204369544983, + "learning_rate": 7.842562507078291e-06, + "loss": 0.0089, + "num_input_tokens_seen": 29415232, + "step": 139385 + }, + { + "epoch": 15.334433443344334, + "grad_norm": 2.3384525775909424, + "learning_rate": 7.840816966643727e-06, + "loss": 0.0868, + "num_input_tokens_seen": 29416256, + "step": 139390 + }, + { + "epoch": 15.334983498349835, + "grad_norm": 0.006313707679510117, + "learning_rate": 7.839071584357114e-06, + "loss": 0.0279, + "num_input_tokens_seen": 29417312, + "step": 139395 + }, + { + "epoch": 15.335533553355335, + "grad_norm": 0.286989688873291, + "learning_rate": 7.83732636023454e-06, + "loss": 0.1084, + "num_input_tokens_seen": 29418400, + "step": 139400 + }, + { + "epoch": 15.336083608360836, + "grad_norm": 0.3652116358280182, + "learning_rate": 7.835581294292093e-06, + "loss": 0.005, + "num_input_tokens_seen": 29419392, + "step": 139405 + }, + { + "epoch": 15.336633663366337, + "grad_norm": 0.03251747414469719, + "learning_rate": 7.833836386545867e-06, + "loss": 0.0792, + "num_input_tokens_seen": 29420384, + "step": 139410 + }, + { + "epoch": 15.337183718371838, + "grad_norm": 0.13651877641677856, + "learning_rate": 7.832091637011934e-06, + "loss": 0.065, + "num_input_tokens_seen": 29421376, + "step": 139415 + }, + { + "epoch": 15.337733773377337, + "grad_norm": 2.301262140274048, + "learning_rate": 7.830347045706368e-06, + "loss": 0.0227, + "num_input_tokens_seen": 29422432, + "step": 139420 + }, + { + "epoch": 15.338283828382838, + "grad_norm": 0.1501273512840271, + "learning_rate": 7.828602612645258e-06, + "loss": 0.0899, + "num_input_tokens_seen": 29423488, + "step": 139425 + }, + { + "epoch": 15.33883388338834, + "grad_norm": 2.7277419567108154, + "learning_rate": 7.826858337844676e-06, + "loss": 0.0627, + "num_input_tokens_seen": 29424544, + "step": 139430 + }, + { + "epoch": 15.339383938393839, + "grad_norm": 0.037229716777801514, + "learning_rate": 7.825114221320696e-06, + "loss": 0.0081, + "num_input_tokens_seen": 29425600, + "step": 139435 + }, + { + "epoch": 15.33993399339934, + "grad_norm": 0.0252192672342062, + "learning_rate": 7.823370263089406e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29426624, + "step": 139440 + }, + { + "epoch": 15.340484048404841, + "grad_norm": 0.5498932600021362, + "learning_rate": 7.821626463166861e-06, + "loss": 0.0042, + "num_input_tokens_seen": 29427648, + "step": 139445 + }, + { + "epoch": 15.34103410341034, + "grad_norm": 0.01884070783853531, + "learning_rate": 7.81988282156915e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29428736, + "step": 139450 + }, + { + "epoch": 15.341584158415841, + "grad_norm": 1.4335697889328003, + "learning_rate": 7.818139338312327e-06, + "loss": 0.0281, + "num_input_tokens_seen": 29429824, + "step": 139455 + }, + { + "epoch": 15.342134213421343, + "grad_norm": 0.004943969659507275, + "learning_rate": 7.816396013412471e-06, + "loss": 0.0035, + "num_input_tokens_seen": 29430848, + "step": 139460 + }, + { + "epoch": 15.342684268426842, + "grad_norm": 0.021985089406371117, + "learning_rate": 7.814652846885654e-06, + "loss": 0.1026, + "num_input_tokens_seen": 29431936, + "step": 139465 + }, + { + "epoch": 15.343234323432343, + "grad_norm": 0.24563679099082947, + "learning_rate": 7.812909838747929e-06, + "loss": 0.0047, + "num_input_tokens_seen": 29433024, + "step": 139470 + }, + { + "epoch": 15.343784378437844, + "grad_norm": 0.2160271257162094, + "learning_rate": 7.811166989015372e-06, + "loss": 0.0045, + "num_input_tokens_seen": 29434080, + "step": 139475 + }, + { + "epoch": 15.344334433443345, + "grad_norm": 0.1663743406534195, + "learning_rate": 7.809424297704035e-06, + "loss": 0.0048, + "num_input_tokens_seen": 29435136, + "step": 139480 + }, + { + "epoch": 15.344884488448844, + "grad_norm": 0.02823888137936592, + "learning_rate": 7.807681764829993e-06, + "loss": 0.0087, + "num_input_tokens_seen": 29436256, + "step": 139485 + }, + { + "epoch": 15.345434543454346, + "grad_norm": 0.9111496210098267, + "learning_rate": 7.805939390409292e-06, + "loss": 0.0049, + "num_input_tokens_seen": 29437280, + "step": 139490 + }, + { + "epoch": 15.345984598459847, + "grad_norm": 1.0220986604690552, + "learning_rate": 7.804197174457995e-06, + "loss": 0.0457, + "num_input_tokens_seen": 29438336, + "step": 139495 + }, + { + "epoch": 15.346534653465346, + "grad_norm": 0.19192767143249512, + "learning_rate": 7.802455116992169e-06, + "loss": 0.0031, + "num_input_tokens_seen": 29439424, + "step": 139500 + }, + { + "epoch": 15.347084708470847, + "grad_norm": 0.011415177024900913, + "learning_rate": 7.800713218027855e-06, + "loss": 0.0375, + "num_input_tokens_seen": 29440512, + "step": 139505 + }, + { + "epoch": 15.347634763476348, + "grad_norm": 0.531868040561676, + "learning_rate": 7.798971477581125e-06, + "loss": 0.0047, + "num_input_tokens_seen": 29441632, + "step": 139510 + }, + { + "epoch": 15.348184818481847, + "grad_norm": 0.08370207995176315, + "learning_rate": 7.797229895668007e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29442688, + "step": 139515 + }, + { + "epoch": 15.348734873487349, + "grad_norm": 0.0468796007335186, + "learning_rate": 7.795488472304571e-06, + "loss": 0.0069, + "num_input_tokens_seen": 29443744, + "step": 139520 + }, + { + "epoch": 15.34928492849285, + "grad_norm": 0.02196335606276989, + "learning_rate": 7.793747207506868e-06, + "loss": 0.001, + "num_input_tokens_seen": 29444736, + "step": 139525 + }, + { + "epoch": 15.34983498349835, + "grad_norm": 0.27047932147979736, + "learning_rate": 7.792006101290933e-06, + "loss": 0.0031, + "num_input_tokens_seen": 29445760, + "step": 139530 + }, + { + "epoch": 15.35038503850385, + "grad_norm": 0.0408170111477375, + "learning_rate": 7.790265153672829e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29446848, + "step": 139535 + }, + { + "epoch": 15.350935093509351, + "grad_norm": 0.01618526317179203, + "learning_rate": 7.788524364668594e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29447872, + "step": 139540 + }, + { + "epoch": 15.351485148514852, + "grad_norm": 0.0310041606426239, + "learning_rate": 7.786783734294261e-06, + "loss": 0.0062, + "num_input_tokens_seen": 29448960, + "step": 139545 + }, + { + "epoch": 15.352035203520352, + "grad_norm": 4.71721076965332, + "learning_rate": 7.785043262565884e-06, + "loss": 0.119, + "num_input_tokens_seen": 29450048, + "step": 139550 + }, + { + "epoch": 15.352585258525853, + "grad_norm": 0.02615361660718918, + "learning_rate": 7.783302949499501e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29451104, + "step": 139555 + }, + { + "epoch": 15.353135313531354, + "grad_norm": 0.08189691603183746, + "learning_rate": 7.781562795111164e-06, + "loss": 0.0137, + "num_input_tokens_seen": 29452128, + "step": 139560 + }, + { + "epoch": 15.353685368536853, + "grad_norm": 0.1729857325553894, + "learning_rate": 7.7798227994169e-06, + "loss": 0.1276, + "num_input_tokens_seen": 29453216, + "step": 139565 + }, + { + "epoch": 15.354235423542354, + "grad_norm": 0.08545149117708206, + "learning_rate": 7.778082962432736e-06, + "loss": 0.0041, + "num_input_tokens_seen": 29454272, + "step": 139570 + }, + { + "epoch": 15.354785478547855, + "grad_norm": 0.017836857587099075, + "learning_rate": 7.77634328417472e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29455296, + "step": 139575 + }, + { + "epoch": 15.355335533553355, + "grad_norm": 0.1750621348619461, + "learning_rate": 7.774603764658891e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29456352, + "step": 139580 + }, + { + "epoch": 15.355885588558856, + "grad_norm": 0.019318966194987297, + "learning_rate": 7.772864403901264e-06, + "loss": 0.0025, + "num_input_tokens_seen": 29457376, + "step": 139585 + }, + { + "epoch": 15.356435643564357, + "grad_norm": 2.9626352787017822, + "learning_rate": 7.77112520191789e-06, + "loss": 0.1043, + "num_input_tokens_seen": 29458368, + "step": 139590 + }, + { + "epoch": 15.356985698569858, + "grad_norm": 0.0201124157756567, + "learning_rate": 7.769386158724776e-06, + "loss": 0.0045, + "num_input_tokens_seen": 29459424, + "step": 139595 + }, + { + "epoch": 15.357535753575357, + "grad_norm": 0.0028075668960809708, + "learning_rate": 7.767647274337971e-06, + "loss": 0.1177, + "num_input_tokens_seen": 29460512, + "step": 139600 + }, + { + "epoch": 15.358085808580858, + "grad_norm": 0.8792227506637573, + "learning_rate": 7.765908548773485e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29461632, + "step": 139605 + }, + { + "epoch": 15.35863586358636, + "grad_norm": 0.011183702386915684, + "learning_rate": 7.76416998204735e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29462720, + "step": 139610 + }, + { + "epoch": 15.359185918591859, + "grad_norm": 4.2128682136535645, + "learning_rate": 7.7624315741756e-06, + "loss": 0.1716, + "num_input_tokens_seen": 29463808, + "step": 139615 + }, + { + "epoch": 15.35973597359736, + "grad_norm": 1.8287789821624756, + "learning_rate": 7.760693325174235e-06, + "loss": 0.0408, + "num_input_tokens_seen": 29464864, + "step": 139620 + }, + { + "epoch": 15.36028602860286, + "grad_norm": 6.752775192260742, + "learning_rate": 7.758955235059295e-06, + "loss": 0.0209, + "num_input_tokens_seen": 29465920, + "step": 139625 + }, + { + "epoch": 15.36083608360836, + "grad_norm": 0.045927491039037704, + "learning_rate": 7.757217303846787e-06, + "loss": 0.0204, + "num_input_tokens_seen": 29467008, + "step": 139630 + }, + { + "epoch": 15.361386138613861, + "grad_norm": 0.36992233991622925, + "learning_rate": 7.75547953155273e-06, + "loss": 0.0335, + "num_input_tokens_seen": 29468032, + "step": 139635 + }, + { + "epoch": 15.361936193619362, + "grad_norm": 0.12919862568378448, + "learning_rate": 7.753741918193155e-06, + "loss": 0.0502, + "num_input_tokens_seen": 29469120, + "step": 139640 + }, + { + "epoch": 15.362486248624862, + "grad_norm": 0.15053075551986694, + "learning_rate": 7.752004463784055e-06, + "loss": 0.0179, + "num_input_tokens_seen": 29470176, + "step": 139645 + }, + { + "epoch": 15.363036303630363, + "grad_norm": 0.07089242339134216, + "learning_rate": 7.750267168341463e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29471232, + "step": 139650 + }, + { + "epoch": 15.363586358635864, + "grad_norm": 0.001931369537487626, + "learning_rate": 7.748530031881379e-06, + "loss": 0.013, + "num_input_tokens_seen": 29472320, + "step": 139655 + }, + { + "epoch": 15.364136413641365, + "grad_norm": 0.006292537320405245, + "learning_rate": 7.74679305441981e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29473472, + "step": 139660 + }, + { + "epoch": 15.364686468646864, + "grad_norm": 0.011925015598535538, + "learning_rate": 7.745056235972767e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29474560, + "step": 139665 + }, + { + "epoch": 15.365236523652365, + "grad_norm": 0.027221133932471275, + "learning_rate": 7.743319576556263e-06, + "loss": 0.0038, + "num_input_tokens_seen": 29475584, + "step": 139670 + }, + { + "epoch": 15.365786578657866, + "grad_norm": 2.643609046936035, + "learning_rate": 7.74158307618631e-06, + "loss": 0.0551, + "num_input_tokens_seen": 29476640, + "step": 139675 + }, + { + "epoch": 15.366336633663366, + "grad_norm": 0.008520432747900486, + "learning_rate": 7.739846734878906e-06, + "loss": 0.0508, + "num_input_tokens_seen": 29477696, + "step": 139680 + }, + { + "epoch": 15.366886688668867, + "grad_norm": 0.0385994017124176, + "learning_rate": 7.73811055265004e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29478720, + "step": 139685 + }, + { + "epoch": 15.367436743674368, + "grad_norm": 0.15661300718784332, + "learning_rate": 7.73637452951573e-06, + "loss": 0.0023, + "num_input_tokens_seen": 29479776, + "step": 139690 + }, + { + "epoch": 15.367986798679867, + "grad_norm": 0.015461285598576069, + "learning_rate": 7.734638665491972e-06, + "loss": 0.012, + "num_input_tokens_seen": 29480832, + "step": 139695 + }, + { + "epoch": 15.368536853685368, + "grad_norm": 0.11808103322982788, + "learning_rate": 7.73290296059477e-06, + "loss": 0.0811, + "num_input_tokens_seen": 29481856, + "step": 139700 + }, + { + "epoch": 15.36908690869087, + "grad_norm": 0.06800787150859833, + "learning_rate": 7.731167414840118e-06, + "loss": 0.0042, + "num_input_tokens_seen": 29482880, + "step": 139705 + }, + { + "epoch": 15.369636963696369, + "grad_norm": 0.031751107424497604, + "learning_rate": 7.729432028244002e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29483872, + "step": 139710 + }, + { + "epoch": 15.37018701870187, + "grad_norm": 0.11189407855272293, + "learning_rate": 7.72769680082243e-06, + "loss": 0.0232, + "num_input_tokens_seen": 29484960, + "step": 139715 + }, + { + "epoch": 15.370737073707371, + "grad_norm": 0.03247413784265518, + "learning_rate": 7.725961732591383e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29485952, + "step": 139720 + }, + { + "epoch": 15.371287128712872, + "grad_norm": 0.07214788347482681, + "learning_rate": 7.724226823566854e-06, + "loss": 0.0884, + "num_input_tokens_seen": 29486944, + "step": 139725 + }, + { + "epoch": 15.371837183718371, + "grad_norm": 0.04443168267607689, + "learning_rate": 7.722492073764848e-06, + "loss": 0.2324, + "num_input_tokens_seen": 29487968, + "step": 139730 + }, + { + "epoch": 15.372387238723872, + "grad_norm": 0.22604219615459442, + "learning_rate": 7.720757483201332e-06, + "loss": 0.0037, + "num_input_tokens_seen": 29488960, + "step": 139735 + }, + { + "epoch": 15.372937293729374, + "grad_norm": 0.02523038350045681, + "learning_rate": 7.719023051892313e-06, + "loss": 0.0053, + "num_input_tokens_seen": 29489984, + "step": 139740 + }, + { + "epoch": 15.373487348734873, + "grad_norm": 0.013956154696643353, + "learning_rate": 7.717288779853757e-06, + "loss": 0.001, + "num_input_tokens_seen": 29491008, + "step": 139745 + }, + { + "epoch": 15.374037403740374, + "grad_norm": 0.03025910072028637, + "learning_rate": 7.715554667101657e-06, + "loss": 0.0882, + "num_input_tokens_seen": 29492032, + "step": 139750 + }, + { + "epoch": 15.374587458745875, + "grad_norm": 2.588029384613037, + "learning_rate": 7.713820713652004e-06, + "loss": 0.163, + "num_input_tokens_seen": 29492992, + "step": 139755 + }, + { + "epoch": 15.375137513751374, + "grad_norm": 0.2734358012676239, + "learning_rate": 7.712086919520762e-06, + "loss": 0.0064, + "num_input_tokens_seen": 29494048, + "step": 139760 + }, + { + "epoch": 15.375687568756875, + "grad_norm": 0.01670621521770954, + "learning_rate": 7.710353284723928e-06, + "loss": 0.0435, + "num_input_tokens_seen": 29495104, + "step": 139765 + }, + { + "epoch": 15.376237623762377, + "grad_norm": 0.05765993520617485, + "learning_rate": 7.70861980927747e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29496128, + "step": 139770 + }, + { + "epoch": 15.376787678767876, + "grad_norm": 0.045747626572847366, + "learning_rate": 7.70688649319736e-06, + "loss": 0.0048, + "num_input_tokens_seen": 29497152, + "step": 139775 + }, + { + "epoch": 15.377337733773377, + "grad_norm": 0.1220720112323761, + "learning_rate": 7.70515333649958e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29498208, + "step": 139780 + }, + { + "epoch": 15.377887788778878, + "grad_norm": 1.7616738080978394, + "learning_rate": 7.703420339200101e-06, + "loss": 0.0777, + "num_input_tokens_seen": 29499296, + "step": 139785 + }, + { + "epoch": 15.37843784378438, + "grad_norm": 0.01168576069176197, + "learning_rate": 7.701687501314905e-06, + "loss": 0.0935, + "num_input_tokens_seen": 29500320, + "step": 139790 + }, + { + "epoch": 15.378987898789878, + "grad_norm": 0.0351286344230175, + "learning_rate": 7.699954822859957e-06, + "loss": 0.0036, + "num_input_tokens_seen": 29501408, + "step": 139795 + }, + { + "epoch": 15.37953795379538, + "grad_norm": 0.02162564918398857, + "learning_rate": 7.698222303851213e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29502464, + "step": 139800 + }, + { + "epoch": 15.38008800880088, + "grad_norm": 0.5396594405174255, + "learning_rate": 7.696489944304652e-06, + "loss": 0.0248, + "num_input_tokens_seen": 29503616, + "step": 139805 + }, + { + "epoch": 15.38063806380638, + "grad_norm": 0.019045215100049973, + "learning_rate": 7.694757744236241e-06, + "loss": 0.0037, + "num_input_tokens_seen": 29504640, + "step": 139810 + }, + { + "epoch": 15.381188118811881, + "grad_norm": 0.06700213998556137, + "learning_rate": 7.693025703661952e-06, + "loss": 0.0259, + "num_input_tokens_seen": 29505696, + "step": 139815 + }, + { + "epoch": 15.381738173817382, + "grad_norm": 0.42476361989974976, + "learning_rate": 7.691293822597742e-06, + "loss": 0.0082, + "num_input_tokens_seen": 29506752, + "step": 139820 + }, + { + "epoch": 15.382288228822881, + "grad_norm": 0.02843630313873291, + "learning_rate": 7.689562101059561e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29507872, + "step": 139825 + }, + { + "epoch": 15.382838283828383, + "grad_norm": 0.012304531410336494, + "learning_rate": 7.68783053906339e-06, + "loss": 0.0075, + "num_input_tokens_seen": 29508896, + "step": 139830 + }, + { + "epoch": 15.383388338833884, + "grad_norm": 0.011919701471924782, + "learning_rate": 7.686099136625169e-06, + "loss": 0.0038, + "num_input_tokens_seen": 29509920, + "step": 139835 + }, + { + "epoch": 15.383938393839385, + "grad_norm": 0.729033887386322, + "learning_rate": 7.684367893760863e-06, + "loss": 0.007, + "num_input_tokens_seen": 29511008, + "step": 139840 + }, + { + "epoch": 15.384488448844884, + "grad_norm": 0.11019812524318695, + "learning_rate": 7.682636810486437e-06, + "loss": 0.0031, + "num_input_tokens_seen": 29512000, + "step": 139845 + }, + { + "epoch": 15.385038503850385, + "grad_norm": 0.12670332193374634, + "learning_rate": 7.680905886817832e-06, + "loss": 0.023, + "num_input_tokens_seen": 29513024, + "step": 139850 + }, + { + "epoch": 15.385588558855886, + "grad_norm": 0.07365767657756805, + "learning_rate": 7.679175122771013e-06, + "loss": 0.0022, + "num_input_tokens_seen": 29514112, + "step": 139855 + }, + { + "epoch": 15.386138613861386, + "grad_norm": 0.07013674080371857, + "learning_rate": 7.677444518361918e-06, + "loss": 0.1624, + "num_input_tokens_seen": 29515200, + "step": 139860 + }, + { + "epoch": 15.386688668866887, + "grad_norm": 0.01586349494755268, + "learning_rate": 7.675714073606504e-06, + "loss": 0.0485, + "num_input_tokens_seen": 29516192, + "step": 139865 + }, + { + "epoch": 15.387238723872388, + "grad_norm": 2.506816864013672, + "learning_rate": 7.673983788520731e-06, + "loss": 0.0365, + "num_input_tokens_seen": 29517344, + "step": 139870 + }, + { + "epoch": 15.387788778877887, + "grad_norm": 0.01242398377507925, + "learning_rate": 7.672253663120526e-06, + "loss": 0.1067, + "num_input_tokens_seen": 29518368, + "step": 139875 + }, + { + "epoch": 15.388338833883388, + "grad_norm": 0.8360460996627808, + "learning_rate": 7.670523697421853e-06, + "loss": 0.0176, + "num_input_tokens_seen": 29519456, + "step": 139880 + }, + { + "epoch": 15.38888888888889, + "grad_norm": 0.056243155151605606, + "learning_rate": 7.668793891440638e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29520576, + "step": 139885 + }, + { + "epoch": 15.389438943894389, + "grad_norm": 0.0033445984590798616, + "learning_rate": 7.667064245192843e-06, + "loss": 0.1028, + "num_input_tokens_seen": 29521632, + "step": 139890 + }, + { + "epoch": 15.38998899889989, + "grad_norm": 0.49846532940864563, + "learning_rate": 7.66533475869439e-06, + "loss": 0.0723, + "num_input_tokens_seen": 29522784, + "step": 139895 + }, + { + "epoch": 15.39053905390539, + "grad_norm": 0.012329096905887127, + "learning_rate": 7.663605431961229e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29523808, + "step": 139900 + }, + { + "epoch": 15.391089108910892, + "grad_norm": 0.03926790878176689, + "learning_rate": 7.661876265009307e-06, + "loss": 0.0296, + "num_input_tokens_seen": 29524864, + "step": 139905 + }, + { + "epoch": 15.391639163916391, + "grad_norm": 0.03628915175795555, + "learning_rate": 7.660147257854544e-06, + "loss": 0.0925, + "num_input_tokens_seen": 29525952, + "step": 139910 + }, + { + "epoch": 15.392189218921892, + "grad_norm": 0.00520628085359931, + "learning_rate": 7.658418410512889e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29527072, + "step": 139915 + }, + { + "epoch": 15.392739273927393, + "grad_norm": 0.06378279626369476, + "learning_rate": 7.65668972300027e-06, + "loss": 0.002, + "num_input_tokens_seen": 29528064, + "step": 139920 + }, + { + "epoch": 15.393289328932893, + "grad_norm": 0.031943872570991516, + "learning_rate": 7.654961195332608e-06, + "loss": 0.0026, + "num_input_tokens_seen": 29529088, + "step": 139925 + }, + { + "epoch": 15.393839383938394, + "grad_norm": 0.011646609753370285, + "learning_rate": 7.653232827525859e-06, + "loss": 0.0119, + "num_input_tokens_seen": 29530176, + "step": 139930 + }, + { + "epoch": 15.394389438943895, + "grad_norm": 0.08202021569013596, + "learning_rate": 7.651504619595931e-06, + "loss": 0.0041, + "num_input_tokens_seen": 29531232, + "step": 139935 + }, + { + "epoch": 15.394939493949394, + "grad_norm": 0.1245594248175621, + "learning_rate": 7.649776571558768e-06, + "loss": 0.0038, + "num_input_tokens_seen": 29532320, + "step": 139940 + }, + { + "epoch": 15.395489548954895, + "grad_norm": 0.05392327904701233, + "learning_rate": 7.64804868343029e-06, + "loss": 0.05, + "num_input_tokens_seen": 29533344, + "step": 139945 + }, + { + "epoch": 15.396039603960396, + "grad_norm": 0.784566342830658, + "learning_rate": 7.646320955226413e-06, + "loss": 0.0332, + "num_input_tokens_seen": 29534336, + "step": 139950 + }, + { + "epoch": 15.396589658965897, + "grad_norm": 0.13724488019943237, + "learning_rate": 7.644593386963067e-06, + "loss": 0.0066, + "num_input_tokens_seen": 29535360, + "step": 139955 + }, + { + "epoch": 15.397139713971397, + "grad_norm": 0.06789206713438034, + "learning_rate": 7.642865978656186e-06, + "loss": 0.012, + "num_input_tokens_seen": 29536480, + "step": 139960 + }, + { + "epoch": 15.397689768976898, + "grad_norm": 0.007886507548391819, + "learning_rate": 7.641138730321671e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29537504, + "step": 139965 + }, + { + "epoch": 15.398239823982399, + "grad_norm": 0.07452560216188431, + "learning_rate": 7.639411641975461e-06, + "loss": 0.0107, + "num_input_tokens_seen": 29538560, + "step": 139970 + }, + { + "epoch": 15.398789878987898, + "grad_norm": 0.01890924945473671, + "learning_rate": 7.637684713633453e-06, + "loss": 0.001, + "num_input_tokens_seen": 29539616, + "step": 139975 + }, + { + "epoch": 15.3993399339934, + "grad_norm": 0.13329769670963287, + "learning_rate": 7.635957945311573e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29540640, + "step": 139980 + }, + { + "epoch": 15.3998899889989, + "grad_norm": 0.20233066380023956, + "learning_rate": 7.634231337025747e-06, + "loss": 0.1095, + "num_input_tokens_seen": 29541664, + "step": 139985 + }, + { + "epoch": 15.4004400440044, + "grad_norm": 0.43281346559524536, + "learning_rate": 7.63250488879187e-06, + "loss": 0.0108, + "num_input_tokens_seen": 29542752, + "step": 139990 + }, + { + "epoch": 15.400990099009901, + "grad_norm": 0.02350027859210968, + "learning_rate": 7.630778600625869e-06, + "loss": 0.0064, + "num_input_tokens_seen": 29543808, + "step": 139995 + }, + { + "epoch": 15.401540154015402, + "grad_norm": 1.569395899772644, + "learning_rate": 7.629052472543638e-06, + "loss": 0.0533, + "num_input_tokens_seen": 29544896, + "step": 140000 + }, + { + "epoch": 15.402090209020901, + "grad_norm": 0.23832058906555176, + "learning_rate": 7.627326504561103e-06, + "loss": 0.0286, + "num_input_tokens_seen": 29545984, + "step": 140005 + }, + { + "epoch": 15.402640264026402, + "grad_norm": 2.434994697570801, + "learning_rate": 7.625600696694155e-06, + "loss": 0.1062, + "num_input_tokens_seen": 29547072, + "step": 140010 + }, + { + "epoch": 15.403190319031903, + "grad_norm": 0.13326872885227203, + "learning_rate": 7.623875048958709e-06, + "loss": 0.1694, + "num_input_tokens_seen": 29548192, + "step": 140015 + }, + { + "epoch": 15.403740374037405, + "grad_norm": 0.037746675312519073, + "learning_rate": 7.6221495613706785e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29549216, + "step": 140020 + }, + { + "epoch": 15.404290429042904, + "grad_norm": 0.07768991589546204, + "learning_rate": 7.620424233945947e-06, + "loss": 0.0017, + "num_input_tokens_seen": 29550208, + "step": 140025 + }, + { + "epoch": 15.404840484048405, + "grad_norm": 0.26732420921325684, + "learning_rate": 7.618699066700433e-06, + "loss": 0.0061, + "num_input_tokens_seen": 29551264, + "step": 140030 + }, + { + "epoch": 15.405390539053906, + "grad_norm": 2.4993772506713867, + "learning_rate": 7.616974059650031e-06, + "loss": 0.08, + "num_input_tokens_seen": 29552352, + "step": 140035 + }, + { + "epoch": 15.405940594059405, + "grad_norm": 0.015076245181262493, + "learning_rate": 7.615249212810621e-06, + "loss": 0.0058, + "num_input_tokens_seen": 29553408, + "step": 140040 + }, + { + "epoch": 15.406490649064907, + "grad_norm": 0.07367545366287231, + "learning_rate": 7.613524526198135e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29554432, + "step": 140045 + }, + { + "epoch": 15.407040704070408, + "grad_norm": 0.018114643171429634, + "learning_rate": 7.611799999828439e-06, + "loss": 0.0403, + "num_input_tokens_seen": 29555456, + "step": 140050 + }, + { + "epoch": 15.407590759075907, + "grad_norm": 0.006190522573888302, + "learning_rate": 7.610075633717451e-06, + "loss": 0.0053, + "num_input_tokens_seen": 29556512, + "step": 140055 + }, + { + "epoch": 15.408140814081408, + "grad_norm": 0.020348913967609406, + "learning_rate": 7.60835142788105e-06, + "loss": 0.0026, + "num_input_tokens_seen": 29557472, + "step": 140060 + }, + { + "epoch": 15.408690869086909, + "grad_norm": 0.013912850059568882, + "learning_rate": 7.606627382335119e-06, + "loss": 0.0756, + "num_input_tokens_seen": 29558496, + "step": 140065 + }, + { + "epoch": 15.409240924092408, + "grad_norm": 0.009770845994353294, + "learning_rate": 7.604903497095558e-06, + "loss": 0.0031, + "num_input_tokens_seen": 29559552, + "step": 140070 + }, + { + "epoch": 15.40979097909791, + "grad_norm": 0.05051115155220032, + "learning_rate": 7.603179772178254e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29560544, + "step": 140075 + }, + { + "epoch": 15.41034103410341, + "grad_norm": 0.01886334829032421, + "learning_rate": 7.601456207599103e-06, + "loss": 0.0032, + "num_input_tokens_seen": 29561568, + "step": 140080 + }, + { + "epoch": 15.410891089108912, + "grad_norm": 0.010093910619616508, + "learning_rate": 7.599732803373979e-06, + "loss": 0.0145, + "num_input_tokens_seen": 29562688, + "step": 140085 + }, + { + "epoch": 15.411441144114411, + "grad_norm": 0.051497358828783035, + "learning_rate": 7.598009559518765e-06, + "loss": 0.001, + "num_input_tokens_seen": 29563712, + "step": 140090 + }, + { + "epoch": 15.411991199119912, + "grad_norm": 0.057287298142910004, + "learning_rate": 7.5962864760493525e-06, + "loss": 0.0076, + "num_input_tokens_seen": 29564736, + "step": 140095 + }, + { + "epoch": 15.412541254125413, + "grad_norm": 0.0940563753247261, + "learning_rate": 7.5945635529815986e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29565824, + "step": 140100 + }, + { + "epoch": 15.413091309130913, + "grad_norm": 0.12407480180263519, + "learning_rate": 7.592840790331418e-06, + "loss": 0.002, + "num_input_tokens_seen": 29566880, + "step": 140105 + }, + { + "epoch": 15.413641364136414, + "grad_norm": 0.08350525796413422, + "learning_rate": 7.591118188114671e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29567872, + "step": 140110 + }, + { + "epoch": 15.414191419141915, + "grad_norm": 0.06479927152395248, + "learning_rate": 7.589395746347228e-06, + "loss": 0.0089, + "num_input_tokens_seen": 29568928, + "step": 140115 + }, + { + "epoch": 15.414741474147414, + "grad_norm": 0.14229175448417664, + "learning_rate": 7.587673465044976e-06, + "loss": 0.0729, + "num_input_tokens_seen": 29570016, + "step": 140120 + }, + { + "epoch": 15.415291529152915, + "grad_norm": 0.12961921095848083, + "learning_rate": 7.585951344223774e-06, + "loss": 0.0066, + "num_input_tokens_seen": 29571072, + "step": 140125 + }, + { + "epoch": 15.415841584158416, + "grad_norm": 0.01708763651549816, + "learning_rate": 7.584229383899502e-06, + "loss": 0.0125, + "num_input_tokens_seen": 29572224, + "step": 140130 + }, + { + "epoch": 15.416391639163916, + "grad_norm": 1.4634652137756348, + "learning_rate": 7.582507584088039e-06, + "loss": 0.0463, + "num_input_tokens_seen": 29573312, + "step": 140135 + }, + { + "epoch": 15.416941694169417, + "grad_norm": 0.04299718886613846, + "learning_rate": 7.580785944805238e-06, + "loss": 0.0084, + "num_input_tokens_seen": 29574368, + "step": 140140 + }, + { + "epoch": 15.417491749174918, + "grad_norm": 0.1332738995552063, + "learning_rate": 7.579064466066985e-06, + "loss": 0.004, + "num_input_tokens_seen": 29575392, + "step": 140145 + }, + { + "epoch": 15.418041804180419, + "grad_norm": 0.024217693135142326, + "learning_rate": 7.577343147889132e-06, + "loss": 0.0164, + "num_input_tokens_seen": 29576384, + "step": 140150 + }, + { + "epoch": 15.418591859185918, + "grad_norm": 0.586630642414093, + "learning_rate": 7.57562199028754e-06, + "loss": 0.0214, + "num_input_tokens_seen": 29577568, + "step": 140155 + }, + { + "epoch": 15.41914191419142, + "grad_norm": 0.3982292413711548, + "learning_rate": 7.573900993278079e-06, + "loss": 0.0399, + "num_input_tokens_seen": 29578560, + "step": 140160 + }, + { + "epoch": 15.41969196919692, + "grad_norm": 0.07930586487054825, + "learning_rate": 7.572180156876613e-06, + "loss": 0.0089, + "num_input_tokens_seen": 29579648, + "step": 140165 + }, + { + "epoch": 15.42024202420242, + "grad_norm": 0.5076232552528381, + "learning_rate": 7.570459481099007e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29580768, + "step": 140170 + }, + { + "epoch": 15.42079207920792, + "grad_norm": 0.12778525054454803, + "learning_rate": 7.568738965961111e-06, + "loss": 0.1329, + "num_input_tokens_seen": 29581888, + "step": 140175 + }, + { + "epoch": 15.421342134213422, + "grad_norm": 0.05630846694111824, + "learning_rate": 7.5670186114787765e-06, + "loss": 0.0371, + "num_input_tokens_seen": 29582912, + "step": 140180 + }, + { + "epoch": 15.421892189218921, + "grad_norm": 0.038127489387989044, + "learning_rate": 7.565298417667865e-06, + "loss": 0.0032, + "num_input_tokens_seen": 29584000, + "step": 140185 + }, + { + "epoch": 15.422442244224422, + "grad_norm": 0.010441082529723644, + "learning_rate": 7.563578384544234e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29585024, + "step": 140190 + }, + { + "epoch": 15.422992299229923, + "grad_norm": 0.18787668645381927, + "learning_rate": 7.56185851212374e-06, + "loss": 0.1073, + "num_input_tokens_seen": 29586112, + "step": 140195 + }, + { + "epoch": 15.423542354235423, + "grad_norm": 1.6330302953720093, + "learning_rate": 7.5601388004222315e-06, + "loss": 0.0591, + "num_input_tokens_seen": 29587296, + "step": 140200 + }, + { + "epoch": 15.424092409240924, + "grad_norm": 0.006731150206178427, + "learning_rate": 7.558419249455545e-06, + "loss": 0.0428, + "num_input_tokens_seen": 29588384, + "step": 140205 + }, + { + "epoch": 15.424642464246425, + "grad_norm": 0.005368067417293787, + "learning_rate": 7.5566998592395464e-06, + "loss": 0.0873, + "num_input_tokens_seen": 29589408, + "step": 140210 + }, + { + "epoch": 15.425192519251926, + "grad_norm": 0.01363902073353529, + "learning_rate": 7.554980629790068e-06, + "loss": 0.0048, + "num_input_tokens_seen": 29590432, + "step": 140215 + }, + { + "epoch": 15.425742574257425, + "grad_norm": 0.02076076902449131, + "learning_rate": 7.553261561122962e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29591488, + "step": 140220 + }, + { + "epoch": 15.426292629262926, + "grad_norm": 0.010332925245165825, + "learning_rate": 7.551542653254082e-06, + "loss": 0.0619, + "num_input_tokens_seen": 29592544, + "step": 140225 + }, + { + "epoch": 15.426842684268427, + "grad_norm": 0.03961048275232315, + "learning_rate": 7.5498239061992495e-06, + "loss": 0.0479, + "num_input_tokens_seen": 29593600, + "step": 140230 + }, + { + "epoch": 15.427392739273927, + "grad_norm": 0.016247473657131195, + "learning_rate": 7.548105319974328e-06, + "loss": 0.037, + "num_input_tokens_seen": 29594624, + "step": 140235 + }, + { + "epoch": 15.427942794279428, + "grad_norm": 2.848410129547119, + "learning_rate": 7.546386894595137e-06, + "loss": 0.1108, + "num_input_tokens_seen": 29595744, + "step": 140240 + }, + { + "epoch": 15.428492849284929, + "grad_norm": 0.06471219658851624, + "learning_rate": 7.5446686300775195e-06, + "loss": 0.0089, + "num_input_tokens_seen": 29596800, + "step": 140245 + }, + { + "epoch": 15.429042904290428, + "grad_norm": 0.0014545666053891182, + "learning_rate": 7.542950526437326e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29597888, + "step": 140250 + }, + { + "epoch": 15.42959295929593, + "grad_norm": 0.006104079075157642, + "learning_rate": 7.541232583690375e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29598944, + "step": 140255 + }, + { + "epoch": 15.43014301430143, + "grad_norm": 0.017891591414809227, + "learning_rate": 7.53951480185251e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29599936, + "step": 140260 + }, + { + "epoch": 15.430693069306932, + "grad_norm": 1.4916139841079712, + "learning_rate": 7.537797180939551e-06, + "loss": 0.0507, + "num_input_tokens_seen": 29601056, + "step": 140265 + }, + { + "epoch": 15.43124312431243, + "grad_norm": 0.04778330773115158, + "learning_rate": 7.536079720967345e-06, + "loss": 0.0076, + "num_input_tokens_seen": 29602112, + "step": 140270 + }, + { + "epoch": 15.431793179317932, + "grad_norm": 0.0027216433081775904, + "learning_rate": 7.534362421951707e-06, + "loss": 0.0403, + "num_input_tokens_seen": 29603232, + "step": 140275 + }, + { + "epoch": 15.432343234323433, + "grad_norm": 0.03262520954012871, + "learning_rate": 7.5326452839084654e-06, + "loss": 0.0334, + "num_input_tokens_seen": 29604288, + "step": 140280 + }, + { + "epoch": 15.432893289328932, + "grad_norm": 0.1892201155424118, + "learning_rate": 7.53092830685346e-06, + "loss": 0.1188, + "num_input_tokens_seen": 29605344, + "step": 140285 + }, + { + "epoch": 15.433443344334433, + "grad_norm": 0.011098486371338367, + "learning_rate": 7.529211490802498e-06, + "loss": 0.0271, + "num_input_tokens_seen": 29606432, + "step": 140290 + }, + { + "epoch": 15.433993399339935, + "grad_norm": 0.05748692899942398, + "learning_rate": 7.52749483577142e-06, + "loss": 0.065, + "num_input_tokens_seen": 29607488, + "step": 140295 + }, + { + "epoch": 15.434543454345434, + "grad_norm": 0.021955644711852074, + "learning_rate": 7.52577834177603e-06, + "loss": 0.0062, + "num_input_tokens_seen": 29608480, + "step": 140300 + }, + { + "epoch": 15.435093509350935, + "grad_norm": 0.011249718256294727, + "learning_rate": 7.524062008832153e-06, + "loss": 0.152, + "num_input_tokens_seen": 29609504, + "step": 140305 + }, + { + "epoch": 15.435643564356436, + "grad_norm": 0.0010625397553667426, + "learning_rate": 7.5223458369556215e-06, + "loss": 0.0008, + "num_input_tokens_seen": 29610624, + "step": 140310 + }, + { + "epoch": 15.436193619361935, + "grad_norm": 0.008625062182545662, + "learning_rate": 7.520629826162234e-06, + "loss": 0.0072, + "num_input_tokens_seen": 29611616, + "step": 140315 + }, + { + "epoch": 15.436743674367436, + "grad_norm": 0.030761633068323135, + "learning_rate": 7.5189139764678205e-06, + "loss": 0.0006, + "num_input_tokens_seen": 29612640, + "step": 140320 + }, + { + "epoch": 15.437293729372938, + "grad_norm": 0.04874119535088539, + "learning_rate": 7.517198287888189e-06, + "loss": 0.1229, + "num_input_tokens_seen": 29613728, + "step": 140325 + }, + { + "epoch": 15.437843784378439, + "grad_norm": 0.024805821478366852, + "learning_rate": 7.515482760439144e-06, + "loss": 0.0036, + "num_input_tokens_seen": 29614784, + "step": 140330 + }, + { + "epoch": 15.438393839383938, + "grad_norm": 0.3133966624736786, + "learning_rate": 7.513767394136504e-06, + "loss": 0.0903, + "num_input_tokens_seen": 29615808, + "step": 140335 + }, + { + "epoch": 15.438943894389439, + "grad_norm": 0.013933948241174221, + "learning_rate": 7.512052188996086e-06, + "loss": 0.0083, + "num_input_tokens_seen": 29616928, + "step": 140340 + }, + { + "epoch": 15.43949394939494, + "grad_norm": 0.08736679702997208, + "learning_rate": 7.510337145033686e-06, + "loss": 0.0064, + "num_input_tokens_seen": 29618080, + "step": 140345 + }, + { + "epoch": 15.44004400440044, + "grad_norm": 0.1770090013742447, + "learning_rate": 7.508622262265122e-06, + "loss": 0.0278, + "num_input_tokens_seen": 29619168, + "step": 140350 + }, + { + "epoch": 15.44059405940594, + "grad_norm": 0.017148153856396675, + "learning_rate": 7.506907540706187e-06, + "loss": 0.1098, + "num_input_tokens_seen": 29620160, + "step": 140355 + }, + { + "epoch": 15.441144114411442, + "grad_norm": 0.42268165946006775, + "learning_rate": 7.505192980372691e-06, + "loss": 0.0302, + "num_input_tokens_seen": 29621184, + "step": 140360 + }, + { + "epoch": 15.441694169416941, + "grad_norm": 0.20894241333007812, + "learning_rate": 7.503478581280443e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29622208, + "step": 140365 + }, + { + "epoch": 15.442244224422442, + "grad_norm": 0.1649462878704071, + "learning_rate": 7.501764343445231e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29623232, + "step": 140370 + }, + { + "epoch": 15.442794279427943, + "grad_norm": 0.07627276331186295, + "learning_rate": 7.5000502668828686e-06, + "loss": 0.1, + "num_input_tokens_seen": 29624320, + "step": 140375 + }, + { + "epoch": 15.443344334433444, + "grad_norm": 0.054389748722314835, + "learning_rate": 7.498336351609136e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29625376, + "step": 140380 + }, + { + "epoch": 15.443894389438944, + "grad_norm": 0.028719257563352585, + "learning_rate": 7.496622597639849e-06, + "loss": 0.0017, + "num_input_tokens_seen": 29626496, + "step": 140385 + }, + { + "epoch": 15.444444444444445, + "grad_norm": 0.16630695760250092, + "learning_rate": 7.4949090049907855e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29627520, + "step": 140390 + }, + { + "epoch": 15.444994499449946, + "grad_norm": 0.09544417262077332, + "learning_rate": 7.4931955736777446e-06, + "loss": 0.0026, + "num_input_tokens_seen": 29628576, + "step": 140395 + }, + { + "epoch": 15.445544554455445, + "grad_norm": 0.04755145311355591, + "learning_rate": 7.491482303716526e-06, + "loss": 0.0116, + "num_input_tokens_seen": 29629632, + "step": 140400 + }, + { + "epoch": 15.446094609460946, + "grad_norm": 0.07899582386016846, + "learning_rate": 7.489769195122906e-06, + "loss": 0.0113, + "num_input_tokens_seen": 29630688, + "step": 140405 + }, + { + "epoch": 15.446644664466447, + "grad_norm": 1.0699166059494019, + "learning_rate": 7.48805624791269e-06, + "loss": 0.0138, + "num_input_tokens_seen": 29631712, + "step": 140410 + }, + { + "epoch": 15.447194719471947, + "grad_norm": 0.018428003415465355, + "learning_rate": 7.48634346210165e-06, + "loss": 0.1024, + "num_input_tokens_seen": 29632768, + "step": 140415 + }, + { + "epoch": 15.447744774477448, + "grad_norm": 0.015714719891548157, + "learning_rate": 7.484630837705575e-06, + "loss": 0.004, + "num_input_tokens_seen": 29633824, + "step": 140420 + }, + { + "epoch": 15.448294829482949, + "grad_norm": 0.69525146484375, + "learning_rate": 7.4829183747402635e-06, + "loss": 0.0028, + "num_input_tokens_seen": 29634848, + "step": 140425 + }, + { + "epoch": 15.448844884488448, + "grad_norm": 0.08654139190912247, + "learning_rate": 7.481206073221478e-06, + "loss": 0.0022, + "num_input_tokens_seen": 29635808, + "step": 140430 + }, + { + "epoch": 15.44939493949395, + "grad_norm": 0.036335498094558716, + "learning_rate": 7.479493933165019e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29636832, + "step": 140435 + }, + { + "epoch": 15.44994499449945, + "grad_norm": 1.013702392578125, + "learning_rate": 7.477781954586657e-06, + "loss": 0.2175, + "num_input_tokens_seen": 29637888, + "step": 140440 + }, + { + "epoch": 15.450495049504951, + "grad_norm": 0.0655202642083168, + "learning_rate": 7.476070137502164e-06, + "loss": 0.0043, + "num_input_tokens_seen": 29638944, + "step": 140445 + }, + { + "epoch": 15.45104510451045, + "grad_norm": 0.023552419617772102, + "learning_rate": 7.474358481927321e-06, + "loss": 0.063, + "num_input_tokens_seen": 29640128, + "step": 140450 + }, + { + "epoch": 15.451595159515952, + "grad_norm": 0.02353043481707573, + "learning_rate": 7.472646987877907e-06, + "loss": 0.1148, + "num_input_tokens_seen": 29641216, + "step": 140455 + }, + { + "epoch": 15.452145214521453, + "grad_norm": 0.3363190293312073, + "learning_rate": 7.470935655369704e-06, + "loss": 0.0043, + "num_input_tokens_seen": 29642304, + "step": 140460 + }, + { + "epoch": 15.452695269526952, + "grad_norm": 0.006269268225878477, + "learning_rate": 7.469224484418477e-06, + "loss": 0.0058, + "num_input_tokens_seen": 29643360, + "step": 140465 + }, + { + "epoch": 15.453245324532453, + "grad_norm": 3.3095977306365967, + "learning_rate": 7.467513475039986e-06, + "loss": 0.2187, + "num_input_tokens_seen": 29644416, + "step": 140470 + }, + { + "epoch": 15.453795379537954, + "grad_norm": 0.03388764709234238, + "learning_rate": 7.465802627250013e-06, + "loss": 0.0066, + "num_input_tokens_seen": 29645440, + "step": 140475 + }, + { + "epoch": 15.454345434543454, + "grad_norm": 0.03499613329768181, + "learning_rate": 7.464091941064322e-06, + "loss": 0.038, + "num_input_tokens_seen": 29646496, + "step": 140480 + }, + { + "epoch": 15.454895489548955, + "grad_norm": 0.04151632636785507, + "learning_rate": 7.4623814164986875e-06, + "loss": 0.0875, + "num_input_tokens_seen": 29647488, + "step": 140485 + }, + { + "epoch": 15.455445544554456, + "grad_norm": 0.008126167580485344, + "learning_rate": 7.4606710535688685e-06, + "loss": 0.0197, + "num_input_tokens_seen": 29648480, + "step": 140490 + }, + { + "epoch": 15.455995599559955, + "grad_norm": 0.020480729639530182, + "learning_rate": 7.458960852290622e-06, + "loss": 0.0229, + "num_input_tokens_seen": 29649536, + "step": 140495 + }, + { + "epoch": 15.456545654565456, + "grad_norm": 0.038212694227695465, + "learning_rate": 7.457250812679722e-06, + "loss": 0.0611, + "num_input_tokens_seen": 29650624, + "step": 140500 + }, + { + "epoch": 15.457095709570957, + "grad_norm": 0.09314867109060287, + "learning_rate": 7.4555409347519176e-06, + "loss": 0.0061, + "num_input_tokens_seen": 29651616, + "step": 140505 + }, + { + "epoch": 15.457645764576458, + "grad_norm": 0.13920682668685913, + "learning_rate": 7.453831218522972e-06, + "loss": 0.0062, + "num_input_tokens_seen": 29652672, + "step": 140510 + }, + { + "epoch": 15.458195819581958, + "grad_norm": 0.00985579751431942, + "learning_rate": 7.452121664008654e-06, + "loss": 0.0035, + "num_input_tokens_seen": 29653696, + "step": 140515 + }, + { + "epoch": 15.458745874587459, + "grad_norm": 0.012405674904584885, + "learning_rate": 7.450412271224702e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29654688, + "step": 140520 + }, + { + "epoch": 15.45929592959296, + "grad_norm": 1.3007588386535645, + "learning_rate": 7.4487030401868855e-06, + "loss": 0.0091, + "num_input_tokens_seen": 29655840, + "step": 140525 + }, + { + "epoch": 15.45984598459846, + "grad_norm": 1.711163878440857, + "learning_rate": 7.446993970910946e-06, + "loss": 0.0187, + "num_input_tokens_seen": 29656896, + "step": 140530 + }, + { + "epoch": 15.46039603960396, + "grad_norm": 0.3955821692943573, + "learning_rate": 7.445285063412638e-06, + "loss": 0.0775, + "num_input_tokens_seen": 29657952, + "step": 140535 + }, + { + "epoch": 15.460946094609461, + "grad_norm": 0.027933163568377495, + "learning_rate": 7.443576317707724e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29659008, + "step": 140540 + }, + { + "epoch": 15.46149614961496, + "grad_norm": 0.019878482446074486, + "learning_rate": 7.441867733811933e-06, + "loss": 0.1014, + "num_input_tokens_seen": 29660032, + "step": 140545 + }, + { + "epoch": 15.462046204620462, + "grad_norm": 0.03727225214242935, + "learning_rate": 7.440159311741029e-06, + "loss": 0.0134, + "num_input_tokens_seen": 29661184, + "step": 140550 + }, + { + "epoch": 15.462596259625963, + "grad_norm": 0.0019400574965402484, + "learning_rate": 7.438451051510753e-06, + "loss": 0.1282, + "num_input_tokens_seen": 29662240, + "step": 140555 + }, + { + "epoch": 15.463146314631462, + "grad_norm": 0.1277930736541748, + "learning_rate": 7.436742953136835e-06, + "loss": 0.0192, + "num_input_tokens_seen": 29663264, + "step": 140560 + }, + { + "epoch": 15.463696369636963, + "grad_norm": 0.0075387791730463505, + "learning_rate": 7.435035016635033e-06, + "loss": 0.0017, + "num_input_tokens_seen": 29664256, + "step": 140565 + }, + { + "epoch": 15.464246424642464, + "grad_norm": 0.013247916474938393, + "learning_rate": 7.433327242021085e-06, + "loss": 0.1132, + "num_input_tokens_seen": 29665344, + "step": 140570 + }, + { + "epoch": 15.464796479647966, + "grad_norm": 0.004971968941390514, + "learning_rate": 7.4316196293107375e-06, + "loss": 0.0101, + "num_input_tokens_seen": 29666368, + "step": 140575 + }, + { + "epoch": 15.465346534653465, + "grad_norm": 0.010346696712076664, + "learning_rate": 7.429912178519721e-06, + "loss": 0.0117, + "num_input_tokens_seen": 29667392, + "step": 140580 + }, + { + "epoch": 15.465896589658966, + "grad_norm": 0.035684552043676376, + "learning_rate": 7.428204889663768e-06, + "loss": 0.0264, + "num_input_tokens_seen": 29668416, + "step": 140585 + }, + { + "epoch": 15.466446644664467, + "grad_norm": 0.06072503328323364, + "learning_rate": 7.426497762758614e-06, + "loss": 0.0229, + "num_input_tokens_seen": 29669472, + "step": 140590 + }, + { + "epoch": 15.466996699669966, + "grad_norm": 0.03866223990917206, + "learning_rate": 7.42479079782e-06, + "loss": 0.089, + "num_input_tokens_seen": 29670528, + "step": 140595 + }, + { + "epoch": 15.467546754675467, + "grad_norm": 2.9809484481811523, + "learning_rate": 7.423083994863664e-06, + "loss": 0.0771, + "num_input_tokens_seen": 29671520, + "step": 140600 + }, + { + "epoch": 15.468096809680969, + "grad_norm": 0.12466597557067871, + "learning_rate": 7.421377353905329e-06, + "loss": 0.004, + "num_input_tokens_seen": 29672608, + "step": 140605 + }, + { + "epoch": 15.468646864686468, + "grad_norm": 0.030977578833699226, + "learning_rate": 7.419670874960716e-06, + "loss": 0.0422, + "num_input_tokens_seen": 29673696, + "step": 140610 + }, + { + "epoch": 15.469196919691969, + "grad_norm": 1.0966600179672241, + "learning_rate": 7.417964558045567e-06, + "loss": 0.0244, + "num_input_tokens_seen": 29674688, + "step": 140615 + }, + { + "epoch": 15.46974697469747, + "grad_norm": 0.054666515439748764, + "learning_rate": 7.416258403175594e-06, + "loss": 0.0299, + "num_input_tokens_seen": 29675808, + "step": 140620 + }, + { + "epoch": 15.47029702970297, + "grad_norm": 0.026833051815629005, + "learning_rate": 7.414552410366532e-06, + "loss": 0.0349, + "num_input_tokens_seen": 29676896, + "step": 140625 + }, + { + "epoch": 15.47084708470847, + "grad_norm": 0.026492079719901085, + "learning_rate": 7.4128465796341084e-06, + "loss": 0.0042, + "num_input_tokens_seen": 29677952, + "step": 140630 + }, + { + "epoch": 15.471397139713972, + "grad_norm": 0.039089567959308624, + "learning_rate": 7.411140910994033e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29678976, + "step": 140635 + }, + { + "epoch": 15.471947194719473, + "grad_norm": 0.009602395817637444, + "learning_rate": 7.409435404462037e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29680000, + "step": 140640 + }, + { + "epoch": 15.472497249724972, + "grad_norm": 0.0054933493956923485, + "learning_rate": 7.407730060053828e-06, + "loss": 0.0591, + "num_input_tokens_seen": 29681120, + "step": 140645 + }, + { + "epoch": 15.473047304730473, + "grad_norm": 0.0913873016834259, + "learning_rate": 7.406024877785128e-06, + "loss": 0.004, + "num_input_tokens_seen": 29682208, + "step": 140650 + }, + { + "epoch": 15.473597359735974, + "grad_norm": 0.4044114053249359, + "learning_rate": 7.404319857671662e-06, + "loss": 0.0059, + "num_input_tokens_seen": 29683232, + "step": 140655 + }, + { + "epoch": 15.474147414741473, + "grad_norm": 0.5234854221343994, + "learning_rate": 7.402614999729129e-06, + "loss": 0.0846, + "num_input_tokens_seen": 29684320, + "step": 140660 + }, + { + "epoch": 15.474697469746975, + "grad_norm": 0.0205170139670372, + "learning_rate": 7.40091030397326e-06, + "loss": 0.0705, + "num_input_tokens_seen": 29685344, + "step": 140665 + }, + { + "epoch": 15.475247524752476, + "grad_norm": 0.05173468589782715, + "learning_rate": 7.399205770419745e-06, + "loss": 0.0057, + "num_input_tokens_seen": 29686336, + "step": 140670 + }, + { + "epoch": 15.475797579757975, + "grad_norm": 0.0075773810967803, + "learning_rate": 7.397501399084314e-06, + "loss": 0.012, + "num_input_tokens_seen": 29687424, + "step": 140675 + }, + { + "epoch": 15.476347634763476, + "grad_norm": 0.01208034809678793, + "learning_rate": 7.395797189982656e-06, + "loss": 0.095, + "num_input_tokens_seen": 29688416, + "step": 140680 + }, + { + "epoch": 15.476897689768977, + "grad_norm": 0.060982249677181244, + "learning_rate": 7.39409314313049e-06, + "loss": 0.013, + "num_input_tokens_seen": 29689440, + "step": 140685 + }, + { + "epoch": 15.477447744774478, + "grad_norm": 0.006776586640626192, + "learning_rate": 7.392389258543528e-06, + "loss": 0.0315, + "num_input_tokens_seen": 29690432, + "step": 140690 + }, + { + "epoch": 15.477997799779978, + "grad_norm": 3.5558667182922363, + "learning_rate": 7.3906855362374575e-06, + "loss": 0.0426, + "num_input_tokens_seen": 29691456, + "step": 140695 + }, + { + "epoch": 15.478547854785479, + "grad_norm": 0.07267355918884277, + "learning_rate": 7.388981976227993e-06, + "loss": 0.0102, + "num_input_tokens_seen": 29692544, + "step": 140700 + }, + { + "epoch": 15.47909790979098, + "grad_norm": 0.12938424944877625, + "learning_rate": 7.387278578530834e-06, + "loss": 0.0041, + "num_input_tokens_seen": 29693536, + "step": 140705 + }, + { + "epoch": 15.479647964796479, + "grad_norm": 0.041732776910066605, + "learning_rate": 7.3855753431616606e-06, + "loss": 0.0052, + "num_input_tokens_seen": 29694560, + "step": 140710 + }, + { + "epoch": 15.48019801980198, + "grad_norm": 0.013010811991989613, + "learning_rate": 7.383872270136205e-06, + "loss": 0.0089, + "num_input_tokens_seen": 29695520, + "step": 140715 + }, + { + "epoch": 15.480748074807481, + "grad_norm": 1.5668537616729736, + "learning_rate": 7.382169359470134e-06, + "loss": 0.046, + "num_input_tokens_seen": 29696512, + "step": 140720 + }, + { + "epoch": 15.48129812981298, + "grad_norm": 0.015085524879395962, + "learning_rate": 7.380466611179165e-06, + "loss": 0.0032, + "num_input_tokens_seen": 29697504, + "step": 140725 + }, + { + "epoch": 15.481848184818482, + "grad_norm": 0.004691129084676504, + "learning_rate": 7.378764025278978e-06, + "loss": 0.0156, + "num_input_tokens_seen": 29698528, + "step": 140730 + }, + { + "epoch": 15.482398239823983, + "grad_norm": 0.1578415334224701, + "learning_rate": 7.377061601785262e-06, + "loss": 0.0422, + "num_input_tokens_seen": 29699584, + "step": 140735 + }, + { + "epoch": 15.482948294829482, + "grad_norm": 1.9787864685058594, + "learning_rate": 7.375359340713711e-06, + "loss": 0.114, + "num_input_tokens_seen": 29700640, + "step": 140740 + }, + { + "epoch": 15.483498349834983, + "grad_norm": 0.0383271686732769, + "learning_rate": 7.373657242080023e-06, + "loss": 0.0052, + "num_input_tokens_seen": 29701696, + "step": 140745 + }, + { + "epoch": 15.484048404840484, + "grad_norm": 0.027604855597019196, + "learning_rate": 7.3719553058998716e-06, + "loss": 0.1336, + "num_input_tokens_seen": 29702720, + "step": 140750 + }, + { + "epoch": 15.484598459845985, + "grad_norm": 0.01969798281788826, + "learning_rate": 7.370253532188959e-06, + "loss": 0.1965, + "num_input_tokens_seen": 29703776, + "step": 140755 + }, + { + "epoch": 15.485148514851485, + "grad_norm": 0.3663683235645294, + "learning_rate": 7.368551920962951e-06, + "loss": 0.0748, + "num_input_tokens_seen": 29704800, + "step": 140760 + }, + { + "epoch": 15.485698569856986, + "grad_norm": 0.25257787108421326, + "learning_rate": 7.366850472237538e-06, + "loss": 0.0809, + "num_input_tokens_seen": 29705984, + "step": 140765 + }, + { + "epoch": 15.486248624862487, + "grad_norm": 0.01616215519607067, + "learning_rate": 7.365149186028414e-06, + "loss": 0.0134, + "num_input_tokens_seen": 29707072, + "step": 140770 + }, + { + "epoch": 15.486798679867986, + "grad_norm": 0.07925250381231308, + "learning_rate": 7.3634480623512355e-06, + "loss": 0.0036, + "num_input_tokens_seen": 29708096, + "step": 140775 + }, + { + "epoch": 15.487348734873487, + "grad_norm": 0.022873925045132637, + "learning_rate": 7.361747101221706e-06, + "loss": 0.0153, + "num_input_tokens_seen": 29709120, + "step": 140780 + }, + { + "epoch": 15.487898789878988, + "grad_norm": 0.046577345579862595, + "learning_rate": 7.360046302655479e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29710208, + "step": 140785 + }, + { + "epoch": 15.488448844884488, + "grad_norm": 1.2452163696289062, + "learning_rate": 7.358345666668249e-06, + "loss": 0.008, + "num_input_tokens_seen": 29711328, + "step": 140790 + }, + { + "epoch": 15.488998899889989, + "grad_norm": 0.104517862200737, + "learning_rate": 7.3566451932756744e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29712352, + "step": 140795 + }, + { + "epoch": 15.48954895489549, + "grad_norm": 0.0975007563829422, + "learning_rate": 7.354944882493436e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29713440, + "step": 140800 + }, + { + "epoch": 15.490099009900991, + "grad_norm": 0.2602461874485016, + "learning_rate": 7.353244734337209e-06, + "loss": 0.0061, + "num_input_tokens_seen": 29714496, + "step": 140805 + }, + { + "epoch": 15.49064906490649, + "grad_norm": 0.14877189695835114, + "learning_rate": 7.351544748822653e-06, + "loss": 0.093, + "num_input_tokens_seen": 29715584, + "step": 140810 + }, + { + "epoch": 15.491199119911991, + "grad_norm": 0.08469627052545547, + "learning_rate": 7.349844925965446e-06, + "loss": 0.0022, + "num_input_tokens_seen": 29716736, + "step": 140815 + }, + { + "epoch": 15.491749174917492, + "grad_norm": 0.07229779660701752, + "learning_rate": 7.3481452657812475e-06, + "loss": 0.0482, + "num_input_tokens_seen": 29717792, + "step": 140820 + }, + { + "epoch": 15.492299229922992, + "grad_norm": 0.6760910749435425, + "learning_rate": 7.3464457682857115e-06, + "loss": 0.0086, + "num_input_tokens_seen": 29718880, + "step": 140825 + }, + { + "epoch": 15.492849284928493, + "grad_norm": 0.033745840191841125, + "learning_rate": 7.344746433494529e-06, + "loss": 0.073, + "num_input_tokens_seen": 29719904, + "step": 140830 + }, + { + "epoch": 15.493399339933994, + "grad_norm": 0.008159126155078411, + "learning_rate": 7.343047261423336e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29720928, + "step": 140835 + }, + { + "epoch": 15.493949394939493, + "grad_norm": 0.012285241857171059, + "learning_rate": 7.341348252087815e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29722016, + "step": 140840 + }, + { + "epoch": 15.494499449944994, + "grad_norm": 1.3286607265472412, + "learning_rate": 7.339649405503612e-06, + "loss": 0.0479, + "num_input_tokens_seen": 29723040, + "step": 140845 + }, + { + "epoch": 15.495049504950495, + "grad_norm": 1.5590537786483765, + "learning_rate": 7.337950721686379e-06, + "loss": 0.0461, + "num_input_tokens_seen": 29724160, + "step": 140850 + }, + { + "epoch": 15.495599559955995, + "grad_norm": 0.011391882784664631, + "learning_rate": 7.336252200651777e-06, + "loss": 0.0324, + "num_input_tokens_seen": 29725248, + "step": 140855 + }, + { + "epoch": 15.496149614961496, + "grad_norm": 0.05554600805044174, + "learning_rate": 7.334553842415465e-06, + "loss": 0.0047, + "num_input_tokens_seen": 29726368, + "step": 140860 + }, + { + "epoch": 15.496699669966997, + "grad_norm": 0.23959247767925262, + "learning_rate": 7.3328556469931e-06, + "loss": 0.0106, + "num_input_tokens_seen": 29727392, + "step": 140865 + }, + { + "epoch": 15.497249724972498, + "grad_norm": 0.017392516136169434, + "learning_rate": 7.331157614400331e-06, + "loss": 0.003, + "num_input_tokens_seen": 29728480, + "step": 140870 + }, + { + "epoch": 15.497799779977997, + "grad_norm": 0.30625632405281067, + "learning_rate": 7.329459744652792e-06, + "loss": 0.0672, + "num_input_tokens_seen": 29729536, + "step": 140875 + }, + { + "epoch": 15.498349834983498, + "grad_norm": 0.014384463429450989, + "learning_rate": 7.32776203776615e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29730560, + "step": 140880 + }, + { + "epoch": 15.498899889989, + "grad_norm": 0.1229434534907341, + "learning_rate": 7.326064493756033e-06, + "loss": 0.043, + "num_input_tokens_seen": 29731648, + "step": 140885 + }, + { + "epoch": 15.499449944994499, + "grad_norm": 0.12711955606937408, + "learning_rate": 7.324367112638114e-06, + "loss": 0.0204, + "num_input_tokens_seen": 29732704, + "step": 140890 + }, + { + "epoch": 15.5, + "grad_norm": 0.009965198114514351, + "learning_rate": 7.322669894428019e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29733792, + "step": 140895 + }, + { + "epoch": 15.500550055005501, + "grad_norm": 1.0408769845962524, + "learning_rate": 7.320972839141388e-06, + "loss": 0.0033, + "num_input_tokens_seen": 29734784, + "step": 140900 + }, + { + "epoch": 15.501100110011, + "grad_norm": 0.0033096775878220797, + "learning_rate": 7.319275946793874e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29735808, + "step": 140905 + }, + { + "epoch": 15.501650165016502, + "grad_norm": 0.112865149974823, + "learning_rate": 7.317579217401102e-06, + "loss": 0.0031, + "num_input_tokens_seen": 29736832, + "step": 140910 + }, + { + "epoch": 15.502200220022003, + "grad_norm": 0.042992882430553436, + "learning_rate": 7.315882650978717e-06, + "loss": 0.0078, + "num_input_tokens_seen": 29737824, + "step": 140915 + }, + { + "epoch": 15.502750275027502, + "grad_norm": 0.03635763004422188, + "learning_rate": 7.314186247542365e-06, + "loss": 0.0162, + "num_input_tokens_seen": 29738912, + "step": 140920 + }, + { + "epoch": 15.503300330033003, + "grad_norm": 0.04002704471349716, + "learning_rate": 7.312490007107664e-06, + "loss": 0.0901, + "num_input_tokens_seen": 29740000, + "step": 140925 + }, + { + "epoch": 15.503850385038504, + "grad_norm": 0.004842523951083422, + "learning_rate": 7.310793929690263e-06, + "loss": 0.0032, + "num_input_tokens_seen": 29741024, + "step": 140930 + }, + { + "epoch": 15.504400440044005, + "grad_norm": 0.03780589997768402, + "learning_rate": 7.309098015305785e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29742080, + "step": 140935 + }, + { + "epoch": 15.504950495049505, + "grad_norm": 0.04268059879541397, + "learning_rate": 7.3074022639698535e-06, + "loss": 0.0722, + "num_input_tokens_seen": 29743168, + "step": 140940 + }, + { + "epoch": 15.505500550055006, + "grad_norm": 0.05220386013388634, + "learning_rate": 7.305706675698107e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29744192, + "step": 140945 + }, + { + "epoch": 15.506050605060507, + "grad_norm": 0.02190142683684826, + "learning_rate": 7.304011250506171e-06, + "loss": 0.0815, + "num_input_tokens_seen": 29745312, + "step": 140950 + }, + { + "epoch": 15.506600660066006, + "grad_norm": 0.1613195836544037, + "learning_rate": 7.3023159884096805e-06, + "loss": 0.0189, + "num_input_tokens_seen": 29746336, + "step": 140955 + }, + { + "epoch": 15.507150715071507, + "grad_norm": 0.047887131571769714, + "learning_rate": 7.300620889424251e-06, + "loss": 0.0031, + "num_input_tokens_seen": 29747424, + "step": 140960 + }, + { + "epoch": 15.507700770077008, + "grad_norm": 0.01696953922510147, + "learning_rate": 7.298925953565497e-06, + "loss": 0.0597, + "num_input_tokens_seen": 29748448, + "step": 140965 + }, + { + "epoch": 15.508250825082508, + "grad_norm": 3.0552093982696533, + "learning_rate": 7.297231180849048e-06, + "loss": 0.1092, + "num_input_tokens_seen": 29749536, + "step": 140970 + }, + { + "epoch": 15.508800880088009, + "grad_norm": 0.02093457616865635, + "learning_rate": 7.295536571290526e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29750624, + "step": 140975 + }, + { + "epoch": 15.50935093509351, + "grad_norm": 0.010571066290140152, + "learning_rate": 7.2938421249055564e-06, + "loss": 0.056, + "num_input_tokens_seen": 29751648, + "step": 140980 + }, + { + "epoch": 15.509900990099009, + "grad_norm": 0.06438903510570526, + "learning_rate": 7.292147841709749e-06, + "loss": 0.002, + "num_input_tokens_seen": 29752672, + "step": 140985 + }, + { + "epoch": 15.51045104510451, + "grad_norm": 0.06370101869106293, + "learning_rate": 7.290453721718707e-06, + "loss": 0.0038, + "num_input_tokens_seen": 29753760, + "step": 140990 + }, + { + "epoch": 15.511001100110011, + "grad_norm": 0.06729791313409805, + "learning_rate": 7.288759764948063e-06, + "loss": 0.0987, + "num_input_tokens_seen": 29754784, + "step": 140995 + }, + { + "epoch": 15.511551155115512, + "grad_norm": 0.7348319888114929, + "learning_rate": 7.287065971413415e-06, + "loss": 0.0052, + "num_input_tokens_seen": 29755840, + "step": 141000 + }, + { + "epoch": 15.512101210121012, + "grad_norm": 0.012727025896310806, + "learning_rate": 7.285372341130381e-06, + "loss": 0.0035, + "num_input_tokens_seen": 29756896, + "step": 141005 + }, + { + "epoch": 15.512651265126513, + "grad_norm": 0.12851470708847046, + "learning_rate": 7.283678874114577e-06, + "loss": 0.0054, + "num_input_tokens_seen": 29757984, + "step": 141010 + }, + { + "epoch": 15.513201320132014, + "grad_norm": 0.08054864406585693, + "learning_rate": 7.281985570381597e-06, + "loss": 0.0353, + "num_input_tokens_seen": 29759040, + "step": 141015 + }, + { + "epoch": 15.513751375137513, + "grad_norm": 0.21135994791984558, + "learning_rate": 7.280292429947061e-06, + "loss": 0.0401, + "num_input_tokens_seen": 29760064, + "step": 141020 + }, + { + "epoch": 15.514301430143014, + "grad_norm": 0.4340543746948242, + "learning_rate": 7.278599452826557e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29761120, + "step": 141025 + }, + { + "epoch": 15.514851485148515, + "grad_norm": 0.007653451059013605, + "learning_rate": 7.2769066390357e-06, + "loss": 0.0403, + "num_input_tokens_seen": 29762240, + "step": 141030 + }, + { + "epoch": 15.515401540154015, + "grad_norm": 2.7132842540740967, + "learning_rate": 7.275213988590099e-06, + "loss": 0.0925, + "num_input_tokens_seen": 29763296, + "step": 141035 + }, + { + "epoch": 15.515951595159516, + "grad_norm": 0.2035355567932129, + "learning_rate": 7.273521501505337e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29764320, + "step": 141040 + }, + { + "epoch": 15.516501650165017, + "grad_norm": 0.0346921943128109, + "learning_rate": 7.271829177797029e-06, + "loss": 0.1392, + "num_input_tokens_seen": 29765408, + "step": 141045 + }, + { + "epoch": 15.517051705170516, + "grad_norm": 0.03605706989765167, + "learning_rate": 7.270137017480757e-06, + "loss": 0.0052, + "num_input_tokens_seen": 29766432, + "step": 141050 + }, + { + "epoch": 15.517601760176017, + "grad_norm": 0.15771044790744781, + "learning_rate": 7.268445020572132e-06, + "loss": 0.0118, + "num_input_tokens_seen": 29767456, + "step": 141055 + }, + { + "epoch": 15.518151815181518, + "grad_norm": 0.1668325960636139, + "learning_rate": 7.266753187086733e-06, + "loss": 0.004, + "num_input_tokens_seen": 29768544, + "step": 141060 + }, + { + "epoch": 15.51870187018702, + "grad_norm": 0.02731410413980484, + "learning_rate": 7.265061517040161e-06, + "loss": 0.0077, + "num_input_tokens_seen": 29769600, + "step": 141065 + }, + { + "epoch": 15.519251925192519, + "grad_norm": 1.2288931608200073, + "learning_rate": 7.263370010448014e-06, + "loss": 0.0579, + "num_input_tokens_seen": 29770688, + "step": 141070 + }, + { + "epoch": 15.51980198019802, + "grad_norm": 0.05091341957449913, + "learning_rate": 7.261678667325866e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29771712, + "step": 141075 + }, + { + "epoch": 15.520352035203521, + "grad_norm": 4.358949184417725, + "learning_rate": 7.259987487689324e-06, + "loss": 0.0307, + "num_input_tokens_seen": 29772704, + "step": 141080 + }, + { + "epoch": 15.52090209020902, + "grad_norm": 0.02707599848508835, + "learning_rate": 7.258296471553958e-06, + "loss": 0.0262, + "num_input_tokens_seen": 29773760, + "step": 141085 + }, + { + "epoch": 15.521452145214521, + "grad_norm": 0.02242876961827278, + "learning_rate": 7.256605618935358e-06, + "loss": 0.0106, + "num_input_tokens_seen": 29774784, + "step": 141090 + }, + { + "epoch": 15.522002200220022, + "grad_norm": 0.26294180750846863, + "learning_rate": 7.254914929849118e-06, + "loss": 0.0146, + "num_input_tokens_seen": 29775872, + "step": 141095 + }, + { + "epoch": 15.522552255225522, + "grad_norm": 0.0228380486369133, + "learning_rate": 7.253224404310805e-06, + "loss": 0.1106, + "num_input_tokens_seen": 29776896, + "step": 141100 + }, + { + "epoch": 15.523102310231023, + "grad_norm": 0.026061272248625755, + "learning_rate": 7.251534042336014e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29777984, + "step": 141105 + }, + { + "epoch": 15.523652365236524, + "grad_norm": 0.0400393083691597, + "learning_rate": 7.249843843940315e-06, + "loss": 0.0749, + "num_input_tokens_seen": 29779040, + "step": 141110 + }, + { + "epoch": 15.524202420242025, + "grad_norm": 0.832104504108429, + "learning_rate": 7.248153809139285e-06, + "loss": 0.0075, + "num_input_tokens_seen": 29780096, + "step": 141115 + }, + { + "epoch": 15.524752475247524, + "grad_norm": 0.05204956978559494, + "learning_rate": 7.2464639379485e-06, + "loss": 0.1048, + "num_input_tokens_seen": 29781248, + "step": 141120 + }, + { + "epoch": 15.525302530253025, + "grad_norm": 0.014161533676087856, + "learning_rate": 7.244774230383547e-06, + "loss": 0.1009, + "num_input_tokens_seen": 29782272, + "step": 141125 + }, + { + "epoch": 15.525852585258527, + "grad_norm": 0.035935595631599426, + "learning_rate": 7.24308468645998e-06, + "loss": 0.087, + "num_input_tokens_seen": 29783232, + "step": 141130 + }, + { + "epoch": 15.526402640264026, + "grad_norm": 0.8310142159461975, + "learning_rate": 7.241395306193391e-06, + "loss": 0.0573, + "num_input_tokens_seen": 29784320, + "step": 141135 + }, + { + "epoch": 15.526952695269527, + "grad_norm": 1.4929425716400146, + "learning_rate": 7.2397060895993324e-06, + "loss": 0.0394, + "num_input_tokens_seen": 29785408, + "step": 141140 + }, + { + "epoch": 15.527502750275028, + "grad_norm": 0.030647605657577515, + "learning_rate": 7.238017036693379e-06, + "loss": 0.0063, + "num_input_tokens_seen": 29786496, + "step": 141145 + }, + { + "epoch": 15.528052805280527, + "grad_norm": 0.02569678984582424, + "learning_rate": 7.236328147491109e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29787520, + "step": 141150 + }, + { + "epoch": 15.528602860286028, + "grad_norm": 0.009129982441663742, + "learning_rate": 7.234639422008069e-06, + "loss": 0.1623, + "num_input_tokens_seen": 29788608, + "step": 141155 + }, + { + "epoch": 15.52915291529153, + "grad_norm": 0.7953326106071472, + "learning_rate": 7.2329508602598435e-06, + "loss": 0.1275, + "num_input_tokens_seen": 29789664, + "step": 141160 + }, + { + "epoch": 15.52970297029703, + "grad_norm": 0.7321408987045288, + "learning_rate": 7.231262462261973e-06, + "loss": 0.0663, + "num_input_tokens_seen": 29790720, + "step": 141165 + }, + { + "epoch": 15.53025302530253, + "grad_norm": 0.17222382128238678, + "learning_rate": 7.2295742280300395e-06, + "loss": 0.0676, + "num_input_tokens_seen": 29791776, + "step": 141170 + }, + { + "epoch": 15.530803080308031, + "grad_norm": 0.037354517728090286, + "learning_rate": 7.227886157579586e-06, + "loss": 0.1094, + "num_input_tokens_seen": 29792832, + "step": 141175 + }, + { + "epoch": 15.531353135313532, + "grad_norm": 0.15086892247200012, + "learning_rate": 7.226198250926178e-06, + "loss": 0.0026, + "num_input_tokens_seen": 29793888, + "step": 141180 + }, + { + "epoch": 15.531903190319031, + "grad_norm": 0.16300863027572632, + "learning_rate": 7.2245105080853805e-06, + "loss": 0.0478, + "num_input_tokens_seen": 29794912, + "step": 141185 + }, + { + "epoch": 15.532453245324533, + "grad_norm": 0.2835819125175476, + "learning_rate": 7.2228229290727315e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29796000, + "step": 141190 + }, + { + "epoch": 15.533003300330034, + "grad_norm": 0.02309015393257141, + "learning_rate": 7.221135513903798e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29796992, + "step": 141195 + }, + { + "epoch": 15.533553355335533, + "grad_norm": 0.4628717005252838, + "learning_rate": 7.219448262594125e-06, + "loss": 0.004, + "num_input_tokens_seen": 29798144, + "step": 141200 + }, + { + "epoch": 15.534103410341034, + "grad_norm": 0.04342246428132057, + "learning_rate": 7.2177611751592626e-06, + "loss": 0.0112, + "num_input_tokens_seen": 29799168, + "step": 141205 + }, + { + "epoch": 15.534653465346535, + "grad_norm": 0.05483236908912659, + "learning_rate": 7.216074251614768e-06, + "loss": 0.0578, + "num_input_tokens_seen": 29800160, + "step": 141210 + }, + { + "epoch": 15.535203520352034, + "grad_norm": 0.008094590157270432, + "learning_rate": 7.21438749197618e-06, + "loss": 0.0054, + "num_input_tokens_seen": 29801216, + "step": 141215 + }, + { + "epoch": 15.535753575357536, + "grad_norm": 0.745389997959137, + "learning_rate": 7.2127008962590534e-06, + "loss": 0.0499, + "num_input_tokens_seen": 29802240, + "step": 141220 + }, + { + "epoch": 15.536303630363037, + "grad_norm": 2.7393434047698975, + "learning_rate": 7.211014464478929e-06, + "loss": 0.1259, + "num_input_tokens_seen": 29803264, + "step": 141225 + }, + { + "epoch": 15.536853685368538, + "grad_norm": 0.06084306538105011, + "learning_rate": 7.209328196651341e-06, + "loss": 0.055, + "num_input_tokens_seen": 29804352, + "step": 141230 + }, + { + "epoch": 15.537403740374037, + "grad_norm": 0.08362036943435669, + "learning_rate": 7.207642092791836e-06, + "loss": 0.0535, + "num_input_tokens_seen": 29805408, + "step": 141235 + }, + { + "epoch": 15.537953795379538, + "grad_norm": 0.17101982235908508, + "learning_rate": 7.205956152915957e-06, + "loss": 0.0573, + "num_input_tokens_seen": 29806432, + "step": 141240 + }, + { + "epoch": 15.53850385038504, + "grad_norm": 0.018322182819247246, + "learning_rate": 7.204270377039249e-06, + "loss": 0.0114, + "num_input_tokens_seen": 29807488, + "step": 141245 + }, + { + "epoch": 15.539053905390539, + "grad_norm": 0.6561571359634399, + "learning_rate": 7.2025847651772425e-06, + "loss": 0.0624, + "num_input_tokens_seen": 29808544, + "step": 141250 + }, + { + "epoch": 15.53960396039604, + "grad_norm": 0.015270252712070942, + "learning_rate": 7.2008993173454665e-06, + "loss": 0.0135, + "num_input_tokens_seen": 29809696, + "step": 141255 + }, + { + "epoch": 15.54015401540154, + "grad_norm": 3.8864009380340576, + "learning_rate": 7.199214033559457e-06, + "loss": 0.0978, + "num_input_tokens_seen": 29810688, + "step": 141260 + }, + { + "epoch": 15.54070407040704, + "grad_norm": 0.05365203320980072, + "learning_rate": 7.197528913834753e-06, + "loss": 0.0278, + "num_input_tokens_seen": 29811680, + "step": 141265 + }, + { + "epoch": 15.541254125412541, + "grad_norm": 0.032998986542224884, + "learning_rate": 7.195843958186888e-06, + "loss": 0.0017, + "num_input_tokens_seen": 29812768, + "step": 141270 + }, + { + "epoch": 15.541804180418042, + "grad_norm": 0.0832347497344017, + "learning_rate": 7.194159166631387e-06, + "loss": 0.006, + "num_input_tokens_seen": 29813792, + "step": 141275 + }, + { + "epoch": 15.542354235423542, + "grad_norm": 0.009584825485944748, + "learning_rate": 7.192474539183769e-06, + "loss": 0.0043, + "num_input_tokens_seen": 29814816, + "step": 141280 + }, + { + "epoch": 15.542904290429043, + "grad_norm": 1.6389315128326416, + "learning_rate": 7.190790075859577e-06, + "loss": 0.035, + "num_input_tokens_seen": 29815904, + "step": 141285 + }, + { + "epoch": 15.543454345434544, + "grad_norm": 0.21631139516830444, + "learning_rate": 7.189105776674318e-06, + "loss": 0.0143, + "num_input_tokens_seen": 29816928, + "step": 141290 + }, + { + "epoch": 15.544004400440045, + "grad_norm": 0.008924854919314384, + "learning_rate": 7.187421641643524e-06, + "loss": 0.0372, + "num_input_tokens_seen": 29817888, + "step": 141295 + }, + { + "epoch": 15.544554455445544, + "grad_norm": 3.4062767028808594, + "learning_rate": 7.185737670782727e-06, + "loss": 0.1231, + "num_input_tokens_seen": 29818912, + "step": 141300 + }, + { + "epoch": 15.545104510451045, + "grad_norm": 0.016241351142525673, + "learning_rate": 7.184053864107429e-06, + "loss": 0.1522, + "num_input_tokens_seen": 29819936, + "step": 141305 + }, + { + "epoch": 15.545654565456546, + "grad_norm": 0.154079869389534, + "learning_rate": 7.182370221633164e-06, + "loss": 0.0042, + "num_input_tokens_seen": 29820992, + "step": 141310 + }, + { + "epoch": 15.546204620462046, + "grad_norm": 0.08649606257677078, + "learning_rate": 7.180686743375439e-06, + "loss": 0.0035, + "num_input_tokens_seen": 29822016, + "step": 141315 + }, + { + "epoch": 15.546754675467547, + "grad_norm": 0.05633179470896721, + "learning_rate": 7.17900342934977e-06, + "loss": 0.0048, + "num_input_tokens_seen": 29823072, + "step": 141320 + }, + { + "epoch": 15.547304730473048, + "grad_norm": 0.046243004500865936, + "learning_rate": 7.177320279571684e-06, + "loss": 0.0453, + "num_input_tokens_seen": 29824128, + "step": 141325 + }, + { + "epoch": 15.547854785478547, + "grad_norm": 0.019967522472143173, + "learning_rate": 7.175637294056678e-06, + "loss": 0.0082, + "num_input_tokens_seen": 29825248, + "step": 141330 + }, + { + "epoch": 15.548404840484048, + "grad_norm": 0.016713282093405724, + "learning_rate": 7.173954472820277e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29826336, + "step": 141335 + }, + { + "epoch": 15.54895489548955, + "grad_norm": 0.05536985024809837, + "learning_rate": 7.172271815877984e-06, + "loss": 0.0456, + "num_input_tokens_seen": 29827360, + "step": 141340 + }, + { + "epoch": 15.549504950495049, + "grad_norm": 0.5715088844299316, + "learning_rate": 7.1705893232453e-06, + "loss": 0.1119, + "num_input_tokens_seen": 29828448, + "step": 141345 + }, + { + "epoch": 15.55005500550055, + "grad_norm": 0.05380436033010483, + "learning_rate": 7.168906994937738e-06, + "loss": 0.1123, + "num_input_tokens_seen": 29829536, + "step": 141350 + }, + { + "epoch": 15.55060506050605, + "grad_norm": 0.059495747089385986, + "learning_rate": 7.167224830970806e-06, + "loss": 0.0676, + "num_input_tokens_seen": 29830592, + "step": 141355 + }, + { + "epoch": 15.551155115511552, + "grad_norm": 0.04604689031839371, + "learning_rate": 7.165542831360011e-06, + "loss": 0.02, + "num_input_tokens_seen": 29831680, + "step": 141360 + }, + { + "epoch": 15.551705170517051, + "grad_norm": 2.0047929286956787, + "learning_rate": 7.16386099612085e-06, + "loss": 0.0416, + "num_input_tokens_seen": 29832704, + "step": 141365 + }, + { + "epoch": 15.552255225522552, + "grad_norm": 1.0853286981582642, + "learning_rate": 7.162179325268817e-06, + "loss": 0.0297, + "num_input_tokens_seen": 29833760, + "step": 141370 + }, + { + "epoch": 15.552805280528053, + "grad_norm": 1.5435370206832886, + "learning_rate": 7.160497818819417e-06, + "loss": 0.0485, + "num_input_tokens_seen": 29834848, + "step": 141375 + }, + { + "epoch": 15.553355335533553, + "grad_norm": 0.03282105550169945, + "learning_rate": 7.15881647678815e-06, + "loss": 0.0082, + "num_input_tokens_seen": 29835904, + "step": 141380 + }, + { + "epoch": 15.553905390539054, + "grad_norm": 2.119762897491455, + "learning_rate": 7.157135299190515e-06, + "loss": 0.0424, + "num_input_tokens_seen": 29836960, + "step": 141385 + }, + { + "epoch": 15.554455445544555, + "grad_norm": 0.028411291539669037, + "learning_rate": 7.155454286042004e-06, + "loss": 0.0051, + "num_input_tokens_seen": 29838016, + "step": 141390 + }, + { + "epoch": 15.555005500550054, + "grad_norm": 0.06081686541438103, + "learning_rate": 7.153773437358102e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29839072, + "step": 141395 + }, + { + "epoch": 15.555555555555555, + "grad_norm": 0.6781903505325317, + "learning_rate": 7.152092753154313e-06, + "loss": 0.005, + "num_input_tokens_seen": 29840160, + "step": 141400 + }, + { + "epoch": 15.556105610561056, + "grad_norm": 0.039991993457078934, + "learning_rate": 7.150412233446114e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29841216, + "step": 141405 + }, + { + "epoch": 15.556655665566556, + "grad_norm": 0.04351881891489029, + "learning_rate": 7.1487318782489985e-06, + "loss": 0.0116, + "num_input_tokens_seen": 29842176, + "step": 141410 + }, + { + "epoch": 15.557205720572057, + "grad_norm": 0.0521058663725853, + "learning_rate": 7.147051687578463e-06, + "loss": 0.0706, + "num_input_tokens_seen": 29843328, + "step": 141415 + }, + { + "epoch": 15.557755775577558, + "grad_norm": 0.23827718198299408, + "learning_rate": 7.1453716614499784e-06, + "loss": 0.01, + "num_input_tokens_seen": 29844320, + "step": 141420 + }, + { + "epoch": 15.558305830583059, + "grad_norm": 0.010768407955765724, + "learning_rate": 7.143691799879046e-06, + "loss": 0.0369, + "num_input_tokens_seen": 29845408, + "step": 141425 + }, + { + "epoch": 15.558855885588558, + "grad_norm": 0.011610979214310646, + "learning_rate": 7.1420121028811265e-06, + "loss": 0.0687, + "num_input_tokens_seen": 29846496, + "step": 141430 + }, + { + "epoch": 15.55940594059406, + "grad_norm": 0.09747228771448135, + "learning_rate": 7.140332570471714e-06, + "loss": 0.0072, + "num_input_tokens_seen": 29847520, + "step": 141435 + }, + { + "epoch": 15.55995599559956, + "grad_norm": 0.02230432629585266, + "learning_rate": 7.138653202666296e-06, + "loss": 0.1887, + "num_input_tokens_seen": 29848512, + "step": 141440 + }, + { + "epoch": 15.56050605060506, + "grad_norm": 0.06146679446101189, + "learning_rate": 7.1369739994803285e-06, + "loss": 0.0039, + "num_input_tokens_seen": 29849536, + "step": 141445 + }, + { + "epoch": 15.561056105610561, + "grad_norm": 0.1809920370578766, + "learning_rate": 7.135294960929312e-06, + "loss": 0.0108, + "num_input_tokens_seen": 29850592, + "step": 141450 + }, + { + "epoch": 15.561606160616062, + "grad_norm": 0.03203883022069931, + "learning_rate": 7.133616087028699e-06, + "loss": 0.0297, + "num_input_tokens_seen": 29851648, + "step": 141455 + }, + { + "epoch": 15.562156215621561, + "grad_norm": 0.1033792793750763, + "learning_rate": 7.131937377793985e-06, + "loss": 0.0481, + "num_input_tokens_seen": 29852640, + "step": 141460 + }, + { + "epoch": 15.562706270627062, + "grad_norm": 0.07450395822525024, + "learning_rate": 7.1302588332406186e-06, + "loss": 0.0021, + "num_input_tokens_seen": 29853728, + "step": 141465 + }, + { + "epoch": 15.563256325632564, + "grad_norm": 0.11842917650938034, + "learning_rate": 7.128580453384082e-06, + "loss": 0.0561, + "num_input_tokens_seen": 29854816, + "step": 141470 + }, + { + "epoch": 15.563806380638063, + "grad_norm": 0.09370240569114685, + "learning_rate": 7.126902238239852e-06, + "loss": 0.0063, + "num_input_tokens_seen": 29855904, + "step": 141475 + }, + { + "epoch": 15.564356435643564, + "grad_norm": 2.850400686264038, + "learning_rate": 7.125224187823382e-06, + "loss": 0.0487, + "num_input_tokens_seen": 29856960, + "step": 141480 + }, + { + "epoch": 15.564906490649065, + "grad_norm": 0.01996968872845173, + "learning_rate": 7.123546302150149e-06, + "loss": 0.0039, + "num_input_tokens_seen": 29857984, + "step": 141485 + }, + { + "epoch": 15.565456545654566, + "grad_norm": 0.11959251016378403, + "learning_rate": 7.121868581235613e-06, + "loss": 0.0826, + "num_input_tokens_seen": 29859008, + "step": 141490 + }, + { + "epoch": 15.566006600660065, + "grad_norm": 0.06136389821767807, + "learning_rate": 7.120191025095218e-06, + "loss": 0.0055, + "num_input_tokens_seen": 29860032, + "step": 141495 + }, + { + "epoch": 15.566556655665567, + "grad_norm": 0.13797292113304138, + "learning_rate": 7.118513633744461e-06, + "loss": 0.0244, + "num_input_tokens_seen": 29861088, + "step": 141500 + }, + { + "epoch": 15.567106710671068, + "grad_norm": 0.030552009120583534, + "learning_rate": 7.116836407198782e-06, + "loss": 0.0548, + "num_input_tokens_seen": 29862176, + "step": 141505 + }, + { + "epoch": 15.567656765676567, + "grad_norm": 2.9029781818389893, + "learning_rate": 7.115159345473635e-06, + "loss": 0.0849, + "num_input_tokens_seen": 29863296, + "step": 141510 + }, + { + "epoch": 15.568206820682068, + "grad_norm": 0.5291649103164673, + "learning_rate": 7.113482448584488e-06, + "loss": 0.006, + "num_input_tokens_seen": 29864288, + "step": 141515 + }, + { + "epoch": 15.56875687568757, + "grad_norm": 0.01975608617067337, + "learning_rate": 7.111805716546782e-06, + "loss": 0.1693, + "num_input_tokens_seen": 29865440, + "step": 141520 + }, + { + "epoch": 15.569306930693068, + "grad_norm": 0.04356217384338379, + "learning_rate": 7.11012914937598e-06, + "loss": 0.0176, + "num_input_tokens_seen": 29866464, + "step": 141525 + }, + { + "epoch": 15.56985698569857, + "grad_norm": 0.10080962628126144, + "learning_rate": 7.1084527470875404e-06, + "loss": 0.0041, + "num_input_tokens_seen": 29867520, + "step": 141530 + }, + { + "epoch": 15.57040704070407, + "grad_norm": 0.045566946268081665, + "learning_rate": 7.106776509696903e-06, + "loss": 0.007, + "num_input_tokens_seen": 29868608, + "step": 141535 + }, + { + "epoch": 15.570957095709572, + "grad_norm": 0.2821434736251831, + "learning_rate": 7.105100437219523e-06, + "loss": 0.0105, + "num_input_tokens_seen": 29869632, + "step": 141540 + }, + { + "epoch": 15.571507150715071, + "grad_norm": 0.01888458803296089, + "learning_rate": 7.103424529670841e-06, + "loss": 0.0663, + "num_input_tokens_seen": 29870624, + "step": 141545 + }, + { + "epoch": 15.572057205720572, + "grad_norm": 0.20778384804725647, + "learning_rate": 7.101748787066303e-06, + "loss": 0.002, + "num_input_tokens_seen": 29871712, + "step": 141550 + }, + { + "epoch": 15.572607260726073, + "grad_norm": 0.5027577877044678, + "learning_rate": 7.100073209421371e-06, + "loss": 0.0055, + "num_input_tokens_seen": 29872768, + "step": 141555 + }, + { + "epoch": 15.573157315731573, + "grad_norm": 0.9870631694793701, + "learning_rate": 7.098397796751466e-06, + "loss": 0.0275, + "num_input_tokens_seen": 29873824, + "step": 141560 + }, + { + "epoch": 15.573707370737074, + "grad_norm": 0.03615141287446022, + "learning_rate": 7.096722549072043e-06, + "loss": 0.0505, + "num_input_tokens_seen": 29874880, + "step": 141565 + }, + { + "epoch": 15.574257425742575, + "grad_norm": 0.08285288512706757, + "learning_rate": 7.095047466398533e-06, + "loss": 0.0131, + "num_input_tokens_seen": 29875904, + "step": 141570 + }, + { + "epoch": 15.574807480748074, + "grad_norm": 0.028147777542471886, + "learning_rate": 7.093372548746386e-06, + "loss": 0.0032, + "num_input_tokens_seen": 29877024, + "step": 141575 + }, + { + "epoch": 15.575357535753575, + "grad_norm": 0.10008464008569717, + "learning_rate": 7.091697796131025e-06, + "loss": 0.0013, + "num_input_tokens_seen": 29878048, + "step": 141580 + }, + { + "epoch": 15.575907590759076, + "grad_norm": 0.08449429273605347, + "learning_rate": 7.090023208567889e-06, + "loss": 0.0055, + "num_input_tokens_seen": 29879072, + "step": 141585 + }, + { + "epoch": 15.576457645764577, + "grad_norm": 0.17101643979549408, + "learning_rate": 7.088348786072424e-06, + "loss": 0.0943, + "num_input_tokens_seen": 29880160, + "step": 141590 + }, + { + "epoch": 15.577007700770077, + "grad_norm": 0.023680973798036575, + "learning_rate": 7.086674528660048e-06, + "loss": 0.0221, + "num_input_tokens_seen": 29881216, + "step": 141595 + }, + { + "epoch": 15.577557755775578, + "grad_norm": 0.5114891529083252, + "learning_rate": 7.085000436346204e-06, + "loss": 0.0143, + "num_input_tokens_seen": 29882208, + "step": 141600 + }, + { + "epoch": 15.578107810781079, + "grad_norm": 0.017049144953489304, + "learning_rate": 7.083326509146312e-06, + "loss": 0.026, + "num_input_tokens_seen": 29883232, + "step": 141605 + }, + { + "epoch": 15.578657865786578, + "grad_norm": 0.22050859034061432, + "learning_rate": 7.08165274707579e-06, + "loss": 0.0047, + "num_input_tokens_seen": 29884256, + "step": 141610 + }, + { + "epoch": 15.57920792079208, + "grad_norm": 0.2830222249031067, + "learning_rate": 7.07997915015009e-06, + "loss": 0.0087, + "num_input_tokens_seen": 29885376, + "step": 141615 + }, + { + "epoch": 15.57975797579758, + "grad_norm": 0.026248669251799583, + "learning_rate": 7.078305718384618e-06, + "loss": 0.0239, + "num_input_tokens_seen": 29886432, + "step": 141620 + }, + { + "epoch": 15.58030803080308, + "grad_norm": 0.13450998067855835, + "learning_rate": 7.076632451794807e-06, + "loss": 0.0033, + "num_input_tokens_seen": 29887520, + "step": 141625 + }, + { + "epoch": 15.58085808580858, + "grad_norm": 0.048628464341163635, + "learning_rate": 7.074959350396076e-06, + "loss": 0.0102, + "num_input_tokens_seen": 29888576, + "step": 141630 + }, + { + "epoch": 15.581408140814082, + "grad_norm": 0.04067190736532211, + "learning_rate": 7.0732864142038365e-06, + "loss": 0.0023, + "num_input_tokens_seen": 29889664, + "step": 141635 + }, + { + "epoch": 15.581958195819581, + "grad_norm": 0.08012575656175613, + "learning_rate": 7.071613643233513e-06, + "loss": 0.0095, + "num_input_tokens_seen": 29890720, + "step": 141640 + }, + { + "epoch": 15.582508250825082, + "grad_norm": 1.6123616695404053, + "learning_rate": 7.0699410375005215e-06, + "loss": 0.0428, + "num_input_tokens_seen": 29891808, + "step": 141645 + }, + { + "epoch": 15.583058305830583, + "grad_norm": 0.009669280610978603, + "learning_rate": 7.068268597020292e-06, + "loss": 0.0066, + "num_input_tokens_seen": 29892864, + "step": 141650 + }, + { + "epoch": 15.583608360836084, + "grad_norm": 0.020866205915808678, + "learning_rate": 7.066596321808222e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29893888, + "step": 141655 + }, + { + "epoch": 15.584158415841584, + "grad_norm": 0.02978854440152645, + "learning_rate": 7.064924211879722e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29894976, + "step": 141660 + }, + { + "epoch": 15.584708470847085, + "grad_norm": 0.0203262772411108, + "learning_rate": 7.06325226725022e-06, + "loss": 0.0103, + "num_input_tokens_seen": 29896032, + "step": 141665 + }, + { + "epoch": 15.585258525852586, + "grad_norm": 0.05689811334013939, + "learning_rate": 7.061580487935096e-06, + "loss": 0.1455, + "num_input_tokens_seen": 29897152, + "step": 141670 + }, + { + "epoch": 15.585808580858085, + "grad_norm": 0.06207333877682686, + "learning_rate": 7.059908873949794e-06, + "loss": 0.0092, + "num_input_tokens_seen": 29898176, + "step": 141675 + }, + { + "epoch": 15.586358635863586, + "grad_norm": 0.09069309383630753, + "learning_rate": 7.0582374253097025e-06, + "loss": 0.0041, + "num_input_tokens_seen": 29899200, + "step": 141680 + }, + { + "epoch": 15.586908690869087, + "grad_norm": 0.03432276099920273, + "learning_rate": 7.056566142030222e-06, + "loss": 0.004, + "num_input_tokens_seen": 29900320, + "step": 141685 + }, + { + "epoch": 15.587458745874587, + "grad_norm": 0.12190926820039749, + "learning_rate": 7.0548950241267675e-06, + "loss": 0.171, + "num_input_tokens_seen": 29901344, + "step": 141690 + }, + { + "epoch": 15.588008800880088, + "grad_norm": 0.10436432808637619, + "learning_rate": 7.053224071614725e-06, + "loss": 0.0056, + "num_input_tokens_seen": 29902400, + "step": 141695 + }, + { + "epoch": 15.588558855885589, + "grad_norm": 1.1579620838165283, + "learning_rate": 7.051553284509507e-06, + "loss": 0.0778, + "num_input_tokens_seen": 29903456, + "step": 141700 + }, + { + "epoch": 15.589108910891088, + "grad_norm": 0.029124939814209938, + "learning_rate": 7.049882662826518e-06, + "loss": 0.0463, + "num_input_tokens_seen": 29904512, + "step": 141705 + }, + { + "epoch": 15.58965896589659, + "grad_norm": 0.007433310616761446, + "learning_rate": 7.048212206581137e-06, + "loss": 0.0044, + "num_input_tokens_seen": 29905568, + "step": 141710 + }, + { + "epoch": 15.59020902090209, + "grad_norm": 0.26472795009613037, + "learning_rate": 7.046541915788779e-06, + "loss": 0.0592, + "num_input_tokens_seen": 29906624, + "step": 141715 + }, + { + "epoch": 15.590759075907592, + "grad_norm": 0.34388747811317444, + "learning_rate": 7.04487179046483e-06, + "loss": 0.0053, + "num_input_tokens_seen": 29907616, + "step": 141720 + }, + { + "epoch": 15.591309130913091, + "grad_norm": 2.2200019359588623, + "learning_rate": 7.043201830624674e-06, + "loss": 0.0273, + "num_input_tokens_seen": 29908704, + "step": 141725 + }, + { + "epoch": 15.591859185918592, + "grad_norm": 0.02560681849718094, + "learning_rate": 7.04153203628371e-06, + "loss": 0.0053, + "num_input_tokens_seen": 29909792, + "step": 141730 + }, + { + "epoch": 15.592409240924093, + "grad_norm": 0.28253430128097534, + "learning_rate": 7.0398624074573265e-06, + "loss": 0.0791, + "num_input_tokens_seen": 29910816, + "step": 141735 + }, + { + "epoch": 15.592959295929592, + "grad_norm": 0.061836984008550644, + "learning_rate": 7.038192944160923e-06, + "loss": 0.058, + "num_input_tokens_seen": 29911904, + "step": 141740 + }, + { + "epoch": 15.593509350935093, + "grad_norm": 0.034463461488485336, + "learning_rate": 7.036523646409876e-06, + "loss": 0.0269, + "num_input_tokens_seen": 29913024, + "step": 141745 + }, + { + "epoch": 15.594059405940595, + "grad_norm": 0.027196761220693588, + "learning_rate": 7.03485451421956e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29914016, + "step": 141750 + }, + { + "epoch": 15.594609460946094, + "grad_norm": 0.018972601741552353, + "learning_rate": 7.033185547605373e-06, + "loss": 0.1607, + "num_input_tokens_seen": 29915040, + "step": 141755 + }, + { + "epoch": 15.595159515951595, + "grad_norm": 0.5389490127563477, + "learning_rate": 7.031516746582692e-06, + "loss": 0.0712, + "num_input_tokens_seen": 29916192, + "step": 141760 + }, + { + "epoch": 15.595709570957096, + "grad_norm": 0.01256997138261795, + "learning_rate": 7.029848111166906e-06, + "loss": 0.1579, + "num_input_tokens_seen": 29917280, + "step": 141765 + }, + { + "epoch": 15.596259625962595, + "grad_norm": 1.9425938129425049, + "learning_rate": 7.028179641373389e-06, + "loss": 0.064, + "num_input_tokens_seen": 29918336, + "step": 141770 + }, + { + "epoch": 15.596809680968097, + "grad_norm": 0.006613722071051598, + "learning_rate": 7.026511337217509e-06, + "loss": 0.0016, + "num_input_tokens_seen": 29919392, + "step": 141775 + }, + { + "epoch": 15.597359735973598, + "grad_norm": 1.3957505226135254, + "learning_rate": 7.024843198714656e-06, + "loss": 0.04, + "num_input_tokens_seen": 29920480, + "step": 141780 + }, + { + "epoch": 15.597909790979099, + "grad_norm": 0.027188988402485847, + "learning_rate": 7.023175225880191e-06, + "loss": 0.0025, + "num_input_tokens_seen": 29921536, + "step": 141785 + }, + { + "epoch": 15.598459845984598, + "grad_norm": 0.044047027826309204, + "learning_rate": 7.021507418729492e-06, + "loss": 0.0106, + "num_input_tokens_seen": 29922688, + "step": 141790 + }, + { + "epoch": 15.599009900990099, + "grad_norm": 0.03447505831718445, + "learning_rate": 7.019839777277942e-06, + "loss": 0.0529, + "num_input_tokens_seen": 29923680, + "step": 141795 + }, + { + "epoch": 15.5995599559956, + "grad_norm": 0.06678654998540878, + "learning_rate": 7.01817230154089e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29924768, + "step": 141800 + }, + { + "epoch": 15.6001100110011, + "grad_norm": 0.09100481122732162, + "learning_rate": 7.016504991533726e-06, + "loss": 0.004, + "num_input_tokens_seen": 29925792, + "step": 141805 + }, + { + "epoch": 15.6006600660066, + "grad_norm": 0.025973882526159286, + "learning_rate": 7.014837847271799e-06, + "loss": 0.1284, + "num_input_tokens_seen": 29926880, + "step": 141810 + }, + { + "epoch": 15.601210121012102, + "grad_norm": 1.2484517097473145, + "learning_rate": 7.01317086877048e-06, + "loss": 0.0407, + "num_input_tokens_seen": 29927872, + "step": 141815 + }, + { + "epoch": 15.601760176017601, + "grad_norm": 0.011286861263215542, + "learning_rate": 7.011504056045143e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29928960, + "step": 141820 + }, + { + "epoch": 15.602310231023102, + "grad_norm": 0.012321235612034798, + "learning_rate": 7.009837409111133e-06, + "loss": 0.0009, + "num_input_tokens_seen": 29930048, + "step": 141825 + }, + { + "epoch": 15.602860286028603, + "grad_norm": 2.3243696689605713, + "learning_rate": 7.0081709279838256e-06, + "loss": 0.0376, + "num_input_tokens_seen": 29931136, + "step": 141830 + }, + { + "epoch": 15.603410341034103, + "grad_norm": 1.5146112442016602, + "learning_rate": 7.006504612678569e-06, + "loss": 0.0144, + "num_input_tokens_seen": 29932192, + "step": 141835 + }, + { + "epoch": 15.603960396039604, + "grad_norm": 0.022796960547566414, + "learning_rate": 7.0048384632107305e-06, + "loss": 0.0014, + "num_input_tokens_seen": 29933312, + "step": 141840 + }, + { + "epoch": 15.604510451045105, + "grad_norm": 0.03716578334569931, + "learning_rate": 7.003172479595651e-06, + "loss": 0.0037, + "num_input_tokens_seen": 29934304, + "step": 141845 + }, + { + "epoch": 15.605060506050606, + "grad_norm": 0.08949317038059235, + "learning_rate": 7.0015066618487e-06, + "loss": 0.1007, + "num_input_tokens_seen": 29935424, + "step": 141850 + }, + { + "epoch": 15.605610561056105, + "grad_norm": 0.18087899684906006, + "learning_rate": 6.999841009985231e-06, + "loss": 0.0747, + "num_input_tokens_seen": 29936384, + "step": 141855 + }, + { + "epoch": 15.606160616061606, + "grad_norm": 0.007252807728946209, + "learning_rate": 6.998175524020581e-06, + "loss": 0.0493, + "num_input_tokens_seen": 29937408, + "step": 141860 + }, + { + "epoch": 15.606710671067107, + "grad_norm": 0.02972397953271866, + "learning_rate": 6.996510203970119e-06, + "loss": 0.0046, + "num_input_tokens_seen": 29938464, + "step": 141865 + }, + { + "epoch": 15.607260726072607, + "grad_norm": 0.029188236221671104, + "learning_rate": 6.994845049849175e-06, + "loss": 0.0012, + "num_input_tokens_seen": 29939552, + "step": 141870 + }, + { + "epoch": 15.607810781078108, + "grad_norm": 0.020104406401515007, + "learning_rate": 6.993180061673105e-06, + "loss": 0.005, + "num_input_tokens_seen": 29940608, + "step": 141875 + }, + { + "epoch": 15.608360836083609, + "grad_norm": 0.5459932684898376, + "learning_rate": 6.991515239457261e-06, + "loss": 0.0085, + "num_input_tokens_seen": 29941632, + "step": 141880 + }, + { + "epoch": 15.608910891089108, + "grad_norm": 0.014462675899267197, + "learning_rate": 6.989850583216978e-06, + "loss": 0.0051, + "num_input_tokens_seen": 29942688, + "step": 141885 + }, + { + "epoch": 15.60946094609461, + "grad_norm": 0.10558751970529556, + "learning_rate": 6.988186092967594e-06, + "loss": 0.0128, + "num_input_tokens_seen": 29943712, + "step": 141890 + }, + { + "epoch": 15.61001100110011, + "grad_norm": 0.06323465704917908, + "learning_rate": 6.986521768724463e-06, + "loss": 0.0702, + "num_input_tokens_seen": 29944768, + "step": 141895 + }, + { + "epoch": 15.61056105610561, + "grad_norm": 0.16552422940731049, + "learning_rate": 6.984857610502912e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29945856, + "step": 141900 + }, + { + "epoch": 15.61111111111111, + "grad_norm": 0.15367518365383148, + "learning_rate": 6.983193618318282e-06, + "loss": 0.0651, + "num_input_tokens_seen": 29946912, + "step": 141905 + }, + { + "epoch": 15.611661166116612, + "grad_norm": 0.17378047108650208, + "learning_rate": 6.9815297921859174e-06, + "loss": 0.0291, + "num_input_tokens_seen": 29947968, + "step": 141910 + }, + { + "epoch": 15.612211221122113, + "grad_norm": 0.01289659645408392, + "learning_rate": 6.97986613212114e-06, + "loss": 0.0325, + "num_input_tokens_seen": 29949056, + "step": 141915 + }, + { + "epoch": 15.612761276127612, + "grad_norm": 0.013033783063292503, + "learning_rate": 6.978202638139297e-06, + "loss": 0.039, + "num_input_tokens_seen": 29950080, + "step": 141920 + }, + { + "epoch": 15.613311331133113, + "grad_norm": 0.7676658034324646, + "learning_rate": 6.976539310255706e-06, + "loss": 0.0187, + "num_input_tokens_seen": 29951136, + "step": 141925 + }, + { + "epoch": 15.613861386138614, + "grad_norm": 0.6673507690429688, + "learning_rate": 6.974876148485704e-06, + "loss": 0.0357, + "num_input_tokens_seen": 29952224, + "step": 141930 + }, + { + "epoch": 15.614411441144114, + "grad_norm": 0.003033805638551712, + "learning_rate": 6.973213152844626e-06, + "loss": 0.0092, + "num_input_tokens_seen": 29953280, + "step": 141935 + }, + { + "epoch": 15.614961496149615, + "grad_norm": 0.042761046439409256, + "learning_rate": 6.971550323347784e-06, + "loss": 0.0265, + "num_input_tokens_seen": 29954272, + "step": 141940 + }, + { + "epoch": 15.615511551155116, + "grad_norm": 1.4927129745483398, + "learning_rate": 6.9698876600105215e-06, + "loss": 0.1184, + "num_input_tokens_seen": 29955296, + "step": 141945 + }, + { + "epoch": 15.616061606160617, + "grad_norm": 0.07206033915281296, + "learning_rate": 6.9682251628481435e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29956256, + "step": 141950 + }, + { + "epoch": 15.616611661166116, + "grad_norm": 1.7094531059265137, + "learning_rate": 6.966562831875989e-06, + "loss": 0.0522, + "num_input_tokens_seen": 29957344, + "step": 141955 + }, + { + "epoch": 15.617161716171617, + "grad_norm": 0.0251963771879673, + "learning_rate": 6.964900667109364e-06, + "loss": 0.0024, + "num_input_tokens_seen": 29958400, + "step": 141960 + }, + { + "epoch": 15.617711771177119, + "grad_norm": 5.592431545257568, + "learning_rate": 6.963238668563596e-06, + "loss": 0.0663, + "num_input_tokens_seen": 29959520, + "step": 141965 + }, + { + "epoch": 15.618261826182618, + "grad_norm": 0.0638657957315445, + "learning_rate": 6.961576836254011e-06, + "loss": 0.0015, + "num_input_tokens_seen": 29960544, + "step": 141970 + }, + { + "epoch": 15.618811881188119, + "grad_norm": 0.026982471346855164, + "learning_rate": 6.9599151701959075e-06, + "loss": 0.0047, + "num_input_tokens_seen": 29961600, + "step": 141975 + }, + { + "epoch": 15.61936193619362, + "grad_norm": 0.08400695025920868, + "learning_rate": 6.958253670404616e-06, + "loss": 0.062, + "num_input_tokens_seen": 29962624, + "step": 141980 + }, + { + "epoch": 15.61991199119912, + "grad_norm": 0.0239995326846838, + "learning_rate": 6.956592336895437e-06, + "loss": 0.0121, + "num_input_tokens_seen": 29963584, + "step": 141985 + }, + { + "epoch": 15.62046204620462, + "grad_norm": 0.06065119430422783, + "learning_rate": 6.95493116968369e-06, + "loss": 0.0018, + "num_input_tokens_seen": 29964640, + "step": 141990 + }, + { + "epoch": 15.621012101210122, + "grad_norm": 0.027357909828424454, + "learning_rate": 6.95327016878469e-06, + "loss": 0.004, + "num_input_tokens_seen": 29965728, + "step": 141995 + }, + { + "epoch": 15.62156215621562, + "grad_norm": 0.7028151750564575, + "learning_rate": 6.951609334213729e-06, + "loss": 0.0058, + "num_input_tokens_seen": 29966816, + "step": 142000 + }, + { + "epoch": 15.622112211221122, + "grad_norm": 1.7727231979370117, + "learning_rate": 6.9499486659861335e-06, + "loss": 0.0228, + "num_input_tokens_seen": 29967872, + "step": 142005 + }, + { + "epoch": 15.622662266226623, + "grad_norm": 0.0389975979924202, + "learning_rate": 6.9482881641172004e-06, + "loss": 0.0396, + "num_input_tokens_seen": 29968960, + "step": 142010 + }, + { + "epoch": 15.623212321232124, + "grad_norm": 0.21724091470241547, + "learning_rate": 6.9466278286222264e-06, + "loss": 0.0062, + "num_input_tokens_seen": 29970016, + "step": 142015 + }, + { + "epoch": 15.623762376237623, + "grad_norm": 0.02164602279663086, + "learning_rate": 6.944967659516522e-06, + "loss": 0.0019, + "num_input_tokens_seen": 29971040, + "step": 142020 + }, + { + "epoch": 15.624312431243125, + "grad_norm": 0.05021480843424797, + "learning_rate": 6.943307656815384e-06, + "loss": 0.0011, + "num_input_tokens_seen": 29972128, + "step": 142025 + }, + { + "epoch": 15.624862486248626, + "grad_norm": 1.9110119342803955, + "learning_rate": 6.941647820534122e-06, + "loss": 0.0333, + "num_input_tokens_seen": 29973152, + "step": 142030 + }, + { + "epoch": 15.625412541254125, + "grad_norm": 0.5279118418693542, + "learning_rate": 6.939988150688029e-06, + "loss": 0.0968, + "num_input_tokens_seen": 29974208, + "step": 142035 + }, + { + "epoch": 15.625962596259626, + "grad_norm": 0.04126833751797676, + "learning_rate": 6.938328647292391e-06, + "loss": 0.0061, + "num_input_tokens_seen": 29975296, + "step": 142040 + }, + { + "epoch": 15.626512651265127, + "grad_norm": 0.0036104717291891575, + "learning_rate": 6.936669310362512e-06, + "loss": 0.0026, + "num_input_tokens_seen": 29976288, + "step": 142045 + }, + { + "epoch": 15.627062706270626, + "grad_norm": 0.02310333400964737, + "learning_rate": 6.9350101399136836e-06, + "loss": 0.0359, + "num_input_tokens_seen": 29977312, + "step": 142050 + }, + { + "epoch": 15.627612761276128, + "grad_norm": 0.019035160541534424, + "learning_rate": 6.933351135961205e-06, + "loss": 0.0068, + "num_input_tokens_seen": 29978400, + "step": 142055 + }, + { + "epoch": 15.628162816281629, + "grad_norm": 0.007104204036295414, + "learning_rate": 6.931692298520359e-06, + "loss": 0.0237, + "num_input_tokens_seen": 29979456, + "step": 142060 + }, + { + "epoch": 15.628712871287128, + "grad_norm": 0.030285170301795006, + "learning_rate": 6.9300336276064305e-06, + "loss": 0.0088, + "num_input_tokens_seen": 29980512, + "step": 142065 + }, + { + "epoch": 15.629262926292629, + "grad_norm": 0.018221905454993248, + "learning_rate": 6.928375123234718e-06, + "loss": 0.0007, + "num_input_tokens_seen": 29981568, + "step": 142070 + }, + { + "epoch": 15.62981298129813, + "grad_norm": 1.3878470659255981, + "learning_rate": 6.9267167854204895e-06, + "loss": 0.0386, + "num_input_tokens_seen": 29982688, + "step": 142075 + }, + { + "epoch": 15.630363036303631, + "grad_norm": 0.025846317410469055, + "learning_rate": 6.925058614179042e-06, + "loss": 0.0025, + "num_input_tokens_seen": 29983680, + "step": 142080 + }, + { + "epoch": 15.63091309130913, + "grad_norm": 0.7755401730537415, + "learning_rate": 6.923400609525666e-06, + "loss": 0.0554, + "num_input_tokens_seen": 29984832, + "step": 142085 + }, + { + "epoch": 15.631463146314632, + "grad_norm": 0.02332364022731781, + "learning_rate": 6.921742771475623e-06, + "loss": 0.0028, + "num_input_tokens_seen": 29985888, + "step": 142090 + }, + { + "epoch": 15.632013201320133, + "grad_norm": 1.5982104539871216, + "learning_rate": 6.92008510004421e-06, + "loss": 0.0121, + "num_input_tokens_seen": 29986976, + "step": 142095 + }, + { + "epoch": 15.632563256325632, + "grad_norm": 0.021944692358374596, + "learning_rate": 6.918427595246687e-06, + "loss": 0.0028, + "num_input_tokens_seen": 29988032, + "step": 142100 + }, + { + "epoch": 15.633113311331133, + "grad_norm": 3.8733034133911133, + "learning_rate": 6.916770257098343e-06, + "loss": 0.0521, + "num_input_tokens_seen": 29989120, + "step": 142105 + }, + { + "epoch": 15.633663366336634, + "grad_norm": 0.02437102422118187, + "learning_rate": 6.915113085614458e-06, + "loss": 0.0005, + "num_input_tokens_seen": 29990240, + "step": 142110 + }, + { + "epoch": 15.634213421342134, + "grad_norm": 0.04626598581671715, + "learning_rate": 6.91345608081029e-06, + "loss": 0.0036, + "num_input_tokens_seen": 29991296, + "step": 142115 + }, + { + "epoch": 15.634763476347635, + "grad_norm": 0.10967772454023361, + "learning_rate": 6.911799242701126e-06, + "loss": 0.0028, + "num_input_tokens_seen": 29992352, + "step": 142120 + }, + { + "epoch": 15.635313531353136, + "grad_norm": 0.13895919919013977, + "learning_rate": 6.910142571302228e-06, + "loss": 0.0846, + "num_input_tokens_seen": 29993408, + "step": 142125 + }, + { + "epoch": 15.635863586358635, + "grad_norm": 0.15043503046035767, + "learning_rate": 6.908486066628861e-06, + "loss": 0.0027, + "num_input_tokens_seen": 29994432, + "step": 142130 + }, + { + "epoch": 15.636413641364136, + "grad_norm": 0.008186660706996918, + "learning_rate": 6.906829728696293e-06, + "loss": 0.0034, + "num_input_tokens_seen": 29995520, + "step": 142135 + }, + { + "epoch": 15.636963696369637, + "grad_norm": 1.0781614780426025, + "learning_rate": 6.905173557519795e-06, + "loss": 0.0127, + "num_input_tokens_seen": 29996544, + "step": 142140 + }, + { + "epoch": 15.637513751375138, + "grad_norm": 0.007076701615005732, + "learning_rate": 6.903517553114635e-06, + "loss": 0.0517, + "num_input_tokens_seen": 29997632, + "step": 142145 + }, + { + "epoch": 15.638063806380638, + "grad_norm": 0.11147337406873703, + "learning_rate": 6.901861715496074e-06, + "loss": 0.0109, + "num_input_tokens_seen": 29998688, + "step": 142150 + }, + { + "epoch": 15.638613861386139, + "grad_norm": 0.21202948689460754, + "learning_rate": 6.900206044679361e-06, + "loss": 0.0116, + "num_input_tokens_seen": 29999712, + "step": 142155 + }, + { + "epoch": 15.63916391639164, + "grad_norm": 1.5304185152053833, + "learning_rate": 6.898550540679763e-06, + "loss": 0.0368, + "num_input_tokens_seen": 30000704, + "step": 142160 + }, + { + "epoch": 15.63971397139714, + "grad_norm": 0.01914449781179428, + "learning_rate": 6.8968952035125376e-06, + "loss": 0.054, + "num_input_tokens_seen": 30001792, + "step": 142165 + }, + { + "epoch": 15.64026402640264, + "grad_norm": 0.12993374466896057, + "learning_rate": 6.895240033192951e-06, + "loss": 0.0536, + "num_input_tokens_seen": 30002816, + "step": 142170 + }, + { + "epoch": 15.640814081408141, + "grad_norm": 0.01584656536579132, + "learning_rate": 6.893585029736249e-06, + "loss": 0.0085, + "num_input_tokens_seen": 30003872, + "step": 142175 + }, + { + "epoch": 15.64136413641364, + "grad_norm": 3.6636340618133545, + "learning_rate": 6.891930193157678e-06, + "loss": 0.0938, + "num_input_tokens_seen": 30004960, + "step": 142180 + }, + { + "epoch": 15.641914191419142, + "grad_norm": 0.008312291465699673, + "learning_rate": 6.890275523472506e-06, + "loss": 0.1378, + "num_input_tokens_seen": 30006016, + "step": 142185 + }, + { + "epoch": 15.642464246424643, + "grad_norm": 1.0834765434265137, + "learning_rate": 6.888621020695965e-06, + "loss": 0.0079, + "num_input_tokens_seen": 30007072, + "step": 142190 + }, + { + "epoch": 15.643014301430142, + "grad_norm": 1.6706191301345825, + "learning_rate": 6.8869666848433155e-06, + "loss": 0.0419, + "num_input_tokens_seen": 30008160, + "step": 142195 + }, + { + "epoch": 15.643564356435643, + "grad_norm": 0.10467896610498428, + "learning_rate": 6.885312515929809e-06, + "loss": 0.003, + "num_input_tokens_seen": 30009184, + "step": 142200 + }, + { + "epoch": 15.644114411441144, + "grad_norm": 0.013864338397979736, + "learning_rate": 6.883658513970676e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30010144, + "step": 142205 + }, + { + "epoch": 15.644664466446645, + "grad_norm": 2.2584540843963623, + "learning_rate": 6.88200467898118e-06, + "loss": 0.0693, + "num_input_tokens_seen": 30011264, + "step": 142210 + }, + { + "epoch": 15.645214521452145, + "grad_norm": 0.02290242910385132, + "learning_rate": 6.880351010976544e-06, + "loss": 0.0007, + "num_input_tokens_seen": 30012288, + "step": 142215 + }, + { + "epoch": 15.645764576457646, + "grad_norm": 3.576835870742798, + "learning_rate": 6.878697509972015e-06, + "loss": 0.0783, + "num_input_tokens_seen": 30013312, + "step": 142220 + }, + { + "epoch": 15.646314631463147, + "grad_norm": 0.01251580473035574, + "learning_rate": 6.877044175982847e-06, + "loss": 0.0591, + "num_input_tokens_seen": 30014368, + "step": 142225 + }, + { + "epoch": 15.646864686468646, + "grad_norm": 2.2579987049102783, + "learning_rate": 6.875391009024257e-06, + "loss": 0.1357, + "num_input_tokens_seen": 30015392, + "step": 142230 + }, + { + "epoch": 15.647414741474147, + "grad_norm": 0.04225834086537361, + "learning_rate": 6.873738009111499e-06, + "loss": 0.0119, + "num_input_tokens_seen": 30016512, + "step": 142235 + }, + { + "epoch": 15.647964796479648, + "grad_norm": 0.16871556639671326, + "learning_rate": 6.872085176259796e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30017632, + "step": 142240 + }, + { + "epoch": 15.648514851485148, + "grad_norm": 0.01726241037249565, + "learning_rate": 6.87043251048439e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30018688, + "step": 142245 + }, + { + "epoch": 15.649064906490649, + "grad_norm": 0.10751986503601074, + "learning_rate": 6.868780011800499e-06, + "loss": 0.002, + "num_input_tokens_seen": 30019744, + "step": 142250 + }, + { + "epoch": 15.64961496149615, + "grad_norm": 1.517615795135498, + "learning_rate": 6.867127680223365e-06, + "loss": 0.0559, + "num_input_tokens_seen": 30020704, + "step": 142255 + }, + { + "epoch": 15.65016501650165, + "grad_norm": 0.023137805983424187, + "learning_rate": 6.865475515768219e-06, + "loss": 0.0113, + "num_input_tokens_seen": 30021728, + "step": 142260 + }, + { + "epoch": 15.65071507150715, + "grad_norm": 1.6410565376281738, + "learning_rate": 6.863823518450277e-06, + "loss": 0.0146, + "num_input_tokens_seen": 30022752, + "step": 142265 + }, + { + "epoch": 15.651265126512651, + "grad_norm": 0.04805946722626686, + "learning_rate": 6.86217168828478e-06, + "loss": 0.0765, + "num_input_tokens_seen": 30023840, + "step": 142270 + }, + { + "epoch": 15.651815181518153, + "grad_norm": 0.01208890974521637, + "learning_rate": 6.8605200252869425e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30024864, + "step": 142275 + }, + { + "epoch": 15.652365236523652, + "grad_norm": 0.016321897506713867, + "learning_rate": 6.85886852947197e-06, + "loss": 0.001, + "num_input_tokens_seen": 30025952, + "step": 142280 + }, + { + "epoch": 15.652915291529153, + "grad_norm": 0.02207140251994133, + "learning_rate": 6.85721720085512e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30027040, + "step": 142285 + }, + { + "epoch": 15.653465346534654, + "grad_norm": 0.009269954636693, + "learning_rate": 6.855566039451594e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30028128, + "step": 142290 + }, + { + "epoch": 15.654015401540153, + "grad_norm": 0.014457483775913715, + "learning_rate": 6.853915045276599e-06, + "loss": 0.0054, + "num_input_tokens_seen": 30029184, + "step": 142295 + }, + { + "epoch": 15.654565456545654, + "grad_norm": 0.023976191878318787, + "learning_rate": 6.852264218345369e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30030208, + "step": 142300 + }, + { + "epoch": 15.655115511551156, + "grad_norm": 0.006099373567849398, + "learning_rate": 6.850613558673105e-06, + "loss": 0.0058, + "num_input_tokens_seen": 30031232, + "step": 142305 + }, + { + "epoch": 15.655665566556655, + "grad_norm": 0.09939661622047424, + "learning_rate": 6.848963066275027e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30032256, + "step": 142310 + }, + { + "epoch": 15.656215621562156, + "grad_norm": 0.46794843673706055, + "learning_rate": 6.8473127411663564e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30033344, + "step": 142315 + }, + { + "epoch": 15.656765676567657, + "grad_norm": 0.01757952570915222, + "learning_rate": 6.8456625833622835e-06, + "loss": 0.0535, + "num_input_tokens_seen": 30034400, + "step": 142320 + }, + { + "epoch": 15.657315731573158, + "grad_norm": 0.0195328239351511, + "learning_rate": 6.844012592878035e-06, + "loss": 0.0083, + "num_input_tokens_seen": 30035520, + "step": 142325 + }, + { + "epoch": 15.657865786578657, + "grad_norm": 0.513620138168335, + "learning_rate": 6.842362769728802e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30036608, + "step": 142330 + }, + { + "epoch": 15.658415841584159, + "grad_norm": 0.01098956074565649, + "learning_rate": 6.8407131139298e-06, + "loss": 0.0024, + "num_input_tokens_seen": 30037664, + "step": 142335 + }, + { + "epoch": 15.65896589658966, + "grad_norm": 0.08839701116085052, + "learning_rate": 6.839063625496239e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30038720, + "step": 142340 + }, + { + "epoch": 15.659515951595159, + "grad_norm": 0.29093772172927856, + "learning_rate": 6.837414304443307e-06, + "loss": 0.0117, + "num_input_tokens_seen": 30039712, + "step": 142345 + }, + { + "epoch": 15.66006600660066, + "grad_norm": 0.0639844760298729, + "learning_rate": 6.835765150786219e-06, + "loss": 0.0359, + "num_input_tokens_seen": 30040736, + "step": 142350 + }, + { + "epoch": 15.660616061606161, + "grad_norm": 0.0833885595202446, + "learning_rate": 6.834116164540158e-06, + "loss": 0.0795, + "num_input_tokens_seen": 30041792, + "step": 142355 + }, + { + "epoch": 15.66116611661166, + "grad_norm": 0.009006747975945473, + "learning_rate": 6.832467345720342e-06, + "loss": 0.0565, + "num_input_tokens_seen": 30042880, + "step": 142360 + }, + { + "epoch": 15.661716171617162, + "grad_norm": 0.03609346225857735, + "learning_rate": 6.830818694341947e-06, + "loss": 0.0095, + "num_input_tokens_seen": 30043936, + "step": 142365 + }, + { + "epoch": 15.662266226622663, + "grad_norm": 0.08528371900320053, + "learning_rate": 6.82917021042018e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30045024, + "step": 142370 + }, + { + "epoch": 15.662816281628164, + "grad_norm": 0.18354183435440063, + "learning_rate": 6.827521893970237e-06, + "loss": 0.0342, + "num_input_tokens_seen": 30046080, + "step": 142375 + }, + { + "epoch": 15.663366336633663, + "grad_norm": 1.472775936126709, + "learning_rate": 6.8258737450073e-06, + "loss": 0.0486, + "num_input_tokens_seen": 30047072, + "step": 142380 + }, + { + "epoch": 15.663916391639164, + "grad_norm": 0.10713064670562744, + "learning_rate": 6.824225763546571e-06, + "loss": 0.0054, + "num_input_tokens_seen": 30048160, + "step": 142385 + }, + { + "epoch": 15.664466446644665, + "grad_norm": 0.032650209963321686, + "learning_rate": 6.822577949603229e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30049184, + "step": 142390 + }, + { + "epoch": 15.665016501650165, + "grad_norm": 0.2584185004234314, + "learning_rate": 6.820930303192449e-06, + "loss": 0.1088, + "num_input_tokens_seen": 30050272, + "step": 142395 + }, + { + "epoch": 15.665566556655666, + "grad_norm": 3.001420259475708, + "learning_rate": 6.819282824329448e-06, + "loss": 0.0633, + "num_input_tokens_seen": 30051296, + "step": 142400 + }, + { + "epoch": 15.666116611661167, + "grad_norm": 0.022187724709510803, + "learning_rate": 6.817635513029385e-06, + "loss": 0.0044, + "num_input_tokens_seen": 30052320, + "step": 142405 + }, + { + "epoch": 15.666666666666666, + "grad_norm": 0.07344284653663635, + "learning_rate": 6.81598836930746e-06, + "loss": 0.0126, + "num_input_tokens_seen": 30053440, + "step": 142410 + }, + { + "epoch": 15.667216721672167, + "grad_norm": 2.054784059524536, + "learning_rate": 6.8143413931788415e-06, + "loss": 0.0946, + "num_input_tokens_seen": 30054528, + "step": 142415 + }, + { + "epoch": 15.667766776677668, + "grad_norm": 0.004406735301017761, + "learning_rate": 6.812694584658708e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30055520, + "step": 142420 + }, + { + "epoch": 15.668316831683168, + "grad_norm": 0.025078127160668373, + "learning_rate": 6.811047943762239e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30056640, + "step": 142425 + }, + { + "epoch": 15.668866886688669, + "grad_norm": 0.03033858723938465, + "learning_rate": 6.809401470504615e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30057728, + "step": 142430 + }, + { + "epoch": 15.66941694169417, + "grad_norm": 0.06889643520116806, + "learning_rate": 6.807755164901014e-06, + "loss": 0.0064, + "num_input_tokens_seen": 30058720, + "step": 142435 + }, + { + "epoch": 15.66996699669967, + "grad_norm": 0.055816031992435455, + "learning_rate": 6.806109026966606e-06, + "loss": 0.0079, + "num_input_tokens_seen": 30059808, + "step": 142440 + }, + { + "epoch": 15.67051705170517, + "grad_norm": 0.07905809581279755, + "learning_rate": 6.8044630567165505e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30060832, + "step": 142445 + }, + { + "epoch": 15.671067106710671, + "grad_norm": 0.013286146335303783, + "learning_rate": 6.802817254166038e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30061888, + "step": 142450 + }, + { + "epoch": 15.671617161716172, + "grad_norm": 0.6544754505157471, + "learning_rate": 6.801171619330213e-06, + "loss": 0.0277, + "num_input_tokens_seen": 30062944, + "step": 142455 + }, + { + "epoch": 15.672167216721672, + "grad_norm": 0.00876881554722786, + "learning_rate": 6.79952615222427e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30063968, + "step": 142460 + }, + { + "epoch": 15.672717271727173, + "grad_norm": 0.6742439270019531, + "learning_rate": 6.797880852863361e-06, + "loss": 0.0103, + "num_input_tokens_seen": 30065056, + "step": 142465 + }, + { + "epoch": 15.673267326732674, + "grad_norm": 0.05897163227200508, + "learning_rate": 6.796235721262642e-06, + "loss": 0.0454, + "num_input_tokens_seen": 30066112, + "step": 142470 + }, + { + "epoch": 15.673817381738173, + "grad_norm": 0.019612599164247513, + "learning_rate": 6.794590757437291e-06, + "loss": 0.1378, + "num_input_tokens_seen": 30067168, + "step": 142475 + }, + { + "epoch": 15.674367436743674, + "grad_norm": 1.39590585231781, + "learning_rate": 6.792945961402456e-06, + "loss": 0.0434, + "num_input_tokens_seen": 30068192, + "step": 142480 + }, + { + "epoch": 15.674917491749175, + "grad_norm": 0.014249657280743122, + "learning_rate": 6.791301333173297e-06, + "loss": 0.0478, + "num_input_tokens_seen": 30069248, + "step": 142485 + }, + { + "epoch": 15.675467546754675, + "grad_norm": 0.04420489817857742, + "learning_rate": 6.7896568727649875e-06, + "loss": 0.0841, + "num_input_tokens_seen": 30070272, + "step": 142490 + }, + { + "epoch": 15.676017601760176, + "grad_norm": 1.891031265258789, + "learning_rate": 6.788012580192662e-06, + "loss": 0.2031, + "num_input_tokens_seen": 30071264, + "step": 142495 + }, + { + "epoch": 15.676567656765677, + "grad_norm": 0.09692558646202087, + "learning_rate": 6.786368455471498e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30072256, + "step": 142500 + }, + { + "epoch": 15.677117711771178, + "grad_norm": 0.03002808429300785, + "learning_rate": 6.784724498616632e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30073280, + "step": 142505 + }, + { + "epoch": 15.677667766776677, + "grad_norm": 0.04585866630077362, + "learning_rate": 6.783080709643214e-06, + "loss": 0.0891, + "num_input_tokens_seen": 30074336, + "step": 142510 + }, + { + "epoch": 15.678217821782178, + "grad_norm": 1.4714622497558594, + "learning_rate": 6.781437088566397e-06, + "loss": 0.0927, + "num_input_tokens_seen": 30075424, + "step": 142515 + }, + { + "epoch": 15.67876787678768, + "grad_norm": 0.022891566157341003, + "learning_rate": 6.779793635401335e-06, + "loss": 0.0409, + "num_input_tokens_seen": 30076544, + "step": 142520 + }, + { + "epoch": 15.679317931793179, + "grad_norm": 0.006490072701126337, + "learning_rate": 6.778150350163179e-06, + "loss": 0.001, + "num_input_tokens_seen": 30077696, + "step": 142525 + }, + { + "epoch": 15.67986798679868, + "grad_norm": 1.5092295408248901, + "learning_rate": 6.776507232867069e-06, + "loss": 0.0364, + "num_input_tokens_seen": 30078720, + "step": 142530 + }, + { + "epoch": 15.680418041804181, + "grad_norm": 0.033833179622888565, + "learning_rate": 6.774864283528137e-06, + "loss": 0.007, + "num_input_tokens_seen": 30079776, + "step": 142535 + }, + { + "epoch": 15.68096809680968, + "grad_norm": 0.012630457058548927, + "learning_rate": 6.773221502161536e-06, + "loss": 0.1278, + "num_input_tokens_seen": 30080864, + "step": 142540 + }, + { + "epoch": 15.681518151815181, + "grad_norm": 0.06704293936491013, + "learning_rate": 6.77157888878241e-06, + "loss": 0.0158, + "num_input_tokens_seen": 30081952, + "step": 142545 + }, + { + "epoch": 15.682068206820682, + "grad_norm": 0.042018089443445206, + "learning_rate": 6.769936443405897e-06, + "loss": 0.0024, + "num_input_tokens_seen": 30083008, + "step": 142550 + }, + { + "epoch": 15.682618261826182, + "grad_norm": 0.02572271227836609, + "learning_rate": 6.768294166047134e-06, + "loss": 0.0879, + "num_input_tokens_seen": 30084032, + "step": 142555 + }, + { + "epoch": 15.683168316831683, + "grad_norm": 0.03059118054807186, + "learning_rate": 6.766652056721248e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30085024, + "step": 142560 + }, + { + "epoch": 15.683718371837184, + "grad_norm": 0.014616437256336212, + "learning_rate": 6.76501011544339e-06, + "loss": 0.001, + "num_input_tokens_seen": 30086112, + "step": 142565 + }, + { + "epoch": 15.684268426842685, + "grad_norm": 0.05840734392404556, + "learning_rate": 6.7633683422286735e-06, + "loss": 0.0032, + "num_input_tokens_seen": 30087200, + "step": 142570 + }, + { + "epoch": 15.684818481848184, + "grad_norm": 0.16248458623886108, + "learning_rate": 6.761726737092242e-06, + "loss": 0.0151, + "num_input_tokens_seen": 30088256, + "step": 142575 + }, + { + "epoch": 15.685368536853685, + "grad_norm": 0.24641847610473633, + "learning_rate": 6.7600853000492324e-06, + "loss": 0.1585, + "num_input_tokens_seen": 30089376, + "step": 142580 + }, + { + "epoch": 15.685918591859187, + "grad_norm": 0.09727608412504196, + "learning_rate": 6.758444031114755e-06, + "loss": 0.006, + "num_input_tokens_seen": 30090432, + "step": 142585 + }, + { + "epoch": 15.686468646864686, + "grad_norm": 1.674699306488037, + "learning_rate": 6.7568029303039515e-06, + "loss": 0.1046, + "num_input_tokens_seen": 30091488, + "step": 142590 + }, + { + "epoch": 15.687018701870187, + "grad_norm": 1.818623661994934, + "learning_rate": 6.755161997631937e-06, + "loss": 0.051, + "num_input_tokens_seen": 30092640, + "step": 142595 + }, + { + "epoch": 15.687568756875688, + "grad_norm": 0.5385724306106567, + "learning_rate": 6.753521233113838e-06, + "loss": 0.0121, + "num_input_tokens_seen": 30093632, + "step": 142600 + }, + { + "epoch": 15.688118811881187, + "grad_norm": 1.373556137084961, + "learning_rate": 6.751880636764787e-06, + "loss": 0.05, + "num_input_tokens_seen": 30094752, + "step": 142605 + }, + { + "epoch": 15.688668866886688, + "grad_norm": 0.9003069400787354, + "learning_rate": 6.750240208599887e-06, + "loss": 0.0116, + "num_input_tokens_seen": 30095776, + "step": 142610 + }, + { + "epoch": 15.68921892189219, + "grad_norm": 0.06736867874860764, + "learning_rate": 6.748599948634274e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30096768, + "step": 142615 + }, + { + "epoch": 15.689768976897689, + "grad_norm": 0.0064743394032120705, + "learning_rate": 6.7469598568830505e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30097824, + "step": 142620 + }, + { + "epoch": 15.69031903190319, + "grad_norm": 0.00883476808667183, + "learning_rate": 6.745319933361344e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30098848, + "step": 142625 + }, + { + "epoch": 15.690869086908691, + "grad_norm": 0.1948530375957489, + "learning_rate": 6.7436801780842605e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30099904, + "step": 142630 + }, + { + "epoch": 15.691419141914192, + "grad_norm": 0.1066196858882904, + "learning_rate": 6.742040591066914e-06, + "loss": 0.0509, + "num_input_tokens_seen": 30101024, + "step": 142635 + }, + { + "epoch": 15.691969196919691, + "grad_norm": 0.10054241865873337, + "learning_rate": 6.740401172324426e-06, + "loss": 0.0082, + "num_input_tokens_seen": 30102048, + "step": 142640 + }, + { + "epoch": 15.692519251925193, + "grad_norm": 0.5204966068267822, + "learning_rate": 6.738761921871892e-06, + "loss": 0.0252, + "num_input_tokens_seen": 30103104, + "step": 142645 + }, + { + "epoch": 15.693069306930694, + "grad_norm": 0.020252225920557976, + "learning_rate": 6.737122839724436e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30104224, + "step": 142650 + }, + { + "epoch": 15.693619361936193, + "grad_norm": 0.06334200501441956, + "learning_rate": 6.735483925897146e-06, + "loss": 0.005, + "num_input_tokens_seen": 30105280, + "step": 142655 + }, + { + "epoch": 15.694169416941694, + "grad_norm": 0.055044230073690414, + "learning_rate": 6.733845180405135e-06, + "loss": 0.1292, + "num_input_tokens_seen": 30106400, + "step": 142660 + }, + { + "epoch": 15.694719471947195, + "grad_norm": 0.05271265655755997, + "learning_rate": 6.732206603263519e-06, + "loss": 0.0219, + "num_input_tokens_seen": 30107392, + "step": 142665 + }, + { + "epoch": 15.695269526952695, + "grad_norm": 0.23712725937366486, + "learning_rate": 6.730568194487385e-06, + "loss": 0.1036, + "num_input_tokens_seen": 30108448, + "step": 142670 + }, + { + "epoch": 15.695819581958196, + "grad_norm": 0.09590814262628555, + "learning_rate": 6.728929954091834e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30109472, + "step": 142675 + }, + { + "epoch": 15.696369636963697, + "grad_norm": 2.78539776802063, + "learning_rate": 6.727291882091974e-06, + "loss": 0.0598, + "num_input_tokens_seen": 30110496, + "step": 142680 + }, + { + "epoch": 15.696919691969196, + "grad_norm": 0.054449353367090225, + "learning_rate": 6.725653978502888e-06, + "loss": 0.0079, + "num_input_tokens_seen": 30111520, + "step": 142685 + }, + { + "epoch": 15.697469746974697, + "grad_norm": 0.049360353499650955, + "learning_rate": 6.7240162433396815e-06, + "loss": 0.0744, + "num_input_tokens_seen": 30112576, + "step": 142690 + }, + { + "epoch": 15.698019801980198, + "grad_norm": 0.0199586171656847, + "learning_rate": 6.722378676617455e-06, + "loss": 0.0331, + "num_input_tokens_seen": 30113568, + "step": 142695 + }, + { + "epoch": 15.6985698569857, + "grad_norm": 0.057668838649988174, + "learning_rate": 6.7207412783512865e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30114560, + "step": 142700 + }, + { + "epoch": 15.699119911991199, + "grad_norm": 0.1269604116678238, + "learning_rate": 6.71910404855628e-06, + "loss": 0.1287, + "num_input_tokens_seen": 30115520, + "step": 142705 + }, + { + "epoch": 15.6996699669967, + "grad_norm": 0.010750263929367065, + "learning_rate": 6.717466987247514e-06, + "loss": 0.0257, + "num_input_tokens_seen": 30116576, + "step": 142710 + }, + { + "epoch": 15.7002200220022, + "grad_norm": 0.044557925313711166, + "learning_rate": 6.715830094440081e-06, + "loss": 0.0951, + "num_input_tokens_seen": 30117632, + "step": 142715 + }, + { + "epoch": 15.7007700770077, + "grad_norm": 0.013546367175877094, + "learning_rate": 6.714193370149077e-06, + "loss": 0.0109, + "num_input_tokens_seen": 30118656, + "step": 142720 + }, + { + "epoch": 15.701320132013201, + "grad_norm": 0.017500028014183044, + "learning_rate": 6.71255681438957e-06, + "loss": 0.0925, + "num_input_tokens_seen": 30119648, + "step": 142725 + }, + { + "epoch": 15.701870187018702, + "grad_norm": 0.47589439153671265, + "learning_rate": 6.710920427176659e-06, + "loss": 0.0709, + "num_input_tokens_seen": 30120704, + "step": 142730 + }, + { + "epoch": 15.702420242024202, + "grad_norm": 0.009838470257818699, + "learning_rate": 6.709284208525413e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30121728, + "step": 142735 + }, + { + "epoch": 15.702970297029703, + "grad_norm": 2.209996223449707, + "learning_rate": 6.707648158450924e-06, + "loss": 0.0213, + "num_input_tokens_seen": 30122784, + "step": 142740 + }, + { + "epoch": 15.703520352035204, + "grad_norm": 0.012718440033495426, + "learning_rate": 6.706012276968257e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30123872, + "step": 142745 + }, + { + "epoch": 15.704070407040705, + "grad_norm": 0.47010231018066406, + "learning_rate": 6.704376564092496e-06, + "loss": 0.1055, + "num_input_tokens_seen": 30124864, + "step": 142750 + }, + { + "epoch": 15.704620462046204, + "grad_norm": 0.3656904399394989, + "learning_rate": 6.702741019838724e-06, + "loss": 0.0571, + "num_input_tokens_seen": 30125888, + "step": 142755 + }, + { + "epoch": 15.705170517051705, + "grad_norm": 0.2932392954826355, + "learning_rate": 6.7011056442219995e-06, + "loss": 0.0103, + "num_input_tokens_seen": 30126976, + "step": 142760 + }, + { + "epoch": 15.705720572057206, + "grad_norm": 0.016556352376937866, + "learning_rate": 6.699470437257413e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30127968, + "step": 142765 + }, + { + "epoch": 15.706270627062706, + "grad_norm": 0.1370263248682022, + "learning_rate": 6.69783539896002e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30128992, + "step": 142770 + }, + { + "epoch": 15.706820682068207, + "grad_norm": 0.02291039004921913, + "learning_rate": 6.696200529344896e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30130112, + "step": 142775 + }, + { + "epoch": 15.707370737073708, + "grad_norm": 0.024894988164305687, + "learning_rate": 6.6945658284271125e-06, + "loss": 0.041, + "num_input_tokens_seen": 30131200, + "step": 142780 + }, + { + "epoch": 15.707920792079207, + "grad_norm": 0.007556950207799673, + "learning_rate": 6.692931296221727e-06, + "loss": 0.0362, + "num_input_tokens_seen": 30132256, + "step": 142785 + }, + { + "epoch": 15.708470847084708, + "grad_norm": 0.08072946220636368, + "learning_rate": 6.691296932743815e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30133248, + "step": 142790 + }, + { + "epoch": 15.70902090209021, + "grad_norm": 0.034752849489450455, + "learning_rate": 6.689662738008437e-06, + "loss": 0.1316, + "num_input_tokens_seen": 30134368, + "step": 142795 + }, + { + "epoch": 15.70957095709571, + "grad_norm": 0.0366324819624424, + "learning_rate": 6.6880287120306416e-06, + "loss": 0.0571, + "num_input_tokens_seen": 30135424, + "step": 142800 + }, + { + "epoch": 15.71012101210121, + "grad_norm": 0.07084015011787415, + "learning_rate": 6.686394854825498e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30136416, + "step": 142805 + }, + { + "epoch": 15.710671067106711, + "grad_norm": 0.12966854870319366, + "learning_rate": 6.684761166408066e-06, + "loss": 0.0475, + "num_input_tokens_seen": 30137472, + "step": 142810 + }, + { + "epoch": 15.711221122112212, + "grad_norm": 3.40936541557312, + "learning_rate": 6.683127646793411e-06, + "loss": 0.0885, + "num_input_tokens_seen": 30138496, + "step": 142815 + }, + { + "epoch": 15.711771177117711, + "grad_norm": 0.09088015556335449, + "learning_rate": 6.681494295996576e-06, + "loss": 0.0036, + "num_input_tokens_seen": 30139584, + "step": 142820 + }, + { + "epoch": 15.712321232123212, + "grad_norm": 0.06164313480257988, + "learning_rate": 6.679861114032612e-06, + "loss": 0.0075, + "num_input_tokens_seen": 30140704, + "step": 142825 + }, + { + "epoch": 15.712871287128714, + "grad_norm": 0.09046010673046112, + "learning_rate": 6.678228100916578e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30141760, + "step": 142830 + }, + { + "epoch": 15.713421342134213, + "grad_norm": 0.02639855444431305, + "learning_rate": 6.676595256663523e-06, + "loss": 0.0701, + "num_input_tokens_seen": 30142816, + "step": 142835 + }, + { + "epoch": 15.713971397139714, + "grad_norm": 0.39963915944099426, + "learning_rate": 6.674962581288505e-06, + "loss": 0.0991, + "num_input_tokens_seen": 30143808, + "step": 142840 + }, + { + "epoch": 15.714521452145215, + "grad_norm": 0.07917655259370804, + "learning_rate": 6.673330074806564e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30144864, + "step": 142845 + }, + { + "epoch": 15.715071507150714, + "grad_norm": 0.023826349526643753, + "learning_rate": 6.671697737232738e-06, + "loss": 0.0079, + "num_input_tokens_seen": 30145888, + "step": 142850 + }, + { + "epoch": 15.715621562156215, + "grad_norm": 0.37132528424263, + "learning_rate": 6.670065568582087e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30146944, + "step": 142855 + }, + { + "epoch": 15.716171617161717, + "grad_norm": 0.06537892669439316, + "learning_rate": 6.66843356886964e-06, + "loss": 0.0039, + "num_input_tokens_seen": 30147968, + "step": 142860 + }, + { + "epoch": 15.716721672167218, + "grad_norm": 0.09904114156961441, + "learning_rate": 6.66680173811044e-06, + "loss": 0.0142, + "num_input_tokens_seen": 30149056, + "step": 142865 + }, + { + "epoch": 15.717271727172717, + "grad_norm": 0.012820374220609665, + "learning_rate": 6.665170076319541e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30150112, + "step": 142870 + }, + { + "epoch": 15.717821782178218, + "grad_norm": 0.17617656290531158, + "learning_rate": 6.663538583511966e-06, + "loss": 0.0353, + "num_input_tokens_seen": 30151200, + "step": 142875 + }, + { + "epoch": 15.718371837183719, + "grad_norm": 0.06538976728916168, + "learning_rate": 6.661907259702763e-06, + "loss": 0.0064, + "num_input_tokens_seen": 30152224, + "step": 142880 + }, + { + "epoch": 15.718921892189218, + "grad_norm": 0.05651204288005829, + "learning_rate": 6.660276104906954e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30153248, + "step": 142885 + }, + { + "epoch": 15.71947194719472, + "grad_norm": 0.02464975230395794, + "learning_rate": 6.658645119139578e-06, + "loss": 0.002, + "num_input_tokens_seen": 30154272, + "step": 142890 + }, + { + "epoch": 15.72002200220022, + "grad_norm": 0.014509673230350018, + "learning_rate": 6.657014302415679e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30155296, + "step": 142895 + }, + { + "epoch": 15.72057205720572, + "grad_norm": 0.03921254724264145, + "learning_rate": 6.655383654750269e-06, + "loss": 0.0568, + "num_input_tokens_seen": 30156352, + "step": 142900 + }, + { + "epoch": 15.721122112211221, + "grad_norm": 0.653817892074585, + "learning_rate": 6.653753176158392e-06, + "loss": 0.0825, + "num_input_tokens_seen": 30157440, + "step": 142905 + }, + { + "epoch": 15.721672167216722, + "grad_norm": 0.007425138261169195, + "learning_rate": 6.65212286665507e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30158528, + "step": 142910 + }, + { + "epoch": 15.722222222222221, + "grad_norm": 1.57747220993042, + "learning_rate": 6.65049272625532e-06, + "loss": 0.0902, + "num_input_tokens_seen": 30159616, + "step": 142915 + }, + { + "epoch": 15.722772277227723, + "grad_norm": 0.02666233293712139, + "learning_rate": 6.648862754974172e-06, + "loss": 0.0104, + "num_input_tokens_seen": 30160672, + "step": 142920 + }, + { + "epoch": 15.723322332233224, + "grad_norm": 0.019816933199763298, + "learning_rate": 6.647232952826651e-06, + "loss": 0.0627, + "num_input_tokens_seen": 30161728, + "step": 142925 + }, + { + "epoch": 15.723872387238725, + "grad_norm": 0.02137235924601555, + "learning_rate": 6.645603319827787e-06, + "loss": 0.1195, + "num_input_tokens_seen": 30162752, + "step": 142930 + }, + { + "epoch": 15.724422442244224, + "grad_norm": 0.15165089070796967, + "learning_rate": 6.643973855992585e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30163808, + "step": 142935 + }, + { + "epoch": 15.724972497249725, + "grad_norm": 0.043700531125068665, + "learning_rate": 6.642344561336064e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30164864, + "step": 142940 + }, + { + "epoch": 15.725522552255226, + "grad_norm": 0.08347935229539871, + "learning_rate": 6.6407154358732435e-06, + "loss": 0.0072, + "num_input_tokens_seen": 30165920, + "step": 142945 + }, + { + "epoch": 15.726072607260726, + "grad_norm": 0.041376058012247086, + "learning_rate": 6.639086479619139e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30166944, + "step": 142950 + }, + { + "epoch": 15.726622662266227, + "grad_norm": 0.5881844162940979, + "learning_rate": 6.637457692588772e-06, + "loss": 0.0129, + "num_input_tokens_seen": 30168000, + "step": 142955 + }, + { + "epoch": 15.727172717271728, + "grad_norm": 0.07876396924257278, + "learning_rate": 6.635829074797148e-06, + "loss": 0.0886, + "num_input_tokens_seen": 30169056, + "step": 142960 + }, + { + "epoch": 15.727722772277227, + "grad_norm": 0.022609231993556023, + "learning_rate": 6.634200626259265e-06, + "loss": 0.0124, + "num_input_tokens_seen": 30170144, + "step": 142965 + }, + { + "epoch": 15.728272827282728, + "grad_norm": 0.03810252621769905, + "learning_rate": 6.632572346990151e-06, + "loss": 0.0056, + "num_input_tokens_seen": 30171168, + "step": 142970 + }, + { + "epoch": 15.72882288228823, + "grad_norm": 0.2631438970565796, + "learning_rate": 6.6309442370047965e-06, + "loss": 0.0164, + "num_input_tokens_seen": 30172256, + "step": 142975 + }, + { + "epoch": 15.729372937293729, + "grad_norm": 0.20616070926189423, + "learning_rate": 6.629316296318213e-06, + "loss": 0.0162, + "num_input_tokens_seen": 30173376, + "step": 142980 + }, + { + "epoch": 15.72992299229923, + "grad_norm": 0.06616324186325073, + "learning_rate": 6.627688524945414e-06, + "loss": 0.0053, + "num_input_tokens_seen": 30174432, + "step": 142985 + }, + { + "epoch": 15.73047304730473, + "grad_norm": 0.06625479459762573, + "learning_rate": 6.626060922901389e-06, + "loss": 0.0039, + "num_input_tokens_seen": 30175456, + "step": 142990 + }, + { + "epoch": 15.731023102310232, + "grad_norm": 0.027773726731538773, + "learning_rate": 6.624433490201148e-06, + "loss": 0.001, + "num_input_tokens_seen": 30176544, + "step": 142995 + }, + { + "epoch": 15.731573157315731, + "grad_norm": 0.024499548599123955, + "learning_rate": 6.6228062268596815e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30177600, + "step": 143000 + }, + { + "epoch": 15.732123212321232, + "grad_norm": 0.04520252346992493, + "learning_rate": 6.621179132891989e-06, + "loss": 0.0794, + "num_input_tokens_seen": 30178688, + "step": 143005 + }, + { + "epoch": 15.732673267326733, + "grad_norm": 0.3034247159957886, + "learning_rate": 6.619552208313079e-06, + "loss": 0.0391, + "num_input_tokens_seen": 30179744, + "step": 143010 + }, + { + "epoch": 15.733223322332233, + "grad_norm": 0.4141062796115875, + "learning_rate": 6.61792545313793e-06, + "loss": 0.2059, + "num_input_tokens_seen": 30180832, + "step": 143015 + }, + { + "epoch": 15.733773377337734, + "grad_norm": 0.017027372494339943, + "learning_rate": 6.616298867381546e-06, + "loss": 0.0053, + "num_input_tokens_seen": 30181888, + "step": 143020 + }, + { + "epoch": 15.734323432343235, + "grad_norm": 0.10049566626548767, + "learning_rate": 6.6146724510589066e-06, + "loss": 0.004, + "num_input_tokens_seen": 30182944, + "step": 143025 + }, + { + "epoch": 15.734873487348734, + "grad_norm": 0.025040607899427414, + "learning_rate": 6.6130462041850195e-06, + "loss": 0.0036, + "num_input_tokens_seen": 30184064, + "step": 143030 + }, + { + "epoch": 15.735423542354235, + "grad_norm": 0.0524725578725338, + "learning_rate": 6.611420126774853e-06, + "loss": 0.0026, + "num_input_tokens_seen": 30185088, + "step": 143035 + }, + { + "epoch": 15.735973597359736, + "grad_norm": 0.07284191995859146, + "learning_rate": 6.609794218843404e-06, + "loss": 0.0502, + "num_input_tokens_seen": 30186176, + "step": 143040 + }, + { + "epoch": 15.736523652365236, + "grad_norm": 2.2952358722686768, + "learning_rate": 6.608168480405666e-06, + "loss": 0.0444, + "num_input_tokens_seen": 30187232, + "step": 143045 + }, + { + "epoch": 15.737073707370737, + "grad_norm": 0.04119696840643883, + "learning_rate": 6.60654291147661e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30188320, + "step": 143050 + }, + { + "epoch": 15.737623762376238, + "grad_norm": 0.046893954277038574, + "learning_rate": 6.604917512071218e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30189376, + "step": 143055 + }, + { + "epoch": 15.738173817381739, + "grad_norm": 0.015642983838915825, + "learning_rate": 6.603292282204482e-06, + "loss": 0.0039, + "num_input_tokens_seen": 30190432, + "step": 143060 + }, + { + "epoch": 15.738723872387238, + "grad_norm": 0.06334549188613892, + "learning_rate": 6.601667221891356e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30191520, + "step": 143065 + }, + { + "epoch": 15.73927392739274, + "grad_norm": 0.009601402096450329, + "learning_rate": 6.6000423311468525e-06, + "loss": 0.2291, + "num_input_tokens_seen": 30192576, + "step": 143070 + }, + { + "epoch": 15.73982398239824, + "grad_norm": 0.04738784581422806, + "learning_rate": 6.598417609985927e-06, + "loss": 0.0529, + "num_input_tokens_seen": 30193664, + "step": 143075 + }, + { + "epoch": 15.74037403740374, + "grad_norm": 0.020557943731546402, + "learning_rate": 6.5967930584235505e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30194720, + "step": 143080 + }, + { + "epoch": 15.74092409240924, + "grad_norm": 0.37680739164352417, + "learning_rate": 6.595168676474708e-06, + "loss": 0.0525, + "num_input_tokens_seen": 30195776, + "step": 143085 + }, + { + "epoch": 15.741474147414742, + "grad_norm": 0.01831931434571743, + "learning_rate": 6.593544464154358e-06, + "loss": 0.004, + "num_input_tokens_seen": 30196864, + "step": 143090 + }, + { + "epoch": 15.742024202420241, + "grad_norm": 0.016587210819125175, + "learning_rate": 6.5919204214774765e-06, + "loss": 0.0157, + "num_input_tokens_seen": 30197920, + "step": 143095 + }, + { + "epoch": 15.742574257425742, + "grad_norm": 0.26307156682014465, + "learning_rate": 6.590296548459041e-06, + "loss": 0.0524, + "num_input_tokens_seen": 30199040, + "step": 143100 + }, + { + "epoch": 15.743124312431243, + "grad_norm": 0.16854248940944672, + "learning_rate": 6.588672845113997e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30200032, + "step": 143105 + }, + { + "epoch": 15.743674367436743, + "grad_norm": 0.015308639965951443, + "learning_rate": 6.58704931145733e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30201056, + "step": 143110 + }, + { + "epoch": 15.744224422442244, + "grad_norm": 1.6775811910629272, + "learning_rate": 6.585425947503987e-06, + "loss": 0.0407, + "num_input_tokens_seen": 30202016, + "step": 143115 + }, + { + "epoch": 15.744774477447745, + "grad_norm": 0.06334466487169266, + "learning_rate": 6.583802753268936e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30203072, + "step": 143120 + }, + { + "epoch": 15.745324532453246, + "grad_norm": 0.3669103682041168, + "learning_rate": 6.582179728767146e-06, + "loss": 0.0335, + "num_input_tokens_seen": 30204160, + "step": 143125 + }, + { + "epoch": 15.745874587458745, + "grad_norm": 0.0718034878373146, + "learning_rate": 6.58055687401356e-06, + "loss": 0.0143, + "num_input_tokens_seen": 30205216, + "step": 143130 + }, + { + "epoch": 15.746424642464246, + "grad_norm": 0.007055702153593302, + "learning_rate": 6.57893418902315e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30206240, + "step": 143135 + }, + { + "epoch": 15.746974697469748, + "grad_norm": 0.04399208724498749, + "learning_rate": 6.577311673810857e-06, + "loss": 0.0062, + "num_input_tokens_seen": 30207232, + "step": 143140 + }, + { + "epoch": 15.747524752475247, + "grad_norm": 0.08440748602151871, + "learning_rate": 6.575689328391648e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30208256, + "step": 143145 + }, + { + "epoch": 15.748074807480748, + "grad_norm": 0.09196463227272034, + "learning_rate": 6.574067152780467e-06, + "loss": 0.172, + "num_input_tokens_seen": 30209280, + "step": 143150 + }, + { + "epoch": 15.748624862486249, + "grad_norm": 1.967848777770996, + "learning_rate": 6.572445146992262e-06, + "loss": 0.0403, + "num_input_tokens_seen": 30210400, + "step": 143155 + }, + { + "epoch": 15.749174917491748, + "grad_norm": 0.026234770193696022, + "learning_rate": 6.570823311041999e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30211392, + "step": 143160 + }, + { + "epoch": 15.74972497249725, + "grad_norm": 0.24675863981246948, + "learning_rate": 6.569201644944606e-06, + "loss": 0.0033, + "num_input_tokens_seen": 30212544, + "step": 143165 + }, + { + "epoch": 15.75027502750275, + "grad_norm": 2.750227928161621, + "learning_rate": 6.567580148715044e-06, + "loss": 0.0339, + "num_input_tokens_seen": 30213664, + "step": 143170 + }, + { + "epoch": 15.750825082508252, + "grad_norm": 3.4433958530426025, + "learning_rate": 6.565958822368251e-06, + "loss": 0.092, + "num_input_tokens_seen": 30214688, + "step": 143175 + }, + { + "epoch": 15.751375137513751, + "grad_norm": 0.06485200673341751, + "learning_rate": 6.564337665919154e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30215744, + "step": 143180 + }, + { + "epoch": 15.751925192519252, + "grad_norm": 0.12745390832424164, + "learning_rate": 6.5627166793827275e-06, + "loss": 0.1043, + "num_input_tokens_seen": 30216800, + "step": 143185 + }, + { + "epoch": 15.752475247524753, + "grad_norm": 0.045565687119960785, + "learning_rate": 6.561095862773886e-06, + "loss": 0.0033, + "num_input_tokens_seen": 30217888, + "step": 143190 + }, + { + "epoch": 15.753025302530252, + "grad_norm": 0.11753550171852112, + "learning_rate": 6.559475216107585e-06, + "loss": 0.003, + "num_input_tokens_seen": 30218912, + "step": 143195 + }, + { + "epoch": 15.753575357535754, + "grad_norm": 2.44543194770813, + "learning_rate": 6.55785473939875e-06, + "loss": 0.063, + "num_input_tokens_seen": 30219936, + "step": 143200 + }, + { + "epoch": 15.754125412541255, + "grad_norm": 0.10673987865447998, + "learning_rate": 6.5562344326623116e-06, + "loss": 0.0036, + "num_input_tokens_seen": 30221024, + "step": 143205 + }, + { + "epoch": 15.754675467546754, + "grad_norm": 0.05195103958249092, + "learning_rate": 6.554614295913211e-06, + "loss": 0.0166, + "num_input_tokens_seen": 30222144, + "step": 143210 + }, + { + "epoch": 15.755225522552255, + "grad_norm": 0.9321435689926147, + "learning_rate": 6.552994329166382e-06, + "loss": 0.0091, + "num_input_tokens_seen": 30223232, + "step": 143215 + }, + { + "epoch": 15.755775577557756, + "grad_norm": 0.07393506169319153, + "learning_rate": 6.551374532436755e-06, + "loss": 0.0759, + "num_input_tokens_seen": 30224256, + "step": 143220 + }, + { + "epoch": 15.756325632563257, + "grad_norm": 0.036118701100349426, + "learning_rate": 6.549754905739258e-06, + "loss": 0.0949, + "num_input_tokens_seen": 30225312, + "step": 143225 + }, + { + "epoch": 15.756875687568757, + "grad_norm": 0.18147440254688263, + "learning_rate": 6.54813544908881e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30226432, + "step": 143230 + }, + { + "epoch": 15.757425742574258, + "grad_norm": 0.04844505712389946, + "learning_rate": 6.5465161625003516e-06, + "loss": 0.0151, + "num_input_tokens_seen": 30227488, + "step": 143235 + }, + { + "epoch": 15.757975797579759, + "grad_norm": 1.3032411336898804, + "learning_rate": 6.544897045988791e-06, + "loss": 0.0209, + "num_input_tokens_seen": 30228544, + "step": 143240 + }, + { + "epoch": 15.758525852585258, + "grad_norm": 2.1270339488983154, + "learning_rate": 6.543278099569059e-06, + "loss": 0.1365, + "num_input_tokens_seen": 30229632, + "step": 143245 + }, + { + "epoch": 15.75907590759076, + "grad_norm": 0.007680566515773535, + "learning_rate": 6.541659323256083e-06, + "loss": 0.0669, + "num_input_tokens_seen": 30230688, + "step": 143250 + }, + { + "epoch": 15.75962596259626, + "grad_norm": 0.4691812992095947, + "learning_rate": 6.540040717064769e-06, + "loss": 0.0073, + "num_input_tokens_seen": 30231808, + "step": 143255 + }, + { + "epoch": 15.76017601760176, + "grad_norm": 0.15605153143405914, + "learning_rate": 6.538422281010048e-06, + "loss": 0.086, + "num_input_tokens_seen": 30232928, + "step": 143260 + }, + { + "epoch": 15.76072607260726, + "grad_norm": 0.02873578667640686, + "learning_rate": 6.536804015106823e-06, + "loss": 0.0024, + "num_input_tokens_seen": 30233984, + "step": 143265 + }, + { + "epoch": 15.761276127612762, + "grad_norm": 4.192117214202881, + "learning_rate": 6.535185919370018e-06, + "loss": 0.0368, + "num_input_tokens_seen": 30235072, + "step": 143270 + }, + { + "epoch": 15.761826182618261, + "grad_norm": 0.006724017672240734, + "learning_rate": 6.533567993814549e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30236192, + "step": 143275 + }, + { + "epoch": 15.762376237623762, + "grad_norm": 0.9849879145622253, + "learning_rate": 6.531950238455317e-06, + "loss": 0.0405, + "num_input_tokens_seen": 30237152, + "step": 143280 + }, + { + "epoch": 15.762926292629263, + "grad_norm": 0.07730850577354431, + "learning_rate": 6.530332653307245e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30238176, + "step": 143285 + }, + { + "epoch": 15.763476347634764, + "grad_norm": 0.01724371686577797, + "learning_rate": 6.5287152383852326e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30239200, + "step": 143290 + }, + { + "epoch": 15.764026402640264, + "grad_norm": 0.10947233438491821, + "learning_rate": 6.527097993704181e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30240256, + "step": 143295 + }, + { + "epoch": 15.764576457645765, + "grad_norm": 0.0934283509850502, + "learning_rate": 6.525480919279006e-06, + "loss": 0.0623, + "num_input_tokens_seen": 30241280, + "step": 143300 + }, + { + "epoch": 15.765126512651266, + "grad_norm": 0.0028304315637797117, + "learning_rate": 6.523864015124606e-06, + "loss": 0.0068, + "num_input_tokens_seen": 30242400, + "step": 143305 + }, + { + "epoch": 15.765676567656765, + "grad_norm": 0.037139467895030975, + "learning_rate": 6.5222472812558945e-06, + "loss": 0.007, + "num_input_tokens_seen": 30243488, + "step": 143310 + }, + { + "epoch": 15.766226622662266, + "grad_norm": 0.018679022789001465, + "learning_rate": 6.520630717687762e-06, + "loss": 0.0357, + "num_input_tokens_seen": 30244512, + "step": 143315 + }, + { + "epoch": 15.766776677667767, + "grad_norm": 0.03324102982878685, + "learning_rate": 6.519014324435102e-06, + "loss": 0.0054, + "num_input_tokens_seen": 30245568, + "step": 143320 + }, + { + "epoch": 15.767326732673267, + "grad_norm": 0.9560199975967407, + "learning_rate": 6.51739810151282e-06, + "loss": 0.0103, + "num_input_tokens_seen": 30246624, + "step": 143325 + }, + { + "epoch": 15.767876787678768, + "grad_norm": 0.06001931056380272, + "learning_rate": 6.515782048935809e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30247680, + "step": 143330 + }, + { + "epoch": 15.768426842684269, + "grad_norm": 2.068852424621582, + "learning_rate": 6.514166166718974e-06, + "loss": 0.0709, + "num_input_tokens_seen": 30248768, + "step": 143335 + }, + { + "epoch": 15.768976897689768, + "grad_norm": 0.09396496415138245, + "learning_rate": 6.512550454877198e-06, + "loss": 0.0056, + "num_input_tokens_seen": 30249856, + "step": 143340 + }, + { + "epoch": 15.76952695269527, + "grad_norm": 0.3391481935977936, + "learning_rate": 6.510934913425368e-06, + "loss": 0.0101, + "num_input_tokens_seen": 30250944, + "step": 143345 + }, + { + "epoch": 15.77007700770077, + "grad_norm": 2.2007761001586914, + "learning_rate": 6.509319542378384e-06, + "loss": 0.1541, + "num_input_tokens_seen": 30252064, + "step": 143350 + }, + { + "epoch": 15.770627062706271, + "grad_norm": 0.29299840331077576, + "learning_rate": 6.5077043417511225e-06, + "loss": 0.0062, + "num_input_tokens_seen": 30253088, + "step": 143355 + }, + { + "epoch": 15.77117711771177, + "grad_norm": 0.07699467241764069, + "learning_rate": 6.506089311558478e-06, + "loss": 0.0668, + "num_input_tokens_seen": 30254144, + "step": 143360 + }, + { + "epoch": 15.771727172717272, + "grad_norm": 1.4703950881958008, + "learning_rate": 6.50447445181534e-06, + "loss": 0.0485, + "num_input_tokens_seen": 30255264, + "step": 143365 + }, + { + "epoch": 15.772277227722773, + "grad_norm": 0.021779822185635567, + "learning_rate": 6.502859762536578e-06, + "loss": 0.0089, + "num_input_tokens_seen": 30256320, + "step": 143370 + }, + { + "epoch": 15.772827282728272, + "grad_norm": 0.02512126788496971, + "learning_rate": 6.501245243737089e-06, + "loss": 0.0087, + "num_input_tokens_seen": 30257344, + "step": 143375 + }, + { + "epoch": 15.773377337733773, + "grad_norm": 0.12491214275360107, + "learning_rate": 6.499630895431741e-06, + "loss": 0.0107, + "num_input_tokens_seen": 30258400, + "step": 143380 + }, + { + "epoch": 15.773927392739274, + "grad_norm": 3.5012168884277344, + "learning_rate": 6.498016717635416e-06, + "loss": 0.0252, + "num_input_tokens_seen": 30259424, + "step": 143385 + }, + { + "epoch": 15.774477447744774, + "grad_norm": 0.00976654700934887, + "learning_rate": 6.496402710363e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30260480, + "step": 143390 + }, + { + "epoch": 15.775027502750275, + "grad_norm": 0.010394730605185032, + "learning_rate": 6.494788873629354e-06, + "loss": 0.0423, + "num_input_tokens_seen": 30261568, + "step": 143395 + }, + { + "epoch": 15.775577557755776, + "grad_norm": 1.1306425333023071, + "learning_rate": 6.493175207449367e-06, + "loss": 0.0088, + "num_input_tokens_seen": 30262592, + "step": 143400 + }, + { + "epoch": 15.776127612761275, + "grad_norm": 1.9122480154037476, + "learning_rate": 6.491561711837899e-06, + "loss": 0.0473, + "num_input_tokens_seen": 30263616, + "step": 143405 + }, + { + "epoch": 15.776677667766776, + "grad_norm": 0.07907775789499283, + "learning_rate": 6.48994838680983e-06, + "loss": 0.0581, + "num_input_tokens_seen": 30264704, + "step": 143410 + }, + { + "epoch": 15.777227722772277, + "grad_norm": 0.07343851774930954, + "learning_rate": 6.488335232380016e-06, + "loss": 0.0853, + "num_input_tokens_seen": 30265824, + "step": 143415 + }, + { + "epoch": 15.777777777777779, + "grad_norm": 0.09612169116735458, + "learning_rate": 6.486722248563335e-06, + "loss": 0.003, + "num_input_tokens_seen": 30266880, + "step": 143420 + }, + { + "epoch": 15.778327832783278, + "grad_norm": 0.17088891565799713, + "learning_rate": 6.485109435374659e-06, + "loss": 0.0041, + "num_input_tokens_seen": 30267968, + "step": 143425 + }, + { + "epoch": 15.778877887788779, + "grad_norm": 0.09788205474615097, + "learning_rate": 6.483496792828845e-06, + "loss": 0.0094, + "num_input_tokens_seen": 30269024, + "step": 143430 + }, + { + "epoch": 15.77942794279428, + "grad_norm": 0.07242166996002197, + "learning_rate": 6.481884320940748e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30270112, + "step": 143435 + }, + { + "epoch": 15.77997799779978, + "grad_norm": 0.03280564025044441, + "learning_rate": 6.48027201972524e-06, + "loss": 0.095, + "num_input_tokens_seen": 30271232, + "step": 143440 + }, + { + "epoch": 15.78052805280528, + "grad_norm": 0.06398743391036987, + "learning_rate": 6.478659889197175e-06, + "loss": 0.0054, + "num_input_tokens_seen": 30272320, + "step": 143445 + }, + { + "epoch": 15.781078107810782, + "grad_norm": 0.031056124716997147, + "learning_rate": 6.477047929371421e-06, + "loss": 0.0527, + "num_input_tokens_seen": 30273376, + "step": 143450 + }, + { + "epoch": 15.781628162816281, + "grad_norm": 0.023586522787809372, + "learning_rate": 6.47543614026283e-06, + "loss": 0.0051, + "num_input_tokens_seen": 30274496, + "step": 143455 + }, + { + "epoch": 15.782178217821782, + "grad_norm": 0.04656151682138443, + "learning_rate": 6.473824521886249e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30275520, + "step": 143460 + }, + { + "epoch": 15.782728272827283, + "grad_norm": 0.02551594190299511, + "learning_rate": 6.4722130742565445e-06, + "loss": 0.001, + "num_input_tokens_seen": 30276512, + "step": 143465 + }, + { + "epoch": 15.783278327832782, + "grad_norm": 0.04737529903650284, + "learning_rate": 6.4706017973885535e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30277504, + "step": 143470 + }, + { + "epoch": 15.783828382838283, + "grad_norm": 0.1319412887096405, + "learning_rate": 6.468990691297133e-06, + "loss": 0.0071, + "num_input_tokens_seen": 30278528, + "step": 143475 + }, + { + "epoch": 15.784378437843785, + "grad_norm": 1.1619662046432495, + "learning_rate": 6.467379755997144e-06, + "loss": 0.015, + "num_input_tokens_seen": 30279584, + "step": 143480 + }, + { + "epoch": 15.784928492849286, + "grad_norm": 0.07405834645032883, + "learning_rate": 6.465768991503413e-06, + "loss": 0.0606, + "num_input_tokens_seen": 30280640, + "step": 143485 + }, + { + "epoch": 15.785478547854785, + "grad_norm": 0.021404484286904335, + "learning_rate": 6.4641583978308026e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30281664, + "step": 143490 + }, + { + "epoch": 15.786028602860286, + "grad_norm": 0.018910948187112808, + "learning_rate": 6.462547974994143e-06, + "loss": 0.002, + "num_input_tokens_seen": 30282752, + "step": 143495 + }, + { + "epoch": 15.786578657865787, + "grad_norm": 0.2871209681034088, + "learning_rate": 6.460937723008284e-06, + "loss": 0.0618, + "num_input_tokens_seen": 30283776, + "step": 143500 + }, + { + "epoch": 15.787128712871286, + "grad_norm": 1.1122030019760132, + "learning_rate": 6.459327641888074e-06, + "loss": 0.0427, + "num_input_tokens_seen": 30284800, + "step": 143505 + }, + { + "epoch": 15.787678767876788, + "grad_norm": 0.18209168314933777, + "learning_rate": 6.4577177316483386e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30285888, + "step": 143510 + }, + { + "epoch": 15.788228822882289, + "grad_norm": 0.7244780659675598, + "learning_rate": 6.456107992303926e-06, + "loss": 0.0084, + "num_input_tokens_seen": 30286944, + "step": 143515 + }, + { + "epoch": 15.788778877887788, + "grad_norm": 0.043755803257226944, + "learning_rate": 6.454498423869659e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30287936, + "step": 143520 + }, + { + "epoch": 15.789328932893289, + "grad_norm": 4.44874382019043, + "learning_rate": 6.4528890263603925e-06, + "loss": 0.1314, + "num_input_tokens_seen": 30288992, + "step": 143525 + }, + { + "epoch": 15.78987898789879, + "grad_norm": 0.0773204118013382, + "learning_rate": 6.451279799790938e-06, + "loss": 0.007, + "num_input_tokens_seen": 30290048, + "step": 143530 + }, + { + "epoch": 15.79042904290429, + "grad_norm": 0.08193949609994888, + "learning_rate": 6.44967074417614e-06, + "loss": 0.0966, + "num_input_tokens_seen": 30291040, + "step": 143535 + }, + { + "epoch": 15.79097909790979, + "grad_norm": 0.03852475434541702, + "learning_rate": 6.44806185953083e-06, + "loss": 0.0097, + "num_input_tokens_seen": 30292064, + "step": 143540 + }, + { + "epoch": 15.791529152915292, + "grad_norm": 0.05173359811306, + "learning_rate": 6.446453145869824e-06, + "loss": 0.1085, + "num_input_tokens_seen": 30293152, + "step": 143545 + }, + { + "epoch": 15.792079207920793, + "grad_norm": 0.0891198068857193, + "learning_rate": 6.444844603207967e-06, + "loss": 0.0044, + "num_input_tokens_seen": 30294240, + "step": 143550 + }, + { + "epoch": 15.792629262926292, + "grad_norm": 0.04210137575864792, + "learning_rate": 6.443236231560063e-06, + "loss": 0.0693, + "num_input_tokens_seen": 30295264, + "step": 143555 + }, + { + "epoch": 15.793179317931793, + "grad_norm": 0.6721213459968567, + "learning_rate": 6.4416280309409466e-06, + "loss": 0.01, + "num_input_tokens_seen": 30296256, + "step": 143560 + }, + { + "epoch": 15.793729372937294, + "grad_norm": 0.19954869151115417, + "learning_rate": 6.440020001365449e-06, + "loss": 0.0047, + "num_input_tokens_seen": 30297312, + "step": 143565 + }, + { + "epoch": 15.794279427942794, + "grad_norm": 0.6248469352722168, + "learning_rate": 6.4384121428483715e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30298400, + "step": 143570 + }, + { + "epoch": 15.794829482948295, + "grad_norm": 0.10324928164482117, + "learning_rate": 6.436804455404552e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30299456, + "step": 143575 + }, + { + "epoch": 15.795379537953796, + "grad_norm": 0.1341983675956726, + "learning_rate": 6.435196939048796e-06, + "loss": 0.0057, + "num_input_tokens_seen": 30300480, + "step": 143580 + }, + { + "epoch": 15.795929592959295, + "grad_norm": 0.08321550488471985, + "learning_rate": 6.433589593795917e-06, + "loss": 0.0458, + "num_input_tokens_seen": 30301600, + "step": 143585 + }, + { + "epoch": 15.796479647964796, + "grad_norm": 0.06967926770448685, + "learning_rate": 6.431982419660732e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30302656, + "step": 143590 + }, + { + "epoch": 15.797029702970297, + "grad_norm": 1.0585469007492065, + "learning_rate": 6.430375416658055e-06, + "loss": 0.0061, + "num_input_tokens_seen": 30303712, + "step": 143595 + }, + { + "epoch": 15.797579757975798, + "grad_norm": 0.012101155705749989, + "learning_rate": 6.428768584802705e-06, + "loss": 0.0188, + "num_input_tokens_seen": 30304800, + "step": 143600 + }, + { + "epoch": 15.798129812981298, + "grad_norm": 0.017360888421535492, + "learning_rate": 6.4271619241094845e-06, + "loss": 0.0078, + "num_input_tokens_seen": 30305824, + "step": 143605 + }, + { + "epoch": 15.798679867986799, + "grad_norm": 0.11168412119150162, + "learning_rate": 6.425555434593191e-06, + "loss": 0.002, + "num_input_tokens_seen": 30306848, + "step": 143610 + }, + { + "epoch": 15.7992299229923, + "grad_norm": 0.9189009070396423, + "learning_rate": 6.42394911626864e-06, + "loss": 0.0377, + "num_input_tokens_seen": 30307936, + "step": 143615 + }, + { + "epoch": 15.7997799779978, + "grad_norm": 0.022144554182887077, + "learning_rate": 6.422342969150646e-06, + "loss": 0.0802, + "num_input_tokens_seen": 30308992, + "step": 143620 + }, + { + "epoch": 15.8003300330033, + "grad_norm": 0.01787383295595646, + "learning_rate": 6.420736993253995e-06, + "loss": 0.0036, + "num_input_tokens_seen": 30310048, + "step": 143625 + }, + { + "epoch": 15.800880088008801, + "grad_norm": 1.3124454021453857, + "learning_rate": 6.4191311885935026e-06, + "loss": 0.0186, + "num_input_tokens_seen": 30311040, + "step": 143630 + }, + { + "epoch": 15.8014301430143, + "grad_norm": 0.2823297083377838, + "learning_rate": 6.417525555183954e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30312096, + "step": 143635 + }, + { + "epoch": 15.801980198019802, + "grad_norm": 0.21651723980903625, + "learning_rate": 6.415920093040162e-06, + "loss": 0.0463, + "num_input_tokens_seen": 30313120, + "step": 143640 + }, + { + "epoch": 15.802530253025303, + "grad_norm": 0.018891990184783936, + "learning_rate": 6.414314802176913e-06, + "loss": 0.004, + "num_input_tokens_seen": 30314240, + "step": 143645 + }, + { + "epoch": 15.803080308030804, + "grad_norm": 0.39176857471466064, + "learning_rate": 6.412709682609003e-06, + "loss": 0.0486, + "num_input_tokens_seen": 30315328, + "step": 143650 + }, + { + "epoch": 15.803630363036303, + "grad_norm": 0.02601618319749832, + "learning_rate": 6.4111047343512365e-06, + "loss": 0.0582, + "num_input_tokens_seen": 30316384, + "step": 143655 + }, + { + "epoch": 15.804180418041804, + "grad_norm": 0.1104639321565628, + "learning_rate": 6.4094999574183936e-06, + "loss": 0.098, + "num_input_tokens_seen": 30317472, + "step": 143660 + }, + { + "epoch": 15.804730473047305, + "grad_norm": 0.563442587852478, + "learning_rate": 6.407895351825274e-06, + "loss": 0.0512, + "num_input_tokens_seen": 30318496, + "step": 143665 + }, + { + "epoch": 15.805280528052805, + "grad_norm": 0.020196931436657906, + "learning_rate": 6.406290917586655e-06, + "loss": 0.0026, + "num_input_tokens_seen": 30319584, + "step": 143670 + }, + { + "epoch": 15.805830583058306, + "grad_norm": 1.198996663093567, + "learning_rate": 6.404686654717329e-06, + "loss": 0.0297, + "num_input_tokens_seen": 30320672, + "step": 143675 + }, + { + "epoch": 15.806380638063807, + "grad_norm": 2.2738773822784424, + "learning_rate": 6.4030825632320905e-06, + "loss": 0.0979, + "num_input_tokens_seen": 30321760, + "step": 143680 + }, + { + "epoch": 15.806930693069306, + "grad_norm": 0.1222858726978302, + "learning_rate": 6.401478643145711e-06, + "loss": 0.0107, + "num_input_tokens_seen": 30322784, + "step": 143685 + }, + { + "epoch": 15.807480748074807, + "grad_norm": 0.08070660382509232, + "learning_rate": 6.399874894472985e-06, + "loss": 0.0118, + "num_input_tokens_seen": 30323808, + "step": 143690 + }, + { + "epoch": 15.808030803080309, + "grad_norm": 0.13219593465328217, + "learning_rate": 6.398271317228685e-06, + "loss": 0.0115, + "num_input_tokens_seen": 30324864, + "step": 143695 + }, + { + "epoch": 15.808580858085808, + "grad_norm": 0.04927186667919159, + "learning_rate": 6.396667911427587e-06, + "loss": 0.0043, + "num_input_tokens_seen": 30325920, + "step": 143700 + }, + { + "epoch": 15.809130913091309, + "grad_norm": 0.8836777210235596, + "learning_rate": 6.395064677084475e-06, + "loss": 0.0352, + "num_input_tokens_seen": 30327040, + "step": 143705 + }, + { + "epoch": 15.80968096809681, + "grad_norm": 0.17266711592674255, + "learning_rate": 6.393461614214122e-06, + "loss": 0.1139, + "num_input_tokens_seen": 30328096, + "step": 143710 + }, + { + "epoch": 15.810231023102311, + "grad_norm": 0.31836527585983276, + "learning_rate": 6.391858722831312e-06, + "loss": 0.0099, + "num_input_tokens_seen": 30329120, + "step": 143715 + }, + { + "epoch": 15.81078107810781, + "grad_norm": 0.06127673760056496, + "learning_rate": 6.390256002950815e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30330176, + "step": 143720 + }, + { + "epoch": 15.811331133113312, + "grad_norm": 0.6997557282447815, + "learning_rate": 6.388653454587387e-06, + "loss": 0.0053, + "num_input_tokens_seen": 30331168, + "step": 143725 + }, + { + "epoch": 15.811881188118813, + "grad_norm": 0.013794376514852047, + "learning_rate": 6.387051077755812e-06, + "loss": 0.0061, + "num_input_tokens_seen": 30332224, + "step": 143730 + }, + { + "epoch": 15.812431243124312, + "grad_norm": 0.10140956938266754, + "learning_rate": 6.3854488724708564e-06, + "loss": 0.0026, + "num_input_tokens_seen": 30333280, + "step": 143735 + }, + { + "epoch": 15.812981298129813, + "grad_norm": 0.08006805926561356, + "learning_rate": 6.38384683874729e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30334336, + "step": 143740 + }, + { + "epoch": 15.813531353135314, + "grad_norm": 0.01941293105483055, + "learning_rate": 6.382244976599877e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30335392, + "step": 143745 + }, + { + "epoch": 15.814081408140813, + "grad_norm": 0.011521860025823116, + "learning_rate": 6.380643286043372e-06, + "loss": 0.0485, + "num_input_tokens_seen": 30336448, + "step": 143750 + }, + { + "epoch": 15.814631463146315, + "grad_norm": 1.6403205394744873, + "learning_rate": 6.37904176709255e-06, + "loss": 0.0249, + "num_input_tokens_seen": 30337472, + "step": 143755 + }, + { + "epoch": 15.815181518151816, + "grad_norm": 0.038037899881601334, + "learning_rate": 6.377440419762157e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30338560, + "step": 143760 + }, + { + "epoch": 15.815731573157315, + "grad_norm": 0.1313694268465042, + "learning_rate": 6.3758392440669584e-06, + "loss": 0.0053, + "num_input_tokens_seen": 30339584, + "step": 143765 + }, + { + "epoch": 15.816281628162816, + "grad_norm": 0.021773142740130424, + "learning_rate": 6.374238240021721e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30340640, + "step": 143770 + }, + { + "epoch": 15.816831683168317, + "grad_norm": 0.031394269317388535, + "learning_rate": 6.372637407641186e-06, + "loss": 0.0546, + "num_input_tokens_seen": 30341696, + "step": 143775 + }, + { + "epoch": 15.817381738173818, + "grad_norm": 0.42500394582748413, + "learning_rate": 6.371036746940121e-06, + "loss": 0.0063, + "num_input_tokens_seen": 30342752, + "step": 143780 + }, + { + "epoch": 15.817931793179318, + "grad_norm": 0.00835930835455656, + "learning_rate": 6.369436257933264e-06, + "loss": 0.0062, + "num_input_tokens_seen": 30343744, + "step": 143785 + }, + { + "epoch": 15.818481848184819, + "grad_norm": 0.017430895939469337, + "learning_rate": 6.367835940635372e-06, + "loss": 0.0056, + "num_input_tokens_seen": 30344768, + "step": 143790 + }, + { + "epoch": 15.81903190319032, + "grad_norm": 0.029575079679489136, + "learning_rate": 6.366235795061204e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30345856, + "step": 143795 + }, + { + "epoch": 15.819581958195819, + "grad_norm": 0.049740344285964966, + "learning_rate": 6.36463582122549e-06, + "loss": 0.0128, + "num_input_tokens_seen": 30346848, + "step": 143800 + }, + { + "epoch": 15.82013201320132, + "grad_norm": 0.04437730088829994, + "learning_rate": 6.3630360191429955e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30347872, + "step": 143805 + }, + { + "epoch": 15.820682068206821, + "grad_norm": 0.3930281102657318, + "learning_rate": 6.361436388828454e-06, + "loss": 0.0175, + "num_input_tokens_seen": 30348992, + "step": 143810 + }, + { + "epoch": 15.82123212321232, + "grad_norm": 0.031679555773735046, + "learning_rate": 6.3598369302966e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30350048, + "step": 143815 + }, + { + "epoch": 15.821782178217822, + "grad_norm": 0.07589029520750046, + "learning_rate": 6.35823764356219e-06, + "loss": 0.0105, + "num_input_tokens_seen": 30351072, + "step": 143820 + }, + { + "epoch": 15.822332233223323, + "grad_norm": 0.08469068259000778, + "learning_rate": 6.356638528639955e-06, + "loss": 0.0505, + "num_input_tokens_seen": 30352096, + "step": 143825 + }, + { + "epoch": 15.822882288228822, + "grad_norm": 0.023779449984431267, + "learning_rate": 6.355039585544642e-06, + "loss": 0.0067, + "num_input_tokens_seen": 30353152, + "step": 143830 + }, + { + "epoch": 15.823432343234323, + "grad_norm": 0.008467351086437702, + "learning_rate": 6.353440814290987e-06, + "loss": 0.0065, + "num_input_tokens_seen": 30354240, + "step": 143835 + }, + { + "epoch": 15.823982398239824, + "grad_norm": 0.22099368274211884, + "learning_rate": 6.35184221489371e-06, + "loss": 0.1359, + "num_input_tokens_seen": 30355360, + "step": 143840 + }, + { + "epoch": 15.824532453245325, + "grad_norm": 0.017580142244696617, + "learning_rate": 6.350243787367563e-06, + "loss": 0.0064, + "num_input_tokens_seen": 30356416, + "step": 143845 + }, + { + "epoch": 15.825082508250825, + "grad_norm": 0.006401800084859133, + "learning_rate": 6.348645531727257e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30357472, + "step": 143850 + }, + { + "epoch": 15.825632563256326, + "grad_norm": 0.026251638308167458, + "learning_rate": 6.3470474479875475e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30358496, + "step": 143855 + }, + { + "epoch": 15.826182618261827, + "grad_norm": 0.04974013566970825, + "learning_rate": 6.345449536163153e-06, + "loss": 0.0074, + "num_input_tokens_seen": 30359552, + "step": 143860 + }, + { + "epoch": 15.826732673267326, + "grad_norm": 0.07912944257259369, + "learning_rate": 6.34385179626879e-06, + "loss": 0.0166, + "num_input_tokens_seen": 30360640, + "step": 143865 + }, + { + "epoch": 15.827282728272827, + "grad_norm": 0.018186477944254875, + "learning_rate": 6.3422542283192e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30361728, + "step": 143870 + }, + { + "epoch": 15.827832783278328, + "grad_norm": 0.033981334418058395, + "learning_rate": 6.3406568323290946e-06, + "loss": 0.0026, + "num_input_tokens_seen": 30362816, + "step": 143875 + }, + { + "epoch": 15.828382838283828, + "grad_norm": 0.036395393311977386, + "learning_rate": 6.3390596083132e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30363904, + "step": 143880 + }, + { + "epoch": 15.828932893289329, + "grad_norm": 0.3516717851161957, + "learning_rate": 6.337462556286247e-06, + "loss": 0.0058, + "num_input_tokens_seen": 30364960, + "step": 143885 + }, + { + "epoch": 15.82948294829483, + "grad_norm": 0.20829714834690094, + "learning_rate": 6.3358656762629395e-06, + "loss": 0.0835, + "num_input_tokens_seen": 30366016, + "step": 143890 + }, + { + "epoch": 15.83003300330033, + "grad_norm": 0.4181526303291321, + "learning_rate": 6.3342689682580085e-06, + "loss": 0.0078, + "num_input_tokens_seen": 30367040, + "step": 143895 + }, + { + "epoch": 15.83058305830583, + "grad_norm": 0.004957317840307951, + "learning_rate": 6.332672432286154e-06, + "loss": 0.0557, + "num_input_tokens_seen": 30368064, + "step": 143900 + }, + { + "epoch": 15.831133113311331, + "grad_norm": 0.006262457463890314, + "learning_rate": 6.331076068362104e-06, + "loss": 0.1174, + "num_input_tokens_seen": 30369088, + "step": 143905 + }, + { + "epoch": 15.831683168316832, + "grad_norm": 0.06285975873470306, + "learning_rate": 6.329479876500574e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30370176, + "step": 143910 + }, + { + "epoch": 15.832233223322332, + "grad_norm": 1.4584343433380127, + "learning_rate": 6.32788385671626e-06, + "loss": 0.0106, + "num_input_tokens_seen": 30371232, + "step": 143915 + }, + { + "epoch": 15.832783278327833, + "grad_norm": 0.010347685776650906, + "learning_rate": 6.32628800902389e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30372288, + "step": 143920 + }, + { + "epoch": 15.833333333333334, + "grad_norm": 0.5454383492469788, + "learning_rate": 6.324692333438156e-06, + "loss": 0.0074, + "num_input_tokens_seen": 30373312, + "step": 143925 + }, + { + "epoch": 15.833883388338833, + "grad_norm": 1.7291433811187744, + "learning_rate": 6.323096829973776e-06, + "loss": 0.0443, + "num_input_tokens_seen": 30374272, + "step": 143930 + }, + { + "epoch": 15.834433443344334, + "grad_norm": 0.01053988840430975, + "learning_rate": 6.321501498645446e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30375392, + "step": 143935 + }, + { + "epoch": 15.834983498349835, + "grad_norm": 0.026859698817133904, + "learning_rate": 6.3199063394678735e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30376448, + "step": 143940 + }, + { + "epoch": 15.835533553355335, + "grad_norm": 0.04256056249141693, + "learning_rate": 6.318311352455766e-06, + "loss": 0.0086, + "num_input_tokens_seen": 30377504, + "step": 143945 + }, + { + "epoch": 15.836083608360836, + "grad_norm": 0.024751313030719757, + "learning_rate": 6.316716537623813e-06, + "loss": 0.031, + "num_input_tokens_seen": 30378528, + "step": 143950 + }, + { + "epoch": 15.836633663366337, + "grad_norm": 0.10874924063682556, + "learning_rate": 6.3151218949867255e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30379616, + "step": 143955 + }, + { + "epoch": 15.837183718371836, + "grad_norm": 1.9058570861816406, + "learning_rate": 6.313527424559193e-06, + "loss": 0.0501, + "num_input_tokens_seen": 30380640, + "step": 143960 + }, + { + "epoch": 15.837733773377337, + "grad_norm": 0.1691751480102539, + "learning_rate": 6.3119331263558984e-06, + "loss": 0.002, + "num_input_tokens_seen": 30381664, + "step": 143965 + }, + { + "epoch": 15.838283828382838, + "grad_norm": 0.06753017008304596, + "learning_rate": 6.310339000391563e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30382848, + "step": 143970 + }, + { + "epoch": 15.83883388338834, + "grad_norm": 0.02001316472887993, + "learning_rate": 6.308745046680856e-06, + "loss": 0.0005, + "num_input_tokens_seen": 30383936, + "step": 143975 + }, + { + "epoch": 15.839383938393839, + "grad_norm": 0.14006082713603973, + "learning_rate": 6.307151265238484e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30384992, + "step": 143980 + }, + { + "epoch": 15.83993399339934, + "grad_norm": 0.03987819701433182, + "learning_rate": 6.305557656079131e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30386112, + "step": 143985 + }, + { + "epoch": 15.840484048404841, + "grad_norm": 0.017768343910574913, + "learning_rate": 6.303964219217476e-06, + "loss": 0.0006, + "num_input_tokens_seen": 30387200, + "step": 143990 + }, + { + "epoch": 15.84103410341034, + "grad_norm": 0.013467025011777878, + "learning_rate": 6.30237095466821e-06, + "loss": 0.0024, + "num_input_tokens_seen": 30388256, + "step": 143995 + }, + { + "epoch": 15.841584158415841, + "grad_norm": 1.6744855642318726, + "learning_rate": 6.30077786244602e-06, + "loss": 0.023, + "num_input_tokens_seen": 30389280, + "step": 144000 + }, + { + "epoch": 15.842134213421343, + "grad_norm": 0.009512504562735558, + "learning_rate": 6.299184942565592e-06, + "loss": 0.012, + "num_input_tokens_seen": 30390272, + "step": 144005 + }, + { + "epoch": 15.842684268426842, + "grad_norm": 0.1258317232131958, + "learning_rate": 6.297592195041605e-06, + "loss": 0.0039, + "num_input_tokens_seen": 30391328, + "step": 144010 + }, + { + "epoch": 15.843234323432343, + "grad_norm": 3.8564329147338867, + "learning_rate": 6.295999619888729e-06, + "loss": 0.1061, + "num_input_tokens_seen": 30392384, + "step": 144015 + }, + { + "epoch": 15.843784378437844, + "grad_norm": 0.023399151861667633, + "learning_rate": 6.2944072171216564e-06, + "loss": 0.0077, + "num_input_tokens_seen": 30393408, + "step": 144020 + }, + { + "epoch": 15.844334433443345, + "grad_norm": 0.6348488926887512, + "learning_rate": 6.2928149867550495e-06, + "loss": 0.0237, + "num_input_tokens_seen": 30394464, + "step": 144025 + }, + { + "epoch": 15.844884488448844, + "grad_norm": 1.6114181280136108, + "learning_rate": 6.291222928803592e-06, + "loss": 0.164, + "num_input_tokens_seen": 30395520, + "step": 144030 + }, + { + "epoch": 15.845434543454346, + "grad_norm": 0.03544428572058678, + "learning_rate": 6.28963104328196e-06, + "loss": 0.0176, + "num_input_tokens_seen": 30396512, + "step": 144035 + }, + { + "epoch": 15.845984598459847, + "grad_norm": 0.48002299666404724, + "learning_rate": 6.288039330204815e-06, + "loss": 0.0051, + "num_input_tokens_seen": 30397536, + "step": 144040 + }, + { + "epoch": 15.846534653465346, + "grad_norm": 0.5966328978538513, + "learning_rate": 6.2864477895868415e-06, + "loss": 0.0159, + "num_input_tokens_seen": 30398528, + "step": 144045 + }, + { + "epoch": 15.847084708470847, + "grad_norm": 0.03573514148592949, + "learning_rate": 6.284856421442689e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30399552, + "step": 144050 + }, + { + "epoch": 15.847634763476348, + "grad_norm": 0.07686898112297058, + "learning_rate": 6.283265225787036e-06, + "loss": 0.028, + "num_input_tokens_seen": 30400608, + "step": 144055 + }, + { + "epoch": 15.848184818481847, + "grad_norm": 0.09568673372268677, + "learning_rate": 6.281674202634552e-06, + "loss": 0.0109, + "num_input_tokens_seen": 30401664, + "step": 144060 + }, + { + "epoch": 15.848734873487349, + "grad_norm": 1.3331729173660278, + "learning_rate": 6.280083351999888e-06, + "loss": 0.0645, + "num_input_tokens_seen": 30402752, + "step": 144065 + }, + { + "epoch": 15.84928492849285, + "grad_norm": 0.003137866733595729, + "learning_rate": 6.27849267389772e-06, + "loss": 0.0393, + "num_input_tokens_seen": 30403808, + "step": 144070 + }, + { + "epoch": 15.84983498349835, + "grad_norm": 0.009410783648490906, + "learning_rate": 6.276902168342702e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30404832, + "step": 144075 + }, + { + "epoch": 15.85038503850385, + "grad_norm": 1.3859211206436157, + "learning_rate": 6.275311835349487e-06, + "loss": 0.0266, + "num_input_tokens_seen": 30405824, + "step": 144080 + }, + { + "epoch": 15.850935093509351, + "grad_norm": 1.9543792009353638, + "learning_rate": 6.273721674932737e-06, + "loss": 0.083, + "num_input_tokens_seen": 30406816, + "step": 144085 + }, + { + "epoch": 15.851485148514852, + "grad_norm": 3.416489362716675, + "learning_rate": 6.272131687107108e-06, + "loss": 0.0516, + "num_input_tokens_seen": 30407936, + "step": 144090 + }, + { + "epoch": 15.852035203520352, + "grad_norm": 0.0374533087015152, + "learning_rate": 6.270541871887261e-06, + "loss": 0.0468, + "num_input_tokens_seen": 30408960, + "step": 144095 + }, + { + "epoch": 15.852585258525853, + "grad_norm": 0.030425451695919037, + "learning_rate": 6.268952229287842e-06, + "loss": 0.002, + "num_input_tokens_seen": 30409952, + "step": 144100 + }, + { + "epoch": 15.853135313531354, + "grad_norm": 0.015663864091038704, + "learning_rate": 6.267362759323498e-06, + "loss": 0.0047, + "num_input_tokens_seen": 30410944, + "step": 144105 + }, + { + "epoch": 15.853685368536853, + "grad_norm": 0.7984446883201599, + "learning_rate": 6.265773462008881e-06, + "loss": 0.0104, + "num_input_tokens_seen": 30412064, + "step": 144110 + }, + { + "epoch": 15.854235423542354, + "grad_norm": 0.5506104826927185, + "learning_rate": 6.264184337358639e-06, + "loss": 0.0044, + "num_input_tokens_seen": 30413120, + "step": 144115 + }, + { + "epoch": 15.854785478547855, + "grad_norm": 0.039929501712322235, + "learning_rate": 6.262595385387429e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30414240, + "step": 144120 + }, + { + "epoch": 15.855335533553355, + "grad_norm": 2.0918161869049072, + "learning_rate": 6.261006606109884e-06, + "loss": 0.0373, + "num_input_tokens_seen": 30415264, + "step": 144125 + }, + { + "epoch": 15.855885588558856, + "grad_norm": 0.008721664547920227, + "learning_rate": 6.259417999540645e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30416256, + "step": 144130 + }, + { + "epoch": 15.856435643564357, + "grad_norm": 0.008720527403056622, + "learning_rate": 6.257829565694365e-06, + "loss": 0.0095, + "num_input_tokens_seen": 30417248, + "step": 144135 + }, + { + "epoch": 15.856985698569858, + "grad_norm": 0.04412417858839035, + "learning_rate": 6.256241304585667e-06, + "loss": 0.129, + "num_input_tokens_seen": 30418304, + "step": 144140 + }, + { + "epoch": 15.857535753575357, + "grad_norm": 0.054785918444395065, + "learning_rate": 6.254653216229198e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30419328, + "step": 144145 + }, + { + "epoch": 15.858085808580858, + "grad_norm": 0.13393905758857727, + "learning_rate": 6.253065300639604e-06, + "loss": 0.0095, + "num_input_tokens_seen": 30420416, + "step": 144150 + }, + { + "epoch": 15.85863586358636, + "grad_norm": 2.170393705368042, + "learning_rate": 6.251477557831506e-06, + "loss": 0.0266, + "num_input_tokens_seen": 30421504, + "step": 144155 + }, + { + "epoch": 15.859185918591859, + "grad_norm": 0.02662472613155842, + "learning_rate": 6.249889987819549e-06, + "loss": 0.0238, + "num_input_tokens_seen": 30422528, + "step": 144160 + }, + { + "epoch": 15.85973597359736, + "grad_norm": 0.08103436231613159, + "learning_rate": 6.248302590618349e-06, + "loss": 0.0043, + "num_input_tokens_seen": 30423552, + "step": 144165 + }, + { + "epoch": 15.86028602860286, + "grad_norm": 0.08424148708581924, + "learning_rate": 6.24671536624255e-06, + "loss": 0.003, + "num_input_tokens_seen": 30424576, + "step": 144170 + }, + { + "epoch": 15.86083608360836, + "grad_norm": 1.425411343574524, + "learning_rate": 6.245128314706783e-06, + "loss": 0.0172, + "num_input_tokens_seen": 30425664, + "step": 144175 + }, + { + "epoch": 15.861386138613861, + "grad_norm": 0.10055513679981232, + "learning_rate": 6.2435414360256596e-06, + "loss": 0.003, + "num_input_tokens_seen": 30426752, + "step": 144180 + }, + { + "epoch": 15.861936193619362, + "grad_norm": 0.031855508685112, + "learning_rate": 6.241954730213823e-06, + "loss": 0.0111, + "num_input_tokens_seen": 30427808, + "step": 144185 + }, + { + "epoch": 15.862486248624862, + "grad_norm": 0.08796636015176773, + "learning_rate": 6.240368197285881e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30428864, + "step": 144190 + }, + { + "epoch": 15.863036303630363, + "grad_norm": 2.9305810928344727, + "learning_rate": 6.238781837256472e-06, + "loss": 0.0377, + "num_input_tokens_seen": 30429888, + "step": 144195 + }, + { + "epoch": 15.863586358635864, + "grad_norm": 0.025728730484843254, + "learning_rate": 6.237195650140201e-06, + "loss": 0.0079, + "num_input_tokens_seen": 30431008, + "step": 144200 + }, + { + "epoch": 15.864136413641365, + "grad_norm": 0.04842870682477951, + "learning_rate": 6.235609635951692e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30432128, + "step": 144205 + }, + { + "epoch": 15.864686468646864, + "grad_norm": 0.04712526127696037, + "learning_rate": 6.234023794705574e-06, + "loss": 0.0401, + "num_input_tokens_seen": 30433184, + "step": 144210 + }, + { + "epoch": 15.865236523652365, + "grad_norm": 0.009031401015818119, + "learning_rate": 6.232438126416454e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30434272, + "step": 144215 + }, + { + "epoch": 15.865786578657866, + "grad_norm": 0.021999504417181015, + "learning_rate": 6.230852631098941e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30435392, + "step": 144220 + }, + { + "epoch": 15.866336633663366, + "grad_norm": 2.4989116191864014, + "learning_rate": 6.22926730876765e-06, + "loss": 0.0754, + "num_input_tokens_seen": 30436448, + "step": 144225 + }, + { + "epoch": 15.866886688668867, + "grad_norm": 0.14592821896076202, + "learning_rate": 6.227682159437195e-06, + "loss": 0.0058, + "num_input_tokens_seen": 30437472, + "step": 144230 + }, + { + "epoch": 15.867436743674368, + "grad_norm": 0.03023061901330948, + "learning_rate": 6.2260971831221956e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30438560, + "step": 144235 + }, + { + "epoch": 15.867986798679867, + "grad_norm": 0.5589037537574768, + "learning_rate": 6.224512379837247e-06, + "loss": 0.0057, + "num_input_tokens_seen": 30439584, + "step": 144240 + }, + { + "epoch": 15.868536853685368, + "grad_norm": 0.027580566704273224, + "learning_rate": 6.2229277495969515e-06, + "loss": 0.0098, + "num_input_tokens_seen": 30440672, + "step": 144245 + }, + { + "epoch": 15.86908690869087, + "grad_norm": 0.014783380553126335, + "learning_rate": 6.2213432924159296e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30441664, + "step": 144250 + }, + { + "epoch": 15.869636963696369, + "grad_norm": 0.010383245535194874, + "learning_rate": 6.219759008308768e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30442720, + "step": 144255 + }, + { + "epoch": 15.87018701870187, + "grad_norm": 0.01307919342070818, + "learning_rate": 6.218174897290077e-06, + "loss": 0.0251, + "num_input_tokens_seen": 30443744, + "step": 144260 + }, + { + "epoch": 15.870737073707371, + "grad_norm": 0.3008197546005249, + "learning_rate": 6.21659095937446e-06, + "loss": 0.04, + "num_input_tokens_seen": 30444864, + "step": 144265 + }, + { + "epoch": 15.871287128712872, + "grad_norm": 0.004683095961809158, + "learning_rate": 6.215007194576505e-06, + "loss": 0.1292, + "num_input_tokens_seen": 30445952, + "step": 144270 + }, + { + "epoch": 15.871837183718371, + "grad_norm": 0.09548845142126083, + "learning_rate": 6.2134236029108225e-06, + "loss": 0.0969, + "num_input_tokens_seen": 30447072, + "step": 144275 + }, + { + "epoch": 15.872387238723872, + "grad_norm": 0.012324187904596329, + "learning_rate": 6.211840184391993e-06, + "loss": 0.1106, + "num_input_tokens_seen": 30448096, + "step": 144280 + }, + { + "epoch": 15.872937293729374, + "grad_norm": 0.004983154125511646, + "learning_rate": 6.210256939034617e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30449120, + "step": 144285 + }, + { + "epoch": 15.873487348734873, + "grad_norm": 0.1158214658498764, + "learning_rate": 6.208673866853293e-06, + "loss": 0.0067, + "num_input_tokens_seen": 30450208, + "step": 144290 + }, + { + "epoch": 15.874037403740374, + "grad_norm": 0.9045621752738953, + "learning_rate": 6.2070909678626e-06, + "loss": 0.0069, + "num_input_tokens_seen": 30451296, + "step": 144295 + }, + { + "epoch": 15.874587458745875, + "grad_norm": 0.02687167376279831, + "learning_rate": 6.205508242077138e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30452352, + "step": 144300 + }, + { + "epoch": 15.875137513751374, + "grad_norm": 0.010053335689008236, + "learning_rate": 6.203925689511478e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30453408, + "step": 144305 + }, + { + "epoch": 15.875687568756875, + "grad_norm": 0.005774384364485741, + "learning_rate": 6.202343310180225e-06, + "loss": 0.0209, + "num_input_tokens_seen": 30454368, + "step": 144310 + }, + { + "epoch": 15.876237623762377, + "grad_norm": 0.020763365551829338, + "learning_rate": 6.200761104097946e-06, + "loss": 0.066, + "num_input_tokens_seen": 30455456, + "step": 144315 + }, + { + "epoch": 15.876787678767876, + "grad_norm": 0.01261898037046194, + "learning_rate": 6.199179071279232e-06, + "loss": 0.0914, + "num_input_tokens_seen": 30456480, + "step": 144320 + }, + { + "epoch": 15.877337733773377, + "grad_norm": 0.030884699895977974, + "learning_rate": 6.19759721173867e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30457536, + "step": 144325 + }, + { + "epoch": 15.877887788778878, + "grad_norm": 4.376774311065674, + "learning_rate": 6.196015525490825e-06, + "loss": 0.0951, + "num_input_tokens_seen": 30458624, + "step": 144330 + }, + { + "epoch": 15.87843784378438, + "grad_norm": 0.011653625406324863, + "learning_rate": 6.194434012550291e-06, + "loss": 0.0324, + "num_input_tokens_seen": 30459680, + "step": 144335 + }, + { + "epoch": 15.878987898789878, + "grad_norm": 0.03475311025977135, + "learning_rate": 6.192852672931626e-06, + "loss": 0.002, + "num_input_tokens_seen": 30460704, + "step": 144340 + }, + { + "epoch": 15.87953795379538, + "grad_norm": 0.013332840986549854, + "learning_rate": 6.191271506649413e-06, + "loss": 0.0688, + "num_input_tokens_seen": 30461728, + "step": 144345 + }, + { + "epoch": 15.88008800880088, + "grad_norm": 0.00616608839482069, + "learning_rate": 6.189690513718232e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30462816, + "step": 144350 + }, + { + "epoch": 15.88063806380638, + "grad_norm": 0.27301985025405884, + "learning_rate": 6.188109694152641e-06, + "loss": 0.0565, + "num_input_tokens_seen": 30463936, + "step": 144355 + }, + { + "epoch": 15.881188118811881, + "grad_norm": 0.006997459568083286, + "learning_rate": 6.1865290479672235e-06, + "loss": 0.0607, + "num_input_tokens_seen": 30465024, + "step": 144360 + }, + { + "epoch": 15.881738173817382, + "grad_norm": 0.08980752527713776, + "learning_rate": 6.184948575176538e-06, + "loss": 0.0349, + "num_input_tokens_seen": 30466176, + "step": 144365 + }, + { + "epoch": 15.882288228822881, + "grad_norm": 0.0676497146487236, + "learning_rate": 6.183368275795148e-06, + "loss": 0.0076, + "num_input_tokens_seen": 30467296, + "step": 144370 + }, + { + "epoch": 15.882838283828383, + "grad_norm": 0.08729438483715057, + "learning_rate": 6.181788149837625e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30468320, + "step": 144375 + }, + { + "epoch": 15.883388338833884, + "grad_norm": 3.5548417568206787, + "learning_rate": 6.18020819731853e-06, + "loss": 0.03, + "num_input_tokens_seen": 30469408, + "step": 144380 + }, + { + "epoch": 15.883938393839383, + "grad_norm": 0.024762162938714027, + "learning_rate": 6.17862841825243e-06, + "loss": 0.0104, + "num_input_tokens_seen": 30470464, + "step": 144385 + }, + { + "epoch": 15.884488448844884, + "grad_norm": 0.04181085526943207, + "learning_rate": 6.177048812653882e-06, + "loss": 0.0474, + "num_input_tokens_seen": 30471488, + "step": 144390 + }, + { + "epoch": 15.885038503850385, + "grad_norm": 0.035321708768606186, + "learning_rate": 6.1754693805374355e-06, + "loss": 0.0556, + "num_input_tokens_seen": 30472512, + "step": 144395 + }, + { + "epoch": 15.885588558855886, + "grad_norm": 0.005173183046281338, + "learning_rate": 6.1738901219176555e-06, + "loss": 0.0018, + "num_input_tokens_seen": 30473504, + "step": 144400 + }, + { + "epoch": 15.886138613861386, + "grad_norm": 0.27128931879997253, + "learning_rate": 6.172311036809103e-06, + "loss": 0.0323, + "num_input_tokens_seen": 30474624, + "step": 144405 + }, + { + "epoch": 15.886688668866887, + "grad_norm": 0.12366432696580887, + "learning_rate": 6.170732125226319e-06, + "loss": 0.0113, + "num_input_tokens_seen": 30475648, + "step": 144410 + }, + { + "epoch": 15.887238723872388, + "grad_norm": 2.966169595718384, + "learning_rate": 6.169153387183871e-06, + "loss": 0.0876, + "num_input_tokens_seen": 30476672, + "step": 144415 + }, + { + "epoch": 15.887788778877887, + "grad_norm": 0.022639039903879166, + "learning_rate": 6.1675748226962894e-06, + "loss": 0.0024, + "num_input_tokens_seen": 30477728, + "step": 144420 + }, + { + "epoch": 15.888338833883388, + "grad_norm": 0.06185370683670044, + "learning_rate": 6.165996431778142e-06, + "loss": 0.037, + "num_input_tokens_seen": 30478848, + "step": 144425 + }, + { + "epoch": 15.88888888888889, + "grad_norm": 0.007237161509692669, + "learning_rate": 6.164418214443962e-06, + "loss": 0.0224, + "num_input_tokens_seen": 30479936, + "step": 144430 + }, + { + "epoch": 15.88943894389439, + "grad_norm": 0.007405493874102831, + "learning_rate": 6.1628401707083e-06, + "loss": 0.0063, + "num_input_tokens_seen": 30480960, + "step": 144435 + }, + { + "epoch": 15.88998899889989, + "grad_norm": 0.06255337595939636, + "learning_rate": 6.16126230058571e-06, + "loss": 0.0248, + "num_input_tokens_seen": 30482016, + "step": 144440 + }, + { + "epoch": 15.89053905390539, + "grad_norm": 0.02479272149503231, + "learning_rate": 6.159684604090718e-06, + "loss": 0.0181, + "num_input_tokens_seen": 30483072, + "step": 144445 + }, + { + "epoch": 15.891089108910892, + "grad_norm": 0.07533037662506104, + "learning_rate": 6.158107081237879e-06, + "loss": 0.0267, + "num_input_tokens_seen": 30484128, + "step": 144450 + }, + { + "epoch": 15.891639163916391, + "grad_norm": 0.02406635694205761, + "learning_rate": 6.156529732041719e-06, + "loss": 0.0488, + "num_input_tokens_seen": 30485152, + "step": 144455 + }, + { + "epoch": 15.892189218921892, + "grad_norm": 0.039656177163124084, + "learning_rate": 6.15495255651678e-06, + "loss": 0.0747, + "num_input_tokens_seen": 30486144, + "step": 144460 + }, + { + "epoch": 15.892739273927393, + "grad_norm": 0.2406587153673172, + "learning_rate": 6.153375554677612e-06, + "loss": 0.0924, + "num_input_tokens_seen": 30487200, + "step": 144465 + }, + { + "epoch": 15.893289328932893, + "grad_norm": 1.4125593900680542, + "learning_rate": 6.15179872653873e-06, + "loss": 0.1042, + "num_input_tokens_seen": 30488224, + "step": 144470 + }, + { + "epoch": 15.893839383938394, + "grad_norm": 0.7007049918174744, + "learning_rate": 6.150222072114678e-06, + "loss": 0.0446, + "num_input_tokens_seen": 30489312, + "step": 144475 + }, + { + "epoch": 15.894389438943895, + "grad_norm": 0.016481034457683563, + "learning_rate": 6.148645591419988e-06, + "loss": 0.1114, + "num_input_tokens_seen": 30490336, + "step": 144480 + }, + { + "epoch": 15.894939493949394, + "grad_norm": 2.587343454360962, + "learning_rate": 6.147069284469178e-06, + "loss": 0.1889, + "num_input_tokens_seen": 30491456, + "step": 144485 + }, + { + "epoch": 15.895489548954895, + "grad_norm": 0.009859863668680191, + "learning_rate": 6.1454931512767825e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30492416, + "step": 144490 + }, + { + "epoch": 15.896039603960396, + "grad_norm": 0.17946431040763855, + "learning_rate": 6.143917191857332e-06, + "loss": 0.003, + "num_input_tokens_seen": 30493472, + "step": 144495 + }, + { + "epoch": 15.896589658965897, + "grad_norm": 0.01710118167102337, + "learning_rate": 6.142341406225355e-06, + "loss": 0.0116, + "num_input_tokens_seen": 30494528, + "step": 144500 + }, + { + "epoch": 15.897139713971397, + "grad_norm": 0.1114494651556015, + "learning_rate": 6.140765794395367e-06, + "loss": 0.0113, + "num_input_tokens_seen": 30495584, + "step": 144505 + }, + { + "epoch": 15.897689768976898, + "grad_norm": 0.3245413303375244, + "learning_rate": 6.139190356381885e-06, + "loss": 0.0071, + "num_input_tokens_seen": 30496704, + "step": 144510 + }, + { + "epoch": 15.898239823982399, + "grad_norm": 0.009829316288232803, + "learning_rate": 6.1376150921994335e-06, + "loss": 0.0437, + "num_input_tokens_seen": 30497664, + "step": 144515 + }, + { + "epoch": 15.898789878987898, + "grad_norm": 1.8079835176467896, + "learning_rate": 6.136040001862536e-06, + "loss": 0.2019, + "num_input_tokens_seen": 30498752, + "step": 144520 + }, + { + "epoch": 15.8993399339934, + "grad_norm": 0.021421736106276512, + "learning_rate": 6.13446508538571e-06, + "loss": 0.0144, + "num_input_tokens_seen": 30499872, + "step": 144525 + }, + { + "epoch": 15.8998899889989, + "grad_norm": 1.7258163690567017, + "learning_rate": 6.1328903427834674e-06, + "loss": 0.0187, + "num_input_tokens_seen": 30500896, + "step": 144530 + }, + { + "epoch": 15.9004400440044, + "grad_norm": 0.014799715019762516, + "learning_rate": 6.131315774070315e-06, + "loss": 0.0596, + "num_input_tokens_seen": 30501952, + "step": 144535 + }, + { + "epoch": 15.900990099009901, + "grad_norm": 0.06539929658174515, + "learning_rate": 6.129741379260778e-06, + "loss": 0.1046, + "num_input_tokens_seen": 30503072, + "step": 144540 + }, + { + "epoch": 15.901540154015402, + "grad_norm": 0.37950751185417175, + "learning_rate": 6.128167158369353e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30504192, + "step": 144545 + }, + { + "epoch": 15.902090209020901, + "grad_norm": 0.09673900157213211, + "learning_rate": 6.126593111410556e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30505312, + "step": 144550 + }, + { + "epoch": 15.902640264026402, + "grad_norm": 0.03938229754567146, + "learning_rate": 6.1250192383988995e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30506336, + "step": 144555 + }, + { + "epoch": 15.903190319031903, + "grad_norm": 0.013124078512191772, + "learning_rate": 6.123445539348877e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30507456, + "step": 144560 + }, + { + "epoch": 15.903740374037405, + "grad_norm": 0.007839835248887539, + "learning_rate": 6.121872014275007e-06, + "loss": 0.0157, + "num_input_tokens_seen": 30508512, + "step": 144565 + }, + { + "epoch": 15.904290429042904, + "grad_norm": 0.0062683019787073135, + "learning_rate": 6.1202986631917755e-06, + "loss": 0.0077, + "num_input_tokens_seen": 30509600, + "step": 144570 + }, + { + "epoch": 15.904840484048405, + "grad_norm": 0.017344102263450623, + "learning_rate": 6.118725486113694e-06, + "loss": 0.0259, + "num_input_tokens_seen": 30510592, + "step": 144575 + }, + { + "epoch": 15.905390539053906, + "grad_norm": 0.03589070588350296, + "learning_rate": 6.1171524830552665e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30511680, + "step": 144580 + }, + { + "epoch": 15.905940594059405, + "grad_norm": 0.01327302772551775, + "learning_rate": 6.115579654030976e-06, + "loss": 0.0161, + "num_input_tokens_seen": 30512704, + "step": 144585 + }, + { + "epoch": 15.906490649064907, + "grad_norm": 0.02283361740410328, + "learning_rate": 6.114006999055333e-06, + "loss": 0.0068, + "num_input_tokens_seen": 30513728, + "step": 144590 + }, + { + "epoch": 15.907040704070408, + "grad_norm": 0.10734309256076813, + "learning_rate": 6.1124345181428235e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30514816, + "step": 144595 + }, + { + "epoch": 15.907590759075907, + "grad_norm": 0.01440461352467537, + "learning_rate": 6.110862211307936e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30515904, + "step": 144600 + }, + { + "epoch": 15.908140814081408, + "grad_norm": 0.013126683421432972, + "learning_rate": 6.109290078565169e-06, + "loss": 0.0026, + "num_input_tokens_seen": 30516960, + "step": 144605 + }, + { + "epoch": 15.908690869086909, + "grad_norm": 0.01132075022906065, + "learning_rate": 6.10771811992901e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30518016, + "step": 144610 + }, + { + "epoch": 15.909240924092408, + "grad_norm": 0.06063085421919823, + "learning_rate": 6.106146335413953e-06, + "loss": 0.0141, + "num_input_tokens_seen": 30519104, + "step": 144615 + }, + { + "epoch": 15.90979097909791, + "grad_norm": 0.3442826569080353, + "learning_rate": 6.1045747250344815e-06, + "loss": 0.0406, + "num_input_tokens_seen": 30520192, + "step": 144620 + }, + { + "epoch": 15.91034103410341, + "grad_norm": 0.003596184542402625, + "learning_rate": 6.10300328880507e-06, + "loss": 0.0058, + "num_input_tokens_seen": 30521248, + "step": 144625 + }, + { + "epoch": 15.910891089108912, + "grad_norm": 0.039047420024871826, + "learning_rate": 6.101432026740217e-06, + "loss": 0.0177, + "num_input_tokens_seen": 30522368, + "step": 144630 + }, + { + "epoch": 15.911441144114411, + "grad_norm": 0.06558527052402496, + "learning_rate": 6.099860938854382e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30523456, + "step": 144635 + }, + { + "epoch": 15.911991199119912, + "grad_norm": 2.1110873222351074, + "learning_rate": 6.098290025162073e-06, + "loss": 0.1826, + "num_input_tokens_seen": 30524576, + "step": 144640 + }, + { + "epoch": 15.912541254125413, + "grad_norm": 1.978780746459961, + "learning_rate": 6.096719285677754e-06, + "loss": 0.0755, + "num_input_tokens_seen": 30525632, + "step": 144645 + }, + { + "epoch": 15.913091309130913, + "grad_norm": 0.0066551570780575275, + "learning_rate": 6.0951487204158966e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30526688, + "step": 144650 + }, + { + "epoch": 15.913641364136414, + "grad_norm": 0.01461878139525652, + "learning_rate": 6.093578329390992e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30527744, + "step": 144655 + }, + { + "epoch": 15.914191419141915, + "grad_norm": 0.06385461241006851, + "learning_rate": 6.092008112617492e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30528800, + "step": 144660 + }, + { + "epoch": 15.914741474147414, + "grad_norm": 0.01690489426255226, + "learning_rate": 6.090438070109883e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30529856, + "step": 144665 + }, + { + "epoch": 15.915291529152915, + "grad_norm": 0.004434072412550449, + "learning_rate": 6.088868201882638e-06, + "loss": 0.0007, + "num_input_tokens_seen": 30530848, + "step": 144670 + }, + { + "epoch": 15.915841584158416, + "grad_norm": 0.0072120376862585545, + "learning_rate": 6.0872985079502176e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30531936, + "step": 144675 + }, + { + "epoch": 15.916391639163916, + "grad_norm": 0.03998611122369766, + "learning_rate": 6.085728988327094e-06, + "loss": 0.0032, + "num_input_tokens_seen": 30533024, + "step": 144680 + }, + { + "epoch": 15.916941694169417, + "grad_norm": 0.025267472490668297, + "learning_rate": 6.084159643027726e-06, + "loss": 0.0134, + "num_input_tokens_seen": 30534048, + "step": 144685 + }, + { + "epoch": 15.917491749174918, + "grad_norm": 0.06634160876274109, + "learning_rate": 6.082590472066588e-06, + "loss": 0.1182, + "num_input_tokens_seen": 30535104, + "step": 144690 + }, + { + "epoch": 15.918041804180419, + "grad_norm": 0.06130179762840271, + "learning_rate": 6.081021475458129e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30536128, + "step": 144695 + }, + { + "epoch": 15.918591859185918, + "grad_norm": 0.08691247552633286, + "learning_rate": 6.079452653216816e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30537216, + "step": 144700 + }, + { + "epoch": 15.91914191419142, + "grad_norm": 0.005668507888913155, + "learning_rate": 6.077884005357118e-06, + "loss": 0.0018, + "num_input_tokens_seen": 30538304, + "step": 144705 + }, + { + "epoch": 15.91969196919692, + "grad_norm": 0.003491136245429516, + "learning_rate": 6.076315531893473e-06, + "loss": 0.1345, + "num_input_tokens_seen": 30539392, + "step": 144710 + }, + { + "epoch": 15.92024202420242, + "grad_norm": 0.13094528019428253, + "learning_rate": 6.074747232840358e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30540480, + "step": 144715 + }, + { + "epoch": 15.92079207920792, + "grad_norm": 1.8851447105407715, + "learning_rate": 6.073179108212207e-06, + "loss": 0.0588, + "num_input_tokens_seen": 30541536, + "step": 144720 + }, + { + "epoch": 15.921342134213422, + "grad_norm": 0.056408677250146866, + "learning_rate": 6.071611158023483e-06, + "loss": 0.0101, + "num_input_tokens_seen": 30542592, + "step": 144725 + }, + { + "epoch": 15.921892189218921, + "grad_norm": 0.019318796694278717, + "learning_rate": 6.070043382288643e-06, + "loss": 0.0045, + "num_input_tokens_seen": 30543616, + "step": 144730 + }, + { + "epoch": 15.922442244224422, + "grad_norm": 0.11617577075958252, + "learning_rate": 6.068475781022123e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30544704, + "step": 144735 + }, + { + "epoch": 15.922992299229923, + "grad_norm": 0.2581149935722351, + "learning_rate": 6.066908354238385e-06, + "loss": 0.043, + "num_input_tokens_seen": 30545792, + "step": 144740 + }, + { + "epoch": 15.923542354235423, + "grad_norm": 0.007521031424403191, + "learning_rate": 6.065341101951869e-06, + "loss": 0.0104, + "num_input_tokens_seen": 30546880, + "step": 144745 + }, + { + "epoch": 15.924092409240924, + "grad_norm": 0.1716349571943283, + "learning_rate": 6.063774024177002e-06, + "loss": 0.1024, + "num_input_tokens_seen": 30547904, + "step": 144750 + }, + { + "epoch": 15.924642464246425, + "grad_norm": 0.06315621733665466, + "learning_rate": 6.062207120928256e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30548928, + "step": 144755 + }, + { + "epoch": 15.925192519251926, + "grad_norm": 0.10734585672616959, + "learning_rate": 6.060640392220054e-06, + "loss": 0.0506, + "num_input_tokens_seen": 30550016, + "step": 144760 + }, + { + "epoch": 15.925742574257425, + "grad_norm": 1.749890685081482, + "learning_rate": 6.059073838066851e-06, + "loss": 0.009, + "num_input_tokens_seen": 30551008, + "step": 144765 + }, + { + "epoch": 15.926292629262926, + "grad_norm": 0.023466356098651886, + "learning_rate": 6.0575074584830745e-06, + "loss": 0.0041, + "num_input_tokens_seen": 30552032, + "step": 144770 + }, + { + "epoch": 15.926842684268427, + "grad_norm": 0.25009894371032715, + "learning_rate": 6.055941253483155e-06, + "loss": 0.0131, + "num_input_tokens_seen": 30553024, + "step": 144775 + }, + { + "epoch": 15.927392739273927, + "grad_norm": 0.027409350499510765, + "learning_rate": 6.0543752230815324e-06, + "loss": 0.006, + "num_input_tokens_seen": 30554048, + "step": 144780 + }, + { + "epoch": 15.927942794279428, + "grad_norm": 0.01856524683535099, + "learning_rate": 6.052809367292653e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30555168, + "step": 144785 + }, + { + "epoch": 15.928492849284929, + "grad_norm": 0.018488695845007896, + "learning_rate": 6.051243686130928e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30556224, + "step": 144790 + }, + { + "epoch": 15.929042904290428, + "grad_norm": 0.005428322125226259, + "learning_rate": 6.049678179610807e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30557280, + "step": 144795 + }, + { + "epoch": 15.92959295929593, + "grad_norm": 0.29773223400115967, + "learning_rate": 6.0481128477467005e-06, + "loss": 0.109, + "num_input_tokens_seen": 30558304, + "step": 144800 + }, + { + "epoch": 15.93014301430143, + "grad_norm": 1.8906887769699097, + "learning_rate": 6.04654769055305e-06, + "loss": 0.0101, + "num_input_tokens_seen": 30559296, + "step": 144805 + }, + { + "epoch": 15.930693069306932, + "grad_norm": 0.1678227186203003, + "learning_rate": 6.0449827080442694e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30560320, + "step": 144810 + }, + { + "epoch": 15.93124312431243, + "grad_norm": 0.2779484689235687, + "learning_rate": 6.043417900234788e-06, + "loss": 0.0071, + "num_input_tokens_seen": 30561344, + "step": 144815 + }, + { + "epoch": 15.931793179317932, + "grad_norm": 0.02313280664384365, + "learning_rate": 6.041853267139033e-06, + "loss": 0.0103, + "num_input_tokens_seen": 30562400, + "step": 144820 + }, + { + "epoch": 15.932343234323433, + "grad_norm": 0.08188918232917786, + "learning_rate": 6.040288808771413e-06, + "loss": 0.0073, + "num_input_tokens_seen": 30563456, + "step": 144825 + }, + { + "epoch": 15.932893289328932, + "grad_norm": 0.01124876644462347, + "learning_rate": 6.038724525146361e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30564544, + "step": 144830 + }, + { + "epoch": 15.933443344334433, + "grad_norm": 0.023571854457259178, + "learning_rate": 6.037160416278278e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30565600, + "step": 144835 + }, + { + "epoch": 15.933993399339935, + "grad_norm": 0.26713040471076965, + "learning_rate": 6.035596482181588e-06, + "loss": 0.0945, + "num_input_tokens_seen": 30566688, + "step": 144840 + }, + { + "epoch": 15.934543454345434, + "grad_norm": 0.01594192162156105, + "learning_rate": 6.0340327228707135e-06, + "loss": 0.1074, + "num_input_tokens_seen": 30567744, + "step": 144845 + }, + { + "epoch": 15.935093509350935, + "grad_norm": 0.009655458852648735, + "learning_rate": 6.032469138360053e-06, + "loss": 0.003, + "num_input_tokens_seen": 30568768, + "step": 144850 + }, + { + "epoch": 15.935643564356436, + "grad_norm": 0.03242649883031845, + "learning_rate": 6.030905728664027e-06, + "loss": 0.004, + "num_input_tokens_seen": 30569856, + "step": 144855 + }, + { + "epoch": 15.936193619361937, + "grad_norm": 0.09918677061796188, + "learning_rate": 6.029342493797041e-06, + "loss": 0.0749, + "num_input_tokens_seen": 30570976, + "step": 144860 + }, + { + "epoch": 15.936743674367436, + "grad_norm": 1.916650414466858, + "learning_rate": 6.027779433773498e-06, + "loss": 0.085, + "num_input_tokens_seen": 30572064, + "step": 144865 + }, + { + "epoch": 15.937293729372938, + "grad_norm": 0.0074692582711577415, + "learning_rate": 6.026216548607805e-06, + "loss": 0.0058, + "num_input_tokens_seen": 30573152, + "step": 144870 + }, + { + "epoch": 15.937843784378439, + "grad_norm": 0.00204563164152205, + "learning_rate": 6.02465383831437e-06, + "loss": 0.0089, + "num_input_tokens_seen": 30574240, + "step": 144875 + }, + { + "epoch": 15.938393839383938, + "grad_norm": 0.003318383125588298, + "learning_rate": 6.023091302907605e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30575328, + "step": 144880 + }, + { + "epoch": 15.938943894389439, + "grad_norm": 0.011804869398474693, + "learning_rate": 6.021528942401899e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30576352, + "step": 144885 + }, + { + "epoch": 15.93949394939494, + "grad_norm": 0.15654665231704712, + "learning_rate": 6.019966756811648e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30577472, + "step": 144890 + }, + { + "epoch": 15.94004400440044, + "grad_norm": 0.017178090289235115, + "learning_rate": 6.018404746151257e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30578560, + "step": 144895 + }, + { + "epoch": 15.94059405940594, + "grad_norm": 0.23795580863952637, + "learning_rate": 6.016842910435119e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30579712, + "step": 144900 + }, + { + "epoch": 15.941144114411442, + "grad_norm": 0.08805269747972488, + "learning_rate": 6.015281249677637e-06, + "loss": 0.0991, + "num_input_tokens_seen": 30580736, + "step": 144905 + }, + { + "epoch": 15.941694169416941, + "grad_norm": 0.017781389877200127, + "learning_rate": 6.0137197638932025e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30581792, + "step": 144910 + }, + { + "epoch": 15.942244224422442, + "grad_norm": 0.05179058760404587, + "learning_rate": 6.012158453096192e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30582816, + "step": 144915 + }, + { + "epoch": 15.942794279427943, + "grad_norm": 0.06943629682064056, + "learning_rate": 6.010597317301014e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30583872, + "step": 144920 + }, + { + "epoch": 15.943344334433444, + "grad_norm": 0.014988462440669537, + "learning_rate": 6.0090363565220405e-06, + "loss": 0.0086, + "num_input_tokens_seen": 30584928, + "step": 144925 + }, + { + "epoch": 15.943894389438944, + "grad_norm": 3.8700366020202637, + "learning_rate": 6.007475570773666e-06, + "loss": 0.1277, + "num_input_tokens_seen": 30585952, + "step": 144930 + }, + { + "epoch": 15.944444444444445, + "grad_norm": 0.03407725691795349, + "learning_rate": 6.005914960070283e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30586944, + "step": 144935 + }, + { + "epoch": 15.944994499449946, + "grad_norm": 0.009291025809943676, + "learning_rate": 6.004354524426262e-06, + "loss": 0.002, + "num_input_tokens_seen": 30587904, + "step": 144940 + }, + { + "epoch": 15.945544554455445, + "grad_norm": 0.37986382842063904, + "learning_rate": 6.002794263855996e-06, + "loss": 0.016, + "num_input_tokens_seen": 30588960, + "step": 144945 + }, + { + "epoch": 15.946094609460946, + "grad_norm": 0.16373145580291748, + "learning_rate": 6.001234178373852e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30590016, + "step": 144950 + }, + { + "epoch": 15.946644664466447, + "grad_norm": 0.11534769833087921, + "learning_rate": 5.999674267994215e-06, + "loss": 0.0063, + "num_input_tokens_seen": 30591008, + "step": 144955 + }, + { + "epoch": 15.947194719471947, + "grad_norm": 1.0319303274154663, + "learning_rate": 5.998114532731475e-06, + "loss": 0.0102, + "num_input_tokens_seen": 30592128, + "step": 144960 + }, + { + "epoch": 15.947744774477448, + "grad_norm": 0.030841009691357613, + "learning_rate": 5.996554972599982e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30593184, + "step": 144965 + }, + { + "epoch": 15.948294829482949, + "grad_norm": 0.014171282760798931, + "learning_rate": 5.994995587614136e-06, + "loss": 0.0082, + "num_input_tokens_seen": 30594208, + "step": 144970 + }, + { + "epoch": 15.948844884488448, + "grad_norm": 0.005888731684535742, + "learning_rate": 5.9934363777882914e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30595232, + "step": 144975 + }, + { + "epoch": 15.94939493949395, + "grad_norm": 0.01437058113515377, + "learning_rate": 5.99187734313682e-06, + "loss": 0.0384, + "num_input_tokens_seen": 30596224, + "step": 144980 + }, + { + "epoch": 15.94994499449945, + "grad_norm": 0.03215737268328667, + "learning_rate": 5.990318483674093e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30597312, + "step": 144985 + }, + { + "epoch": 15.950495049504951, + "grad_norm": 0.016492675989866257, + "learning_rate": 5.988759799414478e-06, + "loss": 0.0007, + "num_input_tokens_seen": 30598368, + "step": 144990 + }, + { + "epoch": 15.95104510451045, + "grad_norm": 0.10816621035337448, + "learning_rate": 5.987201290372349e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30599392, + "step": 144995 + }, + { + "epoch": 15.951595159515952, + "grad_norm": 0.036311376839876175, + "learning_rate": 5.9856429565620626e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30600544, + "step": 145000 + }, + { + "epoch": 15.952145214521453, + "grad_norm": 0.05872470512986183, + "learning_rate": 5.984084797997974e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30601632, + "step": 145005 + }, + { + "epoch": 15.952695269526952, + "grad_norm": 2.0801780223846436, + "learning_rate": 5.98252681469445e-06, + "loss": 0.1174, + "num_input_tokens_seen": 30602688, + "step": 145010 + }, + { + "epoch": 15.953245324532453, + "grad_norm": 0.058234285563230515, + "learning_rate": 5.980969006665851e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30603712, + "step": 145015 + }, + { + "epoch": 15.953795379537954, + "grad_norm": 0.047666069120168686, + "learning_rate": 5.979411373926544e-06, + "loss": 0.0036, + "num_input_tokens_seen": 30604768, + "step": 145020 + }, + { + "epoch": 15.954345434543454, + "grad_norm": 0.018114665523171425, + "learning_rate": 5.977853916490872e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30605792, + "step": 145025 + }, + { + "epoch": 15.954895489548955, + "grad_norm": 0.033532317727804184, + "learning_rate": 5.976296634373188e-06, + "loss": 0.0698, + "num_input_tokens_seen": 30606816, + "step": 145030 + }, + { + "epoch": 15.955445544554456, + "grad_norm": 0.025310801342129707, + "learning_rate": 5.974739527587856e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30607840, + "step": 145035 + }, + { + "epoch": 15.955995599559955, + "grad_norm": 2.0606095790863037, + "learning_rate": 5.973182596149213e-06, + "loss": 0.138, + "num_input_tokens_seen": 30608832, + "step": 145040 + }, + { + "epoch": 15.956545654565456, + "grad_norm": 0.05444502830505371, + "learning_rate": 5.971625840071618e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30609888, + "step": 145045 + }, + { + "epoch": 15.957095709570957, + "grad_norm": 0.1511969119310379, + "learning_rate": 5.970069259369421e-06, + "loss": 0.0627, + "num_input_tokens_seen": 30610944, + "step": 145050 + }, + { + "epoch": 15.957645764576458, + "grad_norm": 0.005763383582234383, + "learning_rate": 5.9685128540569595e-06, + "loss": 0.0076, + "num_input_tokens_seen": 30612032, + "step": 145055 + }, + { + "epoch": 15.958195819581958, + "grad_norm": 2.788015127182007, + "learning_rate": 5.9669566241485865e-06, + "loss": 0.0916, + "num_input_tokens_seen": 30613088, + "step": 145060 + }, + { + "epoch": 15.958745874587459, + "grad_norm": 0.09313101321458817, + "learning_rate": 5.965400569658639e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30614112, + "step": 145065 + }, + { + "epoch": 15.95929592959296, + "grad_norm": 0.023138409480452538, + "learning_rate": 5.963844690601458e-06, + "loss": 0.0057, + "num_input_tokens_seen": 30615200, + "step": 145070 + }, + { + "epoch": 15.95984598459846, + "grad_norm": 0.06659909337759018, + "learning_rate": 5.962288986991394e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30616256, + "step": 145075 + }, + { + "epoch": 15.96039603960396, + "grad_norm": 0.018832692876458168, + "learning_rate": 5.9607334588427685e-06, + "loss": 0.0032, + "num_input_tokens_seen": 30617344, + "step": 145080 + }, + { + "epoch": 15.960946094609461, + "grad_norm": 0.004689279943704605, + "learning_rate": 5.959178106169936e-06, + "loss": 0.0034, + "num_input_tokens_seen": 30618400, + "step": 145085 + }, + { + "epoch": 15.96149614961496, + "grad_norm": 0.012300158850848675, + "learning_rate": 5.957622928987214e-06, + "loss": 0.0256, + "num_input_tokens_seen": 30619424, + "step": 145090 + }, + { + "epoch": 15.962046204620462, + "grad_norm": 0.03478582575917244, + "learning_rate": 5.956067927308953e-06, + "loss": 0.077, + "num_input_tokens_seen": 30620480, + "step": 145095 + }, + { + "epoch": 15.962596259625963, + "grad_norm": 2.801342248916626, + "learning_rate": 5.954513101149467e-06, + "loss": 0.1076, + "num_input_tokens_seen": 30621632, + "step": 145100 + }, + { + "epoch": 15.963146314631462, + "grad_norm": 0.03159531578421593, + "learning_rate": 5.952958450523094e-06, + "loss": 0.0187, + "num_input_tokens_seen": 30622656, + "step": 145105 + }, + { + "epoch": 15.963696369636963, + "grad_norm": 0.0039325887337327, + "learning_rate": 5.951403975444172e-06, + "loss": 0.0126, + "num_input_tokens_seen": 30623680, + "step": 145110 + }, + { + "epoch": 15.964246424642464, + "grad_norm": 0.023641623556613922, + "learning_rate": 5.949849675927011e-06, + "loss": 0.0197, + "num_input_tokens_seen": 30624768, + "step": 145115 + }, + { + "epoch": 15.964796479647966, + "grad_norm": 0.005430944729596376, + "learning_rate": 5.948295551985952e-06, + "loss": 0.0653, + "num_input_tokens_seen": 30625760, + "step": 145120 + }, + { + "epoch": 15.965346534653465, + "grad_norm": 0.5379640460014343, + "learning_rate": 5.946741603635303e-06, + "loss": 0.0808, + "num_input_tokens_seen": 30626784, + "step": 145125 + }, + { + "epoch": 15.965896589658966, + "grad_norm": 0.25696873664855957, + "learning_rate": 5.945187830889395e-06, + "loss": 0.0059, + "num_input_tokens_seen": 30627808, + "step": 145130 + }, + { + "epoch": 15.966446644664467, + "grad_norm": 0.033928558230400085, + "learning_rate": 5.943634233762554e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30628864, + "step": 145135 + }, + { + "epoch": 15.966996699669966, + "grad_norm": 0.16140532493591309, + "learning_rate": 5.942080812269085e-06, + "loss": 0.0032, + "num_input_tokens_seen": 30629856, + "step": 145140 + }, + { + "epoch": 15.967546754675467, + "grad_norm": 0.002958934986963868, + "learning_rate": 5.94052756642332e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30631008, + "step": 145145 + }, + { + "epoch": 15.968096809680969, + "grad_norm": 0.03891293331980705, + "learning_rate": 5.938974496239566e-06, + "loss": 0.0007, + "num_input_tokens_seen": 30632032, + "step": 145150 + }, + { + "epoch": 15.968646864686468, + "grad_norm": 0.012831958010792732, + "learning_rate": 5.937421601732132e-06, + "loss": 0.0051, + "num_input_tokens_seen": 30633088, + "step": 145155 + }, + { + "epoch": 15.969196919691969, + "grad_norm": 0.013422633521258831, + "learning_rate": 5.935868882915335e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30634112, + "step": 145160 + }, + { + "epoch": 15.96974697469747, + "grad_norm": 1.6826783418655396, + "learning_rate": 5.934316339803495e-06, + "loss": 0.0631, + "num_input_tokens_seen": 30635168, + "step": 145165 + }, + { + "epoch": 15.97029702970297, + "grad_norm": 0.04870956763625145, + "learning_rate": 5.932763972410901e-06, + "loss": 0.0664, + "num_input_tokens_seen": 30636192, + "step": 145170 + }, + { + "epoch": 15.97084708470847, + "grad_norm": 0.02001410350203514, + "learning_rate": 5.931211780751883e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30637216, + "step": 145175 + }, + { + "epoch": 15.971397139713972, + "grad_norm": 0.21315909922122955, + "learning_rate": 5.929659764840728e-06, + "loss": 0.0571, + "num_input_tokens_seen": 30638336, + "step": 145180 + }, + { + "epoch": 15.971947194719473, + "grad_norm": 0.03771838918328285, + "learning_rate": 5.92810792469175e-06, + "loss": 0.0306, + "num_input_tokens_seen": 30639456, + "step": 145185 + }, + { + "epoch": 15.972497249724972, + "grad_norm": 0.07968863844871521, + "learning_rate": 5.926556260319255e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30640512, + "step": 145190 + }, + { + "epoch": 15.973047304730473, + "grad_norm": 0.031113024801015854, + "learning_rate": 5.925004771737533e-06, + "loss": 0.0817, + "num_input_tokens_seen": 30641536, + "step": 145195 + }, + { + "epoch": 15.973597359735974, + "grad_norm": 0.06379390507936478, + "learning_rate": 5.9234534589608924e-06, + "loss": 0.0036, + "num_input_tokens_seen": 30642592, + "step": 145200 + }, + { + "epoch": 15.974147414741473, + "grad_norm": 0.009042131714522839, + "learning_rate": 5.921902322003625e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30643584, + "step": 145205 + }, + { + "epoch": 15.974697469746975, + "grad_norm": 0.0031226363498717546, + "learning_rate": 5.920351360880033e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30644672, + "step": 145210 + }, + { + "epoch": 15.975247524752476, + "grad_norm": 0.02638588845729828, + "learning_rate": 5.918800575604402e-06, + "loss": 0.1035, + "num_input_tokens_seen": 30645696, + "step": 145215 + }, + { + "epoch": 15.975797579757975, + "grad_norm": 0.047214772552251816, + "learning_rate": 5.91724996619103e-06, + "loss": 0.0695, + "num_input_tokens_seen": 30646784, + "step": 145220 + }, + { + "epoch": 15.976347634763476, + "grad_norm": 0.018927516415715218, + "learning_rate": 5.915699532654215e-06, + "loss": 0.0088, + "num_input_tokens_seen": 30647808, + "step": 145225 + }, + { + "epoch": 15.976897689768977, + "grad_norm": 0.018618779256939888, + "learning_rate": 5.914149275008232e-06, + "loss": 0.0255, + "num_input_tokens_seen": 30648864, + "step": 145230 + }, + { + "epoch": 15.977447744774478, + "grad_norm": 1.1765750646591187, + "learning_rate": 5.9125991932673845e-06, + "loss": 0.0221, + "num_input_tokens_seen": 30649952, + "step": 145235 + }, + { + "epoch": 15.977997799779978, + "grad_norm": 0.03125829994678497, + "learning_rate": 5.911049287445944e-06, + "loss": 0.0542, + "num_input_tokens_seen": 30651008, + "step": 145240 + }, + { + "epoch": 15.978547854785479, + "grad_norm": 0.027383234351873398, + "learning_rate": 5.909499557558201e-06, + "loss": 0.1003, + "num_input_tokens_seen": 30652064, + "step": 145245 + }, + { + "epoch": 15.97909790979098, + "grad_norm": 0.06910836696624756, + "learning_rate": 5.907950003618451e-06, + "loss": 0.0054, + "num_input_tokens_seen": 30653152, + "step": 145250 + }, + { + "epoch": 15.979647964796479, + "grad_norm": 0.0542888343334198, + "learning_rate": 5.906400625640954e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30654208, + "step": 145255 + }, + { + "epoch": 15.98019801980198, + "grad_norm": 3.159412384033203, + "learning_rate": 5.904851423640009e-06, + "loss": 0.0534, + "num_input_tokens_seen": 30655264, + "step": 145260 + }, + { + "epoch": 15.980748074807481, + "grad_norm": 0.03977334499359131, + "learning_rate": 5.903302397629884e-06, + "loss": 0.0006, + "num_input_tokens_seen": 30656352, + "step": 145265 + }, + { + "epoch": 15.98129812981298, + "grad_norm": 0.012971393764019012, + "learning_rate": 5.901753547624853e-06, + "loss": 0.0067, + "num_input_tokens_seen": 30657472, + "step": 145270 + }, + { + "epoch": 15.981848184818482, + "grad_norm": 1.1762993335723877, + "learning_rate": 5.900204873639192e-06, + "loss": 0.0099, + "num_input_tokens_seen": 30658496, + "step": 145275 + }, + { + "epoch": 15.982398239823983, + "grad_norm": 0.05456367880105972, + "learning_rate": 5.898656375687181e-06, + "loss": 0.0399, + "num_input_tokens_seen": 30659552, + "step": 145280 + }, + { + "epoch": 15.982948294829484, + "grad_norm": 0.02112584374845028, + "learning_rate": 5.897108053783093e-06, + "loss": 0.0007, + "num_input_tokens_seen": 30660608, + "step": 145285 + }, + { + "epoch": 15.983498349834983, + "grad_norm": 0.06566701829433441, + "learning_rate": 5.895559907941195e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30661696, + "step": 145290 + }, + { + "epoch": 15.984048404840484, + "grad_norm": 1.6506245136260986, + "learning_rate": 5.894011938175747e-06, + "loss": 0.0596, + "num_input_tokens_seen": 30662688, + "step": 145295 + }, + { + "epoch": 15.984598459845985, + "grad_norm": 0.07138748466968536, + "learning_rate": 5.892464144501023e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30663712, + "step": 145300 + }, + { + "epoch": 15.985148514851485, + "grad_norm": 0.04874727129936218, + "learning_rate": 5.89091652693129e-06, + "loss": 0.0013, + "num_input_tokens_seen": 30664768, + "step": 145305 + }, + { + "epoch": 15.985698569856986, + "grad_norm": 0.020842093974351883, + "learning_rate": 5.889369085480814e-06, + "loss": 0.0207, + "num_input_tokens_seen": 30665856, + "step": 145310 + }, + { + "epoch": 15.986248624862487, + "grad_norm": 0.33689945936203003, + "learning_rate": 5.887821820163855e-06, + "loss": 0.0032, + "num_input_tokens_seen": 30666912, + "step": 145315 + }, + { + "epoch": 15.986798679867986, + "grad_norm": 0.006874634884297848, + "learning_rate": 5.886274730994662e-06, + "loss": 0.0072, + "num_input_tokens_seen": 30667968, + "step": 145320 + }, + { + "epoch": 15.987348734873487, + "grad_norm": 0.03358235955238342, + "learning_rate": 5.884727817987512e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30668960, + "step": 145325 + }, + { + "epoch": 15.987898789878988, + "grad_norm": 0.08080336451530457, + "learning_rate": 5.883181081156647e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30670016, + "step": 145330 + }, + { + "epoch": 15.988448844884488, + "grad_norm": 0.10541074723005295, + "learning_rate": 5.881634520516327e-06, + "loss": 0.1082, + "num_input_tokens_seen": 30671072, + "step": 145335 + }, + { + "epoch": 15.988998899889989, + "grad_norm": 1.8158248662948608, + "learning_rate": 5.880088136080814e-06, + "loss": 0.0724, + "num_input_tokens_seen": 30672128, + "step": 145340 + }, + { + "epoch": 15.98954895489549, + "grad_norm": 0.0328700952231884, + "learning_rate": 5.87854192786435e-06, + "loss": 0.0078, + "num_input_tokens_seen": 30673248, + "step": 145345 + }, + { + "epoch": 15.990099009900991, + "grad_norm": 1.958640694618225, + "learning_rate": 5.876995895881194e-06, + "loss": 0.0369, + "num_input_tokens_seen": 30674336, + "step": 145350 + }, + { + "epoch": 15.99064906490649, + "grad_norm": 0.03137980401515961, + "learning_rate": 5.875450040145591e-06, + "loss": 0.0889, + "num_input_tokens_seen": 30675360, + "step": 145355 + }, + { + "epoch": 15.991199119911991, + "grad_norm": 1.363964319229126, + "learning_rate": 5.873904360671775e-06, + "loss": 0.0115, + "num_input_tokens_seen": 30676384, + "step": 145360 + }, + { + "epoch": 15.991749174917492, + "grad_norm": 0.020668840035796165, + "learning_rate": 5.87235885747402e-06, + "loss": 0.077, + "num_input_tokens_seen": 30677408, + "step": 145365 + }, + { + "epoch": 15.992299229922992, + "grad_norm": 0.023899300023913383, + "learning_rate": 5.870813530566546e-06, + "loss": 0.0566, + "num_input_tokens_seen": 30678432, + "step": 145370 + }, + { + "epoch": 15.992849284928493, + "grad_norm": 0.02600589580833912, + "learning_rate": 5.869268379963613e-06, + "loss": 0.0673, + "num_input_tokens_seen": 30679456, + "step": 145375 + }, + { + "epoch": 15.993399339933994, + "grad_norm": 0.0038242689333856106, + "learning_rate": 5.867723405679454e-06, + "loss": 0.0524, + "num_input_tokens_seen": 30680544, + "step": 145380 + }, + { + "epoch": 15.993949394939493, + "grad_norm": 0.008413977921009064, + "learning_rate": 5.866178607728301e-06, + "loss": 0.0061, + "num_input_tokens_seen": 30681568, + "step": 145385 + }, + { + "epoch": 15.994499449944994, + "grad_norm": 0.0892561599612236, + "learning_rate": 5.864633986124396e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30682592, + "step": 145390 + }, + { + "epoch": 15.995049504950495, + "grad_norm": 0.02484819106757641, + "learning_rate": 5.86308954088198e-06, + "loss": 0.033, + "num_input_tokens_seen": 30683616, + "step": 145395 + }, + { + "epoch": 15.995599559955995, + "grad_norm": 1.7691618204116821, + "learning_rate": 5.861545272015289e-06, + "loss": 0.0463, + "num_input_tokens_seen": 30684704, + "step": 145400 + }, + { + "epoch": 15.996149614961496, + "grad_norm": 0.35769936442375183, + "learning_rate": 5.8600011795385545e-06, + "loss": 0.0631, + "num_input_tokens_seen": 30685792, + "step": 145405 + }, + { + "epoch": 15.996699669966997, + "grad_norm": 0.0039019810501486063, + "learning_rate": 5.8584572634659964e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30686880, + "step": 145410 + }, + { + "epoch": 15.997249724972498, + "grad_norm": 0.06935332715511322, + "learning_rate": 5.8569135238118606e-06, + "loss": 0.0821, + "num_input_tokens_seen": 30687936, + "step": 145415 + }, + { + "epoch": 15.997799779977997, + "grad_norm": 0.00914042629301548, + "learning_rate": 5.855369960590354e-06, + "loss": 0.0286, + "num_input_tokens_seen": 30689024, + "step": 145420 + }, + { + "epoch": 15.998349834983498, + "grad_norm": 0.013281811960041523, + "learning_rate": 5.853826573815727e-06, + "loss": 0.0053, + "num_input_tokens_seen": 30690112, + "step": 145425 + }, + { + "epoch": 15.998899889989, + "grad_norm": 0.010710650123655796, + "learning_rate": 5.852283363502195e-06, + "loss": 0.0069, + "num_input_tokens_seen": 30691200, + "step": 145430 + }, + { + "epoch": 15.999449944994499, + "grad_norm": 0.28493934869766235, + "learning_rate": 5.850740329663973e-06, + "loss": 0.0106, + "num_input_tokens_seen": 30692192, + "step": 145435 + }, + { + "epoch": 16.0, + "grad_norm": 0.00977506022900343, + "learning_rate": 5.849197472315296e-06, + "loss": 0.0331, + "num_input_tokens_seen": 30693088, + "step": 145440 + }, + { + "epoch": 16.0, + "eval_loss": 0.07597047090530396, + "eval_runtime": 36.9848, + "eval_samples_per_second": 109.234, + "eval_steps_per_second": 27.309, + "num_input_tokens_seen": 30693088, + "step": 145440 + }, + { + "epoch": 16.0005500550055, + "grad_norm": 0.032794930040836334, + "learning_rate": 5.847654791470367e-06, + "loss": 0.0642, + "num_input_tokens_seen": 30694208, + "step": 145445 + }, + { + "epoch": 16.001100110011002, + "grad_norm": 0.020306356251239777, + "learning_rate": 5.846112287143415e-06, + "loss": 0.007, + "num_input_tokens_seen": 30695232, + "step": 145450 + }, + { + "epoch": 16.001650165016503, + "grad_norm": 0.13894101977348328, + "learning_rate": 5.8445699593486615e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30696256, + "step": 145455 + }, + { + "epoch": 16.002200220022, + "grad_norm": 0.0662078708410263, + "learning_rate": 5.84302780810031e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30697344, + "step": 145460 + }, + { + "epoch": 16.002750275027502, + "grad_norm": 0.008355867117643356, + "learning_rate": 5.841485833412583e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30698304, + "step": 145465 + }, + { + "epoch": 16.003300330033003, + "grad_norm": 0.003815778996795416, + "learning_rate": 5.839944035299682e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30699488, + "step": 145470 + }, + { + "epoch": 16.003850385038504, + "grad_norm": 0.5084518790245056, + "learning_rate": 5.838402413775831e-06, + "loss": 0.0055, + "num_input_tokens_seen": 30700544, + "step": 145475 + }, + { + "epoch": 16.004400440044005, + "grad_norm": 0.009207767434418201, + "learning_rate": 5.836860968855224e-06, + "loss": 0.1087, + "num_input_tokens_seen": 30701632, + "step": 145480 + }, + { + "epoch": 16.004950495049506, + "grad_norm": 0.08660601824522018, + "learning_rate": 5.835319700552072e-06, + "loss": 0.002, + "num_input_tokens_seen": 30702688, + "step": 145485 + }, + { + "epoch": 16.005500550055004, + "grad_norm": 0.20354023575782776, + "learning_rate": 5.8337786088805884e-06, + "loss": 0.0081, + "num_input_tokens_seen": 30703744, + "step": 145490 + }, + { + "epoch": 16.006050605060505, + "grad_norm": 0.03923701494932175, + "learning_rate": 5.832237693854966e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30704832, + "step": 145495 + }, + { + "epoch": 16.006600660066006, + "grad_norm": 0.05561540275812149, + "learning_rate": 5.830696955489417e-06, + "loss": 0.0711, + "num_input_tokens_seen": 30705856, + "step": 145500 + }, + { + "epoch": 16.007150715071507, + "grad_norm": 0.3091549873352051, + "learning_rate": 5.82915639379813e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30706880, + "step": 145505 + }, + { + "epoch": 16.007700770077008, + "grad_norm": 0.010949618183076382, + "learning_rate": 5.827616008795308e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30708000, + "step": 145510 + }, + { + "epoch": 16.00825082508251, + "grad_norm": 0.22545212507247925, + "learning_rate": 5.826075800495156e-06, + "loss": 0.0085, + "num_input_tokens_seen": 30708960, + "step": 145515 + }, + { + "epoch": 16.00880088008801, + "grad_norm": 0.1250685751438141, + "learning_rate": 5.824535768911856e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30709984, + "step": 145520 + }, + { + "epoch": 16.009350935093508, + "grad_norm": 2.5447452068328857, + "learning_rate": 5.822995914059618e-06, + "loss": 0.0914, + "num_input_tokens_seen": 30711040, + "step": 145525 + }, + { + "epoch": 16.00990099009901, + "grad_norm": 0.01953301951289177, + "learning_rate": 5.821456235952621e-06, + "loss": 0.0039, + "num_input_tokens_seen": 30712128, + "step": 145530 + }, + { + "epoch": 16.01045104510451, + "grad_norm": 4.036099433898926, + "learning_rate": 5.819916734605049e-06, + "loss": 0.0326, + "num_input_tokens_seen": 30713152, + "step": 145535 + }, + { + "epoch": 16.01100110011001, + "grad_norm": 0.0234520323574543, + "learning_rate": 5.818377410031112e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30714272, + "step": 145540 + }, + { + "epoch": 16.011551155115512, + "grad_norm": 0.060584187507629395, + "learning_rate": 5.816838262244981e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30715360, + "step": 145545 + }, + { + "epoch": 16.012101210121013, + "grad_norm": 0.02316095307469368, + "learning_rate": 5.815299291260851e-06, + "loss": 0.0309, + "num_input_tokens_seen": 30716416, + "step": 145550 + }, + { + "epoch": 16.01265126512651, + "grad_norm": 0.03921937942504883, + "learning_rate": 5.8137604970929046e-06, + "loss": 0.0128, + "num_input_tokens_seen": 30717568, + "step": 145555 + }, + { + "epoch": 16.013201320132012, + "grad_norm": 2.864950656890869, + "learning_rate": 5.812221879755314e-06, + "loss": 0.1138, + "num_input_tokens_seen": 30718592, + "step": 145560 + }, + { + "epoch": 16.013751375137513, + "grad_norm": 0.14867128431797028, + "learning_rate": 5.810683439262266e-06, + "loss": 0.0072, + "num_input_tokens_seen": 30719712, + "step": 145565 + }, + { + "epoch": 16.014301430143014, + "grad_norm": 0.36776667833328247, + "learning_rate": 5.809145175627947e-06, + "loss": 0.0079, + "num_input_tokens_seen": 30720736, + "step": 145570 + }, + { + "epoch": 16.014851485148515, + "grad_norm": 0.00562320975586772, + "learning_rate": 5.8076070888665236e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30721792, + "step": 145575 + }, + { + "epoch": 16.015401540154016, + "grad_norm": 0.009724866598844528, + "learning_rate": 5.806069178992182e-06, + "loss": 0.0164, + "num_input_tokens_seen": 30722784, + "step": 145580 + }, + { + "epoch": 16.015951595159517, + "grad_norm": 0.025645066052675247, + "learning_rate": 5.804531446019082e-06, + "loss": 0.0078, + "num_input_tokens_seen": 30723808, + "step": 145585 + }, + { + "epoch": 16.016501650165015, + "grad_norm": 1.9869643449783325, + "learning_rate": 5.802993889961414e-06, + "loss": 0.0194, + "num_input_tokens_seen": 30724864, + "step": 145590 + }, + { + "epoch": 16.017051705170516, + "grad_norm": 0.015674011781811714, + "learning_rate": 5.801456510833333e-06, + "loss": 0.016, + "num_input_tokens_seen": 30725952, + "step": 145595 + }, + { + "epoch": 16.017601760176017, + "grad_norm": 0.09031666815280914, + "learning_rate": 5.799919308649013e-06, + "loss": 0.0102, + "num_input_tokens_seen": 30727008, + "step": 145600 + }, + { + "epoch": 16.01815181518152, + "grad_norm": 0.3415803611278534, + "learning_rate": 5.798382283422632e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30728032, + "step": 145605 + }, + { + "epoch": 16.01870187018702, + "grad_norm": 0.005897872615605593, + "learning_rate": 5.796845435168341e-06, + "loss": 0.0249, + "num_input_tokens_seen": 30729152, + "step": 145610 + }, + { + "epoch": 16.01925192519252, + "grad_norm": 0.12721191346645355, + "learning_rate": 5.795308763900317e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30730208, + "step": 145615 + }, + { + "epoch": 16.019801980198018, + "grad_norm": 0.06040984392166138, + "learning_rate": 5.79377226963271e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30731200, + "step": 145620 + }, + { + "epoch": 16.02035203520352, + "grad_norm": 0.006791349034756422, + "learning_rate": 5.79223595237969e-06, + "loss": 0.0145, + "num_input_tokens_seen": 30732288, + "step": 145625 + }, + { + "epoch": 16.02090209020902, + "grad_norm": 0.010135278105735779, + "learning_rate": 5.7906998121554204e-06, + "loss": 0.0598, + "num_input_tokens_seen": 30733312, + "step": 145630 + }, + { + "epoch": 16.02145214521452, + "grad_norm": 0.01803814433515072, + "learning_rate": 5.789163848974044e-06, + "loss": 0.0247, + "num_input_tokens_seen": 30734368, + "step": 145635 + }, + { + "epoch": 16.022002200220022, + "grad_norm": 1.1082791090011597, + "learning_rate": 5.787628062849734e-06, + "loss": 0.0136, + "num_input_tokens_seen": 30735424, + "step": 145640 + }, + { + "epoch": 16.022552255225524, + "grad_norm": 0.756343424320221, + "learning_rate": 5.786092453796638e-06, + "loss": 0.0554, + "num_input_tokens_seen": 30736448, + "step": 145645 + }, + { + "epoch": 16.023102310231025, + "grad_norm": 0.026918256655335426, + "learning_rate": 5.784557021828902e-06, + "loss": 0.0063, + "num_input_tokens_seen": 30737408, + "step": 145650 + }, + { + "epoch": 16.023652365236522, + "grad_norm": 0.1321284919977188, + "learning_rate": 5.783021766960681e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30738464, + "step": 145655 + }, + { + "epoch": 16.024202420242023, + "grad_norm": 1.6855452060699463, + "learning_rate": 5.781486689206128e-06, + "loss": 0.0062, + "num_input_tokens_seen": 30739488, + "step": 145660 + }, + { + "epoch": 16.024752475247524, + "grad_norm": 1.9445441961288452, + "learning_rate": 5.779951788579399e-06, + "loss": 0.0382, + "num_input_tokens_seen": 30740576, + "step": 145665 + }, + { + "epoch": 16.025302530253025, + "grad_norm": 0.06601511687040329, + "learning_rate": 5.77841706509463e-06, + "loss": 0.002, + "num_input_tokens_seen": 30741696, + "step": 145670 + }, + { + "epoch": 16.025852585258527, + "grad_norm": 0.08368761837482452, + "learning_rate": 5.776882518765961e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30742784, + "step": 145675 + }, + { + "epoch": 16.026402640264028, + "grad_norm": 0.020698877051472664, + "learning_rate": 5.7753481496075425e-06, + "loss": 0.005, + "num_input_tokens_seen": 30743840, + "step": 145680 + }, + { + "epoch": 16.02695269526953, + "grad_norm": 0.07370175421237946, + "learning_rate": 5.773813957633514e-06, + "loss": 0.0379, + "num_input_tokens_seen": 30744864, + "step": 145685 + }, + { + "epoch": 16.027502750275026, + "grad_norm": 3.6674442291259766, + "learning_rate": 5.772279942858025e-06, + "loss": 0.1837, + "num_input_tokens_seen": 30745920, + "step": 145690 + }, + { + "epoch": 16.028052805280527, + "grad_norm": 0.0030948494095355272, + "learning_rate": 5.770746105295202e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30747008, + "step": 145695 + }, + { + "epoch": 16.02860286028603, + "grad_norm": 0.009325225837528706, + "learning_rate": 5.769212444959182e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30748128, + "step": 145700 + }, + { + "epoch": 16.02915291529153, + "grad_norm": 0.015355064533650875, + "learning_rate": 5.767678961864109e-06, + "loss": 0.0004, + "num_input_tokens_seen": 30749184, + "step": 145705 + }, + { + "epoch": 16.02970297029703, + "grad_norm": 0.02004816383123398, + "learning_rate": 5.7661456560241025e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30750240, + "step": 145710 + }, + { + "epoch": 16.03025302530253, + "grad_norm": 0.1569165289402008, + "learning_rate": 5.764612527453303e-06, + "loss": 0.0052, + "num_input_tokens_seen": 30751328, + "step": 145715 + }, + { + "epoch": 16.03080308030803, + "grad_norm": 0.05204857885837555, + "learning_rate": 5.763079576165844e-06, + "loss": 0.0348, + "num_input_tokens_seen": 30752416, + "step": 145720 + }, + { + "epoch": 16.03135313531353, + "grad_norm": 0.06421486288309097, + "learning_rate": 5.7615468021758465e-06, + "loss": 0.0034, + "num_input_tokens_seen": 30753504, + "step": 145725 + }, + { + "epoch": 16.03190319031903, + "grad_norm": 0.013856004923582077, + "learning_rate": 5.760014205497447e-06, + "loss": 0.069, + "num_input_tokens_seen": 30754592, + "step": 145730 + }, + { + "epoch": 16.032453245324533, + "grad_norm": 0.15404532849788666, + "learning_rate": 5.758481786144757e-06, + "loss": 0.0193, + "num_input_tokens_seen": 30755584, + "step": 145735 + }, + { + "epoch": 16.033003300330034, + "grad_norm": 0.13022497296333313, + "learning_rate": 5.756949544131907e-06, + "loss": 0.0234, + "num_input_tokens_seen": 30756672, + "step": 145740 + }, + { + "epoch": 16.033553355335535, + "grad_norm": 0.8215236067771912, + "learning_rate": 5.755417479473027e-06, + "loss": 0.0078, + "num_input_tokens_seen": 30757728, + "step": 145745 + }, + { + "epoch": 16.034103410341036, + "grad_norm": 0.012566987425088882, + "learning_rate": 5.753885592182221e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30758784, + "step": 145750 + }, + { + "epoch": 16.034653465346533, + "grad_norm": 0.6180347204208374, + "learning_rate": 5.7523538822736225e-06, + "loss": 0.0095, + "num_input_tokens_seen": 30759872, + "step": 145755 + }, + { + "epoch": 16.035203520352034, + "grad_norm": 0.012955917976796627, + "learning_rate": 5.750822349761345e-06, + "loss": 0.0033, + "num_input_tokens_seen": 30760896, + "step": 145760 + }, + { + "epoch": 16.035753575357536, + "grad_norm": 0.4711466133594513, + "learning_rate": 5.749290994659495e-06, + "loss": 0.0424, + "num_input_tokens_seen": 30761856, + "step": 145765 + }, + { + "epoch": 16.036303630363037, + "grad_norm": 0.014606518670916557, + "learning_rate": 5.74775981698219e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30762976, + "step": 145770 + }, + { + "epoch": 16.036853685368538, + "grad_norm": 0.1824112832546234, + "learning_rate": 5.7462288167435455e-06, + "loss": 0.0948, + "num_input_tokens_seen": 30764096, + "step": 145775 + }, + { + "epoch": 16.03740374037404, + "grad_norm": 0.24697065353393555, + "learning_rate": 5.744697993957679e-06, + "loss": 0.0239, + "num_input_tokens_seen": 30765152, + "step": 145780 + }, + { + "epoch": 16.037953795379536, + "grad_norm": 0.02940761111676693, + "learning_rate": 5.743167348638689e-06, + "loss": 0.0067, + "num_input_tokens_seen": 30766208, + "step": 145785 + }, + { + "epoch": 16.038503850385037, + "grad_norm": 0.007560229394584894, + "learning_rate": 5.74163688080068e-06, + "loss": 0.0045, + "num_input_tokens_seen": 30767296, + "step": 145790 + }, + { + "epoch": 16.03905390539054, + "grad_norm": 0.007300166878849268, + "learning_rate": 5.740106590457764e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30768320, + "step": 145795 + }, + { + "epoch": 16.03960396039604, + "grad_norm": 0.014719286002218723, + "learning_rate": 5.738576477624041e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30769376, + "step": 145800 + }, + { + "epoch": 16.04015401540154, + "grad_norm": 0.0043091680854558945, + "learning_rate": 5.737046542313626e-06, + "loss": 0.2391, + "num_input_tokens_seen": 30770432, + "step": 145805 + }, + { + "epoch": 16.040704070407042, + "grad_norm": 2.0535693168640137, + "learning_rate": 5.73551678454061e-06, + "loss": 0.0106, + "num_input_tokens_seen": 30771456, + "step": 145810 + }, + { + "epoch": 16.041254125412543, + "grad_norm": 0.008340829983353615, + "learning_rate": 5.733987204319083e-06, + "loss": 0.0006, + "num_input_tokens_seen": 30772448, + "step": 145815 + }, + { + "epoch": 16.04180418041804, + "grad_norm": 0.16200481355190277, + "learning_rate": 5.732457801663158e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30773504, + "step": 145820 + }, + { + "epoch": 16.04235423542354, + "grad_norm": 0.006953815929591656, + "learning_rate": 5.730928576586917e-06, + "loss": 0.0922, + "num_input_tokens_seen": 30774496, + "step": 145825 + }, + { + "epoch": 16.042904290429043, + "grad_norm": 3.8703975677490234, + "learning_rate": 5.7293995291044595e-06, + "loss": 0.1368, + "num_input_tokens_seen": 30775520, + "step": 145830 + }, + { + "epoch": 16.043454345434544, + "grad_norm": 2.125471830368042, + "learning_rate": 5.727870659229889e-06, + "loss": 0.0886, + "num_input_tokens_seen": 30776576, + "step": 145835 + }, + { + "epoch": 16.044004400440045, + "grad_norm": 0.12424977123737335, + "learning_rate": 5.726341966977281e-06, + "loss": 0.0034, + "num_input_tokens_seen": 30777632, + "step": 145840 + }, + { + "epoch": 16.044554455445546, + "grad_norm": 0.040160421282052994, + "learning_rate": 5.724813452360736e-06, + "loss": 0.0488, + "num_input_tokens_seen": 30778688, + "step": 145845 + }, + { + "epoch": 16.045104510451043, + "grad_norm": 0.024239251390099525, + "learning_rate": 5.723285115394328e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30779712, + "step": 145850 + }, + { + "epoch": 16.045654565456545, + "grad_norm": 0.007407397031784058, + "learning_rate": 5.721756956092153e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30780832, + "step": 145855 + }, + { + "epoch": 16.046204620462046, + "grad_norm": 0.025967882946133614, + "learning_rate": 5.7202289744683e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30781920, + "step": 145860 + }, + { + "epoch": 16.046754675467547, + "grad_norm": 0.009010318666696548, + "learning_rate": 5.718701170536839e-06, + "loss": 0.093, + "num_input_tokens_seen": 30782976, + "step": 145865 + }, + { + "epoch": 16.047304730473048, + "grad_norm": 0.03895290940999985, + "learning_rate": 5.717173544311863e-06, + "loss": 0.0026, + "num_input_tokens_seen": 30784032, + "step": 145870 + }, + { + "epoch": 16.04785478547855, + "grad_norm": 0.06804215908050537, + "learning_rate": 5.715646095807439e-06, + "loss": 0.0024, + "num_input_tokens_seen": 30785088, + "step": 145875 + }, + { + "epoch": 16.04840484048405, + "grad_norm": 0.00918705202639103, + "learning_rate": 5.714118825037659e-06, + "loss": 0.0073, + "num_input_tokens_seen": 30786144, + "step": 145880 + }, + { + "epoch": 16.048954895489548, + "grad_norm": 4.619109630584717, + "learning_rate": 5.712591732016584e-06, + "loss": 0.0678, + "num_input_tokens_seen": 30787232, + "step": 145885 + }, + { + "epoch": 16.04950495049505, + "grad_norm": 0.009434837847948074, + "learning_rate": 5.711064816758297e-06, + "loss": 0.0042, + "num_input_tokens_seen": 30788256, + "step": 145890 + }, + { + "epoch": 16.05005500550055, + "grad_norm": 0.17780256271362305, + "learning_rate": 5.709538079276877e-06, + "loss": 0.0099, + "num_input_tokens_seen": 30789248, + "step": 145895 + }, + { + "epoch": 16.05060506050605, + "grad_norm": 0.018753720447421074, + "learning_rate": 5.708011519586379e-06, + "loss": 0.0766, + "num_input_tokens_seen": 30790336, + "step": 145900 + }, + { + "epoch": 16.051155115511552, + "grad_norm": 0.01141467597335577, + "learning_rate": 5.70648513770089e-06, + "loss": 0.0707, + "num_input_tokens_seen": 30791392, + "step": 145905 + }, + { + "epoch": 16.051705170517053, + "grad_norm": 0.4219198524951935, + "learning_rate": 5.704958933634464e-06, + "loss": 0.0056, + "num_input_tokens_seen": 30792416, + "step": 145910 + }, + { + "epoch": 16.05225522552255, + "grad_norm": 0.037046223878860474, + "learning_rate": 5.703432907401171e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30793440, + "step": 145915 + }, + { + "epoch": 16.05280528052805, + "grad_norm": 1.7849403619766235, + "learning_rate": 5.7019070590150855e-06, + "loss": 0.0223, + "num_input_tokens_seen": 30794560, + "step": 145920 + }, + { + "epoch": 16.053355335533553, + "grad_norm": 0.01800612360239029, + "learning_rate": 5.700381388490253e-06, + "loss": 0.003, + "num_input_tokens_seen": 30795616, + "step": 145925 + }, + { + "epoch": 16.053905390539054, + "grad_norm": 0.015403236262500286, + "learning_rate": 5.698855895840754e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30796736, + "step": 145930 + }, + { + "epoch": 16.054455445544555, + "grad_norm": 0.012041947804391384, + "learning_rate": 5.697330581080637e-06, + "loss": 0.0317, + "num_input_tokens_seen": 30797696, + "step": 145935 + }, + { + "epoch": 16.055005500550056, + "grad_norm": 0.008947579190135002, + "learning_rate": 5.695805444223956e-06, + "loss": 0.001, + "num_input_tokens_seen": 30798752, + "step": 145940 + }, + { + "epoch": 16.055555555555557, + "grad_norm": 0.1436518430709839, + "learning_rate": 5.6942804852847726e-06, + "loss": 0.0028, + "num_input_tokens_seen": 30799776, + "step": 145945 + }, + { + "epoch": 16.056105610561055, + "grad_norm": 0.5629875659942627, + "learning_rate": 5.692755704277148e-06, + "loss": 0.0058, + "num_input_tokens_seen": 30800800, + "step": 145950 + }, + { + "epoch": 16.056655665566556, + "grad_norm": 0.0032016607001423836, + "learning_rate": 5.691231101215125e-06, + "loss": 0.0049, + "num_input_tokens_seen": 30801888, + "step": 145955 + }, + { + "epoch": 16.057205720572057, + "grad_norm": 3.4062206745147705, + "learning_rate": 5.689706676112766e-06, + "loss": 0.1073, + "num_input_tokens_seen": 30802944, + "step": 145960 + }, + { + "epoch": 16.057755775577558, + "grad_norm": 1.0654736757278442, + "learning_rate": 5.688182428984107e-06, + "loss": 0.0234, + "num_input_tokens_seen": 30803968, + "step": 145965 + }, + { + "epoch": 16.05830583058306, + "grad_norm": 0.08974550664424896, + "learning_rate": 5.686658359843203e-06, + "loss": 0.0037, + "num_input_tokens_seen": 30805024, + "step": 145970 + }, + { + "epoch": 16.05885588558856, + "grad_norm": 0.03417569771409035, + "learning_rate": 5.685134468704109e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30806112, + "step": 145975 + }, + { + "epoch": 16.059405940594058, + "grad_norm": 3.003647565841675, + "learning_rate": 5.6836107555808565e-06, + "loss": 0.0829, + "num_input_tokens_seen": 30807168, + "step": 145980 + }, + { + "epoch": 16.05995599559956, + "grad_norm": 0.03403891995549202, + "learning_rate": 5.6820872204875005e-06, + "loss": 0.0006, + "num_input_tokens_seen": 30808224, + "step": 145985 + }, + { + "epoch": 16.06050605060506, + "grad_norm": 0.012959790416061878, + "learning_rate": 5.680563863438071e-06, + "loss": 0.0112, + "num_input_tokens_seen": 30809312, + "step": 145990 + }, + { + "epoch": 16.06105610561056, + "grad_norm": 0.5167398452758789, + "learning_rate": 5.679040684446621e-06, + "loss": 0.0094, + "num_input_tokens_seen": 30810368, + "step": 145995 + }, + { + "epoch": 16.061606160616062, + "grad_norm": 0.3824525773525238, + "learning_rate": 5.677517683527172e-06, + "loss": 0.0133, + "num_input_tokens_seen": 30811392, + "step": 146000 + }, + { + "epoch": 16.062156215621563, + "grad_norm": 0.10048521310091019, + "learning_rate": 5.675994860693773e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30812448, + "step": 146005 + }, + { + "epoch": 16.062706270627064, + "grad_norm": 0.08043725788593292, + "learning_rate": 5.6744722159604635e-06, + "loss": 0.0034, + "num_input_tokens_seen": 30813472, + "step": 146010 + }, + { + "epoch": 16.063256325632562, + "grad_norm": 0.013369455002248287, + "learning_rate": 5.672949749341264e-06, + "loss": 0.0067, + "num_input_tokens_seen": 30814464, + "step": 146015 + }, + { + "epoch": 16.063806380638063, + "grad_norm": 0.07902584969997406, + "learning_rate": 5.671427460850221e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30815488, + "step": 146020 + }, + { + "epoch": 16.064356435643564, + "grad_norm": 0.002103643724694848, + "learning_rate": 5.669905350501348e-06, + "loss": 0.0251, + "num_input_tokens_seen": 30816608, + "step": 146025 + }, + { + "epoch": 16.064906490649065, + "grad_norm": 0.013546889647841454, + "learning_rate": 5.668383418308681e-06, + "loss": 0.0044, + "num_input_tokens_seen": 30817664, + "step": 146030 + }, + { + "epoch": 16.065456545654566, + "grad_norm": 0.012800855562090874, + "learning_rate": 5.6668616642862545e-06, + "loss": 0.028, + "num_input_tokens_seen": 30818720, + "step": 146035 + }, + { + "epoch": 16.066006600660067, + "grad_norm": 0.005212446674704552, + "learning_rate": 5.665340088448082e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30819840, + "step": 146040 + }, + { + "epoch": 16.066556655665565, + "grad_norm": 0.8110535740852356, + "learning_rate": 5.663818690808198e-06, + "loss": 0.0085, + "num_input_tokens_seen": 30820928, + "step": 146045 + }, + { + "epoch": 16.067106710671066, + "grad_norm": 0.016723403707146645, + "learning_rate": 5.6622974713806174e-06, + "loss": 0.1317, + "num_input_tokens_seen": 30822016, + "step": 146050 + }, + { + "epoch": 16.067656765676567, + "grad_norm": 0.007714204024523497, + "learning_rate": 5.660776430179357e-06, + "loss": 0.0024, + "num_input_tokens_seen": 30823072, + "step": 146055 + }, + { + "epoch": 16.068206820682068, + "grad_norm": 0.14014823734760284, + "learning_rate": 5.659255567218441e-06, + "loss": 0.0431, + "num_input_tokens_seen": 30824128, + "step": 146060 + }, + { + "epoch": 16.06875687568757, + "grad_norm": 0.09099100530147552, + "learning_rate": 5.657734882511883e-06, + "loss": 0.0895, + "num_input_tokens_seen": 30825152, + "step": 146065 + }, + { + "epoch": 16.06930693069307, + "grad_norm": 0.0023939863312989473, + "learning_rate": 5.6562143760737094e-06, + "loss": 0.0006, + "num_input_tokens_seen": 30826272, + "step": 146070 + }, + { + "epoch": 16.06985698569857, + "grad_norm": 0.0057060010731220245, + "learning_rate": 5.6546940479179274e-06, + "loss": 0.001, + "num_input_tokens_seen": 30827392, + "step": 146075 + }, + { + "epoch": 16.07040704070407, + "grad_norm": 0.011500022374093533, + "learning_rate": 5.653173898058539e-06, + "loss": 0.0092, + "num_input_tokens_seen": 30828416, + "step": 146080 + }, + { + "epoch": 16.07095709570957, + "grad_norm": 0.005405231844633818, + "learning_rate": 5.651653926509564e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30829504, + "step": 146085 + }, + { + "epoch": 16.07150715071507, + "grad_norm": 0.0017826947150751948, + "learning_rate": 5.6501341332850126e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30830592, + "step": 146090 + }, + { + "epoch": 16.072057205720572, + "grad_norm": 0.004923895932734013, + "learning_rate": 5.648614518398892e-06, + "loss": 0.0033, + "num_input_tokens_seen": 30831648, + "step": 146095 + }, + { + "epoch": 16.072607260726073, + "grad_norm": 0.004260456655174494, + "learning_rate": 5.6470950818652085e-06, + "loss": 0.0606, + "num_input_tokens_seen": 30832640, + "step": 146100 + }, + { + "epoch": 16.073157315731574, + "grad_norm": 0.16502605378627777, + "learning_rate": 5.6455758236979575e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30833696, + "step": 146105 + }, + { + "epoch": 16.073707370737075, + "grad_norm": 0.003559795906767249, + "learning_rate": 5.644056743911153e-06, + "loss": 0.0041, + "num_input_tokens_seen": 30834720, + "step": 146110 + }, + { + "epoch": 16.074257425742573, + "grad_norm": 0.021572522819042206, + "learning_rate": 5.64253784251878e-06, + "loss": 0.002, + "num_input_tokens_seen": 30835744, + "step": 146115 + }, + { + "epoch": 16.074807480748074, + "grad_norm": 0.19760173559188843, + "learning_rate": 5.641019119534849e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30836800, + "step": 146120 + }, + { + "epoch": 16.075357535753575, + "grad_norm": 0.004596361890435219, + "learning_rate": 5.639500574973363e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30837952, + "step": 146125 + }, + { + "epoch": 16.075907590759076, + "grad_norm": 0.03974435105919838, + "learning_rate": 5.6379822088483025e-06, + "loss": 0.095, + "num_input_tokens_seen": 30838976, + "step": 146130 + }, + { + "epoch": 16.076457645764577, + "grad_norm": 0.023011568933725357, + "learning_rate": 5.6364640211736765e-06, + "loss": 0.0377, + "num_input_tokens_seen": 30840000, + "step": 146135 + }, + { + "epoch": 16.07700770077008, + "grad_norm": 0.015619839541614056, + "learning_rate": 5.634946011963471e-06, + "loss": 0.0051, + "num_input_tokens_seen": 30841088, + "step": 146140 + }, + { + "epoch": 16.077557755775576, + "grad_norm": 0.010462184436619282, + "learning_rate": 5.633428181231662e-06, + "loss": 0.0368, + "num_input_tokens_seen": 30842176, + "step": 146145 + }, + { + "epoch": 16.078107810781077, + "grad_norm": 0.08801236748695374, + "learning_rate": 5.6319105289922655e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30843296, + "step": 146150 + }, + { + "epoch": 16.078657865786578, + "grad_norm": 0.07259967178106308, + "learning_rate": 5.630393055259247e-06, + "loss": 0.0011, + "num_input_tokens_seen": 30844352, + "step": 146155 + }, + { + "epoch": 16.07920792079208, + "grad_norm": 0.016594870015978813, + "learning_rate": 5.6288757600466094e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30845344, + "step": 146160 + }, + { + "epoch": 16.07975797579758, + "grad_norm": 0.005368737503886223, + "learning_rate": 5.62735864336833e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30846336, + "step": 146165 + }, + { + "epoch": 16.08030803080308, + "grad_norm": 0.004975243471562862, + "learning_rate": 5.6258417052383826e-06, + "loss": 0.0005, + "num_input_tokens_seen": 30847328, + "step": 146170 + }, + { + "epoch": 16.080858085808583, + "grad_norm": 0.14826984703540802, + "learning_rate": 5.624324945670756e-06, + "loss": 0.1087, + "num_input_tokens_seen": 30848384, + "step": 146175 + }, + { + "epoch": 16.08140814081408, + "grad_norm": 0.0316118560731411, + "learning_rate": 5.622808364679427e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30849504, + "step": 146180 + }, + { + "epoch": 16.08195819581958, + "grad_norm": 0.1950213760137558, + "learning_rate": 5.621291962278385e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30850560, + "step": 146185 + }, + { + "epoch": 16.082508250825082, + "grad_norm": 0.022379610687494278, + "learning_rate": 5.619775738481595e-06, + "loss": 0.0003, + "num_input_tokens_seen": 30851616, + "step": 146190 + }, + { + "epoch": 16.083058305830583, + "grad_norm": 0.15040212869644165, + "learning_rate": 5.618259693303024e-06, + "loss": 0.0186, + "num_input_tokens_seen": 30852640, + "step": 146195 + }, + { + "epoch": 16.083608360836084, + "grad_norm": 0.07505913078784943, + "learning_rate": 5.6167438267566625e-06, + "loss": 0.0519, + "num_input_tokens_seen": 30853696, + "step": 146200 + }, + { + "epoch": 16.084158415841586, + "grad_norm": 4.281711101531982, + "learning_rate": 5.615228138856457e-06, + "loss": 0.0422, + "num_input_tokens_seen": 30854688, + "step": 146205 + }, + { + "epoch": 16.084708470847083, + "grad_norm": 0.08786498755216599, + "learning_rate": 5.613712629616408e-06, + "loss": 0.0102, + "num_input_tokens_seen": 30855744, + "step": 146210 + }, + { + "epoch": 16.085258525852584, + "grad_norm": 0.08311157673597336, + "learning_rate": 5.612197299050464e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30856736, + "step": 146215 + }, + { + "epoch": 16.085808580858085, + "grad_norm": 1.0903854370117188, + "learning_rate": 5.610682147172588e-06, + "loss": 0.0678, + "num_input_tokens_seen": 30857856, + "step": 146220 + }, + { + "epoch": 16.086358635863586, + "grad_norm": 0.08054650574922562, + "learning_rate": 5.60916717399676e-06, + "loss": 0.0313, + "num_input_tokens_seen": 30858976, + "step": 146225 + }, + { + "epoch": 16.086908690869087, + "grad_norm": 0.003921315539628267, + "learning_rate": 5.607652379536924e-06, + "loss": 0.0009, + "num_input_tokens_seen": 30860032, + "step": 146230 + }, + { + "epoch": 16.08745874587459, + "grad_norm": 0.06765656918287277, + "learning_rate": 5.606137763807051e-06, + "loss": 0.1143, + "num_input_tokens_seen": 30861088, + "step": 146235 + }, + { + "epoch": 16.08800880088009, + "grad_norm": 3.7035443782806396, + "learning_rate": 5.604623326821109e-06, + "loss": 0.1719, + "num_input_tokens_seen": 30862144, + "step": 146240 + }, + { + "epoch": 16.088558855885587, + "grad_norm": 0.00643151206895709, + "learning_rate": 5.6031090685930375e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30863200, + "step": 146245 + }, + { + "epoch": 16.08910891089109, + "grad_norm": 0.04260263592004776, + "learning_rate": 5.601594989136813e-06, + "loss": 0.0023, + "num_input_tokens_seen": 30864288, + "step": 146250 + }, + { + "epoch": 16.08965896589659, + "grad_norm": 0.05455968528985977, + "learning_rate": 5.600081088466369e-06, + "loss": 0.0641, + "num_input_tokens_seen": 30865440, + "step": 146255 + }, + { + "epoch": 16.09020902090209, + "grad_norm": 0.004894298035651445, + "learning_rate": 5.598567366595675e-06, + "loss": 0.0045, + "num_input_tokens_seen": 30866528, + "step": 146260 + }, + { + "epoch": 16.09075907590759, + "grad_norm": 0.03197915107011795, + "learning_rate": 5.597053823538672e-06, + "loss": 0.0059, + "num_input_tokens_seen": 30867488, + "step": 146265 + }, + { + "epoch": 16.091309130913093, + "grad_norm": 0.03430332988500595, + "learning_rate": 5.595540459309312e-06, + "loss": 0.0978, + "num_input_tokens_seen": 30868608, + "step": 146270 + }, + { + "epoch": 16.09185918591859, + "grad_norm": 0.033251386135816574, + "learning_rate": 5.594027273921551e-06, + "loss": 0.0005, + "num_input_tokens_seen": 30869600, + "step": 146275 + }, + { + "epoch": 16.09240924092409, + "grad_norm": 0.09459128230810165, + "learning_rate": 5.5925142673893205e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30870624, + "step": 146280 + }, + { + "epoch": 16.092959295929592, + "grad_norm": 0.01932551898062229, + "learning_rate": 5.591001439726582e-06, + "loss": 0.0357, + "num_input_tokens_seen": 30871776, + "step": 146285 + }, + { + "epoch": 16.093509350935093, + "grad_norm": 0.0240633524954319, + "learning_rate": 5.589488790947265e-06, + "loss": 0.0019, + "num_input_tokens_seen": 30872864, + "step": 146290 + }, + { + "epoch": 16.094059405940595, + "grad_norm": 0.2158791869878769, + "learning_rate": 5.587976321065311e-06, + "loss": 0.0293, + "num_input_tokens_seen": 30873888, + "step": 146295 + }, + { + "epoch": 16.094609460946096, + "grad_norm": 0.10211247205734253, + "learning_rate": 5.586464030094674e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30874880, + "step": 146300 + }, + { + "epoch": 16.095159515951597, + "grad_norm": 0.012225991114974022, + "learning_rate": 5.584951918049275e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30875904, + "step": 146305 + }, + { + "epoch": 16.095709570957094, + "grad_norm": 0.2023906260728836, + "learning_rate": 5.5834399849430644e-06, + "loss": 0.0927, + "num_input_tokens_seen": 30876960, + "step": 146310 + }, + { + "epoch": 16.096259625962595, + "grad_norm": 0.015070915222167969, + "learning_rate": 5.581928230789973e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30878016, + "step": 146315 + }, + { + "epoch": 16.096809680968097, + "grad_norm": 3.0928781032562256, + "learning_rate": 5.580416655603915e-06, + "loss": 0.0287, + "num_input_tokens_seen": 30879072, + "step": 146320 + }, + { + "epoch": 16.097359735973598, + "grad_norm": 0.07045100629329681, + "learning_rate": 5.578905259398853e-06, + "loss": 0.0055, + "num_input_tokens_seen": 30880128, + "step": 146325 + }, + { + "epoch": 16.0979097909791, + "grad_norm": 0.0012373313074931502, + "learning_rate": 5.577394042188702e-06, + "loss": 0.0737, + "num_input_tokens_seen": 30881216, + "step": 146330 + }, + { + "epoch": 16.0984598459846, + "grad_norm": 0.006045331712812185, + "learning_rate": 5.5758830039873835e-06, + "loss": 0.0587, + "num_input_tokens_seen": 30882272, + "step": 146335 + }, + { + "epoch": 16.099009900990097, + "grad_norm": 0.009996484965085983, + "learning_rate": 5.574372144808837e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30883296, + "step": 146340 + }, + { + "epoch": 16.0995599559956, + "grad_norm": 0.015035013668239117, + "learning_rate": 5.572861464666976e-06, + "loss": 0.0033, + "num_input_tokens_seen": 30884384, + "step": 146345 + }, + { + "epoch": 16.1001100110011, + "grad_norm": 0.4056529700756073, + "learning_rate": 5.571350963575728e-06, + "loss": 0.0164, + "num_input_tokens_seen": 30885472, + "step": 146350 + }, + { + "epoch": 16.1006600660066, + "grad_norm": 0.011349194683134556, + "learning_rate": 5.5698406415490205e-06, + "loss": 0.0025, + "num_input_tokens_seen": 30886432, + "step": 146355 + }, + { + "epoch": 16.1012101210121, + "grad_norm": 0.006535420659929514, + "learning_rate": 5.568330498600763e-06, + "loss": 0.02, + "num_input_tokens_seen": 30887456, + "step": 146360 + }, + { + "epoch": 16.101760176017603, + "grad_norm": 1.8082237243652344, + "learning_rate": 5.5668205347448875e-06, + "loss": 0.0584, + "num_input_tokens_seen": 30888512, + "step": 146365 + }, + { + "epoch": 16.102310231023104, + "grad_norm": 0.04206382855772972, + "learning_rate": 5.565310749995292e-06, + "loss": 0.0276, + "num_input_tokens_seen": 30889600, + "step": 146370 + }, + { + "epoch": 16.1028602860286, + "grad_norm": 0.031074173748493195, + "learning_rate": 5.56380114436591e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30890656, + "step": 146375 + }, + { + "epoch": 16.103410341034103, + "grad_norm": 2.6353182792663574, + "learning_rate": 5.562291717870641e-06, + "loss": 0.0648, + "num_input_tokens_seen": 30891680, + "step": 146380 + }, + { + "epoch": 16.103960396039604, + "grad_norm": 0.07404723763465881, + "learning_rate": 5.5607824705234015e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30892736, + "step": 146385 + }, + { + "epoch": 16.104510451045105, + "grad_norm": 1.0533790588378906, + "learning_rate": 5.559273402338109e-06, + "loss": 0.0229, + "num_input_tokens_seen": 30893760, + "step": 146390 + }, + { + "epoch": 16.105060506050606, + "grad_norm": 0.09772493690252304, + "learning_rate": 5.557764513328656e-06, + "loss": 0.1063, + "num_input_tokens_seen": 30894752, + "step": 146395 + }, + { + "epoch": 16.105610561056107, + "grad_norm": 0.07044357061386108, + "learning_rate": 5.556255803508967e-06, + "loss": 0.002, + "num_input_tokens_seen": 30895840, + "step": 146400 + }, + { + "epoch": 16.106160616061604, + "grad_norm": 0.02494901791214943, + "learning_rate": 5.554747272892929e-06, + "loss": 0.0007, + "num_input_tokens_seen": 30896896, + "step": 146405 + }, + { + "epoch": 16.106710671067106, + "grad_norm": 15.871684074401855, + "learning_rate": 5.553238921494458e-06, + "loss": 0.0516, + "num_input_tokens_seen": 30897920, + "step": 146410 + }, + { + "epoch": 16.107260726072607, + "grad_norm": 0.10814279317855835, + "learning_rate": 5.551730749327458e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30898944, + "step": 146415 + }, + { + "epoch": 16.107810781078108, + "grad_norm": 0.08990589529275894, + "learning_rate": 5.550222756405815e-06, + "loss": 0.0076, + "num_input_tokens_seen": 30900032, + "step": 146420 + }, + { + "epoch": 16.10836083608361, + "grad_norm": 0.004157517571002245, + "learning_rate": 5.5487149427434435e-06, + "loss": 0.059, + "num_input_tokens_seen": 30901056, + "step": 146425 + }, + { + "epoch": 16.10891089108911, + "grad_norm": 0.1464853435754776, + "learning_rate": 5.547207308354235e-06, + "loss": 0.0435, + "num_input_tokens_seen": 30902144, + "step": 146430 + }, + { + "epoch": 16.10946094609461, + "grad_norm": 0.011357210576534271, + "learning_rate": 5.545699853252073e-06, + "loss": 0.1324, + "num_input_tokens_seen": 30903200, + "step": 146435 + }, + { + "epoch": 16.11001100110011, + "grad_norm": 0.0012030202196910977, + "learning_rate": 5.54419257745086e-06, + "loss": 0.004, + "num_input_tokens_seen": 30904288, + "step": 146440 + }, + { + "epoch": 16.11056105610561, + "grad_norm": 0.008168213069438934, + "learning_rate": 5.54268548096449e-06, + "loss": 0.0056, + "num_input_tokens_seen": 30905312, + "step": 146445 + }, + { + "epoch": 16.11111111111111, + "grad_norm": 0.0041601830162107944, + "learning_rate": 5.541178563806859e-06, + "loss": 0.0022, + "num_input_tokens_seen": 30906368, + "step": 146450 + }, + { + "epoch": 16.111661166116612, + "grad_norm": 0.047454819083213806, + "learning_rate": 5.539671825991846e-06, + "loss": 0.0048, + "num_input_tokens_seen": 30907424, + "step": 146455 + }, + { + "epoch": 16.112211221122113, + "grad_norm": 0.48547127842903137, + "learning_rate": 5.538165267533335e-06, + "loss": 0.0176, + "num_input_tokens_seen": 30908416, + "step": 146460 + }, + { + "epoch": 16.112761276127614, + "grad_norm": 0.1318383365869522, + "learning_rate": 5.536658888445215e-06, + "loss": 0.1612, + "num_input_tokens_seen": 30909472, + "step": 146465 + }, + { + "epoch": 16.11331133113311, + "grad_norm": 0.0029181824065744877, + "learning_rate": 5.535152688741368e-06, + "loss": 0.0026, + "num_input_tokens_seen": 30910528, + "step": 146470 + }, + { + "epoch": 16.113861386138613, + "grad_norm": 0.003079896792769432, + "learning_rate": 5.533646668435688e-06, + "loss": 0.0728, + "num_input_tokens_seen": 30911584, + "step": 146475 + }, + { + "epoch": 16.114411441144114, + "grad_norm": 2.205986976623535, + "learning_rate": 5.532140827542046e-06, + "loss": 0.0972, + "num_input_tokens_seen": 30912704, + "step": 146480 + }, + { + "epoch": 16.114961496149615, + "grad_norm": 0.03186042979359627, + "learning_rate": 5.5306351660743135e-06, + "loss": 0.0669, + "num_input_tokens_seen": 30913728, + "step": 146485 + }, + { + "epoch": 16.115511551155116, + "grad_norm": 0.17309753596782684, + "learning_rate": 5.52912968404638e-06, + "loss": 0.0392, + "num_input_tokens_seen": 30914816, + "step": 146490 + }, + { + "epoch": 16.116061606160617, + "grad_norm": 0.011348546482622623, + "learning_rate": 5.52762438147211e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30915904, + "step": 146495 + }, + { + "epoch": 16.116611661166118, + "grad_norm": 0.06539541482925415, + "learning_rate": 5.526119258365381e-06, + "loss": 0.0166, + "num_input_tokens_seen": 30916960, + "step": 146500 + }, + { + "epoch": 16.117161716171616, + "grad_norm": 0.02257586643099785, + "learning_rate": 5.5246143147400735e-06, + "loss": 0.003, + "num_input_tokens_seen": 30918080, + "step": 146505 + }, + { + "epoch": 16.117711771177117, + "grad_norm": 0.008557386696338654, + "learning_rate": 5.523109550610045e-06, + "loss": 0.0424, + "num_input_tokens_seen": 30919136, + "step": 146510 + }, + { + "epoch": 16.118261826182618, + "grad_norm": 1.5757731199264526, + "learning_rate": 5.521604965989174e-06, + "loss": 0.0935, + "num_input_tokens_seen": 30920192, + "step": 146515 + }, + { + "epoch": 16.11881188118812, + "grad_norm": 2.5103893280029297, + "learning_rate": 5.520100560891317e-06, + "loss": 0.0147, + "num_input_tokens_seen": 30921184, + "step": 146520 + }, + { + "epoch": 16.11936193619362, + "grad_norm": 6.479368686676025, + "learning_rate": 5.518596335330348e-06, + "loss": 0.1632, + "num_input_tokens_seen": 30922304, + "step": 146525 + }, + { + "epoch": 16.11991199119912, + "grad_norm": 0.0536343939602375, + "learning_rate": 5.517092289320133e-06, + "loss": 0.0027, + "num_input_tokens_seen": 30923424, + "step": 146530 + }, + { + "epoch": 16.120462046204622, + "grad_norm": 0.022828327491879463, + "learning_rate": 5.515588422874527e-06, + "loss": 0.0307, + "num_input_tokens_seen": 30924416, + "step": 146535 + }, + { + "epoch": 16.12101210121012, + "grad_norm": 0.5569292306900024, + "learning_rate": 5.514084736007394e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30925440, + "step": 146540 + }, + { + "epoch": 16.12156215621562, + "grad_norm": 0.7432680726051331, + "learning_rate": 5.512581228732597e-06, + "loss": 0.0105, + "num_input_tokens_seen": 30926496, + "step": 146545 + }, + { + "epoch": 16.122112211221122, + "grad_norm": 0.23470552265644073, + "learning_rate": 5.511077901063977e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30927552, + "step": 146550 + }, + { + "epoch": 16.122662266226623, + "grad_norm": 0.08415202051401138, + "learning_rate": 5.509574753015401e-06, + "loss": 0.0014, + "num_input_tokens_seen": 30928608, + "step": 146555 + }, + { + "epoch": 16.123212321232124, + "grad_norm": 0.1734817922115326, + "learning_rate": 5.508071784600724e-06, + "loss": 0.0045, + "num_input_tokens_seen": 30929696, + "step": 146560 + }, + { + "epoch": 16.123762376237625, + "grad_norm": 0.011941778473556042, + "learning_rate": 5.5065689958338025e-06, + "loss": 0.0008, + "num_input_tokens_seen": 30930752, + "step": 146565 + }, + { + "epoch": 16.124312431243123, + "grad_norm": 0.004937020130455494, + "learning_rate": 5.50506638672848e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30931776, + "step": 146570 + }, + { + "epoch": 16.124862486248624, + "grad_norm": 0.007033574394881725, + "learning_rate": 5.503563957298599e-06, + "loss": 0.0826, + "num_input_tokens_seen": 30932832, + "step": 146575 + }, + { + "epoch": 16.125412541254125, + "grad_norm": 0.00504625728353858, + "learning_rate": 5.502061707558012e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30933824, + "step": 146580 + }, + { + "epoch": 16.125962596259626, + "grad_norm": 0.016280872747302055, + "learning_rate": 5.500559637520569e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30934880, + "step": 146585 + }, + { + "epoch": 16.126512651265127, + "grad_norm": 0.35631078481674194, + "learning_rate": 5.499057747200117e-06, + "loss": 0.0031, + "num_input_tokens_seen": 30935936, + "step": 146590 + }, + { + "epoch": 16.127062706270628, + "grad_norm": 1.047548532485962, + "learning_rate": 5.497556036610493e-06, + "loss": 0.0134, + "num_input_tokens_seen": 30936960, + "step": 146595 + }, + { + "epoch": 16.12761276127613, + "grad_norm": 0.10534606873989105, + "learning_rate": 5.49605450576553e-06, + "loss": 0.0016, + "num_input_tokens_seen": 30938112, + "step": 146600 + }, + { + "epoch": 16.128162816281627, + "grad_norm": 0.0052911206148564816, + "learning_rate": 5.49455315467908e-06, + "loss": 0.0057, + "num_input_tokens_seen": 30939168, + "step": 146605 + }, + { + "epoch": 16.128712871287128, + "grad_norm": 0.36640679836273193, + "learning_rate": 5.493051983364966e-06, + "loss": 0.1382, + "num_input_tokens_seen": 30940224, + "step": 146610 + }, + { + "epoch": 16.12926292629263, + "grad_norm": 0.03555416688323021, + "learning_rate": 5.491550991837033e-06, + "loss": 0.0252, + "num_input_tokens_seen": 30941312, + "step": 146615 + }, + { + "epoch": 16.12981298129813, + "grad_norm": 0.05732051283121109, + "learning_rate": 5.490050180109121e-06, + "loss": 0.0715, + "num_input_tokens_seen": 30942336, + "step": 146620 + }, + { + "epoch": 16.13036303630363, + "grad_norm": 0.06987003237009048, + "learning_rate": 5.488549548195046e-06, + "loss": 0.0845, + "num_input_tokens_seen": 30943424, + "step": 146625 + }, + { + "epoch": 16.130913091309132, + "grad_norm": 0.007487265393137932, + "learning_rate": 5.487049096108654e-06, + "loss": 0.0071, + "num_input_tokens_seen": 30944512, + "step": 146630 + }, + { + "epoch": 16.13146314631463, + "grad_norm": 0.025843659415841103, + "learning_rate": 5.485548823863762e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30945568, + "step": 146635 + }, + { + "epoch": 16.13201320132013, + "grad_norm": 0.0052637881599366665, + "learning_rate": 5.484048731474203e-06, + "loss": 0.0083, + "num_input_tokens_seen": 30946592, + "step": 146640 + }, + { + "epoch": 16.132563256325632, + "grad_norm": 0.07358801364898682, + "learning_rate": 5.482548818953806e-06, + "loss": 0.0026, + "num_input_tokens_seen": 30947584, + "step": 146645 + }, + { + "epoch": 16.133113311331133, + "grad_norm": 0.09027516096830368, + "learning_rate": 5.481049086316386e-06, + "loss": 0.0134, + "num_input_tokens_seen": 30948608, + "step": 146650 + }, + { + "epoch": 16.133663366336634, + "grad_norm": 0.14508624374866486, + "learning_rate": 5.479549533575778e-06, + "loss": 0.0544, + "num_input_tokens_seen": 30949728, + "step": 146655 + }, + { + "epoch": 16.134213421342135, + "grad_norm": 0.03761252760887146, + "learning_rate": 5.478050160745788e-06, + "loss": 0.0046, + "num_input_tokens_seen": 30950752, + "step": 146660 + }, + { + "epoch": 16.134763476347636, + "grad_norm": 0.12142663449048996, + "learning_rate": 5.476550967840249e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30951744, + "step": 146665 + }, + { + "epoch": 16.135313531353134, + "grad_norm": 0.013255652040243149, + "learning_rate": 5.475051954872964e-06, + "loss": 0.0041, + "num_input_tokens_seen": 30952800, + "step": 146670 + }, + { + "epoch": 16.135863586358635, + "grad_norm": 0.01980099268257618, + "learning_rate": 5.473553121857758e-06, + "loss": 0.1369, + "num_input_tokens_seen": 30953856, + "step": 146675 + }, + { + "epoch": 16.136413641364136, + "grad_norm": 0.2099795788526535, + "learning_rate": 5.472054468808449e-06, + "loss": 0.002, + "num_input_tokens_seen": 30954912, + "step": 146680 + }, + { + "epoch": 16.136963696369637, + "grad_norm": 0.009098120033740997, + "learning_rate": 5.470555995738835e-06, + "loss": 0.0761, + "num_input_tokens_seen": 30956000, + "step": 146685 + }, + { + "epoch": 16.13751375137514, + "grad_norm": 0.027427369728684425, + "learning_rate": 5.469057702662744e-06, + "loss": 0.0012, + "num_input_tokens_seen": 30957088, + "step": 146690 + }, + { + "epoch": 16.13806380638064, + "grad_norm": 0.010547314770519733, + "learning_rate": 5.467559589593968e-06, + "loss": 0.0145, + "num_input_tokens_seen": 30958176, + "step": 146695 + }, + { + "epoch": 16.138613861386137, + "grad_norm": 0.018917536363005638, + "learning_rate": 5.466061656546323e-06, + "loss": 0.0144, + "num_input_tokens_seen": 30959200, + "step": 146700 + }, + { + "epoch": 16.139163916391638, + "grad_norm": 0.9039666652679443, + "learning_rate": 5.464563903533623e-06, + "loss": 0.0381, + "num_input_tokens_seen": 30960256, + "step": 146705 + }, + { + "epoch": 16.13971397139714, + "grad_norm": 0.009921599179506302, + "learning_rate": 5.463066330569661e-06, + "loss": 0.1094, + "num_input_tokens_seen": 30961248, + "step": 146710 + }, + { + "epoch": 16.14026402640264, + "grad_norm": 0.016211455687880516, + "learning_rate": 5.461568937668238e-06, + "loss": 0.1158, + "num_input_tokens_seen": 30962336, + "step": 146715 + }, + { + "epoch": 16.14081408140814, + "grad_norm": 0.16444984078407288, + "learning_rate": 5.460071724843163e-06, + "loss": 0.003, + "num_input_tokens_seen": 30963392, + "step": 146720 + }, + { + "epoch": 16.141364136413642, + "grad_norm": 0.011632694862782955, + "learning_rate": 5.4585746921082235e-06, + "loss": 0.034, + "num_input_tokens_seen": 30964480, + "step": 146725 + }, + { + "epoch": 16.141914191419144, + "grad_norm": 0.01555539108812809, + "learning_rate": 5.4570778394772256e-06, + "loss": 0.008, + "num_input_tokens_seen": 30965504, + "step": 146730 + }, + { + "epoch": 16.14246424642464, + "grad_norm": 3.6748733520507812, + "learning_rate": 5.455581166963971e-06, + "loss": 0.097, + "num_input_tokens_seen": 30966560, + "step": 146735 + }, + { + "epoch": 16.143014301430142, + "grad_norm": 0.024981200695037842, + "learning_rate": 5.454084674582238e-06, + "loss": 0.0072, + "num_input_tokens_seen": 30967584, + "step": 146740 + }, + { + "epoch": 16.143564356435643, + "grad_norm": 0.26919427514076233, + "learning_rate": 5.452588362345837e-06, + "loss": 0.0803, + "num_input_tokens_seen": 30968704, + "step": 146745 + }, + { + "epoch": 16.144114411441144, + "grad_norm": 0.11879904568195343, + "learning_rate": 5.451092230268539e-06, + "loss": 0.0206, + "num_input_tokens_seen": 30969760, + "step": 146750 + }, + { + "epoch": 16.144664466446645, + "grad_norm": 1.3422306776046753, + "learning_rate": 5.449596278364147e-06, + "loss": 0.0095, + "num_input_tokens_seen": 30970752, + "step": 146755 + }, + { + "epoch": 16.145214521452147, + "grad_norm": 0.06290355324745178, + "learning_rate": 5.448100506646447e-06, + "loss": 0.0021, + "num_input_tokens_seen": 30971776, + "step": 146760 + }, + { + "epoch": 16.145764576457644, + "grad_norm": 0.029019495472311974, + "learning_rate": 5.446604915129219e-06, + "loss": 0.0736, + "num_input_tokens_seen": 30972736, + "step": 146765 + }, + { + "epoch": 16.146314631463145, + "grad_norm": 2.3411543369293213, + "learning_rate": 5.4451095038262595e-06, + "loss": 0.0745, + "num_input_tokens_seen": 30973824, + "step": 146770 + }, + { + "epoch": 16.146864686468646, + "grad_norm": 0.009303216822445393, + "learning_rate": 5.443614272751333e-06, + "loss": 0.0056, + "num_input_tokens_seen": 30974912, + "step": 146775 + }, + { + "epoch": 16.147414741474147, + "grad_norm": 2.3109540939331055, + "learning_rate": 5.442119221918238e-06, + "loss": 0.072, + "num_input_tokens_seen": 30975936, + "step": 146780 + }, + { + "epoch": 16.14796479647965, + "grad_norm": 0.18610678613185883, + "learning_rate": 5.440624351340737e-06, + "loss": 0.0552, + "num_input_tokens_seen": 30976960, + "step": 146785 + }, + { + "epoch": 16.14851485148515, + "grad_norm": 0.026364777237176895, + "learning_rate": 5.439129661032616e-06, + "loss": 0.0097, + "num_input_tokens_seen": 30977984, + "step": 146790 + }, + { + "epoch": 16.14906490649065, + "grad_norm": 0.034087106585502625, + "learning_rate": 5.437635151007658e-06, + "loss": 0.0036, + "num_input_tokens_seen": 30979072, + "step": 146795 + }, + { + "epoch": 16.149614961496148, + "grad_norm": 0.017187513411045074, + "learning_rate": 5.436140821279623e-06, + "loss": 0.0059, + "num_input_tokens_seen": 30980128, + "step": 146800 + }, + { + "epoch": 16.15016501650165, + "grad_norm": 0.051886457949876785, + "learning_rate": 5.434646671862298e-06, + "loss": 0.0072, + "num_input_tokens_seen": 30981248, + "step": 146805 + }, + { + "epoch": 16.15071507150715, + "grad_norm": 2.5073835849761963, + "learning_rate": 5.433152702769437e-06, + "loss": 0.104, + "num_input_tokens_seen": 30982304, + "step": 146810 + }, + { + "epoch": 16.15126512651265, + "grad_norm": 0.07867815345525742, + "learning_rate": 5.43165891401482e-06, + "loss": 0.0038, + "num_input_tokens_seen": 30983392, + "step": 146815 + }, + { + "epoch": 16.151815181518153, + "grad_norm": 0.035452187061309814, + "learning_rate": 5.43016530561222e-06, + "loss": 0.0321, + "num_input_tokens_seen": 30984480, + "step": 146820 + }, + { + "epoch": 16.152365236523654, + "grad_norm": 0.013515535742044449, + "learning_rate": 5.428671877575389e-06, + "loss": 0.0029, + "num_input_tokens_seen": 30985504, + "step": 146825 + }, + { + "epoch": 16.15291529152915, + "grad_norm": 0.030084751546382904, + "learning_rate": 5.427178629918106e-06, + "loss": 0.002, + "num_input_tokens_seen": 30986560, + "step": 146830 + }, + { + "epoch": 16.153465346534652, + "grad_norm": 0.022135848179459572, + "learning_rate": 5.425685562654123e-06, + "loss": 0.0068, + "num_input_tokens_seen": 30987584, + "step": 146835 + }, + { + "epoch": 16.154015401540153, + "grad_norm": 0.22187058627605438, + "learning_rate": 5.424192675797199e-06, + "loss": 0.0407, + "num_input_tokens_seen": 30988640, + "step": 146840 + }, + { + "epoch": 16.154565456545654, + "grad_norm": 0.03404994308948517, + "learning_rate": 5.422699969361094e-06, + "loss": 0.0035, + "num_input_tokens_seen": 30989664, + "step": 146845 + }, + { + "epoch": 16.155115511551156, + "grad_norm": 0.05447057634592056, + "learning_rate": 5.421207443359575e-06, + "loss": 0.0653, + "num_input_tokens_seen": 30990688, + "step": 146850 + }, + { + "epoch": 16.155665566556657, + "grad_norm": 0.11099174618721008, + "learning_rate": 5.4197150978063965e-06, + "loss": 0.004, + "num_input_tokens_seen": 30991808, + "step": 146855 + }, + { + "epoch": 16.156215621562158, + "grad_norm": 0.03421696648001671, + "learning_rate": 5.418222932715306e-06, + "loss": 0.0096, + "num_input_tokens_seen": 30992832, + "step": 146860 + }, + { + "epoch": 16.156765676567655, + "grad_norm": 0.05723109096288681, + "learning_rate": 5.416730948100054e-06, + "loss": 0.0124, + "num_input_tokens_seen": 30993792, + "step": 146865 + }, + { + "epoch": 16.157315731573156, + "grad_norm": 0.01115168072283268, + "learning_rate": 5.415239143974396e-06, + "loss": 0.0017, + "num_input_tokens_seen": 30994816, + "step": 146870 + }, + { + "epoch": 16.157865786578657, + "grad_norm": 0.22505266964435577, + "learning_rate": 5.41374752035208e-06, + "loss": 0.1078, + "num_input_tokens_seen": 30995872, + "step": 146875 + }, + { + "epoch": 16.15841584158416, + "grad_norm": 0.02830134704709053, + "learning_rate": 5.412256077246863e-06, + "loss": 0.0015, + "num_input_tokens_seen": 30996896, + "step": 146880 + }, + { + "epoch": 16.15896589658966, + "grad_norm": 0.34219440817832947, + "learning_rate": 5.410764814672481e-06, + "loss": 0.037, + "num_input_tokens_seen": 30997984, + "step": 146885 + }, + { + "epoch": 16.15951595159516, + "grad_norm": 0.003946194890886545, + "learning_rate": 5.409273732642675e-06, + "loss": 0.1258, + "num_input_tokens_seen": 30999072, + "step": 146890 + }, + { + "epoch": 16.16006600660066, + "grad_norm": 0.00860563199967146, + "learning_rate": 5.407782831171198e-06, + "loss": 0.001, + "num_input_tokens_seen": 31000160, + "step": 146895 + }, + { + "epoch": 16.16061606160616, + "grad_norm": 0.34402740001678467, + "learning_rate": 5.406292110271777e-06, + "loss": 0.0393, + "num_input_tokens_seen": 31001216, + "step": 146900 + }, + { + "epoch": 16.16116611661166, + "grad_norm": 0.4700954556465149, + "learning_rate": 5.404801569958162e-06, + "loss": 0.0479, + "num_input_tokens_seen": 31002240, + "step": 146905 + }, + { + "epoch": 16.16171617161716, + "grad_norm": 0.43687066435813904, + "learning_rate": 5.403311210244094e-06, + "loss": 0.0806, + "num_input_tokens_seen": 31003296, + "step": 146910 + }, + { + "epoch": 16.162266226622663, + "grad_norm": 0.03536655381321907, + "learning_rate": 5.401821031143298e-06, + "loss": 0.0018, + "num_input_tokens_seen": 31004448, + "step": 146915 + }, + { + "epoch": 16.162816281628164, + "grad_norm": 0.0215169508010149, + "learning_rate": 5.400331032669517e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31005440, + "step": 146920 + }, + { + "epoch": 16.163366336633665, + "grad_norm": 0.018920600414276123, + "learning_rate": 5.398841214836484e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31006496, + "step": 146925 + }, + { + "epoch": 16.163916391639162, + "grad_norm": 0.019867539405822754, + "learning_rate": 5.397351577657911e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31007520, + "step": 146930 + }, + { + "epoch": 16.164466446644663, + "grad_norm": 0.022676918655633926, + "learning_rate": 5.395862121147554e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31008576, + "step": 146935 + }, + { + "epoch": 16.165016501650165, + "grad_norm": 0.1611468344926834, + "learning_rate": 5.394372845319123e-06, + "loss": 0.0357, + "num_input_tokens_seen": 31009664, + "step": 146940 + }, + { + "epoch": 16.165566556655666, + "grad_norm": 0.10035502165555954, + "learning_rate": 5.392883750186356e-06, + "loss": 0.0476, + "num_input_tokens_seen": 31010720, + "step": 146945 + }, + { + "epoch": 16.166116611661167, + "grad_norm": 0.17307156324386597, + "learning_rate": 5.391394835762972e-06, + "loss": 0.0801, + "num_input_tokens_seen": 31011776, + "step": 146950 + }, + { + "epoch": 16.166666666666668, + "grad_norm": 0.021082093939185143, + "learning_rate": 5.3899061020626865e-06, + "loss": 0.0094, + "num_input_tokens_seen": 31012800, + "step": 146955 + }, + { + "epoch": 16.16721672167217, + "grad_norm": 0.05257810652256012, + "learning_rate": 5.388417549099226e-06, + "loss": 0.0006, + "num_input_tokens_seen": 31013920, + "step": 146960 + }, + { + "epoch": 16.167766776677666, + "grad_norm": 0.02782152034342289, + "learning_rate": 5.38692917688631e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31014944, + "step": 146965 + }, + { + "epoch": 16.168316831683168, + "grad_norm": 3.213193893432617, + "learning_rate": 5.3854409854376645e-06, + "loss": 0.0108, + "num_input_tokens_seen": 31016032, + "step": 146970 + }, + { + "epoch": 16.16886688668867, + "grad_norm": 0.030264312401413918, + "learning_rate": 5.3839529747669954e-06, + "loss": 0.0057, + "num_input_tokens_seen": 31017120, + "step": 146975 + }, + { + "epoch": 16.16941694169417, + "grad_norm": 0.0038829450495541096, + "learning_rate": 5.382465144888016e-06, + "loss": 0.125, + "num_input_tokens_seen": 31018176, + "step": 146980 + }, + { + "epoch": 16.16996699669967, + "grad_norm": 0.15780891478061676, + "learning_rate": 5.380977495814446e-06, + "loss": 0.0183, + "num_input_tokens_seen": 31019232, + "step": 146985 + }, + { + "epoch": 16.170517051705172, + "grad_norm": 0.020915942266583443, + "learning_rate": 5.379490027559977e-06, + "loss": 0.0016, + "num_input_tokens_seen": 31020320, + "step": 146990 + }, + { + "epoch": 16.17106710671067, + "grad_norm": 0.1462419480085373, + "learning_rate": 5.3780027401383495e-06, + "loss": 0.0277, + "num_input_tokens_seen": 31021408, + "step": 146995 + }, + { + "epoch": 16.17161716171617, + "grad_norm": 0.006004914175719023, + "learning_rate": 5.376515633563256e-06, + "loss": 0.0397, + "num_input_tokens_seen": 31022432, + "step": 147000 + }, + { + "epoch": 16.17216721672167, + "grad_norm": 0.11032981425523758, + "learning_rate": 5.375028707848393e-06, + "loss": 0.014, + "num_input_tokens_seen": 31023520, + "step": 147005 + }, + { + "epoch": 16.172717271727173, + "grad_norm": 1.6361290216445923, + "learning_rate": 5.37354196300748e-06, + "loss": 0.0111, + "num_input_tokens_seen": 31024576, + "step": 147010 + }, + { + "epoch": 16.173267326732674, + "grad_norm": 0.020154422149062157, + "learning_rate": 5.372055399054207e-06, + "loss": 0.0827, + "num_input_tokens_seen": 31025632, + "step": 147015 + }, + { + "epoch": 16.173817381738175, + "grad_norm": 1.8949143886566162, + "learning_rate": 5.37056901600228e-06, + "loss": 0.0613, + "num_input_tokens_seen": 31026656, + "step": 147020 + }, + { + "epoch": 16.174367436743676, + "grad_norm": 0.004902952816337347, + "learning_rate": 5.369082813865409e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31027680, + "step": 147025 + }, + { + "epoch": 16.174917491749174, + "grad_norm": 0.32708463072776794, + "learning_rate": 5.367596792657273e-06, + "loss": 0.0075, + "num_input_tokens_seen": 31028800, + "step": 147030 + }, + { + "epoch": 16.175467546754675, + "grad_norm": 0.01878707855939865, + "learning_rate": 5.366110952391584e-06, + "loss": 0.0076, + "num_input_tokens_seen": 31029824, + "step": 147035 + }, + { + "epoch": 16.176017601760176, + "grad_norm": 0.009588266722857952, + "learning_rate": 5.36462529308202e-06, + "loss": 0.0048, + "num_input_tokens_seen": 31030848, + "step": 147040 + }, + { + "epoch": 16.176567656765677, + "grad_norm": 0.0009475243859924376, + "learning_rate": 5.363139814742293e-06, + "loss": 0.0006, + "num_input_tokens_seen": 31031936, + "step": 147045 + }, + { + "epoch": 16.177117711771178, + "grad_norm": 0.19560116529464722, + "learning_rate": 5.361654517386075e-06, + "loss": 0.0999, + "num_input_tokens_seen": 31032992, + "step": 147050 + }, + { + "epoch": 16.17766776677668, + "grad_norm": 0.7037792205810547, + "learning_rate": 5.3601694010270655e-06, + "loss": 0.01, + "num_input_tokens_seen": 31034080, + "step": 147055 + }, + { + "epoch": 16.178217821782177, + "grad_norm": 0.045100413262844086, + "learning_rate": 5.358684465678956e-06, + "loss": 0.0032, + "num_input_tokens_seen": 31035136, + "step": 147060 + }, + { + "epoch": 16.178767876787678, + "grad_norm": 0.022760484367609024, + "learning_rate": 5.3571997113554206e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31036160, + "step": 147065 + }, + { + "epoch": 16.17931793179318, + "grad_norm": 0.054471638053655624, + "learning_rate": 5.355715138070155e-06, + "loss": 0.0041, + "num_input_tokens_seen": 31037184, + "step": 147070 + }, + { + "epoch": 16.17986798679868, + "grad_norm": 0.06294059008359909, + "learning_rate": 5.354230745836833e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31038240, + "step": 147075 + }, + { + "epoch": 16.18041804180418, + "grad_norm": 0.029207266867160797, + "learning_rate": 5.352746534669137e-06, + "loss": 0.1027, + "num_input_tokens_seen": 31039232, + "step": 147080 + }, + { + "epoch": 16.180968096809682, + "grad_norm": 0.06697630137205124, + "learning_rate": 5.3512625045807565e-06, + "loss": 0.0092, + "num_input_tokens_seen": 31040288, + "step": 147085 + }, + { + "epoch": 16.181518151815183, + "grad_norm": 0.018110739067196846, + "learning_rate": 5.349778655585361e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31041344, + "step": 147090 + }, + { + "epoch": 16.18206820682068, + "grad_norm": 0.132238507270813, + "learning_rate": 5.34829498769662e-06, + "loss": 0.016, + "num_input_tokens_seen": 31042432, + "step": 147095 + }, + { + "epoch": 16.182618261826182, + "grad_norm": 0.23239746689796448, + "learning_rate": 5.34681150092822e-06, + "loss": 0.0063, + "num_input_tokens_seen": 31043456, + "step": 147100 + }, + { + "epoch": 16.183168316831683, + "grad_norm": 0.12211668491363525, + "learning_rate": 5.345328195293814e-06, + "loss": 0.0024, + "num_input_tokens_seen": 31044416, + "step": 147105 + }, + { + "epoch": 16.183718371837184, + "grad_norm": 1.961102843284607, + "learning_rate": 5.343845070807102e-06, + "loss": 0.0767, + "num_input_tokens_seen": 31045472, + "step": 147110 + }, + { + "epoch": 16.184268426842685, + "grad_norm": 0.03065716288983822, + "learning_rate": 5.342362127481737e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31046560, + "step": 147115 + }, + { + "epoch": 16.184818481848186, + "grad_norm": 0.2915366590023041, + "learning_rate": 5.340879365331381e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31047616, + "step": 147120 + }, + { + "epoch": 16.185368536853684, + "grad_norm": 0.010502987541258335, + "learning_rate": 5.339396784369713e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31048640, + "step": 147125 + }, + { + "epoch": 16.185918591859185, + "grad_norm": 0.011596919037401676, + "learning_rate": 5.3379143846103845e-06, + "loss": 0.0646, + "num_input_tokens_seen": 31049728, + "step": 147130 + }, + { + "epoch": 16.186468646864686, + "grad_norm": 0.011632321402430534, + "learning_rate": 5.336432166067062e-06, + "loss": 0.0077, + "num_input_tokens_seen": 31050784, + "step": 147135 + }, + { + "epoch": 16.187018701870187, + "grad_norm": 0.3951329290866852, + "learning_rate": 5.334950128753419e-06, + "loss": 0.004, + "num_input_tokens_seen": 31051872, + "step": 147140 + }, + { + "epoch": 16.187568756875688, + "grad_norm": 0.03112364560365677, + "learning_rate": 5.333468272683096e-06, + "loss": 0.0213, + "num_input_tokens_seen": 31052960, + "step": 147145 + }, + { + "epoch": 16.18811881188119, + "grad_norm": 0.0089619942009449, + "learning_rate": 5.331986597869768e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31054016, + "step": 147150 + }, + { + "epoch": 16.18866886688669, + "grad_norm": 2.5691583156585693, + "learning_rate": 5.3305051043270715e-06, + "loss": 0.0114, + "num_input_tokens_seen": 31055104, + "step": 147155 + }, + { + "epoch": 16.189218921892188, + "grad_norm": 0.004886402748525143, + "learning_rate": 5.32902379206868e-06, + "loss": 0.0651, + "num_input_tokens_seen": 31056160, + "step": 147160 + }, + { + "epoch": 16.18976897689769, + "grad_norm": 1.5778379440307617, + "learning_rate": 5.327542661108229e-06, + "loss": 0.1198, + "num_input_tokens_seen": 31057248, + "step": 147165 + }, + { + "epoch": 16.19031903190319, + "grad_norm": 0.09502460807561874, + "learning_rate": 5.326061711459379e-06, + "loss": 0.0098, + "num_input_tokens_seen": 31058272, + "step": 147170 + }, + { + "epoch": 16.19086908690869, + "grad_norm": 0.0124766044318676, + "learning_rate": 5.324580943135784e-06, + "loss": 0.0123, + "num_input_tokens_seen": 31059296, + "step": 147175 + }, + { + "epoch": 16.191419141914192, + "grad_norm": 0.230485737323761, + "learning_rate": 5.323100356151078e-06, + "loss": 0.0132, + "num_input_tokens_seen": 31060288, + "step": 147180 + }, + { + "epoch": 16.191969196919693, + "grad_norm": 0.3269527554512024, + "learning_rate": 5.321619950518922e-06, + "loss": 0.0042, + "num_input_tokens_seen": 31061376, + "step": 147185 + }, + { + "epoch": 16.19251925192519, + "grad_norm": 0.03000292181968689, + "learning_rate": 5.320139726252946e-06, + "loss": 0.0682, + "num_input_tokens_seen": 31062400, + "step": 147190 + }, + { + "epoch": 16.193069306930692, + "grad_norm": 0.011336193419992924, + "learning_rate": 5.318659683366797e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31063488, + "step": 147195 + }, + { + "epoch": 16.193619361936193, + "grad_norm": 0.005471081472933292, + "learning_rate": 5.3171798218741245e-06, + "loss": 0.0016, + "num_input_tokens_seen": 31064608, + "step": 147200 + }, + { + "epoch": 16.194169416941694, + "grad_norm": 0.004195657558739185, + "learning_rate": 5.315700141788554e-06, + "loss": 0.0098, + "num_input_tokens_seen": 31065728, + "step": 147205 + }, + { + "epoch": 16.194719471947195, + "grad_norm": 0.0100894495844841, + "learning_rate": 5.314220643123738e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31066816, + "step": 147210 + }, + { + "epoch": 16.195269526952696, + "grad_norm": 0.00708022341132164, + "learning_rate": 5.312741325893303e-06, + "loss": 0.0082, + "num_input_tokens_seen": 31067872, + "step": 147215 + }, + { + "epoch": 16.195819581958197, + "grad_norm": 0.010069279000163078, + "learning_rate": 5.31126219011088e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31068928, + "step": 147220 + }, + { + "epoch": 16.196369636963695, + "grad_norm": 0.013421742245554924, + "learning_rate": 5.309783235790103e-06, + "loss": 0.095, + "num_input_tokens_seen": 31069984, + "step": 147225 + }, + { + "epoch": 16.196919691969196, + "grad_norm": 0.02594790793955326, + "learning_rate": 5.308304462944608e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31071104, + "step": 147230 + }, + { + "epoch": 16.197469746974697, + "grad_norm": 2.859501838684082, + "learning_rate": 5.306825871588026e-06, + "loss": 0.0256, + "num_input_tokens_seen": 31072224, + "step": 147235 + }, + { + "epoch": 16.198019801980198, + "grad_norm": 0.3857688903808594, + "learning_rate": 5.305347461733981e-06, + "loss": 0.1473, + "num_input_tokens_seen": 31073216, + "step": 147240 + }, + { + "epoch": 16.1985698569857, + "grad_norm": 0.22726459801197052, + "learning_rate": 5.303869233396092e-06, + "loss": 0.1193, + "num_input_tokens_seen": 31074272, + "step": 147245 + }, + { + "epoch": 16.1991199119912, + "grad_norm": 0.013748628087341785, + "learning_rate": 5.302391186587988e-06, + "loss": 0.0417, + "num_input_tokens_seen": 31075296, + "step": 147250 + }, + { + "epoch": 16.199669966996698, + "grad_norm": 0.01219874620437622, + "learning_rate": 5.3009133213232946e-06, + "loss": 0.003, + "num_input_tokens_seen": 31076352, + "step": 147255 + }, + { + "epoch": 16.2002200220022, + "grad_norm": 0.21391195058822632, + "learning_rate": 5.2994356376156354e-06, + "loss": 0.0049, + "num_input_tokens_seen": 31077408, + "step": 147260 + }, + { + "epoch": 16.2007700770077, + "grad_norm": 0.0008078123209998012, + "learning_rate": 5.2979581354786255e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31078528, + "step": 147265 + }, + { + "epoch": 16.2013201320132, + "grad_norm": 0.025341223925352097, + "learning_rate": 5.296480814925875e-06, + "loss": 0.0061, + "num_input_tokens_seen": 31079520, + "step": 147270 + }, + { + "epoch": 16.201870187018702, + "grad_norm": 0.6091185808181763, + "learning_rate": 5.295003675971011e-06, + "loss": 0.0069, + "num_input_tokens_seen": 31080576, + "step": 147275 + }, + { + "epoch": 16.202420242024203, + "grad_norm": 0.005051824264228344, + "learning_rate": 5.2935267186276395e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31081600, + "step": 147280 + }, + { + "epoch": 16.202970297029704, + "grad_norm": 0.18698441982269287, + "learning_rate": 5.292049942909374e-06, + "loss": 0.0372, + "num_input_tokens_seen": 31082720, + "step": 147285 + }, + { + "epoch": 16.203520352035202, + "grad_norm": 0.6396514177322388, + "learning_rate": 5.290573348829833e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31083776, + "step": 147290 + }, + { + "epoch": 16.204070407040703, + "grad_norm": 0.06092365086078644, + "learning_rate": 5.289096936402616e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31084832, + "step": 147295 + }, + { + "epoch": 16.204620462046204, + "grad_norm": 0.027688806876540184, + "learning_rate": 5.287620705641339e-06, + "loss": 0.0268, + "num_input_tokens_seen": 31085888, + "step": 147300 + }, + { + "epoch": 16.205170517051705, + "grad_norm": 0.013597806915640831, + "learning_rate": 5.286144656559597e-06, + "loss": 0.0115, + "num_input_tokens_seen": 31086976, + "step": 147305 + }, + { + "epoch": 16.205720572057206, + "grad_norm": 2.5318615436553955, + "learning_rate": 5.284668789170999e-06, + "loss": 0.1086, + "num_input_tokens_seen": 31088064, + "step": 147310 + }, + { + "epoch": 16.206270627062707, + "grad_norm": 0.028562117367982864, + "learning_rate": 5.283193103489156e-06, + "loss": 0.0412, + "num_input_tokens_seen": 31089120, + "step": 147315 + }, + { + "epoch": 16.206820682068205, + "grad_norm": 0.04680676385760307, + "learning_rate": 5.2817175995276555e-06, + "loss": 0.006, + "num_input_tokens_seen": 31090208, + "step": 147320 + }, + { + "epoch": 16.207370737073706, + "grad_norm": 0.013186811469495296, + "learning_rate": 5.280242277300107e-06, + "loss": 0.0516, + "num_input_tokens_seen": 31091296, + "step": 147325 + }, + { + "epoch": 16.207920792079207, + "grad_norm": 0.017535489052534103, + "learning_rate": 5.278767136820104e-06, + "loss": 0.0012, + "num_input_tokens_seen": 31092384, + "step": 147330 + }, + { + "epoch": 16.20847084708471, + "grad_norm": 0.026188356801867485, + "learning_rate": 5.2772921781012345e-06, + "loss": 0.0045, + "num_input_tokens_seen": 31093440, + "step": 147335 + }, + { + "epoch": 16.20902090209021, + "grad_norm": 0.010164634324610233, + "learning_rate": 5.275817401157099e-06, + "loss": 0.0453, + "num_input_tokens_seen": 31094464, + "step": 147340 + }, + { + "epoch": 16.20957095709571, + "grad_norm": 0.014620289206504822, + "learning_rate": 5.274342806001289e-06, + "loss": 0.0654, + "num_input_tokens_seen": 31095520, + "step": 147345 + }, + { + "epoch": 16.21012101210121, + "grad_norm": 0.01176412496715784, + "learning_rate": 5.272868392647404e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31096512, + "step": 147350 + }, + { + "epoch": 16.21067106710671, + "grad_norm": 2.2231154441833496, + "learning_rate": 5.271394161109025e-06, + "loss": 0.0292, + "num_input_tokens_seen": 31097600, + "step": 147355 + }, + { + "epoch": 16.21122112211221, + "grad_norm": 0.02393255941569805, + "learning_rate": 5.269920111399732e-06, + "loss": 0.056, + "num_input_tokens_seen": 31098624, + "step": 147360 + }, + { + "epoch": 16.21177117711771, + "grad_norm": 2.4398860931396484, + "learning_rate": 5.268446243533118e-06, + "loss": 0.0383, + "num_input_tokens_seen": 31099584, + "step": 147365 + }, + { + "epoch": 16.212321232123212, + "grad_norm": 0.030419567599892616, + "learning_rate": 5.2669725575227676e-06, + "loss": 0.0995, + "num_input_tokens_seen": 31100704, + "step": 147370 + }, + { + "epoch": 16.212871287128714, + "grad_norm": 0.3103388845920563, + "learning_rate": 5.2654990533822665e-06, + "loss": 0.0094, + "num_input_tokens_seen": 31101792, + "step": 147375 + }, + { + "epoch": 16.213421342134215, + "grad_norm": 0.04069400206208229, + "learning_rate": 5.264025731125194e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31102880, + "step": 147380 + }, + { + "epoch": 16.213971397139716, + "grad_norm": 0.002636770950630307, + "learning_rate": 5.2625525907651185e-06, + "loss": 0.0417, + "num_input_tokens_seen": 31103904, + "step": 147385 + }, + { + "epoch": 16.214521452145213, + "grad_norm": 0.1169024407863617, + "learning_rate": 5.2610796323156326e-06, + "loss": 0.006, + "num_input_tokens_seen": 31104992, + "step": 147390 + }, + { + "epoch": 16.215071507150714, + "grad_norm": 0.020170211791992188, + "learning_rate": 5.259606855790295e-06, + "loss": 0.0032, + "num_input_tokens_seen": 31106048, + "step": 147395 + }, + { + "epoch": 16.215621562156215, + "grad_norm": 3.516453742980957, + "learning_rate": 5.258134261202691e-06, + "loss": 0.0242, + "num_input_tokens_seen": 31107040, + "step": 147400 + }, + { + "epoch": 16.216171617161717, + "grad_norm": 0.1927487999200821, + "learning_rate": 5.256661848566396e-06, + "loss": 0.0333, + "num_input_tokens_seen": 31108160, + "step": 147405 + }, + { + "epoch": 16.216721672167218, + "grad_norm": 2.232868194580078, + "learning_rate": 5.255189617894967e-06, + "loss": 0.0346, + "num_input_tokens_seen": 31109184, + "step": 147410 + }, + { + "epoch": 16.21727172717272, + "grad_norm": 0.13605032861232758, + "learning_rate": 5.253717569201988e-06, + "loss": 0.037, + "num_input_tokens_seen": 31110272, + "step": 147415 + }, + { + "epoch": 16.217821782178216, + "grad_norm": 0.00402163015678525, + "learning_rate": 5.252245702501013e-06, + "loss": 0.0742, + "num_input_tokens_seen": 31111296, + "step": 147420 + }, + { + "epoch": 16.218371837183717, + "grad_norm": 0.11069735884666443, + "learning_rate": 5.2507740178056154e-06, + "loss": 0.0026, + "num_input_tokens_seen": 31112384, + "step": 147425 + }, + { + "epoch": 16.21892189218922, + "grad_norm": 0.03881401568651199, + "learning_rate": 5.249302515129362e-06, + "loss": 0.0402, + "num_input_tokens_seen": 31113440, + "step": 147430 + }, + { + "epoch": 16.21947194719472, + "grad_norm": 0.008663907647132874, + "learning_rate": 5.2478311944858025e-06, + "loss": 0.001, + "num_input_tokens_seen": 31114464, + "step": 147435 + }, + { + "epoch": 16.22002200220022, + "grad_norm": 0.7619073987007141, + "learning_rate": 5.246360055888514e-06, + "loss": 0.1618, + "num_input_tokens_seen": 31115488, + "step": 147440 + }, + { + "epoch": 16.22057205720572, + "grad_norm": 0.011429322883486748, + "learning_rate": 5.244889099351036e-06, + "loss": 0.0029, + "num_input_tokens_seen": 31116576, + "step": 147445 + }, + { + "epoch": 16.221122112211223, + "grad_norm": 0.3987375795841217, + "learning_rate": 5.243418324886945e-06, + "loss": 0.0087, + "num_input_tokens_seen": 31117632, + "step": 147450 + }, + { + "epoch": 16.22167216721672, + "grad_norm": 0.01984335109591484, + "learning_rate": 5.241947732509778e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31118656, + "step": 147455 + }, + { + "epoch": 16.22222222222222, + "grad_norm": 0.03255297988653183, + "learning_rate": 5.240477322233101e-06, + "loss": 0.0035, + "num_input_tokens_seen": 31119744, + "step": 147460 + }, + { + "epoch": 16.222772277227723, + "grad_norm": 0.024047624319791794, + "learning_rate": 5.239007094070467e-06, + "loss": 0.0114, + "num_input_tokens_seen": 31120800, + "step": 147465 + }, + { + "epoch": 16.223322332233224, + "grad_norm": 0.30085960030555725, + "learning_rate": 5.237537048035418e-06, + "loss": 0.0189, + "num_input_tokens_seen": 31121824, + "step": 147470 + }, + { + "epoch": 16.223872387238725, + "grad_norm": 0.008342698216438293, + "learning_rate": 5.236067184141513e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31122912, + "step": 147475 + }, + { + "epoch": 16.224422442244226, + "grad_norm": 0.030365828424692154, + "learning_rate": 5.234597502402286e-06, + "loss": 0.079, + "num_input_tokens_seen": 31123936, + "step": 147480 + }, + { + "epoch": 16.224972497249723, + "grad_norm": 0.01225221622735262, + "learning_rate": 5.233128002831289e-06, + "loss": 0.0095, + "num_input_tokens_seen": 31124960, + "step": 147485 + }, + { + "epoch": 16.225522552255224, + "grad_norm": 0.009545008651912212, + "learning_rate": 5.231658685442073e-06, + "loss": 0.1622, + "num_input_tokens_seen": 31126048, + "step": 147490 + }, + { + "epoch": 16.226072607260726, + "grad_norm": 0.030147027224302292, + "learning_rate": 5.230189550248174e-06, + "loss": 0.0501, + "num_input_tokens_seen": 31127104, + "step": 147495 + }, + { + "epoch": 16.226622662266227, + "grad_norm": 0.08633419871330261, + "learning_rate": 5.228720597263123e-06, + "loss": 0.0102, + "num_input_tokens_seen": 31128192, + "step": 147500 + }, + { + "epoch": 16.227172717271728, + "grad_norm": 0.10729966312646866, + "learning_rate": 5.2272518265004765e-06, + "loss": 0.005, + "num_input_tokens_seen": 31129248, + "step": 147505 + }, + { + "epoch": 16.22772277227723, + "grad_norm": 0.1705523580312729, + "learning_rate": 5.2257832379737525e-06, + "loss": 0.0548, + "num_input_tokens_seen": 31130336, + "step": 147510 + }, + { + "epoch": 16.22827282728273, + "grad_norm": 0.1621570736169815, + "learning_rate": 5.224314831696498e-06, + "loss": 0.0287, + "num_input_tokens_seen": 31131328, + "step": 147515 + }, + { + "epoch": 16.228822882288227, + "grad_norm": 3.088472366333008, + "learning_rate": 5.22284660768225e-06, + "loss": 0.0187, + "num_input_tokens_seen": 31132384, + "step": 147520 + }, + { + "epoch": 16.22937293729373, + "grad_norm": 0.7615554928779602, + "learning_rate": 5.221378565944529e-06, + "loss": 0.0059, + "num_input_tokens_seen": 31133472, + "step": 147525 + }, + { + "epoch": 16.22992299229923, + "grad_norm": 0.02073414996266365, + "learning_rate": 5.219910706496875e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31134464, + "step": 147530 + }, + { + "epoch": 16.23047304730473, + "grad_norm": 0.14760924875736237, + "learning_rate": 5.2184430293528096e-06, + "loss": 0.005, + "num_input_tokens_seen": 31135488, + "step": 147535 + }, + { + "epoch": 16.231023102310232, + "grad_norm": 0.013068323023617268, + "learning_rate": 5.2169755345258615e-06, + "loss": 0.055, + "num_input_tokens_seen": 31136544, + "step": 147540 + }, + { + "epoch": 16.231573157315733, + "grad_norm": 0.03030513785779476, + "learning_rate": 5.215508222029564e-06, + "loss": 0.0456, + "num_input_tokens_seen": 31137632, + "step": 147545 + }, + { + "epoch": 16.23212321232123, + "grad_norm": 0.38546356558799744, + "learning_rate": 5.214041091877425e-06, + "loss": 0.05, + "num_input_tokens_seen": 31138624, + "step": 147550 + }, + { + "epoch": 16.23267326732673, + "grad_norm": 0.08255231380462646, + "learning_rate": 5.212574144082982e-06, + "loss": 0.1353, + "num_input_tokens_seen": 31139648, + "step": 147555 + }, + { + "epoch": 16.233223322332233, + "grad_norm": 0.07320292294025421, + "learning_rate": 5.211107378659744e-06, + "loss": 0.1016, + "num_input_tokens_seen": 31140704, + "step": 147560 + }, + { + "epoch": 16.233773377337734, + "grad_norm": 2.7769079208374023, + "learning_rate": 5.209640795621238e-06, + "loss": 0.1858, + "num_input_tokens_seen": 31141696, + "step": 147565 + }, + { + "epoch": 16.234323432343235, + "grad_norm": 1.9325664043426514, + "learning_rate": 5.208174394980972e-06, + "loss": 0.0712, + "num_input_tokens_seen": 31142880, + "step": 147570 + }, + { + "epoch": 16.234873487348736, + "grad_norm": 0.2086769938468933, + "learning_rate": 5.206708176752461e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31143936, + "step": 147575 + }, + { + "epoch": 16.235423542354237, + "grad_norm": 0.011461825110018253, + "learning_rate": 5.205242140949235e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31145024, + "step": 147580 + }, + { + "epoch": 16.235973597359735, + "grad_norm": 0.014955231919884682, + "learning_rate": 5.203776287584786e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31146080, + "step": 147585 + }, + { + "epoch": 16.236523652365236, + "grad_norm": 0.019613733515143394, + "learning_rate": 5.202310616672637e-06, + "loss": 0.0176, + "num_input_tokens_seen": 31147168, + "step": 147590 + }, + { + "epoch": 16.237073707370737, + "grad_norm": 2.6330833435058594, + "learning_rate": 5.200845128226284e-06, + "loss": 0.0771, + "num_input_tokens_seen": 31148192, + "step": 147595 + }, + { + "epoch": 16.237623762376238, + "grad_norm": 2.3693792819976807, + "learning_rate": 5.1993798222592425e-06, + "loss": 0.0547, + "num_input_tokens_seen": 31149248, + "step": 147600 + }, + { + "epoch": 16.23817381738174, + "grad_norm": 0.012744462117552757, + "learning_rate": 5.197914698785022e-06, + "loss": 0.0048, + "num_input_tokens_seen": 31150336, + "step": 147605 + }, + { + "epoch": 16.23872387238724, + "grad_norm": 0.012865390628576279, + "learning_rate": 5.196449757817112e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31151424, + "step": 147610 + }, + { + "epoch": 16.239273927392738, + "grad_norm": 0.06308634579181671, + "learning_rate": 5.194984999369029e-06, + "loss": 0.0064, + "num_input_tokens_seen": 31152448, + "step": 147615 + }, + { + "epoch": 16.23982398239824, + "grad_norm": 1.5497024059295654, + "learning_rate": 5.1935204234542676e-06, + "loss": 0.0478, + "num_input_tokens_seen": 31153472, + "step": 147620 + }, + { + "epoch": 16.24037403740374, + "grad_norm": 0.030485639348626137, + "learning_rate": 5.1920560300863175e-06, + "loss": 0.0496, + "num_input_tokens_seen": 31154464, + "step": 147625 + }, + { + "epoch": 16.24092409240924, + "grad_norm": 2.061988592147827, + "learning_rate": 5.1905918192786814e-06, + "loss": 0.0635, + "num_input_tokens_seen": 31155520, + "step": 147630 + }, + { + "epoch": 16.241474147414742, + "grad_norm": 0.026497701182961464, + "learning_rate": 5.1891277910448535e-06, + "loss": 0.0372, + "num_input_tokens_seen": 31156608, + "step": 147635 + }, + { + "epoch": 16.242024202420243, + "grad_norm": 0.01260544452816248, + "learning_rate": 5.187663945398339e-06, + "loss": 0.0334, + "num_input_tokens_seen": 31157632, + "step": 147640 + }, + { + "epoch": 16.242574257425744, + "grad_norm": 0.04546934366226196, + "learning_rate": 5.186200282352616e-06, + "loss": 0.0041, + "num_input_tokens_seen": 31158624, + "step": 147645 + }, + { + "epoch": 16.24312431243124, + "grad_norm": 0.006448178086429834, + "learning_rate": 5.184736801921169e-06, + "loss": 0.0965, + "num_input_tokens_seen": 31159616, + "step": 147650 + }, + { + "epoch": 16.243674367436743, + "grad_norm": 0.023736916482448578, + "learning_rate": 5.1832735041174975e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31160672, + "step": 147655 + }, + { + "epoch": 16.244224422442244, + "grad_norm": 0.03223089128732681, + "learning_rate": 5.181810388955086e-06, + "loss": 0.0254, + "num_input_tokens_seen": 31161664, + "step": 147660 + }, + { + "epoch": 16.244774477447745, + "grad_norm": 0.02037804201245308, + "learning_rate": 5.1803474564474235e-06, + "loss": 0.0722, + "num_input_tokens_seen": 31162784, + "step": 147665 + }, + { + "epoch": 16.245324532453246, + "grad_norm": 0.22980889678001404, + "learning_rate": 5.178884706607989e-06, + "loss": 0.0103, + "num_input_tokens_seen": 31163872, + "step": 147670 + }, + { + "epoch": 16.245874587458747, + "grad_norm": 0.008313716389238834, + "learning_rate": 5.177422139450255e-06, + "loss": 0.0016, + "num_input_tokens_seen": 31164928, + "step": 147675 + }, + { + "epoch": 16.246424642464245, + "grad_norm": 0.43473944067955017, + "learning_rate": 5.175959754987716e-06, + "loss": 0.0167, + "num_input_tokens_seen": 31165984, + "step": 147680 + }, + { + "epoch": 16.246974697469746, + "grad_norm": 0.01465271320194006, + "learning_rate": 5.17449755323384e-06, + "loss": 0.0005, + "num_input_tokens_seen": 31167008, + "step": 147685 + }, + { + "epoch": 16.247524752475247, + "grad_norm": 0.02503260411322117, + "learning_rate": 5.173035534202103e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31168032, + "step": 147690 + }, + { + "epoch": 16.248074807480748, + "grad_norm": 0.026073865592479706, + "learning_rate": 5.171573697905993e-06, + "loss": 0.0187, + "num_input_tokens_seen": 31169088, + "step": 147695 + }, + { + "epoch": 16.24862486248625, + "grad_norm": 3.126542568206787, + "learning_rate": 5.170112044358966e-06, + "loss": 0.0133, + "num_input_tokens_seen": 31170080, + "step": 147700 + }, + { + "epoch": 16.24917491749175, + "grad_norm": 0.1940922886133194, + "learning_rate": 5.168650573574507e-06, + "loss": 0.0055, + "num_input_tokens_seen": 31171168, + "step": 147705 + }, + { + "epoch": 16.24972497249725, + "grad_norm": 0.09770655632019043, + "learning_rate": 5.167189285566079e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31172288, + "step": 147710 + }, + { + "epoch": 16.25027502750275, + "grad_norm": 0.01685965433716774, + "learning_rate": 5.1657281803471375e-06, + "loss": 0.0053, + "num_input_tokens_seen": 31173312, + "step": 147715 + }, + { + "epoch": 16.25082508250825, + "grad_norm": 0.007357332389801741, + "learning_rate": 5.164267257931177e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31174368, + "step": 147720 + }, + { + "epoch": 16.25137513751375, + "grad_norm": 0.34543028473854065, + "learning_rate": 5.16280651833164e-06, + "loss": 0.0069, + "num_input_tokens_seen": 31175392, + "step": 147725 + }, + { + "epoch": 16.251925192519252, + "grad_norm": 0.019016291946172714, + "learning_rate": 5.161345961562003e-06, + "loss": 0.1349, + "num_input_tokens_seen": 31176448, + "step": 147730 + }, + { + "epoch": 16.252475247524753, + "grad_norm": 0.11914703249931335, + "learning_rate": 5.159885587635718e-06, + "loss": 0.0107, + "num_input_tokens_seen": 31177536, + "step": 147735 + }, + { + "epoch": 16.253025302530254, + "grad_norm": 0.7593849897384644, + "learning_rate": 5.158425396566244e-06, + "loss": 0.0078, + "num_input_tokens_seen": 31178624, + "step": 147740 + }, + { + "epoch": 16.253575357535752, + "grad_norm": 0.04057406261563301, + "learning_rate": 5.156965388367041e-06, + "loss": 0.0054, + "num_input_tokens_seen": 31179712, + "step": 147745 + }, + { + "epoch": 16.254125412541253, + "grad_norm": 0.004244734533131123, + "learning_rate": 5.155505563051566e-06, + "loss": 0.001, + "num_input_tokens_seen": 31180864, + "step": 147750 + }, + { + "epoch": 16.254675467546754, + "grad_norm": 0.7244237661361694, + "learning_rate": 5.154045920633282e-06, + "loss": 0.0364, + "num_input_tokens_seen": 31181920, + "step": 147755 + }, + { + "epoch": 16.255225522552255, + "grad_norm": 2.4710004329681396, + "learning_rate": 5.152586461125633e-06, + "loss": 0.06, + "num_input_tokens_seen": 31182944, + "step": 147760 + }, + { + "epoch": 16.255775577557756, + "grad_norm": 0.006299509666860104, + "learning_rate": 5.151127184542065e-06, + "loss": 0.0063, + "num_input_tokens_seen": 31184000, + "step": 147765 + }, + { + "epoch": 16.256325632563257, + "grad_norm": 0.014582941308617592, + "learning_rate": 5.14966809089604e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31185088, + "step": 147770 + }, + { + "epoch": 16.25687568756876, + "grad_norm": 0.019816173240542412, + "learning_rate": 5.148209180200983e-06, + "loss": 0.1091, + "num_input_tokens_seen": 31186144, + "step": 147775 + }, + { + "epoch": 16.257425742574256, + "grad_norm": 0.07560683786869049, + "learning_rate": 5.1467504524703725e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31187136, + "step": 147780 + }, + { + "epoch": 16.257975797579757, + "grad_norm": 0.045137546956539154, + "learning_rate": 5.1452919077176335e-06, + "loss": 0.0327, + "num_input_tokens_seen": 31188224, + "step": 147785 + }, + { + "epoch": 16.258525852585258, + "grad_norm": 0.0021676968317478895, + "learning_rate": 5.1438335459562085e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31189184, + "step": 147790 + }, + { + "epoch": 16.25907590759076, + "grad_norm": 0.004765890073031187, + "learning_rate": 5.142375367199548e-06, + "loss": 0.0364, + "num_input_tokens_seen": 31190304, + "step": 147795 + }, + { + "epoch": 16.25962596259626, + "grad_norm": 0.022298766300082207, + "learning_rate": 5.140917371461076e-06, + "loss": 0.1151, + "num_input_tokens_seen": 31191296, + "step": 147800 + }, + { + "epoch": 16.26017601760176, + "grad_norm": 0.11730266362428665, + "learning_rate": 5.139459558754242e-06, + "loss": 0.0016, + "num_input_tokens_seen": 31192352, + "step": 147805 + }, + { + "epoch": 16.260726072607262, + "grad_norm": 0.04729209840297699, + "learning_rate": 5.1380019290924844e-06, + "loss": 0.0196, + "num_input_tokens_seen": 31193408, + "step": 147810 + }, + { + "epoch": 16.26127612761276, + "grad_norm": 0.5061337947845459, + "learning_rate": 5.136544482489225e-06, + "loss": 0.0311, + "num_input_tokens_seen": 31194464, + "step": 147815 + }, + { + "epoch": 16.26182618261826, + "grad_norm": 0.07343751192092896, + "learning_rate": 5.135087218957912e-06, + "loss": 0.0471, + "num_input_tokens_seen": 31195488, + "step": 147820 + }, + { + "epoch": 16.262376237623762, + "grad_norm": 0.061030663549900055, + "learning_rate": 5.13363013851196e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31196512, + "step": 147825 + }, + { + "epoch": 16.262926292629263, + "grad_norm": 7.652339935302734, + "learning_rate": 5.132173241164814e-06, + "loss": 0.0359, + "num_input_tokens_seen": 31197600, + "step": 147830 + }, + { + "epoch": 16.263476347634764, + "grad_norm": 3.046605110168457, + "learning_rate": 5.130716526929885e-06, + "loss": 0.0372, + "num_input_tokens_seen": 31198688, + "step": 147835 + }, + { + "epoch": 16.264026402640265, + "grad_norm": 0.11826545745134354, + "learning_rate": 5.12925999582061e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31199776, + "step": 147840 + }, + { + "epoch": 16.264576457645763, + "grad_norm": 0.1685280054807663, + "learning_rate": 5.127803647850418e-06, + "loss": 0.0752, + "num_input_tokens_seen": 31200800, + "step": 147845 + }, + { + "epoch": 16.265126512651264, + "grad_norm": 0.10464546829462051, + "learning_rate": 5.126347483032715e-06, + "loss": 0.0292, + "num_input_tokens_seen": 31201824, + "step": 147850 + }, + { + "epoch": 16.265676567656765, + "grad_norm": 0.058747805655002594, + "learning_rate": 5.124891501380938e-06, + "loss": 0.0617, + "num_input_tokens_seen": 31202880, + "step": 147855 + }, + { + "epoch": 16.266226622662266, + "grad_norm": 0.07272163033485413, + "learning_rate": 5.123435702908494e-06, + "loss": 0.0605, + "num_input_tokens_seen": 31203936, + "step": 147860 + }, + { + "epoch": 16.266776677667767, + "grad_norm": 0.008533554151654243, + "learning_rate": 5.121980087628803e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31204960, + "step": 147865 + }, + { + "epoch": 16.26732673267327, + "grad_norm": 0.08942460268735886, + "learning_rate": 5.1205246555552925e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31205984, + "step": 147870 + }, + { + "epoch": 16.26787678767877, + "grad_norm": 2.334573745727539, + "learning_rate": 5.119069406701365e-06, + "loss": 0.1588, + "num_input_tokens_seen": 31207040, + "step": 147875 + }, + { + "epoch": 16.268426842684267, + "grad_norm": 0.024525344371795654, + "learning_rate": 5.117614341080429e-06, + "loss": 0.0018, + "num_input_tokens_seen": 31208128, + "step": 147880 + }, + { + "epoch": 16.268976897689768, + "grad_norm": 0.031940821558237076, + "learning_rate": 5.1161594587059074e-06, + "loss": 0.0026, + "num_input_tokens_seen": 31209152, + "step": 147885 + }, + { + "epoch": 16.26952695269527, + "grad_norm": 0.43696385622024536, + "learning_rate": 5.11470475959119e-06, + "loss": 0.0072, + "num_input_tokens_seen": 31210240, + "step": 147890 + }, + { + "epoch": 16.27007700770077, + "grad_norm": 0.11626476049423218, + "learning_rate": 5.113250243749712e-06, + "loss": 0.0026, + "num_input_tokens_seen": 31211360, + "step": 147895 + }, + { + "epoch": 16.27062706270627, + "grad_norm": 0.12038334459066391, + "learning_rate": 5.111795911194861e-06, + "loss": 0.0063, + "num_input_tokens_seen": 31212384, + "step": 147900 + }, + { + "epoch": 16.271177117711773, + "grad_norm": 0.008871867321431637, + "learning_rate": 5.110341761940038e-06, + "loss": 0.0219, + "num_input_tokens_seen": 31213408, + "step": 147905 + }, + { + "epoch": 16.27172717271727, + "grad_norm": 0.34589436650276184, + "learning_rate": 5.10888779599866e-06, + "loss": 0.0177, + "num_input_tokens_seen": 31214496, + "step": 147910 + }, + { + "epoch": 16.27227722772277, + "grad_norm": 0.04473525658249855, + "learning_rate": 5.1074340133841075e-06, + "loss": 0.0045, + "num_input_tokens_seen": 31215616, + "step": 147915 + }, + { + "epoch": 16.272827282728272, + "grad_norm": 0.004239408764988184, + "learning_rate": 5.105980414109793e-06, + "loss": 0.0115, + "num_input_tokens_seen": 31216672, + "step": 147920 + }, + { + "epoch": 16.273377337733773, + "grad_norm": 0.02117585763335228, + "learning_rate": 5.104526998189119e-06, + "loss": 0.0538, + "num_input_tokens_seen": 31217760, + "step": 147925 + }, + { + "epoch": 16.273927392739274, + "grad_norm": 0.018362250179052353, + "learning_rate": 5.1030737656354635e-06, + "loss": 0.0147, + "num_input_tokens_seen": 31218816, + "step": 147930 + }, + { + "epoch": 16.274477447744776, + "grad_norm": 0.012292019091546535, + "learning_rate": 5.101620716462238e-06, + "loss": 0.0012, + "num_input_tokens_seen": 31219936, + "step": 147935 + }, + { + "epoch": 16.275027502750277, + "grad_norm": 0.016136135905981064, + "learning_rate": 5.100167850682819e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31220928, + "step": 147940 + }, + { + "epoch": 16.275577557755774, + "grad_norm": 1.7974907159805298, + "learning_rate": 5.098715168310611e-06, + "loss": 0.2881, + "num_input_tokens_seen": 31221984, + "step": 147945 + }, + { + "epoch": 16.276127612761275, + "grad_norm": 0.007906795479357243, + "learning_rate": 5.097262669358987e-06, + "loss": 0.0635, + "num_input_tokens_seen": 31223008, + "step": 147950 + }, + { + "epoch": 16.276677667766776, + "grad_norm": 0.4455927312374115, + "learning_rate": 5.095810353841346e-06, + "loss": 0.0052, + "num_input_tokens_seen": 31224000, + "step": 147955 + }, + { + "epoch": 16.277227722772277, + "grad_norm": 0.12209125608205795, + "learning_rate": 5.0943582217710726e-06, + "loss": 0.0054, + "num_input_tokens_seen": 31225120, + "step": 147960 + }, + { + "epoch": 16.27777777777778, + "grad_norm": 0.019483191892504692, + "learning_rate": 5.092906273161544e-06, + "loss": 0.0464, + "num_input_tokens_seen": 31226144, + "step": 147965 + }, + { + "epoch": 16.27832783278328, + "grad_norm": 0.17879381775856018, + "learning_rate": 5.091454508026153e-06, + "loss": 0.01, + "num_input_tokens_seen": 31227200, + "step": 147970 + }, + { + "epoch": 16.278877887788777, + "grad_norm": 0.0642981305718422, + "learning_rate": 5.090002926378265e-06, + "loss": 0.0663, + "num_input_tokens_seen": 31228256, + "step": 147975 + }, + { + "epoch": 16.27942794279428, + "grad_norm": 0.01938832923769951, + "learning_rate": 5.0885515282312665e-06, + "loss": 0.0503, + "num_input_tokens_seen": 31229280, + "step": 147980 + }, + { + "epoch": 16.27997799779978, + "grad_norm": 0.2704131305217743, + "learning_rate": 5.087100313598542e-06, + "loss": 0.0227, + "num_input_tokens_seen": 31230272, + "step": 147985 + }, + { + "epoch": 16.28052805280528, + "grad_norm": 0.08213392645120621, + "learning_rate": 5.085649282493449e-06, + "loss": 0.0077, + "num_input_tokens_seen": 31231264, + "step": 147990 + }, + { + "epoch": 16.28107810781078, + "grad_norm": 0.006637312471866608, + "learning_rate": 5.084198434929379e-06, + "loss": 0.0349, + "num_input_tokens_seen": 31232352, + "step": 147995 + }, + { + "epoch": 16.281628162816283, + "grad_norm": 0.5577791929244995, + "learning_rate": 5.0827477709196964e-06, + "loss": 0.0969, + "num_input_tokens_seen": 31233408, + "step": 148000 + }, + { + "epoch": 16.282178217821784, + "grad_norm": 0.2700616717338562, + "learning_rate": 5.081297290477763e-06, + "loss": 0.003, + "num_input_tokens_seen": 31234464, + "step": 148005 + }, + { + "epoch": 16.28272827282728, + "grad_norm": 0.5016542673110962, + "learning_rate": 5.079846993616955e-06, + "loss": 0.0046, + "num_input_tokens_seen": 31235520, + "step": 148010 + }, + { + "epoch": 16.283278327832782, + "grad_norm": 0.3691094219684601, + "learning_rate": 5.078396880350639e-06, + "loss": 0.0133, + "num_input_tokens_seen": 31236576, + "step": 148015 + }, + { + "epoch": 16.283828382838283, + "grad_norm": 0.0021501551382243633, + "learning_rate": 5.076946950692185e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31237632, + "step": 148020 + }, + { + "epoch": 16.284378437843785, + "grad_norm": 0.18077927827835083, + "learning_rate": 5.0754972046549525e-06, + "loss": 0.0012, + "num_input_tokens_seen": 31238688, + "step": 148025 + }, + { + "epoch": 16.284928492849286, + "grad_norm": 0.3059101700782776, + "learning_rate": 5.0740476422522945e-06, + "loss": 0.0043, + "num_input_tokens_seen": 31239712, + "step": 148030 + }, + { + "epoch": 16.285478547854787, + "grad_norm": 0.0933532640337944, + "learning_rate": 5.072598263497577e-06, + "loss": 0.0089, + "num_input_tokens_seen": 31240736, + "step": 148035 + }, + { + "epoch": 16.286028602860284, + "grad_norm": 0.08367367088794708, + "learning_rate": 5.071149068404158e-06, + "loss": 0.0046, + "num_input_tokens_seen": 31241728, + "step": 148040 + }, + { + "epoch": 16.286578657865785, + "grad_norm": 0.003905476536601782, + "learning_rate": 5.069700056985404e-06, + "loss": 0.0102, + "num_input_tokens_seen": 31242752, + "step": 148045 + }, + { + "epoch": 16.287128712871286, + "grad_norm": 0.08689943701028824, + "learning_rate": 5.0682512292546616e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31243808, + "step": 148050 + }, + { + "epoch": 16.287678767876788, + "grad_norm": 0.04599587619304657, + "learning_rate": 5.066802585225275e-06, + "loss": 0.0321, + "num_input_tokens_seen": 31244896, + "step": 148055 + }, + { + "epoch": 16.28822882288229, + "grad_norm": 0.08633194118738174, + "learning_rate": 5.065354124910612e-06, + "loss": 0.0044, + "num_input_tokens_seen": 31245920, + "step": 148060 + }, + { + "epoch": 16.28877887788779, + "grad_norm": 0.006576296873390675, + "learning_rate": 5.063905848324008e-06, + "loss": 0.0078, + "num_input_tokens_seen": 31246944, + "step": 148065 + }, + { + "epoch": 16.28932893289329, + "grad_norm": 0.021327441558241844, + "learning_rate": 5.062457755478816e-06, + "loss": 0.0059, + "num_input_tokens_seen": 31248032, + "step": 148070 + }, + { + "epoch": 16.28987898789879, + "grad_norm": 0.011885473504662514, + "learning_rate": 5.06100984638839e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31249120, + "step": 148075 + }, + { + "epoch": 16.29042904290429, + "grad_norm": 2.2324256896972656, + "learning_rate": 5.059562121066063e-06, + "loss": 0.0055, + "num_input_tokens_seen": 31250176, + "step": 148080 + }, + { + "epoch": 16.29097909790979, + "grad_norm": 0.016199015080928802, + "learning_rate": 5.058114579525191e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31251264, + "step": 148085 + }, + { + "epoch": 16.29152915291529, + "grad_norm": 0.30914342403411865, + "learning_rate": 5.0566672217791015e-06, + "loss": 0.01, + "num_input_tokens_seen": 31252352, + "step": 148090 + }, + { + "epoch": 16.292079207920793, + "grad_norm": 2.500633716583252, + "learning_rate": 5.055220047841139e-06, + "loss": 0.0567, + "num_input_tokens_seen": 31253408, + "step": 148095 + }, + { + "epoch": 16.292629262926294, + "grad_norm": 0.15789303183555603, + "learning_rate": 5.053773057724651e-06, + "loss": 0.0197, + "num_input_tokens_seen": 31254464, + "step": 148100 + }, + { + "epoch": 16.293179317931795, + "grad_norm": 0.04299125075340271, + "learning_rate": 5.05232625144296e-06, + "loss": 0.0163, + "num_input_tokens_seen": 31255456, + "step": 148105 + }, + { + "epoch": 16.293729372937293, + "grad_norm": 1.265637755393982, + "learning_rate": 5.050879629009409e-06, + "loss": 0.1514, + "num_input_tokens_seen": 31256480, + "step": 148110 + }, + { + "epoch": 16.294279427942794, + "grad_norm": 0.5160309076309204, + "learning_rate": 5.0494331904373305e-06, + "loss": 0.0153, + "num_input_tokens_seen": 31257536, + "step": 148115 + }, + { + "epoch": 16.294829482948295, + "grad_norm": 0.014112205244600773, + "learning_rate": 5.047986935740045e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31258592, + "step": 148120 + }, + { + "epoch": 16.295379537953796, + "grad_norm": 0.34182095527648926, + "learning_rate": 5.046540864930893e-06, + "loss": 0.0471, + "num_input_tokens_seen": 31259648, + "step": 148125 + }, + { + "epoch": 16.295929592959297, + "grad_norm": 0.44282081723213196, + "learning_rate": 5.0450949780231956e-06, + "loss": 0.0067, + "num_input_tokens_seen": 31260704, + "step": 148130 + }, + { + "epoch": 16.296479647964798, + "grad_norm": 0.7657991051673889, + "learning_rate": 5.04364927503029e-06, + "loss": 0.0072, + "num_input_tokens_seen": 31261760, + "step": 148135 + }, + { + "epoch": 16.297029702970296, + "grad_norm": 0.21065868437290192, + "learning_rate": 5.042203755965494e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31262816, + "step": 148140 + }, + { + "epoch": 16.297579757975797, + "grad_norm": 0.3741612732410431, + "learning_rate": 5.040758420842123e-06, + "loss": 0.0026, + "num_input_tokens_seen": 31263808, + "step": 148145 + }, + { + "epoch": 16.298129812981298, + "grad_norm": 0.04794304072856903, + "learning_rate": 5.039313269673504e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31264896, + "step": 148150 + }, + { + "epoch": 16.2986798679868, + "grad_norm": 0.1349163055419922, + "learning_rate": 5.037868302472956e-06, + "loss": 0.0069, + "num_input_tokens_seen": 31265984, + "step": 148155 + }, + { + "epoch": 16.2992299229923, + "grad_norm": 0.004826949443668127, + "learning_rate": 5.036423519253805e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31267040, + "step": 148160 + }, + { + "epoch": 16.2997799779978, + "grad_norm": 0.007451315876096487, + "learning_rate": 5.034978920029357e-06, + "loss": 0.001, + "num_input_tokens_seen": 31268064, + "step": 148165 + }, + { + "epoch": 16.300330033003302, + "grad_norm": 0.011837981641292572, + "learning_rate": 5.033534504812923e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31269120, + "step": 148170 + }, + { + "epoch": 16.3008800880088, + "grad_norm": 0.0046004923060536385, + "learning_rate": 5.032090273617826e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31270080, + "step": 148175 + }, + { + "epoch": 16.3014301430143, + "grad_norm": 0.01458804216235876, + "learning_rate": 5.030646226457364e-06, + "loss": 0.03, + "num_input_tokens_seen": 31271168, + "step": 148180 + }, + { + "epoch": 16.301980198019802, + "grad_norm": 0.1221674382686615, + "learning_rate": 5.029202363344856e-06, + "loss": 0.0043, + "num_input_tokens_seen": 31272192, + "step": 148185 + }, + { + "epoch": 16.302530253025303, + "grad_norm": 0.05559799075126648, + "learning_rate": 5.027758684293612e-06, + "loss": 0.067, + "num_input_tokens_seen": 31273248, + "step": 148190 + }, + { + "epoch": 16.303080308030804, + "grad_norm": 0.0030399514362215996, + "learning_rate": 5.026315189316927e-06, + "loss": 0.0101, + "num_input_tokens_seen": 31274304, + "step": 148195 + }, + { + "epoch": 16.303630363036305, + "grad_norm": 0.00496178911998868, + "learning_rate": 5.0248718784281165e-06, + "loss": 0.0341, + "num_input_tokens_seen": 31275328, + "step": 148200 + }, + { + "epoch": 16.304180418041803, + "grad_norm": 0.02037905342876911, + "learning_rate": 5.023428751640472e-06, + "loss": 0.0079, + "num_input_tokens_seen": 31276352, + "step": 148205 + }, + { + "epoch": 16.304730473047304, + "grad_norm": 0.24281655251979828, + "learning_rate": 5.021985808967294e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31277440, + "step": 148210 + }, + { + "epoch": 16.305280528052805, + "grad_norm": 0.005197967402637005, + "learning_rate": 5.020543050421897e-06, + "loss": 0.0008, + "num_input_tokens_seen": 31278464, + "step": 148215 + }, + { + "epoch": 16.305830583058306, + "grad_norm": 1.4352850914001465, + "learning_rate": 5.019100476017563e-06, + "loss": 0.0086, + "num_input_tokens_seen": 31279488, + "step": 148220 + }, + { + "epoch": 16.306380638063807, + "grad_norm": 0.7186741232872009, + "learning_rate": 5.017658085767596e-06, + "loss": 0.0551, + "num_input_tokens_seen": 31280576, + "step": 148225 + }, + { + "epoch": 16.306930693069308, + "grad_norm": 0.37521883845329285, + "learning_rate": 5.016215879685279e-06, + "loss": 0.0041, + "num_input_tokens_seen": 31281664, + "step": 148230 + }, + { + "epoch": 16.30748074807481, + "grad_norm": 4.229318618774414, + "learning_rate": 5.0147738577839165e-06, + "loss": 0.0911, + "num_input_tokens_seen": 31282816, + "step": 148235 + }, + { + "epoch": 16.308030803080307, + "grad_norm": 0.03451046347618103, + "learning_rate": 5.013332020076788e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31283872, + "step": 148240 + }, + { + "epoch": 16.308580858085808, + "grad_norm": 0.6184321045875549, + "learning_rate": 5.0118903665771905e-06, + "loss": 0.0595, + "num_input_tokens_seen": 31284896, + "step": 148245 + }, + { + "epoch": 16.30913091309131, + "grad_norm": 0.052632156759500504, + "learning_rate": 5.0104488972984116e-06, + "loss": 0.0893, + "num_input_tokens_seen": 31285984, + "step": 148250 + }, + { + "epoch": 16.30968096809681, + "grad_norm": 0.02152865380048752, + "learning_rate": 5.009007612253735e-06, + "loss": 0.1743, + "num_input_tokens_seen": 31287008, + "step": 148255 + }, + { + "epoch": 16.31023102310231, + "grad_norm": 1.76207435131073, + "learning_rate": 5.007566511456435e-06, + "loss": 0.0141, + "num_input_tokens_seen": 31288032, + "step": 148260 + }, + { + "epoch": 16.310781078107812, + "grad_norm": 0.020493824034929276, + "learning_rate": 5.006125594919803e-06, + "loss": 0.0304, + "num_input_tokens_seen": 31289088, + "step": 148265 + }, + { + "epoch": 16.31133113311331, + "grad_norm": 0.11522349715232849, + "learning_rate": 5.004684862657116e-06, + "loss": 0.0018, + "num_input_tokens_seen": 31290208, + "step": 148270 + }, + { + "epoch": 16.31188118811881, + "grad_norm": 0.19696001708507538, + "learning_rate": 5.00324431468166e-06, + "loss": 0.0059, + "num_input_tokens_seen": 31291168, + "step": 148275 + }, + { + "epoch": 16.312431243124312, + "grad_norm": 0.029924893751740456, + "learning_rate": 5.001803951006706e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31292256, + "step": 148280 + }, + { + "epoch": 16.312981298129813, + "grad_norm": 0.008498619310557842, + "learning_rate": 5.000363771645522e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31293312, + "step": 148285 + }, + { + "epoch": 16.313531353135314, + "grad_norm": 0.30141472816467285, + "learning_rate": 4.998923776611397e-06, + "loss": 0.0052, + "num_input_tokens_seen": 31294432, + "step": 148290 + }, + { + "epoch": 16.314081408140815, + "grad_norm": 1.9822903871536255, + "learning_rate": 4.997483965917585e-06, + "loss": 0.1046, + "num_input_tokens_seen": 31295456, + "step": 148295 + }, + { + "epoch": 16.314631463146316, + "grad_norm": 0.0357329323887825, + "learning_rate": 4.996044339577366e-06, + "loss": 0.0105, + "num_input_tokens_seen": 31296512, + "step": 148300 + }, + { + "epoch": 16.315181518151814, + "grad_norm": 0.06203526630997658, + "learning_rate": 4.994604897604016e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31297600, + "step": 148305 + }, + { + "epoch": 16.315731573157315, + "grad_norm": 0.4395903944969177, + "learning_rate": 4.993165640010786e-06, + "loss": 0.0485, + "num_input_tokens_seen": 31298624, + "step": 148310 + }, + { + "epoch": 16.316281628162816, + "grad_norm": 0.2213716357946396, + "learning_rate": 4.991726566810953e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31299744, + "step": 148315 + }, + { + "epoch": 16.316831683168317, + "grad_norm": 4.555361747741699, + "learning_rate": 4.990287678017769e-06, + "loss": 0.0968, + "num_input_tokens_seen": 31300832, + "step": 148320 + }, + { + "epoch": 16.317381738173818, + "grad_norm": 1.4523468017578125, + "learning_rate": 4.988848973644502e-06, + "loss": 0.0223, + "num_input_tokens_seen": 31301856, + "step": 148325 + }, + { + "epoch": 16.31793179317932, + "grad_norm": 0.03030218370258808, + "learning_rate": 4.987410453704419e-06, + "loss": 0.2289, + "num_input_tokens_seen": 31302976, + "step": 148330 + }, + { + "epoch": 16.318481848184817, + "grad_norm": 0.08298125863075256, + "learning_rate": 4.985972118210763e-06, + "loss": 0.002, + "num_input_tokens_seen": 31303968, + "step": 148335 + }, + { + "epoch": 16.319031903190318, + "grad_norm": 0.27983835339546204, + "learning_rate": 4.984533967176805e-06, + "loss": 0.011, + "num_input_tokens_seen": 31304960, + "step": 148340 + }, + { + "epoch": 16.31958195819582, + "grad_norm": 0.033721860498189926, + "learning_rate": 4.983096000615789e-06, + "loss": 0.1317, + "num_input_tokens_seen": 31306016, + "step": 148345 + }, + { + "epoch": 16.32013201320132, + "grad_norm": 0.18978840112686157, + "learning_rate": 4.981658218540977e-06, + "loss": 0.0245, + "num_input_tokens_seen": 31307072, + "step": 148350 + }, + { + "epoch": 16.32068206820682, + "grad_norm": 0.6448286175727844, + "learning_rate": 4.980220620965609e-06, + "loss": 0.0051, + "num_input_tokens_seen": 31308160, + "step": 148355 + }, + { + "epoch": 16.321232123212322, + "grad_norm": 0.007011979352682829, + "learning_rate": 4.97878320790294e-06, + "loss": 0.001, + "num_input_tokens_seen": 31309216, + "step": 148360 + }, + { + "epoch": 16.321782178217823, + "grad_norm": 0.020719438791275024, + "learning_rate": 4.977345979366227e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31310304, + "step": 148365 + }, + { + "epoch": 16.32233223322332, + "grad_norm": 0.011433004401624203, + "learning_rate": 4.975908935368701e-06, + "loss": 0.0049, + "num_input_tokens_seen": 31311424, + "step": 148370 + }, + { + "epoch": 16.322882288228822, + "grad_norm": 4.543817520141602, + "learning_rate": 4.97447207592362e-06, + "loss": 0.0394, + "num_input_tokens_seen": 31312448, + "step": 148375 + }, + { + "epoch": 16.323432343234323, + "grad_norm": 0.013788824900984764, + "learning_rate": 4.973035401044216e-06, + "loss": 0.0271, + "num_input_tokens_seen": 31313440, + "step": 148380 + }, + { + "epoch": 16.323982398239824, + "grad_norm": 0.03108544647693634, + "learning_rate": 4.971598910743733e-06, + "loss": 0.0042, + "num_input_tokens_seen": 31314464, + "step": 148385 + }, + { + "epoch": 16.324532453245325, + "grad_norm": 0.1670452505350113, + "learning_rate": 4.97016260503542e-06, + "loss": 0.0264, + "num_input_tokens_seen": 31315488, + "step": 148390 + }, + { + "epoch": 16.325082508250826, + "grad_norm": 0.018924105912446976, + "learning_rate": 4.9687264839324986e-06, + "loss": 0.0417, + "num_input_tokens_seen": 31316512, + "step": 148395 + }, + { + "epoch": 16.325632563256324, + "grad_norm": 0.07376965135335922, + "learning_rate": 4.967290547448222e-06, + "loss": 0.0294, + "num_input_tokens_seen": 31317536, + "step": 148400 + }, + { + "epoch": 16.326182618261825, + "grad_norm": 0.0776967778801918, + "learning_rate": 4.965854795595815e-06, + "loss": 0.0549, + "num_input_tokens_seen": 31318560, + "step": 148405 + }, + { + "epoch": 16.326732673267326, + "grad_norm": 0.008775055408477783, + "learning_rate": 4.964419228388504e-06, + "loss": 0.033, + "num_input_tokens_seen": 31319552, + "step": 148410 + }, + { + "epoch": 16.327282728272827, + "grad_norm": 0.05160035938024521, + "learning_rate": 4.962983845839528e-06, + "loss": 0.0231, + "num_input_tokens_seen": 31320576, + "step": 148415 + }, + { + "epoch": 16.32783278327833, + "grad_norm": 0.9987872242927551, + "learning_rate": 4.961548647962114e-06, + "loss": 0.0074, + "num_input_tokens_seen": 31321728, + "step": 148420 + }, + { + "epoch": 16.32838283828383, + "grad_norm": 0.021262774243950844, + "learning_rate": 4.9601136347694975e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31322784, + "step": 148425 + }, + { + "epoch": 16.32893289328933, + "grad_norm": 0.011240394786000252, + "learning_rate": 4.958678806274897e-06, + "loss": 0.0685, + "num_input_tokens_seen": 31323808, + "step": 148430 + }, + { + "epoch": 16.329482948294828, + "grad_norm": 0.009507554583251476, + "learning_rate": 4.957244162491531e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31324928, + "step": 148435 + }, + { + "epoch": 16.33003300330033, + "grad_norm": 0.0043126787059009075, + "learning_rate": 4.955809703432629e-06, + "loss": 0.0043, + "num_input_tokens_seen": 31325920, + "step": 148440 + }, + { + "epoch": 16.33058305830583, + "grad_norm": 4.030251502990723, + "learning_rate": 4.954375429111419e-06, + "loss": 0.0278, + "num_input_tokens_seen": 31327040, + "step": 148445 + }, + { + "epoch": 16.33113311331133, + "grad_norm": 0.05715305358171463, + "learning_rate": 4.952941339541103e-06, + "loss": 0.002, + "num_input_tokens_seen": 31328096, + "step": 148450 + }, + { + "epoch": 16.331683168316832, + "grad_norm": 0.06114638224244118, + "learning_rate": 4.951507434734914e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31329152, + "step": 148455 + }, + { + "epoch": 16.332233223322334, + "grad_norm": 0.022364744916558266, + "learning_rate": 4.950073714706055e-06, + "loss": 0.0005, + "num_input_tokens_seen": 31330208, + "step": 148460 + }, + { + "epoch": 16.33278327832783, + "grad_norm": 0.028048744425177574, + "learning_rate": 4.948640179467751e-06, + "loss": 0.002, + "num_input_tokens_seen": 31331232, + "step": 148465 + }, + { + "epoch": 16.333333333333332, + "grad_norm": 0.015810659155249596, + "learning_rate": 4.947206829033202e-06, + "loss": 0.0101, + "num_input_tokens_seen": 31332352, + "step": 148470 + }, + { + "epoch": 16.333883388338833, + "grad_norm": 0.015158547088503838, + "learning_rate": 4.945773663415626e-06, + "loss": 0.0285, + "num_input_tokens_seen": 31333440, + "step": 148475 + }, + { + "epoch": 16.334433443344334, + "grad_norm": 0.019548645243048668, + "learning_rate": 4.944340682628237e-06, + "loss": 0.0052, + "num_input_tokens_seen": 31334496, + "step": 148480 + }, + { + "epoch": 16.334983498349835, + "grad_norm": 0.021604446694254875, + "learning_rate": 4.942907886684231e-06, + "loss": 0.0417, + "num_input_tokens_seen": 31335520, + "step": 148485 + }, + { + "epoch": 16.335533553355337, + "grad_norm": 0.1592775583267212, + "learning_rate": 4.941475275596827e-06, + "loss": 0.0083, + "num_input_tokens_seen": 31336608, + "step": 148490 + }, + { + "epoch": 16.336083608360838, + "grad_norm": 0.05555630847811699, + "learning_rate": 4.9400428493792175e-06, + "loss": 0.0084, + "num_input_tokens_seen": 31337664, + "step": 148495 + }, + { + "epoch": 16.336633663366335, + "grad_norm": 2.0136804580688477, + "learning_rate": 4.9386106080445946e-06, + "loss": 0.1574, + "num_input_tokens_seen": 31338720, + "step": 148500 + }, + { + "epoch": 16.337183718371836, + "grad_norm": 0.2903009057044983, + "learning_rate": 4.9371785516061865e-06, + "loss": 0.0037, + "num_input_tokens_seen": 31339776, + "step": 148505 + }, + { + "epoch": 16.337733773377337, + "grad_norm": 0.035004887729883194, + "learning_rate": 4.935746680077166e-06, + "loss": 0.007, + "num_input_tokens_seen": 31340896, + "step": 148510 + }, + { + "epoch": 16.33828382838284, + "grad_norm": 0.085891492664814, + "learning_rate": 4.934314993470751e-06, + "loss": 0.0336, + "num_input_tokens_seen": 31342016, + "step": 148515 + }, + { + "epoch": 16.33883388338834, + "grad_norm": 4.129112720489502, + "learning_rate": 4.932883491800125e-06, + "loss": 0.0608, + "num_input_tokens_seen": 31343104, + "step": 148520 + }, + { + "epoch": 16.33938393839384, + "grad_norm": 0.0077055515721440315, + "learning_rate": 4.931452175078474e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31344160, + "step": 148525 + }, + { + "epoch": 16.33993399339934, + "grad_norm": 0.017899399623274803, + "learning_rate": 4.930021043319002e-06, + "loss": 0.0909, + "num_input_tokens_seen": 31345184, + "step": 148530 + }, + { + "epoch": 16.34048404840484, + "grad_norm": 0.38239967823028564, + "learning_rate": 4.928590096534894e-06, + "loss": 0.0053, + "num_input_tokens_seen": 31346304, + "step": 148535 + }, + { + "epoch": 16.34103410341034, + "grad_norm": 0.33265092968940735, + "learning_rate": 4.9271593347393455e-06, + "loss": 0.1027, + "num_input_tokens_seen": 31347360, + "step": 148540 + }, + { + "epoch": 16.34158415841584, + "grad_norm": 0.029672492295503616, + "learning_rate": 4.925728757945539e-06, + "loss": 0.0544, + "num_input_tokens_seen": 31348384, + "step": 148545 + }, + { + "epoch": 16.342134213421343, + "grad_norm": 0.019349174574017525, + "learning_rate": 4.924298366166652e-06, + "loss": 0.0039, + "num_input_tokens_seen": 31349440, + "step": 148550 + }, + { + "epoch": 16.342684268426844, + "grad_norm": 0.030664237216114998, + "learning_rate": 4.922868159415878e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31350496, + "step": 148555 + }, + { + "epoch": 16.343234323432345, + "grad_norm": 0.5478544235229492, + "learning_rate": 4.921438137706383e-06, + "loss": 0.0546, + "num_input_tokens_seen": 31351552, + "step": 148560 + }, + { + "epoch": 16.343784378437842, + "grad_norm": 0.08556760847568512, + "learning_rate": 4.920008301051371e-06, + "loss": 0.1594, + "num_input_tokens_seen": 31352640, + "step": 148565 + }, + { + "epoch": 16.344334433443343, + "grad_norm": 0.031940706074237823, + "learning_rate": 4.918578649464006e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31353696, + "step": 148570 + }, + { + "epoch": 16.344884488448844, + "grad_norm": 0.013931884430348873, + "learning_rate": 4.917149182957459e-06, + "loss": 0.0012, + "num_input_tokens_seen": 31354688, + "step": 148575 + }, + { + "epoch": 16.345434543454346, + "grad_norm": 0.025370197370648384, + "learning_rate": 4.915719901544919e-06, + "loss": 0.0091, + "num_input_tokens_seen": 31355776, + "step": 148580 + }, + { + "epoch": 16.345984598459847, + "grad_norm": 0.010366183705627918, + "learning_rate": 4.9142908052395436e-06, + "loss": 0.0578, + "num_input_tokens_seen": 31356800, + "step": 148585 + }, + { + "epoch": 16.346534653465348, + "grad_norm": 0.006870264653116465, + "learning_rate": 4.9128618940545126e-06, + "loss": 0.0477, + "num_input_tokens_seen": 31357824, + "step": 148590 + }, + { + "epoch": 16.34708470847085, + "grad_norm": 0.15319828689098358, + "learning_rate": 4.911433168003002e-06, + "loss": 0.0076, + "num_input_tokens_seen": 31358912, + "step": 148595 + }, + { + "epoch": 16.347634763476346, + "grad_norm": 0.2148427963256836, + "learning_rate": 4.910004627098164e-06, + "loss": 0.0195, + "num_input_tokens_seen": 31359968, + "step": 148600 + }, + { + "epoch": 16.348184818481847, + "grad_norm": 0.010278088971972466, + "learning_rate": 4.90857627135318e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31361056, + "step": 148605 + }, + { + "epoch": 16.34873487348735, + "grad_norm": 2.1133339405059814, + "learning_rate": 4.907148100781203e-06, + "loss": 0.0431, + "num_input_tokens_seen": 31362144, + "step": 148610 + }, + { + "epoch": 16.34928492849285, + "grad_norm": 0.09321744740009308, + "learning_rate": 4.9057201153954065e-06, + "loss": 0.0065, + "num_input_tokens_seen": 31363168, + "step": 148615 + }, + { + "epoch": 16.34983498349835, + "grad_norm": 0.012598933652043343, + "learning_rate": 4.9042923152089384e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31364224, + "step": 148620 + }, + { + "epoch": 16.350385038503852, + "grad_norm": 0.01106127630919218, + "learning_rate": 4.902864700234966e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31365216, + "step": 148625 + }, + { + "epoch": 16.35093509350935, + "grad_norm": 0.08392485976219177, + "learning_rate": 4.90143727048665e-06, + "loss": 0.0044, + "num_input_tokens_seen": 31366240, + "step": 148630 + }, + { + "epoch": 16.35148514851485, + "grad_norm": 0.005730106495320797, + "learning_rate": 4.900010025977145e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31367328, + "step": 148635 + }, + { + "epoch": 16.35203520352035, + "grad_norm": 0.30121418833732605, + "learning_rate": 4.8985829667195945e-06, + "loss": 0.1557, + "num_input_tokens_seen": 31368352, + "step": 148640 + }, + { + "epoch": 16.352585258525853, + "grad_norm": 0.025948308408260345, + "learning_rate": 4.897156092727162e-06, + "loss": 0.0662, + "num_input_tokens_seen": 31369408, + "step": 148645 + }, + { + "epoch": 16.353135313531354, + "grad_norm": 0.04581081494688988, + "learning_rate": 4.895729404012992e-06, + "loss": 0.007, + "num_input_tokens_seen": 31370528, + "step": 148650 + }, + { + "epoch": 16.353685368536855, + "grad_norm": 0.050111014395952225, + "learning_rate": 4.894302900590247e-06, + "loss": 0.0058, + "num_input_tokens_seen": 31371648, + "step": 148655 + }, + { + "epoch": 16.354235423542356, + "grad_norm": 0.022923042997717857, + "learning_rate": 4.892876582472061e-06, + "loss": 0.0893, + "num_input_tokens_seen": 31372672, + "step": 148660 + }, + { + "epoch": 16.354785478547853, + "grad_norm": 1.1970224380493164, + "learning_rate": 4.891450449671581e-06, + "loss": 0.0159, + "num_input_tokens_seen": 31373728, + "step": 148665 + }, + { + "epoch": 16.355335533553355, + "grad_norm": 0.018098337575793266, + "learning_rate": 4.890024502201956e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31374816, + "step": 148670 + }, + { + "epoch": 16.355885588558856, + "grad_norm": 0.0032598059624433517, + "learning_rate": 4.8885987400763195e-06, + "loss": 0.0487, + "num_input_tokens_seen": 31375872, + "step": 148675 + }, + { + "epoch": 16.356435643564357, + "grad_norm": 4.147327899932861, + "learning_rate": 4.887173163307821e-06, + "loss": 0.0406, + "num_input_tokens_seen": 31376960, + "step": 148680 + }, + { + "epoch": 16.356985698569858, + "grad_norm": 0.1123923733830452, + "learning_rate": 4.885747771909599e-06, + "loss": 0.0904, + "num_input_tokens_seen": 31378048, + "step": 148685 + }, + { + "epoch": 16.35753575357536, + "grad_norm": 0.004694981034845114, + "learning_rate": 4.884322565894783e-06, + "loss": 0.0667, + "num_input_tokens_seen": 31379168, + "step": 148690 + }, + { + "epoch": 16.358085808580856, + "grad_norm": 3.653923273086548, + "learning_rate": 4.882897545276522e-06, + "loss": 0.0302, + "num_input_tokens_seen": 31380192, + "step": 148695 + }, + { + "epoch": 16.358635863586358, + "grad_norm": 0.026033705100417137, + "learning_rate": 4.8814727100679304e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31381312, + "step": 148700 + }, + { + "epoch": 16.35918591859186, + "grad_norm": 1.2520736455917358, + "learning_rate": 4.880048060282153e-06, + "loss": 0.0236, + "num_input_tokens_seen": 31382368, + "step": 148705 + }, + { + "epoch": 16.35973597359736, + "grad_norm": 0.008806811645627022, + "learning_rate": 4.878623595932324e-06, + "loss": 0.0497, + "num_input_tokens_seen": 31383424, + "step": 148710 + }, + { + "epoch": 16.36028602860286, + "grad_norm": 0.007873307913541794, + "learning_rate": 4.877199317031561e-06, + "loss": 0.0084, + "num_input_tokens_seen": 31384512, + "step": 148715 + }, + { + "epoch": 16.360836083608362, + "grad_norm": 0.011849531903862953, + "learning_rate": 4.875775223593004e-06, + "loss": 0.0502, + "num_input_tokens_seen": 31385536, + "step": 148720 + }, + { + "epoch": 16.361386138613863, + "grad_norm": 0.039563655853271484, + "learning_rate": 4.8743513156297605e-06, + "loss": 0.1052, + "num_input_tokens_seen": 31386656, + "step": 148725 + }, + { + "epoch": 16.36193619361936, + "grad_norm": 3.4847233295440674, + "learning_rate": 4.8729275931549715e-06, + "loss": 0.1283, + "num_input_tokens_seen": 31387744, + "step": 148730 + }, + { + "epoch": 16.36248624862486, + "grad_norm": 0.7381970882415771, + "learning_rate": 4.871504056181747e-06, + "loss": 0.0174, + "num_input_tokens_seen": 31388736, + "step": 148735 + }, + { + "epoch": 16.363036303630363, + "grad_norm": 0.00883081927895546, + "learning_rate": 4.8700807047232065e-06, + "loss": 0.0423, + "num_input_tokens_seen": 31389760, + "step": 148740 + }, + { + "epoch": 16.363586358635864, + "grad_norm": 1.5649093389511108, + "learning_rate": 4.868657538792482e-06, + "loss": 0.0673, + "num_input_tokens_seen": 31390816, + "step": 148745 + }, + { + "epoch": 16.364136413641365, + "grad_norm": 0.03762596473097801, + "learning_rate": 4.867234558402675e-06, + "loss": 0.0053, + "num_input_tokens_seen": 31391872, + "step": 148750 + }, + { + "epoch": 16.364686468646866, + "grad_norm": 0.027987010776996613, + "learning_rate": 4.86581176356691e-06, + "loss": 0.0335, + "num_input_tokens_seen": 31392928, + "step": 148755 + }, + { + "epoch": 16.365236523652364, + "grad_norm": 2.014155149459839, + "learning_rate": 4.8643891542982935e-06, + "loss": 0.1192, + "num_input_tokens_seen": 31393952, + "step": 148760 + }, + { + "epoch": 16.365786578657865, + "grad_norm": 0.07672532647848129, + "learning_rate": 4.862966730609938e-06, + "loss": 0.0037, + "num_input_tokens_seen": 31395008, + "step": 148765 + }, + { + "epoch": 16.366336633663366, + "grad_norm": 0.09098224341869354, + "learning_rate": 4.861544492514963e-06, + "loss": 0.002, + "num_input_tokens_seen": 31396064, + "step": 148770 + }, + { + "epoch": 16.366886688668867, + "grad_norm": 0.05757766216993332, + "learning_rate": 4.8601224400264625e-06, + "loss": 0.0828, + "num_input_tokens_seen": 31397152, + "step": 148775 + }, + { + "epoch": 16.367436743674368, + "grad_norm": 0.14574980735778809, + "learning_rate": 4.858700573157557e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31398208, + "step": 148780 + }, + { + "epoch": 16.36798679867987, + "grad_norm": 0.18603065609931946, + "learning_rate": 4.8572788919213404e-06, + "loss": 0.0993, + "num_input_tokens_seen": 31399296, + "step": 148785 + }, + { + "epoch": 16.36853685368537, + "grad_norm": 0.11782700568437576, + "learning_rate": 4.855857396330915e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31400352, + "step": 148790 + }, + { + "epoch": 16.369086908690868, + "grad_norm": 0.02414502203464508, + "learning_rate": 4.854436086399386e-06, + "loss": 0.1941, + "num_input_tokens_seen": 31401408, + "step": 148795 + }, + { + "epoch": 16.36963696369637, + "grad_norm": 0.09644485265016556, + "learning_rate": 4.853014962139851e-06, + "loss": 0.2273, + "num_input_tokens_seen": 31402592, + "step": 148800 + }, + { + "epoch": 16.37018701870187, + "grad_norm": 0.33264970779418945, + "learning_rate": 4.851594023565418e-06, + "loss": 0.0096, + "num_input_tokens_seen": 31403552, + "step": 148805 + }, + { + "epoch": 16.37073707370737, + "grad_norm": 0.14602063596248627, + "learning_rate": 4.8501732706891735e-06, + "loss": 0.043, + "num_input_tokens_seen": 31404640, + "step": 148810 + }, + { + "epoch": 16.371287128712872, + "grad_norm": 0.013501932844519615, + "learning_rate": 4.848752703524207e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31405632, + "step": 148815 + }, + { + "epoch": 16.371837183718373, + "grad_norm": 0.005280149634927511, + "learning_rate": 4.847332322083617e-06, + "loss": 0.0078, + "num_input_tokens_seen": 31406688, + "step": 148820 + }, + { + "epoch": 16.37238723872387, + "grad_norm": 0.019284162670373917, + "learning_rate": 4.845912126380495e-06, + "loss": 0.1027, + "num_input_tokens_seen": 31407712, + "step": 148825 + }, + { + "epoch": 16.372937293729372, + "grad_norm": 0.024551985785365105, + "learning_rate": 4.844492116427934e-06, + "loss": 0.008, + "num_input_tokens_seen": 31408832, + "step": 148830 + }, + { + "epoch": 16.373487348734873, + "grad_norm": 0.09775020182132721, + "learning_rate": 4.8430722922390235e-06, + "loss": 0.0366, + "num_input_tokens_seen": 31409888, + "step": 148835 + }, + { + "epoch": 16.374037403740374, + "grad_norm": 0.03879937529563904, + "learning_rate": 4.841652653826831e-06, + "loss": 0.0016, + "num_input_tokens_seen": 31410944, + "step": 148840 + }, + { + "epoch": 16.374587458745875, + "grad_norm": 0.05659081041812897, + "learning_rate": 4.840233201204464e-06, + "loss": 0.0576, + "num_input_tokens_seen": 31412000, + "step": 148845 + }, + { + "epoch": 16.375137513751376, + "grad_norm": 0.05360521748661995, + "learning_rate": 4.838813934384983e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31413056, + "step": 148850 + }, + { + "epoch": 16.375687568756877, + "grad_norm": 0.012890118174254894, + "learning_rate": 4.837394853381485e-06, + "loss": 0.0971, + "num_input_tokens_seen": 31414144, + "step": 148855 + }, + { + "epoch": 16.376237623762375, + "grad_norm": 0.02958420105278492, + "learning_rate": 4.835975958207048e-06, + "loss": 0.0898, + "num_input_tokens_seen": 31415168, + "step": 148860 + }, + { + "epoch": 16.376787678767876, + "grad_norm": 0.18309560418128967, + "learning_rate": 4.834557248874738e-06, + "loss": 0.0135, + "num_input_tokens_seen": 31416224, + "step": 148865 + }, + { + "epoch": 16.377337733773377, + "grad_norm": 0.14131931960582733, + "learning_rate": 4.833138725397643e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31417248, + "step": 148870 + }, + { + "epoch": 16.377887788778878, + "grad_norm": 0.0068810260854661465, + "learning_rate": 4.831720387788827e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31418272, + "step": 148875 + }, + { + "epoch": 16.37843784378438, + "grad_norm": 0.018051858991384506, + "learning_rate": 4.830302236061365e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31419296, + "step": 148880 + }, + { + "epoch": 16.37898789878988, + "grad_norm": 0.03696072846651077, + "learning_rate": 4.828884270228334e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31420320, + "step": 148885 + }, + { + "epoch": 16.379537953795378, + "grad_norm": 0.07171696424484253, + "learning_rate": 4.827466490302792e-06, + "loss": 0.0026, + "num_input_tokens_seen": 31421312, + "step": 148890 + }, + { + "epoch": 16.38008800880088, + "grad_norm": 0.039155665785074234, + "learning_rate": 4.826048896297821e-06, + "loss": 0.0139, + "num_input_tokens_seen": 31422368, + "step": 148895 + }, + { + "epoch": 16.38063806380638, + "grad_norm": 0.03477063775062561, + "learning_rate": 4.824631488226475e-06, + "loss": 0.0151, + "num_input_tokens_seen": 31423456, + "step": 148900 + }, + { + "epoch": 16.38118811881188, + "grad_norm": 0.02424788475036621, + "learning_rate": 4.823214266101814e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31424576, + "step": 148905 + }, + { + "epoch": 16.381738173817382, + "grad_norm": 0.018191022798419, + "learning_rate": 4.821797229936903e-06, + "loss": 0.015, + "num_input_tokens_seen": 31425632, + "step": 148910 + }, + { + "epoch": 16.382288228822883, + "grad_norm": 0.008266692981123924, + "learning_rate": 4.820380379744807e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31426720, + "step": 148915 + }, + { + "epoch": 16.382838283828384, + "grad_norm": 0.05076397955417633, + "learning_rate": 4.818963715538586e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31427744, + "step": 148920 + }, + { + "epoch": 16.383388338833882, + "grad_norm": 0.013979235664010048, + "learning_rate": 4.817547237331293e-06, + "loss": 0.0008, + "num_input_tokens_seen": 31428832, + "step": 148925 + }, + { + "epoch": 16.383938393839383, + "grad_norm": 0.002176059177145362, + "learning_rate": 4.816130945135977e-06, + "loss": 0.0226, + "num_input_tokens_seen": 31429920, + "step": 148930 + }, + { + "epoch": 16.384488448844884, + "grad_norm": 0.02223377116024494, + "learning_rate": 4.814714838965698e-06, + "loss": 0.1302, + "num_input_tokens_seen": 31431008, + "step": 148935 + }, + { + "epoch": 16.385038503850385, + "grad_norm": 0.021301453933119774, + "learning_rate": 4.813298918833506e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31432064, + "step": 148940 + }, + { + "epoch": 16.385588558855886, + "grad_norm": 0.015056350268423557, + "learning_rate": 4.811883184752458e-06, + "loss": 0.0503, + "num_input_tokens_seen": 31433152, + "step": 148945 + }, + { + "epoch": 16.386138613861387, + "grad_norm": 0.01807023584842682, + "learning_rate": 4.810467636735594e-06, + "loss": 0.0052, + "num_input_tokens_seen": 31434208, + "step": 148950 + }, + { + "epoch": 16.38668866886689, + "grad_norm": 0.005011126399040222, + "learning_rate": 4.809052274795956e-06, + "loss": 0.001, + "num_input_tokens_seen": 31435296, + "step": 148955 + }, + { + "epoch": 16.387238723872386, + "grad_norm": 0.06516561657190323, + "learning_rate": 4.807637098946602e-06, + "loss": 0.0057, + "num_input_tokens_seen": 31436288, + "step": 148960 + }, + { + "epoch": 16.387788778877887, + "grad_norm": 0.010218014940619469, + "learning_rate": 4.806222109200559e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31437376, + "step": 148965 + }, + { + "epoch": 16.388338833883388, + "grad_norm": 1.649653434753418, + "learning_rate": 4.80480730557088e-06, + "loss": 0.006, + "num_input_tokens_seen": 31438400, + "step": 148970 + }, + { + "epoch": 16.38888888888889, + "grad_norm": 0.07152390480041504, + "learning_rate": 4.803392688070605e-06, + "loss": 0.1738, + "num_input_tokens_seen": 31439392, + "step": 148975 + }, + { + "epoch": 16.38943894389439, + "grad_norm": 0.23778973519802094, + "learning_rate": 4.801978256712764e-06, + "loss": 0.0042, + "num_input_tokens_seen": 31440416, + "step": 148980 + }, + { + "epoch": 16.38998899889989, + "grad_norm": 0.0889880359172821, + "learning_rate": 4.800564011510403e-06, + "loss": 0.0099, + "num_input_tokens_seen": 31441408, + "step": 148985 + }, + { + "epoch": 16.39053905390539, + "grad_norm": 0.034735266119241714, + "learning_rate": 4.799149952476545e-06, + "loss": 0.0137, + "num_input_tokens_seen": 31442496, + "step": 148990 + }, + { + "epoch": 16.39108910891089, + "grad_norm": 0.018853822723031044, + "learning_rate": 4.797736079624227e-06, + "loss": 0.003, + "num_input_tokens_seen": 31443520, + "step": 148995 + }, + { + "epoch": 16.39163916391639, + "grad_norm": 0.07846719771623611, + "learning_rate": 4.7963223929664875e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31444576, + "step": 149000 + }, + { + "epoch": 16.392189218921892, + "grad_norm": 0.09026388078927994, + "learning_rate": 4.794908892516345e-06, + "loss": 0.0567, + "num_input_tokens_seen": 31445600, + "step": 149005 + }, + { + "epoch": 16.392739273927393, + "grad_norm": 0.009256141260266304, + "learning_rate": 4.793495578286838e-06, + "loss": 0.0256, + "num_input_tokens_seen": 31446688, + "step": 149010 + }, + { + "epoch": 16.393289328932894, + "grad_norm": 0.03358638286590576, + "learning_rate": 4.792082450290983e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31447808, + "step": 149015 + }, + { + "epoch": 16.393839383938396, + "grad_norm": 0.02140713669359684, + "learning_rate": 4.790669508541809e-06, + "loss": 0.003, + "num_input_tokens_seen": 31448832, + "step": 149020 + }, + { + "epoch": 16.394389438943893, + "grad_norm": 0.08224312216043472, + "learning_rate": 4.7892567530523316e-06, + "loss": 0.0072, + "num_input_tokens_seen": 31449824, + "step": 149025 + }, + { + "epoch": 16.394939493949394, + "grad_norm": 0.16594551503658295, + "learning_rate": 4.787844183835577e-06, + "loss": 0.0037, + "num_input_tokens_seen": 31450848, + "step": 149030 + }, + { + "epoch": 16.395489548954895, + "grad_norm": 0.27176523208618164, + "learning_rate": 4.7864318009045725e-06, + "loss": 0.0844, + "num_input_tokens_seen": 31451936, + "step": 149035 + }, + { + "epoch": 16.396039603960396, + "grad_norm": 0.2693754732608795, + "learning_rate": 4.785019604272323e-06, + "loss": 0.0054, + "num_input_tokens_seen": 31453024, + "step": 149040 + }, + { + "epoch": 16.396589658965897, + "grad_norm": 0.028180411085486412, + "learning_rate": 4.783607593951844e-06, + "loss": 0.0825, + "num_input_tokens_seen": 31454048, + "step": 149045 + }, + { + "epoch": 16.3971397139714, + "grad_norm": 3.228813886642456, + "learning_rate": 4.782195769956152e-06, + "loss": 0.0446, + "num_input_tokens_seen": 31455168, + "step": 149050 + }, + { + "epoch": 16.397689768976896, + "grad_norm": 0.2555233836174011, + "learning_rate": 4.78078413229826e-06, + "loss": 0.0046, + "num_input_tokens_seen": 31456224, + "step": 149055 + }, + { + "epoch": 16.398239823982397, + "grad_norm": 2.673207998275757, + "learning_rate": 4.779372680991184e-06, + "loss": 0.073, + "num_input_tokens_seen": 31457248, + "step": 149060 + }, + { + "epoch": 16.3987898789879, + "grad_norm": 2.8011090755462646, + "learning_rate": 4.77796141604793e-06, + "loss": 0.0075, + "num_input_tokens_seen": 31458304, + "step": 149065 + }, + { + "epoch": 16.3993399339934, + "grad_norm": 0.05018763616681099, + "learning_rate": 4.7765503374814915e-06, + "loss": 0.0411, + "num_input_tokens_seen": 31459360, + "step": 149070 + }, + { + "epoch": 16.3998899889989, + "grad_norm": 0.012965979985892773, + "learning_rate": 4.775139445304894e-06, + "loss": 0.0405, + "num_input_tokens_seen": 31460416, + "step": 149075 + }, + { + "epoch": 16.4004400440044, + "grad_norm": 0.04238322749733925, + "learning_rate": 4.773728739531125e-06, + "loss": 0.0389, + "num_input_tokens_seen": 31461472, + "step": 149080 + }, + { + "epoch": 16.400990099009903, + "grad_norm": 1.0404093265533447, + "learning_rate": 4.772318220173192e-06, + "loss": 0.0072, + "num_input_tokens_seen": 31462496, + "step": 149085 + }, + { + "epoch": 16.4015401540154, + "grad_norm": 1.9348584413528442, + "learning_rate": 4.770907887244103e-06, + "loss": 0.0556, + "num_input_tokens_seen": 31463552, + "step": 149090 + }, + { + "epoch": 16.4020902090209, + "grad_norm": 1.1141794919967651, + "learning_rate": 4.769497740756842e-06, + "loss": 0.0116, + "num_input_tokens_seen": 31464672, + "step": 149095 + }, + { + "epoch": 16.402640264026402, + "grad_norm": 0.013705318793654442, + "learning_rate": 4.768087780724418e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31465696, + "step": 149100 + }, + { + "epoch": 16.403190319031903, + "grad_norm": 0.04244713485240936, + "learning_rate": 4.7666780071598165e-06, + "loss": 0.0073, + "num_input_tokens_seen": 31466720, + "step": 149105 + }, + { + "epoch": 16.403740374037405, + "grad_norm": 0.032394442707300186, + "learning_rate": 4.765268420076033e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31467712, + "step": 149110 + }, + { + "epoch": 16.404290429042906, + "grad_norm": 0.018386516720056534, + "learning_rate": 4.76385901948607e-06, + "loss": 0.0988, + "num_input_tokens_seen": 31468736, + "step": 149115 + }, + { + "epoch": 16.404840484048403, + "grad_norm": 0.2512420117855072, + "learning_rate": 4.762449805402899e-06, + "loss": 0.0366, + "num_input_tokens_seen": 31469824, + "step": 149120 + }, + { + "epoch": 16.405390539053904, + "grad_norm": 0.153122216463089, + "learning_rate": 4.761040777839523e-06, + "loss": 0.217, + "num_input_tokens_seen": 31470912, + "step": 149125 + }, + { + "epoch": 16.405940594059405, + "grad_norm": 6.210613250732422, + "learning_rate": 4.759631936808917e-06, + "loss": 0.2103, + "num_input_tokens_seen": 31472000, + "step": 149130 + }, + { + "epoch": 16.406490649064907, + "grad_norm": 0.4714054763317108, + "learning_rate": 4.758223282324078e-06, + "loss": 0.0112, + "num_input_tokens_seen": 31473056, + "step": 149135 + }, + { + "epoch": 16.407040704070408, + "grad_norm": 0.17678186297416687, + "learning_rate": 4.756814814397978e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31474208, + "step": 149140 + }, + { + "epoch": 16.40759075907591, + "grad_norm": 0.012421566061675549, + "learning_rate": 4.7554065330435994e-06, + "loss": 0.0989, + "num_input_tokens_seen": 31475264, + "step": 149145 + }, + { + "epoch": 16.40814081408141, + "grad_norm": 0.030213214457035065, + "learning_rate": 4.753998438273932e-06, + "loss": 0.0082, + "num_input_tokens_seen": 31476352, + "step": 149150 + }, + { + "epoch": 16.408690869086907, + "grad_norm": 0.005501865409314632, + "learning_rate": 4.752590530101939e-06, + "loss": 0.0447, + "num_input_tokens_seen": 31477408, + "step": 149155 + }, + { + "epoch": 16.40924092409241, + "grad_norm": 0.2257305383682251, + "learning_rate": 4.751182808540611e-06, + "loss": 0.0347, + "num_input_tokens_seen": 31478464, + "step": 149160 + }, + { + "epoch": 16.40979097909791, + "grad_norm": 0.3694220781326294, + "learning_rate": 4.749775273602908e-06, + "loss": 0.0377, + "num_input_tokens_seen": 31479456, + "step": 149165 + }, + { + "epoch": 16.41034103410341, + "grad_norm": 1.1395328044891357, + "learning_rate": 4.748367925301809e-06, + "loss": 0.0223, + "num_input_tokens_seen": 31480576, + "step": 149170 + }, + { + "epoch": 16.41089108910891, + "grad_norm": 2.2853496074676514, + "learning_rate": 4.746960763650291e-06, + "loss": 0.0485, + "num_input_tokens_seen": 31481600, + "step": 149175 + }, + { + "epoch": 16.411441144114413, + "grad_norm": 0.019126413390040398, + "learning_rate": 4.745553788661309e-06, + "loss": 0.0075, + "num_input_tokens_seen": 31482624, + "step": 149180 + }, + { + "epoch": 16.41199119911991, + "grad_norm": 0.0396285206079483, + "learning_rate": 4.744147000347848e-06, + "loss": 0.0662, + "num_input_tokens_seen": 31483744, + "step": 149185 + }, + { + "epoch": 16.41254125412541, + "grad_norm": 0.027193669229745865, + "learning_rate": 4.742740398722864e-06, + "loss": 0.0588, + "num_input_tokens_seen": 31484800, + "step": 149190 + }, + { + "epoch": 16.413091309130913, + "grad_norm": 0.08271882683038712, + "learning_rate": 4.7413339837993145e-06, + "loss": 0.0361, + "num_input_tokens_seen": 31485888, + "step": 149195 + }, + { + "epoch": 16.413641364136414, + "grad_norm": 0.017663855105638504, + "learning_rate": 4.739927755590168e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31486912, + "step": 149200 + }, + { + "epoch": 16.414191419141915, + "grad_norm": 0.008930985815823078, + "learning_rate": 4.738521714108387e-06, + "loss": 0.0363, + "num_input_tokens_seen": 31487904, + "step": 149205 + }, + { + "epoch": 16.414741474147416, + "grad_norm": 0.03677544742822647, + "learning_rate": 4.737115859366934e-06, + "loss": 0.0024, + "num_input_tokens_seen": 31488928, + "step": 149210 + }, + { + "epoch": 16.415291529152917, + "grad_norm": 1.5304094552993774, + "learning_rate": 4.735710191378761e-06, + "loss": 0.0393, + "num_input_tokens_seen": 31489984, + "step": 149215 + }, + { + "epoch": 16.415841584158414, + "grad_norm": 0.05494474992156029, + "learning_rate": 4.73430471015682e-06, + "loss": 0.0016, + "num_input_tokens_seen": 31491008, + "step": 149220 + }, + { + "epoch": 16.416391639163916, + "grad_norm": 0.010499962605535984, + "learning_rate": 4.732899415714065e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31492064, + "step": 149225 + }, + { + "epoch": 16.416941694169417, + "grad_norm": 0.0651368722319603, + "learning_rate": 4.7314943080634585e-06, + "loss": 0.0283, + "num_input_tokens_seen": 31493056, + "step": 149230 + }, + { + "epoch": 16.417491749174918, + "grad_norm": 0.02774817869067192, + "learning_rate": 4.730089387217937e-06, + "loss": 0.0797, + "num_input_tokens_seen": 31494080, + "step": 149235 + }, + { + "epoch": 16.41804180418042, + "grad_norm": 0.015945380553603172, + "learning_rate": 4.72868465319046e-06, + "loss": 0.0049, + "num_input_tokens_seen": 31495168, + "step": 149240 + }, + { + "epoch": 16.41859185918592, + "grad_norm": 0.056820426136255264, + "learning_rate": 4.727280105993964e-06, + "loss": 0.0195, + "num_input_tokens_seen": 31496192, + "step": 149245 + }, + { + "epoch": 16.419141914191417, + "grad_norm": 0.2820354700088501, + "learning_rate": 4.725875745641406e-06, + "loss": 0.0106, + "num_input_tokens_seen": 31497248, + "step": 149250 + }, + { + "epoch": 16.41969196919692, + "grad_norm": 0.02322325110435486, + "learning_rate": 4.724471572145717e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31498272, + "step": 149255 + }, + { + "epoch": 16.42024202420242, + "grad_norm": 0.03631564602255821, + "learning_rate": 4.723067585519844e-06, + "loss": 0.005, + "num_input_tokens_seen": 31499328, + "step": 149260 + }, + { + "epoch": 16.42079207920792, + "grad_norm": 0.0035412826109677553, + "learning_rate": 4.721663785776734e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31500384, + "step": 149265 + }, + { + "epoch": 16.421342134213422, + "grad_norm": 0.13111162185668945, + "learning_rate": 4.7202601729293115e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31501504, + "step": 149270 + }, + { + "epoch": 16.421892189218923, + "grad_norm": 0.575334370136261, + "learning_rate": 4.718856746990524e-06, + "loss": 0.0095, + "num_input_tokens_seen": 31502496, + "step": 149275 + }, + { + "epoch": 16.422442244224424, + "grad_norm": 0.11797787249088287, + "learning_rate": 4.7174535079733045e-06, + "loss": 0.0042, + "num_input_tokens_seen": 31503616, + "step": 149280 + }, + { + "epoch": 16.42299229922992, + "grad_norm": 0.3575993776321411, + "learning_rate": 4.71605045589057e-06, + "loss": 0.012, + "num_input_tokens_seen": 31504704, + "step": 149285 + }, + { + "epoch": 16.423542354235423, + "grad_norm": 0.015487399883568287, + "learning_rate": 4.714647590755278e-06, + "loss": 0.0109, + "num_input_tokens_seen": 31505760, + "step": 149290 + }, + { + "epoch": 16.424092409240924, + "grad_norm": 0.046638984233140945, + "learning_rate": 4.713244912580339e-06, + "loss": 0.0032, + "num_input_tokens_seen": 31506784, + "step": 149295 + }, + { + "epoch": 16.424642464246425, + "grad_norm": 0.003996764775365591, + "learning_rate": 4.711842421378696e-06, + "loss": 0.0081, + "num_input_tokens_seen": 31507904, + "step": 149300 + }, + { + "epoch": 16.425192519251926, + "grad_norm": 0.01588940992951393, + "learning_rate": 4.710440117163262e-06, + "loss": 0.0171, + "num_input_tokens_seen": 31508960, + "step": 149305 + }, + { + "epoch": 16.425742574257427, + "grad_norm": 0.0141665143892169, + "learning_rate": 4.70903799994696e-06, + "loss": 0.1854, + "num_input_tokens_seen": 31510016, + "step": 149310 + }, + { + "epoch": 16.426292629262925, + "grad_norm": 2.3097426891326904, + "learning_rate": 4.707636069742722e-06, + "loss": 0.0255, + "num_input_tokens_seen": 31511008, + "step": 149315 + }, + { + "epoch": 16.426842684268426, + "grad_norm": 1.9545884132385254, + "learning_rate": 4.706234326563461e-06, + "loss": 0.1396, + "num_input_tokens_seen": 31512032, + "step": 149320 + }, + { + "epoch": 16.427392739273927, + "grad_norm": 0.1991194486618042, + "learning_rate": 4.70483277042211e-06, + "loss": 0.0037, + "num_input_tokens_seen": 31513088, + "step": 149325 + }, + { + "epoch": 16.427942794279428, + "grad_norm": 0.020690452307462692, + "learning_rate": 4.703431401331573e-06, + "loss": 0.0091, + "num_input_tokens_seen": 31514144, + "step": 149330 + }, + { + "epoch": 16.42849284928493, + "grad_norm": 0.03367901220917702, + "learning_rate": 4.702030219304765e-06, + "loss": 0.0018, + "num_input_tokens_seen": 31515200, + "step": 149335 + }, + { + "epoch": 16.42904290429043, + "grad_norm": 0.03278520330786705, + "learning_rate": 4.700629224354613e-06, + "loss": 0.0988, + "num_input_tokens_seen": 31516256, + "step": 149340 + }, + { + "epoch": 16.42959295929593, + "grad_norm": 0.013936772011220455, + "learning_rate": 4.699228416494006e-06, + "loss": 0.1048, + "num_input_tokens_seen": 31517280, + "step": 149345 + }, + { + "epoch": 16.43014301430143, + "grad_norm": 0.030781783163547516, + "learning_rate": 4.697827795735882e-06, + "loss": 0.0363, + "num_input_tokens_seen": 31518368, + "step": 149350 + }, + { + "epoch": 16.43069306930693, + "grad_norm": 0.011547105386853218, + "learning_rate": 4.696427362093137e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31519360, + "step": 149355 + }, + { + "epoch": 16.43124312431243, + "grad_norm": 0.10478588938713074, + "learning_rate": 4.695027115578674e-06, + "loss": 0.0083, + "num_input_tokens_seen": 31520480, + "step": 149360 + }, + { + "epoch": 16.431793179317932, + "grad_norm": 0.060732755810022354, + "learning_rate": 4.693627056205407e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31521536, + "step": 149365 + }, + { + "epoch": 16.432343234323433, + "grad_norm": 0.2930883467197418, + "learning_rate": 4.692227183986231e-06, + "loss": 0.0065, + "num_input_tokens_seen": 31522624, + "step": 149370 + }, + { + "epoch": 16.432893289328934, + "grad_norm": 0.17783065140247345, + "learning_rate": 4.69082749893405e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31523616, + "step": 149375 + }, + { + "epoch": 16.433443344334435, + "grad_norm": 0.23773989081382751, + "learning_rate": 4.689428001061774e-06, + "loss": 0.0243, + "num_input_tokens_seen": 31524640, + "step": 149380 + }, + { + "epoch": 16.433993399339933, + "grad_norm": 0.027701448649168015, + "learning_rate": 4.6880286903822875e-06, + "loss": 0.004, + "num_input_tokens_seen": 31525696, + "step": 149385 + }, + { + "epoch": 16.434543454345434, + "grad_norm": 0.008203006349503994, + "learning_rate": 4.686629566908501e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31526720, + "step": 149390 + }, + { + "epoch": 16.435093509350935, + "grad_norm": 1.548359751701355, + "learning_rate": 4.685230630653298e-06, + "loss": 0.0873, + "num_input_tokens_seen": 31527744, + "step": 149395 + }, + { + "epoch": 16.435643564356436, + "grad_norm": 0.03527655079960823, + "learning_rate": 4.68383188162958e-06, + "loss": 0.0366, + "num_input_tokens_seen": 31528864, + "step": 149400 + }, + { + "epoch": 16.436193619361937, + "grad_norm": 0.08558455109596252, + "learning_rate": 4.682433319850229e-06, + "loss": 0.0415, + "num_input_tokens_seen": 31529952, + "step": 149405 + }, + { + "epoch": 16.436743674367438, + "grad_norm": 0.051823604851961136, + "learning_rate": 4.681034945328141e-06, + "loss": 0.039, + "num_input_tokens_seen": 31531040, + "step": 149410 + }, + { + "epoch": 16.437293729372936, + "grad_norm": 0.024111056700348854, + "learning_rate": 4.6796367580762116e-06, + "loss": 0.0005, + "num_input_tokens_seen": 31532064, + "step": 149415 + }, + { + "epoch": 16.437843784378437, + "grad_norm": 0.025223013013601303, + "learning_rate": 4.678238758107317e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31533120, + "step": 149420 + }, + { + "epoch": 16.438393839383938, + "grad_norm": 0.02957652322947979, + "learning_rate": 4.676840945434338e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31534240, + "step": 149425 + }, + { + "epoch": 16.43894389438944, + "grad_norm": 1.515123963356018, + "learning_rate": 4.675443320070166e-06, + "loss": 0.1175, + "num_input_tokens_seen": 31535360, + "step": 149430 + }, + { + "epoch": 16.43949394939494, + "grad_norm": 0.0028015868738293648, + "learning_rate": 4.674045882027681e-06, + "loss": 0.0501, + "num_input_tokens_seen": 31536352, + "step": 149435 + }, + { + "epoch": 16.44004400440044, + "grad_norm": 0.06588361412286758, + "learning_rate": 4.672648631319765e-06, + "loss": 0.003, + "num_input_tokens_seen": 31537376, + "step": 149440 + }, + { + "epoch": 16.440594059405942, + "grad_norm": 0.8587848544120789, + "learning_rate": 4.671251567959295e-06, + "loss": 0.0218, + "num_input_tokens_seen": 31538496, + "step": 149445 + }, + { + "epoch": 16.44114411441144, + "grad_norm": 1.2462204694747925, + "learning_rate": 4.669854691959139e-06, + "loss": 0.0358, + "num_input_tokens_seen": 31539584, + "step": 149450 + }, + { + "epoch": 16.44169416941694, + "grad_norm": 1.0764572620391846, + "learning_rate": 4.66845800333218e-06, + "loss": 0.1539, + "num_input_tokens_seen": 31540640, + "step": 149455 + }, + { + "epoch": 16.442244224422442, + "grad_norm": 0.2556208372116089, + "learning_rate": 4.667061502091283e-06, + "loss": 0.0291, + "num_input_tokens_seen": 31541728, + "step": 149460 + }, + { + "epoch": 16.442794279427943, + "grad_norm": 0.00916940625756979, + "learning_rate": 4.665665188249324e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31542784, + "step": 149465 + }, + { + "epoch": 16.443344334433444, + "grad_norm": 0.030839789658784866, + "learning_rate": 4.6642690618191764e-06, + "loss": 0.0756, + "num_input_tokens_seen": 31543808, + "step": 149470 + }, + { + "epoch": 16.443894389438945, + "grad_norm": 0.012375322170555592, + "learning_rate": 4.662873122813699e-06, + "loss": 0.0026, + "num_input_tokens_seen": 31544896, + "step": 149475 + }, + { + "epoch": 16.444444444444443, + "grad_norm": 0.04026706889271736, + "learning_rate": 4.661477371245765e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31545952, + "step": 149480 + }, + { + "epoch": 16.444994499449944, + "grad_norm": 2.3634836673736572, + "learning_rate": 4.66008180712823e-06, + "loss": 0.1123, + "num_input_tokens_seen": 31547008, + "step": 149485 + }, + { + "epoch": 16.445544554455445, + "grad_norm": 2.2754476070404053, + "learning_rate": 4.65868643047396e-06, + "loss": 0.0732, + "num_input_tokens_seen": 31548064, + "step": 149490 + }, + { + "epoch": 16.446094609460946, + "grad_norm": 0.08705953508615494, + "learning_rate": 4.657291241295825e-06, + "loss": 0.0254, + "num_input_tokens_seen": 31549056, + "step": 149495 + }, + { + "epoch": 16.446644664466447, + "grad_norm": 0.2647656202316284, + "learning_rate": 4.655896239606666e-06, + "loss": 0.0135, + "num_input_tokens_seen": 31550176, + "step": 149500 + }, + { + "epoch": 16.44719471947195, + "grad_norm": 0.3339744210243225, + "learning_rate": 4.654501425419358e-06, + "loss": 0.0147, + "num_input_tokens_seen": 31551264, + "step": 149505 + }, + { + "epoch": 16.44774477447745, + "grad_norm": 0.14456991851329803, + "learning_rate": 4.65310679874674e-06, + "loss": 0.007, + "num_input_tokens_seen": 31552288, + "step": 149510 + }, + { + "epoch": 16.448294829482947, + "grad_norm": 0.03652719780802727, + "learning_rate": 4.651712359601678e-06, + "loss": 0.0029, + "num_input_tokens_seen": 31553344, + "step": 149515 + }, + { + "epoch": 16.448844884488448, + "grad_norm": 0.08659019321203232, + "learning_rate": 4.6503181079970144e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31554432, + "step": 149520 + }, + { + "epoch": 16.44939493949395, + "grad_norm": 0.09401887655258179, + "learning_rate": 4.6489240439456044e-06, + "loss": 0.0024, + "num_input_tokens_seen": 31555488, + "step": 149525 + }, + { + "epoch": 16.44994499449945, + "grad_norm": 0.10159943997859955, + "learning_rate": 4.647530167460304e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31556544, + "step": 149530 + }, + { + "epoch": 16.45049504950495, + "grad_norm": 0.005523481406271458, + "learning_rate": 4.646136478553945e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31557664, + "step": 149535 + }, + { + "epoch": 16.451045104510452, + "grad_norm": 0.35415950417518616, + "learning_rate": 4.644742977239383e-06, + "loss": 0.0481, + "num_input_tokens_seen": 31558752, + "step": 149540 + }, + { + "epoch": 16.45159515951595, + "grad_norm": 0.18668167293071747, + "learning_rate": 4.6433496635294535e-06, + "loss": 0.0029, + "num_input_tokens_seen": 31559808, + "step": 149545 + }, + { + "epoch": 16.45214521452145, + "grad_norm": 0.02782640978693962, + "learning_rate": 4.641956537437e-06, + "loss": 0.0564, + "num_input_tokens_seen": 31560832, + "step": 149550 + }, + { + "epoch": 16.452695269526952, + "grad_norm": 0.023800035938620567, + "learning_rate": 4.640563598974873e-06, + "loss": 0.0097, + "num_input_tokens_seen": 31561792, + "step": 149555 + }, + { + "epoch": 16.453245324532453, + "grad_norm": 0.013495051302015781, + "learning_rate": 4.6391708481558936e-06, + "loss": 0.1148, + "num_input_tokens_seen": 31562816, + "step": 149560 + }, + { + "epoch": 16.453795379537954, + "grad_norm": 0.05913310497999191, + "learning_rate": 4.6377782849929135e-06, + "loss": 0.1392, + "num_input_tokens_seen": 31563872, + "step": 149565 + }, + { + "epoch": 16.454345434543455, + "grad_norm": 0.04817362129688263, + "learning_rate": 4.636385909498761e-06, + "loss": 0.002, + "num_input_tokens_seen": 31564960, + "step": 149570 + }, + { + "epoch": 16.454895489548957, + "grad_norm": 0.05213295668363571, + "learning_rate": 4.634993721686262e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31565952, + "step": 149575 + }, + { + "epoch": 16.455445544554454, + "grad_norm": 0.04550613462924957, + "learning_rate": 4.633601721568254e-06, + "loss": 0.0402, + "num_input_tokens_seen": 31566944, + "step": 149580 + }, + { + "epoch": 16.455995599559955, + "grad_norm": 0.16321569681167603, + "learning_rate": 4.632209909157564e-06, + "loss": 0.0068, + "num_input_tokens_seen": 31568000, + "step": 149585 + }, + { + "epoch": 16.456545654565456, + "grad_norm": 0.1742181032896042, + "learning_rate": 4.630818284467028e-06, + "loss": 0.0073, + "num_input_tokens_seen": 31569088, + "step": 149590 + }, + { + "epoch": 16.457095709570957, + "grad_norm": 0.7852206230163574, + "learning_rate": 4.629426847509469e-06, + "loss": 0.0098, + "num_input_tokens_seen": 31570144, + "step": 149595 + }, + { + "epoch": 16.45764576457646, + "grad_norm": 0.14377155900001526, + "learning_rate": 4.6280355982977e-06, + "loss": 0.0591, + "num_input_tokens_seen": 31571264, + "step": 149600 + }, + { + "epoch": 16.45819581958196, + "grad_norm": 0.05256924778223038, + "learning_rate": 4.626644536844552e-06, + "loss": 0.0232, + "num_input_tokens_seen": 31572288, + "step": 149605 + }, + { + "epoch": 16.458745874587457, + "grad_norm": 0.4324255883693695, + "learning_rate": 4.625253663162851e-06, + "loss": 0.0105, + "num_input_tokens_seen": 31573344, + "step": 149610 + }, + { + "epoch": 16.459295929592958, + "grad_norm": 0.06928640604019165, + "learning_rate": 4.623862977265403e-06, + "loss": 0.0035, + "num_input_tokens_seen": 31574464, + "step": 149615 + }, + { + "epoch": 16.45984598459846, + "grad_norm": 0.05151529610157013, + "learning_rate": 4.6224724791650385e-06, + "loss": 0.0032, + "num_input_tokens_seen": 31575488, + "step": 149620 + }, + { + "epoch": 16.46039603960396, + "grad_norm": 0.022919785231351852, + "learning_rate": 4.6210821688745634e-06, + "loss": 0.011, + "num_input_tokens_seen": 31576544, + "step": 149625 + }, + { + "epoch": 16.46094609460946, + "grad_norm": 0.19393104314804077, + "learning_rate": 4.619692046406798e-06, + "loss": 0.0042, + "num_input_tokens_seen": 31577568, + "step": 149630 + }, + { + "epoch": 16.461496149614963, + "grad_norm": 3.2262489795684814, + "learning_rate": 4.61830211177455e-06, + "loss": 0.0532, + "num_input_tokens_seen": 31578656, + "step": 149635 + }, + { + "epoch": 16.462046204620464, + "grad_norm": 0.30487966537475586, + "learning_rate": 4.616912364990627e-06, + "loss": 0.005, + "num_input_tokens_seen": 31579712, + "step": 149640 + }, + { + "epoch": 16.46259625962596, + "grad_norm": 0.049603357911109924, + "learning_rate": 4.615522806067852e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31580736, + "step": 149645 + }, + { + "epoch": 16.463146314631462, + "grad_norm": 0.018046170473098755, + "learning_rate": 4.614133435019011e-06, + "loss": 0.0108, + "num_input_tokens_seen": 31581856, + "step": 149650 + }, + { + "epoch": 16.463696369636963, + "grad_norm": 2.1245434284210205, + "learning_rate": 4.612744251856929e-06, + "loss": 0.0641, + "num_input_tokens_seen": 31582880, + "step": 149655 + }, + { + "epoch": 16.464246424642464, + "grad_norm": 0.023837393149733543, + "learning_rate": 4.6113552565943934e-06, + "loss": 0.0689, + "num_input_tokens_seen": 31584000, + "step": 149660 + }, + { + "epoch": 16.464796479647966, + "grad_norm": 2.446887969970703, + "learning_rate": 4.609966449244215e-06, + "loss": 0.1579, + "num_input_tokens_seen": 31585024, + "step": 149665 + }, + { + "epoch": 16.465346534653467, + "grad_norm": 4.07136869430542, + "learning_rate": 4.608577829819197e-06, + "loss": 0.088, + "num_input_tokens_seen": 31586080, + "step": 149670 + }, + { + "epoch": 16.465896589658964, + "grad_norm": 0.1133606880903244, + "learning_rate": 4.607189398332126e-06, + "loss": 0.1742, + "num_input_tokens_seen": 31587136, + "step": 149675 + }, + { + "epoch": 16.466446644664465, + "grad_norm": 0.20181089639663696, + "learning_rate": 4.605801154795811e-06, + "loss": 0.0151, + "num_input_tokens_seen": 31588160, + "step": 149680 + }, + { + "epoch": 16.466996699669966, + "grad_norm": 1.7867419719696045, + "learning_rate": 4.60441309922304e-06, + "loss": 0.0728, + "num_input_tokens_seen": 31589152, + "step": 149685 + }, + { + "epoch": 16.467546754675467, + "grad_norm": 0.011037405580282211, + "learning_rate": 4.603025231626601e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31590240, + "step": 149690 + }, + { + "epoch": 16.46809680968097, + "grad_norm": 0.04982245713472366, + "learning_rate": 4.601637552019292e-06, + "loss": 0.0057, + "num_input_tokens_seen": 31591360, + "step": 149695 + }, + { + "epoch": 16.46864686468647, + "grad_norm": 0.005389049649238586, + "learning_rate": 4.6002500604139e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31592384, + "step": 149700 + }, + { + "epoch": 16.46919691969197, + "grad_norm": 0.006857557687908411, + "learning_rate": 4.598862756823222e-06, + "loss": 0.0032, + "num_input_tokens_seen": 31593408, + "step": 149705 + }, + { + "epoch": 16.46974697469747, + "grad_norm": 0.11967603862285614, + "learning_rate": 4.597475641260035e-06, + "loss": 0.0052, + "num_input_tokens_seen": 31594496, + "step": 149710 + }, + { + "epoch": 16.47029702970297, + "grad_norm": 0.14357295632362366, + "learning_rate": 4.596088713737118e-06, + "loss": 0.0043, + "num_input_tokens_seen": 31595584, + "step": 149715 + }, + { + "epoch": 16.47084708470847, + "grad_norm": 0.025851251557469368, + "learning_rate": 4.594701974267263e-06, + "loss": 0.0029, + "num_input_tokens_seen": 31596736, + "step": 149720 + }, + { + "epoch": 16.47139713971397, + "grad_norm": 0.16437216103076935, + "learning_rate": 4.593315422863248e-06, + "loss": 0.0837, + "num_input_tokens_seen": 31597824, + "step": 149725 + }, + { + "epoch": 16.471947194719473, + "grad_norm": 0.03058924898505211, + "learning_rate": 4.591929059537858e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31598912, + "step": 149730 + }, + { + "epoch": 16.472497249724974, + "grad_norm": 0.013472882099449635, + "learning_rate": 4.590542884303867e-06, + "loss": 0.0068, + "num_input_tokens_seen": 31600000, + "step": 149735 + }, + { + "epoch": 16.47304730473047, + "grad_norm": 1.713169813156128, + "learning_rate": 4.5891568971740406e-06, + "loss": 0.2403, + "num_input_tokens_seen": 31601120, + "step": 149740 + }, + { + "epoch": 16.473597359735972, + "grad_norm": 1.9564852714538574, + "learning_rate": 4.587771098161167e-06, + "loss": 0.1589, + "num_input_tokens_seen": 31602144, + "step": 149745 + }, + { + "epoch": 16.474147414741473, + "grad_norm": 0.01253124512732029, + "learning_rate": 4.586385487278008e-06, + "loss": 0.0008, + "num_input_tokens_seen": 31603200, + "step": 149750 + }, + { + "epoch": 16.474697469746975, + "grad_norm": 0.006701457314193249, + "learning_rate": 4.585000064537337e-06, + "loss": 0.001, + "num_input_tokens_seen": 31604288, + "step": 149755 + }, + { + "epoch": 16.475247524752476, + "grad_norm": 0.07813365757465363, + "learning_rate": 4.583614829951932e-06, + "loss": 0.002, + "num_input_tokens_seen": 31605280, + "step": 149760 + }, + { + "epoch": 16.475797579757977, + "grad_norm": 0.09449481219053268, + "learning_rate": 4.582229783534545e-06, + "loss": 0.0371, + "num_input_tokens_seen": 31606304, + "step": 149765 + }, + { + "epoch": 16.476347634763478, + "grad_norm": 0.021001538261771202, + "learning_rate": 4.580844925297955e-06, + "loss": 0.0293, + "num_input_tokens_seen": 31607328, + "step": 149770 + }, + { + "epoch": 16.476897689768975, + "grad_norm": 1.834216594696045, + "learning_rate": 4.579460255254914e-06, + "loss": 0.0229, + "num_input_tokens_seen": 31608384, + "step": 149775 + }, + { + "epoch": 16.477447744774476, + "grad_norm": 0.014535686932504177, + "learning_rate": 4.578075773418189e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31609408, + "step": 149780 + }, + { + "epoch": 16.477997799779978, + "grad_norm": 0.04392994940280914, + "learning_rate": 4.576691479800546e-06, + "loss": 0.0173, + "num_input_tokens_seen": 31610496, + "step": 149785 + }, + { + "epoch": 16.47854785478548, + "grad_norm": 0.006857273634523153, + "learning_rate": 4.575307374414731e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31611552, + "step": 149790 + }, + { + "epoch": 16.47909790979098, + "grad_norm": 0.019918611273169518, + "learning_rate": 4.573923457273513e-06, + "loss": 0.058, + "num_input_tokens_seen": 31612608, + "step": 149795 + }, + { + "epoch": 16.47964796479648, + "grad_norm": 0.009935392066836357, + "learning_rate": 4.572539728389641e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31613696, + "step": 149800 + }, + { + "epoch": 16.480198019801982, + "grad_norm": 0.048446279019117355, + "learning_rate": 4.571156187775863e-06, + "loss": 0.002, + "num_input_tokens_seen": 31614784, + "step": 149805 + }, + { + "epoch": 16.48074807480748, + "grad_norm": 0.5236630439758301, + "learning_rate": 4.569772835444935e-06, + "loss": 0.0071, + "num_input_tokens_seen": 31615872, + "step": 149810 + }, + { + "epoch": 16.48129812981298, + "grad_norm": 0.018528243526816368, + "learning_rate": 4.5683896714096095e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31616896, + "step": 149815 + }, + { + "epoch": 16.48184818481848, + "grad_norm": 1.402235746383667, + "learning_rate": 4.567006695682635e-06, + "loss": 0.0104, + "num_input_tokens_seen": 31618016, + "step": 149820 + }, + { + "epoch": 16.482398239823983, + "grad_norm": 0.0581539161503315, + "learning_rate": 4.5656239082767586e-06, + "loss": 0.0466, + "num_input_tokens_seen": 31619072, + "step": 149825 + }, + { + "epoch": 16.482948294829484, + "grad_norm": 2.5778748989105225, + "learning_rate": 4.564241309204714e-06, + "loss": 0.0212, + "num_input_tokens_seen": 31620192, + "step": 149830 + }, + { + "epoch": 16.483498349834985, + "grad_norm": 0.6763216257095337, + "learning_rate": 4.56285889847925e-06, + "loss": 0.0105, + "num_input_tokens_seen": 31621184, + "step": 149835 + }, + { + "epoch": 16.484048404840483, + "grad_norm": 0.004372330382466316, + "learning_rate": 4.561476676113108e-06, + "loss": 0.0899, + "num_input_tokens_seen": 31622272, + "step": 149840 + }, + { + "epoch": 16.484598459845984, + "grad_norm": 3.907301664352417, + "learning_rate": 4.5600946421190365e-06, + "loss": 0.1011, + "num_input_tokens_seen": 31623328, + "step": 149845 + }, + { + "epoch": 16.485148514851485, + "grad_norm": 1.6506526470184326, + "learning_rate": 4.558712796509765e-06, + "loss": 0.0428, + "num_input_tokens_seen": 31624448, + "step": 149850 + }, + { + "epoch": 16.485698569856986, + "grad_norm": 3.8099043369293213, + "learning_rate": 4.557331139298021e-06, + "loss": 0.1408, + "num_input_tokens_seen": 31625472, + "step": 149855 + }, + { + "epoch": 16.486248624862487, + "grad_norm": 1.1282107830047607, + "learning_rate": 4.555949670496553e-06, + "loss": 0.023, + "num_input_tokens_seen": 31626464, + "step": 149860 + }, + { + "epoch": 16.486798679867988, + "grad_norm": 0.04340296611189842, + "learning_rate": 4.554568390118083e-06, + "loss": 0.0554, + "num_input_tokens_seen": 31627552, + "step": 149865 + }, + { + "epoch": 16.48734873487349, + "grad_norm": 0.01563780941069126, + "learning_rate": 4.553187298175343e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31628608, + "step": 149870 + }, + { + "epoch": 16.487898789878987, + "grad_norm": 0.0021765760611742735, + "learning_rate": 4.5518063946810715e-06, + "loss": 0.0255, + "num_input_tokens_seen": 31629632, + "step": 149875 + }, + { + "epoch": 16.488448844884488, + "grad_norm": 2.177332878112793, + "learning_rate": 4.550425679647982e-06, + "loss": 0.067, + "num_input_tokens_seen": 31630720, + "step": 149880 + }, + { + "epoch": 16.48899889988999, + "grad_norm": 0.021028604358434677, + "learning_rate": 4.549045153088813e-06, + "loss": 0.0003, + "num_input_tokens_seen": 31631776, + "step": 149885 + }, + { + "epoch": 16.48954895489549, + "grad_norm": 0.03963105008006096, + "learning_rate": 4.547664815016275e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31632768, + "step": 149890 + }, + { + "epoch": 16.49009900990099, + "grad_norm": 0.027304302901029587, + "learning_rate": 4.546284665443095e-06, + "loss": 0.0132, + "num_input_tokens_seen": 31633920, + "step": 149895 + }, + { + "epoch": 16.490649064906492, + "grad_norm": 0.016077397391200066, + "learning_rate": 4.544904704382003e-06, + "loss": 0.0024, + "num_input_tokens_seen": 31634976, + "step": 149900 + }, + { + "epoch": 16.49119911991199, + "grad_norm": 0.02555696852505207, + "learning_rate": 4.543524931845703e-06, + "loss": 0.0826, + "num_input_tokens_seen": 31636032, + "step": 149905 + }, + { + "epoch": 16.49174917491749, + "grad_norm": 0.24237291514873505, + "learning_rate": 4.5421453478469215e-06, + "loss": 0.0327, + "num_input_tokens_seen": 31637152, + "step": 149910 + }, + { + "epoch": 16.492299229922992, + "grad_norm": 0.007435754407197237, + "learning_rate": 4.5407659523983655e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31638176, + "step": 149915 + }, + { + "epoch": 16.492849284928493, + "grad_norm": 0.26019421219825745, + "learning_rate": 4.539386745512758e-06, + "loss": 0.003, + "num_input_tokens_seen": 31639264, + "step": 149920 + }, + { + "epoch": 16.493399339933994, + "grad_norm": 0.10354950278997421, + "learning_rate": 4.538007727202798e-06, + "loss": 0.0024, + "num_input_tokens_seen": 31640352, + "step": 149925 + }, + { + "epoch": 16.493949394939495, + "grad_norm": 0.015501171350479126, + "learning_rate": 4.536628897481202e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31641440, + "step": 149930 + }, + { + "epoch": 16.494499449944996, + "grad_norm": 3.7462661266326904, + "learning_rate": 4.535250256360685e-06, + "loss": 0.0267, + "num_input_tokens_seen": 31642528, + "step": 149935 + }, + { + "epoch": 16.495049504950494, + "grad_norm": 3.6014437675476074, + "learning_rate": 4.533871803853942e-06, + "loss": 0.0433, + "num_input_tokens_seen": 31643552, + "step": 149940 + }, + { + "epoch": 16.495599559955995, + "grad_norm": 0.1145075261592865, + "learning_rate": 4.532493539973687e-06, + "loss": 0.0093, + "num_input_tokens_seen": 31644608, + "step": 149945 + }, + { + "epoch": 16.496149614961496, + "grad_norm": 0.019521770998835564, + "learning_rate": 4.5311154647326105e-06, + "loss": 0.0435, + "num_input_tokens_seen": 31645568, + "step": 149950 + }, + { + "epoch": 16.496699669966997, + "grad_norm": 0.12624269723892212, + "learning_rate": 4.52973757814342e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31646624, + "step": 149955 + }, + { + "epoch": 16.497249724972498, + "grad_norm": 0.051974404603242874, + "learning_rate": 4.528359880218824e-06, + "loss": 0.002, + "num_input_tokens_seen": 31647616, + "step": 149960 + }, + { + "epoch": 16.497799779978, + "grad_norm": 2.0919244289398193, + "learning_rate": 4.5269823709715055e-06, + "loss": 0.1038, + "num_input_tokens_seen": 31648672, + "step": 149965 + }, + { + "epoch": 16.498349834983497, + "grad_norm": 0.008013119921088219, + "learning_rate": 4.525605050414173e-06, + "loss": 0.0094, + "num_input_tokens_seen": 31649728, + "step": 149970 + }, + { + "epoch": 16.498899889988998, + "grad_norm": 0.022912118583917618, + "learning_rate": 4.524227918559512e-06, + "loss": 0.002, + "num_input_tokens_seen": 31650784, + "step": 149975 + }, + { + "epoch": 16.4994499449945, + "grad_norm": 0.013896957039833069, + "learning_rate": 4.522850975420212e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31651840, + "step": 149980 + }, + { + "epoch": 16.5, + "grad_norm": 0.007405150216072798, + "learning_rate": 4.52147422100897e-06, + "loss": 0.0223, + "num_input_tokens_seen": 31652960, + "step": 149985 + }, + { + "epoch": 16.5005500550055, + "grad_norm": 0.07628028094768524, + "learning_rate": 4.520097655338479e-06, + "loss": 0.0059, + "num_input_tokens_seen": 31654048, + "step": 149990 + }, + { + "epoch": 16.501100110011002, + "grad_norm": 0.023296743631362915, + "learning_rate": 4.518721278421414e-06, + "loss": 0.0138, + "num_input_tokens_seen": 31655104, + "step": 149995 + }, + { + "epoch": 16.501650165016503, + "grad_norm": 0.02330251969397068, + "learning_rate": 4.517345090270473e-06, + "loss": 0.0134, + "num_input_tokens_seen": 31656224, + "step": 150000 + }, + { + "epoch": 16.502200220022, + "grad_norm": 0.031229715794324875, + "learning_rate": 4.515969090898328e-06, + "loss": 0.0018, + "num_input_tokens_seen": 31657216, + "step": 150005 + }, + { + "epoch": 16.502750275027502, + "grad_norm": 0.02766815759241581, + "learning_rate": 4.514593280317666e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31658336, + "step": 150010 + }, + { + "epoch": 16.503300330033003, + "grad_norm": 0.008935915306210518, + "learning_rate": 4.513217658541174e-06, + "loss": 0.0049, + "num_input_tokens_seen": 31659392, + "step": 150015 + }, + { + "epoch": 16.503850385038504, + "grad_norm": 0.18937571346759796, + "learning_rate": 4.511842225581519e-06, + "loss": 0.0071, + "num_input_tokens_seen": 31660448, + "step": 150020 + }, + { + "epoch": 16.504400440044005, + "grad_norm": 0.012849603779613972, + "learning_rate": 4.510466981451391e-06, + "loss": 0.021, + "num_input_tokens_seen": 31661504, + "step": 150025 + }, + { + "epoch": 16.504950495049506, + "grad_norm": 0.7286546230316162, + "learning_rate": 4.509091926163447e-06, + "loss": 0.0266, + "num_input_tokens_seen": 31662624, + "step": 150030 + }, + { + "epoch": 16.505500550055004, + "grad_norm": 1.6465048789978027, + "learning_rate": 4.507717059730379e-06, + "loss": 0.0498, + "num_input_tokens_seen": 31663680, + "step": 150035 + }, + { + "epoch": 16.506050605060505, + "grad_norm": 3.0112929344177246, + "learning_rate": 4.506342382164844e-06, + "loss": 0.1191, + "num_input_tokens_seen": 31664672, + "step": 150040 + }, + { + "epoch": 16.506600660066006, + "grad_norm": 0.01586567983031273, + "learning_rate": 4.504967893479517e-06, + "loss": 0.067, + "num_input_tokens_seen": 31665728, + "step": 150045 + }, + { + "epoch": 16.507150715071507, + "grad_norm": 0.12275783717632294, + "learning_rate": 4.503593593687072e-06, + "loss": 0.0087, + "num_input_tokens_seen": 31666816, + "step": 150050 + }, + { + "epoch": 16.507700770077008, + "grad_norm": 0.1373777687549591, + "learning_rate": 4.502219482800163e-06, + "loss": 0.0701, + "num_input_tokens_seen": 31667904, + "step": 150055 + }, + { + "epoch": 16.50825082508251, + "grad_norm": 0.023722022771835327, + "learning_rate": 4.500845560831468e-06, + "loss": 0.0338, + "num_input_tokens_seen": 31668992, + "step": 150060 + }, + { + "epoch": 16.50880088008801, + "grad_norm": 0.08291497826576233, + "learning_rate": 4.499471827793641e-06, + "loss": 0.0026, + "num_input_tokens_seen": 31670080, + "step": 150065 + }, + { + "epoch": 16.509350935093508, + "grad_norm": 0.049532026052474976, + "learning_rate": 4.498098283699337e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31671168, + "step": 150070 + }, + { + "epoch": 16.50990099009901, + "grad_norm": 0.004848419222980738, + "learning_rate": 4.4967249285612325e-06, + "loss": 0.0074, + "num_input_tokens_seen": 31672160, + "step": 150075 + }, + { + "epoch": 16.51045104510451, + "grad_norm": 0.032058291137218475, + "learning_rate": 4.495351762391969e-06, + "loss": 0.0058, + "num_input_tokens_seen": 31673184, + "step": 150080 + }, + { + "epoch": 16.51100110011001, + "grad_norm": 3.151827812194824, + "learning_rate": 4.49397878520422e-06, + "loss": 0.0199, + "num_input_tokens_seen": 31674304, + "step": 150085 + }, + { + "epoch": 16.511551155115512, + "grad_norm": 1.8815464973449707, + "learning_rate": 4.492605997010624e-06, + "loss": 0.0316, + "num_input_tokens_seen": 31675328, + "step": 150090 + }, + { + "epoch": 16.512101210121013, + "grad_norm": 0.0711677074432373, + "learning_rate": 4.4912333978238355e-06, + "loss": 0.0018, + "num_input_tokens_seen": 31676352, + "step": 150095 + }, + { + "epoch": 16.51265126512651, + "grad_norm": 0.01958685740828514, + "learning_rate": 4.489860987656505e-06, + "loss": 0.0077, + "num_input_tokens_seen": 31677408, + "step": 150100 + }, + { + "epoch": 16.513201320132012, + "grad_norm": 0.35523682832717896, + "learning_rate": 4.488488766521284e-06, + "loss": 0.0077, + "num_input_tokens_seen": 31678432, + "step": 150105 + }, + { + "epoch": 16.513751375137513, + "grad_norm": 0.05275460705161095, + "learning_rate": 4.487116734430827e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31679392, + "step": 150110 + }, + { + "epoch": 16.514301430143014, + "grad_norm": 0.0353914350271225, + "learning_rate": 4.485744891397772e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31680512, + "step": 150115 + }, + { + "epoch": 16.514851485148515, + "grad_norm": 0.13917317986488342, + "learning_rate": 4.484373237434755e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31681504, + "step": 150120 + }, + { + "epoch": 16.515401540154016, + "grad_norm": 0.06191719323396683, + "learning_rate": 4.483001772554435e-06, + "loss": 0.0326, + "num_input_tokens_seen": 31682624, + "step": 150125 + }, + { + "epoch": 16.515951595159517, + "grad_norm": 0.19280017912387848, + "learning_rate": 4.481630496769429e-06, + "loss": 0.0039, + "num_input_tokens_seen": 31683680, + "step": 150130 + }, + { + "epoch": 16.516501650165015, + "grad_norm": 0.008319075219333172, + "learning_rate": 4.4802594100923995e-06, + "loss": 0.008, + "num_input_tokens_seen": 31684672, + "step": 150135 + }, + { + "epoch": 16.517051705170516, + "grad_norm": 0.12872330844402313, + "learning_rate": 4.478888512535972e-06, + "loss": 0.0585, + "num_input_tokens_seen": 31685696, + "step": 150140 + }, + { + "epoch": 16.517601760176017, + "grad_norm": 0.19184595346450806, + "learning_rate": 4.47751780411278e-06, + "loss": 0.0032, + "num_input_tokens_seen": 31686848, + "step": 150145 + }, + { + "epoch": 16.51815181518152, + "grad_norm": 0.03195275366306305, + "learning_rate": 4.4761472848354635e-06, + "loss": 0.0006, + "num_input_tokens_seen": 31687904, + "step": 150150 + }, + { + "epoch": 16.51870187018702, + "grad_norm": 1.6274969577789307, + "learning_rate": 4.4747769547166415e-06, + "loss": 0.1628, + "num_input_tokens_seen": 31688960, + "step": 150155 + }, + { + "epoch": 16.51925192519252, + "grad_norm": 0.9514830708503723, + "learning_rate": 4.473406813768952e-06, + "loss": 0.1411, + "num_input_tokens_seen": 31690016, + "step": 150160 + }, + { + "epoch": 16.519801980198018, + "grad_norm": 0.024031134322285652, + "learning_rate": 4.472036862005027e-06, + "loss": 0.0061, + "num_input_tokens_seen": 31691136, + "step": 150165 + }, + { + "epoch": 16.52035203520352, + "grad_norm": 0.011465867049992085, + "learning_rate": 4.470667099437484e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31692160, + "step": 150170 + }, + { + "epoch": 16.52090209020902, + "grad_norm": 1.118772029876709, + "learning_rate": 4.469297526078958e-06, + "loss": 0.1035, + "num_input_tokens_seen": 31693216, + "step": 150175 + }, + { + "epoch": 16.52145214521452, + "grad_norm": 0.12323538213968277, + "learning_rate": 4.467928141942063e-06, + "loss": 0.0195, + "num_input_tokens_seen": 31694240, + "step": 150180 + }, + { + "epoch": 16.522002200220022, + "grad_norm": 0.5492427349090576, + "learning_rate": 4.4665589470394175e-06, + "loss": 0.0057, + "num_input_tokens_seen": 31695232, + "step": 150185 + }, + { + "epoch": 16.522552255225524, + "grad_norm": 0.00131552095990628, + "learning_rate": 4.465189941383643e-06, + "loss": 0.0044, + "num_input_tokens_seen": 31696288, + "step": 150190 + }, + { + "epoch": 16.523102310231025, + "grad_norm": 0.011218772269785404, + "learning_rate": 4.463821124987364e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31697344, + "step": 150195 + }, + { + "epoch": 16.523652365236522, + "grad_norm": 0.15020853281021118, + "learning_rate": 4.462452497863193e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31698368, + "step": 150200 + }, + { + "epoch": 16.524202420242023, + "grad_norm": 0.007960904389619827, + "learning_rate": 4.461084060023743e-06, + "loss": 0.017, + "num_input_tokens_seen": 31699392, + "step": 150205 + }, + { + "epoch": 16.524752475247524, + "grad_norm": 0.4058455228805542, + "learning_rate": 4.459715811481621e-06, + "loss": 0.0674, + "num_input_tokens_seen": 31700480, + "step": 150210 + }, + { + "epoch": 16.525302530253025, + "grad_norm": 0.05259174853563309, + "learning_rate": 4.4583477522494425e-06, + "loss": 0.065, + "num_input_tokens_seen": 31701568, + "step": 150215 + }, + { + "epoch": 16.525852585258527, + "grad_norm": 2.029951333999634, + "learning_rate": 4.456979882339815e-06, + "loss": 0.0891, + "num_input_tokens_seen": 31702592, + "step": 150220 + }, + { + "epoch": 16.526402640264028, + "grad_norm": 0.03883959352970123, + "learning_rate": 4.4556122017653526e-06, + "loss": 0.0189, + "num_input_tokens_seen": 31703616, + "step": 150225 + }, + { + "epoch": 16.52695269526953, + "grad_norm": 0.06215786933898926, + "learning_rate": 4.454244710538655e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31704672, + "step": 150230 + }, + { + "epoch": 16.527502750275026, + "grad_norm": 0.0076084379106760025, + "learning_rate": 4.4528774086723175e-06, + "loss": 0.002, + "num_input_tokens_seen": 31705728, + "step": 150235 + }, + { + "epoch": 16.528052805280527, + "grad_norm": 1.4405949115753174, + "learning_rate": 4.451510296178957e-06, + "loss": 0.0146, + "num_input_tokens_seen": 31706816, + "step": 150240 + }, + { + "epoch": 16.52860286028603, + "grad_norm": 0.07244279235601425, + "learning_rate": 4.450143373071158e-06, + "loss": 0.0732, + "num_input_tokens_seen": 31707936, + "step": 150245 + }, + { + "epoch": 16.52915291529153, + "grad_norm": 0.8024709820747375, + "learning_rate": 4.448776639361529e-06, + "loss": 0.0643, + "num_input_tokens_seen": 31708960, + "step": 150250 + }, + { + "epoch": 16.52970297029703, + "grad_norm": 0.14737273752689362, + "learning_rate": 4.447410095062671e-06, + "loss": 0.062, + "num_input_tokens_seen": 31710016, + "step": 150255 + }, + { + "epoch": 16.53025302530253, + "grad_norm": 0.042486611753702164, + "learning_rate": 4.446043740187164e-06, + "loss": 0.0126, + "num_input_tokens_seen": 31711072, + "step": 150260 + }, + { + "epoch": 16.53080308030803, + "grad_norm": 0.3337053060531616, + "learning_rate": 4.444677574747616e-06, + "loss": 0.0055, + "num_input_tokens_seen": 31712128, + "step": 150265 + }, + { + "epoch": 16.53135313531353, + "grad_norm": 0.01803005300462246, + "learning_rate": 4.443311598756606e-06, + "loss": 0.0461, + "num_input_tokens_seen": 31713184, + "step": 150270 + }, + { + "epoch": 16.53190319031903, + "grad_norm": 0.03220679610967636, + "learning_rate": 4.441945812226725e-06, + "loss": 0.0008, + "num_input_tokens_seen": 31714272, + "step": 150275 + }, + { + "epoch": 16.532453245324533, + "grad_norm": 0.07053583860397339, + "learning_rate": 4.440580215170576e-06, + "loss": 0.1105, + "num_input_tokens_seen": 31715328, + "step": 150280 + }, + { + "epoch": 16.533003300330034, + "grad_norm": 0.01612815074622631, + "learning_rate": 4.439214807600725e-06, + "loss": 0.0342, + "num_input_tokens_seen": 31716384, + "step": 150285 + }, + { + "epoch": 16.533553355335535, + "grad_norm": 0.4164925515651703, + "learning_rate": 4.437849589529772e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31717504, + "step": 150290 + }, + { + "epoch": 16.534103410341036, + "grad_norm": 1.6230485439300537, + "learning_rate": 4.436484560970286e-06, + "loss": 0.1955, + "num_input_tokens_seen": 31718560, + "step": 150295 + }, + { + "epoch": 16.534653465346533, + "grad_norm": 0.033539608120918274, + "learning_rate": 4.435119721934861e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31719648, + "step": 150300 + }, + { + "epoch": 16.535203520352034, + "grad_norm": 1.9937701225280762, + "learning_rate": 4.433755072436063e-06, + "loss": 0.0797, + "num_input_tokens_seen": 31720736, + "step": 150305 + }, + { + "epoch": 16.535753575357536, + "grad_norm": 0.006764047779142857, + "learning_rate": 4.432390612486476e-06, + "loss": 0.0195, + "num_input_tokens_seen": 31721888, + "step": 150310 + }, + { + "epoch": 16.536303630363037, + "grad_norm": 0.012558427639305592, + "learning_rate": 4.431026342098682e-06, + "loss": 0.0167, + "num_input_tokens_seen": 31722944, + "step": 150315 + }, + { + "epoch": 16.536853685368538, + "grad_norm": 0.0609973706305027, + "learning_rate": 4.429662261285242e-06, + "loss": 0.0493, + "num_input_tokens_seen": 31724000, + "step": 150320 + }, + { + "epoch": 16.53740374037404, + "grad_norm": 0.09892545640468597, + "learning_rate": 4.4282983700587394e-06, + "loss": 0.0029, + "num_input_tokens_seen": 31724992, + "step": 150325 + }, + { + "epoch": 16.537953795379536, + "grad_norm": 0.026314236223697662, + "learning_rate": 4.426934668431734e-06, + "loss": 0.0053, + "num_input_tokens_seen": 31726016, + "step": 150330 + }, + { + "epoch": 16.538503850385037, + "grad_norm": 0.06933710724115372, + "learning_rate": 4.4255711564168e-06, + "loss": 0.0033, + "num_input_tokens_seen": 31727040, + "step": 150335 + }, + { + "epoch": 16.53905390539054, + "grad_norm": 0.2253081500530243, + "learning_rate": 4.424207834026509e-06, + "loss": 0.0166, + "num_input_tokens_seen": 31728128, + "step": 150340 + }, + { + "epoch": 16.53960396039604, + "grad_norm": 0.3624267578125, + "learning_rate": 4.422844701273415e-06, + "loss": 0.0617, + "num_input_tokens_seen": 31729152, + "step": 150345 + }, + { + "epoch": 16.54015401540154, + "grad_norm": 0.026064475998282433, + "learning_rate": 4.421481758170095e-06, + "loss": 0.0101, + "num_input_tokens_seen": 31730176, + "step": 150350 + }, + { + "epoch": 16.540704070407042, + "grad_norm": 0.02439279668033123, + "learning_rate": 4.420119004729103e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31731264, + "step": 150355 + }, + { + "epoch": 16.541254125412543, + "grad_norm": 0.03288908675312996, + "learning_rate": 4.4187564409629955e-06, + "loss": 0.0059, + "num_input_tokens_seen": 31732416, + "step": 150360 + }, + { + "epoch": 16.54180418041804, + "grad_norm": 0.25701144337654114, + "learning_rate": 4.417394066884331e-06, + "loss": 0.0054, + "num_input_tokens_seen": 31733440, + "step": 150365 + }, + { + "epoch": 16.54235423542354, + "grad_norm": 0.07631631195545197, + "learning_rate": 4.416031882505675e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31734560, + "step": 150370 + }, + { + "epoch": 16.542904290429043, + "grad_norm": 0.07305797934532166, + "learning_rate": 4.4146698878395706e-06, + "loss": 0.035, + "num_input_tokens_seen": 31735584, + "step": 150375 + }, + { + "epoch": 16.543454345434544, + "grad_norm": 0.035553816705942154, + "learning_rate": 4.413308082898582e-06, + "loss": 0.0565, + "num_input_tokens_seen": 31736640, + "step": 150380 + }, + { + "epoch": 16.544004400440045, + "grad_norm": 0.1384631246328354, + "learning_rate": 4.4119464676952465e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31737696, + "step": 150385 + }, + { + "epoch": 16.544554455445546, + "grad_norm": 0.024945922195911407, + "learning_rate": 4.410585042242124e-06, + "loss": 0.0157, + "num_input_tokens_seen": 31738720, + "step": 150390 + }, + { + "epoch": 16.545104510451043, + "grad_norm": 2.964319944381714, + "learning_rate": 4.409223806551763e-06, + "loss": 0.0855, + "num_input_tokens_seen": 31739776, + "step": 150395 + }, + { + "epoch": 16.545654565456545, + "grad_norm": 0.021396061405539513, + "learning_rate": 4.407862760636702e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31740864, + "step": 150400 + }, + { + "epoch": 16.546204620462046, + "grad_norm": 0.027813637629151344, + "learning_rate": 4.406501904509494e-06, + "loss": 0.0042, + "num_input_tokens_seen": 31741856, + "step": 150405 + }, + { + "epoch": 16.546754675467547, + "grad_norm": 1.811870813369751, + "learning_rate": 4.40514123818267e-06, + "loss": 0.0642, + "num_input_tokens_seen": 31742944, + "step": 150410 + }, + { + "epoch": 16.547304730473048, + "grad_norm": 0.18743082880973816, + "learning_rate": 4.4037807616687826e-06, + "loss": 0.0058, + "num_input_tokens_seen": 31744000, + "step": 150415 + }, + { + "epoch": 16.54785478547855, + "grad_norm": 0.007543859072029591, + "learning_rate": 4.402420474980359e-06, + "loss": 0.074, + "num_input_tokens_seen": 31745088, + "step": 150420 + }, + { + "epoch": 16.54840484048405, + "grad_norm": 0.067265085875988, + "learning_rate": 4.401060378129943e-06, + "loss": 0.0093, + "num_input_tokens_seen": 31746144, + "step": 150425 + }, + { + "epoch": 16.548954895489548, + "grad_norm": 0.0152365667745471, + "learning_rate": 4.399700471130075e-06, + "loss": 0.0694, + "num_input_tokens_seen": 31747200, + "step": 150430 + }, + { + "epoch": 16.54950495049505, + "grad_norm": 0.07581877708435059, + "learning_rate": 4.398340753993274e-06, + "loss": 0.0056, + "num_input_tokens_seen": 31748288, + "step": 150435 + }, + { + "epoch": 16.55005500550055, + "grad_norm": 0.024715084582567215, + "learning_rate": 4.396981226732089e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31749280, + "step": 150440 + }, + { + "epoch": 16.55060506050605, + "grad_norm": 0.006340306252241135, + "learning_rate": 4.395621889359036e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31750272, + "step": 150445 + }, + { + "epoch": 16.551155115511552, + "grad_norm": 0.09993124008178711, + "learning_rate": 4.3942627418866485e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31751328, + "step": 150450 + }, + { + "epoch": 16.551705170517053, + "grad_norm": 0.05628914386034012, + "learning_rate": 4.392903784327462e-06, + "loss": 0.001, + "num_input_tokens_seen": 31752352, + "step": 150455 + }, + { + "epoch": 16.55225522552255, + "grad_norm": 2.433596134185791, + "learning_rate": 4.391545016693985e-06, + "loss": 0.0925, + "num_input_tokens_seen": 31753344, + "step": 150460 + }, + { + "epoch": 16.55280528052805, + "grad_norm": 0.08432255685329437, + "learning_rate": 4.390186438998753e-06, + "loss": 0.0182, + "num_input_tokens_seen": 31754400, + "step": 150465 + }, + { + "epoch": 16.553355335533553, + "grad_norm": 0.08647476136684418, + "learning_rate": 4.388828051254287e-06, + "loss": 0.104, + "num_input_tokens_seen": 31755424, + "step": 150470 + }, + { + "epoch": 16.553905390539054, + "grad_norm": 0.19820940494537354, + "learning_rate": 4.387469853473095e-06, + "loss": 0.0054, + "num_input_tokens_seen": 31756512, + "step": 150475 + }, + { + "epoch": 16.554455445544555, + "grad_norm": 0.20677794516086578, + "learning_rate": 4.386111845667701e-06, + "loss": 0.1417, + "num_input_tokens_seen": 31757568, + "step": 150480 + }, + { + "epoch": 16.555005500550056, + "grad_norm": 0.05216941237449646, + "learning_rate": 4.384754027850624e-06, + "loss": 0.001, + "num_input_tokens_seen": 31758592, + "step": 150485 + }, + { + "epoch": 16.555555555555557, + "grad_norm": 0.5854387879371643, + "learning_rate": 4.383396400034381e-06, + "loss": 0.0575, + "num_input_tokens_seen": 31759680, + "step": 150490 + }, + { + "epoch": 16.556105610561055, + "grad_norm": 0.06470059603452682, + "learning_rate": 4.3820389622314825e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31760704, + "step": 150495 + }, + { + "epoch": 16.556655665566556, + "grad_norm": 0.06910740584135056, + "learning_rate": 4.380681714454432e-06, + "loss": 0.016, + "num_input_tokens_seen": 31761728, + "step": 150500 + }, + { + "epoch": 16.557205720572057, + "grad_norm": 2.5475099086761475, + "learning_rate": 4.379324656715741e-06, + "loss": 0.0986, + "num_input_tokens_seen": 31762784, + "step": 150505 + }, + { + "epoch": 16.557755775577558, + "grad_norm": 2.3609259128570557, + "learning_rate": 4.377967789027923e-06, + "loss": 0.0367, + "num_input_tokens_seen": 31763808, + "step": 150510 + }, + { + "epoch": 16.55830583058306, + "grad_norm": 0.10966017097234726, + "learning_rate": 4.376611111403487e-06, + "loss": 0.1724, + "num_input_tokens_seen": 31764832, + "step": 150515 + }, + { + "epoch": 16.55885588558856, + "grad_norm": 0.2236928939819336, + "learning_rate": 4.375254623854927e-06, + "loss": 0.0101, + "num_input_tokens_seen": 31765888, + "step": 150520 + }, + { + "epoch": 16.55940594059406, + "grad_norm": 0.020306119695305824, + "learning_rate": 4.373898326394746e-06, + "loss": 0.0054, + "num_input_tokens_seen": 31766944, + "step": 150525 + }, + { + "epoch": 16.55995599559956, + "grad_norm": 1.3547427654266357, + "learning_rate": 4.37254221903545e-06, + "loss": 0.0374, + "num_input_tokens_seen": 31767968, + "step": 150530 + }, + { + "epoch": 16.56050605060506, + "grad_norm": 0.046324748545885086, + "learning_rate": 4.371186301789529e-06, + "loss": 0.009, + "num_input_tokens_seen": 31768992, + "step": 150535 + }, + { + "epoch": 16.56105610561056, + "grad_norm": 0.07379527390003204, + "learning_rate": 4.369830574669484e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31769984, + "step": 150540 + }, + { + "epoch": 16.561606160616062, + "grad_norm": 0.016716336831450462, + "learning_rate": 4.368475037687819e-06, + "loss": 0.0397, + "num_input_tokens_seen": 31771040, + "step": 150545 + }, + { + "epoch": 16.562156215621563, + "grad_norm": 4.247318744659424, + "learning_rate": 4.367119690857014e-06, + "loss": 0.0961, + "num_input_tokens_seen": 31772064, + "step": 150550 + }, + { + "epoch": 16.562706270627064, + "grad_norm": 0.014202019199728966, + "learning_rate": 4.365764534189573e-06, + "loss": 0.006, + "num_input_tokens_seen": 31773056, + "step": 150555 + }, + { + "epoch": 16.563256325632562, + "grad_norm": 0.12223625928163528, + "learning_rate": 4.364409567697972e-06, + "loss": 0.0154, + "num_input_tokens_seen": 31774112, + "step": 150560 + }, + { + "epoch": 16.563806380638063, + "grad_norm": 0.09388945996761322, + "learning_rate": 4.363054791394705e-06, + "loss": 0.0997, + "num_input_tokens_seen": 31775168, + "step": 150565 + }, + { + "epoch": 16.564356435643564, + "grad_norm": 1.9012221097946167, + "learning_rate": 4.361700205292265e-06, + "loss": 0.0093, + "num_input_tokens_seen": 31776256, + "step": 150570 + }, + { + "epoch": 16.564906490649065, + "grad_norm": 5.655153274536133, + "learning_rate": 4.360345809403127e-06, + "loss": 0.1098, + "num_input_tokens_seen": 31777312, + "step": 150575 + }, + { + "epoch": 16.565456545654566, + "grad_norm": 0.35518378019332886, + "learning_rate": 4.358991603739782e-06, + "loss": 0.0079, + "num_input_tokens_seen": 31778464, + "step": 150580 + }, + { + "epoch": 16.566006600660067, + "grad_norm": 0.039579618722200394, + "learning_rate": 4.357637588314708e-06, + "loss": 0.144, + "num_input_tokens_seen": 31779616, + "step": 150585 + }, + { + "epoch": 16.566556655665565, + "grad_norm": 0.1922222524881363, + "learning_rate": 4.356283763140376e-06, + "loss": 0.0039, + "num_input_tokens_seen": 31780640, + "step": 150590 + }, + { + "epoch": 16.567106710671066, + "grad_norm": 1.00386643409729, + "learning_rate": 4.354930128229273e-06, + "loss": 0.1707, + "num_input_tokens_seen": 31781760, + "step": 150595 + }, + { + "epoch": 16.567656765676567, + "grad_norm": 0.07167308777570724, + "learning_rate": 4.353576683593871e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31782848, + "step": 150600 + }, + { + "epoch": 16.568206820682068, + "grad_norm": 0.08458996564149857, + "learning_rate": 4.3522234292466515e-06, + "loss": 0.0354, + "num_input_tokens_seen": 31783872, + "step": 150605 + }, + { + "epoch": 16.56875687568757, + "grad_norm": 0.08856918662786484, + "learning_rate": 4.350870365200083e-06, + "loss": 0.036, + "num_input_tokens_seen": 31784992, + "step": 150610 + }, + { + "epoch": 16.56930693069307, + "grad_norm": 0.1617637574672699, + "learning_rate": 4.349517491466623e-06, + "loss": 0.0709, + "num_input_tokens_seen": 31785984, + "step": 150615 + }, + { + "epoch": 16.56985698569857, + "grad_norm": 0.3914473056793213, + "learning_rate": 4.348164808058755e-06, + "loss": 0.0068, + "num_input_tokens_seen": 31787072, + "step": 150620 + }, + { + "epoch": 16.57040704070407, + "grad_norm": 0.015271153301000595, + "learning_rate": 4.346812314988943e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31788128, + "step": 150625 + }, + { + "epoch": 16.57095709570957, + "grad_norm": 0.3803008794784546, + "learning_rate": 4.345460012269656e-06, + "loss": 0.0635, + "num_input_tokens_seen": 31789248, + "step": 150630 + }, + { + "epoch": 16.57150715071507, + "grad_norm": 0.03668559715151787, + "learning_rate": 4.344107899913352e-06, + "loss": 0.0529, + "num_input_tokens_seen": 31790368, + "step": 150635 + }, + { + "epoch": 16.572057205720572, + "grad_norm": 0.13783176243305206, + "learning_rate": 4.342755977932489e-06, + "loss": 0.0458, + "num_input_tokens_seen": 31791488, + "step": 150640 + }, + { + "epoch": 16.572607260726073, + "grad_norm": 0.14365938305854797, + "learning_rate": 4.341404246339539e-06, + "loss": 0.0063, + "num_input_tokens_seen": 31792512, + "step": 150645 + }, + { + "epoch": 16.573157315731574, + "grad_norm": 0.10681550949811935, + "learning_rate": 4.340052705146947e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31793536, + "step": 150650 + }, + { + "epoch": 16.573707370737075, + "grad_norm": 1.2216652631759644, + "learning_rate": 4.338701354367172e-06, + "loss": 0.009, + "num_input_tokens_seen": 31794496, + "step": 150655 + }, + { + "epoch": 16.574257425742573, + "grad_norm": 0.35715189576148987, + "learning_rate": 4.337350194012682e-06, + "loss": 0.0111, + "num_input_tokens_seen": 31795616, + "step": 150660 + }, + { + "epoch": 16.574807480748074, + "grad_norm": 0.37290796637535095, + "learning_rate": 4.335999224095913e-06, + "loss": 0.0072, + "num_input_tokens_seen": 31796672, + "step": 150665 + }, + { + "epoch": 16.575357535753575, + "grad_norm": 0.05186183378100395, + "learning_rate": 4.33464844462933e-06, + "loss": 0.0069, + "num_input_tokens_seen": 31797792, + "step": 150670 + }, + { + "epoch": 16.575907590759076, + "grad_norm": 0.024163521826267242, + "learning_rate": 4.33329785562537e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31798848, + "step": 150675 + }, + { + "epoch": 16.576457645764577, + "grad_norm": 0.012059476226568222, + "learning_rate": 4.331947457096486e-06, + "loss": 0.0526, + "num_input_tokens_seen": 31799872, + "step": 150680 + }, + { + "epoch": 16.57700770077008, + "grad_norm": 0.04597993940114975, + "learning_rate": 4.330597249055132e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31800992, + "step": 150685 + }, + { + "epoch": 16.577557755775576, + "grad_norm": 0.14114442467689514, + "learning_rate": 4.329247231513739e-06, + "loss": 0.0132, + "num_input_tokens_seen": 31802048, + "step": 150690 + }, + { + "epoch": 16.578107810781077, + "grad_norm": 0.005470140837132931, + "learning_rate": 4.3278974044847595e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31803040, + "step": 150695 + }, + { + "epoch": 16.578657865786578, + "grad_norm": 0.01758645661175251, + "learning_rate": 4.326547767980626e-06, + "loss": 0.0092, + "num_input_tokens_seen": 31804128, + "step": 150700 + }, + { + "epoch": 16.57920792079208, + "grad_norm": 0.060251809656620026, + "learning_rate": 4.325198322013787e-06, + "loss": 0.0012, + "num_input_tokens_seen": 31805152, + "step": 150705 + }, + { + "epoch": 16.57975797579758, + "grad_norm": 0.04705007001757622, + "learning_rate": 4.323849066596669e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31806208, + "step": 150710 + }, + { + "epoch": 16.58030803080308, + "grad_norm": 0.056970708072185516, + "learning_rate": 4.322500001741714e-06, + "loss": 0.004, + "num_input_tokens_seen": 31807264, + "step": 150715 + }, + { + "epoch": 16.580858085808583, + "grad_norm": 0.04938248544931412, + "learning_rate": 4.321151127461359e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31808320, + "step": 150720 + }, + { + "epoch": 16.58140814081408, + "grad_norm": 0.1824020892381668, + "learning_rate": 4.319802443768025e-06, + "loss": 0.005, + "num_input_tokens_seen": 31809408, + "step": 150725 + }, + { + "epoch": 16.58195819581958, + "grad_norm": 0.2078879475593567, + "learning_rate": 4.318453950674153e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31810496, + "step": 150730 + }, + { + "epoch": 16.582508250825082, + "grad_norm": 0.02205735631287098, + "learning_rate": 4.317105648192171e-06, + "loss": 0.0119, + "num_input_tokens_seen": 31811584, + "step": 150735 + }, + { + "epoch": 16.583058305830583, + "grad_norm": 0.015384968370199203, + "learning_rate": 4.315757536334486e-06, + "loss": 0.0285, + "num_input_tokens_seen": 31812704, + "step": 150740 + }, + { + "epoch": 16.583608360836084, + "grad_norm": 1.468880295753479, + "learning_rate": 4.314409615113552e-06, + "loss": 0.1169, + "num_input_tokens_seen": 31813696, + "step": 150745 + }, + { + "epoch": 16.584158415841586, + "grad_norm": 2.2898688316345215, + "learning_rate": 4.313061884541775e-06, + "loss": 0.1682, + "num_input_tokens_seen": 31814752, + "step": 150750 + }, + { + "epoch": 16.584708470847083, + "grad_norm": 0.02118837833404541, + "learning_rate": 4.311714344631582e-06, + "loss": 0.005, + "num_input_tokens_seen": 31815840, + "step": 150755 + }, + { + "epoch": 16.585258525852584, + "grad_norm": 0.022513892501592636, + "learning_rate": 4.3103669953953925e-06, + "loss": 0.0264, + "num_input_tokens_seen": 31816928, + "step": 150760 + }, + { + "epoch": 16.585808580858085, + "grad_norm": 0.26831018924713135, + "learning_rate": 4.309019836845615e-06, + "loss": 0.0053, + "num_input_tokens_seen": 31818048, + "step": 150765 + }, + { + "epoch": 16.586358635863586, + "grad_norm": 0.04804505780339241, + "learning_rate": 4.307672868994675e-06, + "loss": 0.0538, + "num_input_tokens_seen": 31819072, + "step": 150770 + }, + { + "epoch": 16.586908690869087, + "grad_norm": 0.022494228556752205, + "learning_rate": 4.306326091854992e-06, + "loss": 0.0239, + "num_input_tokens_seen": 31820096, + "step": 150775 + }, + { + "epoch": 16.58745874587459, + "grad_norm": 0.04470156505703926, + "learning_rate": 4.304979505438964e-06, + "loss": 0.012, + "num_input_tokens_seen": 31821184, + "step": 150780 + }, + { + "epoch": 16.58800880088009, + "grad_norm": 0.8920797109603882, + "learning_rate": 4.303633109759015e-06, + "loss": 0.0048, + "num_input_tokens_seen": 31822144, + "step": 150785 + }, + { + "epoch": 16.588558855885587, + "grad_norm": 0.05214235931634903, + "learning_rate": 4.302286904827543e-06, + "loss": 0.0064, + "num_input_tokens_seen": 31823200, + "step": 150790 + }, + { + "epoch": 16.58910891089109, + "grad_norm": 0.4363252520561218, + "learning_rate": 4.300940890656963e-06, + "loss": 0.0037, + "num_input_tokens_seen": 31824256, + "step": 150795 + }, + { + "epoch": 16.58965896589659, + "grad_norm": 0.16064919531345367, + "learning_rate": 4.299595067259682e-06, + "loss": 0.0097, + "num_input_tokens_seen": 31825312, + "step": 150800 + }, + { + "epoch": 16.59020902090209, + "grad_norm": 0.004724796861410141, + "learning_rate": 4.298249434648094e-06, + "loss": 0.0568, + "num_input_tokens_seen": 31826304, + "step": 150805 + }, + { + "epoch": 16.59075907590759, + "grad_norm": 0.08216279745101929, + "learning_rate": 4.296903992834614e-06, + "loss": 0.0821, + "num_input_tokens_seen": 31827360, + "step": 150810 + }, + { + "epoch": 16.591309130913093, + "grad_norm": 0.030671266838908195, + "learning_rate": 4.295558741831629e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31828448, + "step": 150815 + }, + { + "epoch": 16.59185918591859, + "grad_norm": 0.02856704778969288, + "learning_rate": 4.294213681651552e-06, + "loss": 0.0029, + "num_input_tokens_seen": 31829536, + "step": 150820 + }, + { + "epoch": 16.59240924092409, + "grad_norm": 0.22821137309074402, + "learning_rate": 4.292868812306763e-06, + "loss": 0.0037, + "num_input_tokens_seen": 31830624, + "step": 150825 + }, + { + "epoch": 16.592959295929592, + "grad_norm": 0.038354769349098206, + "learning_rate": 4.291524133809668e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31831648, + "step": 150830 + }, + { + "epoch": 16.593509350935093, + "grad_norm": 0.018864883109927177, + "learning_rate": 4.290179646172665e-06, + "loss": 0.0036, + "num_input_tokens_seen": 31832704, + "step": 150835 + }, + { + "epoch": 16.594059405940595, + "grad_norm": 0.07108797878026962, + "learning_rate": 4.288835349408132e-06, + "loss": 0.0024, + "num_input_tokens_seen": 31833760, + "step": 150840 + }, + { + "epoch": 16.594609460946096, + "grad_norm": 1.7356826066970825, + "learning_rate": 4.287491243528471e-06, + "loss": 0.0284, + "num_input_tokens_seen": 31834816, + "step": 150845 + }, + { + "epoch": 16.595159515951597, + "grad_norm": 0.02017853781580925, + "learning_rate": 4.2861473285460644e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31835840, + "step": 150850 + }, + { + "epoch": 16.595709570957094, + "grad_norm": 0.05926349386572838, + "learning_rate": 4.284803604473284e-06, + "loss": 0.0018, + "num_input_tokens_seen": 31836896, + "step": 150855 + }, + { + "epoch": 16.596259625962595, + "grad_norm": 2.0021286010742188, + "learning_rate": 4.283460071322543e-06, + "loss": 0.0536, + "num_input_tokens_seen": 31837920, + "step": 150860 + }, + { + "epoch": 16.596809680968097, + "grad_norm": 0.17989234626293182, + "learning_rate": 4.282116729106203e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31839008, + "step": 150865 + }, + { + "epoch": 16.597359735973598, + "grad_norm": 0.10680720955133438, + "learning_rate": 4.280773577836661e-06, + "loss": 0.0039, + "num_input_tokens_seen": 31840064, + "step": 150870 + }, + { + "epoch": 16.5979097909791, + "grad_norm": 0.036612238734960556, + "learning_rate": 4.279430617526284e-06, + "loss": 0.0008, + "num_input_tokens_seen": 31841088, + "step": 150875 + }, + { + "epoch": 16.5984598459846, + "grad_norm": 0.08897992223501205, + "learning_rate": 4.2780878481874456e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31842112, + "step": 150880 + }, + { + "epoch": 16.599009900990097, + "grad_norm": 0.015388011001050472, + "learning_rate": 4.27674526983253e-06, + "loss": 0.0389, + "num_input_tokens_seen": 31843200, + "step": 150885 + }, + { + "epoch": 16.5995599559956, + "grad_norm": 0.14858368039131165, + "learning_rate": 4.275402882473908e-06, + "loss": 0.05, + "num_input_tokens_seen": 31844320, + "step": 150890 + }, + { + "epoch": 16.6001100110011, + "grad_norm": 1.711364507675171, + "learning_rate": 4.274060686123959e-06, + "loss": 0.1316, + "num_input_tokens_seen": 31845344, + "step": 150895 + }, + { + "epoch": 16.6006600660066, + "grad_norm": 0.19890561699867249, + "learning_rate": 4.272718680795049e-06, + "loss": 0.0052, + "num_input_tokens_seen": 31846336, + "step": 150900 + }, + { + "epoch": 16.6012101210121, + "grad_norm": 1.5761456489562988, + "learning_rate": 4.271376866499535e-06, + "loss": 0.0296, + "num_input_tokens_seen": 31847392, + "step": 150905 + }, + { + "epoch": 16.601760176017603, + "grad_norm": 0.021261878311634064, + "learning_rate": 4.270035243249803e-06, + "loss": 0.0024, + "num_input_tokens_seen": 31848448, + "step": 150910 + }, + { + "epoch": 16.602310231023104, + "grad_norm": 0.0502375029027462, + "learning_rate": 4.268693811058197e-06, + "loss": 0.0486, + "num_input_tokens_seen": 31849504, + "step": 150915 + }, + { + "epoch": 16.6028602860286, + "grad_norm": 0.07391366362571716, + "learning_rate": 4.267352569937105e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31850592, + "step": 150920 + }, + { + "epoch": 16.603410341034103, + "grad_norm": 0.015002958476543427, + "learning_rate": 4.266011519898874e-06, + "loss": 0.0376, + "num_input_tokens_seen": 31851648, + "step": 150925 + }, + { + "epoch": 16.603960396039604, + "grad_norm": 0.13910788297653198, + "learning_rate": 4.2646706609558605e-06, + "loss": 0.005, + "num_input_tokens_seen": 31852672, + "step": 150930 + }, + { + "epoch": 16.604510451045105, + "grad_norm": 0.030011778697371483, + "learning_rate": 4.263329993120432e-06, + "loss": 0.0009, + "num_input_tokens_seen": 31853728, + "step": 150935 + }, + { + "epoch": 16.605060506050606, + "grad_norm": 0.2082183063030243, + "learning_rate": 4.261989516404938e-06, + "loss": 0.005, + "num_input_tokens_seen": 31854720, + "step": 150940 + }, + { + "epoch": 16.605610561056107, + "grad_norm": 0.35136300325393677, + "learning_rate": 4.260649230821734e-06, + "loss": 0.0052, + "num_input_tokens_seen": 31855808, + "step": 150945 + }, + { + "epoch": 16.606160616061608, + "grad_norm": 0.00205700914375484, + "learning_rate": 4.25930913638318e-06, + "loss": 0.0033, + "num_input_tokens_seen": 31856832, + "step": 150950 + }, + { + "epoch": 16.606710671067106, + "grad_norm": 0.028289783746004105, + "learning_rate": 4.257969233101614e-06, + "loss": 0.0008, + "num_input_tokens_seen": 31857824, + "step": 150955 + }, + { + "epoch": 16.607260726072607, + "grad_norm": 0.527468204498291, + "learning_rate": 4.2566295209894e-06, + "loss": 0.0112, + "num_input_tokens_seen": 31858944, + "step": 150960 + }, + { + "epoch": 16.607810781078108, + "grad_norm": 0.0029296299908310175, + "learning_rate": 4.255290000058879e-06, + "loss": 0.0091, + "num_input_tokens_seen": 31860000, + "step": 150965 + }, + { + "epoch": 16.60836083608361, + "grad_norm": 0.1439238041639328, + "learning_rate": 4.253950670322388e-06, + "loss": 0.0032, + "num_input_tokens_seen": 31861024, + "step": 150970 + }, + { + "epoch": 16.60891089108911, + "grad_norm": 0.16178487241268158, + "learning_rate": 4.25261153179228e-06, + "loss": 0.0065, + "num_input_tokens_seen": 31862080, + "step": 150975 + }, + { + "epoch": 16.60946094609461, + "grad_norm": 0.27321940660476685, + "learning_rate": 4.251272584480894e-06, + "loss": 0.0062, + "num_input_tokens_seen": 31863136, + "step": 150980 + }, + { + "epoch": 16.61001100110011, + "grad_norm": 0.030407559126615524, + "learning_rate": 4.249933828400582e-06, + "loss": 0.0162, + "num_input_tokens_seen": 31864256, + "step": 150985 + }, + { + "epoch": 16.61056105610561, + "grad_norm": 0.041785288602113724, + "learning_rate": 4.2485952635636705e-06, + "loss": 0.0365, + "num_input_tokens_seen": 31865280, + "step": 150990 + }, + { + "epoch": 16.61111111111111, + "grad_norm": 0.10553644597530365, + "learning_rate": 4.247256889982493e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31866400, + "step": 150995 + }, + { + "epoch": 16.611661166116612, + "grad_norm": 0.03902282938361168, + "learning_rate": 4.245918707669391e-06, + "loss": 0.1201, + "num_input_tokens_seen": 31867424, + "step": 151000 + }, + { + "epoch": 16.612211221122113, + "grad_norm": 0.22407566010951996, + "learning_rate": 4.244580716636698e-06, + "loss": 0.0035, + "num_input_tokens_seen": 31868480, + "step": 151005 + }, + { + "epoch": 16.612761276127614, + "grad_norm": 0.05526382103562355, + "learning_rate": 4.24324291689675e-06, + "loss": 0.0012, + "num_input_tokens_seen": 31869632, + "step": 151010 + }, + { + "epoch": 16.61331133113311, + "grad_norm": 0.027294475585222244, + "learning_rate": 4.2419053084618746e-06, + "loss": 0.0232, + "num_input_tokens_seen": 31870688, + "step": 151015 + }, + { + "epoch": 16.613861386138613, + "grad_norm": 0.00952145829796791, + "learning_rate": 4.240567891344388e-06, + "loss": 0.0425, + "num_input_tokens_seen": 31871776, + "step": 151020 + }, + { + "epoch": 16.614411441144114, + "grad_norm": 0.13888514041900635, + "learning_rate": 4.239230665556634e-06, + "loss": 0.0077, + "num_input_tokens_seen": 31872800, + "step": 151025 + }, + { + "epoch": 16.614961496149615, + "grad_norm": 0.06245267391204834, + "learning_rate": 4.23789363111092e-06, + "loss": 0.0703, + "num_input_tokens_seen": 31873824, + "step": 151030 + }, + { + "epoch": 16.615511551155116, + "grad_norm": 0.035856593400239944, + "learning_rate": 4.23655678801958e-06, + "loss": 0.005, + "num_input_tokens_seen": 31874880, + "step": 151035 + }, + { + "epoch": 16.616061606160617, + "grad_norm": 0.03640352562069893, + "learning_rate": 4.23522013629494e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31875936, + "step": 151040 + }, + { + "epoch": 16.616611661166118, + "grad_norm": 3.205267906188965, + "learning_rate": 4.233883675949304e-06, + "loss": 0.1051, + "num_input_tokens_seen": 31876928, + "step": 151045 + }, + { + "epoch": 16.617161716171616, + "grad_norm": 0.030611924827098846, + "learning_rate": 4.232547406995005e-06, + "loss": 0.0075, + "num_input_tokens_seen": 31877984, + "step": 151050 + }, + { + "epoch": 16.617711771177117, + "grad_norm": 0.019061904400587082, + "learning_rate": 4.231211329444346e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31879040, + "step": 151055 + }, + { + "epoch": 16.618261826182618, + "grad_norm": 0.026986384764313698, + "learning_rate": 4.229875443309645e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31880096, + "step": 151060 + }, + { + "epoch": 16.61881188118812, + "grad_norm": 0.008094166405498981, + "learning_rate": 4.228539748603219e-06, + "loss": 0.0015, + "num_input_tokens_seen": 31881152, + "step": 151065 + }, + { + "epoch": 16.61936193619362, + "grad_norm": 0.006963068153709173, + "learning_rate": 4.227204245337374e-06, + "loss": 0.0031, + "num_input_tokens_seen": 31882176, + "step": 151070 + }, + { + "epoch": 16.61991199119912, + "grad_norm": 0.011206677183508873, + "learning_rate": 4.225868933524421e-06, + "loss": 0.0624, + "num_input_tokens_seen": 31883232, + "step": 151075 + }, + { + "epoch": 16.620462046204622, + "grad_norm": 0.004443114157766104, + "learning_rate": 4.224533813176662e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31884224, + "step": 151080 + }, + { + "epoch": 16.62101210121012, + "grad_norm": 0.2106190174818039, + "learning_rate": 4.223198884306412e-06, + "loss": 0.0035, + "num_input_tokens_seen": 31885248, + "step": 151085 + }, + { + "epoch": 16.62156215621562, + "grad_norm": 0.009046295657753944, + "learning_rate": 4.221864146925963e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31886304, + "step": 151090 + }, + { + "epoch": 16.622112211221122, + "grad_norm": 0.06396332383155823, + "learning_rate": 4.220529601047621e-06, + "loss": 0.0073, + "num_input_tokens_seen": 31887360, + "step": 151095 + }, + { + "epoch": 16.622662266226623, + "grad_norm": 0.013745073229074478, + "learning_rate": 4.219195246683691e-06, + "loss": 0.0072, + "num_input_tokens_seen": 31888384, + "step": 151100 + }, + { + "epoch": 16.623212321232124, + "grad_norm": 0.007430146913975477, + "learning_rate": 4.217861083846464e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31889408, + "step": 151105 + }, + { + "epoch": 16.623762376237625, + "grad_norm": 0.05028454214334488, + "learning_rate": 4.216527112548243e-06, + "loss": 0.0115, + "num_input_tokens_seen": 31890496, + "step": 151110 + }, + { + "epoch": 16.624312431243123, + "grad_norm": 0.008483748883008957, + "learning_rate": 4.2151933328013125e-06, + "loss": 0.0338, + "num_input_tokens_seen": 31891584, + "step": 151115 + }, + { + "epoch": 16.624862486248624, + "grad_norm": 0.01506616361439228, + "learning_rate": 4.213859744617973e-06, + "loss": 0.0052, + "num_input_tokens_seen": 31892672, + "step": 151120 + }, + { + "epoch": 16.625412541254125, + "grad_norm": 0.04472386837005615, + "learning_rate": 4.21252634801052e-06, + "loss": 0.0246, + "num_input_tokens_seen": 31893728, + "step": 151125 + }, + { + "epoch": 16.625962596259626, + "grad_norm": 0.03359399363398552, + "learning_rate": 4.21119314299123e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31894752, + "step": 151130 + }, + { + "epoch": 16.626512651265127, + "grad_norm": 0.22846148908138275, + "learning_rate": 4.209860129572404e-06, + "loss": 0.003, + "num_input_tokens_seen": 31895776, + "step": 151135 + }, + { + "epoch": 16.627062706270628, + "grad_norm": 0.037084534764289856, + "learning_rate": 4.208527307766319e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31896832, + "step": 151140 + }, + { + "epoch": 16.62761276127613, + "grad_norm": 0.007265027612447739, + "learning_rate": 4.207194677585258e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31897888, + "step": 151145 + }, + { + "epoch": 16.628162816281627, + "grad_norm": 0.09316696971654892, + "learning_rate": 4.205862239041505e-06, + "loss": 0.0337, + "num_input_tokens_seen": 31898944, + "step": 151150 + }, + { + "epoch": 16.628712871287128, + "grad_norm": 0.2210487276315689, + "learning_rate": 4.204529992147346e-06, + "loss": 0.0098, + "num_input_tokens_seen": 31900032, + "step": 151155 + }, + { + "epoch": 16.62926292629263, + "grad_norm": 0.08924774080514908, + "learning_rate": 4.203197936915051e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31901184, + "step": 151160 + }, + { + "epoch": 16.62981298129813, + "grad_norm": 0.563075065612793, + "learning_rate": 4.201866073356908e-06, + "loss": 0.0712, + "num_input_tokens_seen": 31902272, + "step": 151165 + }, + { + "epoch": 16.63036303630363, + "grad_norm": 0.3602854907512665, + "learning_rate": 4.200534401485179e-06, + "loss": 0.093, + "num_input_tokens_seen": 31903328, + "step": 151170 + }, + { + "epoch": 16.630913091309132, + "grad_norm": 0.02306760475039482, + "learning_rate": 4.1992029213121455e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31904416, + "step": 151175 + }, + { + "epoch": 16.63146314631463, + "grad_norm": 0.009061303921043873, + "learning_rate": 4.197871632850081e-06, + "loss": 0.0006, + "num_input_tokens_seen": 31905440, + "step": 151180 + }, + { + "epoch": 16.63201320132013, + "grad_norm": 0.024678319692611694, + "learning_rate": 4.196540536111246e-06, + "loss": 0.0338, + "num_input_tokens_seen": 31906496, + "step": 151185 + }, + { + "epoch": 16.632563256325632, + "grad_norm": 0.058473244309425354, + "learning_rate": 4.195209631107921e-06, + "loss": 0.1177, + "num_input_tokens_seen": 31907552, + "step": 151190 + }, + { + "epoch": 16.633113311331133, + "grad_norm": 0.0119326775893569, + "learning_rate": 4.193878917852362e-06, + "loss": 0.0094, + "num_input_tokens_seen": 31908576, + "step": 151195 + }, + { + "epoch": 16.633663366336634, + "grad_norm": 0.005473021883517504, + "learning_rate": 4.1925483963568405e-06, + "loss": 0.005, + "num_input_tokens_seen": 31909664, + "step": 151200 + }, + { + "epoch": 16.634213421342135, + "grad_norm": 0.034057989716529846, + "learning_rate": 4.191218066633609e-06, + "loss": 0.0115, + "num_input_tokens_seen": 31910720, + "step": 151205 + }, + { + "epoch": 16.634763476347636, + "grad_norm": 0.5382419228553772, + "learning_rate": 4.189887928694939e-06, + "loss": 0.0047, + "num_input_tokens_seen": 31911776, + "step": 151210 + }, + { + "epoch": 16.635313531353134, + "grad_norm": 0.005779968108981848, + "learning_rate": 4.18855798255309e-06, + "loss": 0.008, + "num_input_tokens_seen": 31912864, + "step": 151215 + }, + { + "epoch": 16.635863586358635, + "grad_norm": 0.037205785512924194, + "learning_rate": 4.1872282282203105e-06, + "loss": 0.0017, + "num_input_tokens_seen": 31913952, + "step": 151220 + }, + { + "epoch": 16.636413641364136, + "grad_norm": 1.806037425994873, + "learning_rate": 4.185898665708868e-06, + "loss": 0.0653, + "num_input_tokens_seen": 31914976, + "step": 151225 + }, + { + "epoch": 16.636963696369637, + "grad_norm": 0.013940063305199146, + "learning_rate": 4.184569295031002e-06, + "loss": 0.01, + "num_input_tokens_seen": 31916032, + "step": 151230 + }, + { + "epoch": 16.63751375137514, + "grad_norm": 0.028081929311156273, + "learning_rate": 4.183240116198975e-06, + "loss": 0.0097, + "num_input_tokens_seen": 31917088, + "step": 151235 + }, + { + "epoch": 16.63806380638064, + "grad_norm": 0.05552598461508751, + "learning_rate": 4.181911129225039e-06, + "loss": 0.0019, + "num_input_tokens_seen": 31918208, + "step": 151240 + }, + { + "epoch": 16.638613861386137, + "grad_norm": 0.08552609384059906, + "learning_rate": 4.180582334121433e-06, + "loss": 0.0373, + "num_input_tokens_seen": 31919264, + "step": 151245 + }, + { + "epoch": 16.639163916391638, + "grad_norm": 0.005052626598626375, + "learning_rate": 4.179253730900418e-06, + "loss": 0.0116, + "num_input_tokens_seen": 31920320, + "step": 151250 + }, + { + "epoch": 16.63971397139714, + "grad_norm": 0.1412767767906189, + "learning_rate": 4.177925319574228e-06, + "loss": 0.0078, + "num_input_tokens_seen": 31921376, + "step": 151255 + }, + { + "epoch": 16.64026402640264, + "grad_norm": 0.10148870944976807, + "learning_rate": 4.176597100155103e-06, + "loss": 0.0035, + "num_input_tokens_seen": 31922528, + "step": 151260 + }, + { + "epoch": 16.64081408140814, + "grad_norm": 0.014988461509346962, + "learning_rate": 4.175269072655291e-06, + "loss": 0.0077, + "num_input_tokens_seen": 31923552, + "step": 151265 + }, + { + "epoch": 16.641364136413642, + "grad_norm": 0.005274834111332893, + "learning_rate": 4.173941237087031e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31924640, + "step": 151270 + }, + { + "epoch": 16.641914191419144, + "grad_norm": 0.20940524339675903, + "learning_rate": 4.172613593462566e-06, + "loss": 0.0016, + "num_input_tokens_seen": 31925728, + "step": 151275 + }, + { + "epoch": 16.64246424642464, + "grad_norm": 0.017420465126633644, + "learning_rate": 4.171286141794131e-06, + "loss": 0.0499, + "num_input_tokens_seen": 31926720, + "step": 151280 + }, + { + "epoch": 16.643014301430142, + "grad_norm": 0.0029577685054391623, + "learning_rate": 4.1699588820939466e-06, + "loss": 0.0006, + "num_input_tokens_seen": 31927776, + "step": 151285 + }, + { + "epoch": 16.643564356435643, + "grad_norm": 0.07477188855409622, + "learning_rate": 4.1686318143742604e-06, + "loss": 0.006, + "num_input_tokens_seen": 31928832, + "step": 151290 + }, + { + "epoch": 16.644114411441144, + "grad_norm": 0.023402784019708633, + "learning_rate": 4.167304938647296e-06, + "loss": 0.072, + "num_input_tokens_seen": 31929920, + "step": 151295 + }, + { + "epoch": 16.644664466446645, + "grad_norm": 0.03512706980109215, + "learning_rate": 4.1659782549252925e-06, + "loss": 0.1216, + "num_input_tokens_seen": 31930912, + "step": 151300 + }, + { + "epoch": 16.645214521452147, + "grad_norm": 0.04533539339900017, + "learning_rate": 4.16465176322047e-06, + "loss": 0.0014, + "num_input_tokens_seen": 31931968, + "step": 151305 + }, + { + "epoch": 16.645764576457644, + "grad_norm": 0.02146153151988983, + "learning_rate": 4.163325463545048e-06, + "loss": 0.0039, + "num_input_tokens_seen": 31933088, + "step": 151310 + }, + { + "epoch": 16.646314631463145, + "grad_norm": 0.15791717171669006, + "learning_rate": 4.161999355911264e-06, + "loss": 0.003, + "num_input_tokens_seen": 31934080, + "step": 151315 + }, + { + "epoch": 16.646864686468646, + "grad_norm": 0.12181491404771805, + "learning_rate": 4.160673440331328e-06, + "loss": 0.0411, + "num_input_tokens_seen": 31935136, + "step": 151320 + }, + { + "epoch": 16.647414741474147, + "grad_norm": 0.01031066756695509, + "learning_rate": 4.159347716817463e-06, + "loss": 0.0049, + "num_input_tokens_seen": 31936192, + "step": 151325 + }, + { + "epoch": 16.64796479647965, + "grad_norm": 0.2515348494052887, + "learning_rate": 4.158022185381896e-06, + "loss": 0.0798, + "num_input_tokens_seen": 31937216, + "step": 151330 + }, + { + "epoch": 16.64851485148515, + "grad_norm": 0.39211758971214294, + "learning_rate": 4.156696846036831e-06, + "loss": 0.087, + "num_input_tokens_seen": 31938272, + "step": 151335 + }, + { + "epoch": 16.64906490649065, + "grad_norm": 0.010064559057354927, + "learning_rate": 4.155371698794497e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31939264, + "step": 151340 + }, + { + "epoch": 16.649614961496148, + "grad_norm": 0.15849050879478455, + "learning_rate": 4.1540467436670946e-06, + "loss": 0.0107, + "num_input_tokens_seen": 31940288, + "step": 151345 + }, + { + "epoch": 16.65016501650165, + "grad_norm": 0.027404794469475746, + "learning_rate": 4.1527219806668375e-06, + "loss": 0.0011, + "num_input_tokens_seen": 31941376, + "step": 151350 + }, + { + "epoch": 16.65071507150715, + "grad_norm": 0.012754896655678749, + "learning_rate": 4.151397409805946e-06, + "loss": 0.002, + "num_input_tokens_seen": 31942400, + "step": 151355 + }, + { + "epoch": 16.65126512651265, + "grad_norm": 0.03283287212252617, + "learning_rate": 4.1500730310966155e-06, + "loss": 0.0023, + "num_input_tokens_seen": 31943424, + "step": 151360 + }, + { + "epoch": 16.651815181518153, + "grad_norm": 0.31896376609802246, + "learning_rate": 4.148748844551062e-06, + "loss": 0.0077, + "num_input_tokens_seen": 31944448, + "step": 151365 + }, + { + "epoch": 16.652365236523654, + "grad_norm": 0.04936298355460167, + "learning_rate": 4.147424850181486e-06, + "loss": 0.0027, + "num_input_tokens_seen": 31945472, + "step": 151370 + }, + { + "epoch": 16.652915291529155, + "grad_norm": 0.03201668709516525, + "learning_rate": 4.146101048000081e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31946560, + "step": 151375 + }, + { + "epoch": 16.653465346534652, + "grad_norm": 0.09013764560222626, + "learning_rate": 4.144777438019057e-06, + "loss": 0.0042, + "num_input_tokens_seen": 31947616, + "step": 151380 + }, + { + "epoch": 16.654015401540153, + "grad_norm": 0.06559015065431595, + "learning_rate": 4.1434540202506115e-06, + "loss": 0.0501, + "num_input_tokens_seen": 31948672, + "step": 151385 + }, + { + "epoch": 16.654565456545654, + "grad_norm": 0.24286285042762756, + "learning_rate": 4.14213079470695e-06, + "loss": 0.005, + "num_input_tokens_seen": 31949792, + "step": 151390 + }, + { + "epoch": 16.655115511551156, + "grad_norm": 0.028419384732842445, + "learning_rate": 4.1408077614002585e-06, + "loss": 0.122, + "num_input_tokens_seen": 31950816, + "step": 151395 + }, + { + "epoch": 16.655665566556657, + "grad_norm": 0.027410142123699188, + "learning_rate": 4.1394849203427284e-06, + "loss": 0.0012, + "num_input_tokens_seen": 31951936, + "step": 151400 + }, + { + "epoch": 16.656215621562158, + "grad_norm": 0.12605369091033936, + "learning_rate": 4.138162271546555e-06, + "loss": 0.135, + "num_input_tokens_seen": 31952992, + "step": 151405 + }, + { + "epoch": 16.656765676567655, + "grad_norm": 0.011700090952217579, + "learning_rate": 4.1368398150239296e-06, + "loss": 0.0086, + "num_input_tokens_seen": 31954016, + "step": 151410 + }, + { + "epoch": 16.657315731573156, + "grad_norm": 0.007217299658805132, + "learning_rate": 4.135517550787046e-06, + "loss": 0.0022, + "num_input_tokens_seen": 31955040, + "step": 151415 + }, + { + "epoch": 16.657865786578657, + "grad_norm": 0.012388059869408607, + "learning_rate": 4.134195478848088e-06, + "loss": 0.0006, + "num_input_tokens_seen": 31956128, + "step": 151420 + }, + { + "epoch": 16.65841584158416, + "grad_norm": 0.015438810922205448, + "learning_rate": 4.132873599219231e-06, + "loss": 0.0347, + "num_input_tokens_seen": 31957216, + "step": 151425 + }, + { + "epoch": 16.65896589658966, + "grad_norm": 0.04335092008113861, + "learning_rate": 4.131551911912668e-06, + "loss": 0.0033, + "num_input_tokens_seen": 31958208, + "step": 151430 + }, + { + "epoch": 16.65951595159516, + "grad_norm": 0.002873862162232399, + "learning_rate": 4.1302304169405726e-06, + "loss": 0.0006, + "num_input_tokens_seen": 31959232, + "step": 151435 + }, + { + "epoch": 16.66006600660066, + "grad_norm": 0.09861137717962265, + "learning_rate": 4.1289091143151295e-06, + "loss": 0.039, + "num_input_tokens_seen": 31960352, + "step": 151440 + }, + { + "epoch": 16.66061606160616, + "grad_norm": 0.025219189003109932, + "learning_rate": 4.127588004048524e-06, + "loss": 0.0555, + "num_input_tokens_seen": 31961408, + "step": 151445 + }, + { + "epoch": 16.66116611661166, + "grad_norm": 0.023213325068354607, + "learning_rate": 4.126267086152916e-06, + "loss": 0.0331, + "num_input_tokens_seen": 31962464, + "step": 151450 + }, + { + "epoch": 16.66171617161716, + "grad_norm": 0.13784359395503998, + "learning_rate": 4.124946360640494e-06, + "loss": 0.0025, + "num_input_tokens_seen": 31963584, + "step": 151455 + }, + { + "epoch": 16.662266226622663, + "grad_norm": 1.338616967201233, + "learning_rate": 4.123625827523419e-06, + "loss": 0.0103, + "num_input_tokens_seen": 31964640, + "step": 151460 + }, + { + "epoch": 16.662816281628164, + "grad_norm": 0.003447241149842739, + "learning_rate": 4.122305486813868e-06, + "loss": 0.0007, + "num_input_tokens_seen": 31965632, + "step": 151465 + }, + { + "epoch": 16.663366336633665, + "grad_norm": 2.65958309173584, + "learning_rate": 4.120985338524017e-06, + "loss": 0.077, + "num_input_tokens_seen": 31966720, + "step": 151470 + }, + { + "epoch": 16.663916391639162, + "grad_norm": 0.005522461608052254, + "learning_rate": 4.119665382666016e-06, + "loss": 0.0021, + "num_input_tokens_seen": 31967808, + "step": 151475 + }, + { + "epoch": 16.664466446644663, + "grad_norm": 0.4648074209690094, + "learning_rate": 4.118345619252048e-06, + "loss": 0.0065, + "num_input_tokens_seen": 31968864, + "step": 151480 + }, + { + "epoch": 16.665016501650165, + "grad_norm": 0.01855027861893177, + "learning_rate": 4.117026048294262e-06, + "loss": 0.0018, + "num_input_tokens_seen": 31969920, + "step": 151485 + }, + { + "epoch": 16.665566556655666, + "grad_norm": 0.12971174716949463, + "learning_rate": 4.115706669804831e-06, + "loss": 0.0043, + "num_input_tokens_seen": 31970944, + "step": 151490 + }, + { + "epoch": 16.666116611661167, + "grad_norm": 0.2874564826488495, + "learning_rate": 4.114387483795906e-06, + "loss": 0.0034, + "num_input_tokens_seen": 31972000, + "step": 151495 + }, + { + "epoch": 16.666666666666668, + "grad_norm": 0.018240952864289284, + "learning_rate": 4.113068490279648e-06, + "loss": 0.0398, + "num_input_tokens_seen": 31973056, + "step": 151500 + }, + { + "epoch": 16.66721672167217, + "grad_norm": 0.017932457849383354, + "learning_rate": 4.111749689268224e-06, + "loss": 0.0738, + "num_input_tokens_seen": 31974112, + "step": 151505 + }, + { + "epoch": 16.667766776677666, + "grad_norm": 0.013885658234357834, + "learning_rate": 4.110431080773772e-06, + "loss": 0.0016, + "num_input_tokens_seen": 31975168, + "step": 151510 + }, + { + "epoch": 16.668316831683168, + "grad_norm": 0.00316402199678123, + "learning_rate": 4.1091126648084594e-06, + "loss": 0.0056, + "num_input_tokens_seen": 31976224, + "step": 151515 + }, + { + "epoch": 16.66886688668867, + "grad_norm": 0.18571288883686066, + "learning_rate": 4.10779444138443e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31977280, + "step": 151520 + }, + { + "epoch": 16.66941694169417, + "grad_norm": 0.028336869552731514, + "learning_rate": 4.1064764105138205e-06, + "loss": 0.0059, + "num_input_tokens_seen": 31978304, + "step": 151525 + }, + { + "epoch": 16.66996699669967, + "grad_norm": 1.6968930959701538, + "learning_rate": 4.105158572208809e-06, + "loss": 0.0985, + "num_input_tokens_seen": 31979360, + "step": 151530 + }, + { + "epoch": 16.670517051705172, + "grad_norm": 0.0814741998910904, + "learning_rate": 4.10384092648152e-06, + "loss": 0.0415, + "num_input_tokens_seen": 31980448, + "step": 151535 + }, + { + "epoch": 16.67106710671067, + "grad_norm": 0.1626010239124298, + "learning_rate": 4.1025234733440995e-06, + "loss": 0.0028, + "num_input_tokens_seen": 31981472, + "step": 151540 + }, + { + "epoch": 16.67161716171617, + "grad_norm": 0.362749308347702, + "learning_rate": 4.101206212808698e-06, + "loss": 0.0039, + "num_input_tokens_seen": 31982592, + "step": 151545 + }, + { + "epoch": 16.67216721672167, + "grad_norm": 0.027351761236786842, + "learning_rate": 4.099889144887442e-06, + "loss": 0.067, + "num_input_tokens_seen": 31983680, + "step": 151550 + }, + { + "epoch": 16.672717271727173, + "grad_norm": 0.00525079108774662, + "learning_rate": 4.098572269592482e-06, + "loss": 0.0013, + "num_input_tokens_seen": 31984704, + "step": 151555 + }, + { + "epoch": 16.673267326732674, + "grad_norm": 0.014830883592367172, + "learning_rate": 4.097255586935958e-06, + "loss": 0.0081, + "num_input_tokens_seen": 31985760, + "step": 151560 + }, + { + "epoch": 16.673817381738175, + "grad_norm": 0.016335805878043175, + "learning_rate": 4.09593909692999e-06, + "loss": 0.0992, + "num_input_tokens_seen": 31986880, + "step": 151565 + }, + { + "epoch": 16.674367436743676, + "grad_norm": 1.4753704071044922, + "learning_rate": 4.094622799586728e-06, + "loss": 0.0175, + "num_input_tokens_seen": 31987936, + "step": 151570 + }, + { + "epoch": 16.674917491749174, + "grad_norm": 0.10098566859960556, + "learning_rate": 4.0933066949182915e-06, + "loss": 0.0042, + "num_input_tokens_seen": 31988992, + "step": 151575 + }, + { + "epoch": 16.675467546754675, + "grad_norm": 0.031099114567041397, + "learning_rate": 4.0919907829368165e-06, + "loss": 0.0038, + "num_input_tokens_seen": 31989984, + "step": 151580 + }, + { + "epoch": 16.676017601760176, + "grad_norm": 0.2759786546230316, + "learning_rate": 4.090675063654434e-06, + "loss": 0.0168, + "num_input_tokens_seen": 31991008, + "step": 151585 + }, + { + "epoch": 16.676567656765677, + "grad_norm": 1.0114761590957642, + "learning_rate": 4.089359537083259e-06, + "loss": 0.0248, + "num_input_tokens_seen": 31992064, + "step": 151590 + }, + { + "epoch": 16.677117711771178, + "grad_norm": 0.05741708353161812, + "learning_rate": 4.08804420323543e-06, + "loss": 0.0078, + "num_input_tokens_seen": 31993088, + "step": 151595 + }, + { + "epoch": 16.67766776677668, + "grad_norm": 0.0787505954504013, + "learning_rate": 4.086729062123057e-06, + "loss": 0.0029, + "num_input_tokens_seen": 31994112, + "step": 151600 + }, + { + "epoch": 16.678217821782177, + "grad_norm": 0.01341842021793127, + "learning_rate": 4.085414113758274e-06, + "loss": 0.0652, + "num_input_tokens_seen": 31995200, + "step": 151605 + }, + { + "epoch": 16.678767876787678, + "grad_norm": 0.04648863524198532, + "learning_rate": 4.084099358153188e-06, + "loss": 0.052, + "num_input_tokens_seen": 31996256, + "step": 151610 + }, + { + "epoch": 16.67931793179318, + "grad_norm": 0.04827500879764557, + "learning_rate": 4.0827847953199185e-06, + "loss": 0.131, + "num_input_tokens_seen": 31997280, + "step": 151615 + }, + { + "epoch": 16.67986798679868, + "grad_norm": 3.4204554557800293, + "learning_rate": 4.081470425270592e-06, + "loss": 0.1288, + "num_input_tokens_seen": 31998304, + "step": 151620 + }, + { + "epoch": 16.68041804180418, + "grad_norm": 0.15817497670650482, + "learning_rate": 4.080156248017311e-06, + "loss": 0.0507, + "num_input_tokens_seen": 31999392, + "step": 151625 + }, + { + "epoch": 16.680968096809682, + "grad_norm": 0.12678247690200806, + "learning_rate": 4.078842263572194e-06, + "loss": 0.002, + "num_input_tokens_seen": 32000480, + "step": 151630 + }, + { + "epoch": 16.681518151815183, + "grad_norm": 0.005734989419579506, + "learning_rate": 4.077528471947353e-06, + "loss": 0.0538, + "num_input_tokens_seen": 32001504, + "step": 151635 + }, + { + "epoch": 16.68206820682068, + "grad_norm": 0.05563308298587799, + "learning_rate": 4.076214873154877e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32002624, + "step": 151640 + }, + { + "epoch": 16.682618261826182, + "grad_norm": 0.018352335318922997, + "learning_rate": 4.0749014672069014e-06, + "loss": 0.0209, + "num_input_tokens_seen": 32003616, + "step": 151645 + }, + { + "epoch": 16.683168316831683, + "grad_norm": 0.036739103496074677, + "learning_rate": 4.073588254115513e-06, + "loss": 0.1034, + "num_input_tokens_seen": 32004640, + "step": 151650 + }, + { + "epoch": 16.683718371837184, + "grad_norm": 0.011298034340143204, + "learning_rate": 4.072275233892825e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32005760, + "step": 151655 + }, + { + "epoch": 16.684268426842685, + "grad_norm": 2.5132319927215576, + "learning_rate": 4.070962406550935e-06, + "loss": 0.1442, + "num_input_tokens_seen": 32006816, + "step": 151660 + }, + { + "epoch": 16.684818481848186, + "grad_norm": 0.02479134313762188, + "learning_rate": 4.069649772101936e-06, + "loss": 0.0055, + "num_input_tokens_seen": 32007904, + "step": 151665 + }, + { + "epoch": 16.685368536853684, + "grad_norm": 0.8151726722717285, + "learning_rate": 4.068337330557931e-06, + "loss": 0.0242, + "num_input_tokens_seen": 32008928, + "step": 151670 + }, + { + "epoch": 16.685918591859185, + "grad_norm": 0.06671746075153351, + "learning_rate": 4.067025081931017e-06, + "loss": 0.0405, + "num_input_tokens_seen": 32010016, + "step": 151675 + }, + { + "epoch": 16.686468646864686, + "grad_norm": 5.468995571136475, + "learning_rate": 4.065713026233295e-06, + "loss": 0.0407, + "num_input_tokens_seen": 32011104, + "step": 151680 + }, + { + "epoch": 16.687018701870187, + "grad_norm": 0.004974331706762314, + "learning_rate": 4.0644011634768496e-06, + "loss": 0.0161, + "num_input_tokens_seen": 32012160, + "step": 151685 + }, + { + "epoch": 16.687568756875688, + "grad_norm": 0.01205726433545351, + "learning_rate": 4.0630894936737664e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32013152, + "step": 151690 + }, + { + "epoch": 16.68811881188119, + "grad_norm": 0.040846940129995346, + "learning_rate": 4.061778016836148e-06, + "loss": 0.0685, + "num_input_tokens_seen": 32014272, + "step": 151695 + }, + { + "epoch": 16.68866886688669, + "grad_norm": 0.4743776321411133, + "learning_rate": 4.060466732976062e-06, + "loss": 0.007, + "num_input_tokens_seen": 32015296, + "step": 151700 + }, + { + "epoch": 16.689218921892188, + "grad_norm": 1.29700767993927, + "learning_rate": 4.059155642105616e-06, + "loss": 0.0237, + "num_input_tokens_seen": 32016384, + "step": 151705 + }, + { + "epoch": 16.68976897689769, + "grad_norm": 0.12019701302051544, + "learning_rate": 4.057844744236888e-06, + "loss": 0.0058, + "num_input_tokens_seen": 32017376, + "step": 151710 + }, + { + "epoch": 16.69031903190319, + "grad_norm": 0.009743206202983856, + "learning_rate": 4.056534039381949e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32018432, + "step": 151715 + }, + { + "epoch": 16.69086908690869, + "grad_norm": 0.008833254687488079, + "learning_rate": 4.05522352755289e-06, + "loss": 0.101, + "num_input_tokens_seen": 32019488, + "step": 151720 + }, + { + "epoch": 16.691419141914192, + "grad_norm": 0.09883498400449753, + "learning_rate": 4.053913208761781e-06, + "loss": 0.0467, + "num_input_tokens_seen": 32020576, + "step": 151725 + }, + { + "epoch": 16.691969196919693, + "grad_norm": 0.7535621523857117, + "learning_rate": 4.052603083020703e-06, + "loss": 0.0507, + "num_input_tokens_seen": 32021696, + "step": 151730 + }, + { + "epoch": 16.69251925192519, + "grad_norm": 0.06891480833292007, + "learning_rate": 4.051293150341737e-06, + "loss": 0.0029, + "num_input_tokens_seen": 32022784, + "step": 151735 + }, + { + "epoch": 16.693069306930692, + "grad_norm": 0.021166564896702766, + "learning_rate": 4.049983410736946e-06, + "loss": 0.0217, + "num_input_tokens_seen": 32023808, + "step": 151740 + }, + { + "epoch": 16.693619361936193, + "grad_norm": 0.05268336832523346, + "learning_rate": 4.048673864218408e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32024832, + "step": 151745 + }, + { + "epoch": 16.694169416941694, + "grad_norm": 0.01257331296801567, + "learning_rate": 4.04736451079819e-06, + "loss": 0.0337, + "num_input_tokens_seen": 32025856, + "step": 151750 + }, + { + "epoch": 16.694719471947195, + "grad_norm": 2.863830804824829, + "learning_rate": 4.046055350488354e-06, + "loss": 0.0353, + "num_input_tokens_seen": 32026880, + "step": 151755 + }, + { + "epoch": 16.695269526952696, + "grad_norm": 0.15196628868579865, + "learning_rate": 4.044746383300971e-06, + "loss": 0.0249, + "num_input_tokens_seen": 32027968, + "step": 151760 + }, + { + "epoch": 16.695819581958197, + "grad_norm": 0.09682799130678177, + "learning_rate": 4.043437609248105e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32029056, + "step": 151765 + }, + { + "epoch": 16.696369636963695, + "grad_norm": 0.05447513610124588, + "learning_rate": 4.042129028341823e-06, + "loss": 0.033, + "num_input_tokens_seen": 32030080, + "step": 151770 + }, + { + "epoch": 16.696919691969196, + "grad_norm": 0.008240108378231525, + "learning_rate": 4.040820640594184e-06, + "loss": 0.018, + "num_input_tokens_seen": 32031200, + "step": 151775 + }, + { + "epoch": 16.697469746974697, + "grad_norm": 0.015820857137441635, + "learning_rate": 4.039512446017238e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32032320, + "step": 151780 + }, + { + "epoch": 16.698019801980198, + "grad_norm": 0.04855228587985039, + "learning_rate": 4.038204444623045e-06, + "loss": 0.0619, + "num_input_tokens_seen": 32033408, + "step": 151785 + }, + { + "epoch": 16.6985698569857, + "grad_norm": 0.0012396065285429358, + "learning_rate": 4.036896636423665e-06, + "loss": 0.0914, + "num_input_tokens_seen": 32034400, + "step": 151790 + }, + { + "epoch": 16.6991199119912, + "grad_norm": 0.05328337103128433, + "learning_rate": 4.035589021431157e-06, + "loss": 0.0064, + "num_input_tokens_seen": 32035520, + "step": 151795 + }, + { + "epoch": 16.6996699669967, + "grad_norm": 2.917052745819092, + "learning_rate": 4.034281599657563e-06, + "loss": 0.0316, + "num_input_tokens_seen": 32036608, + "step": 151800 + }, + { + "epoch": 16.7002200220022, + "grad_norm": 1.3967206478118896, + "learning_rate": 4.032974371114931e-06, + "loss": 0.0387, + "num_input_tokens_seen": 32037696, + "step": 151805 + }, + { + "epoch": 16.7007700770077, + "grad_norm": 0.12642444670200348, + "learning_rate": 4.031667335815317e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32038720, + "step": 151810 + }, + { + "epoch": 16.7013201320132, + "grad_norm": 0.023402124643325806, + "learning_rate": 4.03036049377076e-06, + "loss": 0.0005, + "num_input_tokens_seen": 32039776, + "step": 151815 + }, + { + "epoch": 16.701870187018702, + "grad_norm": 0.04857123643159866, + "learning_rate": 4.029053844993308e-06, + "loss": 0.1206, + "num_input_tokens_seen": 32040864, + "step": 151820 + }, + { + "epoch": 16.702420242024203, + "grad_norm": 0.06401089578866959, + "learning_rate": 4.027747389495007e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32041952, + "step": 151825 + }, + { + "epoch": 16.702970297029704, + "grad_norm": 0.212161123752594, + "learning_rate": 4.026441127287892e-06, + "loss": 0.0045, + "num_input_tokens_seen": 32043072, + "step": 151830 + }, + { + "epoch": 16.703520352035202, + "grad_norm": 0.006466935388743877, + "learning_rate": 4.0251350583840106e-06, + "loss": 0.0976, + "num_input_tokens_seen": 32044128, + "step": 151835 + }, + { + "epoch": 16.704070407040703, + "grad_norm": 0.01563296653330326, + "learning_rate": 4.023829182795386e-06, + "loss": 0.0017, + "num_input_tokens_seen": 32045184, + "step": 151840 + }, + { + "epoch": 16.704620462046204, + "grad_norm": 0.041472118347883224, + "learning_rate": 4.022523500534065e-06, + "loss": 0.1288, + "num_input_tokens_seen": 32046336, + "step": 151845 + }, + { + "epoch": 16.705170517051705, + "grad_norm": 0.0046800170093774796, + "learning_rate": 4.0212180116120854e-06, + "loss": 0.0834, + "num_input_tokens_seen": 32047392, + "step": 151850 + }, + { + "epoch": 16.705720572057206, + "grad_norm": 0.29499104619026184, + "learning_rate": 4.019912716041466e-06, + "loss": 0.002, + "num_input_tokens_seen": 32048448, + "step": 151855 + }, + { + "epoch": 16.706270627062707, + "grad_norm": 0.2542218267917633, + "learning_rate": 4.018607613834249e-06, + "loss": 0.0047, + "num_input_tokens_seen": 32049504, + "step": 151860 + }, + { + "epoch": 16.706820682068205, + "grad_norm": 0.16775886714458466, + "learning_rate": 4.017302705002452e-06, + "loss": 0.0429, + "num_input_tokens_seen": 32050624, + "step": 151865 + }, + { + "epoch": 16.707370737073706, + "grad_norm": 0.009255070239305496, + "learning_rate": 4.015997989558112e-06, + "loss": 0.0055, + "num_input_tokens_seen": 32051680, + "step": 151870 + }, + { + "epoch": 16.707920792079207, + "grad_norm": 0.05081946402788162, + "learning_rate": 4.014693467513248e-06, + "loss": 0.0246, + "num_input_tokens_seen": 32052736, + "step": 151875 + }, + { + "epoch": 16.70847084708471, + "grad_norm": 0.009490065276622772, + "learning_rate": 4.013389138879881e-06, + "loss": 0.0088, + "num_input_tokens_seen": 32053792, + "step": 151880 + }, + { + "epoch": 16.70902090209021, + "grad_norm": 0.00733546307310462, + "learning_rate": 4.012085003670041e-06, + "loss": 0.0008, + "num_input_tokens_seen": 32054848, + "step": 151885 + }, + { + "epoch": 16.70957095709571, + "grad_norm": 0.15004952251911163, + "learning_rate": 4.010781061895738e-06, + "loss": 0.0055, + "num_input_tokens_seen": 32055872, + "step": 151890 + }, + { + "epoch": 16.71012101210121, + "grad_norm": 0.06635113060474396, + "learning_rate": 4.009477313569002e-06, + "loss": 0.0109, + "num_input_tokens_seen": 32056928, + "step": 151895 + }, + { + "epoch": 16.71067106710671, + "grad_norm": 0.05029341205954552, + "learning_rate": 4.008173758701833e-06, + "loss": 0.0285, + "num_input_tokens_seen": 32057952, + "step": 151900 + }, + { + "epoch": 16.71122112211221, + "grad_norm": 1.3131017684936523, + "learning_rate": 4.006870397306256e-06, + "loss": 0.1257, + "num_input_tokens_seen": 32059040, + "step": 151905 + }, + { + "epoch": 16.71177117711771, + "grad_norm": 0.07716166973114014, + "learning_rate": 4.005567229394286e-06, + "loss": 0.0038, + "num_input_tokens_seen": 32060096, + "step": 151910 + }, + { + "epoch": 16.712321232123212, + "grad_norm": 0.008378610946238041, + "learning_rate": 4.004264254977927e-06, + "loss": 0.1325, + "num_input_tokens_seen": 32061184, + "step": 151915 + }, + { + "epoch": 16.712871287128714, + "grad_norm": 0.5873852968215942, + "learning_rate": 4.002961474069186e-06, + "loss": 0.0683, + "num_input_tokens_seen": 32062336, + "step": 151920 + }, + { + "epoch": 16.713421342134215, + "grad_norm": 2.5296218395233154, + "learning_rate": 4.001658886680079e-06, + "loss": 0.0451, + "num_input_tokens_seen": 32063456, + "step": 151925 + }, + { + "epoch": 16.713971397139716, + "grad_norm": 0.007380942348390818, + "learning_rate": 4.000356492822599e-06, + "loss": 0.0746, + "num_input_tokens_seen": 32064544, + "step": 151930 + }, + { + "epoch": 16.714521452145213, + "grad_norm": 2.566310405731201, + "learning_rate": 3.999054292508758e-06, + "loss": 0.1121, + "num_input_tokens_seen": 32065632, + "step": 151935 + }, + { + "epoch": 16.715071507150714, + "grad_norm": 0.008128629066050053, + "learning_rate": 3.9977522857505615e-06, + "loss": 0.0102, + "num_input_tokens_seen": 32066688, + "step": 151940 + }, + { + "epoch": 16.715621562156215, + "grad_norm": 0.00878338236361742, + "learning_rate": 3.9964504725599986e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32067776, + "step": 151945 + }, + { + "epoch": 16.716171617161717, + "grad_norm": 0.02736862376332283, + "learning_rate": 3.995148852949079e-06, + "loss": 0.0019, + "num_input_tokens_seen": 32068800, + "step": 151950 + }, + { + "epoch": 16.716721672167218, + "grad_norm": 0.08329754322767258, + "learning_rate": 3.993847426929787e-06, + "loss": 0.0032, + "num_input_tokens_seen": 32069888, + "step": 151955 + }, + { + "epoch": 16.71727172717272, + "grad_norm": 0.03891373425722122, + "learning_rate": 3.992546194514124e-06, + "loss": 0.0091, + "num_input_tokens_seen": 32070944, + "step": 151960 + }, + { + "epoch": 16.717821782178216, + "grad_norm": 0.11776101589202881, + "learning_rate": 3.991245155714091e-06, + "loss": 0.006, + "num_input_tokens_seen": 32072032, + "step": 151965 + }, + { + "epoch": 16.718371837183717, + "grad_norm": 0.04723444581031799, + "learning_rate": 3.98994431054166e-06, + "loss": 0.0233, + "num_input_tokens_seen": 32073088, + "step": 151970 + }, + { + "epoch": 16.71892189218922, + "grad_norm": 0.9007939100265503, + "learning_rate": 3.988643659008839e-06, + "loss": 0.0073, + "num_input_tokens_seen": 32074144, + "step": 151975 + }, + { + "epoch": 16.71947194719472, + "grad_norm": 0.02525045908987522, + "learning_rate": 3.9873432011276e-06, + "loss": 0.0827, + "num_input_tokens_seen": 32075200, + "step": 151980 + }, + { + "epoch": 16.72002200220022, + "grad_norm": 0.11121194064617157, + "learning_rate": 3.9860429369099424e-06, + "loss": 0.0202, + "num_input_tokens_seen": 32076256, + "step": 151985 + }, + { + "epoch": 16.72057205720572, + "grad_norm": 0.1853971928358078, + "learning_rate": 3.984742866367838e-06, + "loss": 0.0824, + "num_input_tokens_seen": 32077312, + "step": 151990 + }, + { + "epoch": 16.721122112211223, + "grad_norm": 0.02790895476937294, + "learning_rate": 3.9834429895132755e-06, + "loss": 0.0408, + "num_input_tokens_seen": 32078368, + "step": 151995 + }, + { + "epoch": 16.72167216721672, + "grad_norm": 0.06870805472135544, + "learning_rate": 3.982143306358238e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32079392, + "step": 152000 + }, + { + "epoch": 16.72222222222222, + "grad_norm": 0.06620685756206512, + "learning_rate": 3.980843816914695e-06, + "loss": 0.0035, + "num_input_tokens_seen": 32080512, + "step": 152005 + }, + { + "epoch": 16.722772277227723, + "grad_norm": 0.3114983141422272, + "learning_rate": 3.979544521194636e-06, + "loss": 0.0797, + "num_input_tokens_seen": 32081600, + "step": 152010 + }, + { + "epoch": 16.723322332233224, + "grad_norm": 0.020374035462737083, + "learning_rate": 3.978245419210019e-06, + "loss": 0.0574, + "num_input_tokens_seen": 32082656, + "step": 152015 + }, + { + "epoch": 16.723872387238725, + "grad_norm": 2.525193929672241, + "learning_rate": 3.976946510972831e-06, + "loss": 0.0389, + "num_input_tokens_seen": 32083648, + "step": 152020 + }, + { + "epoch": 16.724422442244226, + "grad_norm": 0.012737746350467205, + "learning_rate": 3.975647796495044e-06, + "loss": 0.0023, + "num_input_tokens_seen": 32084736, + "step": 152025 + }, + { + "epoch": 16.724972497249723, + "grad_norm": 0.07003886252641678, + "learning_rate": 3.974349275788616e-06, + "loss": 0.0053, + "num_input_tokens_seen": 32085760, + "step": 152030 + }, + { + "epoch": 16.725522552255224, + "grad_norm": 0.3156791925430298, + "learning_rate": 3.973050948865525e-06, + "loss": 0.0074, + "num_input_tokens_seen": 32086784, + "step": 152035 + }, + { + "epoch": 16.726072607260726, + "grad_norm": 0.3044711947441101, + "learning_rate": 3.971752815737739e-06, + "loss": 0.0605, + "num_input_tokens_seen": 32087776, + "step": 152040 + }, + { + "epoch": 16.726622662266227, + "grad_norm": 0.01805201731622219, + "learning_rate": 3.970454876417207e-06, + "loss": 0.0029, + "num_input_tokens_seen": 32088832, + "step": 152045 + }, + { + "epoch": 16.727172717271728, + "grad_norm": 0.08018665015697479, + "learning_rate": 3.9691571309159045e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32089920, + "step": 152050 + }, + { + "epoch": 16.72772277227723, + "grad_norm": 1.8782861232757568, + "learning_rate": 3.967859579245786e-06, + "loss": 0.0716, + "num_input_tokens_seen": 32090944, + "step": 152055 + }, + { + "epoch": 16.72827282728273, + "grad_norm": 0.0946706235408783, + "learning_rate": 3.966562221418821e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32092032, + "step": 152060 + }, + { + "epoch": 16.728822882288227, + "grad_norm": 0.3397214710712433, + "learning_rate": 3.9652650574469595e-06, + "loss": 0.009, + "num_input_tokens_seen": 32093088, + "step": 152065 + }, + { + "epoch": 16.72937293729373, + "grad_norm": 0.04092784598469734, + "learning_rate": 3.963968087342152e-06, + "loss": 0.0268, + "num_input_tokens_seen": 32094112, + "step": 152070 + }, + { + "epoch": 16.72992299229923, + "grad_norm": 0.0619761124253273, + "learning_rate": 3.9626713111163535e-06, + "loss": 0.0544, + "num_input_tokens_seen": 32095168, + "step": 152075 + }, + { + "epoch": 16.73047304730473, + "grad_norm": 0.009136867709457874, + "learning_rate": 3.96137472878152e-06, + "loss": 0.0006, + "num_input_tokens_seen": 32096192, + "step": 152080 + }, + { + "epoch": 16.731023102310232, + "grad_norm": 0.030451757833361626, + "learning_rate": 3.960078340349607e-06, + "loss": 0.0108, + "num_input_tokens_seen": 32097280, + "step": 152085 + }, + { + "epoch": 16.731573157315733, + "grad_norm": 0.16740722954273224, + "learning_rate": 3.958782145832554e-06, + "loss": 0.0048, + "num_input_tokens_seen": 32098304, + "step": 152090 + }, + { + "epoch": 16.73212321232123, + "grad_norm": 0.020197471603751183, + "learning_rate": 3.957486145242306e-06, + "loss": 0.0017, + "num_input_tokens_seen": 32099296, + "step": 152095 + }, + { + "epoch": 16.73267326732673, + "grad_norm": 0.048945870250463486, + "learning_rate": 3.956190338590815e-06, + "loss": 0.0716, + "num_input_tokens_seen": 32100288, + "step": 152100 + }, + { + "epoch": 16.733223322332233, + "grad_norm": 2.079000473022461, + "learning_rate": 3.954894725890015e-06, + "loss": 0.0466, + "num_input_tokens_seen": 32101376, + "step": 152105 + }, + { + "epoch": 16.733773377337734, + "grad_norm": 0.014738153666257858, + "learning_rate": 3.953599307151851e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32102400, + "step": 152110 + }, + { + "epoch": 16.734323432343235, + "grad_norm": 0.004438171163201332, + "learning_rate": 3.952304082388269e-06, + "loss": 0.0777, + "num_input_tokens_seen": 32103392, + "step": 152115 + }, + { + "epoch": 16.734873487348736, + "grad_norm": 0.2794383764266968, + "learning_rate": 3.951009051611196e-06, + "loss": 0.0051, + "num_input_tokens_seen": 32104416, + "step": 152120 + }, + { + "epoch": 16.735423542354237, + "grad_norm": 0.027928553521633148, + "learning_rate": 3.949714214832578e-06, + "loss": 0.0048, + "num_input_tokens_seen": 32105472, + "step": 152125 + }, + { + "epoch": 16.735973597359735, + "grad_norm": 0.018114812672138214, + "learning_rate": 3.948419572064338e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32106496, + "step": 152130 + }, + { + "epoch": 16.736523652365236, + "grad_norm": 0.005911955609917641, + "learning_rate": 3.947125123318413e-06, + "loss": 0.0074, + "num_input_tokens_seen": 32107616, + "step": 152135 + }, + { + "epoch": 16.737073707370737, + "grad_norm": 0.00587024912238121, + "learning_rate": 3.945830868606737e-06, + "loss": 0.0104, + "num_input_tokens_seen": 32108640, + "step": 152140 + }, + { + "epoch": 16.737623762376238, + "grad_norm": 0.07205070555210114, + "learning_rate": 3.944536807941232e-06, + "loss": 0.0683, + "num_input_tokens_seen": 32109664, + "step": 152145 + }, + { + "epoch": 16.73817381738174, + "grad_norm": 2.263883352279663, + "learning_rate": 3.943242941333833e-06, + "loss": 0.0985, + "num_input_tokens_seen": 32110720, + "step": 152150 + }, + { + "epoch": 16.73872387238724, + "grad_norm": 0.005975139793008566, + "learning_rate": 3.941949268796457e-06, + "loss": 0.0501, + "num_input_tokens_seen": 32111808, + "step": 152155 + }, + { + "epoch": 16.739273927392738, + "grad_norm": 0.1547563374042511, + "learning_rate": 3.9406557903410275e-06, + "loss": 0.047, + "num_input_tokens_seen": 32112768, + "step": 152160 + }, + { + "epoch": 16.73982398239824, + "grad_norm": 0.42071953415870667, + "learning_rate": 3.939362505979469e-06, + "loss": 0.0038, + "num_input_tokens_seen": 32113792, + "step": 152165 + }, + { + "epoch": 16.74037403740374, + "grad_norm": 0.01685827039182186, + "learning_rate": 3.938069415723697e-06, + "loss": 0.0052, + "num_input_tokens_seen": 32114880, + "step": 152170 + }, + { + "epoch": 16.74092409240924, + "grad_norm": 1.7867474555969238, + "learning_rate": 3.936776519585641e-06, + "loss": 0.0191, + "num_input_tokens_seen": 32115968, + "step": 152175 + }, + { + "epoch": 16.741474147414742, + "grad_norm": 0.01166918221861124, + "learning_rate": 3.935483817577207e-06, + "loss": 0.0145, + "num_input_tokens_seen": 32117056, + "step": 152180 + }, + { + "epoch": 16.742024202420243, + "grad_norm": 0.007364529184997082, + "learning_rate": 3.934191309710306e-06, + "loss": 0.0174, + "num_input_tokens_seen": 32118112, + "step": 152185 + }, + { + "epoch": 16.742574257425744, + "grad_norm": 0.08533062785863876, + "learning_rate": 3.932898995996856e-06, + "loss": 0.0408, + "num_input_tokens_seen": 32119168, + "step": 152190 + }, + { + "epoch": 16.74312431243124, + "grad_norm": 1.1856701374053955, + "learning_rate": 3.931606876448765e-06, + "loss": 0.0169, + "num_input_tokens_seen": 32120256, + "step": 152195 + }, + { + "epoch": 16.743674367436743, + "grad_norm": 0.5361159443855286, + "learning_rate": 3.930314951077952e-06, + "loss": 0.1175, + "num_input_tokens_seen": 32121312, + "step": 152200 + }, + { + "epoch": 16.744224422442244, + "grad_norm": 0.0020346594974398613, + "learning_rate": 3.929023219896314e-06, + "loss": 0.0536, + "num_input_tokens_seen": 32122432, + "step": 152205 + }, + { + "epoch": 16.744774477447745, + "grad_norm": 0.10877268761396408, + "learning_rate": 3.927731682915753e-06, + "loss": 0.0436, + "num_input_tokens_seen": 32123520, + "step": 152210 + }, + { + "epoch": 16.745324532453246, + "grad_norm": 0.003227273700758815, + "learning_rate": 3.9264403401481855e-06, + "loss": 0.0114, + "num_input_tokens_seen": 32124512, + "step": 152215 + }, + { + "epoch": 16.745874587458747, + "grad_norm": 0.03007870726287365, + "learning_rate": 3.925149191605496e-06, + "loss": 0.1731, + "num_input_tokens_seen": 32125600, + "step": 152220 + }, + { + "epoch": 16.746424642464248, + "grad_norm": 0.021090267226099968, + "learning_rate": 3.923858237299596e-06, + "loss": 0.0189, + "num_input_tokens_seen": 32126656, + "step": 152225 + }, + { + "epoch": 16.746974697469746, + "grad_norm": 0.008015818893909454, + "learning_rate": 3.922567477242386e-06, + "loss": 0.0006, + "num_input_tokens_seen": 32127744, + "step": 152230 + }, + { + "epoch": 16.747524752475247, + "grad_norm": 0.03338159993290901, + "learning_rate": 3.9212769114457535e-06, + "loss": 0.0151, + "num_input_tokens_seen": 32128864, + "step": 152235 + }, + { + "epoch": 16.748074807480748, + "grad_norm": 0.02917635068297386, + "learning_rate": 3.9199865399216e-06, + "loss": 0.0008, + "num_input_tokens_seen": 32129856, + "step": 152240 + }, + { + "epoch": 16.74862486248625, + "grad_norm": 0.015154029242694378, + "learning_rate": 3.918696362681812e-06, + "loss": 0.07, + "num_input_tokens_seen": 32130848, + "step": 152245 + }, + { + "epoch": 16.74917491749175, + "grad_norm": 1.0603935718536377, + "learning_rate": 3.9174063797382836e-06, + "loss": 0.0186, + "num_input_tokens_seen": 32131904, + "step": 152250 + }, + { + "epoch": 16.74972497249725, + "grad_norm": 0.060414090752601624, + "learning_rate": 3.916116591102909e-06, + "loss": 0.0469, + "num_input_tokens_seen": 32132992, + "step": 152255 + }, + { + "epoch": 16.75027502750275, + "grad_norm": 0.05217432603240013, + "learning_rate": 3.914826996787566e-06, + "loss": 0.0075, + "num_input_tokens_seen": 32133952, + "step": 152260 + }, + { + "epoch": 16.75082508250825, + "grad_norm": 1.9206538200378418, + "learning_rate": 3.913537596804151e-06, + "loss": 0.1219, + "num_input_tokens_seen": 32135072, + "step": 152265 + }, + { + "epoch": 16.75137513751375, + "grad_norm": 0.025108827278017998, + "learning_rate": 3.912248391164533e-06, + "loss": 0.0045, + "num_input_tokens_seen": 32136096, + "step": 152270 + }, + { + "epoch": 16.751925192519252, + "grad_norm": 0.3300679922103882, + "learning_rate": 3.910959379880611e-06, + "loss": 0.0041, + "num_input_tokens_seen": 32137184, + "step": 152275 + }, + { + "epoch": 16.752475247524753, + "grad_norm": 0.0050836484879255295, + "learning_rate": 3.909670562964252e-06, + "loss": 0.001, + "num_input_tokens_seen": 32138240, + "step": 152280 + }, + { + "epoch": 16.753025302530254, + "grad_norm": 0.0187073927372694, + "learning_rate": 3.908381940427336e-06, + "loss": 0.0339, + "num_input_tokens_seen": 32139328, + "step": 152285 + }, + { + "epoch": 16.753575357535752, + "grad_norm": 0.2930673062801361, + "learning_rate": 3.907093512281751e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32140320, + "step": 152290 + }, + { + "epoch": 16.754125412541253, + "grad_norm": 0.07123195379972458, + "learning_rate": 3.90580527853936e-06, + "loss": 0.013, + "num_input_tokens_seen": 32141344, + "step": 152295 + }, + { + "epoch": 16.754675467546754, + "grad_norm": 2.251915454864502, + "learning_rate": 3.904517239212041e-06, + "loss": 0.034, + "num_input_tokens_seen": 32142368, + "step": 152300 + }, + { + "epoch": 16.755225522552255, + "grad_norm": 0.02192385494709015, + "learning_rate": 3.9032293943116665e-06, + "loss": 0.0107, + "num_input_tokens_seen": 32143392, + "step": 152305 + }, + { + "epoch": 16.755775577557756, + "grad_norm": 3.1238036155700684, + "learning_rate": 3.90194174385009e-06, + "loss": 0.1748, + "num_input_tokens_seen": 32144416, + "step": 152310 + }, + { + "epoch": 16.756325632563257, + "grad_norm": 0.045934904366731644, + "learning_rate": 3.900654287839206e-06, + "loss": 0.0023, + "num_input_tokens_seen": 32145408, + "step": 152315 + }, + { + "epoch": 16.75687568756876, + "grad_norm": 1.1479204893112183, + "learning_rate": 3.899367026290868e-06, + "loss": 0.0057, + "num_input_tokens_seen": 32146464, + "step": 152320 + }, + { + "epoch": 16.757425742574256, + "grad_norm": 0.09849170595407486, + "learning_rate": 3.898079959216933e-06, + "loss": 0.005, + "num_input_tokens_seen": 32147520, + "step": 152325 + }, + { + "epoch": 16.757975797579757, + "grad_norm": 0.04879359155893326, + "learning_rate": 3.8967930866292746e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32148576, + "step": 152330 + }, + { + "epoch": 16.758525852585258, + "grad_norm": 0.03450433909893036, + "learning_rate": 3.895506408539739e-06, + "loss": 0.0049, + "num_input_tokens_seen": 32149664, + "step": 152335 + }, + { + "epoch": 16.75907590759076, + "grad_norm": 0.007721772417426109, + "learning_rate": 3.894219924960199e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32150720, + "step": 152340 + }, + { + "epoch": 16.75962596259626, + "grad_norm": 0.011669212020933628, + "learning_rate": 3.89293363590251e-06, + "loss": 0.0137, + "num_input_tokens_seen": 32151776, + "step": 152345 + }, + { + "epoch": 16.76017601760176, + "grad_norm": 0.4019140899181366, + "learning_rate": 3.891647541378515e-06, + "loss": 0.0033, + "num_input_tokens_seen": 32152800, + "step": 152350 + }, + { + "epoch": 16.760726072607262, + "grad_norm": 0.015717927366495132, + "learning_rate": 3.890361641400087e-06, + "loss": 0.0445, + "num_input_tokens_seen": 32153856, + "step": 152355 + }, + { + "epoch": 16.76127612761276, + "grad_norm": 0.011244143359363079, + "learning_rate": 3.889075935979056e-06, + "loss": 0.0044, + "num_input_tokens_seen": 32154880, + "step": 152360 + }, + { + "epoch": 16.76182618261826, + "grad_norm": 0.008709666319191456, + "learning_rate": 3.887790425127286e-06, + "loss": 0.0721, + "num_input_tokens_seen": 32155904, + "step": 152365 + }, + { + "epoch": 16.762376237623762, + "grad_norm": 0.49053114652633667, + "learning_rate": 3.8865051088566225e-06, + "loss": 0.0776, + "num_input_tokens_seen": 32156992, + "step": 152370 + }, + { + "epoch": 16.762926292629263, + "grad_norm": 0.02043006382882595, + "learning_rate": 3.885219987178906e-06, + "loss": 0.1064, + "num_input_tokens_seen": 32158112, + "step": 152375 + }, + { + "epoch": 16.763476347634764, + "grad_norm": 1.5550739765167236, + "learning_rate": 3.883935060105992e-06, + "loss": 0.0102, + "num_input_tokens_seen": 32159168, + "step": 152380 + }, + { + "epoch": 16.764026402640265, + "grad_norm": 0.38672614097595215, + "learning_rate": 3.882650327649709e-06, + "loss": 0.0081, + "num_input_tokens_seen": 32160224, + "step": 152385 + }, + { + "epoch": 16.764576457645763, + "grad_norm": 0.01734170690178871, + "learning_rate": 3.881365789821912e-06, + "loss": 0.0326, + "num_input_tokens_seen": 32161248, + "step": 152390 + }, + { + "epoch": 16.765126512651264, + "grad_norm": 0.3747188150882721, + "learning_rate": 3.880081446634426e-06, + "loss": 0.0548, + "num_input_tokens_seen": 32162368, + "step": 152395 + }, + { + "epoch": 16.765676567656765, + "grad_norm": 0.009924369864165783, + "learning_rate": 3.878797298099096e-06, + "loss": 0.004, + "num_input_tokens_seen": 32163392, + "step": 152400 + }, + { + "epoch": 16.766226622662266, + "grad_norm": 0.010735311545431614, + "learning_rate": 3.877513344227762e-06, + "loss": 0.0886, + "num_input_tokens_seen": 32164448, + "step": 152405 + }, + { + "epoch": 16.766776677667767, + "grad_norm": 0.06371047347784042, + "learning_rate": 3.876229585032245e-06, + "loss": 0.007, + "num_input_tokens_seen": 32165440, + "step": 152410 + }, + { + "epoch": 16.76732673267327, + "grad_norm": 0.024936361238360405, + "learning_rate": 3.874946020524389e-06, + "loss": 0.0032, + "num_input_tokens_seen": 32166496, + "step": 152415 + }, + { + "epoch": 16.76787678767877, + "grad_norm": 0.0482998751103878, + "learning_rate": 3.87366265071602e-06, + "loss": 0.0044, + "num_input_tokens_seen": 32167488, + "step": 152420 + }, + { + "epoch": 16.768426842684267, + "grad_norm": 0.5231096148490906, + "learning_rate": 3.872379475618954e-06, + "loss": 0.0093, + "num_input_tokens_seen": 32168544, + "step": 152425 + }, + { + "epoch": 16.768976897689768, + "grad_norm": 0.2979987859725952, + "learning_rate": 3.871096495245039e-06, + "loss": 0.0086, + "num_input_tokens_seen": 32169600, + "step": 152430 + }, + { + "epoch": 16.76952695269527, + "grad_norm": 0.0854014977812767, + "learning_rate": 3.8698137096060845e-06, + "loss": 0.0384, + "num_input_tokens_seen": 32170656, + "step": 152435 + }, + { + "epoch": 16.77007700770077, + "grad_norm": 0.11393024027347565, + "learning_rate": 3.8685311187139225e-06, + "loss": 0.0301, + "num_input_tokens_seen": 32171680, + "step": 152440 + }, + { + "epoch": 16.77062706270627, + "grad_norm": 0.005000877659767866, + "learning_rate": 3.867248722580369e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32172768, + "step": 152445 + }, + { + "epoch": 16.771177117711773, + "grad_norm": 3.820564031600952, + "learning_rate": 3.865966521217238e-06, + "loss": 0.0521, + "num_input_tokens_seen": 32173792, + "step": 152450 + }, + { + "epoch": 16.77172717271727, + "grad_norm": 0.013465411961078644, + "learning_rate": 3.864684514636352e-06, + "loss": 0.0074, + "num_input_tokens_seen": 32174816, + "step": 152455 + }, + { + "epoch": 16.77227722772277, + "grad_norm": 0.02531198039650917, + "learning_rate": 3.863402702849528e-06, + "loss": 0.0031, + "num_input_tokens_seen": 32175872, + "step": 152460 + }, + { + "epoch": 16.772827282728272, + "grad_norm": 1.6507370471954346, + "learning_rate": 3.862121085868584e-06, + "loss": 0.0624, + "num_input_tokens_seen": 32176960, + "step": 152465 + }, + { + "epoch": 16.773377337733773, + "grad_norm": 0.0035715082194656134, + "learning_rate": 3.860839663705329e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32177984, + "step": 152470 + }, + { + "epoch": 16.773927392739274, + "grad_norm": 0.013797709718346596, + "learning_rate": 3.859558436371563e-06, + "loss": 0.001, + "num_input_tokens_seen": 32179136, + "step": 152475 + }, + { + "epoch": 16.774477447744776, + "grad_norm": 0.0013789712684229016, + "learning_rate": 3.8582774038791095e-06, + "loss": 0.0938, + "num_input_tokens_seen": 32180160, + "step": 152480 + }, + { + "epoch": 16.775027502750277, + "grad_norm": 1.8272862434387207, + "learning_rate": 3.856996566239757e-06, + "loss": 0.0698, + "num_input_tokens_seen": 32181280, + "step": 152485 + }, + { + "epoch": 16.775577557755774, + "grad_norm": 0.017056280747056007, + "learning_rate": 3.855715923465333e-06, + "loss": 0.003, + "num_input_tokens_seen": 32182368, + "step": 152490 + }, + { + "epoch": 16.776127612761275, + "grad_norm": 0.0070281983353197575, + "learning_rate": 3.854435475567628e-06, + "loss": 0.0009, + "num_input_tokens_seen": 32183392, + "step": 152495 + }, + { + "epoch": 16.776677667766776, + "grad_norm": 0.016768790781497955, + "learning_rate": 3.853155222558441e-06, + "loss": 0.003, + "num_input_tokens_seen": 32184384, + "step": 152500 + }, + { + "epoch": 16.777227722772277, + "grad_norm": 0.14141878485679626, + "learning_rate": 3.851875164449581e-06, + "loss": 0.0762, + "num_input_tokens_seen": 32185408, + "step": 152505 + }, + { + "epoch": 16.77777777777778, + "grad_norm": 0.014545464888215065, + "learning_rate": 3.850595301252832e-06, + "loss": 0.0027, + "num_input_tokens_seen": 32186464, + "step": 152510 + }, + { + "epoch": 16.77832783278328, + "grad_norm": 0.1952980011701584, + "learning_rate": 3.8493156329799965e-06, + "loss": 0.0038, + "num_input_tokens_seen": 32187488, + "step": 152515 + }, + { + "epoch": 16.778877887788777, + "grad_norm": 2.3897056579589844, + "learning_rate": 3.848036159642879e-06, + "loss": 0.041, + "num_input_tokens_seen": 32188576, + "step": 152520 + }, + { + "epoch": 16.77942794279428, + "grad_norm": 0.021684959530830383, + "learning_rate": 3.846756881253255e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32189568, + "step": 152525 + }, + { + "epoch": 16.77997799779978, + "grad_norm": 0.02216673083603382, + "learning_rate": 3.84547779782293e-06, + "loss": 0.0542, + "num_input_tokens_seen": 32190624, + "step": 152530 + }, + { + "epoch": 16.78052805280528, + "grad_norm": 0.009641085751354694, + "learning_rate": 3.8441989093636835e-06, + "loss": 0.0346, + "num_input_tokens_seen": 32191680, + "step": 152535 + }, + { + "epoch": 16.78107810781078, + "grad_norm": 0.1586533784866333, + "learning_rate": 3.8429202158872995e-06, + "loss": 0.1062, + "num_input_tokens_seen": 32192672, + "step": 152540 + }, + { + "epoch": 16.781628162816283, + "grad_norm": 0.029918156564235687, + "learning_rate": 3.841641717405567e-06, + "loss": 0.1167, + "num_input_tokens_seen": 32193760, + "step": 152545 + }, + { + "epoch": 16.782178217821784, + "grad_norm": 0.0064775715582072735, + "learning_rate": 3.840363413930267e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32194848, + "step": 152550 + }, + { + "epoch": 16.78272827282728, + "grad_norm": 0.015942147001624107, + "learning_rate": 3.839085305473195e-06, + "loss": 0.0264, + "num_input_tokens_seen": 32195936, + "step": 152555 + }, + { + "epoch": 16.783278327832782, + "grad_norm": 0.008641674183309078, + "learning_rate": 3.837807392046116e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32196992, + "step": 152560 + }, + { + "epoch": 16.783828382838283, + "grad_norm": 0.018451616168022156, + "learning_rate": 3.836529673660808e-06, + "loss": 0.0043, + "num_input_tokens_seen": 32198048, + "step": 152565 + }, + { + "epoch": 16.784378437843785, + "grad_norm": 3.201930046081543, + "learning_rate": 3.835252150329049e-06, + "loss": 0.1173, + "num_input_tokens_seen": 32199136, + "step": 152570 + }, + { + "epoch": 16.784928492849286, + "grad_norm": 0.13887657225131989, + "learning_rate": 3.8339748220626165e-06, + "loss": 0.0149, + "num_input_tokens_seen": 32200128, + "step": 152575 + }, + { + "epoch": 16.785478547854787, + "grad_norm": 0.018386198207736015, + "learning_rate": 3.832697688873288e-06, + "loss": 0.0576, + "num_input_tokens_seen": 32201280, + "step": 152580 + }, + { + "epoch": 16.786028602860284, + "grad_norm": 0.010984827764332294, + "learning_rate": 3.8314207507728265e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32202304, + "step": 152585 + }, + { + "epoch": 16.786578657865785, + "grad_norm": 0.04029873386025429, + "learning_rate": 3.830144007772998e-06, + "loss": 0.0041, + "num_input_tokens_seen": 32203392, + "step": 152590 + }, + { + "epoch": 16.787128712871286, + "grad_norm": 0.016771504655480385, + "learning_rate": 3.828867459885579e-06, + "loss": 0.0095, + "num_input_tokens_seen": 32204448, + "step": 152595 + }, + { + "epoch": 16.787678767876788, + "grad_norm": 0.11883721500635147, + "learning_rate": 3.827591107122322e-06, + "loss": 0.0069, + "num_input_tokens_seen": 32205504, + "step": 152600 + }, + { + "epoch": 16.78822882288229, + "grad_norm": 0.008583690971136093, + "learning_rate": 3.826314949494999e-06, + "loss": 0.0658, + "num_input_tokens_seen": 32206624, + "step": 152605 + }, + { + "epoch": 16.78877887788779, + "grad_norm": 0.14213162660598755, + "learning_rate": 3.825038987015378e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32207648, + "step": 152610 + }, + { + "epoch": 16.78932893289329, + "grad_norm": 2.526860237121582, + "learning_rate": 3.823763219695206e-06, + "loss": 0.0346, + "num_input_tokens_seen": 32208704, + "step": 152615 + }, + { + "epoch": 16.78987898789879, + "grad_norm": 0.008180043660104275, + "learning_rate": 3.822487647546252e-06, + "loss": 0.0046, + "num_input_tokens_seen": 32209728, + "step": 152620 + }, + { + "epoch": 16.79042904290429, + "grad_norm": 0.05966978147625923, + "learning_rate": 3.821212270580263e-06, + "loss": 0.1288, + "num_input_tokens_seen": 32210784, + "step": 152625 + }, + { + "epoch": 16.79097909790979, + "grad_norm": 0.03360545262694359, + "learning_rate": 3.819937088808997e-06, + "loss": 0.0085, + "num_input_tokens_seen": 32211744, + "step": 152630 + }, + { + "epoch": 16.79152915291529, + "grad_norm": 0.006863908376544714, + "learning_rate": 3.8186621022442135e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32212800, + "step": 152635 + }, + { + "epoch": 16.792079207920793, + "grad_norm": 9.310041427612305, + "learning_rate": 3.8173873108976525e-06, + "loss": 0.0231, + "num_input_tokens_seen": 32213856, + "step": 152640 + }, + { + "epoch": 16.792629262926294, + "grad_norm": 0.8881847858428955, + "learning_rate": 3.816112714781073e-06, + "loss": 0.1269, + "num_input_tokens_seen": 32214944, + "step": 152645 + }, + { + "epoch": 16.793179317931795, + "grad_norm": 0.018613873049616814, + "learning_rate": 3.814838313906213e-06, + "loss": 0.0049, + "num_input_tokens_seen": 32215968, + "step": 152650 + }, + { + "epoch": 16.793729372937293, + "grad_norm": 0.028207654133439064, + "learning_rate": 3.8135641082848284e-06, + "loss": 0.0011, + "num_input_tokens_seen": 32216992, + "step": 152655 + }, + { + "epoch": 16.794279427942794, + "grad_norm": 0.08335728198289871, + "learning_rate": 3.8122900979286504e-06, + "loss": 0.0098, + "num_input_tokens_seen": 32218080, + "step": 152660 + }, + { + "epoch": 16.794829482948295, + "grad_norm": 1.2979240417480469, + "learning_rate": 3.811016282849428e-06, + "loss": 0.0643, + "num_input_tokens_seen": 32219104, + "step": 152665 + }, + { + "epoch": 16.795379537953796, + "grad_norm": 3.7943105697631836, + "learning_rate": 3.8097426630589073e-06, + "loss": 0.1394, + "num_input_tokens_seen": 32220096, + "step": 152670 + }, + { + "epoch": 16.795929592959297, + "grad_norm": 0.37136971950531006, + "learning_rate": 3.8084692385688127e-06, + "loss": 0.0056, + "num_input_tokens_seen": 32221120, + "step": 152675 + }, + { + "epoch": 16.796479647964798, + "grad_norm": 1.389072060585022, + "learning_rate": 3.8071960093908955e-06, + "loss": 0.0839, + "num_input_tokens_seen": 32222144, + "step": 152680 + }, + { + "epoch": 16.797029702970296, + "grad_norm": 0.005025940015912056, + "learning_rate": 3.805922975536877e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32223168, + "step": 152685 + }, + { + "epoch": 16.797579757975797, + "grad_norm": 0.056394707411527634, + "learning_rate": 3.804650137018498e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32224288, + "step": 152690 + }, + { + "epoch": 16.798129812981298, + "grad_norm": 0.013997224159538746, + "learning_rate": 3.803377493847493e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32225312, + "step": 152695 + }, + { + "epoch": 16.7986798679868, + "grad_norm": 0.05367344990372658, + "learning_rate": 3.8021050460355867e-06, + "loss": 0.0044, + "num_input_tokens_seen": 32226464, + "step": 152700 + }, + { + "epoch": 16.7992299229923, + "grad_norm": 0.009951608255505562, + "learning_rate": 3.8008327935945027e-06, + "loss": 0.0073, + "num_input_tokens_seen": 32227552, + "step": 152705 + }, + { + "epoch": 16.7997799779978, + "grad_norm": 0.15555894374847412, + "learning_rate": 3.799560736535973e-06, + "loss": 0.1491, + "num_input_tokens_seen": 32228608, + "step": 152710 + }, + { + "epoch": 16.8003300330033, + "grad_norm": 6.252666473388672, + "learning_rate": 3.798288874871714e-06, + "loss": 0.1853, + "num_input_tokens_seen": 32229632, + "step": 152715 + }, + { + "epoch": 16.8008800880088, + "grad_norm": 0.011541224084794521, + "learning_rate": 3.7970172086134543e-06, + "loss": 0.0058, + "num_input_tokens_seen": 32230688, + "step": 152720 + }, + { + "epoch": 16.8014301430143, + "grad_norm": 0.007143412251025438, + "learning_rate": 3.795745737772918e-06, + "loss": 0.0464, + "num_input_tokens_seen": 32231712, + "step": 152725 + }, + { + "epoch": 16.801980198019802, + "grad_norm": 0.01836531050503254, + "learning_rate": 3.7944744623618107e-06, + "loss": 0.0039, + "num_input_tokens_seen": 32232800, + "step": 152730 + }, + { + "epoch": 16.802530253025303, + "grad_norm": 0.252364844083786, + "learning_rate": 3.793203382391866e-06, + "loss": 0.004, + "num_input_tokens_seen": 32233888, + "step": 152735 + }, + { + "epoch": 16.803080308030804, + "grad_norm": 0.27768903970718384, + "learning_rate": 3.791932497874781e-06, + "loss": 0.1293, + "num_input_tokens_seen": 32234912, + "step": 152740 + }, + { + "epoch": 16.803630363036305, + "grad_norm": 1.7246448993682861, + "learning_rate": 3.7906618088222796e-06, + "loss": 0.079, + "num_input_tokens_seen": 32235936, + "step": 152745 + }, + { + "epoch": 16.804180418041803, + "grad_norm": 0.8998833298683167, + "learning_rate": 3.789391315246077e-06, + "loss": 0.0145, + "num_input_tokens_seen": 32236992, + "step": 152750 + }, + { + "epoch": 16.804730473047304, + "grad_norm": 0.014317340217530727, + "learning_rate": 3.7881210171578697e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32237984, + "step": 152755 + }, + { + "epoch": 16.805280528052805, + "grad_norm": 3.0410850048065186, + "learning_rate": 3.7868509145693787e-06, + "loss": 0.1288, + "num_input_tokens_seen": 32239040, + "step": 152760 + }, + { + "epoch": 16.805830583058306, + "grad_norm": 0.02585465833544731, + "learning_rate": 3.7855810074922977e-06, + "loss": 0.0038, + "num_input_tokens_seen": 32240096, + "step": 152765 + }, + { + "epoch": 16.806380638063807, + "grad_norm": 0.05318703502416611, + "learning_rate": 3.784311295938342e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32241184, + "step": 152770 + }, + { + "epoch": 16.806930693069308, + "grad_norm": 0.11106971651315689, + "learning_rate": 3.7830417799192053e-06, + "loss": 0.0356, + "num_input_tokens_seen": 32242176, + "step": 152775 + }, + { + "epoch": 16.80748074807481, + "grad_norm": 0.06477342545986176, + "learning_rate": 3.7817724594465893e-06, + "loss": 0.1253, + "num_input_tokens_seen": 32243200, + "step": 152780 + }, + { + "epoch": 16.808030803080307, + "grad_norm": 1.1771761178970337, + "learning_rate": 3.780503334532201e-06, + "loss": 0.0071, + "num_input_tokens_seen": 32244288, + "step": 152785 + }, + { + "epoch": 16.808580858085808, + "grad_norm": 1.0165430307388306, + "learning_rate": 3.779234405187726e-06, + "loss": 0.0416, + "num_input_tokens_seen": 32245312, + "step": 152790 + }, + { + "epoch": 16.80913091309131, + "grad_norm": 0.6001543402671814, + "learning_rate": 3.777965671424871e-06, + "loss": 0.012, + "num_input_tokens_seen": 32246368, + "step": 152795 + }, + { + "epoch": 16.80968096809681, + "grad_norm": 0.058054327964782715, + "learning_rate": 3.7766971332553193e-06, + "loss": 0.0093, + "num_input_tokens_seen": 32247392, + "step": 152800 + }, + { + "epoch": 16.81023102310231, + "grad_norm": 0.004051441792398691, + "learning_rate": 3.7754287906907633e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32248480, + "step": 152805 + }, + { + "epoch": 16.810781078107812, + "grad_norm": 0.019878817722201347, + "learning_rate": 3.7741606437429024e-06, + "loss": 0.0061, + "num_input_tokens_seen": 32249536, + "step": 152810 + }, + { + "epoch": 16.81133113311331, + "grad_norm": 2.03995418548584, + "learning_rate": 3.7728926924234103e-06, + "loss": 0.0577, + "num_input_tokens_seen": 32250624, + "step": 152815 + }, + { + "epoch": 16.81188118811881, + "grad_norm": 0.49369046092033386, + "learning_rate": 3.771624936743989e-06, + "loss": 0.008, + "num_input_tokens_seen": 32251648, + "step": 152820 + }, + { + "epoch": 16.812431243124312, + "grad_norm": 0.0680319219827652, + "learning_rate": 3.7703573767163132e-06, + "loss": 0.001, + "num_input_tokens_seen": 32252640, + "step": 152825 + }, + { + "epoch": 16.812981298129813, + "grad_norm": 0.14187166094779968, + "learning_rate": 3.769090012352061e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32253696, + "step": 152830 + }, + { + "epoch": 16.813531353135314, + "grad_norm": 0.6858357191085815, + "learning_rate": 3.767822843662916e-06, + "loss": 0.043, + "num_input_tokens_seen": 32254752, + "step": 152835 + }, + { + "epoch": 16.814081408140815, + "grad_norm": 3.8956644535064697, + "learning_rate": 3.7665558706605623e-06, + "loss": 0.0094, + "num_input_tokens_seen": 32255776, + "step": 152840 + }, + { + "epoch": 16.814631463146316, + "grad_norm": 0.013898499310016632, + "learning_rate": 3.7652890933566804e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32256800, + "step": 152845 + }, + { + "epoch": 16.815181518151814, + "grad_norm": 0.062159959226846695, + "learning_rate": 3.764022511762938e-06, + "loss": 0.0191, + "num_input_tokens_seen": 32257824, + "step": 152850 + }, + { + "epoch": 16.815731573157315, + "grad_norm": 0.05215704068541527, + "learning_rate": 3.7627561258910064e-06, + "loss": 0.0353, + "num_input_tokens_seen": 32258880, + "step": 152855 + }, + { + "epoch": 16.816281628162816, + "grad_norm": 0.060389988124370575, + "learning_rate": 3.7614899357525574e-06, + "loss": 0.0022, + "num_input_tokens_seen": 32259968, + "step": 152860 + }, + { + "epoch": 16.816831683168317, + "grad_norm": 0.11304411292076111, + "learning_rate": 3.7602239413592673e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32260960, + "step": 152865 + }, + { + "epoch": 16.817381738173818, + "grad_norm": 1.9637932777404785, + "learning_rate": 3.758958142722807e-06, + "loss": 0.0177, + "num_input_tokens_seen": 32261984, + "step": 152870 + }, + { + "epoch": 16.81793179317932, + "grad_norm": 0.013387419283390045, + "learning_rate": 3.7576925398548373e-06, + "loss": 0.0006, + "num_input_tokens_seen": 32263040, + "step": 152875 + }, + { + "epoch": 16.818481848184817, + "grad_norm": 0.00735727371647954, + "learning_rate": 3.7564271327670153e-06, + "loss": 0.0037, + "num_input_tokens_seen": 32264096, + "step": 152880 + }, + { + "epoch": 16.819031903190318, + "grad_norm": 0.874745786190033, + "learning_rate": 3.7551619214710175e-06, + "loss": 0.0296, + "num_input_tokens_seen": 32265184, + "step": 152885 + }, + { + "epoch": 16.81958195819582, + "grad_norm": 0.10415302962064743, + "learning_rate": 3.7538969059784906e-06, + "loss": 0.0665, + "num_input_tokens_seen": 32266176, + "step": 152890 + }, + { + "epoch": 16.82013201320132, + "grad_norm": 0.046316735446453094, + "learning_rate": 3.752632086301103e-06, + "loss": 0.0619, + "num_input_tokens_seen": 32267200, + "step": 152895 + }, + { + "epoch": 16.82068206820682, + "grad_norm": 0.037098996341228485, + "learning_rate": 3.751367462450514e-06, + "loss": 0.0048, + "num_input_tokens_seen": 32268288, + "step": 152900 + }, + { + "epoch": 16.821232123212322, + "grad_norm": 0.004740023985505104, + "learning_rate": 3.750103034438371e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32269344, + "step": 152905 + }, + { + "epoch": 16.821782178217823, + "grad_norm": 0.014065884053707123, + "learning_rate": 3.748838802276336e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32270368, + "step": 152910 + }, + { + "epoch": 16.82233223322332, + "grad_norm": 0.16898635029792786, + "learning_rate": 3.7475747659760502e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32271424, + "step": 152915 + }, + { + "epoch": 16.822882288228822, + "grad_norm": 0.08996444195508957, + "learning_rate": 3.7463109255491736e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32272512, + "step": 152920 + }, + { + "epoch": 16.823432343234323, + "grad_norm": 0.19038823246955872, + "learning_rate": 3.7450472810073523e-06, + "loss": 0.0032, + "num_input_tokens_seen": 32273568, + "step": 152925 + }, + { + "epoch": 16.823982398239824, + "grad_norm": 0.2716180086135864, + "learning_rate": 3.743783832362227e-06, + "loss": 0.0462, + "num_input_tokens_seen": 32274560, + "step": 152930 + }, + { + "epoch": 16.824532453245325, + "grad_norm": 3.5504298210144043, + "learning_rate": 3.7425205796254525e-06, + "loss": 0.0496, + "num_input_tokens_seen": 32275648, + "step": 152935 + }, + { + "epoch": 16.825082508250826, + "grad_norm": 0.004200766794383526, + "learning_rate": 3.741257522808664e-06, + "loss": 0.0007, + "num_input_tokens_seen": 32276704, + "step": 152940 + }, + { + "epoch": 16.825632563256324, + "grad_norm": 0.020048975944519043, + "learning_rate": 3.7399946619234986e-06, + "loss": 0.0659, + "num_input_tokens_seen": 32277728, + "step": 152945 + }, + { + "epoch": 16.826182618261825, + "grad_norm": 2.357172727584839, + "learning_rate": 3.7387319969816035e-06, + "loss": 0.0456, + "num_input_tokens_seen": 32278752, + "step": 152950 + }, + { + "epoch": 16.826732673267326, + "grad_norm": 0.00773456459864974, + "learning_rate": 3.737469527994611e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32279808, + "step": 152955 + }, + { + "epoch": 16.827282728272827, + "grad_norm": 0.026586536318063736, + "learning_rate": 3.736207254974164e-06, + "loss": 0.0037, + "num_input_tokens_seen": 32280864, + "step": 152960 + }, + { + "epoch": 16.82783278327833, + "grad_norm": 0.012647976167500019, + "learning_rate": 3.7349451779318924e-06, + "loss": 0.0591, + "num_input_tokens_seen": 32281920, + "step": 152965 + }, + { + "epoch": 16.82838283828383, + "grad_norm": 0.03530154377222061, + "learning_rate": 3.733683296879423e-06, + "loss": 0.0011, + "num_input_tokens_seen": 32283008, + "step": 152970 + }, + { + "epoch": 16.82893289328933, + "grad_norm": 0.020009994506835938, + "learning_rate": 3.732421611828388e-06, + "loss": 0.0113, + "num_input_tokens_seen": 32284128, + "step": 152975 + }, + { + "epoch": 16.829482948294828, + "grad_norm": 0.030790219083428383, + "learning_rate": 3.7311601227904197e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32285152, + "step": 152980 + }, + { + "epoch": 16.83003300330033, + "grad_norm": 0.09775997698307037, + "learning_rate": 3.7298988297771455e-06, + "loss": 0.0059, + "num_input_tokens_seen": 32286176, + "step": 152985 + }, + { + "epoch": 16.83058305830583, + "grad_norm": 0.04489104449748993, + "learning_rate": 3.7286377328001914e-06, + "loss": 0.0041, + "num_input_tokens_seen": 32287264, + "step": 152990 + }, + { + "epoch": 16.83113311331133, + "grad_norm": 0.009313790127635002, + "learning_rate": 3.7273768318711683e-06, + "loss": 0.0515, + "num_input_tokens_seen": 32288320, + "step": 152995 + }, + { + "epoch": 16.831683168316832, + "grad_norm": 0.004989528562873602, + "learning_rate": 3.726116127001711e-06, + "loss": 0.0019, + "num_input_tokens_seen": 32289408, + "step": 153000 + }, + { + "epoch": 16.832233223322334, + "grad_norm": 0.5070434212684631, + "learning_rate": 3.7248556182034267e-06, + "loss": 0.1335, + "num_input_tokens_seen": 32290464, + "step": 153005 + }, + { + "epoch": 16.83278327832783, + "grad_norm": 1.817000150680542, + "learning_rate": 3.7235953054879397e-06, + "loss": 0.0296, + "num_input_tokens_seen": 32291488, + "step": 153010 + }, + { + "epoch": 16.833333333333332, + "grad_norm": 0.010649002157151699, + "learning_rate": 3.722335188866871e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32292544, + "step": 153015 + }, + { + "epoch": 16.833883388338833, + "grad_norm": 0.01694176159799099, + "learning_rate": 3.7210752683518256e-06, + "loss": 0.0004, + "num_input_tokens_seen": 32293600, + "step": 153020 + }, + { + "epoch": 16.834433443344334, + "grad_norm": 0.05164314806461334, + "learning_rate": 3.719815543954422e-06, + "loss": 0.1165, + "num_input_tokens_seen": 32294656, + "step": 153025 + }, + { + "epoch": 16.834983498349835, + "grad_norm": 0.005521573591977358, + "learning_rate": 3.7185560156862615e-06, + "loss": 0.0007, + "num_input_tokens_seen": 32295616, + "step": 153030 + }, + { + "epoch": 16.835533553355337, + "grad_norm": 0.004944789223372936, + "learning_rate": 3.71729668355896e-06, + "loss": 0.0027, + "num_input_tokens_seen": 32296672, + "step": 153035 + }, + { + "epoch": 16.836083608360838, + "grad_norm": 0.006436891388148069, + "learning_rate": 3.716037547584128e-06, + "loss": 0.004, + "num_input_tokens_seen": 32297760, + "step": 153040 + }, + { + "epoch": 16.836633663366335, + "grad_norm": 0.18700763583183289, + "learning_rate": 3.714778607773359e-06, + "loss": 0.0044, + "num_input_tokens_seen": 32298784, + "step": 153045 + }, + { + "epoch": 16.837183718371836, + "grad_norm": 0.1135970801115036, + "learning_rate": 3.7135198641382677e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32299904, + "step": 153050 + }, + { + "epoch": 16.837733773377337, + "grad_norm": 0.1185835525393486, + "learning_rate": 3.7122613166904435e-06, + "loss": 0.0067, + "num_input_tokens_seen": 32301024, + "step": 153055 + }, + { + "epoch": 16.83828382838284, + "grad_norm": 0.14750204980373383, + "learning_rate": 3.7110029654414958e-06, + "loss": 0.1429, + "num_input_tokens_seen": 32302048, + "step": 153060 + }, + { + "epoch": 16.83883388338834, + "grad_norm": 0.2825798988342285, + "learning_rate": 3.7097448104030153e-06, + "loss": 0.0037, + "num_input_tokens_seen": 32303008, + "step": 153065 + }, + { + "epoch": 16.83938393839384, + "grad_norm": 0.19520661234855652, + "learning_rate": 3.7084868515865983e-06, + "loss": 0.0118, + "num_input_tokens_seen": 32303968, + "step": 153070 + }, + { + "epoch": 16.83993399339934, + "grad_norm": 0.23653027415275574, + "learning_rate": 3.7072290890038496e-06, + "loss": 0.0052, + "num_input_tokens_seen": 32305024, + "step": 153075 + }, + { + "epoch": 16.84048404840484, + "grad_norm": 0.057452548295259476, + "learning_rate": 3.7059715226663515e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32306080, + "step": 153080 + }, + { + "epoch": 16.84103410341034, + "grad_norm": 0.052942246198654175, + "learning_rate": 3.704714152585692e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32307136, + "step": 153085 + }, + { + "epoch": 16.84158415841584, + "grad_norm": 0.3386090099811554, + "learning_rate": 3.7034569787734675e-06, + "loss": 0.002, + "num_input_tokens_seen": 32308224, + "step": 153090 + }, + { + "epoch": 16.842134213421343, + "grad_norm": 0.03163023665547371, + "learning_rate": 3.7022000012412517e-06, + "loss": 0.0073, + "num_input_tokens_seen": 32309280, + "step": 153095 + }, + { + "epoch": 16.842684268426844, + "grad_norm": 1.0249977111816406, + "learning_rate": 3.700943220000647e-06, + "loss": 0.0313, + "num_input_tokens_seen": 32310304, + "step": 153100 + }, + { + "epoch": 16.843234323432345, + "grad_norm": 0.03755169361829758, + "learning_rate": 3.699686635063232e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32311360, + "step": 153105 + }, + { + "epoch": 16.843784378437842, + "grad_norm": 0.5386151671409607, + "learning_rate": 3.6984302464405764e-06, + "loss": 0.012, + "num_input_tokens_seen": 32312448, + "step": 153110 + }, + { + "epoch": 16.844334433443343, + "grad_norm": 0.31086596846580505, + "learning_rate": 3.697174054144273e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32313568, + "step": 153115 + }, + { + "epoch": 16.844884488448844, + "grad_norm": 0.14296606183052063, + "learning_rate": 3.6959180581858904e-06, + "loss": 0.0066, + "num_input_tokens_seen": 32314688, + "step": 153120 + }, + { + "epoch": 16.845434543454346, + "grad_norm": 3.075483798980713, + "learning_rate": 3.6946622585770057e-06, + "loss": 0.0646, + "num_input_tokens_seen": 32315712, + "step": 153125 + }, + { + "epoch": 16.845984598459847, + "grad_norm": 0.032037168741226196, + "learning_rate": 3.693406655329204e-06, + "loss": 0.0007, + "num_input_tokens_seen": 32316736, + "step": 153130 + }, + { + "epoch": 16.846534653465348, + "grad_norm": 0.042553097009658813, + "learning_rate": 3.692151248454043e-06, + "loss": 0.0019, + "num_input_tokens_seen": 32317728, + "step": 153135 + }, + { + "epoch": 16.847084708470845, + "grad_norm": 0.00566213158890605, + "learning_rate": 3.690896037963104e-06, + "loss": 0.0005, + "num_input_tokens_seen": 32318784, + "step": 153140 + }, + { + "epoch": 16.847634763476346, + "grad_norm": 0.4877593219280243, + "learning_rate": 3.6896410238679484e-06, + "loss": 0.029, + "num_input_tokens_seen": 32319808, + "step": 153145 + }, + { + "epoch": 16.848184818481847, + "grad_norm": 0.030997993424534798, + "learning_rate": 3.6883862061801417e-06, + "loss": 0.0092, + "num_input_tokens_seen": 32320864, + "step": 153150 + }, + { + "epoch": 16.84873487348735, + "grad_norm": 0.013226255774497986, + "learning_rate": 3.6871315849112605e-06, + "loss": 0.0696, + "num_input_tokens_seen": 32321888, + "step": 153155 + }, + { + "epoch": 16.84928492849285, + "grad_norm": 0.0059623378328979015, + "learning_rate": 3.6858771600728566e-06, + "loss": 0.0009, + "num_input_tokens_seen": 32322912, + "step": 153160 + }, + { + "epoch": 16.84983498349835, + "grad_norm": 0.0039358604699373245, + "learning_rate": 3.6846229316764987e-06, + "loss": 0.0751, + "num_input_tokens_seen": 32323936, + "step": 153165 + }, + { + "epoch": 16.850385038503852, + "grad_norm": 0.00903023686259985, + "learning_rate": 3.6833688997337386e-06, + "loss": 0.113, + "num_input_tokens_seen": 32325024, + "step": 153170 + }, + { + "epoch": 16.85093509350935, + "grad_norm": 0.06118525564670563, + "learning_rate": 3.6821150642561448e-06, + "loss": 0.0119, + "num_input_tokens_seen": 32326048, + "step": 153175 + }, + { + "epoch": 16.85148514851485, + "grad_norm": 0.0036097662523388863, + "learning_rate": 3.680861425255261e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32327136, + "step": 153180 + }, + { + "epoch": 16.85203520352035, + "grad_norm": 0.014372015371918678, + "learning_rate": 3.6796079827426444e-06, + "loss": 0.0101, + "num_input_tokens_seen": 32328128, + "step": 153185 + }, + { + "epoch": 16.852585258525853, + "grad_norm": 0.09909675270318985, + "learning_rate": 3.678354736729861e-06, + "loss": 0.0103, + "num_input_tokens_seen": 32329184, + "step": 153190 + }, + { + "epoch": 16.853135313531354, + "grad_norm": 0.02316015213727951, + "learning_rate": 3.677101687228443e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32330208, + "step": 153195 + }, + { + "epoch": 16.853685368536855, + "grad_norm": 0.855750560760498, + "learning_rate": 3.6758488342499534e-06, + "loss": 0.0484, + "num_input_tokens_seen": 32331168, + "step": 153200 + }, + { + "epoch": 16.854235423542356, + "grad_norm": 0.005300502292811871, + "learning_rate": 3.6745961778059306e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32332224, + "step": 153205 + }, + { + "epoch": 16.854785478547853, + "grad_norm": 0.14553241431713104, + "learning_rate": 3.6733437179079123e-06, + "loss": 0.0022, + "num_input_tokens_seen": 32333344, + "step": 153210 + }, + { + "epoch": 16.855335533553355, + "grad_norm": 0.006302640773355961, + "learning_rate": 3.672091454567464e-06, + "loss": 0.0023, + "num_input_tokens_seen": 32334528, + "step": 153215 + }, + { + "epoch": 16.855885588558856, + "grad_norm": 0.0038232356309890747, + "learning_rate": 3.670839387796107e-06, + "loss": 0.0055, + "num_input_tokens_seen": 32335552, + "step": 153220 + }, + { + "epoch": 16.856435643564357, + "grad_norm": 0.0431257039308548, + "learning_rate": 3.669587517605394e-06, + "loss": 0.0076, + "num_input_tokens_seen": 32336544, + "step": 153225 + }, + { + "epoch": 16.856985698569858, + "grad_norm": 0.05007066950201988, + "learning_rate": 3.6683358440068594e-06, + "loss": 0.0365, + "num_input_tokens_seen": 32337568, + "step": 153230 + }, + { + "epoch": 16.85753575357536, + "grad_norm": 0.03381359949707985, + "learning_rate": 3.66708436701203e-06, + "loss": 0.0247, + "num_input_tokens_seen": 32338688, + "step": 153235 + }, + { + "epoch": 16.858085808580856, + "grad_norm": 0.040525276213884354, + "learning_rate": 3.6658330866324474e-06, + "loss": 0.0044, + "num_input_tokens_seen": 32339776, + "step": 153240 + }, + { + "epoch": 16.858635863586358, + "grad_norm": 0.02024991437792778, + "learning_rate": 3.6645820028796464e-06, + "loss": 0.0048, + "num_input_tokens_seen": 32340864, + "step": 153245 + }, + { + "epoch": 16.85918591859186, + "grad_norm": 0.01630803942680359, + "learning_rate": 3.6633311157651588e-06, + "loss": 0.0053, + "num_input_tokens_seen": 32341888, + "step": 153250 + }, + { + "epoch": 16.85973597359736, + "grad_norm": 0.1510186344385147, + "learning_rate": 3.6620804253005126e-06, + "loss": 0.1209, + "num_input_tokens_seen": 32342912, + "step": 153255 + }, + { + "epoch": 16.86028602860286, + "grad_norm": 0.018567444756627083, + "learning_rate": 3.6608299314972227e-06, + "loss": 0.003, + "num_input_tokens_seen": 32343968, + "step": 153260 + }, + { + "epoch": 16.860836083608362, + "grad_norm": 3.2857465744018555, + "learning_rate": 3.659579634366833e-06, + "loss": 0.0399, + "num_input_tokens_seen": 32344992, + "step": 153265 + }, + { + "epoch": 16.861386138613863, + "grad_norm": 0.008005524054169655, + "learning_rate": 3.6583295339208485e-06, + "loss": 0.0048, + "num_input_tokens_seen": 32346048, + "step": 153270 + }, + { + "epoch": 16.86193619361936, + "grad_norm": 0.026544038206338882, + "learning_rate": 3.657079630170804e-06, + "loss": 0.0784, + "num_input_tokens_seen": 32347104, + "step": 153275 + }, + { + "epoch": 16.86248624862486, + "grad_norm": 0.01723255030810833, + "learning_rate": 3.655829923128218e-06, + "loss": 0.003, + "num_input_tokens_seen": 32348192, + "step": 153280 + }, + { + "epoch": 16.863036303630363, + "grad_norm": 0.014617412351071835, + "learning_rate": 3.6545804128046006e-06, + "loss": 0.0216, + "num_input_tokens_seen": 32349248, + "step": 153285 + }, + { + "epoch": 16.863586358635864, + "grad_norm": 0.004898191429674625, + "learning_rate": 3.6533310992114792e-06, + "loss": 0.134, + "num_input_tokens_seen": 32350240, + "step": 153290 + }, + { + "epoch": 16.864136413641365, + "grad_norm": 0.08233877271413803, + "learning_rate": 3.652081982360356e-06, + "loss": 0.0049, + "num_input_tokens_seen": 32351264, + "step": 153295 + }, + { + "epoch": 16.864686468646866, + "grad_norm": 1.4187822341918945, + "learning_rate": 3.6508330622627485e-06, + "loss": 0.0641, + "num_input_tokens_seen": 32352288, + "step": 153300 + }, + { + "epoch": 16.865236523652364, + "grad_norm": 0.008013235405087471, + "learning_rate": 3.649584338930176e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32353376, + "step": 153305 + }, + { + "epoch": 16.865786578657865, + "grad_norm": 0.009832208044826984, + "learning_rate": 3.6483358123741317e-06, + "loss": 0.0032, + "num_input_tokens_seen": 32354400, + "step": 153310 + }, + { + "epoch": 16.866336633663366, + "grad_norm": 0.010284028947353363, + "learning_rate": 3.6470874826061375e-06, + "loss": 0.002, + "num_input_tokens_seen": 32355520, + "step": 153315 + }, + { + "epoch": 16.866886688668867, + "grad_norm": 0.011600720696151257, + "learning_rate": 3.645839349637692e-06, + "loss": 0.1084, + "num_input_tokens_seen": 32356544, + "step": 153320 + }, + { + "epoch": 16.867436743674368, + "grad_norm": 0.011628072708845139, + "learning_rate": 3.6445914134802942e-06, + "loss": 0.0046, + "num_input_tokens_seen": 32357600, + "step": 153325 + }, + { + "epoch": 16.86798679867987, + "grad_norm": 0.06941012293100357, + "learning_rate": 3.6433436741454467e-06, + "loss": 0.006, + "num_input_tokens_seen": 32358624, + "step": 153330 + }, + { + "epoch": 16.86853685368537, + "grad_norm": 3.1860105991363525, + "learning_rate": 3.6420961316446567e-06, + "loss": 0.0539, + "num_input_tokens_seen": 32359680, + "step": 153335 + }, + { + "epoch": 16.869086908690868, + "grad_norm": 0.038716357201337814, + "learning_rate": 3.6408487859894202e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32360736, + "step": 153340 + }, + { + "epoch": 16.86963696369637, + "grad_norm": 0.03252318874001503, + "learning_rate": 3.639601637191234e-06, + "loss": 0.124, + "num_input_tokens_seen": 32361760, + "step": 153345 + }, + { + "epoch": 16.87018701870187, + "grad_norm": 0.0033260860946029425, + "learning_rate": 3.638354685261583e-06, + "loss": 0.001, + "num_input_tokens_seen": 32362752, + "step": 153350 + }, + { + "epoch": 16.87073707370737, + "grad_norm": 0.04399716109037399, + "learning_rate": 3.6371079302119667e-06, + "loss": 0.1484, + "num_input_tokens_seen": 32363744, + "step": 153355 + }, + { + "epoch": 16.871287128712872, + "grad_norm": 0.03768568113446236, + "learning_rate": 3.6358613720538724e-06, + "loss": 0.0723, + "num_input_tokens_seen": 32364800, + "step": 153360 + }, + { + "epoch": 16.871837183718373, + "grad_norm": 0.012309867888689041, + "learning_rate": 3.6346150107988032e-06, + "loss": 0.009, + "num_input_tokens_seen": 32365856, + "step": 153365 + }, + { + "epoch": 16.87238723872387, + "grad_norm": 0.022579679265618324, + "learning_rate": 3.633368846458232e-06, + "loss": 0.001, + "num_input_tokens_seen": 32366912, + "step": 153370 + }, + { + "epoch": 16.872937293729372, + "grad_norm": 0.11955245584249496, + "learning_rate": 3.6321228790436425e-06, + "loss": 0.0143, + "num_input_tokens_seen": 32367904, + "step": 153375 + }, + { + "epoch": 16.873487348734873, + "grad_norm": 0.015847425907850266, + "learning_rate": 3.63087710856653e-06, + "loss": 0.0128, + "num_input_tokens_seen": 32368960, + "step": 153380 + }, + { + "epoch": 16.874037403740374, + "grad_norm": 0.05993322283029556, + "learning_rate": 3.6296315350383635e-06, + "loss": 0.007, + "num_input_tokens_seen": 32370016, + "step": 153385 + }, + { + "epoch": 16.874587458745875, + "grad_norm": 0.04570085182785988, + "learning_rate": 3.6283861584706287e-06, + "loss": 0.0804, + "num_input_tokens_seen": 32371072, + "step": 153390 + }, + { + "epoch": 16.875137513751376, + "grad_norm": 0.04518585279583931, + "learning_rate": 3.6271409788748073e-06, + "loss": 0.0049, + "num_input_tokens_seen": 32372160, + "step": 153395 + }, + { + "epoch": 16.875687568756877, + "grad_norm": 0.04044988751411438, + "learning_rate": 3.6258959962623686e-06, + "loss": 0.0205, + "num_input_tokens_seen": 32373248, + "step": 153400 + }, + { + "epoch": 16.876237623762375, + "grad_norm": 0.29814857244491577, + "learning_rate": 3.6246512106447946e-06, + "loss": 0.08, + "num_input_tokens_seen": 32374304, + "step": 153405 + }, + { + "epoch": 16.876787678767876, + "grad_norm": 0.006261034402996302, + "learning_rate": 3.6234066220335455e-06, + "loss": 0.0554, + "num_input_tokens_seen": 32375424, + "step": 153410 + }, + { + "epoch": 16.877337733773377, + "grad_norm": 0.0064473277889192104, + "learning_rate": 3.622162230440104e-06, + "loss": 0.0944, + "num_input_tokens_seen": 32376480, + "step": 153415 + }, + { + "epoch": 16.877887788778878, + "grad_norm": 0.1722649782896042, + "learning_rate": 3.6209180358759394e-06, + "loss": 0.048, + "num_input_tokens_seen": 32377536, + "step": 153420 + }, + { + "epoch": 16.87843784378438, + "grad_norm": 0.5847628712654114, + "learning_rate": 3.619674038352508e-06, + "loss": 0.0205, + "num_input_tokens_seen": 32378624, + "step": 153425 + }, + { + "epoch": 16.87898789878988, + "grad_norm": 0.039393600076436996, + "learning_rate": 3.6184302378812872e-06, + "loss": 0.0056, + "num_input_tokens_seen": 32379680, + "step": 153430 + }, + { + "epoch": 16.879537953795378, + "grad_norm": 0.015765363350510597, + "learning_rate": 3.6171866344737323e-06, + "loss": 0.004, + "num_input_tokens_seen": 32380800, + "step": 153435 + }, + { + "epoch": 16.88008800880088, + "grad_norm": 0.30110785365104675, + "learning_rate": 3.615943228141311e-06, + "loss": 0.005, + "num_input_tokens_seen": 32381856, + "step": 153440 + }, + { + "epoch": 16.88063806380638, + "grad_norm": 0.26844778656959534, + "learning_rate": 3.614700018895473e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32382912, + "step": 153445 + }, + { + "epoch": 16.88118811881188, + "grad_norm": 1.744978666305542, + "learning_rate": 3.6134570067476865e-06, + "loss": 0.1453, + "num_input_tokens_seen": 32384032, + "step": 153450 + }, + { + "epoch": 16.881738173817382, + "grad_norm": 0.019719868898391724, + "learning_rate": 3.6122141917094064e-06, + "loss": 0.0049, + "num_input_tokens_seen": 32385088, + "step": 153455 + }, + { + "epoch": 16.882288228822883, + "grad_norm": 0.128041610121727, + "learning_rate": 3.610971573792088e-06, + "loss": 0.0052, + "num_input_tokens_seen": 32386144, + "step": 153460 + }, + { + "epoch": 16.882838283828384, + "grad_norm": 0.005704651586711407, + "learning_rate": 3.609729153007174e-06, + "loss": 0.0103, + "num_input_tokens_seen": 32387232, + "step": 153465 + }, + { + "epoch": 16.883388338833882, + "grad_norm": 4.56197452545166, + "learning_rate": 3.6084869293661255e-06, + "loss": 0.0637, + "num_input_tokens_seen": 32388224, + "step": 153470 + }, + { + "epoch": 16.883938393839383, + "grad_norm": 0.13564784824848175, + "learning_rate": 3.6072449028803857e-06, + "loss": 0.0096, + "num_input_tokens_seen": 32389280, + "step": 153475 + }, + { + "epoch": 16.884488448844884, + "grad_norm": 0.003774333745241165, + "learning_rate": 3.606003073561409e-06, + "loss": 0.0168, + "num_input_tokens_seen": 32390368, + "step": 153480 + }, + { + "epoch": 16.885038503850385, + "grad_norm": 0.09219537675380707, + "learning_rate": 3.604761441420637e-06, + "loss": 0.0048, + "num_input_tokens_seen": 32391392, + "step": 153485 + }, + { + "epoch": 16.885588558855886, + "grad_norm": 1.659076452255249, + "learning_rate": 3.6035200064695072e-06, + "loss": 0.0072, + "num_input_tokens_seen": 32392416, + "step": 153490 + }, + { + "epoch": 16.886138613861387, + "grad_norm": 1.1054351329803467, + "learning_rate": 3.6022787687194747e-06, + "loss": 0.0462, + "num_input_tokens_seen": 32393504, + "step": 153495 + }, + { + "epoch": 16.88668866886689, + "grad_norm": 0.07727921009063721, + "learning_rate": 3.601037728181961e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32394624, + "step": 153500 + }, + { + "epoch": 16.887238723872386, + "grad_norm": 0.08701635152101517, + "learning_rate": 3.599796884868417e-06, + "loss": 0.0121, + "num_input_tokens_seen": 32395680, + "step": 153505 + }, + { + "epoch": 16.887788778877887, + "grad_norm": 0.04870983585715294, + "learning_rate": 3.598556238790282e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32396800, + "step": 153510 + }, + { + "epoch": 16.888338833883388, + "grad_norm": 4.043204307556152, + "learning_rate": 3.597315789958977e-06, + "loss": 0.0301, + "num_input_tokens_seen": 32397920, + "step": 153515 + }, + { + "epoch": 16.88888888888889, + "grad_norm": 0.012744995765388012, + "learning_rate": 3.596075538385951e-06, + "loss": 0.0531, + "num_input_tokens_seen": 32398944, + "step": 153520 + }, + { + "epoch": 16.88943894389439, + "grad_norm": 0.0388234406709671, + "learning_rate": 3.5948354840826177e-06, + "loss": 0.0123, + "num_input_tokens_seen": 32400000, + "step": 153525 + }, + { + "epoch": 16.88998899889989, + "grad_norm": 0.008706455118954182, + "learning_rate": 3.5935956270604173e-06, + "loss": 0.0576, + "num_input_tokens_seen": 32401024, + "step": 153530 + }, + { + "epoch": 16.89053905390539, + "grad_norm": 0.029680052772164345, + "learning_rate": 3.5923559673307767e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32402112, + "step": 153535 + }, + { + "epoch": 16.89108910891089, + "grad_norm": 0.17240530252456665, + "learning_rate": 3.591116504905115e-06, + "loss": 0.0048, + "num_input_tokens_seen": 32403200, + "step": 153540 + }, + { + "epoch": 16.89163916391639, + "grad_norm": 0.06761117279529572, + "learning_rate": 3.5898772397948645e-06, + "loss": 0.0009, + "num_input_tokens_seen": 32404288, + "step": 153545 + }, + { + "epoch": 16.892189218921892, + "grad_norm": 0.04042214900255203, + "learning_rate": 3.588638172011435e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32405376, + "step": 153550 + }, + { + "epoch": 16.892739273927393, + "grad_norm": 1.1915302276611328, + "learning_rate": 3.587399301566263e-06, + "loss": 0.0437, + "num_input_tokens_seen": 32406432, + "step": 153555 + }, + { + "epoch": 16.893289328932894, + "grad_norm": 0.018339237198233604, + "learning_rate": 3.586160628470747e-06, + "loss": 0.1146, + "num_input_tokens_seen": 32407424, + "step": 153560 + }, + { + "epoch": 16.893839383938392, + "grad_norm": 0.14765171706676483, + "learning_rate": 3.584922152736317e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32408480, + "step": 153565 + }, + { + "epoch": 16.894389438943893, + "grad_norm": 0.043385688215494156, + "learning_rate": 3.5836838743743854e-06, + "loss": 0.0064, + "num_input_tokens_seen": 32409536, + "step": 153570 + }, + { + "epoch": 16.894939493949394, + "grad_norm": 0.031674839556217194, + "learning_rate": 3.582445793396358e-06, + "loss": 0.0861, + "num_input_tokens_seen": 32410592, + "step": 153575 + }, + { + "epoch": 16.895489548954895, + "grad_norm": 0.38095369935035706, + "learning_rate": 3.581207909813658e-06, + "loss": 0.0112, + "num_input_tokens_seen": 32411616, + "step": 153580 + }, + { + "epoch": 16.896039603960396, + "grad_norm": 0.02278359793126583, + "learning_rate": 3.5799702236376796e-06, + "loss": 0.0049, + "num_input_tokens_seen": 32412704, + "step": 153585 + }, + { + "epoch": 16.896589658965897, + "grad_norm": 1.6766058206558228, + "learning_rate": 3.578732734879839e-06, + "loss": 0.0628, + "num_input_tokens_seen": 32413728, + "step": 153590 + }, + { + "epoch": 16.8971397139714, + "grad_norm": 0.050706785172224045, + "learning_rate": 3.5774954435515456e-06, + "loss": 0.1192, + "num_input_tokens_seen": 32414848, + "step": 153595 + }, + { + "epoch": 16.897689768976896, + "grad_norm": 3.244253635406494, + "learning_rate": 3.576258349664191e-06, + "loss": 0.089, + "num_input_tokens_seen": 32416000, + "step": 153600 + }, + { + "epoch": 16.898239823982397, + "grad_norm": 6.1416473388671875, + "learning_rate": 3.5750214532291883e-06, + "loss": 0.0803, + "num_input_tokens_seen": 32417120, + "step": 153605 + }, + { + "epoch": 16.8987898789879, + "grad_norm": 0.016923101618885994, + "learning_rate": 3.5737847542579333e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32418080, + "step": 153610 + }, + { + "epoch": 16.8993399339934, + "grad_norm": 0.028155777603387833, + "learning_rate": 3.572548252761815e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32419168, + "step": 153615 + }, + { + "epoch": 16.8998899889989, + "grad_norm": 0.014274762943387032, + "learning_rate": 3.57131194875224e-06, + "loss": 0.1465, + "num_input_tokens_seen": 32420224, + "step": 153620 + }, + { + "epoch": 16.9004400440044, + "grad_norm": 0.1078328788280487, + "learning_rate": 3.5700758422405973e-06, + "loss": 0.0052, + "num_input_tokens_seen": 32421312, + "step": 153625 + }, + { + "epoch": 16.900990099009903, + "grad_norm": 0.1291494220495224, + "learning_rate": 3.568839933238291e-06, + "loss": 0.0211, + "num_input_tokens_seen": 32422400, + "step": 153630 + }, + { + "epoch": 16.9015401540154, + "grad_norm": 0.3909144401550293, + "learning_rate": 3.5676042217567014e-06, + "loss": 0.0046, + "num_input_tokens_seen": 32423488, + "step": 153635 + }, + { + "epoch": 16.9020902090209, + "grad_norm": 0.019380751997232437, + "learning_rate": 3.566368707807216e-06, + "loss": 0.064, + "num_input_tokens_seen": 32424544, + "step": 153640 + }, + { + "epoch": 16.902640264026402, + "grad_norm": 0.11275152862071991, + "learning_rate": 3.5651333914012235e-06, + "loss": 0.0054, + "num_input_tokens_seen": 32425568, + "step": 153645 + }, + { + "epoch": 16.903190319031903, + "grad_norm": 0.11162875592708588, + "learning_rate": 3.5638982725501174e-06, + "loss": 0.1037, + "num_input_tokens_seen": 32426592, + "step": 153650 + }, + { + "epoch": 16.903740374037405, + "grad_norm": 0.07533622533082962, + "learning_rate": 3.562663351265269e-06, + "loss": 0.0047, + "num_input_tokens_seen": 32427584, + "step": 153655 + }, + { + "epoch": 16.904290429042906, + "grad_norm": 0.024150891229510307, + "learning_rate": 3.561428627558072e-06, + "loss": 0.001, + "num_input_tokens_seen": 32428544, + "step": 153660 + }, + { + "epoch": 16.904840484048403, + "grad_norm": 5.553455352783203, + "learning_rate": 3.5601941014398925e-06, + "loss": 0.0942, + "num_input_tokens_seen": 32429632, + "step": 153665 + }, + { + "epoch": 16.905390539053904, + "grad_norm": 0.12103977799415588, + "learning_rate": 3.558959772922124e-06, + "loss": 0.0029, + "num_input_tokens_seen": 32430688, + "step": 153670 + }, + { + "epoch": 16.905940594059405, + "grad_norm": 8.116814613342285, + "learning_rate": 3.55772564201613e-06, + "loss": 0.1645, + "num_input_tokens_seen": 32431680, + "step": 153675 + }, + { + "epoch": 16.906490649064907, + "grad_norm": 0.2101937234401703, + "learning_rate": 3.5564917087332867e-06, + "loss": 0.0569, + "num_input_tokens_seen": 32432736, + "step": 153680 + }, + { + "epoch": 16.907040704070408, + "grad_norm": 0.027914345264434814, + "learning_rate": 3.5552579730849773e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32433792, + "step": 153685 + }, + { + "epoch": 16.90759075907591, + "grad_norm": 0.02679152972996235, + "learning_rate": 3.554024435082559e-06, + "loss": 0.0146, + "num_input_tokens_seen": 32434848, + "step": 153690 + }, + { + "epoch": 16.90814081408141, + "grad_norm": 0.012055056169629097, + "learning_rate": 3.5527910947374123e-06, + "loss": 0.001, + "num_input_tokens_seen": 32435904, + "step": 153695 + }, + { + "epoch": 16.908690869086907, + "grad_norm": 0.11897605657577515, + "learning_rate": 3.551557952060894e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32436992, + "step": 153700 + }, + { + "epoch": 16.90924092409241, + "grad_norm": 0.027569344267249107, + "learning_rate": 3.5503250070643727e-06, + "loss": 0.0031, + "num_input_tokens_seen": 32437984, + "step": 153705 + }, + { + "epoch": 16.90979097909791, + "grad_norm": 0.4482358992099762, + "learning_rate": 3.5490922597592206e-06, + "loss": 0.0342, + "num_input_tokens_seen": 32439008, + "step": 153710 + }, + { + "epoch": 16.91034103410341, + "grad_norm": 0.562359631061554, + "learning_rate": 3.547859710156784e-06, + "loss": 0.0038, + "num_input_tokens_seen": 32440064, + "step": 153715 + }, + { + "epoch": 16.91089108910891, + "grad_norm": 0.024084176868200302, + "learning_rate": 3.5466273582684366e-06, + "loss": 0.0047, + "num_input_tokens_seen": 32441088, + "step": 153720 + }, + { + "epoch": 16.911441144114413, + "grad_norm": 0.04496026039123535, + "learning_rate": 3.545395204105534e-06, + "loss": 0.0043, + "num_input_tokens_seen": 32442080, + "step": 153725 + }, + { + "epoch": 16.91199119911991, + "grad_norm": 1.7849370241165161, + "learning_rate": 3.544163247679419e-06, + "loss": 0.0395, + "num_input_tokens_seen": 32443136, + "step": 153730 + }, + { + "epoch": 16.91254125412541, + "grad_norm": 0.06581579893827438, + "learning_rate": 3.542931489001458e-06, + "loss": 0.0048, + "num_input_tokens_seen": 32444192, + "step": 153735 + }, + { + "epoch": 16.913091309130913, + "grad_norm": 0.04665328934788704, + "learning_rate": 3.5416999280830004e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32445280, + "step": 153740 + }, + { + "epoch": 16.913641364136414, + "grad_norm": 0.009158133529126644, + "learning_rate": 3.5404685649354036e-06, + "loss": 0.0227, + "num_input_tokens_seen": 32446272, + "step": 153745 + }, + { + "epoch": 16.914191419141915, + "grad_norm": 0.5354534387588501, + "learning_rate": 3.5392373995700114e-06, + "loss": 0.0077, + "num_input_tokens_seen": 32447392, + "step": 153750 + }, + { + "epoch": 16.914741474147416, + "grad_norm": 0.008882243186235428, + "learning_rate": 3.5380064319981644e-06, + "loss": 0.0052, + "num_input_tokens_seen": 32448416, + "step": 153755 + }, + { + "epoch": 16.915291529152917, + "grad_norm": 0.045105475932359695, + "learning_rate": 3.5367756622312148e-06, + "loss": 0.1584, + "num_input_tokens_seen": 32449472, + "step": 153760 + }, + { + "epoch": 16.915841584158414, + "grad_norm": 0.28665241599082947, + "learning_rate": 3.5355450902805036e-06, + "loss": 0.0176, + "num_input_tokens_seen": 32450592, + "step": 153765 + }, + { + "epoch": 16.916391639163916, + "grad_norm": 0.02136593870818615, + "learning_rate": 3.5343147161573777e-06, + "loss": 0.0011, + "num_input_tokens_seen": 32451744, + "step": 153770 + }, + { + "epoch": 16.916941694169417, + "grad_norm": 0.20880256593227386, + "learning_rate": 3.5330845398731745e-06, + "loss": 0.0655, + "num_input_tokens_seen": 32452768, + "step": 153775 + }, + { + "epoch": 16.917491749174918, + "grad_norm": 0.04258990287780762, + "learning_rate": 3.531854561439224e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32453856, + "step": 153780 + }, + { + "epoch": 16.91804180418042, + "grad_norm": 0.3399701714515686, + "learning_rate": 3.5306247808668758e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32454944, + "step": 153785 + }, + { + "epoch": 16.91859185918592, + "grad_norm": 0.018807165324687958, + "learning_rate": 3.529395198167451e-06, + "loss": 0.0008, + "num_input_tokens_seen": 32456000, + "step": 153790 + }, + { + "epoch": 16.919141914191417, + "grad_norm": 2.9633517265319824, + "learning_rate": 3.528165813352288e-06, + "loss": 0.0592, + "num_input_tokens_seen": 32457056, + "step": 153795 + }, + { + "epoch": 16.91969196919692, + "grad_norm": 0.05104634165763855, + "learning_rate": 3.5269366264327226e-06, + "loss": 0.0037, + "num_input_tokens_seen": 32458144, + "step": 153800 + }, + { + "epoch": 16.92024202420242, + "grad_norm": 0.16765835881233215, + "learning_rate": 3.5257076374200725e-06, + "loss": 0.0252, + "num_input_tokens_seen": 32459168, + "step": 153805 + }, + { + "epoch": 16.92079207920792, + "grad_norm": 0.03492242842912674, + "learning_rate": 3.5244788463256766e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32460224, + "step": 153810 + }, + { + "epoch": 16.921342134213422, + "grad_norm": 0.06832356005907059, + "learning_rate": 3.523250253160848e-06, + "loss": 0.027, + "num_input_tokens_seen": 32461248, + "step": 153815 + }, + { + "epoch": 16.921892189218923, + "grad_norm": 0.010874244384467602, + "learning_rate": 3.522021857936916e-06, + "loss": 0.0812, + "num_input_tokens_seen": 32462336, + "step": 153820 + }, + { + "epoch": 16.922442244224424, + "grad_norm": 0.022212596610188484, + "learning_rate": 3.5207936606652115e-06, + "loss": 0.0071, + "num_input_tokens_seen": 32463424, + "step": 153825 + }, + { + "epoch": 16.92299229922992, + "grad_norm": 0.05053069069981575, + "learning_rate": 3.5195656613570354e-06, + "loss": 0.0079, + "num_input_tokens_seen": 32464512, + "step": 153830 + }, + { + "epoch": 16.923542354235423, + "grad_norm": 0.004453529138118029, + "learning_rate": 3.5183378600237237e-06, + "loss": 0.003, + "num_input_tokens_seen": 32465664, + "step": 153835 + }, + { + "epoch": 16.924092409240924, + "grad_norm": 0.06369680911302567, + "learning_rate": 3.5171102566765757e-06, + "loss": 0.0828, + "num_input_tokens_seen": 32466720, + "step": 153840 + }, + { + "epoch": 16.924642464246425, + "grad_norm": 0.0413205623626709, + "learning_rate": 3.515882851326921e-06, + "loss": 0.0132, + "num_input_tokens_seen": 32467776, + "step": 153845 + }, + { + "epoch": 16.925192519251926, + "grad_norm": 0.1742764562368393, + "learning_rate": 3.5146556439860617e-06, + "loss": 0.004, + "num_input_tokens_seen": 32468768, + "step": 153850 + }, + { + "epoch": 16.925742574257427, + "grad_norm": 0.04368354380130768, + "learning_rate": 3.5134286346653084e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32469792, + "step": 153855 + }, + { + "epoch": 16.926292629262925, + "grad_norm": 0.02408396080136299, + "learning_rate": 3.5122018233759797e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32470784, + "step": 153860 + }, + { + "epoch": 16.926842684268426, + "grad_norm": 0.016310416162014008, + "learning_rate": 3.5109752101293775e-06, + "loss": 0.0367, + "num_input_tokens_seen": 32471840, + "step": 153865 + }, + { + "epoch": 16.927392739273927, + "grad_norm": 0.006874135695397854, + "learning_rate": 3.509748794936796e-06, + "loss": 0.0143, + "num_input_tokens_seen": 32472896, + "step": 153870 + }, + { + "epoch": 16.927942794279428, + "grad_norm": 0.1483454555273056, + "learning_rate": 3.508522577809556e-06, + "loss": 0.0512, + "num_input_tokens_seen": 32473952, + "step": 153875 + }, + { + "epoch": 16.92849284928493, + "grad_norm": 0.01086813397705555, + "learning_rate": 3.5072965587589405e-06, + "loss": 0.002, + "num_input_tokens_seen": 32475008, + "step": 153880 + }, + { + "epoch": 16.92904290429043, + "grad_norm": 0.01856173947453499, + "learning_rate": 3.506070737796269e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32476096, + "step": 153885 + }, + { + "epoch": 16.92959295929593, + "grad_norm": 0.03401665762066841, + "learning_rate": 3.504845114932831e-06, + "loss": 0.0324, + "num_input_tokens_seen": 32477120, + "step": 153890 + }, + { + "epoch": 16.93014301430143, + "grad_norm": 0.002112002344802022, + "learning_rate": 3.503619690179913e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32478112, + "step": 153895 + }, + { + "epoch": 16.93069306930693, + "grad_norm": 0.011079886928200722, + "learning_rate": 3.502394463548825e-06, + "loss": 0.0481, + "num_input_tokens_seen": 32479168, + "step": 153900 + }, + { + "epoch": 16.93124312431243, + "grad_norm": 0.35557401180267334, + "learning_rate": 3.5011694350508468e-06, + "loss": 0.0056, + "num_input_tokens_seen": 32480288, + "step": 153905 + }, + { + "epoch": 16.931793179317932, + "grad_norm": 1.664747953414917, + "learning_rate": 3.499944604697272e-06, + "loss": 0.0633, + "num_input_tokens_seen": 32481376, + "step": 153910 + }, + { + "epoch": 16.932343234323433, + "grad_norm": 0.01454963255673647, + "learning_rate": 3.498719972499398e-06, + "loss": 0.0071, + "num_input_tokens_seen": 32482432, + "step": 153915 + }, + { + "epoch": 16.932893289328934, + "grad_norm": 0.017760686576366425, + "learning_rate": 3.4974955384684982e-06, + "loss": 0.0212, + "num_input_tokens_seen": 32483456, + "step": 153920 + }, + { + "epoch": 16.933443344334435, + "grad_norm": 0.06698226928710938, + "learning_rate": 3.4962713026158694e-06, + "loss": 0.0185, + "num_input_tokens_seen": 32484448, + "step": 153925 + }, + { + "epoch": 16.933993399339933, + "grad_norm": 0.2177520990371704, + "learning_rate": 3.4950472649527832e-06, + "loss": 0.0029, + "num_input_tokens_seen": 32485568, + "step": 153930 + }, + { + "epoch": 16.934543454345434, + "grad_norm": 0.07956965267658234, + "learning_rate": 3.4938234254905304e-06, + "loss": 0.0866, + "num_input_tokens_seen": 32486624, + "step": 153935 + }, + { + "epoch": 16.935093509350935, + "grad_norm": 0.005016264505684376, + "learning_rate": 3.4925997842403912e-06, + "loss": 0.0934, + "num_input_tokens_seen": 32487680, + "step": 153940 + }, + { + "epoch": 16.935643564356436, + "grad_norm": 0.040313977748155594, + "learning_rate": 3.491376341213634e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32488768, + "step": 153945 + }, + { + "epoch": 16.936193619361937, + "grad_norm": 0.053363438695669174, + "learning_rate": 3.4901530964215443e-06, + "loss": 0.0398, + "num_input_tokens_seen": 32489760, + "step": 153950 + }, + { + "epoch": 16.936743674367438, + "grad_norm": 0.04636828228831291, + "learning_rate": 3.4889300498753884e-06, + "loss": 0.0088, + "num_input_tokens_seen": 32490784, + "step": 153955 + }, + { + "epoch": 16.937293729372936, + "grad_norm": 0.2461024820804596, + "learning_rate": 3.4877072015864486e-06, + "loss": 0.0057, + "num_input_tokens_seen": 32491840, + "step": 153960 + }, + { + "epoch": 16.937843784378437, + "grad_norm": 0.07048878818750381, + "learning_rate": 3.48648455156598e-06, + "loss": 0.0073, + "num_input_tokens_seen": 32492832, + "step": 153965 + }, + { + "epoch": 16.938393839383938, + "grad_norm": 0.11655808985233307, + "learning_rate": 3.4852620998252622e-06, + "loss": 0.0149, + "num_input_tokens_seen": 32493920, + "step": 153970 + }, + { + "epoch": 16.93894389438944, + "grad_norm": 0.11113600432872772, + "learning_rate": 3.4840398463755667e-06, + "loss": 0.068, + "num_input_tokens_seen": 32494976, + "step": 153975 + }, + { + "epoch": 16.93949394939494, + "grad_norm": 0.0010844916105270386, + "learning_rate": 3.4828177912281434e-06, + "loss": 0.0007, + "num_input_tokens_seen": 32496064, + "step": 153980 + }, + { + "epoch": 16.94004400440044, + "grad_norm": 0.1315387785434723, + "learning_rate": 3.4815959343942715e-06, + "loss": 0.0067, + "num_input_tokens_seen": 32497120, + "step": 153985 + }, + { + "epoch": 16.94059405940594, + "grad_norm": 0.020462539047002792, + "learning_rate": 3.4803742758852038e-06, + "loss": 0.0087, + "num_input_tokens_seen": 32498240, + "step": 153990 + }, + { + "epoch": 16.94114411441144, + "grad_norm": 0.4981771409511566, + "learning_rate": 3.479152815712189e-06, + "loss": 0.1498, + "num_input_tokens_seen": 32499232, + "step": 153995 + }, + { + "epoch": 16.94169416941694, + "grad_norm": 0.008362485095858574, + "learning_rate": 3.477931553886507e-06, + "loss": 0.0439, + "num_input_tokens_seen": 32500256, + "step": 154000 + }, + { + "epoch": 16.942244224422442, + "grad_norm": 0.21464210748672485, + "learning_rate": 3.4767104904193963e-06, + "loss": 0.0031, + "num_input_tokens_seen": 32501312, + "step": 154005 + }, + { + "epoch": 16.942794279427943, + "grad_norm": 0.020528243854641914, + "learning_rate": 3.475489625322123e-06, + "loss": 0.0495, + "num_input_tokens_seen": 32502368, + "step": 154010 + }, + { + "epoch": 16.943344334433444, + "grad_norm": 0.1393732726573944, + "learning_rate": 3.4742689586059337e-06, + "loss": 0.0035, + "num_input_tokens_seen": 32503456, + "step": 154015 + }, + { + "epoch": 16.943894389438945, + "grad_norm": 0.055982448160648346, + "learning_rate": 3.4730484902820717e-06, + "loss": 0.0041, + "num_input_tokens_seen": 32504576, + "step": 154020 + }, + { + "epoch": 16.944444444444443, + "grad_norm": 0.02127298153936863, + "learning_rate": 3.4718282203617947e-06, + "loss": 0.02, + "num_input_tokens_seen": 32505600, + "step": 154025 + }, + { + "epoch": 16.944994499449944, + "grad_norm": 2.0464541912078857, + "learning_rate": 3.470608148856344e-06, + "loss": 0.0944, + "num_input_tokens_seen": 32506656, + "step": 154030 + }, + { + "epoch": 16.945544554455445, + "grad_norm": 0.34115397930145264, + "learning_rate": 3.4693882757769745e-06, + "loss": 0.1073, + "num_input_tokens_seen": 32507712, + "step": 154035 + }, + { + "epoch": 16.946094609460946, + "grad_norm": 0.013844587840139866, + "learning_rate": 3.4681686011349218e-06, + "loss": 0.0231, + "num_input_tokens_seen": 32508736, + "step": 154040 + }, + { + "epoch": 16.946644664466447, + "grad_norm": 0.008845696225762367, + "learning_rate": 3.466949124941421e-06, + "loss": 0.0271, + "num_input_tokens_seen": 32509760, + "step": 154045 + }, + { + "epoch": 16.94719471947195, + "grad_norm": 0.007245153654366732, + "learning_rate": 3.4657298472077214e-06, + "loss": 0.043, + "num_input_tokens_seen": 32510784, + "step": 154050 + }, + { + "epoch": 16.94774477447745, + "grad_norm": 0.08945567905902863, + "learning_rate": 3.4645107679450535e-06, + "loss": 0.0783, + "num_input_tokens_seen": 32511872, + "step": 154055 + }, + { + "epoch": 16.948294829482947, + "grad_norm": 0.0077273123897612095, + "learning_rate": 3.4632918871646546e-06, + "loss": 0.0785, + "num_input_tokens_seen": 32512960, + "step": 154060 + }, + { + "epoch": 16.948844884488448, + "grad_norm": 2.185267925262451, + "learning_rate": 3.4620732048777668e-06, + "loss": 0.1297, + "num_input_tokens_seen": 32513984, + "step": 154065 + }, + { + "epoch": 16.94939493949395, + "grad_norm": 0.0032768230885267258, + "learning_rate": 3.4608547210956107e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32514976, + "step": 154070 + }, + { + "epoch": 16.94994499449945, + "grad_norm": 3.655208110809326, + "learning_rate": 3.459636435829425e-06, + "loss": 0.0725, + "num_input_tokens_seen": 32516032, + "step": 154075 + }, + { + "epoch": 16.95049504950495, + "grad_norm": 0.015789693221449852, + "learning_rate": 3.458418349090428e-06, + "loss": 0.0035, + "num_input_tokens_seen": 32517184, + "step": 154080 + }, + { + "epoch": 16.951045104510452, + "grad_norm": 0.009274573065340519, + "learning_rate": 3.4572004608898535e-06, + "loss": 0.0884, + "num_input_tokens_seen": 32518304, + "step": 154085 + }, + { + "epoch": 16.95159515951595, + "grad_norm": 0.007234653457999229, + "learning_rate": 3.4559827712389305e-06, + "loss": 0.0008, + "num_input_tokens_seen": 32519360, + "step": 154090 + }, + { + "epoch": 16.95214521452145, + "grad_norm": 0.10007112473249435, + "learning_rate": 3.454765280148872e-06, + "loss": 0.0065, + "num_input_tokens_seen": 32520416, + "step": 154095 + }, + { + "epoch": 16.952695269526952, + "grad_norm": 0.11382510513067245, + "learning_rate": 3.453547987630909e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32521440, + "step": 154100 + }, + { + "epoch": 16.953245324532453, + "grad_norm": 0.15081055462360382, + "learning_rate": 3.452330893696254e-06, + "loss": 0.0512, + "num_input_tokens_seen": 32522592, + "step": 154105 + }, + { + "epoch": 16.953795379537954, + "grad_norm": 1.8236562013626099, + "learning_rate": 3.4511139983561202e-06, + "loss": 0.0125, + "num_input_tokens_seen": 32523648, + "step": 154110 + }, + { + "epoch": 16.954345434543455, + "grad_norm": 0.04081230238080025, + "learning_rate": 3.449897301621727e-06, + "loss": 0.0009, + "num_input_tokens_seen": 32524704, + "step": 154115 + }, + { + "epoch": 16.954895489548957, + "grad_norm": 0.09741275757551193, + "learning_rate": 3.448680803504292e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32525760, + "step": 154120 + }, + { + "epoch": 16.955445544554454, + "grad_norm": 0.3091840147972107, + "learning_rate": 3.4474645040150298e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32526816, + "step": 154125 + }, + { + "epoch": 16.955995599559955, + "grad_norm": 2.878603458404541, + "learning_rate": 3.446248403165145e-06, + "loss": 0.1267, + "num_input_tokens_seen": 32527904, + "step": 154130 + }, + { + "epoch": 16.956545654565456, + "grad_norm": 0.01324187871068716, + "learning_rate": 3.445032500965842e-06, + "loss": 0.0039, + "num_input_tokens_seen": 32529024, + "step": 154135 + }, + { + "epoch": 16.957095709570957, + "grad_norm": 0.016561396420001984, + "learning_rate": 3.4438167974283287e-06, + "loss": 0.0555, + "num_input_tokens_seen": 32530048, + "step": 154140 + }, + { + "epoch": 16.95764576457646, + "grad_norm": 0.03269343823194504, + "learning_rate": 3.442601292563813e-06, + "loss": 0.0212, + "num_input_tokens_seen": 32531104, + "step": 154145 + }, + { + "epoch": 16.95819581958196, + "grad_norm": 0.014393799938261509, + "learning_rate": 3.441385986383505e-06, + "loss": 0.057, + "num_input_tokens_seen": 32532288, + "step": 154150 + }, + { + "epoch": 16.958745874587457, + "grad_norm": 0.023751504719257355, + "learning_rate": 3.4401708788985966e-06, + "loss": 0.012, + "num_input_tokens_seen": 32533344, + "step": 154155 + }, + { + "epoch": 16.959295929592958, + "grad_norm": 0.0026158480904996395, + "learning_rate": 3.4389559701202806e-06, + "loss": 0.0007, + "num_input_tokens_seen": 32534336, + "step": 154160 + }, + { + "epoch": 16.95984598459846, + "grad_norm": 0.04724588245153427, + "learning_rate": 3.4377412600597682e-06, + "loss": 0.0127, + "num_input_tokens_seen": 32535456, + "step": 154165 + }, + { + "epoch": 16.96039603960396, + "grad_norm": 0.02027249149978161, + "learning_rate": 3.4365267487282417e-06, + "loss": 0.0215, + "num_input_tokens_seen": 32536512, + "step": 154170 + }, + { + "epoch": 16.96094609460946, + "grad_norm": 0.056481990963220596, + "learning_rate": 3.4353124361369006e-06, + "loss": 0.0033, + "num_input_tokens_seen": 32537568, + "step": 154175 + }, + { + "epoch": 16.961496149614963, + "grad_norm": 0.024450793862342834, + "learning_rate": 3.4340983222969443e-06, + "loss": 0.1048, + "num_input_tokens_seen": 32538624, + "step": 154180 + }, + { + "epoch": 16.962046204620464, + "grad_norm": 0.6262588500976562, + "learning_rate": 3.432884407219547e-06, + "loss": 0.0068, + "num_input_tokens_seen": 32539680, + "step": 154185 + }, + { + "epoch": 16.96259625962596, + "grad_norm": 0.020281802862882614, + "learning_rate": 3.431670690915914e-06, + "loss": 0.0319, + "num_input_tokens_seen": 32540672, + "step": 154190 + }, + { + "epoch": 16.963146314631462, + "grad_norm": 0.013380722142755985, + "learning_rate": 3.430457173397217e-06, + "loss": 0.0132, + "num_input_tokens_seen": 32541728, + "step": 154195 + }, + { + "epoch": 16.963696369636963, + "grad_norm": 0.041903700679540634, + "learning_rate": 3.4292438546746437e-06, + "loss": 0.0081, + "num_input_tokens_seen": 32542784, + "step": 154200 + }, + { + "epoch": 16.964246424642464, + "grad_norm": 1.9910156726837158, + "learning_rate": 3.4280307347593853e-06, + "loss": 0.0113, + "num_input_tokens_seen": 32543808, + "step": 154205 + }, + { + "epoch": 16.964796479647966, + "grad_norm": 0.021567819640040398, + "learning_rate": 3.426817813662611e-06, + "loss": 0.0007, + "num_input_tokens_seen": 32544832, + "step": 154210 + }, + { + "epoch": 16.965346534653467, + "grad_norm": 0.033886272460222244, + "learning_rate": 3.4256050913955117e-06, + "loss": 0.006, + "num_input_tokens_seen": 32545888, + "step": 154215 + }, + { + "epoch": 16.965896589658964, + "grad_norm": 0.060495566576719284, + "learning_rate": 3.4243925679692535e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32546976, + "step": 154220 + }, + { + "epoch": 16.966446644664465, + "grad_norm": 0.005804596934467554, + "learning_rate": 3.423180243395019e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32547936, + "step": 154225 + }, + { + "epoch": 16.966996699669966, + "grad_norm": 2.019191026687622, + "learning_rate": 3.421968117683974e-06, + "loss": 0.1096, + "num_input_tokens_seen": 32548960, + "step": 154230 + }, + { + "epoch": 16.967546754675467, + "grad_norm": 0.036067165434360504, + "learning_rate": 3.4207561908472934e-06, + "loss": 0.0044, + "num_input_tokens_seen": 32550048, + "step": 154235 + }, + { + "epoch": 16.96809680968097, + "grad_norm": 0.03361223265528679, + "learning_rate": 3.4195444628961572e-06, + "loss": 0.001, + "num_input_tokens_seen": 32551104, + "step": 154240 + }, + { + "epoch": 16.96864686468647, + "grad_norm": 0.010422705672681332, + "learning_rate": 3.4183329338417224e-06, + "loss": 0.0744, + "num_input_tokens_seen": 32552224, + "step": 154245 + }, + { + "epoch": 16.96919691969197, + "grad_norm": 0.07254448533058167, + "learning_rate": 3.41712160369515e-06, + "loss": 0.0702, + "num_input_tokens_seen": 32553312, + "step": 154250 + }, + { + "epoch": 16.96974697469747, + "grad_norm": 1.9806188344955444, + "learning_rate": 3.4159104724676122e-06, + "loss": 0.1296, + "num_input_tokens_seen": 32554336, + "step": 154255 + }, + { + "epoch": 16.97029702970297, + "grad_norm": 0.04255340248346329, + "learning_rate": 3.4146995401702715e-06, + "loss": 0.1339, + "num_input_tokens_seen": 32555456, + "step": 154260 + }, + { + "epoch": 16.97084708470847, + "grad_norm": 2.036229372024536, + "learning_rate": 3.413488806814294e-06, + "loss": 0.0538, + "num_input_tokens_seen": 32556480, + "step": 154265 + }, + { + "epoch": 16.97139713971397, + "grad_norm": 0.01805831864476204, + "learning_rate": 3.4122782724108295e-06, + "loss": 0.0721, + "num_input_tokens_seen": 32557504, + "step": 154270 + }, + { + "epoch": 16.971947194719473, + "grad_norm": 0.058877818286418915, + "learning_rate": 3.4110679369710326e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32558528, + "step": 154275 + }, + { + "epoch": 16.972497249724974, + "grad_norm": 0.0015583416679874063, + "learning_rate": 3.4098578005060696e-06, + "loss": 0.0009, + "num_input_tokens_seen": 32559552, + "step": 154280 + }, + { + "epoch": 16.97304730473047, + "grad_norm": 0.009870298206806183, + "learning_rate": 3.4086478630270813e-06, + "loss": 0.0051, + "num_input_tokens_seen": 32560608, + "step": 154285 + }, + { + "epoch": 16.973597359735972, + "grad_norm": 0.019117044284939766, + "learning_rate": 3.4074381245452232e-06, + "loss": 0.0096, + "num_input_tokens_seen": 32561664, + "step": 154290 + }, + { + "epoch": 16.974147414741473, + "grad_norm": 0.02206617034971714, + "learning_rate": 3.406228585071655e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32562752, + "step": 154295 + }, + { + "epoch": 16.974697469746975, + "grad_norm": 0.003942445851862431, + "learning_rate": 3.4050192446175107e-06, + "loss": 0.0651, + "num_input_tokens_seen": 32563744, + "step": 154300 + }, + { + "epoch": 16.975247524752476, + "grad_norm": 0.07168262451887131, + "learning_rate": 3.403810103193944e-06, + "loss": 0.0053, + "num_input_tokens_seen": 32564704, + "step": 154305 + }, + { + "epoch": 16.975797579757977, + "grad_norm": 0.7317763566970825, + "learning_rate": 3.402601160812094e-06, + "loss": 0.0287, + "num_input_tokens_seen": 32565792, + "step": 154310 + }, + { + "epoch": 16.976347634763478, + "grad_norm": 0.04782311990857124, + "learning_rate": 3.401392417483107e-06, + "loss": 0.062, + "num_input_tokens_seen": 32566848, + "step": 154315 + }, + { + "epoch": 16.976897689768975, + "grad_norm": 0.26491138339042664, + "learning_rate": 3.400183873218124e-06, + "loss": 0.0087, + "num_input_tokens_seen": 32567904, + "step": 154320 + }, + { + "epoch": 16.977447744774476, + "grad_norm": 0.032911721616983414, + "learning_rate": 3.398975528028278e-06, + "loss": 0.0537, + "num_input_tokens_seen": 32568960, + "step": 154325 + }, + { + "epoch": 16.977997799779978, + "grad_norm": 0.011043748818337917, + "learning_rate": 3.3977673819247157e-06, + "loss": 0.0578, + "num_input_tokens_seen": 32569952, + "step": 154330 + }, + { + "epoch": 16.97854785478548, + "grad_norm": 0.004513012245297432, + "learning_rate": 3.3965594349185588e-06, + "loss": 0.1278, + "num_input_tokens_seen": 32571008, + "step": 154335 + }, + { + "epoch": 16.97909790979098, + "grad_norm": 0.02622084878385067, + "learning_rate": 3.3953516870209534e-06, + "loss": 0.1151, + "num_input_tokens_seen": 32572160, + "step": 154340 + }, + { + "epoch": 16.97964796479648, + "grad_norm": 0.01915965974330902, + "learning_rate": 3.394144138243019e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32573248, + "step": 154345 + }, + { + "epoch": 16.980198019801982, + "grad_norm": 0.06173976510763168, + "learning_rate": 3.392936788595891e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32574304, + "step": 154350 + }, + { + "epoch": 16.98074807480748, + "grad_norm": 2.3393399715423584, + "learning_rate": 3.3917296380907014e-06, + "loss": 0.0277, + "num_input_tokens_seen": 32575392, + "step": 154355 + }, + { + "epoch": 16.98129812981298, + "grad_norm": 2.433441400527954, + "learning_rate": 3.3905226867385648e-06, + "loss": 0.102, + "num_input_tokens_seen": 32576448, + "step": 154360 + }, + { + "epoch": 16.98184818481848, + "grad_norm": 0.023582682013511658, + "learning_rate": 3.3893159345506187e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32577472, + "step": 154365 + }, + { + "epoch": 16.982398239823983, + "grad_norm": 1.5285956859588623, + "learning_rate": 3.3881093815379712e-06, + "loss": 0.1154, + "num_input_tokens_seen": 32578560, + "step": 154370 + }, + { + "epoch": 16.982948294829484, + "grad_norm": 0.08721703290939331, + "learning_rate": 3.3869030277117493e-06, + "loss": 0.0855, + "num_input_tokens_seen": 32579616, + "step": 154375 + }, + { + "epoch": 16.983498349834985, + "grad_norm": 0.006927085109055042, + "learning_rate": 3.3856968730830745e-06, + "loss": 0.0008, + "num_input_tokens_seen": 32580640, + "step": 154380 + }, + { + "epoch": 16.984048404840483, + "grad_norm": 1.3047951459884644, + "learning_rate": 3.3844909176630577e-06, + "loss": 0.0108, + "num_input_tokens_seen": 32581728, + "step": 154385 + }, + { + "epoch": 16.984598459845984, + "grad_norm": 0.11395629495382309, + "learning_rate": 3.3832851614628206e-06, + "loss": 0.0035, + "num_input_tokens_seen": 32582720, + "step": 154390 + }, + { + "epoch": 16.985148514851485, + "grad_norm": 0.7318142652511597, + "learning_rate": 3.382079604493471e-06, + "loss": 0.0085, + "num_input_tokens_seen": 32583744, + "step": 154395 + }, + { + "epoch": 16.985698569856986, + "grad_norm": 0.32877907156944275, + "learning_rate": 3.3808742467661164e-06, + "loss": 0.033, + "num_input_tokens_seen": 32584832, + "step": 154400 + }, + { + "epoch": 16.986248624862487, + "grad_norm": 0.021949350833892822, + "learning_rate": 3.3796690882918673e-06, + "loss": 0.0029, + "num_input_tokens_seen": 32585888, + "step": 154405 + }, + { + "epoch": 16.986798679867988, + "grad_norm": 1.1042025089263916, + "learning_rate": 3.3784641290818375e-06, + "loss": 0.1249, + "num_input_tokens_seen": 32586912, + "step": 154410 + }, + { + "epoch": 16.98734873487349, + "grad_norm": 0.016870465129613876, + "learning_rate": 3.3772593691471315e-06, + "loss": 0.0049, + "num_input_tokens_seen": 32587936, + "step": 154415 + }, + { + "epoch": 16.987898789878987, + "grad_norm": 0.0441058948636055, + "learning_rate": 3.3760548084988517e-06, + "loss": 0.002, + "num_input_tokens_seen": 32588960, + "step": 154420 + }, + { + "epoch": 16.988448844884488, + "grad_norm": 0.017018236219882965, + "learning_rate": 3.3748504471480923e-06, + "loss": 0.0219, + "num_input_tokens_seen": 32589984, + "step": 154425 + }, + { + "epoch": 16.98899889988999, + "grad_norm": 0.00858613196760416, + "learning_rate": 3.373646285105958e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32591072, + "step": 154430 + }, + { + "epoch": 16.98954895489549, + "grad_norm": 1.2190171480178833, + "learning_rate": 3.3724423223835565e-06, + "loss": 0.061, + "num_input_tokens_seen": 32592160, + "step": 154435 + }, + { + "epoch": 16.99009900990099, + "grad_norm": 0.011014387011528015, + "learning_rate": 3.3712385589919682e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32593312, + "step": 154440 + }, + { + "epoch": 16.990649064906492, + "grad_norm": 0.03741157427430153, + "learning_rate": 3.3700349949423034e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32594368, + "step": 154445 + }, + { + "epoch": 16.99119911991199, + "grad_norm": 0.04983639717102051, + "learning_rate": 3.3688316302456417e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32595392, + "step": 154450 + }, + { + "epoch": 16.99174917491749, + "grad_norm": 0.04132974147796631, + "learning_rate": 3.3676284649130834e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32596416, + "step": 154455 + }, + { + "epoch": 16.992299229922992, + "grad_norm": 0.015485738404095173, + "learning_rate": 3.366425498955708e-06, + "loss": 0.0075, + "num_input_tokens_seen": 32597504, + "step": 154460 + }, + { + "epoch": 16.992849284928493, + "grad_norm": 0.10327315330505371, + "learning_rate": 3.3652227323846096e-06, + "loss": 0.018, + "num_input_tokens_seen": 32598560, + "step": 154465 + }, + { + "epoch": 16.993399339933994, + "grad_norm": 0.014543440192937851, + "learning_rate": 3.364020165210874e-06, + "loss": 0.0159, + "num_input_tokens_seen": 32599584, + "step": 154470 + }, + { + "epoch": 16.993949394939495, + "grad_norm": 0.008700171485543251, + "learning_rate": 3.3628177974455803e-06, + "loss": 0.0011, + "num_input_tokens_seen": 32600672, + "step": 154475 + }, + { + "epoch": 16.994499449944996, + "grad_norm": 0.012094027362763882, + "learning_rate": 3.361615629099818e-06, + "loss": 0.0033, + "num_input_tokens_seen": 32601792, + "step": 154480 + }, + { + "epoch": 16.995049504950494, + "grad_norm": 0.04278276488184929, + "learning_rate": 3.3604136601846524e-06, + "loss": 0.0073, + "num_input_tokens_seen": 32602848, + "step": 154485 + }, + { + "epoch": 16.995599559955995, + "grad_norm": 0.07598542422056198, + "learning_rate": 3.3592118907111724e-06, + "loss": 0.0197, + "num_input_tokens_seen": 32603872, + "step": 154490 + }, + { + "epoch": 16.996149614961496, + "grad_norm": 0.025269048288464546, + "learning_rate": 3.3580103206904577e-06, + "loss": 0.0563, + "num_input_tokens_seen": 32604896, + "step": 154495 + }, + { + "epoch": 16.996699669966997, + "grad_norm": 0.015146685764193535, + "learning_rate": 3.356808950133572e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32606048, + "step": 154500 + }, + { + "epoch": 16.997249724972498, + "grad_norm": 0.0071389633230865, + "learning_rate": 3.355607779051598e-06, + "loss": 0.1071, + "num_input_tokens_seen": 32607104, + "step": 154505 + }, + { + "epoch": 16.997799779978, + "grad_norm": 0.02079484984278679, + "learning_rate": 3.354406807455601e-06, + "loss": 0.0039, + "num_input_tokens_seen": 32608160, + "step": 154510 + }, + { + "epoch": 16.998349834983497, + "grad_norm": 0.005558294244110584, + "learning_rate": 3.3532060353566457e-06, + "loss": 0.0011, + "num_input_tokens_seen": 32609280, + "step": 154515 + }, + { + "epoch": 16.998899889988998, + "grad_norm": 0.0665670856833458, + "learning_rate": 3.3520054627658033e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32610336, + "step": 154520 + }, + { + "epoch": 16.9994499449945, + "grad_norm": 0.031346645206213, + "learning_rate": 3.3508050896941366e-06, + "loss": 0.0399, + "num_input_tokens_seen": 32611456, + "step": 154525 + }, + { + "epoch": 17.0, + "grad_norm": 0.010692469775676727, + "learning_rate": 3.349604916152718e-06, + "loss": 0.1504, + "num_input_tokens_seen": 32612480, + "step": 154530 + }, + { + "epoch": 17.0, + "eval_loss": 0.07643326371908188, + "eval_runtime": 37.0516, + "eval_samples_per_second": 109.037, + "eval_steps_per_second": 27.259, + "num_input_tokens_seen": 32612480, + "step": 154530 + }, + { + "epoch": 17.0005500550055, + "grad_norm": 0.0312969833612442, + "learning_rate": 3.3484049421525997e-06, + "loss": 0.0292, + "num_input_tokens_seen": 32613504, + "step": 154535 + }, + { + "epoch": 17.001100110011002, + "grad_norm": 0.06249796599149704, + "learning_rate": 3.347205167704842e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32614592, + "step": 154540 + }, + { + "epoch": 17.001650165016503, + "grad_norm": 0.01583990454673767, + "learning_rate": 3.3460055928205003e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32615680, + "step": 154545 + }, + { + "epoch": 17.002200220022, + "grad_norm": 0.06745553016662598, + "learning_rate": 3.3448062175106378e-06, + "loss": 0.0104, + "num_input_tokens_seen": 32616736, + "step": 154550 + }, + { + "epoch": 17.002750275027502, + "grad_norm": 0.0364927239716053, + "learning_rate": 3.3436070417863092e-06, + "loss": 0.006, + "num_input_tokens_seen": 32617792, + "step": 154555 + }, + { + "epoch": 17.003300330033003, + "grad_norm": 2.0136492252349854, + "learning_rate": 3.3424080656585617e-06, + "loss": 0.0729, + "num_input_tokens_seen": 32618848, + "step": 154560 + }, + { + "epoch": 17.003850385038504, + "grad_norm": 0.07713478058576584, + "learning_rate": 3.3412092891384416e-06, + "loss": 0.0392, + "num_input_tokens_seen": 32619968, + "step": 154565 + }, + { + "epoch": 17.004400440044005, + "grad_norm": 1.6029813289642334, + "learning_rate": 3.34001071223701e-06, + "loss": 0.0409, + "num_input_tokens_seen": 32621056, + "step": 154570 + }, + { + "epoch": 17.004950495049506, + "grad_norm": 0.012414978817105293, + "learning_rate": 3.3388123349652993e-06, + "loss": 0.0217, + "num_input_tokens_seen": 32622112, + "step": 154575 + }, + { + "epoch": 17.005500550055004, + "grad_norm": 0.03360913321375847, + "learning_rate": 3.337614157334362e-06, + "loss": 0.0998, + "num_input_tokens_seen": 32623136, + "step": 154580 + }, + { + "epoch": 17.006050605060505, + "grad_norm": 1.3099488019943237, + "learning_rate": 3.336416179355245e-06, + "loss": 0.0148, + "num_input_tokens_seen": 32624192, + "step": 154585 + }, + { + "epoch": 17.006600660066006, + "grad_norm": 0.19070236384868622, + "learning_rate": 3.335218401038981e-06, + "loss": 0.0051, + "num_input_tokens_seen": 32625216, + "step": 154590 + }, + { + "epoch": 17.007150715071507, + "grad_norm": 1.549472451210022, + "learning_rate": 3.334020822396616e-06, + "loss": 0.075, + "num_input_tokens_seen": 32626304, + "step": 154595 + }, + { + "epoch": 17.007700770077008, + "grad_norm": 0.07428339868783951, + "learning_rate": 3.3328234434391816e-06, + "loss": 0.0063, + "num_input_tokens_seen": 32627328, + "step": 154600 + }, + { + "epoch": 17.00825082508251, + "grad_norm": 2.7719948291778564, + "learning_rate": 3.3316262641777147e-06, + "loss": 0.0587, + "num_input_tokens_seen": 32628416, + "step": 154605 + }, + { + "epoch": 17.00880088008801, + "grad_norm": 0.023414304479956627, + "learning_rate": 3.3304292846232573e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32629472, + "step": 154610 + }, + { + "epoch": 17.009350935093508, + "grad_norm": 2.898374557495117, + "learning_rate": 3.3292325047868305e-06, + "loss": 0.1278, + "num_input_tokens_seen": 32630496, + "step": 154615 + }, + { + "epoch": 17.00990099009901, + "grad_norm": 0.4499836266040802, + "learning_rate": 3.3280359246794733e-06, + "loss": 0.072, + "num_input_tokens_seen": 32631552, + "step": 154620 + }, + { + "epoch": 17.01045104510451, + "grad_norm": 0.010667107999324799, + "learning_rate": 3.3268395443122104e-06, + "loss": 0.0076, + "num_input_tokens_seen": 32632544, + "step": 154625 + }, + { + "epoch": 17.01100110011001, + "grad_norm": 0.18167249858379364, + "learning_rate": 3.3256433636960627e-06, + "loss": 0.049, + "num_input_tokens_seen": 32633568, + "step": 154630 + }, + { + "epoch": 17.011551155115512, + "grad_norm": 0.01812836155295372, + "learning_rate": 3.324447382842061e-06, + "loss": 0.0238, + "num_input_tokens_seen": 32634688, + "step": 154635 + }, + { + "epoch": 17.012101210121013, + "grad_norm": 0.2588762938976288, + "learning_rate": 3.3232516017612235e-06, + "loss": 0.0215, + "num_input_tokens_seen": 32635712, + "step": 154640 + }, + { + "epoch": 17.01265126512651, + "grad_norm": 0.07151281833648682, + "learning_rate": 3.322056020464581e-06, + "loss": 0.0721, + "num_input_tokens_seen": 32636736, + "step": 154645 + }, + { + "epoch": 17.013201320132012, + "grad_norm": 0.011699467897415161, + "learning_rate": 3.3208606389631467e-06, + "loss": 0.0027, + "num_input_tokens_seen": 32637792, + "step": 154650 + }, + { + "epoch": 17.013751375137513, + "grad_norm": 0.012514832429587841, + "learning_rate": 3.3196654572679315e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32638816, + "step": 154655 + }, + { + "epoch": 17.014301430143014, + "grad_norm": 0.08546805381774902, + "learning_rate": 3.318470475389962e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32639904, + "step": 154660 + }, + { + "epoch": 17.014851485148515, + "grad_norm": 3.748522996902466, + "learning_rate": 3.3172756933402356e-06, + "loss": 0.1402, + "num_input_tokens_seen": 32640992, + "step": 154665 + }, + { + "epoch": 17.015401540154016, + "grad_norm": 0.0052227480337023735, + "learning_rate": 3.3160811111297876e-06, + "loss": 0.0609, + "num_input_tokens_seen": 32642048, + "step": 154670 + }, + { + "epoch": 17.015951595159517, + "grad_norm": 0.05275560915470123, + "learning_rate": 3.3148867287696115e-06, + "loss": 0.0514, + "num_input_tokens_seen": 32643104, + "step": 154675 + }, + { + "epoch": 17.016501650165015, + "grad_norm": 0.21453511714935303, + "learning_rate": 3.3136925462707134e-06, + "loss": 0.031, + "num_input_tokens_seen": 32644128, + "step": 154680 + }, + { + "epoch": 17.017051705170516, + "grad_norm": 0.0026925657875835896, + "learning_rate": 3.312498563644115e-06, + "loss": 0.031, + "num_input_tokens_seen": 32645184, + "step": 154685 + }, + { + "epoch": 17.017601760176017, + "grad_norm": 3.125138521194458, + "learning_rate": 3.311304780900801e-06, + "loss": 0.0374, + "num_input_tokens_seen": 32646336, + "step": 154690 + }, + { + "epoch": 17.01815181518152, + "grad_norm": 0.023560969159007072, + "learning_rate": 3.310111198051785e-06, + "loss": 0.0029, + "num_input_tokens_seen": 32647456, + "step": 154695 + }, + { + "epoch": 17.01870187018702, + "grad_norm": 0.05644388124346733, + "learning_rate": 3.30891781510807e-06, + "loss": 0.1547, + "num_input_tokens_seen": 32648544, + "step": 154700 + }, + { + "epoch": 17.01925192519252, + "grad_norm": 0.006858056876808405, + "learning_rate": 3.307724632080647e-06, + "loss": 0.0042, + "num_input_tokens_seen": 32649568, + "step": 154705 + }, + { + "epoch": 17.019801980198018, + "grad_norm": 0.2325194627046585, + "learning_rate": 3.3065316489805237e-06, + "loss": 0.0359, + "num_input_tokens_seen": 32650656, + "step": 154710 + }, + { + "epoch": 17.02035203520352, + "grad_norm": 0.011267352849245071, + "learning_rate": 3.305338865818683e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32651744, + "step": 154715 + }, + { + "epoch": 17.02090209020902, + "grad_norm": 0.1622290164232254, + "learning_rate": 3.304146282606127e-06, + "loss": 0.105, + "num_input_tokens_seen": 32652736, + "step": 154720 + }, + { + "epoch": 17.02145214521452, + "grad_norm": 0.007134619168937206, + "learning_rate": 3.302953899353842e-06, + "loss": 0.0724, + "num_input_tokens_seen": 32653728, + "step": 154725 + }, + { + "epoch": 17.022002200220022, + "grad_norm": 1.2019325494766235, + "learning_rate": 3.3017617160728155e-06, + "loss": 0.0091, + "num_input_tokens_seen": 32654752, + "step": 154730 + }, + { + "epoch": 17.022552255225524, + "grad_norm": 0.20128054916858673, + "learning_rate": 3.3005697327740476e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32655744, + "step": 154735 + }, + { + "epoch": 17.023102310231025, + "grad_norm": 0.0016453906428068876, + "learning_rate": 3.29937794946851e-06, + "loss": 0.0082, + "num_input_tokens_seen": 32656768, + "step": 154740 + }, + { + "epoch": 17.023652365236522, + "grad_norm": 0.2948291301727295, + "learning_rate": 3.2981863661671996e-06, + "loss": 0.0506, + "num_input_tokens_seen": 32657856, + "step": 154745 + }, + { + "epoch": 17.024202420242023, + "grad_norm": 0.2397981584072113, + "learning_rate": 3.2969949828810855e-06, + "loss": 0.0503, + "num_input_tokens_seen": 32658944, + "step": 154750 + }, + { + "epoch": 17.024752475247524, + "grad_norm": 0.017897717654705048, + "learning_rate": 3.2958037996211527e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32660000, + "step": 154755 + }, + { + "epoch": 17.025302530253025, + "grad_norm": 0.04224969819188118, + "learning_rate": 3.29461281639839e-06, + "loss": 0.0617, + "num_input_tokens_seen": 32661056, + "step": 154760 + }, + { + "epoch": 17.025852585258527, + "grad_norm": 0.08183349668979645, + "learning_rate": 3.2934220332237584e-06, + "loss": 0.0052, + "num_input_tokens_seen": 32662144, + "step": 154765 + }, + { + "epoch": 17.026402640264028, + "grad_norm": 0.07383553683757782, + "learning_rate": 3.292231450108246e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32663200, + "step": 154770 + }, + { + "epoch": 17.02695269526953, + "grad_norm": 0.02885655127465725, + "learning_rate": 3.2910410670628222e-06, + "loss": 0.0017, + "num_input_tokens_seen": 32664224, + "step": 154775 + }, + { + "epoch": 17.027502750275026, + "grad_norm": 0.005362489260733128, + "learning_rate": 3.2898508840984415e-06, + "loss": 0.0041, + "num_input_tokens_seen": 32665312, + "step": 154780 + }, + { + "epoch": 17.028052805280527, + "grad_norm": 3.276662826538086, + "learning_rate": 3.288660901226101e-06, + "loss": 0.126, + "num_input_tokens_seen": 32666368, + "step": 154785 + }, + { + "epoch": 17.02860286028603, + "grad_norm": 0.22903375327587128, + "learning_rate": 3.2874711184567476e-06, + "loss": 0.0448, + "num_input_tokens_seen": 32667392, + "step": 154790 + }, + { + "epoch": 17.02915291529153, + "grad_norm": 0.033934347331523895, + "learning_rate": 3.286281535801361e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32668416, + "step": 154795 + }, + { + "epoch": 17.02970297029703, + "grad_norm": 0.1876448690891266, + "learning_rate": 3.2850921532708967e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32669472, + "step": 154800 + }, + { + "epoch": 17.03025302530253, + "grad_norm": 0.05936002358794212, + "learning_rate": 3.2839029708763123e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32670464, + "step": 154805 + }, + { + "epoch": 17.03080308030803, + "grad_norm": 2.44423246383667, + "learning_rate": 3.2827139886285775e-06, + "loss": 0.097, + "num_input_tokens_seen": 32671520, + "step": 154810 + }, + { + "epoch": 17.03135313531353, + "grad_norm": 0.007954691536724567, + "learning_rate": 3.2815252065386494e-06, + "loss": 0.0022, + "num_input_tokens_seen": 32672512, + "step": 154815 + }, + { + "epoch": 17.03190319031903, + "grad_norm": 0.03340848535299301, + "learning_rate": 3.2803366246174777e-06, + "loss": 0.0491, + "num_input_tokens_seen": 32673536, + "step": 154820 + }, + { + "epoch": 17.032453245324533, + "grad_norm": 0.05036370828747749, + "learning_rate": 3.279148242876026e-06, + "loss": 0.0039, + "num_input_tokens_seen": 32674560, + "step": 154825 + }, + { + "epoch": 17.033003300330034, + "grad_norm": 0.6401095390319824, + "learning_rate": 3.2779600613252357e-06, + "loss": 0.0073, + "num_input_tokens_seen": 32675648, + "step": 154830 + }, + { + "epoch": 17.033553355335535, + "grad_norm": 0.0019150658044964075, + "learning_rate": 3.2767720799760705e-06, + "loss": 0.1003, + "num_input_tokens_seen": 32676672, + "step": 154835 + }, + { + "epoch": 17.034103410341036, + "grad_norm": 0.009434974752366543, + "learning_rate": 3.275584298839468e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32677792, + "step": 154840 + }, + { + "epoch": 17.034653465346533, + "grad_norm": 0.019891714677214622, + "learning_rate": 3.274396717926381e-06, + "loss": 0.0008, + "num_input_tokens_seen": 32678912, + "step": 154845 + }, + { + "epoch": 17.035203520352034, + "grad_norm": 1.5515731573104858, + "learning_rate": 3.2732093372477594e-06, + "loss": 0.0857, + "num_input_tokens_seen": 32680000, + "step": 154850 + }, + { + "epoch": 17.035753575357536, + "grad_norm": 0.013858293183147907, + "learning_rate": 3.2720221568145387e-06, + "loss": 0.0054, + "num_input_tokens_seen": 32681088, + "step": 154855 + }, + { + "epoch": 17.036303630363037, + "grad_norm": 0.13728515803813934, + "learning_rate": 3.2708351766376657e-06, + "loss": 0.0247, + "num_input_tokens_seen": 32682112, + "step": 154860 + }, + { + "epoch": 17.036853685368538, + "grad_norm": 0.18985192477703094, + "learning_rate": 3.2696483967280754e-06, + "loss": 0.0433, + "num_input_tokens_seen": 32683072, + "step": 154865 + }, + { + "epoch": 17.03740374037404, + "grad_norm": 0.0762273445725441, + "learning_rate": 3.2684618170967075e-06, + "loss": 0.0342, + "num_input_tokens_seen": 32684128, + "step": 154870 + }, + { + "epoch": 17.037953795379536, + "grad_norm": 0.043789446353912354, + "learning_rate": 3.267275437754505e-06, + "loss": 0.002, + "num_input_tokens_seen": 32685120, + "step": 154875 + }, + { + "epoch": 17.038503850385037, + "grad_norm": 1.706199288368225, + "learning_rate": 3.2660892587123904e-06, + "loss": 0.0638, + "num_input_tokens_seen": 32686208, + "step": 154880 + }, + { + "epoch": 17.03905390539054, + "grad_norm": 0.3676161766052246, + "learning_rate": 3.264903279981307e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32687392, + "step": 154885 + }, + { + "epoch": 17.03960396039604, + "grad_norm": 0.07029686123132706, + "learning_rate": 3.2637175015721805e-06, + "loss": 0.0022, + "num_input_tokens_seen": 32688416, + "step": 154890 + }, + { + "epoch": 17.04015401540154, + "grad_norm": 0.00987032800912857, + "learning_rate": 3.2625319234959373e-06, + "loss": 0.0132, + "num_input_tokens_seen": 32689440, + "step": 154895 + }, + { + "epoch": 17.040704070407042, + "grad_norm": 0.034494683146476746, + "learning_rate": 3.261346545763502e-06, + "loss": 0.001, + "num_input_tokens_seen": 32690496, + "step": 154900 + }, + { + "epoch": 17.041254125412543, + "grad_norm": 2.6496973037719727, + "learning_rate": 3.2601613683858085e-06, + "loss": 0.0252, + "num_input_tokens_seen": 32691584, + "step": 154905 + }, + { + "epoch": 17.04180418041804, + "grad_norm": 0.05947059020400047, + "learning_rate": 3.2589763913737805e-06, + "loss": 0.0052, + "num_input_tokens_seen": 32692640, + "step": 154910 + }, + { + "epoch": 17.04235423542354, + "grad_norm": 0.09923987835645676, + "learning_rate": 3.2577916147383343e-06, + "loss": 0.004, + "num_input_tokens_seen": 32693696, + "step": 154915 + }, + { + "epoch": 17.042904290429043, + "grad_norm": 0.05060970410704613, + "learning_rate": 3.2566070384903836e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32694784, + "step": 154920 + }, + { + "epoch": 17.043454345434544, + "grad_norm": 0.011991698294878006, + "learning_rate": 3.255422662640853e-06, + "loss": 0.0016, + "num_input_tokens_seen": 32695776, + "step": 154925 + }, + { + "epoch": 17.044004400440045, + "grad_norm": 0.016517067328095436, + "learning_rate": 3.254238487200656e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32696864, + "step": 154930 + }, + { + "epoch": 17.044554455445546, + "grad_norm": 0.011823268607258797, + "learning_rate": 3.2530545121807145e-06, + "loss": 0.0017, + "num_input_tokens_seen": 32697856, + "step": 154935 + }, + { + "epoch": 17.045104510451043, + "grad_norm": 0.05015919357538223, + "learning_rate": 3.251870737591936e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32698944, + "step": 154940 + }, + { + "epoch": 17.045654565456545, + "grad_norm": 0.020274870097637177, + "learning_rate": 3.2506871634452236e-06, + "loss": 0.0365, + "num_input_tokens_seen": 32700000, + "step": 154945 + }, + { + "epoch": 17.046204620462046, + "grad_norm": 0.004586890805512667, + "learning_rate": 3.2495037897514957e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32701024, + "step": 154950 + }, + { + "epoch": 17.046754675467547, + "grad_norm": 0.12250353395938873, + "learning_rate": 3.2483206165216466e-06, + "loss": 0.0041, + "num_input_tokens_seen": 32702048, + "step": 154955 + }, + { + "epoch": 17.047304730473048, + "grad_norm": 0.022178903222084045, + "learning_rate": 3.247137643766593e-06, + "loss": 0.0023, + "num_input_tokens_seen": 32703200, + "step": 154960 + }, + { + "epoch": 17.04785478547855, + "grad_norm": 1.289892315864563, + "learning_rate": 3.2459548714972365e-06, + "loss": 0.0104, + "num_input_tokens_seen": 32704224, + "step": 154965 + }, + { + "epoch": 17.04840484048405, + "grad_norm": 0.01026725210249424, + "learning_rate": 3.2447722997244718e-06, + "loss": 0.0031, + "num_input_tokens_seen": 32705312, + "step": 154970 + }, + { + "epoch": 17.048954895489548, + "grad_norm": 1.9080750942230225, + "learning_rate": 3.2435899284592063e-06, + "loss": 0.0553, + "num_input_tokens_seen": 32706336, + "step": 154975 + }, + { + "epoch": 17.04950495049505, + "grad_norm": 0.039720259606838226, + "learning_rate": 3.2424077577123265e-06, + "loss": 0.0038, + "num_input_tokens_seen": 32707392, + "step": 154980 + }, + { + "epoch": 17.05005500550055, + "grad_norm": 0.011915387585759163, + "learning_rate": 3.2412257874947367e-06, + "loss": 0.0045, + "num_input_tokens_seen": 32708448, + "step": 154985 + }, + { + "epoch": 17.05060506050605, + "grad_norm": 0.018879611045122147, + "learning_rate": 3.2400440178173315e-06, + "loss": 0.01, + "num_input_tokens_seen": 32709472, + "step": 154990 + }, + { + "epoch": 17.051155115511552, + "grad_norm": 0.02671623043715954, + "learning_rate": 3.2388624486909936e-06, + "loss": 0.003, + "num_input_tokens_seen": 32710592, + "step": 154995 + }, + { + "epoch": 17.051705170517053, + "grad_norm": 0.020101241767406464, + "learning_rate": 3.237681080126623e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32711616, + "step": 155000 + }, + { + "epoch": 17.05225522552255, + "grad_norm": 0.2488938719034195, + "learning_rate": 3.2364999121351047e-06, + "loss": 0.0599, + "num_input_tokens_seen": 32712576, + "step": 155005 + }, + { + "epoch": 17.05280528052805, + "grad_norm": 0.15314936637878418, + "learning_rate": 3.235318944727317e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32713664, + "step": 155010 + }, + { + "epoch": 17.053355335533553, + "grad_norm": 0.040546029806137085, + "learning_rate": 3.2341381779141535e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32714752, + "step": 155015 + }, + { + "epoch": 17.053905390539054, + "grad_norm": 0.07432370632886887, + "learning_rate": 3.2329576117064914e-06, + "loss": 0.003, + "num_input_tokens_seen": 32715808, + "step": 155020 + }, + { + "epoch": 17.054455445544555, + "grad_norm": 0.008969205431640148, + "learning_rate": 3.2317772461152197e-06, + "loss": 0.002, + "num_input_tokens_seen": 32716864, + "step": 155025 + }, + { + "epoch": 17.055005500550056, + "grad_norm": 5.538593769073486, + "learning_rate": 3.2305970811512127e-06, + "loss": 0.0313, + "num_input_tokens_seen": 32717920, + "step": 155030 + }, + { + "epoch": 17.055555555555557, + "grad_norm": 0.029493188485503197, + "learning_rate": 3.2294171168253394e-06, + "loss": 0.0658, + "num_input_tokens_seen": 32719008, + "step": 155035 + }, + { + "epoch": 17.056105610561055, + "grad_norm": 0.0074409195221960545, + "learning_rate": 3.2282373531484833e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32720128, + "step": 155040 + }, + { + "epoch": 17.056655665566556, + "grad_norm": 0.40444785356521606, + "learning_rate": 3.2270577901315184e-06, + "loss": 0.004, + "num_input_tokens_seen": 32721216, + "step": 155045 + }, + { + "epoch": 17.057205720572057, + "grad_norm": 0.062135327607393265, + "learning_rate": 3.225878427785317e-06, + "loss": 0.109, + "num_input_tokens_seen": 32722240, + "step": 155050 + }, + { + "epoch": 17.057755775577558, + "grad_norm": 0.30018162727355957, + "learning_rate": 3.2246992661207453e-06, + "loss": 0.0127, + "num_input_tokens_seen": 32723296, + "step": 155055 + }, + { + "epoch": 17.05830583058306, + "grad_norm": 2.239781379699707, + "learning_rate": 3.2235203051486695e-06, + "loss": 0.2031, + "num_input_tokens_seen": 32724352, + "step": 155060 + }, + { + "epoch": 17.05885588558856, + "grad_norm": 0.02652578242123127, + "learning_rate": 3.2223415448799613e-06, + "loss": 0.0892, + "num_input_tokens_seen": 32725376, + "step": 155065 + }, + { + "epoch": 17.059405940594058, + "grad_norm": 0.01745596155524254, + "learning_rate": 3.2211629853254737e-06, + "loss": 0.0418, + "num_input_tokens_seen": 32726496, + "step": 155070 + }, + { + "epoch": 17.05995599559956, + "grad_norm": 0.07159799337387085, + "learning_rate": 3.219984626496078e-06, + "loss": 0.0108, + "num_input_tokens_seen": 32727584, + "step": 155075 + }, + { + "epoch": 17.06050605060506, + "grad_norm": 0.5950127243995667, + "learning_rate": 3.218806468402641e-06, + "loss": 0.0458, + "num_input_tokens_seen": 32728608, + "step": 155080 + }, + { + "epoch": 17.06105610561056, + "grad_norm": 0.011353028938174248, + "learning_rate": 3.2176285110560034e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32729632, + "step": 155085 + }, + { + "epoch": 17.061606160616062, + "grad_norm": 0.33484408259391785, + "learning_rate": 3.2164507544670403e-06, + "loss": 0.0083, + "num_input_tokens_seen": 32730720, + "step": 155090 + }, + { + "epoch": 17.062156215621563, + "grad_norm": 0.01672995649278164, + "learning_rate": 3.21527319864659e-06, + "loss": 0.0648, + "num_input_tokens_seen": 32731776, + "step": 155095 + }, + { + "epoch": 17.062706270627064, + "grad_norm": 0.008584958501160145, + "learning_rate": 3.2140958436055135e-06, + "loss": 0.0009, + "num_input_tokens_seen": 32732864, + "step": 155100 + }, + { + "epoch": 17.063256325632562, + "grad_norm": 1.1528810262680054, + "learning_rate": 3.2129186893546685e-06, + "loss": 0.0074, + "num_input_tokens_seen": 32733952, + "step": 155105 + }, + { + "epoch": 17.063806380638063, + "grad_norm": 0.13291914761066437, + "learning_rate": 3.211741735904891e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32735008, + "step": 155110 + }, + { + "epoch": 17.064356435643564, + "grad_norm": 0.04514166712760925, + "learning_rate": 3.210564983267042e-06, + "loss": 0.0194, + "num_input_tokens_seen": 32736064, + "step": 155115 + }, + { + "epoch": 17.064906490649065, + "grad_norm": 0.3431178331375122, + "learning_rate": 3.209388431451954e-06, + "loss": 0.0031, + "num_input_tokens_seen": 32737120, + "step": 155120 + }, + { + "epoch": 17.065456545654566, + "grad_norm": 0.041378144174814224, + "learning_rate": 3.2082120804704796e-06, + "loss": 0.0172, + "num_input_tokens_seen": 32738208, + "step": 155125 + }, + { + "epoch": 17.066006600660067, + "grad_norm": 0.017414389178156853, + "learning_rate": 3.2070359303334545e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32739264, + "step": 155130 + }, + { + "epoch": 17.066556655665565, + "grad_norm": 0.01084089931100607, + "learning_rate": 3.2058599810517233e-06, + "loss": 0.0899, + "num_input_tokens_seen": 32740256, + "step": 155135 + }, + { + "epoch": 17.067106710671066, + "grad_norm": 0.022208552807569504, + "learning_rate": 3.2046842326361264e-06, + "loss": 0.2116, + "num_input_tokens_seen": 32741344, + "step": 155140 + }, + { + "epoch": 17.067656765676567, + "grad_norm": 0.3939327597618103, + "learning_rate": 3.2035086850974892e-06, + "loss": 0.0175, + "num_input_tokens_seen": 32742464, + "step": 155145 + }, + { + "epoch": 17.068206820682068, + "grad_norm": 0.1441003382205963, + "learning_rate": 3.2023333384466615e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32743488, + "step": 155150 + }, + { + "epoch": 17.06875687568757, + "grad_norm": 0.037390463054180145, + "learning_rate": 3.201158192694462e-06, + "loss": 0.0044, + "num_input_tokens_seen": 32744544, + "step": 155155 + }, + { + "epoch": 17.06930693069307, + "grad_norm": 0.16853170096874237, + "learning_rate": 3.199983247851726e-06, + "loss": 0.014, + "num_input_tokens_seen": 32745632, + "step": 155160 + }, + { + "epoch": 17.06985698569857, + "grad_norm": 0.08007325232028961, + "learning_rate": 3.198808503929293e-06, + "loss": 0.0431, + "num_input_tokens_seen": 32746656, + "step": 155165 + }, + { + "epoch": 17.07040704070407, + "grad_norm": 0.04397173970937729, + "learning_rate": 3.197633960937971e-06, + "loss": 0.0055, + "num_input_tokens_seen": 32747712, + "step": 155170 + }, + { + "epoch": 17.07095709570957, + "grad_norm": 0.017553191632032394, + "learning_rate": 3.1964596188886033e-06, + "loss": 0.0019, + "num_input_tokens_seen": 32748768, + "step": 155175 + }, + { + "epoch": 17.07150715071507, + "grad_norm": 0.07828492671251297, + "learning_rate": 3.1952854777920036e-06, + "loss": 0.0239, + "num_input_tokens_seen": 32749760, + "step": 155180 + }, + { + "epoch": 17.072057205720572, + "grad_norm": 0.01503084134310484, + "learning_rate": 3.1941115376589894e-06, + "loss": 0.0054, + "num_input_tokens_seen": 32750848, + "step": 155185 + }, + { + "epoch": 17.072607260726073, + "grad_norm": 0.010992608033120632, + "learning_rate": 3.1929377985003865e-06, + "loss": 0.0029, + "num_input_tokens_seen": 32751904, + "step": 155190 + }, + { + "epoch": 17.073157315731574, + "grad_norm": 0.02953209914267063, + "learning_rate": 3.1917642603270153e-06, + "loss": 0.0351, + "num_input_tokens_seen": 32752896, + "step": 155195 + }, + { + "epoch": 17.073707370737075, + "grad_norm": 0.0829240009188652, + "learning_rate": 3.1905909231496832e-06, + "loss": 0.0296, + "num_input_tokens_seen": 32753984, + "step": 155200 + }, + { + "epoch": 17.074257425742573, + "grad_norm": 2.3212664127349854, + "learning_rate": 3.189417786979218e-06, + "loss": 0.03, + "num_input_tokens_seen": 32755040, + "step": 155205 + }, + { + "epoch": 17.074807480748074, + "grad_norm": 0.025835588574409485, + "learning_rate": 3.1882448518264136e-06, + "loss": 0.0058, + "num_input_tokens_seen": 32756096, + "step": 155210 + }, + { + "epoch": 17.075357535753575, + "grad_norm": 0.05674891173839569, + "learning_rate": 3.1870721177020945e-06, + "loss": 0.0081, + "num_input_tokens_seen": 32757088, + "step": 155215 + }, + { + "epoch": 17.075907590759076, + "grad_norm": 0.029876338317990303, + "learning_rate": 3.1858995846170665e-06, + "loss": 0.0208, + "num_input_tokens_seen": 32758144, + "step": 155220 + }, + { + "epoch": 17.076457645764577, + "grad_norm": 0.049857232719659805, + "learning_rate": 3.1847272525821314e-06, + "loss": 0.0226, + "num_input_tokens_seen": 32759168, + "step": 155225 + }, + { + "epoch": 17.07700770077008, + "grad_norm": 0.016928331926465034, + "learning_rate": 3.1835551216081007e-06, + "loss": 0.0539, + "num_input_tokens_seen": 32760224, + "step": 155230 + }, + { + "epoch": 17.077557755775576, + "grad_norm": 2.5298914909362793, + "learning_rate": 3.1823831917057706e-06, + "loss": 0.0143, + "num_input_tokens_seen": 32761248, + "step": 155235 + }, + { + "epoch": 17.078107810781077, + "grad_norm": 0.009759422391653061, + "learning_rate": 3.1812114628859495e-06, + "loss": 0.0381, + "num_input_tokens_seen": 32762272, + "step": 155240 + }, + { + "epoch": 17.078657865786578, + "grad_norm": 0.07725739479064941, + "learning_rate": 3.180039935159429e-06, + "loss": 0.004, + "num_input_tokens_seen": 32763328, + "step": 155245 + }, + { + "epoch": 17.07920792079208, + "grad_norm": 0.6081969738006592, + "learning_rate": 3.1788686085370085e-06, + "loss": 0.0088, + "num_input_tokens_seen": 32764384, + "step": 155250 + }, + { + "epoch": 17.07975797579758, + "grad_norm": 0.04364801570773125, + "learning_rate": 3.1776974830294903e-06, + "loss": 0.003, + "num_input_tokens_seen": 32765440, + "step": 155255 + }, + { + "epoch": 17.08030803080308, + "grad_norm": 0.008516640402376652, + "learning_rate": 3.176526558647658e-06, + "loss": 0.1193, + "num_input_tokens_seen": 32766432, + "step": 155260 + }, + { + "epoch": 17.080858085808583, + "grad_norm": 1.98699152469635, + "learning_rate": 3.175355835402316e-06, + "loss": 0.0148, + "num_input_tokens_seen": 32767488, + "step": 155265 + }, + { + "epoch": 17.08140814081408, + "grad_norm": 2.456225872039795, + "learning_rate": 3.17418531330424e-06, + "loss": 0.0195, + "num_input_tokens_seen": 32768448, + "step": 155270 + }, + { + "epoch": 17.08195819581958, + "grad_norm": 0.09616030752658844, + "learning_rate": 3.1730149923642234e-06, + "loss": 0.0092, + "num_input_tokens_seen": 32769472, + "step": 155275 + }, + { + "epoch": 17.082508250825082, + "grad_norm": 0.4161761701107025, + "learning_rate": 3.1718448725930607e-06, + "loss": 0.0061, + "num_input_tokens_seen": 32770528, + "step": 155280 + }, + { + "epoch": 17.083058305830583, + "grad_norm": 0.028347468003630638, + "learning_rate": 3.1706749540015234e-06, + "loss": 0.0051, + "num_input_tokens_seen": 32771584, + "step": 155285 + }, + { + "epoch": 17.083608360836084, + "grad_norm": 0.08678145706653595, + "learning_rate": 3.1695052366004064e-06, + "loss": 0.0109, + "num_input_tokens_seen": 32772672, + "step": 155290 + }, + { + "epoch": 17.084158415841586, + "grad_norm": 0.016314567998051643, + "learning_rate": 3.1683357204004837e-06, + "loss": 0.0019, + "num_input_tokens_seen": 32773728, + "step": 155295 + }, + { + "epoch": 17.084708470847083, + "grad_norm": 0.02362130582332611, + "learning_rate": 3.1671664054125304e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32774752, + "step": 155300 + }, + { + "epoch": 17.085258525852584, + "grad_norm": 0.07024861872196198, + "learning_rate": 3.1659972916473295e-06, + "loss": 0.003, + "num_input_tokens_seen": 32775840, + "step": 155305 + }, + { + "epoch": 17.085808580858085, + "grad_norm": 3.227626323699951, + "learning_rate": 3.164828379115653e-06, + "loss": 0.0711, + "num_input_tokens_seen": 32776896, + "step": 155310 + }, + { + "epoch": 17.086358635863586, + "grad_norm": 0.023011766374111176, + "learning_rate": 3.163659667828281e-06, + "loss": 0.0114, + "num_input_tokens_seen": 32777984, + "step": 155315 + }, + { + "epoch": 17.086908690869087, + "grad_norm": 0.6849150657653809, + "learning_rate": 3.162491157795977e-06, + "loss": 0.009, + "num_input_tokens_seen": 32779072, + "step": 155320 + }, + { + "epoch": 17.08745874587459, + "grad_norm": 0.00464974157512188, + "learning_rate": 3.161322849029513e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32780160, + "step": 155325 + }, + { + "epoch": 17.08800880088009, + "grad_norm": 3.7938146591186523, + "learning_rate": 3.160154741539653e-06, + "loss": 0.1106, + "num_input_tokens_seen": 32781120, + "step": 155330 + }, + { + "epoch": 17.088558855885587, + "grad_norm": 0.004533181432634592, + "learning_rate": 3.1589868353371686e-06, + "loss": 0.0066, + "num_input_tokens_seen": 32782208, + "step": 155335 + }, + { + "epoch": 17.08910891089109, + "grad_norm": 0.0387372188270092, + "learning_rate": 3.157819130432829e-06, + "loss": 0.0252, + "num_input_tokens_seen": 32783200, + "step": 155340 + }, + { + "epoch": 17.08965896589659, + "grad_norm": 0.2973555624485016, + "learning_rate": 3.1566516268373868e-06, + "loss": 0.006, + "num_input_tokens_seen": 32784288, + "step": 155345 + }, + { + "epoch": 17.09020902090209, + "grad_norm": 6.108741283416748, + "learning_rate": 3.1554843245616027e-06, + "loss": 0.0887, + "num_input_tokens_seen": 32785408, + "step": 155350 + }, + { + "epoch": 17.09075907590759, + "grad_norm": 0.02015000768005848, + "learning_rate": 3.1543172236162405e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32786528, + "step": 155355 + }, + { + "epoch": 17.091309130913093, + "grad_norm": 0.16104421019554138, + "learning_rate": 3.15315032401205e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32787616, + "step": 155360 + }, + { + "epoch": 17.09185918591859, + "grad_norm": 0.16234160959720612, + "learning_rate": 3.1519836257597884e-06, + "loss": 0.0045, + "num_input_tokens_seen": 32788608, + "step": 155365 + }, + { + "epoch": 17.09240924092409, + "grad_norm": 0.01638677716255188, + "learning_rate": 3.1508171288702178e-06, + "loss": 0.0387, + "num_input_tokens_seen": 32789632, + "step": 155370 + }, + { + "epoch": 17.092959295929592, + "grad_norm": 0.01708350144326687, + "learning_rate": 3.1496508333540733e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32790720, + "step": 155375 + }, + { + "epoch": 17.093509350935093, + "grad_norm": 0.02921193465590477, + "learning_rate": 3.1484847392221185e-06, + "loss": 0.0017, + "num_input_tokens_seen": 32791808, + "step": 155380 + }, + { + "epoch": 17.094059405940595, + "grad_norm": 0.017524365335702896, + "learning_rate": 3.1473188464850893e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32792864, + "step": 155385 + }, + { + "epoch": 17.094609460946096, + "grad_norm": 0.10970695316791534, + "learning_rate": 3.146153155153736e-06, + "loss": 0.0856, + "num_input_tokens_seen": 32793888, + "step": 155390 + }, + { + "epoch": 17.095159515951597, + "grad_norm": 0.004354860167950392, + "learning_rate": 3.1449876652388104e-06, + "loss": 0.0027, + "num_input_tokens_seen": 32794880, + "step": 155395 + }, + { + "epoch": 17.095709570957094, + "grad_norm": 0.10085837543010712, + "learning_rate": 3.1438223767510376e-06, + "loss": 0.0634, + "num_input_tokens_seen": 32795840, + "step": 155400 + }, + { + "epoch": 17.096259625962595, + "grad_norm": 0.005323614459484816, + "learning_rate": 3.142657289701173e-06, + "loss": 0.0667, + "num_input_tokens_seen": 32796960, + "step": 155405 + }, + { + "epoch": 17.096809680968097, + "grad_norm": 0.07460861653089523, + "learning_rate": 3.1414924040999493e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32798080, + "step": 155410 + }, + { + "epoch": 17.097359735973598, + "grad_norm": 0.02401544526219368, + "learning_rate": 3.1403277199580943e-06, + "loss": 0.0026, + "num_input_tokens_seen": 32799168, + "step": 155415 + }, + { + "epoch": 17.0979097909791, + "grad_norm": 0.11166016757488251, + "learning_rate": 3.1391632372863495e-06, + "loss": 0.0097, + "num_input_tokens_seen": 32800192, + "step": 155420 + }, + { + "epoch": 17.0984598459846, + "grad_norm": 0.044011298567056656, + "learning_rate": 3.1379989560954478e-06, + "loss": 0.1463, + "num_input_tokens_seen": 32801184, + "step": 155425 + }, + { + "epoch": 17.099009900990097, + "grad_norm": 3.0541985034942627, + "learning_rate": 3.1368348763961248e-06, + "loss": 0.0769, + "num_input_tokens_seen": 32802208, + "step": 155430 + }, + { + "epoch": 17.0995599559956, + "grad_norm": 0.013821718282997608, + "learning_rate": 3.1356709981991056e-06, + "loss": 0.0047, + "num_input_tokens_seen": 32803232, + "step": 155435 + }, + { + "epoch": 17.1001100110011, + "grad_norm": 0.007481798063963652, + "learning_rate": 3.1345073215151066e-06, + "loss": 0.0286, + "num_input_tokens_seen": 32804320, + "step": 155440 + }, + { + "epoch": 17.1006600660066, + "grad_norm": 0.10905394703149796, + "learning_rate": 3.1333438463548693e-06, + "loss": 0.0033, + "num_input_tokens_seen": 32805408, + "step": 155445 + }, + { + "epoch": 17.1012101210121, + "grad_norm": 0.0037227843422442675, + "learning_rate": 3.132180572729096e-06, + "loss": 0.0136, + "num_input_tokens_seen": 32806432, + "step": 155450 + }, + { + "epoch": 17.101760176017603, + "grad_norm": 0.012803759425878525, + "learning_rate": 3.131017500648534e-06, + "loss": 0.018, + "num_input_tokens_seen": 32807424, + "step": 155455 + }, + { + "epoch": 17.102310231023104, + "grad_norm": 0.11603642255067825, + "learning_rate": 3.1298546301238907e-06, + "loss": 0.0103, + "num_input_tokens_seen": 32808448, + "step": 155460 + }, + { + "epoch": 17.1028602860286, + "grad_norm": 2.623950719833374, + "learning_rate": 3.128691961165875e-06, + "loss": 0.0982, + "num_input_tokens_seen": 32809536, + "step": 155465 + }, + { + "epoch": 17.103410341034103, + "grad_norm": 3.639906167984009, + "learning_rate": 3.1275294937852168e-06, + "loss": 0.1664, + "num_input_tokens_seen": 32810528, + "step": 155470 + }, + { + "epoch": 17.103960396039604, + "grad_norm": 0.023933539167046547, + "learning_rate": 3.126367227992619e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32811616, + "step": 155475 + }, + { + "epoch": 17.104510451045105, + "grad_norm": 0.017097529023885727, + "learning_rate": 3.1252051637987973e-06, + "loss": 0.0114, + "num_input_tokens_seen": 32812672, + "step": 155480 + }, + { + "epoch": 17.105060506050606, + "grad_norm": 0.021486297249794006, + "learning_rate": 3.1240433012144687e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32813760, + "step": 155485 + }, + { + "epoch": 17.105610561056107, + "grad_norm": 0.011993125081062317, + "learning_rate": 3.122881640250333e-06, + "loss": 0.0007, + "num_input_tokens_seen": 32814784, + "step": 155490 + }, + { + "epoch": 17.106160616061604, + "grad_norm": 0.001741489744745195, + "learning_rate": 3.121720180917104e-06, + "loss": 0.0005, + "num_input_tokens_seen": 32815968, + "step": 155495 + }, + { + "epoch": 17.106710671067106, + "grad_norm": 0.12181991338729858, + "learning_rate": 3.120558923225475e-06, + "loss": 0.003, + "num_input_tokens_seen": 32816992, + "step": 155500 + }, + { + "epoch": 17.107260726072607, + "grad_norm": 3.2551653385162354, + "learning_rate": 3.1193978671861606e-06, + "loss": 0.013, + "num_input_tokens_seen": 32818080, + "step": 155505 + }, + { + "epoch": 17.107810781078108, + "grad_norm": 0.006401609163731337, + "learning_rate": 3.118237012809852e-06, + "loss": 0.086, + "num_input_tokens_seen": 32819136, + "step": 155510 + }, + { + "epoch": 17.10836083608361, + "grad_norm": 2.338792562484741, + "learning_rate": 3.1170763601072543e-06, + "loss": 0.0677, + "num_input_tokens_seen": 32820160, + "step": 155515 + }, + { + "epoch": 17.10891089108911, + "grad_norm": 0.0225394144654274, + "learning_rate": 3.1159159090890677e-06, + "loss": 0.0113, + "num_input_tokens_seen": 32821216, + "step": 155520 + }, + { + "epoch": 17.10946094609461, + "grad_norm": 1.1227197647094727, + "learning_rate": 3.114755659765978e-06, + "loss": 0.0165, + "num_input_tokens_seen": 32822272, + "step": 155525 + }, + { + "epoch": 17.11001100110011, + "grad_norm": 0.3618495464324951, + "learning_rate": 3.1135956121486898e-06, + "loss": 0.0345, + "num_input_tokens_seen": 32823296, + "step": 155530 + }, + { + "epoch": 17.11056105610561, + "grad_norm": 0.04462816193699837, + "learning_rate": 3.1124357662478816e-06, + "loss": 0.0379, + "num_input_tokens_seen": 32824384, + "step": 155535 + }, + { + "epoch": 17.11111111111111, + "grad_norm": 0.16889017820358276, + "learning_rate": 3.11127612207425e-06, + "loss": 0.0061, + "num_input_tokens_seen": 32825440, + "step": 155540 + }, + { + "epoch": 17.111661166116612, + "grad_norm": 0.007498129270970821, + "learning_rate": 3.110116679638492e-06, + "loss": 0.0032, + "num_input_tokens_seen": 32826496, + "step": 155545 + }, + { + "epoch": 17.112211221122113, + "grad_norm": 0.011987078934907913, + "learning_rate": 3.1089574389512766e-06, + "loss": 0.1358, + "num_input_tokens_seen": 32827584, + "step": 155550 + }, + { + "epoch": 17.112761276127614, + "grad_norm": 0.1547011137008667, + "learning_rate": 3.1077984000233013e-06, + "loss": 0.0069, + "num_input_tokens_seen": 32828704, + "step": 155555 + }, + { + "epoch": 17.11331133113311, + "grad_norm": 0.08437147736549377, + "learning_rate": 3.1066395628652435e-06, + "loss": 0.0088, + "num_input_tokens_seen": 32829728, + "step": 155560 + }, + { + "epoch": 17.113861386138613, + "grad_norm": 0.007982535287737846, + "learning_rate": 3.105480927487775e-06, + "loss": 0.003, + "num_input_tokens_seen": 32830752, + "step": 155565 + }, + { + "epoch": 17.114411441144114, + "grad_norm": 1.688150405883789, + "learning_rate": 3.10432249390159e-06, + "loss": 0.0556, + "num_input_tokens_seen": 32831840, + "step": 155570 + }, + { + "epoch": 17.114961496149615, + "grad_norm": 0.14945422112941742, + "learning_rate": 3.1031642621173555e-06, + "loss": 0.0055, + "num_input_tokens_seen": 32832960, + "step": 155575 + }, + { + "epoch": 17.115511551155116, + "grad_norm": 0.1641208529472351, + "learning_rate": 3.1020062321457518e-06, + "loss": 0.0085, + "num_input_tokens_seen": 32833952, + "step": 155580 + }, + { + "epoch": 17.116061606160617, + "grad_norm": 0.013233554549515247, + "learning_rate": 3.10084840399745e-06, + "loss": 0.0101, + "num_input_tokens_seen": 32834976, + "step": 155585 + }, + { + "epoch": 17.116611661166118, + "grad_norm": 0.01557929441332817, + "learning_rate": 3.0996907776831148e-06, + "loss": 0.0013, + "num_input_tokens_seen": 32836064, + "step": 155590 + }, + { + "epoch": 17.117161716171616, + "grad_norm": 0.0072843097150325775, + "learning_rate": 3.098533353213423e-06, + "loss": 0.022, + "num_input_tokens_seen": 32837152, + "step": 155595 + }, + { + "epoch": 17.117711771177117, + "grad_norm": 0.015976712107658386, + "learning_rate": 3.097376130599042e-06, + "loss": 0.1509, + "num_input_tokens_seen": 32838272, + "step": 155600 + }, + { + "epoch": 17.118261826182618, + "grad_norm": 0.20316945016384125, + "learning_rate": 3.0962191098506314e-06, + "loss": 0.0109, + "num_input_tokens_seen": 32839232, + "step": 155605 + }, + { + "epoch": 17.11881188118812, + "grad_norm": 0.06197814270853996, + "learning_rate": 3.0950622909788648e-06, + "loss": 0.022, + "num_input_tokens_seen": 32840256, + "step": 155610 + }, + { + "epoch": 17.11936193619362, + "grad_norm": 1.0711661577224731, + "learning_rate": 3.0939056739943932e-06, + "loss": 0.0117, + "num_input_tokens_seen": 32841344, + "step": 155615 + }, + { + "epoch": 17.11991199119912, + "grad_norm": 0.0546916201710701, + "learning_rate": 3.0927492589078844e-06, + "loss": 0.0012, + "num_input_tokens_seen": 32842368, + "step": 155620 + }, + { + "epoch": 17.120462046204622, + "grad_norm": 0.042179882526397705, + "learning_rate": 3.09159304572999e-06, + "loss": 0.0188, + "num_input_tokens_seen": 32843392, + "step": 155625 + }, + { + "epoch": 17.12101210121012, + "grad_norm": 0.009807211346924305, + "learning_rate": 3.090437034471369e-06, + "loss": 0.0774, + "num_input_tokens_seen": 32844416, + "step": 155630 + }, + { + "epoch": 17.12156215621562, + "grad_norm": 0.14646703004837036, + "learning_rate": 3.0892812251426816e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32845536, + "step": 155635 + }, + { + "epoch": 17.122112211221122, + "grad_norm": 0.1790788769721985, + "learning_rate": 3.0881256177545727e-06, + "loss": 0.0098, + "num_input_tokens_seen": 32846592, + "step": 155640 + }, + { + "epoch": 17.122662266226623, + "grad_norm": 0.005997996311634779, + "learning_rate": 3.0869702123177e-06, + "loss": 0.0068, + "num_input_tokens_seen": 32847648, + "step": 155645 + }, + { + "epoch": 17.123212321232124, + "grad_norm": 0.07353436946868896, + "learning_rate": 3.085815008842699e-06, + "loss": 0.0021, + "num_input_tokens_seen": 32848672, + "step": 155650 + }, + { + "epoch": 17.123762376237625, + "grad_norm": 0.03517074137926102, + "learning_rate": 3.0846600073402287e-06, + "loss": 0.0031, + "num_input_tokens_seen": 32849696, + "step": 155655 + }, + { + "epoch": 17.124312431243123, + "grad_norm": 0.1553381234407425, + "learning_rate": 3.083505207820936e-06, + "loss": 0.0041, + "num_input_tokens_seen": 32850720, + "step": 155660 + }, + { + "epoch": 17.124862486248624, + "grad_norm": 0.02327439747750759, + "learning_rate": 3.0823506102954537e-06, + "loss": 0.0185, + "num_input_tokens_seen": 32851744, + "step": 155665 + }, + { + "epoch": 17.125412541254125, + "grad_norm": 0.016738880425691605, + "learning_rate": 3.0811962147744345e-06, + "loss": 0.0287, + "num_input_tokens_seen": 32852768, + "step": 155670 + }, + { + "epoch": 17.125962596259626, + "grad_norm": 0.04153711721301079, + "learning_rate": 3.0800420212685116e-06, + "loss": 0.0023, + "num_input_tokens_seen": 32853760, + "step": 155675 + }, + { + "epoch": 17.126512651265127, + "grad_norm": 0.21570004522800446, + "learning_rate": 3.0788880297883183e-06, + "loss": 0.014, + "num_input_tokens_seen": 32854848, + "step": 155680 + }, + { + "epoch": 17.127062706270628, + "grad_norm": 0.1030239686369896, + "learning_rate": 3.077734240344496e-06, + "loss": 0.1266, + "num_input_tokens_seen": 32855936, + "step": 155685 + }, + { + "epoch": 17.12761276127613, + "grad_norm": 0.011012925766408443, + "learning_rate": 3.0765806529476774e-06, + "loss": 0.0027, + "num_input_tokens_seen": 32857056, + "step": 155690 + }, + { + "epoch": 17.128162816281627, + "grad_norm": 0.048015039414167404, + "learning_rate": 3.0754272676084993e-06, + "loss": 0.0808, + "num_input_tokens_seen": 32858016, + "step": 155695 + }, + { + "epoch": 17.128712871287128, + "grad_norm": 0.00477571738883853, + "learning_rate": 3.0742740843375887e-06, + "loss": 0.0437, + "num_input_tokens_seen": 32859008, + "step": 155700 + }, + { + "epoch": 17.12926292629263, + "grad_norm": 0.5181321501731873, + "learning_rate": 3.0731211031455654e-06, + "loss": 0.1162, + "num_input_tokens_seen": 32860064, + "step": 155705 + }, + { + "epoch": 17.12981298129813, + "grad_norm": 0.005426458548754454, + "learning_rate": 3.0719683240430672e-06, + "loss": 0.0061, + "num_input_tokens_seen": 32861120, + "step": 155710 + }, + { + "epoch": 17.13036303630363, + "grad_norm": 0.011652130633592606, + "learning_rate": 3.0708157470407117e-06, + "loss": 0.0157, + "num_input_tokens_seen": 32862112, + "step": 155715 + }, + { + "epoch": 17.130913091309132, + "grad_norm": 0.05141472816467285, + "learning_rate": 3.0696633721491313e-06, + "loss": 0.0009, + "num_input_tokens_seen": 32863168, + "step": 155720 + }, + { + "epoch": 17.13146314631463, + "grad_norm": 0.028164416551589966, + "learning_rate": 3.06851119937894e-06, + "loss": 0.0039, + "num_input_tokens_seen": 32864224, + "step": 155725 + }, + { + "epoch": 17.13201320132013, + "grad_norm": 0.6229575276374817, + "learning_rate": 3.0673592287407515e-06, + "loss": 0.0237, + "num_input_tokens_seen": 32865312, + "step": 155730 + }, + { + "epoch": 17.132563256325632, + "grad_norm": 0.35626935958862305, + "learning_rate": 3.066207460245196e-06, + "loss": 0.003, + "num_input_tokens_seen": 32866368, + "step": 155735 + }, + { + "epoch": 17.133113311331133, + "grad_norm": 0.00545547716319561, + "learning_rate": 3.065055893902874e-06, + "loss": 0.0567, + "num_input_tokens_seen": 32867456, + "step": 155740 + }, + { + "epoch": 17.133663366336634, + "grad_norm": 0.03185325860977173, + "learning_rate": 3.063904529724407e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32868480, + "step": 155745 + }, + { + "epoch": 17.134213421342135, + "grad_norm": 0.01696665771305561, + "learning_rate": 3.0627533677204113e-06, + "loss": 0.0468, + "num_input_tokens_seen": 32869632, + "step": 155750 + }, + { + "epoch": 17.134763476347636, + "grad_norm": 0.1921825259923935, + "learning_rate": 3.061602407901487e-06, + "loss": 0.0035, + "num_input_tokens_seen": 32870720, + "step": 155755 + }, + { + "epoch": 17.135313531353134, + "grad_norm": 2.9391353130340576, + "learning_rate": 3.060451650278251e-06, + "loss": 0.0786, + "num_input_tokens_seen": 32871776, + "step": 155760 + }, + { + "epoch": 17.135863586358635, + "grad_norm": 0.12095728516578674, + "learning_rate": 3.0593010948613e-06, + "loss": 0.0069, + "num_input_tokens_seen": 32872768, + "step": 155765 + }, + { + "epoch": 17.136413641364136, + "grad_norm": 0.007958756759762764, + "learning_rate": 3.058150741661242e-06, + "loss": 0.0056, + "num_input_tokens_seen": 32873856, + "step": 155770 + }, + { + "epoch": 17.136963696369637, + "grad_norm": 0.012087075039744377, + "learning_rate": 3.057000590688683e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32874944, + "step": 155775 + }, + { + "epoch": 17.13751375137514, + "grad_norm": 0.3526068925857544, + "learning_rate": 3.0558506419542167e-06, + "loss": 0.1236, + "num_input_tokens_seen": 32876000, + "step": 155780 + }, + { + "epoch": 17.13806380638064, + "grad_norm": 0.21165426075458527, + "learning_rate": 3.0547008954684513e-06, + "loss": 0.066, + "num_input_tokens_seen": 32876992, + "step": 155785 + }, + { + "epoch": 17.138613861386137, + "grad_norm": 2.2845256328582764, + "learning_rate": 3.0535513512419762e-06, + "loss": 0.0481, + "num_input_tokens_seen": 32878016, + "step": 155790 + }, + { + "epoch": 17.139163916391638, + "grad_norm": 0.031164085492491722, + "learning_rate": 3.052402009285382e-06, + "loss": 0.003, + "num_input_tokens_seen": 32879008, + "step": 155795 + }, + { + "epoch": 17.13971397139714, + "grad_norm": 0.007191329728811979, + "learning_rate": 3.0512528696092695e-06, + "loss": 0.002, + "num_input_tokens_seen": 32880064, + "step": 155800 + }, + { + "epoch": 17.14026402640264, + "grad_norm": 1.7858672142028809, + "learning_rate": 3.050103932224224e-06, + "loss": 0.1783, + "num_input_tokens_seen": 32881056, + "step": 155805 + }, + { + "epoch": 17.14081408140814, + "grad_norm": 0.2816886007785797, + "learning_rate": 3.048955197140846e-06, + "loss": 0.0031, + "num_input_tokens_seen": 32882080, + "step": 155810 + }, + { + "epoch": 17.141364136413642, + "grad_norm": 2.110072135925293, + "learning_rate": 3.0478066643697124e-06, + "loss": 0.0549, + "num_input_tokens_seen": 32883104, + "step": 155815 + }, + { + "epoch": 17.141914191419144, + "grad_norm": 0.0056497505865991116, + "learning_rate": 3.0466583339214066e-06, + "loss": 0.0022, + "num_input_tokens_seen": 32884192, + "step": 155820 + }, + { + "epoch": 17.14246424642464, + "grad_norm": 0.04537023976445198, + "learning_rate": 3.0455102058065176e-06, + "loss": 0.0168, + "num_input_tokens_seen": 32885216, + "step": 155825 + }, + { + "epoch": 17.143014301430142, + "grad_norm": 0.010787912644445896, + "learning_rate": 3.0443622800356258e-06, + "loss": 0.0045, + "num_input_tokens_seen": 32886336, + "step": 155830 + }, + { + "epoch": 17.143564356435643, + "grad_norm": 0.010662605054676533, + "learning_rate": 3.0432145566193173e-06, + "loss": 0.0006, + "num_input_tokens_seen": 32887424, + "step": 155835 + }, + { + "epoch": 17.144114411441144, + "grad_norm": 0.04176678508520126, + "learning_rate": 3.0420670355681635e-06, + "loss": 0.003, + "num_input_tokens_seen": 32888416, + "step": 155840 + }, + { + "epoch": 17.144664466446645, + "grad_norm": 0.016952427104115486, + "learning_rate": 3.0409197168927345e-06, + "loss": 0.0054, + "num_input_tokens_seen": 32889408, + "step": 155845 + }, + { + "epoch": 17.145214521452147, + "grad_norm": 0.004242540802806616, + "learning_rate": 3.0397726006036188e-06, + "loss": 0.0007, + "num_input_tokens_seen": 32890464, + "step": 155850 + }, + { + "epoch": 17.145764576457644, + "grad_norm": 0.04823247715830803, + "learning_rate": 3.038625686711377e-06, + "loss": 0.0303, + "num_input_tokens_seen": 32891488, + "step": 155855 + }, + { + "epoch": 17.146314631463145, + "grad_norm": 0.6705477237701416, + "learning_rate": 3.037478975226582e-06, + "loss": 0.0059, + "num_input_tokens_seen": 32892544, + "step": 155860 + }, + { + "epoch": 17.146864686468646, + "grad_norm": 0.004095054231584072, + "learning_rate": 3.036332466159811e-06, + "loss": 0.0759, + "num_input_tokens_seen": 32893632, + "step": 155865 + }, + { + "epoch": 17.147414741474147, + "grad_norm": 0.031101880595088005, + "learning_rate": 3.035186159521619e-06, + "loss": 0.0038, + "num_input_tokens_seen": 32894656, + "step": 155870 + }, + { + "epoch": 17.14796479647965, + "grad_norm": 0.1288217306137085, + "learning_rate": 3.0340400553225817e-06, + "loss": 0.105, + "num_input_tokens_seen": 32895712, + "step": 155875 + }, + { + "epoch": 17.14851485148515, + "grad_norm": 0.007532930001616478, + "learning_rate": 3.0328941535732512e-06, + "loss": 0.013, + "num_input_tokens_seen": 32896736, + "step": 155880 + }, + { + "epoch": 17.14906490649065, + "grad_norm": 0.013621235266327858, + "learning_rate": 3.0317484542841973e-06, + "loss": 0.002, + "num_input_tokens_seen": 32897760, + "step": 155885 + }, + { + "epoch": 17.149614961496148, + "grad_norm": 0.22647367417812347, + "learning_rate": 3.0306029574659777e-06, + "loss": 0.0039, + "num_input_tokens_seen": 32898784, + "step": 155890 + }, + { + "epoch": 17.15016501650165, + "grad_norm": 0.015911076217889786, + "learning_rate": 3.0294576631291456e-06, + "loss": 0.0015, + "num_input_tokens_seen": 32899808, + "step": 155895 + }, + { + "epoch": 17.15071507150715, + "grad_norm": 0.8103158473968506, + "learning_rate": 3.028312571284264e-06, + "loss": 0.0092, + "num_input_tokens_seen": 32900832, + "step": 155900 + }, + { + "epoch": 17.15126512651265, + "grad_norm": 0.14689888060092926, + "learning_rate": 3.027167681941878e-06, + "loss": 0.0377, + "num_input_tokens_seen": 32901888, + "step": 155905 + }, + { + "epoch": 17.151815181518153, + "grad_norm": 2.5553805828094482, + "learning_rate": 3.0260229951125487e-06, + "loss": 0.0343, + "num_input_tokens_seen": 32902912, + "step": 155910 + }, + { + "epoch": 17.152365236523654, + "grad_norm": 0.02179357223212719, + "learning_rate": 3.024878510806817e-06, + "loss": 0.0523, + "num_input_tokens_seen": 32904000, + "step": 155915 + }, + { + "epoch": 17.15291529152915, + "grad_norm": 0.13956405222415924, + "learning_rate": 3.0237342290352356e-06, + "loss": 0.1208, + "num_input_tokens_seen": 32905056, + "step": 155920 + }, + { + "epoch": 17.153465346534652, + "grad_norm": 0.1548731029033661, + "learning_rate": 3.0225901498083547e-06, + "loss": 0.0548, + "num_input_tokens_seen": 32906112, + "step": 155925 + }, + { + "epoch": 17.154015401540153, + "grad_norm": 0.019126396626234055, + "learning_rate": 3.0214462731367078e-06, + "loss": 0.1091, + "num_input_tokens_seen": 32907104, + "step": 155930 + }, + { + "epoch": 17.154565456545654, + "grad_norm": 0.007366304751485586, + "learning_rate": 3.0203025990308527e-06, + "loss": 0.0087, + "num_input_tokens_seen": 32908192, + "step": 155935 + }, + { + "epoch": 17.155115511551156, + "grad_norm": 0.03368109464645386, + "learning_rate": 3.0191591275013142e-06, + "loss": 0.0865, + "num_input_tokens_seen": 32909248, + "step": 155940 + }, + { + "epoch": 17.155665566556657, + "grad_norm": 0.028463609516620636, + "learning_rate": 3.0180158585586397e-06, + "loss": 0.0058, + "num_input_tokens_seen": 32910304, + "step": 155945 + }, + { + "epoch": 17.156215621562158, + "grad_norm": 0.10821764916181564, + "learning_rate": 3.0168727922133713e-06, + "loss": 0.0676, + "num_input_tokens_seen": 32911360, + "step": 155950 + }, + { + "epoch": 17.156765676567655, + "grad_norm": 4.98050594329834, + "learning_rate": 3.01572992847603e-06, + "loss": 0.1262, + "num_input_tokens_seen": 32912416, + "step": 155955 + }, + { + "epoch": 17.157315731573156, + "grad_norm": 0.10335177928209305, + "learning_rate": 3.0145872673571633e-06, + "loss": 0.0123, + "num_input_tokens_seen": 32913440, + "step": 155960 + }, + { + "epoch": 17.157865786578657, + "grad_norm": 0.03705916553735733, + "learning_rate": 3.0134448088672967e-06, + "loss": 0.0252, + "num_input_tokens_seen": 32914432, + "step": 155965 + }, + { + "epoch": 17.15841584158416, + "grad_norm": 0.03719960153102875, + "learning_rate": 3.012302553016952e-06, + "loss": 0.0107, + "num_input_tokens_seen": 32915488, + "step": 155970 + }, + { + "epoch": 17.15896589658966, + "grad_norm": 0.016192127019166946, + "learning_rate": 3.0111604998166647e-06, + "loss": 0.0008, + "num_input_tokens_seen": 32916512, + "step": 155975 + }, + { + "epoch": 17.15951595159516, + "grad_norm": 1.9312342405319214, + "learning_rate": 3.010018649276966e-06, + "loss": 0.1388, + "num_input_tokens_seen": 32917568, + "step": 155980 + }, + { + "epoch": 17.16006600660066, + "grad_norm": 2.178009510040283, + "learning_rate": 3.0088770014083694e-06, + "loss": 0.0624, + "num_input_tokens_seen": 32918592, + "step": 155985 + }, + { + "epoch": 17.16061606160616, + "grad_norm": 0.32184794545173645, + "learning_rate": 3.0077355562214053e-06, + "loss": 0.0051, + "num_input_tokens_seen": 32919744, + "step": 155990 + }, + { + "epoch": 17.16116611661166, + "grad_norm": 0.03008372336626053, + "learning_rate": 3.0065943137265825e-06, + "loss": 0.0008, + "num_input_tokens_seen": 32920832, + "step": 155995 + }, + { + "epoch": 17.16171617161716, + "grad_norm": 1.796386957168579, + "learning_rate": 3.0054532739344278e-06, + "loss": 0.0761, + "num_input_tokens_seen": 32921888, + "step": 156000 + }, + { + "epoch": 17.162266226622663, + "grad_norm": 0.24416719377040863, + "learning_rate": 3.004312436855464e-06, + "loss": 0.0033, + "num_input_tokens_seen": 32922976, + "step": 156005 + }, + { + "epoch": 17.162816281628164, + "grad_norm": 0.02192930318415165, + "learning_rate": 3.00317180250019e-06, + "loss": 0.0046, + "num_input_tokens_seen": 32924064, + "step": 156010 + }, + { + "epoch": 17.163366336633665, + "grad_norm": 0.12634965777397156, + "learning_rate": 3.0020313708791324e-06, + "loss": 0.0017, + "num_input_tokens_seen": 32925120, + "step": 156015 + }, + { + "epoch": 17.163916391639162, + "grad_norm": 0.07478244602680206, + "learning_rate": 3.0008911420027903e-06, + "loss": 0.0073, + "num_input_tokens_seen": 32926144, + "step": 156020 + }, + { + "epoch": 17.164466446644663, + "grad_norm": 0.008804638870060444, + "learning_rate": 2.999751115881683e-06, + "loss": 0.0011, + "num_input_tokens_seen": 32927200, + "step": 156025 + }, + { + "epoch": 17.165016501650165, + "grad_norm": 0.037029288709163666, + "learning_rate": 2.998611292526307e-06, + "loss": 0.0747, + "num_input_tokens_seen": 32928256, + "step": 156030 + }, + { + "epoch": 17.165566556655666, + "grad_norm": 0.010001900605857372, + "learning_rate": 2.9974716719471746e-06, + "loss": 0.0017, + "num_input_tokens_seen": 32929312, + "step": 156035 + }, + { + "epoch": 17.166116611661167, + "grad_norm": 0.011342157609760761, + "learning_rate": 2.9963322541547935e-06, + "loss": 0.0706, + "num_input_tokens_seen": 32930304, + "step": 156040 + }, + { + "epoch": 17.166666666666668, + "grad_norm": 0.004919453989714384, + "learning_rate": 2.995193039159655e-06, + "loss": 0.0059, + "num_input_tokens_seen": 32931360, + "step": 156045 + }, + { + "epoch": 17.16721672167217, + "grad_norm": 0.005000779405236244, + "learning_rate": 2.9940540269722678e-06, + "loss": 0.0047, + "num_input_tokens_seen": 32932448, + "step": 156050 + }, + { + "epoch": 17.167766776677666, + "grad_norm": 0.02896200306713581, + "learning_rate": 2.9929152176031207e-06, + "loss": 0.0047, + "num_input_tokens_seen": 32933536, + "step": 156055 + }, + { + "epoch": 17.168316831683168, + "grad_norm": 0.07812991738319397, + "learning_rate": 2.9917766110627133e-06, + "loss": 0.0019, + "num_input_tokens_seen": 32934624, + "step": 156060 + }, + { + "epoch": 17.16886688668867, + "grad_norm": 1.8301496505737305, + "learning_rate": 2.990638207361546e-06, + "loss": 0.07, + "num_input_tokens_seen": 32935712, + "step": 156065 + }, + { + "epoch": 17.16941694169417, + "grad_norm": 0.0038625102024525404, + "learning_rate": 2.9895000065100986e-06, + "loss": 0.0505, + "num_input_tokens_seen": 32936768, + "step": 156070 + }, + { + "epoch": 17.16996699669967, + "grad_norm": 0.09158927947282791, + "learning_rate": 2.9883620085188745e-06, + "loss": 0.0253, + "num_input_tokens_seen": 32937824, + "step": 156075 + }, + { + "epoch": 17.170517051705172, + "grad_norm": 0.07054980099201202, + "learning_rate": 2.987224213398354e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32938816, + "step": 156080 + }, + { + "epoch": 17.17106710671067, + "grad_norm": 0.05493539199233055, + "learning_rate": 2.9860866211590204e-06, + "loss": 0.1525, + "num_input_tokens_seen": 32939904, + "step": 156085 + }, + { + "epoch": 17.17161716171617, + "grad_norm": 0.05190000310540199, + "learning_rate": 2.984949231811365e-06, + "loss": 0.0198, + "num_input_tokens_seen": 32940992, + "step": 156090 + }, + { + "epoch": 17.17216721672167, + "grad_norm": 0.006432853173464537, + "learning_rate": 2.9838120453658662e-06, + "loss": 0.0029, + "num_input_tokens_seen": 32942112, + "step": 156095 + }, + { + "epoch": 17.172717271727173, + "grad_norm": 0.016160447150468826, + "learning_rate": 2.9826750618330153e-06, + "loss": 0.072, + "num_input_tokens_seen": 32943168, + "step": 156100 + }, + { + "epoch": 17.173267326732674, + "grad_norm": 0.036118991672992706, + "learning_rate": 2.9815382812232813e-06, + "loss": 0.0009, + "num_input_tokens_seen": 32944192, + "step": 156105 + }, + { + "epoch": 17.173817381738175, + "grad_norm": 0.2953614890575409, + "learning_rate": 2.9804017035471395e-06, + "loss": 0.004, + "num_input_tokens_seen": 32945216, + "step": 156110 + }, + { + "epoch": 17.174367436743676, + "grad_norm": 0.001349996542558074, + "learning_rate": 2.979265328815067e-06, + "loss": 0.0076, + "num_input_tokens_seen": 32946240, + "step": 156115 + }, + { + "epoch": 17.174917491749174, + "grad_norm": 4.362103462219238, + "learning_rate": 2.978129157037543e-06, + "loss": 0.0745, + "num_input_tokens_seen": 32947328, + "step": 156120 + }, + { + "epoch": 17.175467546754675, + "grad_norm": 0.016222627833485603, + "learning_rate": 2.9769931882250383e-06, + "loss": 0.1247, + "num_input_tokens_seen": 32948384, + "step": 156125 + }, + { + "epoch": 17.176017601760176, + "grad_norm": 0.20498518645763397, + "learning_rate": 2.9758574223880204e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32949440, + "step": 156130 + }, + { + "epoch": 17.176567656765677, + "grad_norm": 0.05327220261096954, + "learning_rate": 2.9747218595369497e-06, + "loss": 0.0037, + "num_input_tokens_seen": 32950432, + "step": 156135 + }, + { + "epoch": 17.177117711771178, + "grad_norm": 0.02006218396127224, + "learning_rate": 2.973586499682307e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32951488, + "step": 156140 + }, + { + "epoch": 17.17766776677668, + "grad_norm": 0.06311063468456268, + "learning_rate": 2.9724513428345394e-06, + "loss": 0.0062, + "num_input_tokens_seen": 32952512, + "step": 156145 + }, + { + "epoch": 17.178217821782177, + "grad_norm": 0.10871861129999161, + "learning_rate": 2.9713163890041224e-06, + "loss": 0.0632, + "num_input_tokens_seen": 32953504, + "step": 156150 + }, + { + "epoch": 17.178767876787678, + "grad_norm": 0.015243318863213062, + "learning_rate": 2.9701816382015134e-06, + "loss": 0.006, + "num_input_tokens_seen": 32954592, + "step": 156155 + }, + { + "epoch": 17.17931793179318, + "grad_norm": 0.02672596089541912, + "learning_rate": 2.9690470904371658e-06, + "loss": 0.0011, + "num_input_tokens_seen": 32955712, + "step": 156160 + }, + { + "epoch": 17.17986798679868, + "grad_norm": 0.0062190936878323555, + "learning_rate": 2.9679127457215433e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32956864, + "step": 156165 + }, + { + "epoch": 17.18041804180418, + "grad_norm": 3.0777337551116943, + "learning_rate": 2.9667786040650987e-06, + "loss": 0.1034, + "num_input_tokens_seen": 32957920, + "step": 156170 + }, + { + "epoch": 17.180968096809682, + "grad_norm": 0.04918302595615387, + "learning_rate": 2.9656446654782705e-06, + "loss": 0.009, + "num_input_tokens_seen": 32958944, + "step": 156175 + }, + { + "epoch": 17.181518151815183, + "grad_norm": 0.2514318823814392, + "learning_rate": 2.964510929971534e-06, + "loss": 0.092, + "num_input_tokens_seen": 32960000, + "step": 156180 + }, + { + "epoch": 17.18206820682068, + "grad_norm": 0.01213650219142437, + "learning_rate": 2.9633773975553224e-06, + "loss": 0.0308, + "num_input_tokens_seen": 32961056, + "step": 156185 + }, + { + "epoch": 17.182618261826182, + "grad_norm": 0.004238855559378862, + "learning_rate": 2.9622440682400915e-06, + "loss": 0.0611, + "num_input_tokens_seen": 32962112, + "step": 156190 + }, + { + "epoch": 17.183168316831683, + "grad_norm": 0.012325732968747616, + "learning_rate": 2.9611109420362827e-06, + "loss": 0.0038, + "num_input_tokens_seen": 32963104, + "step": 156195 + }, + { + "epoch": 17.183718371837184, + "grad_norm": 0.008782695978879929, + "learning_rate": 2.9599780189543348e-06, + "loss": 0.0322, + "num_input_tokens_seen": 32964160, + "step": 156200 + }, + { + "epoch": 17.184268426842685, + "grad_norm": 0.09819845110177994, + "learning_rate": 2.958845299004692e-06, + "loss": 0.0061, + "num_input_tokens_seen": 32965152, + "step": 156205 + }, + { + "epoch": 17.184818481848186, + "grad_norm": 0.02758134715259075, + "learning_rate": 2.957712782197794e-06, + "loss": 0.0036, + "num_input_tokens_seen": 32966208, + "step": 156210 + }, + { + "epoch": 17.185368536853684, + "grad_norm": 0.02588069811463356, + "learning_rate": 2.9565804685440875e-06, + "loss": 0.0262, + "num_input_tokens_seen": 32967232, + "step": 156215 + }, + { + "epoch": 17.185918591859185, + "grad_norm": 2.1449434757232666, + "learning_rate": 2.9554483580540026e-06, + "loss": 0.0513, + "num_input_tokens_seen": 32968256, + "step": 156220 + }, + { + "epoch": 17.186468646864686, + "grad_norm": 0.061420511454343796, + "learning_rate": 2.954316450737965e-06, + "loss": 0.0034, + "num_input_tokens_seen": 32969248, + "step": 156225 + }, + { + "epoch": 17.187018701870187, + "grad_norm": 0.08749360591173172, + "learning_rate": 2.9531847466064217e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32970336, + "step": 156230 + }, + { + "epoch": 17.187568756875688, + "grad_norm": 0.06523515284061432, + "learning_rate": 2.9520532456697835e-06, + "loss": 0.0025, + "num_input_tokens_seen": 32971328, + "step": 156235 + }, + { + "epoch": 17.18811881188119, + "grad_norm": 0.2187902331352234, + "learning_rate": 2.9509219479385007e-06, + "loss": 0.0024, + "num_input_tokens_seen": 32972416, + "step": 156240 + }, + { + "epoch": 17.18866886688669, + "grad_norm": 1.4916146993637085, + "learning_rate": 2.94979085342299e-06, + "loss": 0.0134, + "num_input_tokens_seen": 32973472, + "step": 156245 + }, + { + "epoch": 17.189218921892188, + "grad_norm": 0.021098153665661812, + "learning_rate": 2.9486599621336737e-06, + "loss": 0.0018, + "num_input_tokens_seen": 32974528, + "step": 156250 + }, + { + "epoch": 17.18976897689769, + "grad_norm": 0.1588609367609024, + "learning_rate": 2.947529274080979e-06, + "loss": 0.0399, + "num_input_tokens_seen": 32975584, + "step": 156255 + }, + { + "epoch": 17.19031903190319, + "grad_norm": 0.044462647289037704, + "learning_rate": 2.946398789275323e-06, + "loss": 0.003, + "num_input_tokens_seen": 32976608, + "step": 156260 + }, + { + "epoch": 17.19086908690869, + "grad_norm": 3.0396907329559326, + "learning_rate": 2.945268507727125e-06, + "loss": 0.1128, + "num_input_tokens_seen": 32977664, + "step": 156265 + }, + { + "epoch": 17.191419141914192, + "grad_norm": 0.19826799631118774, + "learning_rate": 2.9441384294468106e-06, + "loss": 0.0043, + "num_input_tokens_seen": 32978720, + "step": 156270 + }, + { + "epoch": 17.191969196919693, + "grad_norm": 1.9108575582504272, + "learning_rate": 2.9430085544447816e-06, + "loss": 0.0735, + "num_input_tokens_seen": 32979808, + "step": 156275 + }, + { + "epoch": 17.19251925192519, + "grad_norm": 0.017959322780370712, + "learning_rate": 2.9418788827314663e-06, + "loss": 0.0047, + "num_input_tokens_seen": 32980832, + "step": 156280 + }, + { + "epoch": 17.193069306930692, + "grad_norm": 0.01796756125986576, + "learning_rate": 2.9407494143172646e-06, + "loss": 0.0072, + "num_input_tokens_seen": 32981888, + "step": 156285 + }, + { + "epoch": 17.193619361936193, + "grad_norm": 0.2834470868110657, + "learning_rate": 2.9396201492125937e-06, + "loss": 0.0197, + "num_input_tokens_seen": 32982944, + "step": 156290 + }, + { + "epoch": 17.194169416941694, + "grad_norm": 0.07281061261892319, + "learning_rate": 2.9384910874278527e-06, + "loss": 0.0014, + "num_input_tokens_seen": 32984064, + "step": 156295 + }, + { + "epoch": 17.194719471947195, + "grad_norm": 0.09211882203817368, + "learning_rate": 2.9373622289734533e-06, + "loss": 0.0028, + "num_input_tokens_seen": 32985152, + "step": 156300 + }, + { + "epoch": 17.195269526952696, + "grad_norm": 0.031176595017313957, + "learning_rate": 2.9362335738598063e-06, + "loss": 0.0017, + "num_input_tokens_seen": 32986176, + "step": 156305 + }, + { + "epoch": 17.195819581958197, + "grad_norm": 0.011693958193063736, + "learning_rate": 2.935105122097301e-06, + "loss": 0.0077, + "num_input_tokens_seen": 32987264, + "step": 156310 + }, + { + "epoch": 17.196369636963695, + "grad_norm": 2.2542104721069336, + "learning_rate": 2.933976873696348e-06, + "loss": 0.0744, + "num_input_tokens_seen": 32988320, + "step": 156315 + }, + { + "epoch": 17.196919691969196, + "grad_norm": 0.009096951223909855, + "learning_rate": 2.9328488286673393e-06, + "loss": 0.0003, + "num_input_tokens_seen": 32989312, + "step": 156320 + }, + { + "epoch": 17.197469746974697, + "grad_norm": 0.03801129758358002, + "learning_rate": 2.9317209870206725e-06, + "loss": 0.0207, + "num_input_tokens_seen": 32990432, + "step": 156325 + }, + { + "epoch": 17.198019801980198, + "grad_norm": 2.029527187347412, + "learning_rate": 2.93059334876675e-06, + "loss": 0.0282, + "num_input_tokens_seen": 32991488, + "step": 156330 + }, + { + "epoch": 17.1985698569857, + "grad_norm": 0.01641334779560566, + "learning_rate": 2.9294659139159518e-06, + "loss": 0.0033, + "num_input_tokens_seen": 32992576, + "step": 156335 + }, + { + "epoch": 17.1991199119912, + "grad_norm": 0.06650960445404053, + "learning_rate": 2.928338682478682e-06, + "loss": 0.0794, + "num_input_tokens_seen": 32993600, + "step": 156340 + }, + { + "epoch": 17.199669966996698, + "grad_norm": 0.25517788529396057, + "learning_rate": 2.927211654465323e-06, + "loss": 0.0546, + "num_input_tokens_seen": 32994624, + "step": 156345 + }, + { + "epoch": 17.2002200220022, + "grad_norm": 0.11167672276496887, + "learning_rate": 2.9260848298862524e-06, + "loss": 0.0031, + "num_input_tokens_seen": 32995648, + "step": 156350 + }, + { + "epoch": 17.2007700770077, + "grad_norm": 0.01622280478477478, + "learning_rate": 2.9249582087518793e-06, + "loss": 0.0292, + "num_input_tokens_seen": 32996736, + "step": 156355 + }, + { + "epoch": 17.2013201320132, + "grad_norm": 2.1226818561553955, + "learning_rate": 2.9238317910725705e-06, + "loss": 0.0308, + "num_input_tokens_seen": 32997760, + "step": 156360 + }, + { + "epoch": 17.201870187018702, + "grad_norm": 0.37108761072158813, + "learning_rate": 2.922705576858706e-06, + "loss": 0.0089, + "num_input_tokens_seen": 32998816, + "step": 156365 + }, + { + "epoch": 17.202420242024203, + "grad_norm": 0.20849980413913727, + "learning_rate": 2.9215795661206746e-06, + "loss": 0.011, + "num_input_tokens_seen": 32999872, + "step": 156370 + }, + { + "epoch": 17.202970297029704, + "grad_norm": 0.02591291256248951, + "learning_rate": 2.920453758868846e-06, + "loss": 0.0053, + "num_input_tokens_seen": 33000864, + "step": 156375 + }, + { + "epoch": 17.203520352035202, + "grad_norm": 0.08000143617391586, + "learning_rate": 2.9193281551136014e-06, + "loss": 0.0047, + "num_input_tokens_seen": 33001952, + "step": 156380 + }, + { + "epoch": 17.204070407040703, + "grad_norm": 2.2393863201141357, + "learning_rate": 2.9182027548653177e-06, + "loss": 0.0912, + "num_input_tokens_seen": 33003008, + "step": 156385 + }, + { + "epoch": 17.204620462046204, + "grad_norm": 0.018571816384792328, + "learning_rate": 2.9170775581343594e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33004128, + "step": 156390 + }, + { + "epoch": 17.205170517051705, + "grad_norm": 0.014754313044250011, + "learning_rate": 2.915952564931107e-06, + "loss": 0.0593, + "num_input_tokens_seen": 33005184, + "step": 156395 + }, + { + "epoch": 17.205720572057206, + "grad_norm": 0.01202363520860672, + "learning_rate": 2.914827775265916e-06, + "loss": 0.001, + "num_input_tokens_seen": 33006272, + "step": 156400 + }, + { + "epoch": 17.206270627062707, + "grad_norm": 0.007426067721098661, + "learning_rate": 2.913703189149164e-06, + "loss": 0.0505, + "num_input_tokens_seen": 33007296, + "step": 156405 + }, + { + "epoch": 17.206820682068205, + "grad_norm": 0.45553165674209595, + "learning_rate": 2.912578806591207e-06, + "loss": 0.0049, + "num_input_tokens_seen": 33008320, + "step": 156410 + }, + { + "epoch": 17.207370737073706, + "grad_norm": 0.010913128964602947, + "learning_rate": 2.911454627602414e-06, + "loss": 0.0056, + "num_input_tokens_seen": 33009376, + "step": 156415 + }, + { + "epoch": 17.207920792079207, + "grad_norm": 0.05559954419732094, + "learning_rate": 2.9103306521931495e-06, + "loss": 0.1316, + "num_input_tokens_seen": 33010400, + "step": 156420 + }, + { + "epoch": 17.20847084708471, + "grad_norm": 0.0018369505414739251, + "learning_rate": 2.9092068803737605e-06, + "loss": 0.0081, + "num_input_tokens_seen": 33011456, + "step": 156425 + }, + { + "epoch": 17.20902090209021, + "grad_norm": 0.14780423045158386, + "learning_rate": 2.9080833121546193e-06, + "loss": 0.0064, + "num_input_tokens_seen": 33012512, + "step": 156430 + }, + { + "epoch": 17.20957095709571, + "grad_norm": 0.06408219039440155, + "learning_rate": 2.9069599475460673e-06, + "loss": 0.0039, + "num_input_tokens_seen": 33013536, + "step": 156435 + }, + { + "epoch": 17.21012101210121, + "grad_norm": 0.0907093957066536, + "learning_rate": 2.9058367865584637e-06, + "loss": 0.0047, + "num_input_tokens_seen": 33014528, + "step": 156440 + }, + { + "epoch": 17.21067106710671, + "grad_norm": 0.009816749952733517, + "learning_rate": 2.904713829202166e-06, + "loss": 0.0089, + "num_input_tokens_seen": 33015552, + "step": 156445 + }, + { + "epoch": 17.21122112211221, + "grad_norm": 2.468794107437134, + "learning_rate": 2.9035910754875136e-06, + "loss": 0.0588, + "num_input_tokens_seen": 33016544, + "step": 156450 + }, + { + "epoch": 17.21177117711771, + "grad_norm": 0.023709608241915703, + "learning_rate": 2.9024685254248648e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33017632, + "step": 156455 + }, + { + "epoch": 17.212321232123212, + "grad_norm": 0.08481011539697647, + "learning_rate": 2.9013461790245584e-06, + "loss": 0.004, + "num_input_tokens_seen": 33018656, + "step": 156460 + }, + { + "epoch": 17.212871287128714, + "grad_norm": 0.01236548088490963, + "learning_rate": 2.9002240362969364e-06, + "loss": 0.0004, + "num_input_tokens_seen": 33019744, + "step": 156465 + }, + { + "epoch": 17.213421342134215, + "grad_norm": 0.050312597304582596, + "learning_rate": 2.8991020972523457e-06, + "loss": 0.0061, + "num_input_tokens_seen": 33020768, + "step": 156470 + }, + { + "epoch": 17.213971397139716, + "grad_norm": 0.05854285508394241, + "learning_rate": 2.897980361901126e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33021792, + "step": 156475 + }, + { + "epoch": 17.214521452145213, + "grad_norm": 0.013223509304225445, + "learning_rate": 2.896858830253621e-06, + "loss": 0.0053, + "num_input_tokens_seen": 33022848, + "step": 156480 + }, + { + "epoch": 17.215071507150714, + "grad_norm": 1.5920300483703613, + "learning_rate": 2.895737502320159e-06, + "loss": 0.1088, + "num_input_tokens_seen": 33023936, + "step": 156485 + }, + { + "epoch": 17.215621562156215, + "grad_norm": 0.04090740531682968, + "learning_rate": 2.8946163781110758e-06, + "loss": 0.0713, + "num_input_tokens_seen": 33024928, + "step": 156490 + }, + { + "epoch": 17.216171617161717, + "grad_norm": 0.003586965147405863, + "learning_rate": 2.893495457636705e-06, + "loss": 0.0006, + "num_input_tokens_seen": 33026048, + "step": 156495 + }, + { + "epoch": 17.216721672167218, + "grad_norm": 0.8700488209724426, + "learning_rate": 2.8923747409073805e-06, + "loss": 0.0494, + "num_input_tokens_seen": 33027072, + "step": 156500 + }, + { + "epoch": 17.21727172717272, + "grad_norm": 2.753004312515259, + "learning_rate": 2.891254227933432e-06, + "loss": 0.0294, + "num_input_tokens_seen": 33028096, + "step": 156505 + }, + { + "epoch": 17.217821782178216, + "grad_norm": 0.014505302533507347, + "learning_rate": 2.890133918725188e-06, + "loss": 0.0029, + "num_input_tokens_seen": 33029088, + "step": 156510 + }, + { + "epoch": 17.218371837183717, + "grad_norm": 0.023227760568261147, + "learning_rate": 2.8890138132929622e-06, + "loss": 0.0057, + "num_input_tokens_seen": 33030240, + "step": 156515 + }, + { + "epoch": 17.21892189218922, + "grad_norm": 0.036832816898822784, + "learning_rate": 2.887893911647091e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33031264, + "step": 156520 + }, + { + "epoch": 17.21947194719472, + "grad_norm": 0.019103804603219032, + "learning_rate": 2.886774213797888e-06, + "loss": 0.1692, + "num_input_tokens_seen": 33032288, + "step": 156525 + }, + { + "epoch": 17.22002200220022, + "grad_norm": 0.008476557210087776, + "learning_rate": 2.885654719755676e-06, + "loss": 0.0091, + "num_input_tokens_seen": 33033376, + "step": 156530 + }, + { + "epoch": 17.22057205720572, + "grad_norm": 0.2683899700641632, + "learning_rate": 2.884535429530777e-06, + "loss": 0.0109, + "num_input_tokens_seen": 33034432, + "step": 156535 + }, + { + "epoch": 17.221122112211223, + "grad_norm": 0.02720623090863228, + "learning_rate": 2.8834163431334995e-06, + "loss": 0.0417, + "num_input_tokens_seen": 33035488, + "step": 156540 + }, + { + "epoch": 17.22167216721672, + "grad_norm": 0.061239443719387054, + "learning_rate": 2.8822974605741655e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33036544, + "step": 156545 + }, + { + "epoch": 17.22222222222222, + "grad_norm": 0.028804456815123558, + "learning_rate": 2.8811787818630754e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33037504, + "step": 156550 + }, + { + "epoch": 17.222772277227723, + "grad_norm": 2.597970724105835, + "learning_rate": 2.880060307010549e-06, + "loss": 0.0187, + "num_input_tokens_seen": 33038528, + "step": 156555 + }, + { + "epoch": 17.223322332233224, + "grad_norm": 0.4448929727077484, + "learning_rate": 2.8789420360268964e-06, + "loss": 0.0048, + "num_input_tokens_seen": 33039552, + "step": 156560 + }, + { + "epoch": 17.223872387238725, + "grad_norm": 0.09529586136341095, + "learning_rate": 2.8778239689224163e-06, + "loss": 0.048, + "num_input_tokens_seen": 33040640, + "step": 156565 + }, + { + "epoch": 17.224422442244226, + "grad_norm": 0.008512354455888271, + "learning_rate": 2.8767061057074245e-06, + "loss": 0.0037, + "num_input_tokens_seen": 33041632, + "step": 156570 + }, + { + "epoch": 17.224972497249723, + "grad_norm": 0.09313599765300751, + "learning_rate": 2.8755884463922134e-06, + "loss": 0.0337, + "num_input_tokens_seen": 33042720, + "step": 156575 + }, + { + "epoch": 17.225522552255224, + "grad_norm": 0.03303100913763046, + "learning_rate": 2.8744709909870824e-06, + "loss": 0.0681, + "num_input_tokens_seen": 33043840, + "step": 156580 + }, + { + "epoch": 17.226072607260726, + "grad_norm": 0.07338108867406845, + "learning_rate": 2.873353739502338e-06, + "loss": 0.008, + "num_input_tokens_seen": 33044928, + "step": 156585 + }, + { + "epoch": 17.226622662266227, + "grad_norm": 0.08808336406946182, + "learning_rate": 2.872236691948271e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33045952, + "step": 156590 + }, + { + "epoch": 17.227172717271728, + "grad_norm": 0.11891581118106842, + "learning_rate": 2.8711198483351882e-06, + "loss": 0.053, + "num_input_tokens_seen": 33046976, + "step": 156595 + }, + { + "epoch": 17.22772277227723, + "grad_norm": 0.014243464916944504, + "learning_rate": 2.8700032086733745e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33048064, + "step": 156600 + }, + { + "epoch": 17.22827282728273, + "grad_norm": 0.04273172840476036, + "learning_rate": 2.8688867729731172e-06, + "loss": 0.0032, + "num_input_tokens_seen": 33049056, + "step": 156605 + }, + { + "epoch": 17.228822882288227, + "grad_norm": 2.4823429584503174, + "learning_rate": 2.867770541244713e-06, + "loss": 0.0488, + "num_input_tokens_seen": 33050112, + "step": 156610 + }, + { + "epoch": 17.22937293729373, + "grad_norm": 0.1410103142261505, + "learning_rate": 2.8666545134984455e-06, + "loss": 0.0716, + "num_input_tokens_seen": 33051136, + "step": 156615 + }, + { + "epoch": 17.22992299229923, + "grad_norm": 1.72184157371521, + "learning_rate": 2.8655386897446067e-06, + "loss": 0.0084, + "num_input_tokens_seen": 33052160, + "step": 156620 + }, + { + "epoch": 17.23047304730473, + "grad_norm": 0.2101961225271225, + "learning_rate": 2.86442306999348e-06, + "loss": 0.003, + "num_input_tokens_seen": 33053184, + "step": 156625 + }, + { + "epoch": 17.231023102310232, + "grad_norm": 0.015054072253406048, + "learning_rate": 2.863307654255337e-06, + "loss": 0.0037, + "num_input_tokens_seen": 33054208, + "step": 156630 + }, + { + "epoch": 17.231573157315733, + "grad_norm": 2.390960931777954, + "learning_rate": 2.8621924425404708e-06, + "loss": 0.0359, + "num_input_tokens_seen": 33055168, + "step": 156635 + }, + { + "epoch": 17.23212321232123, + "grad_norm": 0.03982776030898094, + "learning_rate": 2.861077434859147e-06, + "loss": 0.0327, + "num_input_tokens_seen": 33056192, + "step": 156640 + }, + { + "epoch": 17.23267326732673, + "grad_norm": 0.1631932109594345, + "learning_rate": 2.8599626312216522e-06, + "loss": 0.0112, + "num_input_tokens_seen": 33057248, + "step": 156645 + }, + { + "epoch": 17.233223322332233, + "grad_norm": 0.03173629194498062, + "learning_rate": 2.8588480316382617e-06, + "loss": 0.0072, + "num_input_tokens_seen": 33058272, + "step": 156650 + }, + { + "epoch": 17.233773377337734, + "grad_norm": 0.033376727253198624, + "learning_rate": 2.857733636119239e-06, + "loss": 0.0044, + "num_input_tokens_seen": 33059328, + "step": 156655 + }, + { + "epoch": 17.234323432343235, + "grad_norm": 0.026002073660492897, + "learning_rate": 2.8566194446748656e-06, + "loss": 0.0377, + "num_input_tokens_seen": 33060384, + "step": 156660 + }, + { + "epoch": 17.234873487348736, + "grad_norm": 0.1707322895526886, + "learning_rate": 2.8555054573153993e-06, + "loss": 0.0036, + "num_input_tokens_seen": 33061408, + "step": 156665 + }, + { + "epoch": 17.235423542354237, + "grad_norm": 0.04285755380988121, + "learning_rate": 2.8543916740511157e-06, + "loss": 0.0041, + "num_input_tokens_seen": 33062464, + "step": 156670 + }, + { + "epoch": 17.235973597359735, + "grad_norm": 0.007331534754484892, + "learning_rate": 2.8532780948922812e-06, + "loss": 0.0032, + "num_input_tokens_seen": 33063456, + "step": 156675 + }, + { + "epoch": 17.236523652365236, + "grad_norm": 0.007332897745072842, + "learning_rate": 2.8521647198491514e-06, + "loss": 0.0058, + "num_input_tokens_seen": 33064512, + "step": 156680 + }, + { + "epoch": 17.237073707370737, + "grad_norm": 0.009644296951591969, + "learning_rate": 2.851051548931996e-06, + "loss": 0.0062, + "num_input_tokens_seen": 33065568, + "step": 156685 + }, + { + "epoch": 17.237623762376238, + "grad_norm": 1.44060218334198, + "learning_rate": 2.8499385821510654e-06, + "loss": 0.0136, + "num_input_tokens_seen": 33066656, + "step": 156690 + }, + { + "epoch": 17.23817381738174, + "grad_norm": 0.03570699691772461, + "learning_rate": 2.848825819516629e-06, + "loss": 0.0794, + "num_input_tokens_seen": 33067712, + "step": 156695 + }, + { + "epoch": 17.23872387238724, + "grad_norm": 0.028360407799482346, + "learning_rate": 2.8477132610389284e-06, + "loss": 0.0032, + "num_input_tokens_seen": 33068736, + "step": 156700 + }, + { + "epoch": 17.239273927392738, + "grad_norm": 0.034544140100479126, + "learning_rate": 2.8466009067282252e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33069824, + "step": 156705 + }, + { + "epoch": 17.23982398239824, + "grad_norm": 0.5920581817626953, + "learning_rate": 2.8454887565947776e-06, + "loss": 0.0491, + "num_input_tokens_seen": 33070944, + "step": 156710 + }, + { + "epoch": 17.24037403740374, + "grad_norm": 1.5381954908370972, + "learning_rate": 2.8443768106488215e-06, + "loss": 0.0198, + "num_input_tokens_seen": 33071968, + "step": 156715 + }, + { + "epoch": 17.24092409240924, + "grad_norm": 0.006171696819365025, + "learning_rate": 2.8432650689006216e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33073088, + "step": 156720 + }, + { + "epoch": 17.241474147414742, + "grad_norm": 0.018015267327427864, + "learning_rate": 2.8421535313604085e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33074080, + "step": 156725 + }, + { + "epoch": 17.242024202420243, + "grad_norm": 0.009914328344166279, + "learning_rate": 2.841042198038435e-06, + "loss": 0.0364, + "num_input_tokens_seen": 33075104, + "step": 156730 + }, + { + "epoch": 17.242574257425744, + "grad_norm": 0.011373050510883331, + "learning_rate": 2.839931068944948e-06, + "loss": 0.0558, + "num_input_tokens_seen": 33076128, + "step": 156735 + }, + { + "epoch": 17.24312431243124, + "grad_norm": 0.09716765582561493, + "learning_rate": 2.838820144090182e-06, + "loss": 0.0247, + "num_input_tokens_seen": 33077248, + "step": 156740 + }, + { + "epoch": 17.243674367436743, + "grad_norm": 2.0959506034851074, + "learning_rate": 2.837709423484372e-06, + "loss": 0.108, + "num_input_tokens_seen": 33078304, + "step": 156745 + }, + { + "epoch": 17.244224422442244, + "grad_norm": 0.19120071828365326, + "learning_rate": 2.8365989071377636e-06, + "loss": 0.0041, + "num_input_tokens_seen": 33079328, + "step": 156750 + }, + { + "epoch": 17.244774477447745, + "grad_norm": 0.039392467588186264, + "learning_rate": 2.835488595060584e-06, + "loss": 0.0764, + "num_input_tokens_seen": 33080352, + "step": 156755 + }, + { + "epoch": 17.245324532453246, + "grad_norm": 0.012734136544167995, + "learning_rate": 2.834378487263073e-06, + "loss": 0.0098, + "num_input_tokens_seen": 33081408, + "step": 156760 + }, + { + "epoch": 17.245874587458747, + "grad_norm": 0.20478123426437378, + "learning_rate": 2.8332685837554608e-06, + "loss": 0.0046, + "num_input_tokens_seen": 33082464, + "step": 156765 + }, + { + "epoch": 17.246424642464245, + "grad_norm": 0.09901612251996994, + "learning_rate": 2.8321588845479724e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33083456, + "step": 156770 + }, + { + "epoch": 17.246974697469746, + "grad_norm": 2.0711584091186523, + "learning_rate": 2.8310493896508415e-06, + "loss": 0.1165, + "num_input_tokens_seen": 33084480, + "step": 156775 + }, + { + "epoch": 17.247524752475247, + "grad_norm": 0.024411696940660477, + "learning_rate": 2.829940099074288e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33085504, + "step": 156780 + }, + { + "epoch": 17.248074807480748, + "grad_norm": 0.030171161517500877, + "learning_rate": 2.828831012828537e-06, + "loss": 0.0031, + "num_input_tokens_seen": 33086560, + "step": 156785 + }, + { + "epoch": 17.24862486248625, + "grad_norm": 0.006060597952455282, + "learning_rate": 2.8277221309238157e-06, + "loss": 0.0583, + "num_input_tokens_seen": 33087616, + "step": 156790 + }, + { + "epoch": 17.24917491749175, + "grad_norm": 3.4228835105895996, + "learning_rate": 2.8266134533703386e-06, + "loss": 0.0591, + "num_input_tokens_seen": 33088736, + "step": 156795 + }, + { + "epoch": 17.24972497249725, + "grad_norm": 0.024000879377126694, + "learning_rate": 2.8255049801783285e-06, + "loss": 0.0351, + "num_input_tokens_seen": 33089856, + "step": 156800 + }, + { + "epoch": 17.25027502750275, + "grad_norm": 0.0888313427567482, + "learning_rate": 2.8243967113579934e-06, + "loss": 0.095, + "num_input_tokens_seen": 33090880, + "step": 156805 + }, + { + "epoch": 17.25082508250825, + "grad_norm": 0.39462387561798096, + "learning_rate": 2.8232886469195587e-06, + "loss": 0.0083, + "num_input_tokens_seen": 33092000, + "step": 156810 + }, + { + "epoch": 17.25137513751375, + "grad_norm": 0.024289146065711975, + "learning_rate": 2.8221807868732243e-06, + "loss": 0.0287, + "num_input_tokens_seen": 33093024, + "step": 156815 + }, + { + "epoch": 17.251925192519252, + "grad_norm": 0.01680346578359604, + "learning_rate": 2.821073131229207e-06, + "loss": 0.0009, + "num_input_tokens_seen": 33094112, + "step": 156820 + }, + { + "epoch": 17.252475247524753, + "grad_norm": 0.28354620933532715, + "learning_rate": 2.819965679997724e-06, + "loss": 0.0036, + "num_input_tokens_seen": 33095136, + "step": 156825 + }, + { + "epoch": 17.253025302530254, + "grad_norm": 0.05443674698472023, + "learning_rate": 2.81885843318897e-06, + "loss": 0.0047, + "num_input_tokens_seen": 33096192, + "step": 156830 + }, + { + "epoch": 17.253575357535752, + "grad_norm": 0.047035202383995056, + "learning_rate": 2.817751390813156e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33097248, + "step": 156835 + }, + { + "epoch": 17.254125412541253, + "grad_norm": 3.6038076877593994, + "learning_rate": 2.816644552880479e-06, + "loss": 0.0632, + "num_input_tokens_seen": 33098272, + "step": 156840 + }, + { + "epoch": 17.254675467546754, + "grad_norm": 0.4762333631515503, + "learning_rate": 2.8155379194011455e-06, + "loss": 0.0191, + "num_input_tokens_seen": 33099328, + "step": 156845 + }, + { + "epoch": 17.255225522552255, + "grad_norm": 0.6439802050590515, + "learning_rate": 2.814431490385361e-06, + "loss": 0.0172, + "num_input_tokens_seen": 33100384, + "step": 156850 + }, + { + "epoch": 17.255775577557756, + "grad_norm": 0.04827656224370003, + "learning_rate": 2.8133252658433084e-06, + "loss": 0.0018, + "num_input_tokens_seen": 33101376, + "step": 156855 + }, + { + "epoch": 17.256325632563257, + "grad_norm": 1.6729316711425781, + "learning_rate": 2.8122192457851942e-06, + "loss": 0.0377, + "num_input_tokens_seen": 33102400, + "step": 156860 + }, + { + "epoch": 17.25687568756876, + "grad_norm": 0.0389794185757637, + "learning_rate": 2.81111343022121e-06, + "loss": 0.0064, + "num_input_tokens_seen": 33103424, + "step": 156865 + }, + { + "epoch": 17.257425742574256, + "grad_norm": 0.0065015265718102455, + "learning_rate": 2.810007819161542e-06, + "loss": 0.0736, + "num_input_tokens_seen": 33104576, + "step": 156870 + }, + { + "epoch": 17.257975797579757, + "grad_norm": 0.072983019053936, + "learning_rate": 2.8089024126163822e-06, + "loss": 0.0085, + "num_input_tokens_seen": 33105600, + "step": 156875 + }, + { + "epoch": 17.258525852585258, + "grad_norm": 0.027376322075724602, + "learning_rate": 2.8077972105959223e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33106624, + "step": 156880 + }, + { + "epoch": 17.25907590759076, + "grad_norm": 0.4308309853076935, + "learning_rate": 2.8066922131103512e-06, + "loss": 0.0693, + "num_input_tokens_seen": 33107648, + "step": 156885 + }, + { + "epoch": 17.25962596259626, + "grad_norm": 0.3966098129749298, + "learning_rate": 2.80558742016985e-06, + "loss": 0.013, + "num_input_tokens_seen": 33108768, + "step": 156890 + }, + { + "epoch": 17.26017601760176, + "grad_norm": 0.00653727725148201, + "learning_rate": 2.804482831784591e-06, + "loss": 0.0008, + "num_input_tokens_seen": 33109792, + "step": 156895 + }, + { + "epoch": 17.260726072607262, + "grad_norm": 0.19880519807338715, + "learning_rate": 2.8033784479647663e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33110752, + "step": 156900 + }, + { + "epoch": 17.26127612761276, + "grad_norm": 0.008535790257155895, + "learning_rate": 2.8022742687205507e-06, + "loss": 0.0539, + "num_input_tokens_seen": 33111808, + "step": 156905 + }, + { + "epoch": 17.26182618261826, + "grad_norm": 0.12157001346349716, + "learning_rate": 2.8011702940621248e-06, + "loss": 0.002, + "num_input_tokens_seen": 33112768, + "step": 156910 + }, + { + "epoch": 17.262376237623762, + "grad_norm": 0.06500892341136932, + "learning_rate": 2.8000665239996644e-06, + "loss": 0.0018, + "num_input_tokens_seen": 33113792, + "step": 156915 + }, + { + "epoch": 17.262926292629263, + "grad_norm": 0.00912471953779459, + "learning_rate": 2.7989629585433304e-06, + "loss": 0.0053, + "num_input_tokens_seen": 33114816, + "step": 156920 + }, + { + "epoch": 17.263476347634764, + "grad_norm": 0.00572294183075428, + "learning_rate": 2.797859597703306e-06, + "loss": 0.0217, + "num_input_tokens_seen": 33115904, + "step": 156925 + }, + { + "epoch": 17.264026402640265, + "grad_norm": 0.012676793150603771, + "learning_rate": 2.7967564414897535e-06, + "loss": 0.1375, + "num_input_tokens_seen": 33116928, + "step": 156930 + }, + { + "epoch": 17.264576457645763, + "grad_norm": 0.1262732744216919, + "learning_rate": 2.795653489912839e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33118016, + "step": 156935 + }, + { + "epoch": 17.265126512651264, + "grad_norm": 0.008515141904354095, + "learning_rate": 2.7945507429827405e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33119040, + "step": 156940 + }, + { + "epoch": 17.265676567656765, + "grad_norm": 0.005506074987351894, + "learning_rate": 2.793448200709606e-06, + "loss": 0.0643, + "num_input_tokens_seen": 33120096, + "step": 156945 + }, + { + "epoch": 17.266226622662266, + "grad_norm": 0.30793094635009766, + "learning_rate": 2.79234586310361e-06, + "loss": 0.0047, + "num_input_tokens_seen": 33121184, + "step": 156950 + }, + { + "epoch": 17.266776677667767, + "grad_norm": 0.02551114186644554, + "learning_rate": 2.7912437301749026e-06, + "loss": 0.0052, + "num_input_tokens_seen": 33122208, + "step": 156955 + }, + { + "epoch": 17.26732673267327, + "grad_norm": 0.005078549031168222, + "learning_rate": 2.790141801933638e-06, + "loss": 0.2685, + "num_input_tokens_seen": 33123264, + "step": 156960 + }, + { + "epoch": 17.26787678767877, + "grad_norm": 0.00301856710575521, + "learning_rate": 2.7890400783899873e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33124320, + "step": 156965 + }, + { + "epoch": 17.268426842684267, + "grad_norm": 0.43171021342277527, + "learning_rate": 2.787938559554093e-06, + "loss": 0.0315, + "num_input_tokens_seen": 33125472, + "step": 156970 + }, + { + "epoch": 17.268976897689768, + "grad_norm": 1.0388391017913818, + "learning_rate": 2.7868372454361136e-06, + "loss": 0.1153, + "num_input_tokens_seen": 33126528, + "step": 156975 + }, + { + "epoch": 17.26952695269527, + "grad_norm": 0.002998501295223832, + "learning_rate": 2.7857361360461994e-06, + "loss": 0.0086, + "num_input_tokens_seen": 33127616, + "step": 156980 + }, + { + "epoch": 17.27007700770077, + "grad_norm": 3.2263875007629395, + "learning_rate": 2.784635231394489e-06, + "loss": 0.0252, + "num_input_tokens_seen": 33128608, + "step": 156985 + }, + { + "epoch": 17.27062706270627, + "grad_norm": 4.769383907318115, + "learning_rate": 2.783534531491136e-06, + "loss": 0.2078, + "num_input_tokens_seen": 33129664, + "step": 156990 + }, + { + "epoch": 17.271177117711773, + "grad_norm": 0.07354626804590225, + "learning_rate": 2.782434036346282e-06, + "loss": 0.0066, + "num_input_tokens_seen": 33130720, + "step": 156995 + }, + { + "epoch": 17.27172717271727, + "grad_norm": 0.012216931208968163, + "learning_rate": 2.78133374597008e-06, + "loss": 0.0247, + "num_input_tokens_seen": 33131776, + "step": 157000 + }, + { + "epoch": 17.27227722772277, + "grad_norm": 0.019626732915639877, + "learning_rate": 2.780233660372664e-06, + "loss": 0.0008, + "num_input_tokens_seen": 33132864, + "step": 157005 + }, + { + "epoch": 17.272827282728272, + "grad_norm": 0.1506660282611847, + "learning_rate": 2.7791337795641663e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33133856, + "step": 157010 + }, + { + "epoch": 17.273377337733773, + "grad_norm": 0.04020017385482788, + "learning_rate": 2.778034103554736e-06, + "loss": 0.0922, + "num_input_tokens_seen": 33134848, + "step": 157015 + }, + { + "epoch": 17.273927392739274, + "grad_norm": 0.03932274132966995, + "learning_rate": 2.7769346323544916e-06, + "loss": 0.0008, + "num_input_tokens_seen": 33135904, + "step": 157020 + }, + { + "epoch": 17.274477447744776, + "grad_norm": 0.02102500945329666, + "learning_rate": 2.7758353659735865e-06, + "loss": 0.0572, + "num_input_tokens_seen": 33136960, + "step": 157025 + }, + { + "epoch": 17.275027502750277, + "grad_norm": 0.009209120646119118, + "learning_rate": 2.774736304422143e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33137952, + "step": 157030 + }, + { + "epoch": 17.275577557755774, + "grad_norm": 0.06873054802417755, + "learning_rate": 2.7736374477102837e-06, + "loss": 0.0995, + "num_input_tokens_seen": 33139072, + "step": 157035 + }, + { + "epoch": 17.276127612761275, + "grad_norm": 0.02078871987760067, + "learning_rate": 2.772538795848151e-06, + "loss": 0.0536, + "num_input_tokens_seen": 33140096, + "step": 157040 + }, + { + "epoch": 17.276677667766776, + "grad_norm": 0.04302217438817024, + "learning_rate": 2.7714403488458552e-06, + "loss": 0.0628, + "num_input_tokens_seen": 33141088, + "step": 157045 + }, + { + "epoch": 17.277227722772277, + "grad_norm": 0.02270307019352913, + "learning_rate": 2.770342106713528e-06, + "loss": 0.1045, + "num_input_tokens_seen": 33142112, + "step": 157050 + }, + { + "epoch": 17.27777777777778, + "grad_norm": 0.013658472336828709, + "learning_rate": 2.7692440694612964e-06, + "loss": 0.0118, + "num_input_tokens_seen": 33143168, + "step": 157055 + }, + { + "epoch": 17.27832783278328, + "grad_norm": 0.005152885336428881, + "learning_rate": 2.7681462370992673e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33144256, + "step": 157060 + }, + { + "epoch": 17.278877887788777, + "grad_norm": 0.03811713680624962, + "learning_rate": 2.7670486096375735e-06, + "loss": 0.002, + "num_input_tokens_seen": 33145280, + "step": 157065 + }, + { + "epoch": 17.27942794279428, + "grad_norm": 0.072299525141716, + "learning_rate": 2.7659511870863185e-06, + "loss": 0.0029, + "num_input_tokens_seen": 33146240, + "step": 157070 + }, + { + "epoch": 17.27997799779978, + "grad_norm": 0.01717950589954853, + "learning_rate": 2.7648539694556302e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33147296, + "step": 157075 + }, + { + "epoch": 17.28052805280528, + "grad_norm": 0.44440096616744995, + "learning_rate": 2.763756956755606e-06, + "loss": 0.0061, + "num_input_tokens_seen": 33148320, + "step": 157080 + }, + { + "epoch": 17.28107810781078, + "grad_norm": 0.6858662962913513, + "learning_rate": 2.762660148996363e-06, + "loss": 0.0553, + "num_input_tokens_seen": 33149344, + "step": 157085 + }, + { + "epoch": 17.281628162816283, + "grad_norm": 0.12854427099227905, + "learning_rate": 2.7615635461880172e-06, + "loss": 0.0226, + "num_input_tokens_seen": 33150496, + "step": 157090 + }, + { + "epoch": 17.282178217821784, + "grad_norm": 2.095148801803589, + "learning_rate": 2.7604671483406647e-06, + "loss": 0.0092, + "num_input_tokens_seen": 33151552, + "step": 157095 + }, + { + "epoch": 17.28272827282728, + "grad_norm": 0.032498378306627274, + "learning_rate": 2.759370955464419e-06, + "loss": 0.003, + "num_input_tokens_seen": 33152576, + "step": 157100 + }, + { + "epoch": 17.283278327832782, + "grad_norm": 0.004034789279103279, + "learning_rate": 2.758274967569374e-06, + "loss": 0.0327, + "num_input_tokens_seen": 33153728, + "step": 157105 + }, + { + "epoch": 17.283828382838283, + "grad_norm": 0.04930119216442108, + "learning_rate": 2.757179184665637e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33154784, + "step": 157110 + }, + { + "epoch": 17.284378437843785, + "grad_norm": 0.02024225704371929, + "learning_rate": 2.75608360676331e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33155840, + "step": 157115 + }, + { + "epoch": 17.284928492849286, + "grad_norm": 0.015927081927657127, + "learning_rate": 2.7549882338724823e-06, + "loss": 0.0052, + "num_input_tokens_seen": 33156832, + "step": 157120 + }, + { + "epoch": 17.285478547854787, + "grad_norm": 0.38394561409950256, + "learning_rate": 2.7538930660032603e-06, + "loss": 0.0128, + "num_input_tokens_seen": 33157888, + "step": 157125 + }, + { + "epoch": 17.286028602860284, + "grad_norm": 2.518420934677124, + "learning_rate": 2.7527981031657298e-06, + "loss": 0.0385, + "num_input_tokens_seen": 33158912, + "step": 157130 + }, + { + "epoch": 17.286578657865785, + "grad_norm": 0.017294522374868393, + "learning_rate": 2.7517033453699743e-06, + "loss": 0.0006, + "num_input_tokens_seen": 33159968, + "step": 157135 + }, + { + "epoch": 17.287128712871286, + "grad_norm": 4.724877834320068, + "learning_rate": 2.7506087926261027e-06, + "loss": 0.0534, + "num_input_tokens_seen": 33160992, + "step": 157140 + }, + { + "epoch": 17.287678767876788, + "grad_norm": 0.05986678972840309, + "learning_rate": 2.7495144449441927e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33162048, + "step": 157145 + }, + { + "epoch": 17.28822882288229, + "grad_norm": 1.487819790840149, + "learning_rate": 2.7484203023343285e-06, + "loss": 0.2267, + "num_input_tokens_seen": 33163104, + "step": 157150 + }, + { + "epoch": 17.28877887788779, + "grad_norm": 0.015171905048191547, + "learning_rate": 2.747326364806599e-06, + "loss": 0.0895, + "num_input_tokens_seen": 33164192, + "step": 157155 + }, + { + "epoch": 17.28932893289329, + "grad_norm": 0.016733277589082718, + "learning_rate": 2.746232632371082e-06, + "loss": 0.0007, + "num_input_tokens_seen": 33165216, + "step": 157160 + }, + { + "epoch": 17.28987898789879, + "grad_norm": 0.1732492744922638, + "learning_rate": 2.745139105037861e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33166240, + "step": 157165 + }, + { + "epoch": 17.29042904290429, + "grad_norm": 0.03362726792693138, + "learning_rate": 2.7440457828170172e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33167296, + "step": 157170 + }, + { + "epoch": 17.29097909790979, + "grad_norm": 0.14132653176784515, + "learning_rate": 2.7429526657186206e-06, + "loss": 0.0044, + "num_input_tokens_seen": 33168384, + "step": 157175 + }, + { + "epoch": 17.29152915291529, + "grad_norm": 0.027314797043800354, + "learning_rate": 2.7418597537527514e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33169440, + "step": 157180 + }, + { + "epoch": 17.292079207920793, + "grad_norm": 2.7397003173828125, + "learning_rate": 2.7407670469294765e-06, + "loss": 0.0948, + "num_input_tokens_seen": 33170560, + "step": 157185 + }, + { + "epoch": 17.292629262926294, + "grad_norm": 0.009056114591658115, + "learning_rate": 2.7396745452588775e-06, + "loss": 0.0064, + "num_input_tokens_seen": 33171616, + "step": 157190 + }, + { + "epoch": 17.293179317931795, + "grad_norm": 0.05276939645409584, + "learning_rate": 2.738582248751012e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33172736, + "step": 157195 + }, + { + "epoch": 17.293729372937293, + "grad_norm": 0.003408826654776931, + "learning_rate": 2.7374901574159506e-06, + "loss": 0.0007, + "num_input_tokens_seen": 33173728, + "step": 157200 + }, + { + "epoch": 17.294279427942794, + "grad_norm": 0.014129401184618473, + "learning_rate": 2.7363982712637626e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33174816, + "step": 157205 + }, + { + "epoch": 17.294829482948295, + "grad_norm": 0.113923080265522, + "learning_rate": 2.7353065903045065e-06, + "loss": 0.004, + "num_input_tokens_seen": 33175840, + "step": 157210 + }, + { + "epoch": 17.295379537953796, + "grad_norm": 0.07465425878763199, + "learning_rate": 2.7342151145482497e-06, + "loss": 0.0318, + "num_input_tokens_seen": 33176864, + "step": 157215 + }, + { + "epoch": 17.295929592959297, + "grad_norm": 0.06698448210954666, + "learning_rate": 2.7331238440050447e-06, + "loss": 0.0528, + "num_input_tokens_seen": 33177888, + "step": 157220 + }, + { + "epoch": 17.296479647964798, + "grad_norm": 0.013534574769437313, + "learning_rate": 2.732032778684951e-06, + "loss": 0.0064, + "num_input_tokens_seen": 33178912, + "step": 157225 + }, + { + "epoch": 17.297029702970296, + "grad_norm": 0.005687593948096037, + "learning_rate": 2.7309419185980317e-06, + "loss": 0.0335, + "num_input_tokens_seen": 33179968, + "step": 157230 + }, + { + "epoch": 17.297579757975797, + "grad_norm": 0.19345858693122864, + "learning_rate": 2.7298512637543292e-06, + "loss": 0.0331, + "num_input_tokens_seen": 33181024, + "step": 157235 + }, + { + "epoch": 17.298129812981298, + "grad_norm": 0.006071138195693493, + "learning_rate": 2.7287608141639055e-06, + "loss": 0.0923, + "num_input_tokens_seen": 33182112, + "step": 157240 + }, + { + "epoch": 17.2986798679868, + "grad_norm": 0.012689685449004173, + "learning_rate": 2.7276705698368097e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33183136, + "step": 157245 + }, + { + "epoch": 17.2992299229923, + "grad_norm": 0.08160439878702164, + "learning_rate": 2.7265805307830796e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33184128, + "step": 157250 + }, + { + "epoch": 17.2997799779978, + "grad_norm": 0.012554909102618694, + "learning_rate": 2.72549069701277e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33185184, + "step": 157255 + }, + { + "epoch": 17.300330033003302, + "grad_norm": 0.005755644291639328, + "learning_rate": 2.724401068535923e-06, + "loss": 0.0078, + "num_input_tokens_seen": 33186240, + "step": 157260 + }, + { + "epoch": 17.3008800880088, + "grad_norm": 0.015992416068911552, + "learning_rate": 2.723311645362589e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33187296, + "step": 157265 + }, + { + "epoch": 17.3014301430143, + "grad_norm": 0.0509084090590477, + "learning_rate": 2.7222224275027993e-06, + "loss": 0.0007, + "num_input_tokens_seen": 33188384, + "step": 157270 + }, + { + "epoch": 17.301980198019802, + "grad_norm": 0.03280682489275932, + "learning_rate": 2.7211334149665925e-06, + "loss": 0.0035, + "num_input_tokens_seen": 33189408, + "step": 157275 + }, + { + "epoch": 17.302530253025303, + "grad_norm": 0.1287737786769867, + "learning_rate": 2.7200446077640105e-06, + "loss": 0.0104, + "num_input_tokens_seen": 33190432, + "step": 157280 + }, + { + "epoch": 17.303080308030804, + "grad_norm": 0.004942585714161396, + "learning_rate": 2.718956005905085e-06, + "loss": 0.0126, + "num_input_tokens_seen": 33191456, + "step": 157285 + }, + { + "epoch": 17.303630363036305, + "grad_norm": 0.020896341651678085, + "learning_rate": 2.7178676093998546e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33192512, + "step": 157290 + }, + { + "epoch": 17.304180418041803, + "grad_norm": 0.0050632585771381855, + "learning_rate": 2.7167794182583472e-06, + "loss": 0.0009, + "num_input_tokens_seen": 33193568, + "step": 157295 + }, + { + "epoch": 17.304730473047304, + "grad_norm": 0.027004269883036613, + "learning_rate": 2.7156914324905858e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33194560, + "step": 157300 + }, + { + "epoch": 17.305280528052805, + "grad_norm": 2.812403440475464, + "learning_rate": 2.714603652106609e-06, + "loss": 0.0122, + "num_input_tokens_seen": 33195648, + "step": 157305 + }, + { + "epoch": 17.305830583058306, + "grad_norm": 0.2725857198238373, + "learning_rate": 2.713516077116429e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33196640, + "step": 157310 + }, + { + "epoch": 17.306380638063807, + "grad_norm": 0.00708021130412817, + "learning_rate": 2.712428707530082e-06, + "loss": 0.0103, + "num_input_tokens_seen": 33197760, + "step": 157315 + }, + { + "epoch": 17.306930693069308, + "grad_norm": 0.005503151565790176, + "learning_rate": 2.7113415433575875e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33198880, + "step": 157320 + }, + { + "epoch": 17.30748074807481, + "grad_norm": 0.050814803689718246, + "learning_rate": 2.7102545846089567e-06, + "loss": 0.0424, + "num_input_tokens_seen": 33200000, + "step": 157325 + }, + { + "epoch": 17.308030803080307, + "grad_norm": 0.051365215331315994, + "learning_rate": 2.709167831294221e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33201088, + "step": 157330 + }, + { + "epoch": 17.308580858085808, + "grad_norm": 0.06031981483101845, + "learning_rate": 2.708081283423383e-06, + "loss": 0.04, + "num_input_tokens_seen": 33202144, + "step": 157335 + }, + { + "epoch": 17.30913091309131, + "grad_norm": 1.0923633575439453, + "learning_rate": 2.706994941006463e-06, + "loss": 0.1083, + "num_input_tokens_seen": 33203200, + "step": 157340 + }, + { + "epoch": 17.30968096809681, + "grad_norm": 0.10109291970729828, + "learning_rate": 2.705908804053478e-06, + "loss": 0.0258, + "num_input_tokens_seen": 33204288, + "step": 157345 + }, + { + "epoch": 17.31023102310231, + "grad_norm": 0.02185058407485485, + "learning_rate": 2.7048228725744305e-06, + "loss": 0.0654, + "num_input_tokens_seen": 33205344, + "step": 157350 + }, + { + "epoch": 17.310781078107812, + "grad_norm": 2.7453415393829346, + "learning_rate": 2.7037371465793383e-06, + "loss": 0.0905, + "num_input_tokens_seen": 33206464, + "step": 157355 + }, + { + "epoch": 17.31133113311331, + "grad_norm": 0.020576145499944687, + "learning_rate": 2.702651626078198e-06, + "loss": 0.0315, + "num_input_tokens_seen": 33207552, + "step": 157360 + }, + { + "epoch": 17.31188118811881, + "grad_norm": 0.9725101590156555, + "learning_rate": 2.701566311081016e-06, + "loss": 0.012, + "num_input_tokens_seen": 33208640, + "step": 157365 + }, + { + "epoch": 17.312431243124312, + "grad_norm": 0.055238887667655945, + "learning_rate": 2.7004812015977987e-06, + "loss": 0.0978, + "num_input_tokens_seen": 33209664, + "step": 157370 + }, + { + "epoch": 17.312981298129813, + "grad_norm": 0.04800006002187729, + "learning_rate": 2.699396297638543e-06, + "loss": 0.0028, + "num_input_tokens_seen": 33210688, + "step": 157375 + }, + { + "epoch": 17.313531353135314, + "grad_norm": 0.03405315428972244, + "learning_rate": 2.6983115992132574e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33211744, + "step": 157380 + }, + { + "epoch": 17.314081408140815, + "grad_norm": 0.012374971061944962, + "learning_rate": 2.6972271063319315e-06, + "loss": 0.1488, + "num_input_tokens_seen": 33212832, + "step": 157385 + }, + { + "epoch": 17.314631463146316, + "grad_norm": 0.3033736050128937, + "learning_rate": 2.6961428190045574e-06, + "loss": 0.1382, + "num_input_tokens_seen": 33213920, + "step": 157390 + }, + { + "epoch": 17.315181518151814, + "grad_norm": 0.025388013571500778, + "learning_rate": 2.6950587372411325e-06, + "loss": 0.0042, + "num_input_tokens_seen": 33214976, + "step": 157395 + }, + { + "epoch": 17.315731573157315, + "grad_norm": 0.011194111779332161, + "learning_rate": 2.693974861051646e-06, + "loss": 0.0516, + "num_input_tokens_seen": 33216000, + "step": 157400 + }, + { + "epoch": 17.316281628162816, + "grad_norm": 0.017944179475307465, + "learning_rate": 2.692891190446098e-06, + "loss": 0.0033, + "num_input_tokens_seen": 33217024, + "step": 157405 + }, + { + "epoch": 17.316831683168317, + "grad_norm": 0.03657006099820137, + "learning_rate": 2.691807725434464e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33218080, + "step": 157410 + }, + { + "epoch": 17.317381738173818, + "grad_norm": 0.08991517126560211, + "learning_rate": 2.690724466026731e-06, + "loss": 0.0157, + "num_input_tokens_seen": 33219104, + "step": 157415 + }, + { + "epoch": 17.31793179317932, + "grad_norm": 0.09381169825792313, + "learning_rate": 2.6896414122328904e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33220192, + "step": 157420 + }, + { + "epoch": 17.318481848184817, + "grad_norm": 0.10054583102464676, + "learning_rate": 2.688558564062915e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33221280, + "step": 157425 + }, + { + "epoch": 17.319031903190318, + "grad_norm": 0.16313089430332184, + "learning_rate": 2.6874759215267887e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33222336, + "step": 157430 + }, + { + "epoch": 17.31958195819582, + "grad_norm": 0.03052734024822712, + "learning_rate": 2.6863934846344974e-06, + "loss": 0.0404, + "num_input_tokens_seen": 33223328, + "step": 157435 + }, + { + "epoch": 17.32013201320132, + "grad_norm": 0.16744454205036163, + "learning_rate": 2.6853112533960027e-06, + "loss": 0.0165, + "num_input_tokens_seen": 33224320, + "step": 157440 + }, + { + "epoch": 17.32068206820682, + "grad_norm": 2.294635772705078, + "learning_rate": 2.6842292278212915e-06, + "loss": 0.0557, + "num_input_tokens_seen": 33225440, + "step": 157445 + }, + { + "epoch": 17.321232123212322, + "grad_norm": 0.007962195202708244, + "learning_rate": 2.683147407920328e-06, + "loss": 0.0036, + "num_input_tokens_seen": 33226560, + "step": 157450 + }, + { + "epoch": 17.321782178217823, + "grad_norm": 0.01408424973487854, + "learning_rate": 2.682065793703084e-06, + "loss": 0.0298, + "num_input_tokens_seen": 33227680, + "step": 157455 + }, + { + "epoch": 17.32233223322332, + "grad_norm": 3.527298927307129, + "learning_rate": 2.6809843851795357e-06, + "loss": 0.108, + "num_input_tokens_seen": 33228672, + "step": 157460 + }, + { + "epoch": 17.322882288228822, + "grad_norm": 0.6949889063835144, + "learning_rate": 2.6799031823596416e-06, + "loss": 0.0058, + "num_input_tokens_seen": 33229728, + "step": 157465 + }, + { + "epoch": 17.323432343234323, + "grad_norm": 0.14489461481571198, + "learning_rate": 2.6788221852533713e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33230784, + "step": 157470 + }, + { + "epoch": 17.323982398239824, + "grad_norm": 0.008056378923356533, + "learning_rate": 2.6777413938706837e-06, + "loss": 0.0052, + "num_input_tokens_seen": 33231872, + "step": 157475 + }, + { + "epoch": 17.324532453245325, + "grad_norm": 0.013125802390277386, + "learning_rate": 2.676660808221548e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33232992, + "step": 157480 + }, + { + "epoch": 17.325082508250826, + "grad_norm": 0.015063822269439697, + "learning_rate": 2.6755804283159103e-06, + "loss": 0.0544, + "num_input_tokens_seen": 33234016, + "step": 157485 + }, + { + "epoch": 17.325632563256324, + "grad_norm": 0.04788148030638695, + "learning_rate": 2.6745002541637394e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33235104, + "step": 157490 + }, + { + "epoch": 17.326182618261825, + "grad_norm": 0.057620082050561905, + "learning_rate": 2.6734202857749884e-06, + "loss": 0.0161, + "num_input_tokens_seen": 33236128, + "step": 157495 + }, + { + "epoch": 17.326732673267326, + "grad_norm": 0.12037897855043411, + "learning_rate": 2.6723405231596055e-06, + "loss": 0.0118, + "num_input_tokens_seen": 33237216, + "step": 157500 + }, + { + "epoch": 17.327282728272827, + "grad_norm": 0.1950041949748993, + "learning_rate": 2.6712609663275517e-06, + "loss": 0.0357, + "num_input_tokens_seen": 33238240, + "step": 157505 + }, + { + "epoch": 17.32783278327833, + "grad_norm": 0.011093349196016788, + "learning_rate": 2.670181615288764e-06, + "loss": 0.0003, + "num_input_tokens_seen": 33239296, + "step": 157510 + }, + { + "epoch": 17.32838283828383, + "grad_norm": 0.0700187087059021, + "learning_rate": 2.6691024700532003e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33240384, + "step": 157515 + }, + { + "epoch": 17.32893289328933, + "grad_norm": 0.0097886947914958, + "learning_rate": 2.668023530630806e-06, + "loss": 0.1026, + "num_input_tokens_seen": 33241408, + "step": 157520 + }, + { + "epoch": 17.329482948294828, + "grad_norm": 0.014500746503472328, + "learning_rate": 2.6669447970315257e-06, + "loss": 0.0357, + "num_input_tokens_seen": 33242400, + "step": 157525 + }, + { + "epoch": 17.33003300330033, + "grad_norm": 0.050201572477817535, + "learning_rate": 2.665866269265291e-06, + "loss": 0.0057, + "num_input_tokens_seen": 33243424, + "step": 157530 + }, + { + "epoch": 17.33058305830583, + "grad_norm": 0.24867138266563416, + "learning_rate": 2.6647879473420568e-06, + "loss": 0.0074, + "num_input_tokens_seen": 33244544, + "step": 157535 + }, + { + "epoch": 17.33113311331133, + "grad_norm": 0.13834068179130554, + "learning_rate": 2.663709831271749e-06, + "loss": 0.0799, + "num_input_tokens_seen": 33245632, + "step": 157540 + }, + { + "epoch": 17.331683168316832, + "grad_norm": 0.11382220685482025, + "learning_rate": 2.6626319210643095e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33246656, + "step": 157545 + }, + { + "epoch": 17.332233223322334, + "grad_norm": 0.23182299733161926, + "learning_rate": 2.6615542167296782e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33247744, + "step": 157550 + }, + { + "epoch": 17.33278327832783, + "grad_norm": 1.0785030126571655, + "learning_rate": 2.6604767182777796e-06, + "loss": 0.2627, + "num_input_tokens_seen": 33248832, + "step": 157555 + }, + { + "epoch": 17.333333333333332, + "grad_norm": 0.02107061631977558, + "learning_rate": 2.6593994257185506e-06, + "loss": 0.0984, + "num_input_tokens_seen": 33249824, + "step": 157560 + }, + { + "epoch": 17.333883388338833, + "grad_norm": 1.0778521299362183, + "learning_rate": 2.658322339061911e-06, + "loss": 0.0601, + "num_input_tokens_seen": 33250944, + "step": 157565 + }, + { + "epoch": 17.334433443344334, + "grad_norm": 0.02393883280456066, + "learning_rate": 2.6572454583177946e-06, + "loss": 0.0958, + "num_input_tokens_seen": 33251968, + "step": 157570 + }, + { + "epoch": 17.334983498349835, + "grad_norm": 0.05221635475754738, + "learning_rate": 2.656168783496132e-06, + "loss": 0.0172, + "num_input_tokens_seen": 33253120, + "step": 157575 + }, + { + "epoch": 17.335533553355337, + "grad_norm": 0.010145097970962524, + "learning_rate": 2.655092314606833e-06, + "loss": 0.0048, + "num_input_tokens_seen": 33254176, + "step": 157580 + }, + { + "epoch": 17.336083608360838, + "grad_norm": 0.0071417056024074554, + "learning_rate": 2.654016051659833e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33255200, + "step": 157585 + }, + { + "epoch": 17.336633663366335, + "grad_norm": 0.005051900167018175, + "learning_rate": 2.652939994665038e-06, + "loss": 0.0994, + "num_input_tokens_seen": 33256256, + "step": 157590 + }, + { + "epoch": 17.337183718371836, + "grad_norm": 0.009995262138545513, + "learning_rate": 2.651864143632379e-06, + "loss": 0.0119, + "num_input_tokens_seen": 33257344, + "step": 157595 + }, + { + "epoch": 17.337733773377337, + "grad_norm": 3.773409366607666, + "learning_rate": 2.6507884985717575e-06, + "loss": 0.0892, + "num_input_tokens_seen": 33258336, + "step": 157600 + }, + { + "epoch": 17.33828382838284, + "grad_norm": 2.863593339920044, + "learning_rate": 2.6497130594930946e-06, + "loss": 0.0643, + "num_input_tokens_seen": 33259392, + "step": 157605 + }, + { + "epoch": 17.33883388338834, + "grad_norm": 0.03128209710121155, + "learning_rate": 2.6486378264063083e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33260448, + "step": 157610 + }, + { + "epoch": 17.33938393839384, + "grad_norm": 0.03428469970822334, + "learning_rate": 2.647562799321296e-06, + "loss": 0.1116, + "num_input_tokens_seen": 33261472, + "step": 157615 + }, + { + "epoch": 17.33993399339934, + "grad_norm": 0.11853256821632385, + "learning_rate": 2.6464879782479746e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33262560, + "step": 157620 + }, + { + "epoch": 17.34048404840484, + "grad_norm": 0.017562877386808395, + "learning_rate": 2.645413363196245e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33263552, + "step": 157625 + }, + { + "epoch": 17.34103410341034, + "grad_norm": 0.016795247793197632, + "learning_rate": 2.644338954176012e-06, + "loss": 0.002, + "num_input_tokens_seen": 33264544, + "step": 157630 + }, + { + "epoch": 17.34158415841584, + "grad_norm": 0.12382633239030838, + "learning_rate": 2.6432647511971858e-06, + "loss": 0.0656, + "num_input_tokens_seen": 33265536, + "step": 157635 + }, + { + "epoch": 17.342134213421343, + "grad_norm": 0.010118557140231133, + "learning_rate": 2.642190754269652e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33266624, + "step": 157640 + }, + { + "epoch": 17.342684268426844, + "grad_norm": 0.09636534005403519, + "learning_rate": 2.6411169634033285e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33267616, + "step": 157645 + }, + { + "epoch": 17.343234323432345, + "grad_norm": 0.016477584838867188, + "learning_rate": 2.6400433786080982e-06, + "loss": 0.0007, + "num_input_tokens_seen": 33268672, + "step": 157650 + }, + { + "epoch": 17.343784378437842, + "grad_norm": 0.01679205149412155, + "learning_rate": 2.6389699998938533e-06, + "loss": 0.1177, + "num_input_tokens_seen": 33269664, + "step": 157655 + }, + { + "epoch": 17.344334433443343, + "grad_norm": 0.01296247448772192, + "learning_rate": 2.6378968272704917e-06, + "loss": 0.0285, + "num_input_tokens_seen": 33270624, + "step": 157660 + }, + { + "epoch": 17.344884488448844, + "grad_norm": 0.04524372145533562, + "learning_rate": 2.6368238607479056e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33271712, + "step": 157665 + }, + { + "epoch": 17.345434543454346, + "grad_norm": 0.004451419226825237, + "learning_rate": 2.635751100335987e-06, + "loss": 0.004, + "num_input_tokens_seen": 33272768, + "step": 157670 + }, + { + "epoch": 17.345984598459847, + "grad_norm": 0.008149673230946064, + "learning_rate": 2.6346785460446194e-06, + "loss": 0.0007, + "num_input_tokens_seen": 33273824, + "step": 157675 + }, + { + "epoch": 17.346534653465348, + "grad_norm": 0.08497946709394455, + "learning_rate": 2.633606197883684e-06, + "loss": 0.0143, + "num_input_tokens_seen": 33274848, + "step": 157680 + }, + { + "epoch": 17.34708470847085, + "grad_norm": 0.9291114211082458, + "learning_rate": 2.6325340558630675e-06, + "loss": 0.0707, + "num_input_tokens_seen": 33275840, + "step": 157685 + }, + { + "epoch": 17.347634763476346, + "grad_norm": 0.012594702653586864, + "learning_rate": 2.6314621199926502e-06, + "loss": 0.0062, + "num_input_tokens_seen": 33276832, + "step": 157690 + }, + { + "epoch": 17.348184818481847, + "grad_norm": 0.006226221565157175, + "learning_rate": 2.6303903902823194e-06, + "loss": 0.0376, + "num_input_tokens_seen": 33277952, + "step": 157695 + }, + { + "epoch": 17.34873487348735, + "grad_norm": 0.5651903748512268, + "learning_rate": 2.629318866741948e-06, + "loss": 0.0055, + "num_input_tokens_seen": 33279040, + "step": 157700 + }, + { + "epoch": 17.34928492849285, + "grad_norm": 0.009296650066971779, + "learning_rate": 2.6282475493814017e-06, + "loss": 0.0082, + "num_input_tokens_seen": 33280096, + "step": 157705 + }, + { + "epoch": 17.34983498349835, + "grad_norm": 0.08783701807260513, + "learning_rate": 2.6271764382105685e-06, + "loss": 0.003, + "num_input_tokens_seen": 33281152, + "step": 157710 + }, + { + "epoch": 17.350385038503852, + "grad_norm": 0.061376433819532394, + "learning_rate": 2.626105533239312e-06, + "loss": 0.1749, + "num_input_tokens_seen": 33282240, + "step": 157715 + }, + { + "epoch": 17.35093509350935, + "grad_norm": 0.01029416173696518, + "learning_rate": 2.6250348344775023e-06, + "loss": 0.0626, + "num_input_tokens_seen": 33283264, + "step": 157720 + }, + { + "epoch": 17.35148514851485, + "grad_norm": 0.0632634088397026, + "learning_rate": 2.6239643419350146e-06, + "loss": 0.0009, + "num_input_tokens_seen": 33284256, + "step": 157725 + }, + { + "epoch": 17.35203520352035, + "grad_norm": 0.02023351937532425, + "learning_rate": 2.622894055621708e-06, + "loss": 0.0041, + "num_input_tokens_seen": 33285312, + "step": 157730 + }, + { + "epoch": 17.352585258525853, + "grad_norm": 0.2761729955673218, + "learning_rate": 2.621823975547452e-06, + "loss": 0.1151, + "num_input_tokens_seen": 33286368, + "step": 157735 + }, + { + "epoch": 17.353135313531354, + "grad_norm": 0.09067666530609131, + "learning_rate": 2.620754101722109e-06, + "loss": 0.0605, + "num_input_tokens_seen": 33287424, + "step": 157740 + }, + { + "epoch": 17.353685368536855, + "grad_norm": 0.45948317646980286, + "learning_rate": 2.6196844341555256e-06, + "loss": 0.0857, + "num_input_tokens_seen": 33288512, + "step": 157745 + }, + { + "epoch": 17.354235423542356, + "grad_norm": 0.08751236647367477, + "learning_rate": 2.6186149728575805e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33289600, + "step": 157750 + }, + { + "epoch": 17.354785478547853, + "grad_norm": 0.05844282731413841, + "learning_rate": 2.617545717838116e-06, + "loss": 0.0916, + "num_input_tokens_seen": 33290688, + "step": 157755 + }, + { + "epoch": 17.355335533553355, + "grad_norm": 0.011486613191664219, + "learning_rate": 2.6164766691069987e-06, + "loss": 0.0776, + "num_input_tokens_seen": 33291744, + "step": 157760 + }, + { + "epoch": 17.355885588558856, + "grad_norm": 0.9173631072044373, + "learning_rate": 2.6154078266740766e-06, + "loss": 0.0091, + "num_input_tokens_seen": 33292704, + "step": 157765 + }, + { + "epoch": 17.356435643564357, + "grad_norm": 0.021603049710392952, + "learning_rate": 2.614339190549192e-06, + "loss": 0.1304, + "num_input_tokens_seen": 33293696, + "step": 157770 + }, + { + "epoch": 17.356985698569858, + "grad_norm": 0.008598239161074162, + "learning_rate": 2.6132707607422036e-06, + "loss": 0.0896, + "num_input_tokens_seen": 33294848, + "step": 157775 + }, + { + "epoch": 17.35753575357536, + "grad_norm": 0.005936675239354372, + "learning_rate": 2.6122025372629533e-06, + "loss": 0.0085, + "num_input_tokens_seen": 33295904, + "step": 157780 + }, + { + "epoch": 17.358085808580856, + "grad_norm": 0.5229153633117676, + "learning_rate": 2.6111345201212974e-06, + "loss": 0.0075, + "num_input_tokens_seen": 33296960, + "step": 157785 + }, + { + "epoch": 17.358635863586358, + "grad_norm": 0.08860515058040619, + "learning_rate": 2.610066709327069e-06, + "loss": 0.004, + "num_input_tokens_seen": 33297984, + "step": 157790 + }, + { + "epoch": 17.35918591859186, + "grad_norm": 0.04451904445886612, + "learning_rate": 2.6089991048901084e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33299008, + "step": 157795 + }, + { + "epoch": 17.35973597359736, + "grad_norm": 0.11281803995370865, + "learning_rate": 2.607931706820266e-06, + "loss": 0.0083, + "num_input_tokens_seen": 33300128, + "step": 157800 + }, + { + "epoch": 17.36028602860286, + "grad_norm": 0.006082529202103615, + "learning_rate": 2.606864515127361e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33301152, + "step": 157805 + }, + { + "epoch": 17.360836083608362, + "grad_norm": 0.007799666840583086, + "learning_rate": 2.6057975298212496e-06, + "loss": 0.0004, + "num_input_tokens_seen": 33302240, + "step": 157810 + }, + { + "epoch": 17.361386138613863, + "grad_norm": 0.07624637335538864, + "learning_rate": 2.6047307509117545e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33303328, + "step": 157815 + }, + { + "epoch": 17.36193619361936, + "grad_norm": 0.07520904392004013, + "learning_rate": 2.6036641784087074e-06, + "loss": 0.0194, + "num_input_tokens_seen": 33304416, + "step": 157820 + }, + { + "epoch": 17.36248624862486, + "grad_norm": 0.024344878271222115, + "learning_rate": 2.602597812321947e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33305472, + "step": 157825 + }, + { + "epoch": 17.363036303630363, + "grad_norm": 0.09141337871551514, + "learning_rate": 2.6015316526612876e-06, + "loss": 0.0033, + "num_input_tokens_seen": 33306496, + "step": 157830 + }, + { + "epoch": 17.363586358635864, + "grad_norm": 0.02792593464255333, + "learning_rate": 2.6004656994365634e-06, + "loss": 0.0109, + "num_input_tokens_seen": 33307520, + "step": 157835 + }, + { + "epoch": 17.364136413641365, + "grad_norm": 0.008959290571510792, + "learning_rate": 2.5993999526576053e-06, + "loss": 0.0006, + "num_input_tokens_seen": 33308512, + "step": 157840 + }, + { + "epoch": 17.364686468646866, + "grad_norm": 0.024070048704743385, + "learning_rate": 2.598334412334222e-06, + "loss": 0.0033, + "num_input_tokens_seen": 33309568, + "step": 157845 + }, + { + "epoch": 17.365236523652364, + "grad_norm": 0.1856006681919098, + "learning_rate": 2.597269078476247e-06, + "loss": 0.0058, + "num_input_tokens_seen": 33310592, + "step": 157850 + }, + { + "epoch": 17.365786578657865, + "grad_norm": 0.01661524921655655, + "learning_rate": 2.5962039510934873e-06, + "loss": 0.0343, + "num_input_tokens_seen": 33311744, + "step": 157855 + }, + { + "epoch": 17.366336633663366, + "grad_norm": 0.013443700969219208, + "learning_rate": 2.595139030195773e-06, + "loss": 0.1041, + "num_input_tokens_seen": 33312768, + "step": 157860 + }, + { + "epoch": 17.366886688668867, + "grad_norm": 0.0848078653216362, + "learning_rate": 2.594074315792902e-06, + "loss": 0.0825, + "num_input_tokens_seen": 33313824, + "step": 157865 + }, + { + "epoch": 17.367436743674368, + "grad_norm": 0.20620277523994446, + "learning_rate": 2.5930098078947002e-06, + "loss": 0.0556, + "num_input_tokens_seen": 33314848, + "step": 157870 + }, + { + "epoch": 17.36798679867987, + "grad_norm": 0.01877201721072197, + "learning_rate": 2.591945506510979e-06, + "loss": 0.0064, + "num_input_tokens_seen": 33315872, + "step": 157875 + }, + { + "epoch": 17.36853685368537, + "grad_norm": 0.11174110323190689, + "learning_rate": 2.5908814116515382e-06, + "loss": 0.0874, + "num_input_tokens_seen": 33316896, + "step": 157880 + }, + { + "epoch": 17.369086908690868, + "grad_norm": 0.014677265658974648, + "learning_rate": 2.589817523326196e-06, + "loss": 0.0441, + "num_input_tokens_seen": 33317952, + "step": 157885 + }, + { + "epoch": 17.36963696369637, + "grad_norm": 0.26809635758399963, + "learning_rate": 2.588753841544747e-06, + "loss": 0.0845, + "num_input_tokens_seen": 33319040, + "step": 157890 + }, + { + "epoch": 17.37018701870187, + "grad_norm": 0.24927592277526855, + "learning_rate": 2.5876903663169998e-06, + "loss": 0.0034, + "num_input_tokens_seen": 33320064, + "step": 157895 + }, + { + "epoch": 17.37073707370737, + "grad_norm": 1.670238971710205, + "learning_rate": 2.5866270976527607e-06, + "loss": 0.1661, + "num_input_tokens_seen": 33321088, + "step": 157900 + }, + { + "epoch": 17.371287128712872, + "grad_norm": 0.20859605073928833, + "learning_rate": 2.585564035561824e-06, + "loss": 0.0326, + "num_input_tokens_seen": 33322208, + "step": 157905 + }, + { + "epoch": 17.371837183718373, + "grad_norm": 3.616591453552246, + "learning_rate": 2.584501180053986e-06, + "loss": 0.0939, + "num_input_tokens_seen": 33323296, + "step": 157910 + }, + { + "epoch": 17.37238723872387, + "grad_norm": 0.2059348225593567, + "learning_rate": 2.5834385311390487e-06, + "loss": 0.0039, + "num_input_tokens_seen": 33324416, + "step": 157915 + }, + { + "epoch": 17.372937293729372, + "grad_norm": 0.5810331702232361, + "learning_rate": 2.5823760888267907e-06, + "loss": 0.004, + "num_input_tokens_seen": 33325504, + "step": 157920 + }, + { + "epoch": 17.373487348734873, + "grad_norm": 0.05931329354643822, + "learning_rate": 2.5813138531270235e-06, + "loss": 0.0059, + "num_input_tokens_seen": 33326496, + "step": 157925 + }, + { + "epoch": 17.374037403740374, + "grad_norm": 0.02023172751069069, + "learning_rate": 2.5802518240495318e-06, + "loss": 0.0379, + "num_input_tokens_seen": 33327616, + "step": 157930 + }, + { + "epoch": 17.374587458745875, + "grad_norm": 0.5599097609519958, + "learning_rate": 2.5791900016040953e-06, + "loss": 0.019, + "num_input_tokens_seen": 33328640, + "step": 157935 + }, + { + "epoch": 17.375137513751376, + "grad_norm": 0.03289366513490677, + "learning_rate": 2.578128385800513e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33329664, + "step": 157940 + }, + { + "epoch": 17.375687568756877, + "grad_norm": 0.0023629802744835615, + "learning_rate": 2.577066976648554e-06, + "loss": 0.0325, + "num_input_tokens_seen": 33330688, + "step": 157945 + }, + { + "epoch": 17.376237623762375, + "grad_norm": 0.008807934820652008, + "learning_rate": 2.5760057741580106e-06, + "loss": 0.0188, + "num_input_tokens_seen": 33331712, + "step": 157950 + }, + { + "epoch": 17.376787678767876, + "grad_norm": 0.04342350736260414, + "learning_rate": 2.574944778338667e-06, + "loss": 0.004, + "num_input_tokens_seen": 33332800, + "step": 157955 + }, + { + "epoch": 17.377337733773377, + "grad_norm": 0.03173239901661873, + "learning_rate": 2.57388398920029e-06, + "loss": 0.0088, + "num_input_tokens_seen": 33333856, + "step": 157960 + }, + { + "epoch": 17.377887788778878, + "grad_norm": 0.035126179456710815, + "learning_rate": 2.57282340675267e-06, + "loss": 0.0006, + "num_input_tokens_seen": 33334912, + "step": 157965 + }, + { + "epoch": 17.37843784378438, + "grad_norm": 0.023267850279808044, + "learning_rate": 2.5717630310055696e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33335936, + "step": 157970 + }, + { + "epoch": 17.37898789878988, + "grad_norm": 0.02425052411854267, + "learning_rate": 2.570702861968774e-06, + "loss": 0.0233, + "num_input_tokens_seen": 33336928, + "step": 157975 + }, + { + "epoch": 17.379537953795378, + "grad_norm": 5.383931636810303, + "learning_rate": 2.5696428996520415e-06, + "loss": 0.1197, + "num_input_tokens_seen": 33337952, + "step": 157980 + }, + { + "epoch": 17.38008800880088, + "grad_norm": 0.015170165337622166, + "learning_rate": 2.5685831440651454e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33339072, + "step": 157985 + }, + { + "epoch": 17.38063806380638, + "grad_norm": 0.446977823972702, + "learning_rate": 2.5675235952178604e-06, + "loss": 0.0481, + "num_input_tokens_seen": 33340128, + "step": 157990 + }, + { + "epoch": 17.38118811881188, + "grad_norm": 0.010526198893785477, + "learning_rate": 2.5664642531199434e-06, + "loss": 0.0031, + "num_input_tokens_seen": 33341184, + "step": 157995 + }, + { + "epoch": 17.381738173817382, + "grad_norm": 0.030746053904294968, + "learning_rate": 2.5654051177811637e-06, + "loss": 0.0071, + "num_input_tokens_seen": 33342240, + "step": 158000 + }, + { + "epoch": 17.382288228822883, + "grad_norm": 0.34466421604156494, + "learning_rate": 2.5643461892112753e-06, + "loss": 0.0234, + "num_input_tokens_seen": 33343296, + "step": 158005 + }, + { + "epoch": 17.382838283828384, + "grad_norm": 0.21678821742534637, + "learning_rate": 2.563287467420042e-06, + "loss": 0.0123, + "num_input_tokens_seen": 33344352, + "step": 158010 + }, + { + "epoch": 17.383388338833882, + "grad_norm": 0.033336251974105835, + "learning_rate": 2.5622289524172283e-06, + "loss": 0.0436, + "num_input_tokens_seen": 33345440, + "step": 158015 + }, + { + "epoch": 17.383938393839383, + "grad_norm": 0.010540165938436985, + "learning_rate": 2.5611706442125764e-06, + "loss": 0.0613, + "num_input_tokens_seen": 33346464, + "step": 158020 + }, + { + "epoch": 17.384488448844884, + "grad_norm": 0.9660477638244629, + "learning_rate": 2.560112542815854e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33347520, + "step": 158025 + }, + { + "epoch": 17.385038503850385, + "grad_norm": 0.4996337294578552, + "learning_rate": 2.5590546482368084e-06, + "loss": 0.0058, + "num_input_tokens_seen": 33348544, + "step": 158030 + }, + { + "epoch": 17.385588558855886, + "grad_norm": 0.058996330946683884, + "learning_rate": 2.557996960485179e-06, + "loss": 0.0252, + "num_input_tokens_seen": 33349568, + "step": 158035 + }, + { + "epoch": 17.386138613861387, + "grad_norm": 0.09615971893072128, + "learning_rate": 2.5569394795707276e-06, + "loss": 0.0101, + "num_input_tokens_seen": 33350592, + "step": 158040 + }, + { + "epoch": 17.38668866886689, + "grad_norm": 0.08614299446344376, + "learning_rate": 2.5558822055031906e-06, + "loss": 0.0375, + "num_input_tokens_seen": 33351712, + "step": 158045 + }, + { + "epoch": 17.387238723872386, + "grad_norm": 0.9846720695495605, + "learning_rate": 2.554825138292327e-06, + "loss": 0.0362, + "num_input_tokens_seen": 33352832, + "step": 158050 + }, + { + "epoch": 17.387788778877887, + "grad_norm": 0.21618770062923431, + "learning_rate": 2.5537682779478655e-06, + "loss": 0.0116, + "num_input_tokens_seen": 33353856, + "step": 158055 + }, + { + "epoch": 17.388338833883388, + "grad_norm": 0.08777612447738647, + "learning_rate": 2.5527116244795506e-06, + "loss": 0.0134, + "num_input_tokens_seen": 33354912, + "step": 158060 + }, + { + "epoch": 17.38888888888889, + "grad_norm": 0.02157021500170231, + "learning_rate": 2.5516551778971193e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33355904, + "step": 158065 + }, + { + "epoch": 17.38943894389439, + "grad_norm": 0.008726688101887703, + "learning_rate": 2.5505989382103106e-06, + "loss": 0.0493, + "num_input_tokens_seen": 33356960, + "step": 158070 + }, + { + "epoch": 17.38998899889989, + "grad_norm": 0.2584286630153656, + "learning_rate": 2.5495429054288617e-06, + "loss": 0.0044, + "num_input_tokens_seen": 33358016, + "step": 158075 + }, + { + "epoch": 17.39053905390539, + "grad_norm": 0.026783553883433342, + "learning_rate": 2.548487079562506e-06, + "loss": 0.0042, + "num_input_tokens_seen": 33359040, + "step": 158080 + }, + { + "epoch": 17.39108910891089, + "grad_norm": 0.45799720287323, + "learning_rate": 2.5474314606209664e-06, + "loss": 0.1791, + "num_input_tokens_seen": 33360096, + "step": 158085 + }, + { + "epoch": 17.39163916391639, + "grad_norm": 0.04455734044313431, + "learning_rate": 2.5463760486139827e-06, + "loss": 0.004, + "num_input_tokens_seen": 33361184, + "step": 158090 + }, + { + "epoch": 17.392189218921892, + "grad_norm": 0.0077190883457660675, + "learning_rate": 2.545320843551269e-06, + "loss": 0.0099, + "num_input_tokens_seen": 33362240, + "step": 158095 + }, + { + "epoch": 17.392739273927393, + "grad_norm": 0.05649343132972717, + "learning_rate": 2.5442658454425616e-06, + "loss": 0.0063, + "num_input_tokens_seen": 33363296, + "step": 158100 + }, + { + "epoch": 17.393289328932894, + "grad_norm": 0.13223306834697723, + "learning_rate": 2.543211054297584e-06, + "loss": 0.0018, + "num_input_tokens_seen": 33364352, + "step": 158105 + }, + { + "epoch": 17.393839383938396, + "grad_norm": 0.21310150623321533, + "learning_rate": 2.54215647012605e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33365440, + "step": 158110 + }, + { + "epoch": 17.394389438943893, + "grad_norm": 0.042289238423109055, + "learning_rate": 2.541102092937686e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33366432, + "step": 158115 + }, + { + "epoch": 17.394939493949394, + "grad_norm": 0.05433071404695511, + "learning_rate": 2.5400479227422056e-06, + "loss": 0.0045, + "num_input_tokens_seen": 33367584, + "step": 158120 + }, + { + "epoch": 17.395489548954895, + "grad_norm": 0.008286896161735058, + "learning_rate": 2.538993959549324e-06, + "loss": 0.0072, + "num_input_tokens_seen": 33368672, + "step": 158125 + }, + { + "epoch": 17.396039603960396, + "grad_norm": 0.06399355083703995, + "learning_rate": 2.537940203368763e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33369760, + "step": 158130 + }, + { + "epoch": 17.396589658965897, + "grad_norm": 0.6565953493118286, + "learning_rate": 2.536886654210224e-06, + "loss": 0.0251, + "num_input_tokens_seen": 33370816, + "step": 158135 + }, + { + "epoch": 17.3971397139714, + "grad_norm": 0.03017088770866394, + "learning_rate": 2.5358333120834243e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33371872, + "step": 158140 + }, + { + "epoch": 17.397689768976896, + "grad_norm": 3.5727310180664062, + "learning_rate": 2.5347801769980724e-06, + "loss": 0.0954, + "num_input_tokens_seen": 33372960, + "step": 158145 + }, + { + "epoch": 17.398239823982397, + "grad_norm": 0.7360824346542358, + "learning_rate": 2.5337272489638656e-06, + "loss": 0.0543, + "num_input_tokens_seen": 33374016, + "step": 158150 + }, + { + "epoch": 17.3987898789879, + "grad_norm": 0.23879781365394592, + "learning_rate": 2.532674527990517e-06, + "loss": 0.014, + "num_input_tokens_seen": 33375136, + "step": 158155 + }, + { + "epoch": 17.3993399339934, + "grad_norm": 0.0859314426779747, + "learning_rate": 2.5316220140877226e-06, + "loss": 0.0885, + "num_input_tokens_seen": 33376224, + "step": 158160 + }, + { + "epoch": 17.3998899889989, + "grad_norm": 0.08664705604314804, + "learning_rate": 2.5305697072651957e-06, + "loss": 0.0028, + "num_input_tokens_seen": 33377280, + "step": 158165 + }, + { + "epoch": 17.4004400440044, + "grad_norm": 0.026853080838918686, + "learning_rate": 2.529517607532622e-06, + "loss": 0.0053, + "num_input_tokens_seen": 33378304, + "step": 158170 + }, + { + "epoch": 17.400990099009903, + "grad_norm": 0.035173915326595306, + "learning_rate": 2.5284657148996997e-06, + "loss": 0.0458, + "num_input_tokens_seen": 33379392, + "step": 158175 + }, + { + "epoch": 17.4015401540154, + "grad_norm": 0.009113863110542297, + "learning_rate": 2.527414029376124e-06, + "loss": 0.0092, + "num_input_tokens_seen": 33380480, + "step": 158180 + }, + { + "epoch": 17.4020902090209, + "grad_norm": 0.11681892722845078, + "learning_rate": 2.526362550971592e-06, + "loss": 0.0505, + "num_input_tokens_seen": 33381536, + "step": 158185 + }, + { + "epoch": 17.402640264026402, + "grad_norm": 0.008090000599622726, + "learning_rate": 2.5253112796957966e-06, + "loss": 0.0988, + "num_input_tokens_seen": 33382624, + "step": 158190 + }, + { + "epoch": 17.403190319031903, + "grad_norm": 0.04543101042509079, + "learning_rate": 2.5242602155584216e-06, + "loss": 0.0932, + "num_input_tokens_seen": 33383648, + "step": 158195 + }, + { + "epoch": 17.403740374037405, + "grad_norm": 0.06296677142381668, + "learning_rate": 2.5232093585691508e-06, + "loss": 0.0141, + "num_input_tokens_seen": 33384768, + "step": 158200 + }, + { + "epoch": 17.404290429042906, + "grad_norm": 0.01610773615539074, + "learning_rate": 2.522158708737679e-06, + "loss": 0.0183, + "num_input_tokens_seen": 33385824, + "step": 158205 + }, + { + "epoch": 17.404840484048403, + "grad_norm": 0.06631335616111755, + "learning_rate": 2.5211082660736796e-06, + "loss": 0.0038, + "num_input_tokens_seen": 33386848, + "step": 158210 + }, + { + "epoch": 17.405390539053904, + "grad_norm": 0.008379636332392693, + "learning_rate": 2.5200580305868416e-06, + "loss": 0.0073, + "num_input_tokens_seen": 33387872, + "step": 158215 + }, + { + "epoch": 17.405940594059405, + "grad_norm": 1.731289267539978, + "learning_rate": 2.5190080022868434e-06, + "loss": 0.026, + "num_input_tokens_seen": 33388864, + "step": 158220 + }, + { + "epoch": 17.406490649064907, + "grad_norm": 0.006185232661664486, + "learning_rate": 2.517958181183355e-06, + "loss": 0.0084, + "num_input_tokens_seen": 33389984, + "step": 158225 + }, + { + "epoch": 17.407040704070408, + "grad_norm": 0.006907218601554632, + "learning_rate": 2.5169085672860666e-06, + "loss": 0.0009, + "num_input_tokens_seen": 33391072, + "step": 158230 + }, + { + "epoch": 17.40759075907591, + "grad_norm": 0.02641172520816326, + "learning_rate": 2.5158591606046363e-06, + "loss": 0.015, + "num_input_tokens_seen": 33392096, + "step": 158235 + }, + { + "epoch": 17.40814081408141, + "grad_norm": 0.08674435317516327, + "learning_rate": 2.5148099611487454e-06, + "loss": 0.0029, + "num_input_tokens_seen": 33393216, + "step": 158240 + }, + { + "epoch": 17.408690869086907, + "grad_norm": 0.005629677791148424, + "learning_rate": 2.5137609689280638e-06, + "loss": 0.002, + "num_input_tokens_seen": 33394304, + "step": 158245 + }, + { + "epoch": 17.40924092409241, + "grad_norm": 0.011989882215857506, + "learning_rate": 2.512712183952254e-06, + "loss": 0.0081, + "num_input_tokens_seen": 33395264, + "step": 158250 + }, + { + "epoch": 17.40979097909791, + "grad_norm": 0.019795402884483337, + "learning_rate": 2.5116636062309905e-06, + "loss": 0.1425, + "num_input_tokens_seen": 33396384, + "step": 158255 + }, + { + "epoch": 17.41034103410341, + "grad_norm": 0.10993148386478424, + "learning_rate": 2.5106152357739277e-06, + "loss": 0.0059, + "num_input_tokens_seen": 33397440, + "step": 158260 + }, + { + "epoch": 17.41089108910891, + "grad_norm": 0.030448786914348602, + "learning_rate": 2.509567072590735e-06, + "loss": 0.0039, + "num_input_tokens_seen": 33398464, + "step": 158265 + }, + { + "epoch": 17.411441144114413, + "grad_norm": 0.9354801177978516, + "learning_rate": 2.508519116691069e-06, + "loss": 0.074, + "num_input_tokens_seen": 33399488, + "step": 158270 + }, + { + "epoch": 17.41199119911991, + "grad_norm": 0.006519855000078678, + "learning_rate": 2.507471368084588e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33400576, + "step": 158275 + }, + { + "epoch": 17.41254125412541, + "grad_norm": 0.14724208414554596, + "learning_rate": 2.506423826780957e-06, + "loss": 0.0075, + "num_input_tokens_seen": 33401728, + "step": 158280 + }, + { + "epoch": 17.413091309130913, + "grad_norm": 2.0812408924102783, + "learning_rate": 2.505376492789824e-06, + "loss": 0.1207, + "num_input_tokens_seen": 33402816, + "step": 158285 + }, + { + "epoch": 17.413641364136414, + "grad_norm": 1.6652929782867432, + "learning_rate": 2.504329366120836e-06, + "loss": 0.2598, + "num_input_tokens_seen": 33403840, + "step": 158290 + }, + { + "epoch": 17.414191419141915, + "grad_norm": 0.3999980092048645, + "learning_rate": 2.50328244678365e-06, + "loss": 0.0143, + "num_input_tokens_seen": 33404864, + "step": 158295 + }, + { + "epoch": 17.414741474147416, + "grad_norm": 3.957944869995117, + "learning_rate": 2.502235734787914e-06, + "loss": 0.0239, + "num_input_tokens_seen": 33405984, + "step": 158300 + }, + { + "epoch": 17.415291529152917, + "grad_norm": 0.0040797460824251175, + "learning_rate": 2.5011892301432832e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33406976, + "step": 158305 + }, + { + "epoch": 17.415841584158414, + "grad_norm": 0.1090073436498642, + "learning_rate": 2.5001429328593923e-06, + "loss": 0.068, + "num_input_tokens_seen": 33408032, + "step": 158310 + }, + { + "epoch": 17.416391639163916, + "grad_norm": 0.02372029982507229, + "learning_rate": 2.4990968429458835e-06, + "loss": 0.0265, + "num_input_tokens_seen": 33409056, + "step": 158315 + }, + { + "epoch": 17.416941694169417, + "grad_norm": 3.2837939262390137, + "learning_rate": 2.498050960412407e-06, + "loss": 0.0207, + "num_input_tokens_seen": 33410080, + "step": 158320 + }, + { + "epoch": 17.417491749174918, + "grad_norm": 0.36490732431411743, + "learning_rate": 2.497005285268594e-06, + "loss": 0.0057, + "num_input_tokens_seen": 33411072, + "step": 158325 + }, + { + "epoch": 17.41804180418042, + "grad_norm": 1.6512129306793213, + "learning_rate": 2.495959817524085e-06, + "loss": 0.0922, + "num_input_tokens_seen": 33412064, + "step": 158330 + }, + { + "epoch": 17.41859185918592, + "grad_norm": 0.025809772312641144, + "learning_rate": 2.4949145571885184e-06, + "loss": 0.0671, + "num_input_tokens_seen": 33413120, + "step": 158335 + }, + { + "epoch": 17.419141914191417, + "grad_norm": 0.039718903601169586, + "learning_rate": 2.493869504271523e-06, + "loss": 0.0105, + "num_input_tokens_seen": 33414208, + "step": 158340 + }, + { + "epoch": 17.41969196919692, + "grad_norm": 0.21127428114414215, + "learning_rate": 2.492824658782736e-06, + "loss": 0.0042, + "num_input_tokens_seen": 33415264, + "step": 158345 + }, + { + "epoch": 17.42024202420242, + "grad_norm": 0.015817804262042046, + "learning_rate": 2.4917800207317792e-06, + "loss": 0.004, + "num_input_tokens_seen": 33416352, + "step": 158350 + }, + { + "epoch": 17.42079207920792, + "grad_norm": 1.4476375579833984, + "learning_rate": 2.4907355901282877e-06, + "loss": 0.0486, + "num_input_tokens_seen": 33417408, + "step": 158355 + }, + { + "epoch": 17.421342134213422, + "grad_norm": 4.54791784286499, + "learning_rate": 2.489691366981889e-06, + "loss": 0.0675, + "num_input_tokens_seen": 33418464, + "step": 158360 + }, + { + "epoch": 17.421892189218923, + "grad_norm": 0.29721352458000183, + "learning_rate": 2.4886473513021978e-06, + "loss": 0.0183, + "num_input_tokens_seen": 33419488, + "step": 158365 + }, + { + "epoch": 17.422442244224424, + "grad_norm": 0.8907998204231262, + "learning_rate": 2.4876035430988457e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33420576, + "step": 158370 + }, + { + "epoch": 17.42299229922992, + "grad_norm": 0.05084782466292381, + "learning_rate": 2.486559942381447e-06, + "loss": 0.0469, + "num_input_tokens_seen": 33421664, + "step": 158375 + }, + { + "epoch": 17.423542354235423, + "grad_norm": 0.003718194318935275, + "learning_rate": 2.485516549159625e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33422752, + "step": 158380 + }, + { + "epoch": 17.424092409240924, + "grad_norm": 0.04549981281161308, + "learning_rate": 2.4844733634429907e-06, + "loss": 0.0602, + "num_input_tokens_seen": 33423776, + "step": 158385 + }, + { + "epoch": 17.424642464246425, + "grad_norm": 0.07172054052352905, + "learning_rate": 2.4834303852411615e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33424896, + "step": 158390 + }, + { + "epoch": 17.425192519251926, + "grad_norm": 2.2910001277923584, + "learning_rate": 2.4823876145637524e-06, + "loss": 0.0802, + "num_input_tokens_seen": 33425984, + "step": 158395 + }, + { + "epoch": 17.425742574257427, + "grad_norm": 3.333575487136841, + "learning_rate": 2.4813450514203697e-06, + "loss": 0.1602, + "num_input_tokens_seen": 33426976, + "step": 158400 + }, + { + "epoch": 17.426292629262925, + "grad_norm": 0.04496040567755699, + "learning_rate": 2.480302695820627e-06, + "loss": 0.0057, + "num_input_tokens_seen": 33428096, + "step": 158405 + }, + { + "epoch": 17.426842684268426, + "grad_norm": 0.014820326119661331, + "learning_rate": 2.479260547774123e-06, + "loss": 0.02, + "num_input_tokens_seen": 33429216, + "step": 158410 + }, + { + "epoch": 17.427392739273927, + "grad_norm": 0.06267355382442474, + "learning_rate": 2.4782186072904693e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33430240, + "step": 158415 + }, + { + "epoch": 17.427942794279428, + "grad_norm": 0.1029549315571785, + "learning_rate": 2.477176874379272e-06, + "loss": 0.1106, + "num_input_tokens_seen": 33431328, + "step": 158420 + }, + { + "epoch": 17.42849284928493, + "grad_norm": 0.051049549132585526, + "learning_rate": 2.4761353490501237e-06, + "loss": 0.0018, + "num_input_tokens_seen": 33432352, + "step": 158425 + }, + { + "epoch": 17.42904290429043, + "grad_norm": 0.054500605911016464, + "learning_rate": 2.475094031312633e-06, + "loss": 0.0498, + "num_input_tokens_seen": 33433440, + "step": 158430 + }, + { + "epoch": 17.42959295929593, + "grad_norm": 0.3520977795124054, + "learning_rate": 2.4740529211763895e-06, + "loss": 0.006, + "num_input_tokens_seen": 33434464, + "step": 158435 + }, + { + "epoch": 17.43014301430143, + "grad_norm": 0.006095369346439838, + "learning_rate": 2.4730120186509885e-06, + "loss": 0.0534, + "num_input_tokens_seen": 33435456, + "step": 158440 + }, + { + "epoch": 17.43069306930693, + "grad_norm": 0.009727435186505318, + "learning_rate": 2.4719713237460277e-06, + "loss": 0.0794, + "num_input_tokens_seen": 33436480, + "step": 158445 + }, + { + "epoch": 17.43124312431243, + "grad_norm": 0.1875249594449997, + "learning_rate": 2.4709308364710943e-06, + "loss": 0.0269, + "num_input_tokens_seen": 33437568, + "step": 158450 + }, + { + "epoch": 17.431793179317932, + "grad_norm": 0.021505964919924736, + "learning_rate": 2.4698905568357885e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33438656, + "step": 158455 + }, + { + "epoch": 17.432343234323433, + "grad_norm": 0.07842408120632172, + "learning_rate": 2.4688504848496885e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33439744, + "step": 158460 + }, + { + "epoch": 17.432893289328934, + "grad_norm": 0.06439853459596634, + "learning_rate": 2.467810620522376e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33440768, + "step": 158465 + }, + { + "epoch": 17.433443344334435, + "grad_norm": 0.008959008380770683, + "learning_rate": 2.4667709638634434e-06, + "loss": 0.0161, + "num_input_tokens_seen": 33441856, + "step": 158470 + }, + { + "epoch": 17.433993399339933, + "grad_norm": 0.010472365655004978, + "learning_rate": 2.4657315148824746e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33442912, + "step": 158475 + }, + { + "epoch": 17.434543454345434, + "grad_norm": 0.014611789956688881, + "learning_rate": 2.464692273589042e-06, + "loss": 0.0036, + "num_input_tokens_seen": 33444032, + "step": 158480 + }, + { + "epoch": 17.435093509350935, + "grad_norm": 2.786299467086792, + "learning_rate": 2.4636532399927304e-06, + "loss": 0.0884, + "num_input_tokens_seen": 33445184, + "step": 158485 + }, + { + "epoch": 17.435643564356436, + "grad_norm": 0.24297493696212769, + "learning_rate": 2.4626144141031067e-06, + "loss": 0.0061, + "num_input_tokens_seen": 33446208, + "step": 158490 + }, + { + "epoch": 17.436193619361937, + "grad_norm": 0.38611719012260437, + "learning_rate": 2.4615757959297574e-06, + "loss": 0.0405, + "num_input_tokens_seen": 33447200, + "step": 158495 + }, + { + "epoch": 17.436743674367438, + "grad_norm": 0.05687854439020157, + "learning_rate": 2.460537385482245e-06, + "loss": 0.0081, + "num_input_tokens_seen": 33448224, + "step": 158500 + }, + { + "epoch": 17.437293729372936, + "grad_norm": 0.011901766993105412, + "learning_rate": 2.4594991827701445e-06, + "loss": 0.0193, + "num_input_tokens_seen": 33449280, + "step": 158505 + }, + { + "epoch": 17.437843784378437, + "grad_norm": 0.04068436101078987, + "learning_rate": 2.458461187803027e-06, + "loss": 0.0079, + "num_input_tokens_seen": 33450368, + "step": 158510 + }, + { + "epoch": 17.438393839383938, + "grad_norm": 0.03202022984623909, + "learning_rate": 2.457423400590453e-06, + "loss": 0.0034, + "num_input_tokens_seen": 33451424, + "step": 158515 + }, + { + "epoch": 17.43894389438944, + "grad_norm": 0.031286027282476425, + "learning_rate": 2.4563858211419932e-06, + "loss": 0.0046, + "num_input_tokens_seen": 33452480, + "step": 158520 + }, + { + "epoch": 17.43949394939494, + "grad_norm": 0.05609947815537453, + "learning_rate": 2.4553484494672097e-06, + "loss": 0.0342, + "num_input_tokens_seen": 33453568, + "step": 158525 + }, + { + "epoch": 17.44004400440044, + "grad_norm": 0.1465335637331009, + "learning_rate": 2.454311285575653e-06, + "loss": 0.0731, + "num_input_tokens_seen": 33454560, + "step": 158530 + }, + { + "epoch": 17.440594059405942, + "grad_norm": 0.20976483821868896, + "learning_rate": 2.4532743294768984e-06, + "loss": 0.0239, + "num_input_tokens_seen": 33455616, + "step": 158535 + }, + { + "epoch": 17.44114411441144, + "grad_norm": 0.034894779324531555, + "learning_rate": 2.4522375811804916e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33456640, + "step": 158540 + }, + { + "epoch": 17.44169416941694, + "grad_norm": 0.3138734996318817, + "learning_rate": 2.451201040695994e-06, + "loss": 0.04, + "num_input_tokens_seen": 33457728, + "step": 158545 + }, + { + "epoch": 17.442244224422442, + "grad_norm": 0.2095271199941635, + "learning_rate": 2.4501647080329595e-06, + "loss": 0.0032, + "num_input_tokens_seen": 33458752, + "step": 158550 + }, + { + "epoch": 17.442794279427943, + "grad_norm": 0.025576036423444748, + "learning_rate": 2.4491285832009298e-06, + "loss": 0.1386, + "num_input_tokens_seen": 33459872, + "step": 158555 + }, + { + "epoch": 17.443344334433444, + "grad_norm": 1.4243706464767456, + "learning_rate": 2.4480926662094617e-06, + "loss": 0.0501, + "num_input_tokens_seen": 33460992, + "step": 158560 + }, + { + "epoch": 17.443894389438945, + "grad_norm": 0.46627405285835266, + "learning_rate": 2.447056957068103e-06, + "loss": 0.0863, + "num_input_tokens_seen": 33462112, + "step": 158565 + }, + { + "epoch": 17.444444444444443, + "grad_norm": 0.0850636288523674, + "learning_rate": 2.4460214557864014e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33463200, + "step": 158570 + }, + { + "epoch": 17.444994499449944, + "grad_norm": 0.06524442136287689, + "learning_rate": 2.4449861623738967e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33464224, + "step": 158575 + }, + { + "epoch": 17.445544554455445, + "grad_norm": 1.976818561553955, + "learning_rate": 2.4439510768401286e-06, + "loss": 0.017, + "num_input_tokens_seen": 33465280, + "step": 158580 + }, + { + "epoch": 17.446094609460946, + "grad_norm": 0.028591196984052658, + "learning_rate": 2.4429161991946447e-06, + "loss": 0.0098, + "num_input_tokens_seen": 33466304, + "step": 158585 + }, + { + "epoch": 17.446644664466447, + "grad_norm": 0.07025355100631714, + "learning_rate": 2.4418815294469684e-06, + "loss": 0.0055, + "num_input_tokens_seen": 33467328, + "step": 158590 + }, + { + "epoch": 17.44719471947195, + "grad_norm": 0.2267587035894394, + "learning_rate": 2.4408470676066553e-06, + "loss": 0.0045, + "num_input_tokens_seen": 33468416, + "step": 158595 + }, + { + "epoch": 17.44774477447745, + "grad_norm": 0.15606169402599335, + "learning_rate": 2.4398128136832317e-06, + "loss": 0.0288, + "num_input_tokens_seen": 33469472, + "step": 158600 + }, + { + "epoch": 17.448294829482947, + "grad_norm": 0.033668022602796555, + "learning_rate": 2.43877876768622e-06, + "loss": 0.0006, + "num_input_tokens_seen": 33470464, + "step": 158605 + }, + { + "epoch": 17.448844884488448, + "grad_norm": 0.015098785050213337, + "learning_rate": 2.437744929625166e-06, + "loss": 0.0507, + "num_input_tokens_seen": 33471552, + "step": 158610 + }, + { + "epoch": 17.44939493949395, + "grad_norm": 0.14048662781715393, + "learning_rate": 2.436711299509584e-06, + "loss": 0.0407, + "num_input_tokens_seen": 33472576, + "step": 158615 + }, + { + "epoch": 17.44994499449945, + "grad_norm": 0.008292702957987785, + "learning_rate": 2.4356778773490076e-06, + "loss": 0.0034, + "num_input_tokens_seen": 33473600, + "step": 158620 + }, + { + "epoch": 17.45049504950495, + "grad_norm": 0.028463274240493774, + "learning_rate": 2.434644663152963e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33474720, + "step": 158625 + }, + { + "epoch": 17.451045104510452, + "grad_norm": 0.20435278117656708, + "learning_rate": 2.4336116569309703e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33475776, + "step": 158630 + }, + { + "epoch": 17.45159515951595, + "grad_norm": 0.041206661611795425, + "learning_rate": 2.4325788586925523e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33476832, + "step": 158635 + }, + { + "epoch": 17.45214521452145, + "grad_norm": 0.12403160333633423, + "learning_rate": 2.431546268447221e-06, + "loss": 0.004, + "num_input_tokens_seen": 33477888, + "step": 158640 + }, + { + "epoch": 17.452695269526952, + "grad_norm": 3.3283374309539795, + "learning_rate": 2.430513886204505e-06, + "loss": 0.0792, + "num_input_tokens_seen": 33478880, + "step": 158645 + }, + { + "epoch": 17.453245324532453, + "grad_norm": 0.019848180934786797, + "learning_rate": 2.429481711973908e-06, + "loss": 0.0032, + "num_input_tokens_seen": 33479968, + "step": 158650 + }, + { + "epoch": 17.453795379537954, + "grad_norm": 0.007738676853477955, + "learning_rate": 2.4284497457649464e-06, + "loss": 0.0077, + "num_input_tokens_seen": 33480992, + "step": 158655 + }, + { + "epoch": 17.454345434543455, + "grad_norm": 0.010174078866839409, + "learning_rate": 2.4274179875871382e-06, + "loss": 0.0057, + "num_input_tokens_seen": 33482144, + "step": 158660 + }, + { + "epoch": 17.454895489548957, + "grad_norm": 1.0059202909469604, + "learning_rate": 2.4263864374499816e-06, + "loss": 0.032, + "num_input_tokens_seen": 33483168, + "step": 158665 + }, + { + "epoch": 17.455445544554454, + "grad_norm": 0.04079405963420868, + "learning_rate": 2.425355095362994e-06, + "loss": 0.0124, + "num_input_tokens_seen": 33484224, + "step": 158670 + }, + { + "epoch": 17.455995599559955, + "grad_norm": 0.06089140102267265, + "learning_rate": 2.4243239613356726e-06, + "loss": 0.0308, + "num_input_tokens_seen": 33485280, + "step": 158675 + }, + { + "epoch": 17.456545654565456, + "grad_norm": 0.04186883568763733, + "learning_rate": 2.4232930353775217e-06, + "loss": 0.0142, + "num_input_tokens_seen": 33486304, + "step": 158680 + }, + { + "epoch": 17.457095709570957, + "grad_norm": 0.010068081319332123, + "learning_rate": 2.422262317498053e-06, + "loss": 0.0008, + "num_input_tokens_seen": 33487360, + "step": 158685 + }, + { + "epoch": 17.45764576457646, + "grad_norm": 0.6431028246879578, + "learning_rate": 2.4212318077067584e-06, + "loss": 0.0036, + "num_input_tokens_seen": 33488384, + "step": 158690 + }, + { + "epoch": 17.45819581958196, + "grad_norm": 0.04781467095017433, + "learning_rate": 2.420201506013131e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33489440, + "step": 158695 + }, + { + "epoch": 17.458745874587457, + "grad_norm": 0.006590110249817371, + "learning_rate": 2.419171412426674e-06, + "loss": 0.0187, + "num_input_tokens_seen": 33490496, + "step": 158700 + }, + { + "epoch": 17.459295929592958, + "grad_norm": 0.007625308353453875, + "learning_rate": 2.418141526956877e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33491520, + "step": 158705 + }, + { + "epoch": 17.45984598459846, + "grad_norm": 1.2357844114303589, + "learning_rate": 2.417111849613232e-06, + "loss": 0.022, + "num_input_tokens_seen": 33492608, + "step": 158710 + }, + { + "epoch": 17.46039603960396, + "grad_norm": 0.3330448865890503, + "learning_rate": 2.416082380405235e-06, + "loss": 0.0096, + "num_input_tokens_seen": 33493696, + "step": 158715 + }, + { + "epoch": 17.46094609460946, + "grad_norm": 0.04861598089337349, + "learning_rate": 2.415053119342367e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33494816, + "step": 158720 + }, + { + "epoch": 17.461496149614963, + "grad_norm": 4.326886177062988, + "learning_rate": 2.41402406643412e-06, + "loss": 0.1016, + "num_input_tokens_seen": 33495808, + "step": 158725 + }, + { + "epoch": 17.462046204620464, + "grad_norm": 0.0588337741792202, + "learning_rate": 2.412995221689973e-06, + "loss": 0.0522, + "num_input_tokens_seen": 33496800, + "step": 158730 + }, + { + "epoch": 17.46259625962596, + "grad_norm": 0.009829898364841938, + "learning_rate": 2.4119665851194102e-06, + "loss": 0.0029, + "num_input_tokens_seen": 33497888, + "step": 158735 + }, + { + "epoch": 17.463146314631462, + "grad_norm": 0.012863308191299438, + "learning_rate": 2.4109381567319152e-06, + "loss": 0.0036, + "num_input_tokens_seen": 33498976, + "step": 158740 + }, + { + "epoch": 17.463696369636963, + "grad_norm": 0.02085804007947445, + "learning_rate": 2.409909936536961e-06, + "loss": 0.0034, + "num_input_tokens_seen": 33499968, + "step": 158745 + }, + { + "epoch": 17.464246424642464, + "grad_norm": 0.005943531636148691, + "learning_rate": 2.408881924544032e-06, + "loss": 0.0444, + "num_input_tokens_seen": 33500992, + "step": 158750 + }, + { + "epoch": 17.464796479647966, + "grad_norm": 2.210606098175049, + "learning_rate": 2.40785412076259e-06, + "loss": 0.076, + "num_input_tokens_seen": 33502144, + "step": 158755 + }, + { + "epoch": 17.465346534653467, + "grad_norm": 0.04146239906549454, + "learning_rate": 2.4068265252021245e-06, + "loss": 0.003, + "num_input_tokens_seen": 33503232, + "step": 158760 + }, + { + "epoch": 17.465896589658964, + "grad_norm": 0.048011619597673416, + "learning_rate": 2.4057991378720916e-06, + "loss": 0.0152, + "num_input_tokens_seen": 33504288, + "step": 158765 + }, + { + "epoch": 17.466446644664465, + "grad_norm": 0.02578306756913662, + "learning_rate": 2.404771958781965e-06, + "loss": 0.0007, + "num_input_tokens_seen": 33505344, + "step": 158770 + }, + { + "epoch": 17.466996699669966, + "grad_norm": 0.011085542850196362, + "learning_rate": 2.40374498794122e-06, + "loss": 0.0315, + "num_input_tokens_seen": 33506400, + "step": 158775 + }, + { + "epoch": 17.467546754675467, + "grad_norm": 0.09484262019395828, + "learning_rate": 2.40271822535931e-06, + "loss": 0.0092, + "num_input_tokens_seen": 33507424, + "step": 158780 + }, + { + "epoch": 17.46809680968097, + "grad_norm": 0.023670533671975136, + "learning_rate": 2.4016916710457057e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33508512, + "step": 158785 + }, + { + "epoch": 17.46864686468647, + "grad_norm": 0.012978698126971722, + "learning_rate": 2.4006653250098625e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33509600, + "step": 158790 + }, + { + "epoch": 17.46919691969197, + "grad_norm": 0.15257585048675537, + "learning_rate": 2.3996391872612404e-06, + "loss": 0.0266, + "num_input_tokens_seen": 33510688, + "step": 158795 + }, + { + "epoch": 17.46974697469747, + "grad_norm": 0.022921990603208542, + "learning_rate": 2.398613257809307e-06, + "loss": 0.0059, + "num_input_tokens_seen": 33511744, + "step": 158800 + }, + { + "epoch": 17.47029702970297, + "grad_norm": 0.0944310650229454, + "learning_rate": 2.3975875366635036e-06, + "loss": 0.1025, + "num_input_tokens_seen": 33512800, + "step": 158805 + }, + { + "epoch": 17.47084708470847, + "grad_norm": 0.058411385864019394, + "learning_rate": 2.3965620238332933e-06, + "loss": 0.0181, + "num_input_tokens_seen": 33513824, + "step": 158810 + }, + { + "epoch": 17.47139713971397, + "grad_norm": 0.19097119569778442, + "learning_rate": 2.395536719328129e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33514880, + "step": 158815 + }, + { + "epoch": 17.471947194719473, + "grad_norm": 0.16417959332466125, + "learning_rate": 2.394511623157447e-06, + "loss": 0.1793, + "num_input_tokens_seen": 33515904, + "step": 158820 + }, + { + "epoch": 17.472497249724974, + "grad_norm": 0.003132219659164548, + "learning_rate": 2.393486735330705e-06, + "loss": 0.1333, + "num_input_tokens_seen": 33517024, + "step": 158825 + }, + { + "epoch": 17.47304730473047, + "grad_norm": 0.048309918493032455, + "learning_rate": 2.3924620558573503e-06, + "loss": 0.0601, + "num_input_tokens_seen": 33518016, + "step": 158830 + }, + { + "epoch": 17.473597359735972, + "grad_norm": 0.02795286662876606, + "learning_rate": 2.3914375847468277e-06, + "loss": 0.0208, + "num_input_tokens_seen": 33519008, + "step": 158835 + }, + { + "epoch": 17.474147414741473, + "grad_norm": 0.02480485290288925, + "learning_rate": 2.3904133220085778e-06, + "loss": 0.0033, + "num_input_tokens_seen": 33520160, + "step": 158840 + }, + { + "epoch": 17.474697469746975, + "grad_norm": 3.077345132827759, + "learning_rate": 2.389389267652034e-06, + "loss": 0.0518, + "num_input_tokens_seen": 33521184, + "step": 158845 + }, + { + "epoch": 17.475247524752476, + "grad_norm": 0.00569672929123044, + "learning_rate": 2.3883654216866393e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33522272, + "step": 158850 + }, + { + "epoch": 17.475797579757977, + "grad_norm": 0.028302976861596107, + "learning_rate": 2.38734178412183e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33523296, + "step": 158855 + }, + { + "epoch": 17.476347634763478, + "grad_norm": 0.028768403455615044, + "learning_rate": 2.3863183549670433e-06, + "loss": 0.0286, + "num_input_tokens_seen": 33524320, + "step": 158860 + }, + { + "epoch": 17.476897689768975, + "grad_norm": 0.007304881699383259, + "learning_rate": 2.3852951342317133e-06, + "loss": 0.0848, + "num_input_tokens_seen": 33525312, + "step": 158865 + }, + { + "epoch": 17.477447744774476, + "grad_norm": 0.9153130054473877, + "learning_rate": 2.38427212192526e-06, + "loss": 0.0053, + "num_input_tokens_seen": 33526400, + "step": 158870 + }, + { + "epoch": 17.477997799779978, + "grad_norm": 0.3341912627220154, + "learning_rate": 2.3832493180571238e-06, + "loss": 0.0738, + "num_input_tokens_seen": 33527488, + "step": 158875 + }, + { + "epoch": 17.47854785478548, + "grad_norm": 0.02317245863378048, + "learning_rate": 2.382226722636721e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33528544, + "step": 158880 + }, + { + "epoch": 17.47909790979098, + "grad_norm": 0.11073032021522522, + "learning_rate": 2.3812043356734813e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33529600, + "step": 158885 + }, + { + "epoch": 17.47964796479648, + "grad_norm": 0.17188115417957306, + "learning_rate": 2.38018215717683e-06, + "loss": 0.0085, + "num_input_tokens_seen": 33530656, + "step": 158890 + }, + { + "epoch": 17.480198019801982, + "grad_norm": 0.054424528032541275, + "learning_rate": 2.3791601871561845e-06, + "loss": 0.0376, + "num_input_tokens_seen": 33531744, + "step": 158895 + }, + { + "epoch": 17.48074807480748, + "grad_norm": 0.17716343700885773, + "learning_rate": 2.3781384256209684e-06, + "loss": 0.0085, + "num_input_tokens_seen": 33532800, + "step": 158900 + }, + { + "epoch": 17.48129812981298, + "grad_norm": 0.008484158664941788, + "learning_rate": 2.3771168725805927e-06, + "loss": 0.0045, + "num_input_tokens_seen": 33533856, + "step": 158905 + }, + { + "epoch": 17.48184818481848, + "grad_norm": 0.02122570388019085, + "learning_rate": 2.3760955280444725e-06, + "loss": 0.0034, + "num_input_tokens_seen": 33534912, + "step": 158910 + }, + { + "epoch": 17.482398239823983, + "grad_norm": 0.021362708881497383, + "learning_rate": 2.375074392022028e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33535968, + "step": 158915 + }, + { + "epoch": 17.482948294829484, + "grad_norm": 0.08093171566724777, + "learning_rate": 2.374053464522663e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33537056, + "step": 158920 + }, + { + "epoch": 17.483498349834985, + "grad_norm": 0.01702648587524891, + "learning_rate": 2.3730327455557977e-06, + "loss": 0.0034, + "num_input_tokens_seen": 33538048, + "step": 158925 + }, + { + "epoch": 17.484048404840483, + "grad_norm": 0.0695534273982048, + "learning_rate": 2.372012235130827e-06, + "loss": 0.1683, + "num_input_tokens_seen": 33539104, + "step": 158930 + }, + { + "epoch": 17.484598459845984, + "grad_norm": 0.02777084708213806, + "learning_rate": 2.3709919332571606e-06, + "loss": 0.0029, + "num_input_tokens_seen": 33540128, + "step": 158935 + }, + { + "epoch": 17.485148514851485, + "grad_norm": 0.032880764454603195, + "learning_rate": 2.369971839944202e-06, + "loss": 0.0009, + "num_input_tokens_seen": 33541216, + "step": 158940 + }, + { + "epoch": 17.485698569856986, + "grad_norm": 2.592097520828247, + "learning_rate": 2.368951955201354e-06, + "loss": 0.0657, + "num_input_tokens_seen": 33542208, + "step": 158945 + }, + { + "epoch": 17.486248624862487, + "grad_norm": 0.038158465176820755, + "learning_rate": 2.367932279038021e-06, + "loss": 0.0087, + "num_input_tokens_seen": 33543232, + "step": 158950 + }, + { + "epoch": 17.486798679867988, + "grad_norm": 0.011767709627747536, + "learning_rate": 2.3669128114635953e-06, + "loss": 0.0041, + "num_input_tokens_seen": 33544256, + "step": 158955 + }, + { + "epoch": 17.48734873487349, + "grad_norm": 0.03376750648021698, + "learning_rate": 2.365893552487469e-06, + "loss": 0.068, + "num_input_tokens_seen": 33545312, + "step": 158960 + }, + { + "epoch": 17.487898789878987, + "grad_norm": 0.39151594042778015, + "learning_rate": 2.3648745021190438e-06, + "loss": 0.0505, + "num_input_tokens_seen": 33546304, + "step": 158965 + }, + { + "epoch": 17.488448844884488, + "grad_norm": 0.046932756900787354, + "learning_rate": 2.363855660367706e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33547328, + "step": 158970 + }, + { + "epoch": 17.48899889988999, + "grad_norm": 0.005471996497362852, + "learning_rate": 2.3628370272428564e-06, + "loss": 0.0086, + "num_input_tokens_seen": 33548352, + "step": 158975 + }, + { + "epoch": 17.48954895489549, + "grad_norm": 0.11513522267341614, + "learning_rate": 2.3618186027538736e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33549440, + "step": 158980 + }, + { + "epoch": 17.49009900990099, + "grad_norm": 0.14630328118801117, + "learning_rate": 2.3608003869101393e-06, + "loss": 0.0215, + "num_input_tokens_seen": 33550528, + "step": 158985 + }, + { + "epoch": 17.490649064906492, + "grad_norm": 0.15052571892738342, + "learning_rate": 2.359782379721051e-06, + "loss": 0.0828, + "num_input_tokens_seen": 33551616, + "step": 158990 + }, + { + "epoch": 17.49119911991199, + "grad_norm": 0.02516535297036171, + "learning_rate": 2.3587645811959796e-06, + "loss": 0.1041, + "num_input_tokens_seen": 33552608, + "step": 158995 + }, + { + "epoch": 17.49174917491749, + "grad_norm": 1.9137516021728516, + "learning_rate": 2.357746991344312e-06, + "loss": 0.0457, + "num_input_tokens_seen": 33553696, + "step": 159000 + }, + { + "epoch": 17.492299229922992, + "grad_norm": 0.04317206144332886, + "learning_rate": 2.356729610175429e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33554784, + "step": 159005 + }, + { + "epoch": 17.492849284928493, + "grad_norm": 0.13931946456432343, + "learning_rate": 2.3557124376987012e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33555872, + "step": 159010 + }, + { + "epoch": 17.493399339933994, + "grad_norm": 0.028058558702468872, + "learning_rate": 2.3546954739235074e-06, + "loss": 0.0383, + "num_input_tokens_seen": 33556928, + "step": 159015 + }, + { + "epoch": 17.493949394939495, + "grad_norm": 0.9488773941993713, + "learning_rate": 2.3536787188592176e-06, + "loss": 0.0124, + "num_input_tokens_seen": 33557952, + "step": 159020 + }, + { + "epoch": 17.494499449944996, + "grad_norm": 0.018467415124177933, + "learning_rate": 2.3526621725152022e-06, + "loss": 0.0072, + "num_input_tokens_seen": 33558976, + "step": 159025 + }, + { + "epoch": 17.495049504950494, + "grad_norm": 0.026576660573482513, + "learning_rate": 2.351645834900837e-06, + "loss": 0.0857, + "num_input_tokens_seen": 33560064, + "step": 159030 + }, + { + "epoch": 17.495599559955995, + "grad_norm": 3.9047956466674805, + "learning_rate": 2.3506297060254813e-06, + "loss": 0.0893, + "num_input_tokens_seen": 33561056, + "step": 159035 + }, + { + "epoch": 17.496149614961496, + "grad_norm": 0.26347190141677856, + "learning_rate": 2.349613785898508e-06, + "loss": 0.0052, + "num_input_tokens_seen": 33562144, + "step": 159040 + }, + { + "epoch": 17.496699669966997, + "grad_norm": 0.0786651223897934, + "learning_rate": 2.348598074529271e-06, + "loss": 0.0054, + "num_input_tokens_seen": 33563200, + "step": 159045 + }, + { + "epoch": 17.497249724972498, + "grad_norm": 0.0191484484821558, + "learning_rate": 2.34758257192714e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33564224, + "step": 159050 + }, + { + "epoch": 17.497799779978, + "grad_norm": 0.023506971076130867, + "learning_rate": 2.3465672781014663e-06, + "loss": 0.0473, + "num_input_tokens_seen": 33565248, + "step": 159055 + }, + { + "epoch": 17.498349834983497, + "grad_norm": 0.019618866965174675, + "learning_rate": 2.345552193061612e-06, + "loss": 0.0009, + "num_input_tokens_seen": 33566336, + "step": 159060 + }, + { + "epoch": 17.498899889988998, + "grad_norm": 0.03388388454914093, + "learning_rate": 2.344537316816936e-06, + "loss": 0.0571, + "num_input_tokens_seen": 33567424, + "step": 159065 + }, + { + "epoch": 17.4994499449945, + "grad_norm": 0.038824502378702164, + "learning_rate": 2.343522649376789e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33568448, + "step": 159070 + }, + { + "epoch": 17.5, + "grad_norm": 0.07336699962615967, + "learning_rate": 2.3425081907505198e-06, + "loss": 0.003, + "num_input_tokens_seen": 33569504, + "step": 159075 + }, + { + "epoch": 17.5005500550055, + "grad_norm": 0.017237430438399315, + "learning_rate": 2.3414939409474783e-06, + "loss": 0.0056, + "num_input_tokens_seen": 33570560, + "step": 159080 + }, + { + "epoch": 17.501100110011002, + "grad_norm": 0.03998202085494995, + "learning_rate": 2.3404798999770134e-06, + "loss": 0.0103, + "num_input_tokens_seen": 33571616, + "step": 159085 + }, + { + "epoch": 17.501650165016503, + "grad_norm": 1.391804575920105, + "learning_rate": 2.339466067848478e-06, + "loss": 0.0118, + "num_input_tokens_seen": 33572672, + "step": 159090 + }, + { + "epoch": 17.502200220022, + "grad_norm": 0.042689479887485504, + "learning_rate": 2.33845244457121e-06, + "loss": 0.0845, + "num_input_tokens_seen": 33573728, + "step": 159095 + }, + { + "epoch": 17.502750275027502, + "grad_norm": 0.010382665321230888, + "learning_rate": 2.3374390301545477e-06, + "loss": 0.0055, + "num_input_tokens_seen": 33574784, + "step": 159100 + }, + { + "epoch": 17.503300330033003, + "grad_norm": 0.08244120329618454, + "learning_rate": 2.336425824607841e-06, + "loss": 0.05, + "num_input_tokens_seen": 33575840, + "step": 159105 + }, + { + "epoch": 17.503850385038504, + "grad_norm": 0.036308303475379944, + "learning_rate": 2.335412827940417e-06, + "loss": 0.0037, + "num_input_tokens_seen": 33576896, + "step": 159110 + }, + { + "epoch": 17.504400440044005, + "grad_norm": 2.277818441390991, + "learning_rate": 2.334400040161616e-06, + "loss": 0.1986, + "num_input_tokens_seen": 33577952, + "step": 159115 + }, + { + "epoch": 17.504950495049506, + "grad_norm": 0.2422078400850296, + "learning_rate": 2.333387461280781e-06, + "loss": 0.0045, + "num_input_tokens_seen": 33579072, + "step": 159120 + }, + { + "epoch": 17.505500550055004, + "grad_norm": 0.054441556334495544, + "learning_rate": 2.332375091307229e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33580160, + "step": 159125 + }, + { + "epoch": 17.506050605060505, + "grad_norm": 0.1639147251844406, + "learning_rate": 2.3313629302503053e-06, + "loss": 0.0062, + "num_input_tokens_seen": 33581216, + "step": 159130 + }, + { + "epoch": 17.506600660066006, + "grad_norm": 0.18161991238594055, + "learning_rate": 2.330350978119328e-06, + "loss": 0.0184, + "num_input_tokens_seen": 33582304, + "step": 159135 + }, + { + "epoch": 17.507150715071507, + "grad_norm": 0.025513971224427223, + "learning_rate": 2.3293392349236277e-06, + "loss": 0.0035, + "num_input_tokens_seen": 33583328, + "step": 159140 + }, + { + "epoch": 17.507700770077008, + "grad_norm": 0.8047890067100525, + "learning_rate": 2.3283277006725335e-06, + "loss": 0.0064, + "num_input_tokens_seen": 33584320, + "step": 159145 + }, + { + "epoch": 17.50825082508251, + "grad_norm": 0.01119446661323309, + "learning_rate": 2.32731637537536e-06, + "loss": 0.0038, + "num_input_tokens_seen": 33585312, + "step": 159150 + }, + { + "epoch": 17.50880088008801, + "grad_norm": 2.3598039150238037, + "learning_rate": 2.326305259041436e-06, + "loss": 0.1294, + "num_input_tokens_seen": 33586400, + "step": 159155 + }, + { + "epoch": 17.509350935093508, + "grad_norm": 0.0312301404774189, + "learning_rate": 2.3252943516800713e-06, + "loss": 0.002, + "num_input_tokens_seen": 33587488, + "step": 159160 + }, + { + "epoch": 17.50990099009901, + "grad_norm": 5.848932266235352, + "learning_rate": 2.3242836533005936e-06, + "loss": 0.1119, + "num_input_tokens_seen": 33588544, + "step": 159165 + }, + { + "epoch": 17.51045104510451, + "grad_norm": 0.027504654601216316, + "learning_rate": 2.3232731639123096e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33589568, + "step": 159170 + }, + { + "epoch": 17.51100110011001, + "grad_norm": 0.16457214951515198, + "learning_rate": 2.322262883524534e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33590688, + "step": 159175 + }, + { + "epoch": 17.511551155115512, + "grad_norm": 0.024383660405874252, + "learning_rate": 2.321252812146582e-06, + "loss": 0.0325, + "num_input_tokens_seen": 33591776, + "step": 159180 + }, + { + "epoch": 17.512101210121013, + "grad_norm": 0.05173816531896591, + "learning_rate": 2.3202429497877598e-06, + "loss": 0.0045, + "num_input_tokens_seen": 33592832, + "step": 159185 + }, + { + "epoch": 17.51265126512651, + "grad_norm": 0.02455834485590458, + "learning_rate": 2.3192332964573764e-06, + "loss": 0.0037, + "num_input_tokens_seen": 33593920, + "step": 159190 + }, + { + "epoch": 17.513201320132012, + "grad_norm": 0.11065201461315155, + "learning_rate": 2.318223852164736e-06, + "loss": 0.007, + "num_input_tokens_seen": 33595040, + "step": 159195 + }, + { + "epoch": 17.513751375137513, + "grad_norm": 0.009813206270337105, + "learning_rate": 2.317214616919139e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33596000, + "step": 159200 + }, + { + "epoch": 17.514301430143014, + "grad_norm": 0.1627066433429718, + "learning_rate": 2.3162055907298973e-06, + "loss": 0.0074, + "num_input_tokens_seen": 33597088, + "step": 159205 + }, + { + "epoch": 17.514851485148515, + "grad_norm": 0.010183525271713734, + "learning_rate": 2.3151967736062985e-06, + "loss": 0.0178, + "num_input_tokens_seen": 33598144, + "step": 159210 + }, + { + "epoch": 17.515401540154016, + "grad_norm": 0.03505745902657509, + "learning_rate": 2.3141881655576488e-06, + "loss": 0.027, + "num_input_tokens_seen": 33599296, + "step": 159215 + }, + { + "epoch": 17.515951595159517, + "grad_norm": 0.30093473196029663, + "learning_rate": 2.313179766593243e-06, + "loss": 0.0053, + "num_input_tokens_seen": 33600352, + "step": 159220 + }, + { + "epoch": 17.516501650165015, + "grad_norm": 0.009693343192338943, + "learning_rate": 2.312171576722369e-06, + "loss": 0.0054, + "num_input_tokens_seen": 33601376, + "step": 159225 + }, + { + "epoch": 17.517051705170516, + "grad_norm": 0.019340984523296356, + "learning_rate": 2.3111635959543217e-06, + "loss": 0.1131, + "num_input_tokens_seen": 33602432, + "step": 159230 + }, + { + "epoch": 17.517601760176017, + "grad_norm": 1.8817628622055054, + "learning_rate": 2.3101558242983934e-06, + "loss": 0.066, + "num_input_tokens_seen": 33603456, + "step": 159235 + }, + { + "epoch": 17.51815181518152, + "grad_norm": 2.3232672214508057, + "learning_rate": 2.3091482617638743e-06, + "loss": 0.0609, + "num_input_tokens_seen": 33604544, + "step": 159240 + }, + { + "epoch": 17.51870187018702, + "grad_norm": 0.008680363185703754, + "learning_rate": 2.3081409083600484e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33605664, + "step": 159245 + }, + { + "epoch": 17.51925192519252, + "grad_norm": 0.09645060449838638, + "learning_rate": 2.307133764096195e-06, + "loss": 0.1163, + "num_input_tokens_seen": 33606752, + "step": 159250 + }, + { + "epoch": 17.519801980198018, + "grad_norm": 1.9852181673049927, + "learning_rate": 2.3061268289816e-06, + "loss": 0.1692, + "num_input_tokens_seen": 33607808, + "step": 159255 + }, + { + "epoch": 17.52035203520352, + "grad_norm": 0.007902480661869049, + "learning_rate": 2.3051201030255486e-06, + "loss": 0.054, + "num_input_tokens_seen": 33608864, + "step": 159260 + }, + { + "epoch": 17.52090209020902, + "grad_norm": 0.04850330203771591, + "learning_rate": 2.3041135862373107e-06, + "loss": 0.069, + "num_input_tokens_seen": 33609888, + "step": 159265 + }, + { + "epoch": 17.52145214521452, + "grad_norm": 0.056018415838479996, + "learning_rate": 2.3031072786261737e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33610944, + "step": 159270 + }, + { + "epoch": 17.522002200220022, + "grad_norm": 0.010377038270235062, + "learning_rate": 2.302101180201399e-06, + "loss": 0.0982, + "num_input_tokens_seen": 33612032, + "step": 159275 + }, + { + "epoch": 17.522552255225524, + "grad_norm": 0.05052010342478752, + "learning_rate": 2.301095290972272e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33613056, + "step": 159280 + }, + { + "epoch": 17.523102310231025, + "grad_norm": 0.13368941843509674, + "learning_rate": 2.300089610948053e-06, + "loss": 0.0282, + "num_input_tokens_seen": 33614080, + "step": 159285 + }, + { + "epoch": 17.523652365236522, + "grad_norm": 0.005878273863345385, + "learning_rate": 2.2990841401380136e-06, + "loss": 0.0407, + "num_input_tokens_seen": 33615104, + "step": 159290 + }, + { + "epoch": 17.524202420242023, + "grad_norm": 0.3947955071926117, + "learning_rate": 2.2980788785514294e-06, + "loss": 0.0184, + "num_input_tokens_seen": 33616160, + "step": 159295 + }, + { + "epoch": 17.524752475247524, + "grad_norm": 0.03210034593939781, + "learning_rate": 2.2970738261975545e-06, + "loss": 0.0045, + "num_input_tokens_seen": 33617216, + "step": 159300 + }, + { + "epoch": 17.525302530253025, + "grad_norm": 0.10441086441278458, + "learning_rate": 2.2960689830856615e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33618272, + "step": 159305 + }, + { + "epoch": 17.525852585258527, + "grad_norm": 0.05957374349236488, + "learning_rate": 2.295064349225004e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33619424, + "step": 159310 + }, + { + "epoch": 17.526402640264028, + "grad_norm": 1.3482990264892578, + "learning_rate": 2.2940599246248364e-06, + "loss": 0.0329, + "num_input_tokens_seen": 33620480, + "step": 159315 + }, + { + "epoch": 17.52695269526953, + "grad_norm": 0.04511106386780739, + "learning_rate": 2.293055709294431e-06, + "loss": 0.162, + "num_input_tokens_seen": 33621472, + "step": 159320 + }, + { + "epoch": 17.527502750275026, + "grad_norm": 0.04216131195425987, + "learning_rate": 2.2920517032430284e-06, + "loss": 0.0465, + "num_input_tokens_seen": 33622528, + "step": 159325 + }, + { + "epoch": 17.528052805280527, + "grad_norm": 0.004394430201500654, + "learning_rate": 2.2910479064798985e-06, + "loss": 0.0757, + "num_input_tokens_seen": 33623616, + "step": 159330 + }, + { + "epoch": 17.52860286028603, + "grad_norm": 0.09574887901544571, + "learning_rate": 2.2900443190142785e-06, + "loss": 0.0182, + "num_input_tokens_seen": 33624608, + "step": 159335 + }, + { + "epoch": 17.52915291529153, + "grad_norm": 0.03917226940393448, + "learning_rate": 2.289040940855422e-06, + "loss": 0.0436, + "num_input_tokens_seen": 33625664, + "step": 159340 + }, + { + "epoch": 17.52970297029703, + "grad_norm": 0.0022969995625317097, + "learning_rate": 2.2880377720125744e-06, + "loss": 0.103, + "num_input_tokens_seen": 33626720, + "step": 159345 + }, + { + "epoch": 17.53025302530253, + "grad_norm": 0.01802210696041584, + "learning_rate": 2.287034812494987e-06, + "loss": 0.0008, + "num_input_tokens_seen": 33627744, + "step": 159350 + }, + { + "epoch": 17.53080308030803, + "grad_norm": 0.8109975457191467, + "learning_rate": 2.286032062311902e-06, + "loss": 0.0738, + "num_input_tokens_seen": 33628800, + "step": 159355 + }, + { + "epoch": 17.53135313531353, + "grad_norm": 0.1570483148097992, + "learning_rate": 2.2850295214725647e-06, + "loss": 0.0054, + "num_input_tokens_seen": 33629888, + "step": 159360 + }, + { + "epoch": 17.53190319031903, + "grad_norm": 0.2613614499568939, + "learning_rate": 2.284027189986204e-06, + "loss": 0.0041, + "num_input_tokens_seen": 33630944, + "step": 159365 + }, + { + "epoch": 17.532453245324533, + "grad_norm": 0.18103161454200745, + "learning_rate": 2.2830250678620686e-06, + "loss": 0.0064, + "num_input_tokens_seen": 33631968, + "step": 159370 + }, + { + "epoch": 17.533003300330034, + "grad_norm": 0.004845576826483011, + "learning_rate": 2.2820231551093805e-06, + "loss": 0.0054, + "num_input_tokens_seen": 33633152, + "step": 159375 + }, + { + "epoch": 17.533553355335535, + "grad_norm": 0.19805097579956055, + "learning_rate": 2.2810214517373945e-06, + "loss": 0.004, + "num_input_tokens_seen": 33634176, + "step": 159380 + }, + { + "epoch": 17.534103410341036, + "grad_norm": 0.10715442895889282, + "learning_rate": 2.2800199577553307e-06, + "loss": 0.1155, + "num_input_tokens_seen": 33635168, + "step": 159385 + }, + { + "epoch": 17.534653465346533, + "grad_norm": 0.018992574885487556, + "learning_rate": 2.2790186731724174e-06, + "loss": 0.0073, + "num_input_tokens_seen": 33636224, + "step": 159390 + }, + { + "epoch": 17.535203520352034, + "grad_norm": 1.6485648155212402, + "learning_rate": 2.278017597997892e-06, + "loss": 0.0591, + "num_input_tokens_seen": 33637312, + "step": 159395 + }, + { + "epoch": 17.535753575357536, + "grad_norm": 0.043960124254226685, + "learning_rate": 2.277016732240969e-06, + "loss": 0.0545, + "num_input_tokens_seen": 33638368, + "step": 159400 + }, + { + "epoch": 17.536303630363037, + "grad_norm": 0.00709952088072896, + "learning_rate": 2.2760160759108805e-06, + "loss": 0.0071, + "num_input_tokens_seen": 33639488, + "step": 159405 + }, + { + "epoch": 17.536853685368538, + "grad_norm": 0.19783204793930054, + "learning_rate": 2.2750156290168522e-06, + "loss": 0.0265, + "num_input_tokens_seen": 33640512, + "step": 159410 + }, + { + "epoch": 17.53740374037404, + "grad_norm": 0.5792677998542786, + "learning_rate": 2.2740153915680963e-06, + "loss": 0.0062, + "num_input_tokens_seen": 33641568, + "step": 159415 + }, + { + "epoch": 17.537953795379536, + "grad_norm": 0.05257796496152878, + "learning_rate": 2.273015363573838e-06, + "loss": 0.0076, + "num_input_tokens_seen": 33642624, + "step": 159420 + }, + { + "epoch": 17.538503850385037, + "grad_norm": 0.6346604824066162, + "learning_rate": 2.272015545043288e-06, + "loss": 0.0042, + "num_input_tokens_seen": 33643712, + "step": 159425 + }, + { + "epoch": 17.53905390539054, + "grad_norm": 0.01091501023620367, + "learning_rate": 2.271015935985671e-06, + "loss": 0.0132, + "num_input_tokens_seen": 33644768, + "step": 159430 + }, + { + "epoch": 17.53960396039604, + "grad_norm": 0.0965743437409401, + "learning_rate": 2.2700165364101856e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33645792, + "step": 159435 + }, + { + "epoch": 17.54015401540154, + "grad_norm": 0.2566201686859131, + "learning_rate": 2.269017346326055e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33646784, + "step": 159440 + }, + { + "epoch": 17.540704070407042, + "grad_norm": 0.2004665732383728, + "learning_rate": 2.268018365742486e-06, + "loss": 0.0272, + "num_input_tokens_seen": 33647840, + "step": 159445 + }, + { + "epoch": 17.541254125412543, + "grad_norm": 4.858467102050781, + "learning_rate": 2.2670195946686844e-06, + "loss": 0.0316, + "num_input_tokens_seen": 33648960, + "step": 159450 + }, + { + "epoch": 17.54180418041804, + "grad_norm": 0.013024801388382912, + "learning_rate": 2.2660210331138492e-06, + "loss": 0.0009, + "num_input_tokens_seen": 33650048, + "step": 159455 + }, + { + "epoch": 17.54235423542354, + "grad_norm": 0.03008786030113697, + "learning_rate": 2.2650226810871922e-06, + "loss": 0.0263, + "num_input_tokens_seen": 33651104, + "step": 159460 + }, + { + "epoch": 17.542904290429043, + "grad_norm": 0.0027154749259352684, + "learning_rate": 2.264024538597909e-06, + "loss": 0.0096, + "num_input_tokens_seen": 33652160, + "step": 159465 + }, + { + "epoch": 17.543454345434544, + "grad_norm": 0.052782148122787476, + "learning_rate": 2.263026605655208e-06, + "loss": 0.0064, + "num_input_tokens_seen": 33653280, + "step": 159470 + }, + { + "epoch": 17.544004400440045, + "grad_norm": 0.007899221032857895, + "learning_rate": 2.26202888226828e-06, + "loss": 0.0625, + "num_input_tokens_seen": 33654304, + "step": 159475 + }, + { + "epoch": 17.544554455445546, + "grad_norm": 3.0715558528900146, + "learning_rate": 2.2610313684463177e-06, + "loss": 0.0495, + "num_input_tokens_seen": 33655424, + "step": 159480 + }, + { + "epoch": 17.545104510451043, + "grad_norm": 0.039087165147066116, + "learning_rate": 2.260034064198521e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33656480, + "step": 159485 + }, + { + "epoch": 17.545654565456545, + "grad_norm": 0.03506819158792496, + "learning_rate": 2.259036969534073e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33657472, + "step": 159490 + }, + { + "epoch": 17.546204620462046, + "grad_norm": 0.0099294763058424, + "learning_rate": 2.2580400844621708e-06, + "loss": 0.0902, + "num_input_tokens_seen": 33658528, + "step": 159495 + }, + { + "epoch": 17.546754675467547, + "grad_norm": 0.018520137295126915, + "learning_rate": 2.2570434089920044e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33659520, + "step": 159500 + }, + { + "epoch": 17.547304730473048, + "grad_norm": 0.00847394485026598, + "learning_rate": 2.256046943132753e-06, + "loss": 0.0608, + "num_input_tokens_seen": 33660576, + "step": 159505 + }, + { + "epoch": 17.54785478547855, + "grad_norm": 0.3238179087638855, + "learning_rate": 2.255050686893606e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33661632, + "step": 159510 + }, + { + "epoch": 17.54840484048405, + "grad_norm": 0.5599884986877441, + "learning_rate": 2.25405464028374e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33662688, + "step": 159515 + }, + { + "epoch": 17.548954895489548, + "grad_norm": 0.0066494448110461235, + "learning_rate": 2.2530588033123356e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33663776, + "step": 159520 + }, + { + "epoch": 17.54950495049505, + "grad_norm": 0.2971493899822235, + "learning_rate": 2.252063175988578e-06, + "loss": 0.0331, + "num_input_tokens_seen": 33664896, + "step": 159525 + }, + { + "epoch": 17.55005500550055, + "grad_norm": 0.14993339776992798, + "learning_rate": 2.2510677583216346e-06, + "loss": 0.0038, + "num_input_tokens_seen": 33666016, + "step": 159530 + }, + { + "epoch": 17.55060506050605, + "grad_norm": 0.06951441615819931, + "learning_rate": 2.2500725503206892e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33667072, + "step": 159535 + }, + { + "epoch": 17.551155115511552, + "grad_norm": 0.007807530462741852, + "learning_rate": 2.2490775519949048e-06, + "loss": 0.0005, + "num_input_tokens_seen": 33668096, + "step": 159540 + }, + { + "epoch": 17.551705170517053, + "grad_norm": 0.09310563653707504, + "learning_rate": 2.2480827633534566e-06, + "loss": 0.0069, + "num_input_tokens_seen": 33669152, + "step": 159545 + }, + { + "epoch": 17.55225522552255, + "grad_norm": 0.2214161604642868, + "learning_rate": 2.24708818440551e-06, + "loss": 0.0509, + "num_input_tokens_seen": 33670176, + "step": 159550 + }, + { + "epoch": 17.55280528052805, + "grad_norm": 0.024993833154439926, + "learning_rate": 2.246093815160233e-06, + "loss": 0.0452, + "num_input_tokens_seen": 33671232, + "step": 159555 + }, + { + "epoch": 17.553355335533553, + "grad_norm": 0.14115986227989197, + "learning_rate": 2.2450996556267976e-06, + "loss": 0.0035, + "num_input_tokens_seen": 33672320, + "step": 159560 + }, + { + "epoch": 17.553905390539054, + "grad_norm": 0.28255221247673035, + "learning_rate": 2.2441057058143532e-06, + "loss": 0.005, + "num_input_tokens_seen": 33673408, + "step": 159565 + }, + { + "epoch": 17.554455445544555, + "grad_norm": 0.021740395575761795, + "learning_rate": 2.2431119657320697e-06, + "loss": 0.0115, + "num_input_tokens_seen": 33674464, + "step": 159570 + }, + { + "epoch": 17.555005500550056, + "grad_norm": 0.0023674466647207737, + "learning_rate": 2.2421184353891013e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33675488, + "step": 159575 + }, + { + "epoch": 17.555555555555557, + "grad_norm": 1.336052656173706, + "learning_rate": 2.2411251147946065e-06, + "loss": 0.0187, + "num_input_tokens_seen": 33676608, + "step": 159580 + }, + { + "epoch": 17.556105610561055, + "grad_norm": 0.07184773683547974, + "learning_rate": 2.2401320039577483e-06, + "loss": 0.0046, + "num_input_tokens_seen": 33677664, + "step": 159585 + }, + { + "epoch": 17.556655665566556, + "grad_norm": 0.17147725820541382, + "learning_rate": 2.2391391028876635e-06, + "loss": 0.0091, + "num_input_tokens_seen": 33678720, + "step": 159590 + }, + { + "epoch": 17.557205720572057, + "grad_norm": 0.039812713861465454, + "learning_rate": 2.238146411593517e-06, + "loss": 0.0456, + "num_input_tokens_seen": 33679840, + "step": 159595 + }, + { + "epoch": 17.557755775577558, + "grad_norm": 0.05346637964248657, + "learning_rate": 2.237153930084454e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33680960, + "step": 159600 + }, + { + "epoch": 17.55830583058306, + "grad_norm": 0.362773060798645, + "learning_rate": 2.2361616583696176e-06, + "loss": 0.004, + "num_input_tokens_seen": 33682080, + "step": 159605 + }, + { + "epoch": 17.55885588558856, + "grad_norm": 0.012657438404858112, + "learning_rate": 2.235169596458153e-06, + "loss": 0.0465, + "num_input_tokens_seen": 33683168, + "step": 159610 + }, + { + "epoch": 17.55940594059406, + "grad_norm": 0.009668446145951748, + "learning_rate": 2.2341777443592088e-06, + "loss": 0.0864, + "num_input_tokens_seen": 33684192, + "step": 159615 + }, + { + "epoch": 17.55995599559956, + "grad_norm": 0.04788253828883171, + "learning_rate": 2.2331861020819274e-06, + "loss": 0.003, + "num_input_tokens_seen": 33685248, + "step": 159620 + }, + { + "epoch": 17.56050605060506, + "grad_norm": 0.028437118977308273, + "learning_rate": 2.232194669635446e-06, + "loss": 0.0331, + "num_input_tokens_seen": 33686240, + "step": 159625 + }, + { + "epoch": 17.56105610561056, + "grad_norm": 2.015880584716797, + "learning_rate": 2.2312034470288955e-06, + "loss": 0.0678, + "num_input_tokens_seen": 33687296, + "step": 159630 + }, + { + "epoch": 17.561606160616062, + "grad_norm": 0.019940581172704697, + "learning_rate": 2.230212434271417e-06, + "loss": 0.0535, + "num_input_tokens_seen": 33688352, + "step": 159635 + }, + { + "epoch": 17.562156215621563, + "grad_norm": 0.03459131345152855, + "learning_rate": 2.22922163137215e-06, + "loss": 0.0094, + "num_input_tokens_seen": 33689472, + "step": 159640 + }, + { + "epoch": 17.562706270627064, + "grad_norm": 0.018180450424551964, + "learning_rate": 2.228231038340217e-06, + "loss": 0.016, + "num_input_tokens_seen": 33690560, + "step": 159645 + }, + { + "epoch": 17.563256325632562, + "grad_norm": 0.01593577116727829, + "learning_rate": 2.2272406551847534e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33691616, + "step": 159650 + }, + { + "epoch": 17.563806380638063, + "grad_norm": 0.01114885974675417, + "learning_rate": 2.226250481914882e-06, + "loss": 0.1173, + "num_input_tokens_seen": 33692640, + "step": 159655 + }, + { + "epoch": 17.564356435643564, + "grad_norm": 0.005727173760533333, + "learning_rate": 2.2252605185397373e-06, + "loss": 0.0038, + "num_input_tokens_seen": 33693696, + "step": 159660 + }, + { + "epoch": 17.564906490649065, + "grad_norm": 1.8635188341140747, + "learning_rate": 2.224270765068434e-06, + "loss": 0.0518, + "num_input_tokens_seen": 33694752, + "step": 159665 + }, + { + "epoch": 17.565456545654566, + "grad_norm": 0.04479685798287392, + "learning_rate": 2.2232812215100984e-06, + "loss": 0.0714, + "num_input_tokens_seen": 33695776, + "step": 159670 + }, + { + "epoch": 17.566006600660067, + "grad_norm": 0.017444530501961708, + "learning_rate": 2.2222918878738536e-06, + "loss": 0.0053, + "num_input_tokens_seen": 33696832, + "step": 159675 + }, + { + "epoch": 17.566556655665565, + "grad_norm": 0.14986532926559448, + "learning_rate": 2.2213027641688116e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33697920, + "step": 159680 + }, + { + "epoch": 17.567106710671066, + "grad_norm": 0.5847370028495789, + "learning_rate": 2.2203138504040934e-06, + "loss": 0.0482, + "num_input_tokens_seen": 33698976, + "step": 159685 + }, + { + "epoch": 17.567656765676567, + "grad_norm": 0.09529051184654236, + "learning_rate": 2.2193251465888103e-06, + "loss": 0.0329, + "num_input_tokens_seen": 33700000, + "step": 159690 + }, + { + "epoch": 17.568206820682068, + "grad_norm": 0.7089125514030457, + "learning_rate": 2.2183366527320753e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33701120, + "step": 159695 + }, + { + "epoch": 17.56875687568757, + "grad_norm": 0.25727206468582153, + "learning_rate": 2.217348368843003e-06, + "loss": 0.0018, + "num_input_tokens_seen": 33702176, + "step": 159700 + }, + { + "epoch": 17.56930693069307, + "grad_norm": 0.01775665581226349, + "learning_rate": 2.2163602949306942e-06, + "loss": 0.0008, + "num_input_tokens_seen": 33703232, + "step": 159705 + }, + { + "epoch": 17.56985698569857, + "grad_norm": 0.04019344970583916, + "learning_rate": 2.2153724310042644e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33704320, + "step": 159710 + }, + { + "epoch": 17.57040704070407, + "grad_norm": 0.008898287080228329, + "learning_rate": 2.2143847770728144e-06, + "loss": 0.0042, + "num_input_tokens_seen": 33705344, + "step": 159715 + }, + { + "epoch": 17.57095709570957, + "grad_norm": 0.009555656462907791, + "learning_rate": 2.213397333145442e-06, + "loss": 0.036, + "num_input_tokens_seen": 33706368, + "step": 159720 + }, + { + "epoch": 17.57150715071507, + "grad_norm": 0.02589072287082672, + "learning_rate": 2.212410099231252e-06, + "loss": 0.0624, + "num_input_tokens_seen": 33707424, + "step": 159725 + }, + { + "epoch": 17.572057205720572, + "grad_norm": 0.8566716909408569, + "learning_rate": 2.2114230753393443e-06, + "loss": 0.0429, + "num_input_tokens_seen": 33708480, + "step": 159730 + }, + { + "epoch": 17.572607260726073, + "grad_norm": 0.07058612257242203, + "learning_rate": 2.210436261478818e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33709536, + "step": 159735 + }, + { + "epoch": 17.573157315731574, + "grad_norm": 0.017860177904367447, + "learning_rate": 2.209449657658769e-06, + "loss": 0.0038, + "num_input_tokens_seen": 33710592, + "step": 159740 + }, + { + "epoch": 17.573707370737075, + "grad_norm": 0.008118433877825737, + "learning_rate": 2.2084632638882775e-06, + "loss": 0.0635, + "num_input_tokens_seen": 33711616, + "step": 159745 + }, + { + "epoch": 17.574257425742573, + "grad_norm": 0.013009125366806984, + "learning_rate": 2.2074770801764453e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33712672, + "step": 159750 + }, + { + "epoch": 17.574807480748074, + "grad_norm": 0.12539851665496826, + "learning_rate": 2.2064911065323603e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33713696, + "step": 159755 + }, + { + "epoch": 17.575357535753575, + "grad_norm": 0.10855406522750854, + "learning_rate": 2.205505342965114e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33714752, + "step": 159760 + }, + { + "epoch": 17.575907590759076, + "grad_norm": 0.1627991646528244, + "learning_rate": 2.204519789483786e-06, + "loss": 0.0806, + "num_input_tokens_seen": 33715840, + "step": 159765 + }, + { + "epoch": 17.576457645764577, + "grad_norm": 0.025713734328746796, + "learning_rate": 2.203534446097458e-06, + "loss": 0.0066, + "num_input_tokens_seen": 33716896, + "step": 159770 + }, + { + "epoch": 17.57700770077008, + "grad_norm": 0.003685109317302704, + "learning_rate": 2.2025493128152165e-06, + "loss": 0.0023, + "num_input_tokens_seen": 33718016, + "step": 159775 + }, + { + "epoch": 17.577557755775576, + "grad_norm": 0.040408600121736526, + "learning_rate": 2.201564389646135e-06, + "loss": 0.1176, + "num_input_tokens_seen": 33719040, + "step": 159780 + }, + { + "epoch": 17.578107810781077, + "grad_norm": 3.2562289237976074, + "learning_rate": 2.200579676599296e-06, + "loss": 0.0501, + "num_input_tokens_seen": 33720032, + "step": 159785 + }, + { + "epoch": 17.578657865786578, + "grad_norm": 0.011753186583518982, + "learning_rate": 2.1995951736837773e-06, + "loss": 0.002, + "num_input_tokens_seen": 33721120, + "step": 159790 + }, + { + "epoch": 17.57920792079208, + "grad_norm": 0.027968939393758774, + "learning_rate": 2.198610880908644e-06, + "loss": 0.0617, + "num_input_tokens_seen": 33722176, + "step": 159795 + }, + { + "epoch": 17.57975797579758, + "grad_norm": 0.07863609492778778, + "learning_rate": 2.1976267982829784e-06, + "loss": 0.0719, + "num_input_tokens_seen": 33723232, + "step": 159800 + }, + { + "epoch": 17.58030803080308, + "grad_norm": 0.036733757704496384, + "learning_rate": 2.1966429258158423e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33724288, + "step": 159805 + }, + { + "epoch": 17.580858085808583, + "grad_norm": 0.008172040805220604, + "learning_rate": 2.195659263516303e-06, + "loss": 0.0102, + "num_input_tokens_seen": 33725376, + "step": 159810 + }, + { + "epoch": 17.58140814081408, + "grad_norm": 0.03670578449964523, + "learning_rate": 2.1946758113934345e-06, + "loss": 0.0087, + "num_input_tokens_seen": 33726400, + "step": 159815 + }, + { + "epoch": 17.58195819581958, + "grad_norm": 0.013721507973968983, + "learning_rate": 2.193692569456293e-06, + "loss": 0.0421, + "num_input_tokens_seen": 33727424, + "step": 159820 + }, + { + "epoch": 17.582508250825082, + "grad_norm": 0.07654841989278793, + "learning_rate": 2.1927095377139496e-06, + "loss": 0.0031, + "num_input_tokens_seen": 33728448, + "step": 159825 + }, + { + "epoch": 17.583058305830583, + "grad_norm": 0.13592401146888733, + "learning_rate": 2.1917267161754545e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33729536, + "step": 159830 + }, + { + "epoch": 17.583608360836084, + "grad_norm": 2.2184979915618896, + "learning_rate": 2.190744104849868e-06, + "loss": 0.0514, + "num_input_tokens_seen": 33730656, + "step": 159835 + }, + { + "epoch": 17.584158415841586, + "grad_norm": 0.5903710722923279, + "learning_rate": 2.189761703746246e-06, + "loss": 0.0056, + "num_input_tokens_seen": 33731776, + "step": 159840 + }, + { + "epoch": 17.584708470847083, + "grad_norm": 0.0031407789792865515, + "learning_rate": 2.1887795128736483e-06, + "loss": 0.0293, + "num_input_tokens_seen": 33732864, + "step": 159845 + }, + { + "epoch": 17.585258525852584, + "grad_norm": 0.009742996655404568, + "learning_rate": 2.187797532241126e-06, + "loss": 0.001, + "num_input_tokens_seen": 33733952, + "step": 159850 + }, + { + "epoch": 17.585808580858085, + "grad_norm": 0.007764185778796673, + "learning_rate": 2.1868157618577274e-06, + "loss": 0.004, + "num_input_tokens_seen": 33734976, + "step": 159855 + }, + { + "epoch": 17.586358635863586, + "grad_norm": 0.08555538207292557, + "learning_rate": 2.1858342017324978e-06, + "loss": 0.1269, + "num_input_tokens_seen": 33736064, + "step": 159860 + }, + { + "epoch": 17.586908690869087, + "grad_norm": 0.6542907357215881, + "learning_rate": 2.184852851874486e-06, + "loss": 0.024, + "num_input_tokens_seen": 33737088, + "step": 159865 + }, + { + "epoch": 17.58745874587459, + "grad_norm": 0.029537484049797058, + "learning_rate": 2.183871712292737e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33738112, + "step": 159870 + }, + { + "epoch": 17.58800880088009, + "grad_norm": 0.022040322422981262, + "learning_rate": 2.1828907829962998e-06, + "loss": 0.0108, + "num_input_tokens_seen": 33739200, + "step": 159875 + }, + { + "epoch": 17.588558855885587, + "grad_norm": 0.23043303191661835, + "learning_rate": 2.181910063994211e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33740224, + "step": 159880 + }, + { + "epoch": 17.58910891089109, + "grad_norm": 0.01240503415465355, + "learning_rate": 2.180929555295502e-06, + "loss": 0.0221, + "num_input_tokens_seen": 33741312, + "step": 159885 + }, + { + "epoch": 17.58965896589659, + "grad_norm": 0.0780954584479332, + "learning_rate": 2.1799492569092196e-06, + "loss": 0.019, + "num_input_tokens_seen": 33742496, + "step": 159890 + }, + { + "epoch": 17.59020902090209, + "grad_norm": 0.07005976885557175, + "learning_rate": 2.1789691688443912e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33743584, + "step": 159895 + }, + { + "epoch": 17.59075907590759, + "grad_norm": 0.3476962745189667, + "learning_rate": 2.1779892911100524e-06, + "loss": 0.0041, + "num_input_tokens_seen": 33744640, + "step": 159900 + }, + { + "epoch": 17.591309130913093, + "grad_norm": 2.7136919498443604, + "learning_rate": 2.17700962371524e-06, + "loss": 0.1061, + "num_input_tokens_seen": 33745664, + "step": 159905 + }, + { + "epoch": 17.59185918591859, + "grad_norm": 0.016410261392593384, + "learning_rate": 2.1760301666689743e-06, + "loss": 0.0409, + "num_input_tokens_seen": 33746752, + "step": 159910 + }, + { + "epoch": 17.59240924092409, + "grad_norm": 0.007115126587450504, + "learning_rate": 2.17505091998029e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33747776, + "step": 159915 + }, + { + "epoch": 17.592959295929592, + "grad_norm": 0.03208763524889946, + "learning_rate": 2.1740718836582046e-06, + "loss": 0.0521, + "num_input_tokens_seen": 33748832, + "step": 159920 + }, + { + "epoch": 17.593509350935093, + "grad_norm": 0.04243900626897812, + "learning_rate": 2.1730930577117475e-06, + "loss": 0.0243, + "num_input_tokens_seen": 33749888, + "step": 159925 + }, + { + "epoch": 17.594059405940595, + "grad_norm": 0.014497905038297176, + "learning_rate": 2.1721144421499418e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33750976, + "step": 159930 + }, + { + "epoch": 17.594609460946096, + "grad_norm": 0.00854499451816082, + "learning_rate": 2.1711360369817994e-06, + "loss": 0.0494, + "num_input_tokens_seen": 33752064, + "step": 159935 + }, + { + "epoch": 17.595159515951597, + "grad_norm": 0.011188850738108158, + "learning_rate": 2.1701578422163442e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33753088, + "step": 159940 + }, + { + "epoch": 17.595709570957094, + "grad_norm": 0.5021872520446777, + "learning_rate": 2.1691798578625883e-06, + "loss": 0.0069, + "num_input_tokens_seen": 33754208, + "step": 159945 + }, + { + "epoch": 17.596259625962595, + "grad_norm": 0.09938736259937286, + "learning_rate": 2.1682020839295465e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33755264, + "step": 159950 + }, + { + "epoch": 17.596809680968097, + "grad_norm": 0.007194613106548786, + "learning_rate": 2.167224520426228e-06, + "loss": 0.0868, + "num_input_tokens_seen": 33756320, + "step": 159955 + }, + { + "epoch": 17.597359735973598, + "grad_norm": 0.0228874534368515, + "learning_rate": 2.1662471673616456e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33757312, + "step": 159960 + }, + { + "epoch": 17.5979097909791, + "grad_norm": 2.2916946411132812, + "learning_rate": 2.1652700247448117e-06, + "loss": 0.1357, + "num_input_tokens_seen": 33758432, + "step": 159965 + }, + { + "epoch": 17.5984598459846, + "grad_norm": 5.378520965576172, + "learning_rate": 2.164293092584721e-06, + "loss": 0.0288, + "num_input_tokens_seen": 33759520, + "step": 159970 + }, + { + "epoch": 17.599009900990097, + "grad_norm": 0.0634588748216629, + "learning_rate": 2.1633163708903866e-06, + "loss": 0.084, + "num_input_tokens_seen": 33760640, + "step": 159975 + }, + { + "epoch": 17.5995599559956, + "grad_norm": 0.10608270019292831, + "learning_rate": 2.1623398596708034e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33761696, + "step": 159980 + }, + { + "epoch": 17.6001100110011, + "grad_norm": 0.023031925782561302, + "learning_rate": 2.1613635589349756e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33762752, + "step": 159985 + }, + { + "epoch": 17.6006600660066, + "grad_norm": 0.055119141936302185, + "learning_rate": 2.1603874686919048e-06, + "loss": 0.0797, + "num_input_tokens_seen": 33763808, + "step": 159990 + }, + { + "epoch": 17.6012101210121, + "grad_norm": 2.338329315185547, + "learning_rate": 2.15941158895058e-06, + "loss": 0.0087, + "num_input_tokens_seen": 33764832, + "step": 159995 + }, + { + "epoch": 17.601760176017603, + "grad_norm": 2.0598835945129395, + "learning_rate": 2.1584359197200004e-06, + "loss": 0.0669, + "num_input_tokens_seen": 33765856, + "step": 160000 + }, + { + "epoch": 17.602310231023104, + "grad_norm": 0.08630669116973877, + "learning_rate": 2.1574604610091587e-06, + "loss": 0.0391, + "num_input_tokens_seen": 33766912, + "step": 160005 + }, + { + "epoch": 17.6028602860286, + "grad_norm": 0.6984458565711975, + "learning_rate": 2.156485212827039e-06, + "loss": 0.0091, + "num_input_tokens_seen": 33767936, + "step": 160010 + }, + { + "epoch": 17.603410341034103, + "grad_norm": 0.016866611316800117, + "learning_rate": 2.155510175182632e-06, + "loss": 0.004, + "num_input_tokens_seen": 33768960, + "step": 160015 + }, + { + "epoch": 17.603960396039604, + "grad_norm": 0.033643320202827454, + "learning_rate": 2.154535348084932e-06, + "loss": 0.0321, + "num_input_tokens_seen": 33770048, + "step": 160020 + }, + { + "epoch": 17.604510451045105, + "grad_norm": 0.02297912910580635, + "learning_rate": 2.1535607315429112e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33771072, + "step": 160025 + }, + { + "epoch": 17.605060506050606, + "grad_norm": 0.0441063791513443, + "learning_rate": 2.152586325565564e-06, + "loss": 0.0073, + "num_input_tokens_seen": 33772128, + "step": 160030 + }, + { + "epoch": 17.605610561056107, + "grad_norm": 1.0223578214645386, + "learning_rate": 2.151612130161859e-06, + "loss": 0.0329, + "num_input_tokens_seen": 33773216, + "step": 160035 + }, + { + "epoch": 17.606160616061608, + "grad_norm": 1.253333568572998, + "learning_rate": 2.150638145340783e-06, + "loss": 0.026, + "num_input_tokens_seen": 33774304, + "step": 160040 + }, + { + "epoch": 17.606710671067106, + "grad_norm": 0.0872347429394722, + "learning_rate": 2.1496643711113153e-06, + "loss": 0.031, + "num_input_tokens_seen": 33775392, + "step": 160045 + }, + { + "epoch": 17.607260726072607, + "grad_norm": 0.04509597271680832, + "learning_rate": 2.148690807482423e-06, + "loss": 0.0579, + "num_input_tokens_seen": 33776480, + "step": 160050 + }, + { + "epoch": 17.607810781078108, + "grad_norm": 0.013167515397071838, + "learning_rate": 2.147717454463086e-06, + "loss": 0.0018, + "num_input_tokens_seen": 33777504, + "step": 160055 + }, + { + "epoch": 17.60836083608361, + "grad_norm": 0.0035746314097195864, + "learning_rate": 2.1467443120622653e-06, + "loss": 0.0018, + "num_input_tokens_seen": 33778560, + "step": 160060 + }, + { + "epoch": 17.60891089108911, + "grad_norm": 0.5221408009529114, + "learning_rate": 2.145771380288944e-06, + "loss": 0.0054, + "num_input_tokens_seen": 33779616, + "step": 160065 + }, + { + "epoch": 17.60946094609461, + "grad_norm": 0.018346253782510757, + "learning_rate": 2.144798659152075e-06, + "loss": 0.1, + "num_input_tokens_seen": 33780640, + "step": 160070 + }, + { + "epoch": 17.61001100110011, + "grad_norm": 0.00497745256870985, + "learning_rate": 2.14382614866063e-06, + "loss": 0.0092, + "num_input_tokens_seen": 33781696, + "step": 160075 + }, + { + "epoch": 17.61056105610561, + "grad_norm": 0.0274963416159153, + "learning_rate": 2.1428538488235754e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33782752, + "step": 160080 + }, + { + "epoch": 17.61111111111111, + "grad_norm": 0.0184108167886734, + "learning_rate": 2.1418817596498663e-06, + "loss": 0.1053, + "num_input_tokens_seen": 33783776, + "step": 160085 + }, + { + "epoch": 17.611661166116612, + "grad_norm": 0.09300710260868073, + "learning_rate": 2.1409098811484674e-06, + "loss": 0.0069, + "num_input_tokens_seen": 33784864, + "step": 160090 + }, + { + "epoch": 17.612211221122113, + "grad_norm": 0.032179635018110275, + "learning_rate": 2.139938213328335e-06, + "loss": 0.001, + "num_input_tokens_seen": 33785856, + "step": 160095 + }, + { + "epoch": 17.612761276127614, + "grad_norm": 0.06065542995929718, + "learning_rate": 2.1389667561984127e-06, + "loss": 0.0071, + "num_input_tokens_seen": 33786912, + "step": 160100 + }, + { + "epoch": 17.61331133113311, + "grad_norm": 1.3497049808502197, + "learning_rate": 2.1379955097676734e-06, + "loss": 0.0396, + "num_input_tokens_seen": 33788000, + "step": 160105 + }, + { + "epoch": 17.613861386138613, + "grad_norm": 0.06580071151256561, + "learning_rate": 2.1370244740450546e-06, + "loss": 0.0039, + "num_input_tokens_seen": 33788992, + "step": 160110 + }, + { + "epoch": 17.614411441144114, + "grad_norm": 4.872659206390381, + "learning_rate": 2.1360536490395154e-06, + "loss": 0.123, + "num_input_tokens_seen": 33790016, + "step": 160115 + }, + { + "epoch": 17.614961496149615, + "grad_norm": 0.09966764599084854, + "learning_rate": 2.1350830347599992e-06, + "loss": 0.0318, + "num_input_tokens_seen": 33791040, + "step": 160120 + }, + { + "epoch": 17.615511551155116, + "grad_norm": 0.03568867966532707, + "learning_rate": 2.134112631215443e-06, + "loss": 0.0272, + "num_input_tokens_seen": 33792128, + "step": 160125 + }, + { + "epoch": 17.616061606160617, + "grad_norm": 0.019803786650300026, + "learning_rate": 2.133142438414801e-06, + "loss": 0.0227, + "num_input_tokens_seen": 33793216, + "step": 160130 + }, + { + "epoch": 17.616611661166118, + "grad_norm": 0.897514820098877, + "learning_rate": 2.1321724563670127e-06, + "loss": 0.0045, + "num_input_tokens_seen": 33794336, + "step": 160135 + }, + { + "epoch": 17.617161716171616, + "grad_norm": 0.02088845893740654, + "learning_rate": 2.131202685081024e-06, + "loss": 0.0208, + "num_input_tokens_seen": 33795360, + "step": 160140 + }, + { + "epoch": 17.617711771177117, + "grad_norm": 0.029551062732934952, + "learning_rate": 2.130233124565764e-06, + "loss": 0.0187, + "num_input_tokens_seen": 33796448, + "step": 160145 + }, + { + "epoch": 17.618261826182618, + "grad_norm": 0.022504029795527458, + "learning_rate": 2.1292637748301674e-06, + "loss": 0.0058, + "num_input_tokens_seen": 33797504, + "step": 160150 + }, + { + "epoch": 17.61881188118812, + "grad_norm": 0.019218066707253456, + "learning_rate": 2.128294635883177e-06, + "loss": 0.0149, + "num_input_tokens_seen": 33798624, + "step": 160155 + }, + { + "epoch": 17.61936193619362, + "grad_norm": 0.053519356995821, + "learning_rate": 2.12732570773371e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33799712, + "step": 160160 + }, + { + "epoch": 17.61991199119912, + "grad_norm": 0.47529229521751404, + "learning_rate": 2.1263569903907182e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33800800, + "step": 160165 + }, + { + "epoch": 17.620462046204622, + "grad_norm": 0.04087110981345177, + "learning_rate": 2.1253884838631167e-06, + "loss": 0.0019, + "num_input_tokens_seen": 33801952, + "step": 160170 + }, + { + "epoch": 17.62101210121012, + "grad_norm": 0.07901938259601593, + "learning_rate": 2.1244201881598287e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33803072, + "step": 160175 + }, + { + "epoch": 17.62156215621562, + "grad_norm": 0.015286900103092194, + "learning_rate": 2.1234521032897857e-06, + "loss": 0.0072, + "num_input_tokens_seen": 33804160, + "step": 160180 + }, + { + "epoch": 17.622112211221122, + "grad_norm": 0.00825437344610691, + "learning_rate": 2.1224842292619033e-06, + "loss": 0.0007, + "num_input_tokens_seen": 33805152, + "step": 160185 + }, + { + "epoch": 17.622662266226623, + "grad_norm": 0.009386587888002396, + "learning_rate": 2.121516566085108e-06, + "loss": 0.0015, + "num_input_tokens_seen": 33806208, + "step": 160190 + }, + { + "epoch": 17.623212321232124, + "grad_norm": 0.00843835063278675, + "learning_rate": 2.120549113768322e-06, + "loss": 0.0038, + "num_input_tokens_seen": 33807296, + "step": 160195 + }, + { + "epoch": 17.623762376237625, + "grad_norm": 0.0030446588061749935, + "learning_rate": 2.1195818723204473e-06, + "loss": 0.0233, + "num_input_tokens_seen": 33808384, + "step": 160200 + }, + { + "epoch": 17.624312431243123, + "grad_norm": 0.12968285381793976, + "learning_rate": 2.118614841750416e-06, + "loss": 0.0029, + "num_input_tokens_seen": 33809408, + "step": 160205 + }, + { + "epoch": 17.624862486248624, + "grad_norm": 0.570166826248169, + "learning_rate": 2.117648022067126e-06, + "loss": 0.0053, + "num_input_tokens_seen": 33810560, + "step": 160210 + }, + { + "epoch": 17.625412541254125, + "grad_norm": 0.01931433565914631, + "learning_rate": 2.116681413279498e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33811584, + "step": 160215 + }, + { + "epoch": 17.625962596259626, + "grad_norm": 3.218160390853882, + "learning_rate": 2.115715015396433e-06, + "loss": 0.0979, + "num_input_tokens_seen": 33812608, + "step": 160220 + }, + { + "epoch": 17.626512651265127, + "grad_norm": 0.002482873620465398, + "learning_rate": 2.1147488284268436e-06, + "loss": 0.007, + "num_input_tokens_seen": 33813760, + "step": 160225 + }, + { + "epoch": 17.627062706270628, + "grad_norm": 0.0326823815703392, + "learning_rate": 2.113782852379634e-06, + "loss": 0.1228, + "num_input_tokens_seen": 33814816, + "step": 160230 + }, + { + "epoch": 17.62761276127613, + "grad_norm": 0.006120490841567516, + "learning_rate": 2.1128170872637078e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33815936, + "step": 160235 + }, + { + "epoch": 17.628162816281627, + "grad_norm": 2.089266061782837, + "learning_rate": 2.111851533087961e-06, + "loss": 0.0698, + "num_input_tokens_seen": 33817024, + "step": 160240 + }, + { + "epoch": 17.628712871287128, + "grad_norm": 0.025431718677282333, + "learning_rate": 2.110886189861294e-06, + "loss": 0.0194, + "num_input_tokens_seen": 33818016, + "step": 160245 + }, + { + "epoch": 17.62926292629263, + "grad_norm": 0.02344849891960621, + "learning_rate": 2.1099210575926066e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33819168, + "step": 160250 + }, + { + "epoch": 17.62981298129813, + "grad_norm": 0.04070404917001724, + "learning_rate": 2.1089561362907963e-06, + "loss": 0.0048, + "num_input_tokens_seen": 33820224, + "step": 160255 + }, + { + "epoch": 17.63036303630363, + "grad_norm": 0.009423301555216312, + "learning_rate": 2.1079914259647533e-06, + "loss": 0.001, + "num_input_tokens_seen": 33821248, + "step": 160260 + }, + { + "epoch": 17.630913091309132, + "grad_norm": 0.01796034909784794, + "learning_rate": 2.107026926623365e-06, + "loss": 0.0008, + "num_input_tokens_seen": 33822272, + "step": 160265 + }, + { + "epoch": 17.63146314631463, + "grad_norm": 2.738170623779297, + "learning_rate": 2.1060626382755274e-06, + "loss": 0.0923, + "num_input_tokens_seen": 33823296, + "step": 160270 + }, + { + "epoch": 17.63201320132013, + "grad_norm": 0.02387610450387001, + "learning_rate": 2.1050985609301216e-06, + "loss": 0.0118, + "num_input_tokens_seen": 33824416, + "step": 160275 + }, + { + "epoch": 17.632563256325632, + "grad_norm": 0.0189544428139925, + "learning_rate": 2.104134694596038e-06, + "loss": 0.0008, + "num_input_tokens_seen": 33825504, + "step": 160280 + }, + { + "epoch": 17.633113311331133, + "grad_norm": 3.07511043548584, + "learning_rate": 2.103171039282162e-06, + "loss": 0.091, + "num_input_tokens_seen": 33826528, + "step": 160285 + }, + { + "epoch": 17.633663366336634, + "grad_norm": 0.01050394494086504, + "learning_rate": 2.102207594997366e-06, + "loss": 0.001, + "num_input_tokens_seen": 33827584, + "step": 160290 + }, + { + "epoch": 17.634213421342135, + "grad_norm": 0.07404574006795883, + "learning_rate": 2.10124436175054e-06, + "loss": 0.0046, + "num_input_tokens_seen": 33828608, + "step": 160295 + }, + { + "epoch": 17.634763476347636, + "grad_norm": 0.06831508129835129, + "learning_rate": 2.10028133955055e-06, + "loss": 0.007, + "num_input_tokens_seen": 33829632, + "step": 160300 + }, + { + "epoch": 17.635313531353134, + "grad_norm": 0.15493012964725494, + "learning_rate": 2.0993185284062826e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33830688, + "step": 160305 + }, + { + "epoch": 17.635863586358635, + "grad_norm": 0.012627520598471165, + "learning_rate": 2.098355928326609e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33831776, + "step": 160310 + }, + { + "epoch": 17.636413641364136, + "grad_norm": 0.03891501948237419, + "learning_rate": 2.097393539320397e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33832864, + "step": 160315 + }, + { + "epoch": 17.636963696369637, + "grad_norm": 0.13108551502227783, + "learning_rate": 2.0964313613965225e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33833920, + "step": 160320 + }, + { + "epoch": 17.63751375137514, + "grad_norm": 0.022775918245315552, + "learning_rate": 2.0954693945638455e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33834944, + "step": 160325 + }, + { + "epoch": 17.63806380638064, + "grad_norm": 1.2371022701263428, + "learning_rate": 2.0945076388312417e-06, + "loss": 0.1812, + "num_input_tokens_seen": 33835968, + "step": 160330 + }, + { + "epoch": 17.638613861386137, + "grad_norm": 0.0018915764521807432, + "learning_rate": 2.093546094207566e-06, + "loss": 0.0917, + "num_input_tokens_seen": 33836992, + "step": 160335 + }, + { + "epoch": 17.639163916391638, + "grad_norm": 0.02713083103299141, + "learning_rate": 2.0925847607016824e-06, + "loss": 0.0073, + "num_input_tokens_seen": 33838048, + "step": 160340 + }, + { + "epoch": 17.63971397139714, + "grad_norm": 0.04588604345917702, + "learning_rate": 2.091623638322457e-06, + "loss": 0.0061, + "num_input_tokens_seen": 33839072, + "step": 160345 + }, + { + "epoch": 17.64026402640264, + "grad_norm": 0.01262262836098671, + "learning_rate": 2.0906627270787403e-06, + "loss": 0.0535, + "num_input_tokens_seen": 33840160, + "step": 160350 + }, + { + "epoch": 17.64081408140814, + "grad_norm": 0.34084203839302063, + "learning_rate": 2.0897020269793984e-06, + "loss": 0.0036, + "num_input_tokens_seen": 33841184, + "step": 160355 + }, + { + "epoch": 17.641364136413642, + "grad_norm": 0.004601175431162119, + "learning_rate": 2.0887415380332733e-06, + "loss": 0.0168, + "num_input_tokens_seen": 33842240, + "step": 160360 + }, + { + "epoch": 17.641914191419144, + "grad_norm": 2.0013186931610107, + "learning_rate": 2.0877812602492246e-06, + "loss": 0.0223, + "num_input_tokens_seen": 33843296, + "step": 160365 + }, + { + "epoch": 17.64246424642464, + "grad_norm": 0.017371011897921562, + "learning_rate": 2.086821193636107e-06, + "loss": 0.0018, + "num_input_tokens_seen": 33844384, + "step": 160370 + }, + { + "epoch": 17.643014301430142, + "grad_norm": 1.859707236289978, + "learning_rate": 2.0858613382027576e-06, + "loss": 0.139, + "num_input_tokens_seen": 33845440, + "step": 160375 + }, + { + "epoch": 17.643564356435643, + "grad_norm": 0.0060996683314442635, + "learning_rate": 2.084901693958033e-06, + "loss": 0.1832, + "num_input_tokens_seen": 33846432, + "step": 160380 + }, + { + "epoch": 17.644114411441144, + "grad_norm": 0.6029657125473022, + "learning_rate": 2.0839422609107735e-06, + "loss": 0.0068, + "num_input_tokens_seen": 33847520, + "step": 160385 + }, + { + "epoch": 17.644664466446645, + "grad_norm": 0.016117507591843605, + "learning_rate": 2.082983039069819e-06, + "loss": 0.0119, + "num_input_tokens_seen": 33848576, + "step": 160390 + }, + { + "epoch": 17.645214521452147, + "grad_norm": 0.04960672929883003, + "learning_rate": 2.082024028444013e-06, + "loss": 0.0056, + "num_input_tokens_seen": 33849632, + "step": 160395 + }, + { + "epoch": 17.645764576457644, + "grad_norm": 0.021077832207083702, + "learning_rate": 2.081065229042195e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33850656, + "step": 160400 + }, + { + "epoch": 17.646314631463145, + "grad_norm": 0.008030653931200504, + "learning_rate": 2.080106640873203e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33851680, + "step": 160405 + }, + { + "epoch": 17.646864686468646, + "grad_norm": 0.06947055459022522, + "learning_rate": 2.0791482639458736e-06, + "loss": 0.032, + "num_input_tokens_seen": 33852800, + "step": 160410 + }, + { + "epoch": 17.647414741474147, + "grad_norm": 0.03590529039502144, + "learning_rate": 2.0781900982690284e-06, + "loss": 0.0353, + "num_input_tokens_seen": 33853792, + "step": 160415 + }, + { + "epoch": 17.64796479647965, + "grad_norm": 0.017023315653204918, + "learning_rate": 2.077232143851507e-06, + "loss": 0.0875, + "num_input_tokens_seen": 33854848, + "step": 160420 + }, + { + "epoch": 17.64851485148515, + "grad_norm": 0.07984425872564316, + "learning_rate": 2.076274400702144e-06, + "loss": 0.0083, + "num_input_tokens_seen": 33855904, + "step": 160425 + }, + { + "epoch": 17.64906490649065, + "grad_norm": 4.198234558105469, + "learning_rate": 2.075316868829755e-06, + "loss": 0.0231, + "num_input_tokens_seen": 33856960, + "step": 160430 + }, + { + "epoch": 17.649614961496148, + "grad_norm": 0.015728091821074486, + "learning_rate": 2.074359548243171e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33858080, + "step": 160435 + }, + { + "epoch": 17.65016501650165, + "grad_norm": 0.9479109048843384, + "learning_rate": 2.0734024389512134e-06, + "loss": 0.0134, + "num_input_tokens_seen": 33859136, + "step": 160440 + }, + { + "epoch": 17.65071507150715, + "grad_norm": 0.03633751720190048, + "learning_rate": 2.0724455409627087e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33860224, + "step": 160445 + }, + { + "epoch": 17.65126512651265, + "grad_norm": 0.016789378598332405, + "learning_rate": 2.0714888542864657e-06, + "loss": 0.007, + "num_input_tokens_seen": 33861280, + "step": 160450 + }, + { + "epoch": 17.651815181518153, + "grad_norm": 0.036507248878479004, + "learning_rate": 2.0705323789313088e-06, + "loss": 0.0316, + "num_input_tokens_seen": 33862304, + "step": 160455 + }, + { + "epoch": 17.652365236523654, + "grad_norm": 0.025586197152733803, + "learning_rate": 2.069576114906055e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33863296, + "step": 160460 + }, + { + "epoch": 17.652915291529155, + "grad_norm": 0.06047661975026131, + "learning_rate": 2.0686200622195117e-06, + "loss": 0.0863, + "num_input_tokens_seen": 33864384, + "step": 160465 + }, + { + "epoch": 17.653465346534652, + "grad_norm": 0.012911728583276272, + "learning_rate": 2.0676642208804996e-06, + "loss": 0.045, + "num_input_tokens_seen": 33865408, + "step": 160470 + }, + { + "epoch": 17.654015401540153, + "grad_norm": 4.992324352264404, + "learning_rate": 2.0667085908978173e-06, + "loss": 0.0949, + "num_input_tokens_seen": 33866432, + "step": 160475 + }, + { + "epoch": 17.654565456545654, + "grad_norm": 0.026512138545513153, + "learning_rate": 2.065753172280277e-06, + "loss": 0.0009, + "num_input_tokens_seen": 33867520, + "step": 160480 + }, + { + "epoch": 17.655115511551156, + "grad_norm": 0.03740109130740166, + "learning_rate": 2.0647979650366882e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33868576, + "step": 160485 + }, + { + "epoch": 17.655665566556657, + "grad_norm": 0.04349955916404724, + "learning_rate": 2.063842969175847e-06, + "loss": 0.0017, + "num_input_tokens_seen": 33869664, + "step": 160490 + }, + { + "epoch": 17.656215621562158, + "grad_norm": 0.1398944854736328, + "learning_rate": 2.0628881847065658e-06, + "loss": 0.066, + "num_input_tokens_seen": 33870688, + "step": 160495 + }, + { + "epoch": 17.656765676567655, + "grad_norm": 0.03364066779613495, + "learning_rate": 2.061933611637637e-06, + "loss": 0.0698, + "num_input_tokens_seen": 33871776, + "step": 160500 + }, + { + "epoch": 17.657315731573156, + "grad_norm": 0.03339523822069168, + "learning_rate": 2.060979249977854e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33872896, + "step": 160505 + }, + { + "epoch": 17.657865786578657, + "grad_norm": 0.2519495487213135, + "learning_rate": 2.060025099736018e-06, + "loss": 0.0065, + "num_input_tokens_seen": 33874016, + "step": 160510 + }, + { + "epoch": 17.65841584158416, + "grad_norm": 0.04061835631728172, + "learning_rate": 2.059071160920925e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33875104, + "step": 160515 + }, + { + "epoch": 17.65896589658966, + "grad_norm": 0.09253372251987457, + "learning_rate": 2.058117433541368e-06, + "loss": 0.0397, + "num_input_tokens_seen": 33876160, + "step": 160520 + }, + { + "epoch": 17.65951595159516, + "grad_norm": 0.029666483402252197, + "learning_rate": 2.0571639176061337e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33877184, + "step": 160525 + }, + { + "epoch": 17.66006600660066, + "grad_norm": 0.015853457152843475, + "learning_rate": 2.0562106131240045e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33878240, + "step": 160530 + }, + { + "epoch": 17.66061606160616, + "grad_norm": 2.7719104290008545, + "learning_rate": 2.055257520103776e-06, + "loss": 0.0709, + "num_input_tokens_seen": 33879232, + "step": 160535 + }, + { + "epoch": 17.66116611661166, + "grad_norm": 0.3793216943740845, + "learning_rate": 2.0543046385542277e-06, + "loss": 0.005, + "num_input_tokens_seen": 33880320, + "step": 160540 + }, + { + "epoch": 17.66171617161716, + "grad_norm": 0.01736743561923504, + "learning_rate": 2.053351968484146e-06, + "loss": 0.0506, + "num_input_tokens_seen": 33881376, + "step": 160545 + }, + { + "epoch": 17.662266226622663, + "grad_norm": 0.731368899345398, + "learning_rate": 2.0523995099023114e-06, + "loss": 0.0113, + "num_input_tokens_seen": 33882432, + "step": 160550 + }, + { + "epoch": 17.662816281628164, + "grad_norm": 0.10750836133956909, + "learning_rate": 2.0514472628174906e-06, + "loss": 0.0035, + "num_input_tokens_seen": 33883456, + "step": 160555 + }, + { + "epoch": 17.663366336633665, + "grad_norm": 0.16611263155937195, + "learning_rate": 2.0504952272384777e-06, + "loss": 0.0025, + "num_input_tokens_seen": 33884480, + "step": 160560 + }, + { + "epoch": 17.663916391639162, + "grad_norm": 0.04708165302872658, + "learning_rate": 2.0495434031740314e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33885600, + "step": 160565 + }, + { + "epoch": 17.664466446644663, + "grad_norm": 0.016650905832648277, + "learning_rate": 2.0485917906329283e-06, + "loss": 0.0033, + "num_input_tokens_seen": 33886656, + "step": 160570 + }, + { + "epoch": 17.665016501650165, + "grad_norm": 0.1302814930677414, + "learning_rate": 2.0476403896239503e-06, + "loss": 0.0022, + "num_input_tokens_seen": 33887744, + "step": 160575 + }, + { + "epoch": 17.665566556655666, + "grad_norm": 0.029804885387420654, + "learning_rate": 2.046689200155849e-06, + "loss": 0.0056, + "num_input_tokens_seen": 33888736, + "step": 160580 + }, + { + "epoch": 17.666116611661167, + "grad_norm": 4.348397254943848, + "learning_rate": 2.045738222237406e-06, + "loss": 0.1184, + "num_input_tokens_seen": 33889792, + "step": 160585 + }, + { + "epoch": 17.666666666666668, + "grad_norm": 0.006899584084749222, + "learning_rate": 2.0447874558773756e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33890912, + "step": 160590 + }, + { + "epoch": 17.66721672167217, + "grad_norm": 0.02231082133948803, + "learning_rate": 2.0438369010845226e-06, + "loss": 0.0574, + "num_input_tokens_seen": 33892032, + "step": 160595 + }, + { + "epoch": 17.667766776677666, + "grad_norm": 0.007368903141468763, + "learning_rate": 2.0428865578676152e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33893024, + "step": 160600 + }, + { + "epoch": 17.668316831683168, + "grad_norm": 0.39743924140930176, + "learning_rate": 2.0419364262353994e-06, + "loss": 0.0114, + "num_input_tokens_seen": 33894016, + "step": 160605 + }, + { + "epoch": 17.66886688668867, + "grad_norm": 0.04583492875099182, + "learning_rate": 2.040986506196646e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33895136, + "step": 160610 + }, + { + "epoch": 17.66941694169417, + "grad_norm": 0.02537214569747448, + "learning_rate": 2.040036797760103e-06, + "loss": 0.0028, + "num_input_tokens_seen": 33896160, + "step": 160615 + }, + { + "epoch": 17.66996699669967, + "grad_norm": 0.08494658023118973, + "learning_rate": 2.0390873009345198e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33897184, + "step": 160620 + }, + { + "epoch": 17.670517051705172, + "grad_norm": 0.04804634675383568, + "learning_rate": 2.0381380157286495e-06, + "loss": 0.0343, + "num_input_tokens_seen": 33898208, + "step": 160625 + }, + { + "epoch": 17.67106710671067, + "grad_norm": 0.22318950295448303, + "learning_rate": 2.037188942151244e-06, + "loss": 0.0336, + "num_input_tokens_seen": 33899232, + "step": 160630 + }, + { + "epoch": 17.67161716171617, + "grad_norm": 0.02748718298971653, + "learning_rate": 2.036240080211052e-06, + "loss": 0.0688, + "num_input_tokens_seen": 33900288, + "step": 160635 + }, + { + "epoch": 17.67216721672167, + "grad_norm": 0.051053524017333984, + "learning_rate": 2.035291429916819e-06, + "loss": 0.0414, + "num_input_tokens_seen": 33901312, + "step": 160640 + }, + { + "epoch": 17.672717271727173, + "grad_norm": 0.10919123888015747, + "learning_rate": 2.03434299127728e-06, + "loss": 0.0014, + "num_input_tokens_seen": 33902272, + "step": 160645 + }, + { + "epoch": 17.673267326732674, + "grad_norm": 0.004854300525039434, + "learning_rate": 2.0333947643011803e-06, + "loss": 0.013, + "num_input_tokens_seen": 33903328, + "step": 160650 + }, + { + "epoch": 17.673817381738175, + "grad_norm": 0.12548333406448364, + "learning_rate": 2.0324467489972606e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33904416, + "step": 160655 + }, + { + "epoch": 17.674367436743676, + "grad_norm": 0.06224915012717247, + "learning_rate": 2.0314989453742635e-06, + "loss": 0.039, + "num_input_tokens_seen": 33905504, + "step": 160660 + }, + { + "epoch": 17.674917491749174, + "grad_norm": 0.002780008828267455, + "learning_rate": 2.030551353440921e-06, + "loss": 0.0021, + "num_input_tokens_seen": 33906592, + "step": 160665 + }, + { + "epoch": 17.675467546754675, + "grad_norm": 0.49239471554756165, + "learning_rate": 2.029603973205957e-06, + "loss": 0.0759, + "num_input_tokens_seen": 33907680, + "step": 160670 + }, + { + "epoch": 17.676017601760176, + "grad_norm": 0.058514345437288284, + "learning_rate": 2.028656804678117e-06, + "loss": 0.1026, + "num_input_tokens_seen": 33908768, + "step": 160675 + }, + { + "epoch": 17.676567656765677, + "grad_norm": 2.418557643890381, + "learning_rate": 2.0277098478661216e-06, + "loss": 0.0147, + "num_input_tokens_seen": 33909824, + "step": 160680 + }, + { + "epoch": 17.677117711771178, + "grad_norm": 0.10854656249284744, + "learning_rate": 2.0267631027787003e-06, + "loss": 0.031, + "num_input_tokens_seen": 33910976, + "step": 160685 + }, + { + "epoch": 17.67766776677668, + "grad_norm": 0.04888947308063507, + "learning_rate": 2.0258165694245825e-06, + "loss": 0.0058, + "num_input_tokens_seen": 33912064, + "step": 160690 + }, + { + "epoch": 17.678217821782177, + "grad_norm": 0.028995715081691742, + "learning_rate": 2.024870247812488e-06, + "loss": 0.0027, + "num_input_tokens_seen": 33913152, + "step": 160695 + }, + { + "epoch": 17.678767876787678, + "grad_norm": 0.07646768540143967, + "learning_rate": 2.0239241379511413e-06, + "loss": 0.0524, + "num_input_tokens_seen": 33914144, + "step": 160700 + }, + { + "epoch": 17.67931793179318, + "grad_norm": 0.08295480161905289, + "learning_rate": 2.0229782398492576e-06, + "loss": 0.0159, + "num_input_tokens_seen": 33915200, + "step": 160705 + }, + { + "epoch": 17.67986798679868, + "grad_norm": 0.04985123872756958, + "learning_rate": 2.02203255351556e-06, + "loss": 0.0069, + "num_input_tokens_seen": 33916352, + "step": 160710 + }, + { + "epoch": 17.68041804180418, + "grad_norm": 0.03990335762500763, + "learning_rate": 2.0210870789587637e-06, + "loss": 0.0598, + "num_input_tokens_seen": 33917472, + "step": 160715 + }, + { + "epoch": 17.680968096809682, + "grad_norm": 0.16408541798591614, + "learning_rate": 2.020141816187579e-06, + "loss": 0.0034, + "num_input_tokens_seen": 33918496, + "step": 160720 + }, + { + "epoch": 17.681518151815183, + "grad_norm": 0.12459888309240341, + "learning_rate": 2.0191967652107237e-06, + "loss": 0.0081, + "num_input_tokens_seen": 33919552, + "step": 160725 + }, + { + "epoch": 17.68206820682068, + "grad_norm": 0.044766440987586975, + "learning_rate": 2.0182519260369015e-06, + "loss": 0.1333, + "num_input_tokens_seen": 33920576, + "step": 160730 + }, + { + "epoch": 17.682618261826182, + "grad_norm": 0.03800896555185318, + "learning_rate": 2.0173072986748254e-06, + "loss": 0.0269, + "num_input_tokens_seen": 33921600, + "step": 160735 + }, + { + "epoch": 17.683168316831683, + "grad_norm": 0.20467768609523773, + "learning_rate": 2.0163628831331964e-06, + "loss": 0.0096, + "num_input_tokens_seen": 33922688, + "step": 160740 + }, + { + "epoch": 17.683718371837184, + "grad_norm": 0.020604247227311134, + "learning_rate": 2.0154186794207246e-06, + "loss": 0.0382, + "num_input_tokens_seen": 33923776, + "step": 160745 + }, + { + "epoch": 17.684268426842685, + "grad_norm": 0.018998203799128532, + "learning_rate": 2.014474687546111e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33924768, + "step": 160750 + }, + { + "epoch": 17.684818481848186, + "grad_norm": 0.007614566944539547, + "learning_rate": 2.013530907518052e-06, + "loss": 0.1031, + "num_input_tokens_seen": 33925856, + "step": 160755 + }, + { + "epoch": 17.685368536853684, + "grad_norm": 0.10399342328310013, + "learning_rate": 2.012587339345251e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33926848, + "step": 160760 + }, + { + "epoch": 17.685918591859185, + "grad_norm": 0.018438121303915977, + "learning_rate": 2.011643983036404e-06, + "loss": 0.0046, + "num_input_tokens_seen": 33927936, + "step": 160765 + }, + { + "epoch": 17.686468646864686, + "grad_norm": 0.09224429726600647, + "learning_rate": 2.0107008386001965e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33928992, + "step": 160770 + }, + { + "epoch": 17.687018701870187, + "grad_norm": 0.05291268602013588, + "learning_rate": 2.0097579060453343e-06, + "loss": 0.002, + "num_input_tokens_seen": 33930112, + "step": 160775 + }, + { + "epoch": 17.687568756875688, + "grad_norm": 0.004912933800369501, + "learning_rate": 2.008815185380497e-06, + "loss": 0.0024, + "num_input_tokens_seen": 33931168, + "step": 160780 + }, + { + "epoch": 17.68811881188119, + "grad_norm": 1.062131404876709, + "learning_rate": 2.0078726766143833e-06, + "loss": 0.012, + "num_input_tokens_seen": 33932288, + "step": 160785 + }, + { + "epoch": 17.68866886688669, + "grad_norm": 0.02269654907286167, + "learning_rate": 2.006930379755673e-06, + "loss": 0.0006, + "num_input_tokens_seen": 33933408, + "step": 160790 + }, + { + "epoch": 17.689218921892188, + "grad_norm": 0.02280663512647152, + "learning_rate": 2.005988294813049e-06, + "loss": 0.0007, + "num_input_tokens_seen": 33934464, + "step": 160795 + }, + { + "epoch": 17.68976897689769, + "grad_norm": 0.00846850872039795, + "learning_rate": 2.005046421795198e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33935488, + "step": 160800 + }, + { + "epoch": 17.69031903190319, + "grad_norm": 0.3419690430164337, + "learning_rate": 2.0041047607108037e-06, + "loss": 0.0032, + "num_input_tokens_seen": 33936576, + "step": 160805 + }, + { + "epoch": 17.69086908690869, + "grad_norm": 0.020719734951853752, + "learning_rate": 2.003163311568537e-06, + "loss": 0.0043, + "num_input_tokens_seen": 33937632, + "step": 160810 + }, + { + "epoch": 17.691419141914192, + "grad_norm": 3.2492763996124268, + "learning_rate": 2.0022220743770833e-06, + "loss": 0.0839, + "num_input_tokens_seen": 33938656, + "step": 160815 + }, + { + "epoch": 17.691969196919693, + "grad_norm": 0.13808201253414154, + "learning_rate": 2.0012810491451094e-06, + "loss": 0.0285, + "num_input_tokens_seen": 33939680, + "step": 160820 + }, + { + "epoch": 17.69251925192519, + "grad_norm": 0.005290674977004528, + "learning_rate": 2.000340235881293e-06, + "loss": 0.0013, + "num_input_tokens_seen": 33940800, + "step": 160825 + }, + { + "epoch": 17.693069306930692, + "grad_norm": 1.861684799194336, + "learning_rate": 1.9993996345943067e-06, + "loss": 0.0668, + "num_input_tokens_seen": 33941888, + "step": 160830 + }, + { + "epoch": 17.693619361936193, + "grad_norm": 0.014385698363184929, + "learning_rate": 1.9984592452928137e-06, + "loss": 0.0428, + "num_input_tokens_seen": 33942944, + "step": 160835 + }, + { + "epoch": 17.694169416941694, + "grad_norm": 0.00989217683672905, + "learning_rate": 1.997519067985487e-06, + "loss": 0.0087, + "num_input_tokens_seen": 33944000, + "step": 160840 + }, + { + "epoch": 17.694719471947195, + "grad_norm": 0.02071797475218773, + "learning_rate": 1.9965791026809873e-06, + "loss": 0.1397, + "num_input_tokens_seen": 33945120, + "step": 160845 + }, + { + "epoch": 17.695269526952696, + "grad_norm": 0.017641307786107063, + "learning_rate": 1.995639349387984e-06, + "loss": 0.0063, + "num_input_tokens_seen": 33946176, + "step": 160850 + }, + { + "epoch": 17.695819581958197, + "grad_norm": 0.07252439111471176, + "learning_rate": 1.9946998081151303e-06, + "loss": 0.0032, + "num_input_tokens_seen": 33947296, + "step": 160855 + }, + { + "epoch": 17.696369636963695, + "grad_norm": 0.23978286981582642, + "learning_rate": 1.9937604788710902e-06, + "loss": 0.0051, + "num_input_tokens_seen": 33948384, + "step": 160860 + }, + { + "epoch": 17.696919691969196, + "grad_norm": 0.023536745458841324, + "learning_rate": 1.9928213616645237e-06, + "loss": 0.0132, + "num_input_tokens_seen": 33949440, + "step": 160865 + }, + { + "epoch": 17.697469746974697, + "grad_norm": 0.24110034108161926, + "learning_rate": 1.9918824565040768e-06, + "loss": 0.0484, + "num_input_tokens_seen": 33950432, + "step": 160870 + }, + { + "epoch": 17.698019801980198, + "grad_norm": 0.09569164365530014, + "learning_rate": 1.9909437633984146e-06, + "loss": 0.0103, + "num_input_tokens_seen": 33951456, + "step": 160875 + }, + { + "epoch": 17.6985698569857, + "grad_norm": 0.26909708976745605, + "learning_rate": 1.990005282356183e-06, + "loss": 0.0124, + "num_input_tokens_seen": 33952480, + "step": 160880 + }, + { + "epoch": 17.6991199119912, + "grad_norm": 0.055388014763593674, + "learning_rate": 1.989067013386023e-06, + "loss": 0.0784, + "num_input_tokens_seen": 33953568, + "step": 160885 + }, + { + "epoch": 17.6996699669967, + "grad_norm": 0.00965129118412733, + "learning_rate": 1.988128956496599e-06, + "loss": 0.0068, + "num_input_tokens_seen": 33954624, + "step": 160890 + }, + { + "epoch": 17.7002200220022, + "grad_norm": 0.5819938778877258, + "learning_rate": 1.987191111696546e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33955712, + "step": 160895 + }, + { + "epoch": 17.7007700770077, + "grad_norm": 0.0202578604221344, + "learning_rate": 1.98625347899451e-06, + "loss": 0.0068, + "num_input_tokens_seen": 33956800, + "step": 160900 + }, + { + "epoch": 17.7013201320132, + "grad_norm": 0.044636789709329605, + "learning_rate": 1.9853160583991366e-06, + "loss": 0.0493, + "num_input_tokens_seen": 33957792, + "step": 160905 + }, + { + "epoch": 17.701870187018702, + "grad_norm": 0.03548899292945862, + "learning_rate": 1.984378849919055e-06, + "loss": 0.0012, + "num_input_tokens_seen": 33958848, + "step": 160910 + }, + { + "epoch": 17.702420242024203, + "grad_norm": 0.024942317977547646, + "learning_rate": 1.983441853562909e-06, + "loss": 0.0038, + "num_input_tokens_seen": 33959872, + "step": 160915 + }, + { + "epoch": 17.702970297029704, + "grad_norm": 0.005887494422495365, + "learning_rate": 1.982505069339338e-06, + "loss": 0.0011, + "num_input_tokens_seen": 33960928, + "step": 160920 + }, + { + "epoch": 17.703520352035202, + "grad_norm": 0.023200133815407753, + "learning_rate": 1.9815684972569747e-06, + "loss": 0.0036, + "num_input_tokens_seen": 33961984, + "step": 160925 + }, + { + "epoch": 17.704070407040703, + "grad_norm": 0.017940420657396317, + "learning_rate": 1.9806321373244506e-06, + "loss": 0.0592, + "num_input_tokens_seen": 33963008, + "step": 160930 + }, + { + "epoch": 17.704620462046204, + "grad_norm": 0.006660409737378359, + "learning_rate": 1.979695989550387e-06, + "loss": 0.0356, + "num_input_tokens_seen": 33964096, + "step": 160935 + }, + { + "epoch": 17.705170517051705, + "grad_norm": 0.06168946996331215, + "learning_rate": 1.9787600539434264e-06, + "loss": 0.0071, + "num_input_tokens_seen": 33965120, + "step": 160940 + }, + { + "epoch": 17.705720572057206, + "grad_norm": 0.6704016327857971, + "learning_rate": 1.977824330512179e-06, + "loss": 0.0059, + "num_input_tokens_seen": 33966144, + "step": 160945 + }, + { + "epoch": 17.706270627062707, + "grad_norm": 1.063551664352417, + "learning_rate": 1.976888819265285e-06, + "loss": 0.0451, + "num_input_tokens_seen": 33967264, + "step": 160950 + }, + { + "epoch": 17.706820682068205, + "grad_norm": 0.07100848108530045, + "learning_rate": 1.97595352021136e-06, + "loss": 0.0441, + "num_input_tokens_seen": 33968352, + "step": 160955 + }, + { + "epoch": 17.707370737073706, + "grad_norm": 0.011126428842544556, + "learning_rate": 1.9750184333590187e-06, + "loss": 0.0044, + "num_input_tokens_seen": 33969504, + "step": 160960 + }, + { + "epoch": 17.707920792079207, + "grad_norm": 0.014124961569905281, + "learning_rate": 1.974083558716891e-06, + "loss": 0.0368, + "num_input_tokens_seen": 33970560, + "step": 160965 + }, + { + "epoch": 17.70847084708471, + "grad_norm": 0.060927119106054306, + "learning_rate": 1.973148896293578e-06, + "loss": 0.006, + "num_input_tokens_seen": 33971616, + "step": 160970 + }, + { + "epoch": 17.70902090209021, + "grad_norm": 0.013127228245139122, + "learning_rate": 1.972214446097703e-06, + "loss": 0.067, + "num_input_tokens_seen": 33972640, + "step": 160975 + }, + { + "epoch": 17.70957095709571, + "grad_norm": 1.4425324201583862, + "learning_rate": 1.971280208137885e-06, + "loss": 0.0321, + "num_input_tokens_seen": 33973728, + "step": 160980 + }, + { + "epoch": 17.71012101210121, + "grad_norm": 0.006705587729811668, + "learning_rate": 1.9703461824227194e-06, + "loss": 0.0004, + "num_input_tokens_seen": 33974720, + "step": 160985 + }, + { + "epoch": 17.71067106710671, + "grad_norm": 0.011301117949187756, + "learning_rate": 1.969412368960827e-06, + "loss": 0.063, + "num_input_tokens_seen": 33975808, + "step": 160990 + }, + { + "epoch": 17.71122112211221, + "grad_norm": 0.004546893294900656, + "learning_rate": 1.968478767760812e-06, + "loss": 0.0544, + "num_input_tokens_seen": 33976864, + "step": 160995 + }, + { + "epoch": 17.71177117711771, + "grad_norm": 0.014952585101127625, + "learning_rate": 1.9675453788312704e-06, + "loss": 0.0586, + "num_input_tokens_seen": 33977888, + "step": 161000 + }, + { + "epoch": 17.712321232123212, + "grad_norm": 0.01703161932528019, + "learning_rate": 1.966612202180812e-06, + "loss": 0.0034, + "num_input_tokens_seen": 33978880, + "step": 161005 + }, + { + "epoch": 17.712871287128714, + "grad_norm": 0.03944806009531021, + "learning_rate": 1.9656792378180357e-06, + "loss": 0.0026, + "num_input_tokens_seen": 33979936, + "step": 161010 + }, + { + "epoch": 17.713421342134215, + "grad_norm": 0.0017654465045779943, + "learning_rate": 1.964746485751545e-06, + "loss": 0.0884, + "num_input_tokens_seen": 33980960, + "step": 161015 + }, + { + "epoch": 17.713971397139716, + "grad_norm": 2.4532909393310547, + "learning_rate": 1.9638139459899315e-06, + "loss": 0.1793, + "num_input_tokens_seen": 33981920, + "step": 161020 + }, + { + "epoch": 17.714521452145213, + "grad_norm": 0.6567838191986084, + "learning_rate": 1.9628816185417897e-06, + "loss": 0.0066, + "num_input_tokens_seen": 33982944, + "step": 161025 + }, + { + "epoch": 17.715071507150714, + "grad_norm": 0.03280533850193024, + "learning_rate": 1.961949503415711e-06, + "loss": 0.0612, + "num_input_tokens_seen": 33984032, + "step": 161030 + }, + { + "epoch": 17.715621562156215, + "grad_norm": 0.02144532836973667, + "learning_rate": 1.9610176006202905e-06, + "loss": 0.0717, + "num_input_tokens_seen": 33985120, + "step": 161035 + }, + { + "epoch": 17.716171617161717, + "grad_norm": 0.005337979644536972, + "learning_rate": 1.9600859101641194e-06, + "loss": 0.001, + "num_input_tokens_seen": 33986240, + "step": 161040 + }, + { + "epoch": 17.716721672167218, + "grad_norm": 0.33506834506988525, + "learning_rate": 1.959154432055782e-06, + "loss": 0.0236, + "num_input_tokens_seen": 33987264, + "step": 161045 + }, + { + "epoch": 17.71727172717272, + "grad_norm": 1.4449985027313232, + "learning_rate": 1.9582231663038546e-06, + "loss": 0.0298, + "num_input_tokens_seen": 33988320, + "step": 161050 + }, + { + "epoch": 17.717821782178216, + "grad_norm": 0.06124059110879898, + "learning_rate": 1.9572921129169365e-06, + "loss": 0.0067, + "num_input_tokens_seen": 33989376, + "step": 161055 + }, + { + "epoch": 17.718371837183717, + "grad_norm": 0.045552246272563934, + "learning_rate": 1.956361271903592e-06, + "loss": 0.0016, + "num_input_tokens_seen": 33990400, + "step": 161060 + }, + { + "epoch": 17.71892189218922, + "grad_norm": 2.0889737606048584, + "learning_rate": 1.9554306432724103e-06, + "loss": 0.1076, + "num_input_tokens_seen": 33991424, + "step": 161065 + }, + { + "epoch": 17.71947194719472, + "grad_norm": 0.2950782775878906, + "learning_rate": 1.954500227031969e-06, + "loss": 0.0038, + "num_input_tokens_seen": 33992480, + "step": 161070 + }, + { + "epoch": 17.72002200220022, + "grad_norm": 0.036387231200933456, + "learning_rate": 1.95357002319084e-06, + "loss": 0.0099, + "num_input_tokens_seen": 33993568, + "step": 161075 + }, + { + "epoch": 17.72057205720572, + "grad_norm": 0.05133304372429848, + "learning_rate": 1.952640031757599e-06, + "loss": 0.0216, + "num_input_tokens_seen": 33994592, + "step": 161080 + }, + { + "epoch": 17.721122112211223, + "grad_norm": 0.25509241223335266, + "learning_rate": 1.9517102527408097e-06, + "loss": 0.0044, + "num_input_tokens_seen": 33995616, + "step": 161085 + }, + { + "epoch": 17.72167216721672, + "grad_norm": 0.06932073831558228, + "learning_rate": 1.9507806861490507e-06, + "loss": 0.0048, + "num_input_tokens_seen": 33996704, + "step": 161090 + }, + { + "epoch": 17.72222222222222, + "grad_norm": 0.18875332176685333, + "learning_rate": 1.94985133199089e-06, + "loss": 0.08, + "num_input_tokens_seen": 33997824, + "step": 161095 + }, + { + "epoch": 17.722772277227723, + "grad_norm": 2.8135762214660645, + "learning_rate": 1.948922190274882e-06, + "loss": 0.0186, + "num_input_tokens_seen": 33998912, + "step": 161100 + }, + { + "epoch": 17.723322332233224, + "grad_norm": 0.524218738079071, + "learning_rate": 1.9479932610096035e-06, + "loss": 0.0686, + "num_input_tokens_seen": 33999904, + "step": 161105 + }, + { + "epoch": 17.723872387238725, + "grad_norm": 0.03876304626464844, + "learning_rate": 1.9470645442036056e-06, + "loss": 0.0087, + "num_input_tokens_seen": 34000960, + "step": 161110 + }, + { + "epoch": 17.724422442244226, + "grad_norm": 0.11029317229986191, + "learning_rate": 1.9461360398654566e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34002016, + "step": 161115 + }, + { + "epoch": 17.724972497249723, + "grad_norm": 3.7617270946502686, + "learning_rate": 1.945207748003702e-06, + "loss": 0.1409, + "num_input_tokens_seen": 34003040, + "step": 161120 + }, + { + "epoch": 17.725522552255224, + "grad_norm": 2.128265142440796, + "learning_rate": 1.9442796686269083e-06, + "loss": 0.0813, + "num_input_tokens_seen": 34004064, + "step": 161125 + }, + { + "epoch": 17.726072607260726, + "grad_norm": 0.13725963234901428, + "learning_rate": 1.9433518017436287e-06, + "loss": 0.039, + "num_input_tokens_seen": 34005088, + "step": 161130 + }, + { + "epoch": 17.726622662266227, + "grad_norm": 0.01066719088703394, + "learning_rate": 1.9424241473624067e-06, + "loss": 0.0887, + "num_input_tokens_seen": 34006144, + "step": 161135 + }, + { + "epoch": 17.727172717271728, + "grad_norm": 0.02508385106921196, + "learning_rate": 1.941496705491802e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34007168, + "step": 161140 + }, + { + "epoch": 17.72772277227723, + "grad_norm": 0.040243737399578094, + "learning_rate": 1.9405694761403554e-06, + "loss": 0.0037, + "num_input_tokens_seen": 34008256, + "step": 161145 + }, + { + "epoch": 17.72827282728273, + "grad_norm": 0.00605203490704298, + "learning_rate": 1.9396424593166124e-06, + "loss": 0.0063, + "num_input_tokens_seen": 34009248, + "step": 161150 + }, + { + "epoch": 17.728822882288227, + "grad_norm": 2.197195529937744, + "learning_rate": 1.9387156550291247e-06, + "loss": 0.1474, + "num_input_tokens_seen": 34010208, + "step": 161155 + }, + { + "epoch": 17.72937293729373, + "grad_norm": 0.5740019679069519, + "learning_rate": 1.9377890632864243e-06, + "loss": 0.0047, + "num_input_tokens_seen": 34011200, + "step": 161160 + }, + { + "epoch": 17.72992299229923, + "grad_norm": 0.012492462992668152, + "learning_rate": 1.93686268409706e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34012288, + "step": 161165 + }, + { + "epoch": 17.73047304730473, + "grad_norm": 0.02995196171104908, + "learning_rate": 1.9359365174695665e-06, + "loss": 0.0027, + "num_input_tokens_seen": 34013312, + "step": 161170 + }, + { + "epoch": 17.731023102310232, + "grad_norm": 0.019611719995737076, + "learning_rate": 1.9350105634124733e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34014304, + "step": 161175 + }, + { + "epoch": 17.731573157315733, + "grad_norm": 0.6237192749977112, + "learning_rate": 1.9340848219343232e-06, + "loss": 0.0049, + "num_input_tokens_seen": 34015392, + "step": 161180 + }, + { + "epoch": 17.73212321232123, + "grad_norm": 0.11836909502744675, + "learning_rate": 1.933159293043646e-06, + "loss": 0.0242, + "num_input_tokens_seen": 34016448, + "step": 161185 + }, + { + "epoch": 17.73267326732673, + "grad_norm": 1.5592877864837646, + "learning_rate": 1.9322339767489676e-06, + "loss": 0.0333, + "num_input_tokens_seen": 34017472, + "step": 161190 + }, + { + "epoch": 17.733223322332233, + "grad_norm": 0.029304860159754753, + "learning_rate": 1.9313088730588234e-06, + "loss": 0.0325, + "num_input_tokens_seen": 34018528, + "step": 161195 + }, + { + "epoch": 17.733773377337734, + "grad_norm": 0.01583208329975605, + "learning_rate": 1.9303839819817345e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34019552, + "step": 161200 + }, + { + "epoch": 17.734323432343235, + "grad_norm": 0.017892764881253242, + "learning_rate": 1.9294593035262236e-06, + "loss": 0.0098, + "num_input_tokens_seen": 34020576, + "step": 161205 + }, + { + "epoch": 17.734873487348736, + "grad_norm": 7.084002494812012, + "learning_rate": 1.928534837700821e-06, + "loss": 0.1461, + "num_input_tokens_seen": 34021632, + "step": 161210 + }, + { + "epoch": 17.735423542354237, + "grad_norm": 0.6282220482826233, + "learning_rate": 1.927610584514036e-06, + "loss": 0.0089, + "num_input_tokens_seen": 34022688, + "step": 161215 + }, + { + "epoch": 17.735973597359735, + "grad_norm": 0.06490173935890198, + "learning_rate": 1.9266865439744014e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34023808, + "step": 161220 + }, + { + "epoch": 17.736523652365236, + "grad_norm": 1.9762803316116333, + "learning_rate": 1.925762716090418e-06, + "loss": 0.0817, + "num_input_tokens_seen": 34024800, + "step": 161225 + }, + { + "epoch": 17.737073707370737, + "grad_norm": 0.008941998705267906, + "learning_rate": 1.9248391008706124e-06, + "loss": 0.0082, + "num_input_tokens_seen": 34025824, + "step": 161230 + }, + { + "epoch": 17.737623762376238, + "grad_norm": 0.07807379215955734, + "learning_rate": 1.923915698323489e-06, + "loss": 0.0022, + "num_input_tokens_seen": 34026880, + "step": 161235 + }, + { + "epoch": 17.73817381738174, + "grad_norm": 0.005695335566997528, + "learning_rate": 1.922992508457561e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34027968, + "step": 161240 + }, + { + "epoch": 17.73872387238724, + "grad_norm": 0.002105000428855419, + "learning_rate": 1.9220695312813398e-06, + "loss": 0.0499, + "num_input_tokens_seen": 34029056, + "step": 161245 + }, + { + "epoch": 17.739273927392738, + "grad_norm": 0.02071058191359043, + "learning_rate": 1.921146766803328e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34030048, + "step": 161250 + }, + { + "epoch": 17.73982398239824, + "grad_norm": 1.2117747068405151, + "learning_rate": 1.920224215032035e-06, + "loss": 0.0146, + "num_input_tokens_seen": 34031072, + "step": 161255 + }, + { + "epoch": 17.74037403740374, + "grad_norm": 0.6909542083740234, + "learning_rate": 1.9193018759759595e-06, + "loss": 0.0107, + "num_input_tokens_seen": 34032096, + "step": 161260 + }, + { + "epoch": 17.74092409240924, + "grad_norm": 0.014709288254380226, + "learning_rate": 1.918379749643601e-06, + "loss": 0.073, + "num_input_tokens_seen": 34033184, + "step": 161265 + }, + { + "epoch": 17.741474147414742, + "grad_norm": 0.03694021329283714, + "learning_rate": 1.9174578360434653e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34034240, + "step": 161270 + }, + { + "epoch": 17.742024202420243, + "grad_norm": 4.603362083435059, + "learning_rate": 1.9165361351840383e-06, + "loss": 0.0549, + "num_input_tokens_seen": 34035296, + "step": 161275 + }, + { + "epoch": 17.742574257425744, + "grad_norm": 0.21240568161010742, + "learning_rate": 1.9156146470738297e-06, + "loss": 0.0148, + "num_input_tokens_seen": 34036352, + "step": 161280 + }, + { + "epoch": 17.74312431243124, + "grad_norm": 1.91834557056427, + "learning_rate": 1.9146933717213212e-06, + "loss": 0.074, + "num_input_tokens_seen": 34037344, + "step": 161285 + }, + { + "epoch": 17.743674367436743, + "grad_norm": 0.010487465187907219, + "learning_rate": 1.9137723091350006e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34038400, + "step": 161290 + }, + { + "epoch": 17.744224422442244, + "grad_norm": 0.2109614610671997, + "learning_rate": 1.9128514593233672e-06, + "loss": 0.0411, + "num_input_tokens_seen": 34039520, + "step": 161295 + }, + { + "epoch": 17.744774477447745, + "grad_norm": 0.03182126209139824, + "learning_rate": 1.9119308222948997e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34040544, + "step": 161300 + }, + { + "epoch": 17.745324532453246, + "grad_norm": 3.2397286891937256, + "learning_rate": 1.911010398058094e-06, + "loss": 0.0804, + "num_input_tokens_seen": 34041568, + "step": 161305 + }, + { + "epoch": 17.745874587458747, + "grad_norm": 0.02190472185611725, + "learning_rate": 1.9100901866214244e-06, + "loss": 0.0119, + "num_input_tokens_seen": 34042688, + "step": 161310 + }, + { + "epoch": 17.746424642464248, + "grad_norm": 0.025574492290616035, + "learning_rate": 1.90917018799337e-06, + "loss": 0.0072, + "num_input_tokens_seen": 34043776, + "step": 161315 + }, + { + "epoch": 17.746974697469746, + "grad_norm": 0.055358655750751495, + "learning_rate": 1.9082504021824156e-06, + "loss": 0.0954, + "num_input_tokens_seen": 34044832, + "step": 161320 + }, + { + "epoch": 17.747524752475247, + "grad_norm": 0.0049986992962658405, + "learning_rate": 1.907330829197035e-06, + "loss": 0.0045, + "num_input_tokens_seen": 34045920, + "step": 161325 + }, + { + "epoch": 17.748074807480748, + "grad_norm": 0.016773566603660583, + "learning_rate": 1.9064114690457102e-06, + "loss": 0.0083, + "num_input_tokens_seen": 34047072, + "step": 161330 + }, + { + "epoch": 17.74862486248625, + "grad_norm": 0.11388570815324783, + "learning_rate": 1.9054923217369098e-06, + "loss": 0.0131, + "num_input_tokens_seen": 34048192, + "step": 161335 + }, + { + "epoch": 17.74917491749175, + "grad_norm": 2.82953143119812, + "learning_rate": 1.904573387279099e-06, + "loss": 0.0672, + "num_input_tokens_seen": 34049248, + "step": 161340 + }, + { + "epoch": 17.74972497249725, + "grad_norm": 0.010459582321345806, + "learning_rate": 1.9036546656807596e-06, + "loss": 0.0119, + "num_input_tokens_seen": 34050304, + "step": 161345 + }, + { + "epoch": 17.75027502750275, + "grad_norm": 0.02150532230734825, + "learning_rate": 1.9027361569503465e-06, + "loss": 0.0008, + "num_input_tokens_seen": 34051392, + "step": 161350 + }, + { + "epoch": 17.75082508250825, + "grad_norm": 0.3625600337982178, + "learning_rate": 1.9018178610963306e-06, + "loss": 0.0037, + "num_input_tokens_seen": 34052448, + "step": 161355 + }, + { + "epoch": 17.75137513751375, + "grad_norm": 3.022954225540161, + "learning_rate": 1.9008997781271825e-06, + "loss": 0.1386, + "num_input_tokens_seen": 34053504, + "step": 161360 + }, + { + "epoch": 17.751925192519252, + "grad_norm": 0.019649555906653404, + "learning_rate": 1.8999819080513514e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34054560, + "step": 161365 + }, + { + "epoch": 17.752475247524753, + "grad_norm": 3.0113885402679443, + "learning_rate": 1.8990642508773053e-06, + "loss": 0.0261, + "num_input_tokens_seen": 34055584, + "step": 161370 + }, + { + "epoch": 17.753025302530254, + "grad_norm": 0.04702947288751602, + "learning_rate": 1.898146806613496e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34056576, + "step": 161375 + }, + { + "epoch": 17.753575357535752, + "grad_norm": 3.4952774047851562, + "learning_rate": 1.8972295752683833e-06, + "loss": 0.0091, + "num_input_tokens_seen": 34057600, + "step": 161380 + }, + { + "epoch": 17.754125412541253, + "grad_norm": 0.017435774207115173, + "learning_rate": 1.8963125568504242e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34058624, + "step": 161385 + }, + { + "epoch": 17.754675467546754, + "grad_norm": 0.01795259118080139, + "learning_rate": 1.8953957513680621e-06, + "loss": 0.0746, + "num_input_tokens_seen": 34059744, + "step": 161390 + }, + { + "epoch": 17.755225522552255, + "grad_norm": 0.022563010454177856, + "learning_rate": 1.8944791588297512e-06, + "loss": 0.0007, + "num_input_tokens_seen": 34060928, + "step": 161395 + }, + { + "epoch": 17.755775577557756, + "grad_norm": 0.4274735450744629, + "learning_rate": 1.893562779243943e-06, + "loss": 0.0059, + "num_input_tokens_seen": 34061952, + "step": 161400 + }, + { + "epoch": 17.756325632563257, + "grad_norm": 0.04365288466215134, + "learning_rate": 1.8926466126190729e-06, + "loss": 0.0032, + "num_input_tokens_seen": 34062976, + "step": 161405 + }, + { + "epoch": 17.75687568756876, + "grad_norm": 1.507927656173706, + "learning_rate": 1.891730658963592e-06, + "loss": 0.0181, + "num_input_tokens_seen": 34064032, + "step": 161410 + }, + { + "epoch": 17.757425742574256, + "grad_norm": 0.1350901573896408, + "learning_rate": 1.8908149182859408e-06, + "loss": 0.0847, + "num_input_tokens_seen": 34065088, + "step": 161415 + }, + { + "epoch": 17.757975797579757, + "grad_norm": 0.02172262780368328, + "learning_rate": 1.8898993905945627e-06, + "loss": 0.0058, + "num_input_tokens_seen": 34066144, + "step": 161420 + }, + { + "epoch": 17.758525852585258, + "grad_norm": 0.003824487328529358, + "learning_rate": 1.8889840758978928e-06, + "loss": 0.001, + "num_input_tokens_seen": 34067104, + "step": 161425 + }, + { + "epoch": 17.75907590759076, + "grad_norm": 0.006405053194612265, + "learning_rate": 1.8880689742043656e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34068160, + "step": 161430 + }, + { + "epoch": 17.75962596259626, + "grad_norm": 0.06351952999830246, + "learning_rate": 1.8871540855224134e-06, + "loss": 0.0093, + "num_input_tokens_seen": 34069248, + "step": 161435 + }, + { + "epoch": 17.76017601760176, + "grad_norm": 1.888192892074585, + "learning_rate": 1.8862394098604713e-06, + "loss": 0.0365, + "num_input_tokens_seen": 34070272, + "step": 161440 + }, + { + "epoch": 17.760726072607262, + "grad_norm": 0.0026931038592010736, + "learning_rate": 1.8853249472269768e-06, + "loss": 0.0003, + "num_input_tokens_seen": 34071328, + "step": 161445 + }, + { + "epoch": 17.76127612761276, + "grad_norm": 2.9883830547332764, + "learning_rate": 1.8844106976303482e-06, + "loss": 0.107, + "num_input_tokens_seen": 34072416, + "step": 161450 + }, + { + "epoch": 17.76182618261826, + "grad_norm": 0.1268669217824936, + "learning_rate": 1.8834966610790094e-06, + "loss": 0.0027, + "num_input_tokens_seen": 34073440, + "step": 161455 + }, + { + "epoch": 17.762376237623762, + "grad_norm": 0.08275715261697769, + "learning_rate": 1.8825828375813952e-06, + "loss": 0.0064, + "num_input_tokens_seen": 34074400, + "step": 161460 + }, + { + "epoch": 17.762926292629263, + "grad_norm": 2.5129036903381348, + "learning_rate": 1.8816692271459186e-06, + "loss": 0.1466, + "num_input_tokens_seen": 34075488, + "step": 161465 + }, + { + "epoch": 17.763476347634764, + "grad_norm": 0.29642900824546814, + "learning_rate": 1.880755829781003e-06, + "loss": 0.0042, + "num_input_tokens_seen": 34076544, + "step": 161470 + }, + { + "epoch": 17.764026402640265, + "grad_norm": 1.6246589422225952, + "learning_rate": 1.8798426454950696e-06, + "loss": 0.0223, + "num_input_tokens_seen": 34077600, + "step": 161475 + }, + { + "epoch": 17.764576457645763, + "grad_norm": 0.05830301716923714, + "learning_rate": 1.8789296742965284e-06, + "loss": 0.0879, + "num_input_tokens_seen": 34078688, + "step": 161480 + }, + { + "epoch": 17.765126512651264, + "grad_norm": 0.12157581746578217, + "learning_rate": 1.8780169161938033e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34079776, + "step": 161485 + }, + { + "epoch": 17.765676567656765, + "grad_norm": 1.7516858577728271, + "learning_rate": 1.8771043711952986e-06, + "loss": 0.0474, + "num_input_tokens_seen": 34080800, + "step": 161490 + }, + { + "epoch": 17.766226622662266, + "grad_norm": 0.37733110785484314, + "learning_rate": 1.8761920393094268e-06, + "loss": 0.0436, + "num_input_tokens_seen": 34081792, + "step": 161495 + }, + { + "epoch": 17.766776677667767, + "grad_norm": 0.01828617788851261, + "learning_rate": 1.8752799205445982e-06, + "loss": 0.0103, + "num_input_tokens_seen": 34082816, + "step": 161500 + }, + { + "epoch": 17.76732673267327, + "grad_norm": 0.02835262008011341, + "learning_rate": 1.8743680149092168e-06, + "loss": 0.1009, + "num_input_tokens_seen": 34083904, + "step": 161505 + }, + { + "epoch": 17.76787678767877, + "grad_norm": 0.011716763488948345, + "learning_rate": 1.873456322411693e-06, + "loss": 0.0008, + "num_input_tokens_seen": 34084960, + "step": 161510 + }, + { + "epoch": 17.768426842684267, + "grad_norm": 0.036272935569286346, + "learning_rate": 1.8725448430604197e-06, + "loss": 0.1115, + "num_input_tokens_seen": 34086048, + "step": 161515 + }, + { + "epoch": 17.768976897689768, + "grad_norm": 2.1654751300811768, + "learning_rate": 1.8716335768638072e-06, + "loss": 0.0453, + "num_input_tokens_seen": 34087104, + "step": 161520 + }, + { + "epoch": 17.76952695269527, + "grad_norm": 0.015570742078125477, + "learning_rate": 1.8707225238302456e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34088160, + "step": 161525 + }, + { + "epoch": 17.77007700770077, + "grad_norm": 0.0940466970205307, + "learning_rate": 1.8698116839681369e-06, + "loss": 0.0044, + "num_input_tokens_seen": 34089248, + "step": 161530 + }, + { + "epoch": 17.77062706270627, + "grad_norm": 2.035353660583496, + "learning_rate": 1.8689010572858795e-06, + "loss": 0.0316, + "num_input_tokens_seen": 34090304, + "step": 161535 + }, + { + "epoch": 17.771177117711773, + "grad_norm": 0.02734760195016861, + "learning_rate": 1.867990643791856e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34091360, + "step": 161540 + }, + { + "epoch": 17.77172717271727, + "grad_norm": 0.0948028489947319, + "learning_rate": 1.8670804434944677e-06, + "loss": 0.0055, + "num_input_tokens_seen": 34092384, + "step": 161545 + }, + { + "epoch": 17.77227722772277, + "grad_norm": 0.15818527340888977, + "learning_rate": 1.8661704564020999e-06, + "loss": 0.0887, + "num_input_tokens_seen": 34093440, + "step": 161550 + }, + { + "epoch": 17.772827282728272, + "grad_norm": 0.032145217061042786, + "learning_rate": 1.8652606825231289e-06, + "loss": 0.0433, + "num_input_tokens_seen": 34094496, + "step": 161555 + }, + { + "epoch": 17.773377337733773, + "grad_norm": 0.0060455044731497765, + "learning_rate": 1.8643511218659564e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34095552, + "step": 161560 + }, + { + "epoch": 17.773927392739274, + "grad_norm": 0.34894031286239624, + "learning_rate": 1.8634417744389593e-06, + "loss": 0.0433, + "num_input_tokens_seen": 34096576, + "step": 161565 + }, + { + "epoch": 17.774477447744776, + "grad_norm": 0.0015110510867089033, + "learning_rate": 1.8625326402505138e-06, + "loss": 0.0119, + "num_input_tokens_seen": 34097568, + "step": 161570 + }, + { + "epoch": 17.775027502750277, + "grad_norm": 0.042044054716825485, + "learning_rate": 1.8616237193090053e-06, + "loss": 0.0921, + "num_input_tokens_seen": 34098656, + "step": 161575 + }, + { + "epoch": 17.775577557755774, + "grad_norm": 2.4149158000946045, + "learning_rate": 1.8607150116228044e-06, + "loss": 0.2096, + "num_input_tokens_seen": 34099744, + "step": 161580 + }, + { + "epoch": 17.776127612761275, + "grad_norm": 2.320736885070801, + "learning_rate": 1.8598065172002909e-06, + "loss": 0.0751, + "num_input_tokens_seen": 34100832, + "step": 161585 + }, + { + "epoch": 17.776677667766776, + "grad_norm": 0.12753936648368835, + "learning_rate": 1.8588982360498386e-06, + "loss": 0.0036, + "num_input_tokens_seen": 34101856, + "step": 161590 + }, + { + "epoch": 17.777227722772277, + "grad_norm": 0.11995001882314682, + "learning_rate": 1.8579901681798157e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34102880, + "step": 161595 + }, + { + "epoch": 17.77777777777778, + "grad_norm": 0.07314096391201019, + "learning_rate": 1.857082313598596e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34103904, + "step": 161600 + }, + { + "epoch": 17.77832783278328, + "grad_norm": 0.03188749775290489, + "learning_rate": 1.8561746723145395e-06, + "loss": 0.0979, + "num_input_tokens_seen": 34104960, + "step": 161605 + }, + { + "epoch": 17.778877887788777, + "grad_norm": 0.04802221432328224, + "learning_rate": 1.8552672443360148e-06, + "loss": 0.002, + "num_input_tokens_seen": 34106048, + "step": 161610 + }, + { + "epoch": 17.77942794279428, + "grad_norm": 0.8860093355178833, + "learning_rate": 1.8543600296713898e-06, + "loss": 0.0077, + "num_input_tokens_seen": 34107104, + "step": 161615 + }, + { + "epoch": 17.77997799779978, + "grad_norm": 0.13200537860393524, + "learning_rate": 1.8534530283290191e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34108160, + "step": 161620 + }, + { + "epoch": 17.78052805280528, + "grad_norm": 0.015393865294754505, + "learning_rate": 1.8525462403172683e-06, + "loss": 0.0462, + "num_input_tokens_seen": 34109312, + "step": 161625 + }, + { + "epoch": 17.78107810781078, + "grad_norm": 0.38421475887298584, + "learning_rate": 1.851639665644489e-06, + "loss": 0.0387, + "num_input_tokens_seen": 34110464, + "step": 161630 + }, + { + "epoch": 17.781628162816283, + "grad_norm": 0.020953256636857986, + "learning_rate": 1.8507333043190411e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34111552, + "step": 161635 + }, + { + "epoch": 17.782178217821784, + "grad_norm": 0.03401060029864311, + "learning_rate": 1.8498271563492737e-06, + "loss": 0.0635, + "num_input_tokens_seen": 34112544, + "step": 161640 + }, + { + "epoch": 17.78272827282728, + "grad_norm": 0.44811365008354187, + "learning_rate": 1.848921221743541e-06, + "loss": 0.0053, + "num_input_tokens_seen": 34113600, + "step": 161645 + }, + { + "epoch": 17.783278327832782, + "grad_norm": 0.020812032744288445, + "learning_rate": 1.8480155005101947e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34114624, + "step": 161650 + }, + { + "epoch": 17.783828382838283, + "grad_norm": 0.11880118399858475, + "learning_rate": 1.847109992657578e-06, + "loss": 0.0067, + "num_input_tokens_seen": 34115648, + "step": 161655 + }, + { + "epoch": 17.784378437843785, + "grad_norm": 0.017435524612665176, + "learning_rate": 1.8462046981940429e-06, + "loss": 0.0044, + "num_input_tokens_seen": 34116736, + "step": 161660 + }, + { + "epoch": 17.784928492849286, + "grad_norm": 0.054749563336372375, + "learning_rate": 1.8452996171279269e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34117824, + "step": 161665 + }, + { + "epoch": 17.785478547854787, + "grad_norm": 0.03672659769654274, + "learning_rate": 1.8443947494675679e-06, + "loss": 0.1353, + "num_input_tokens_seen": 34118848, + "step": 161670 + }, + { + "epoch": 17.786028602860284, + "grad_norm": 0.012904131785035133, + "learning_rate": 1.8434900952213174e-06, + "loss": 0.0007, + "num_input_tokens_seen": 34119872, + "step": 161675 + }, + { + "epoch": 17.786578657865785, + "grad_norm": 0.09404578059911728, + "learning_rate": 1.8425856543975051e-06, + "loss": 0.0442, + "num_input_tokens_seen": 34120896, + "step": 161680 + }, + { + "epoch": 17.787128712871286, + "grad_norm": 0.021603407338261604, + "learning_rate": 1.8416814270044712e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34121920, + "step": 161685 + }, + { + "epoch": 17.787678767876788, + "grad_norm": 0.004870448261499405, + "learning_rate": 1.8407774130505483e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34122912, + "step": 161690 + }, + { + "epoch": 17.78822882288229, + "grad_norm": 0.13103504478931427, + "learning_rate": 1.8398736125440602e-06, + "loss": 0.0563, + "num_input_tokens_seen": 34124000, + "step": 161695 + }, + { + "epoch": 17.78877887788779, + "grad_norm": 0.006737115327268839, + "learning_rate": 1.8389700254933472e-06, + "loss": 0.002, + "num_input_tokens_seen": 34125056, + "step": 161700 + }, + { + "epoch": 17.78932893289329, + "grad_norm": 0.10940132290124893, + "learning_rate": 1.8380666519067308e-06, + "loss": 0.0096, + "num_input_tokens_seen": 34126144, + "step": 161705 + }, + { + "epoch": 17.78987898789879, + "grad_norm": 0.013119077309966087, + "learning_rate": 1.8371634917925456e-06, + "loss": 0.1128, + "num_input_tokens_seen": 34127200, + "step": 161710 + }, + { + "epoch": 17.79042904290429, + "grad_norm": 0.7583354115486145, + "learning_rate": 1.8362605451591103e-06, + "loss": 0.0053, + "num_input_tokens_seen": 34128288, + "step": 161715 + }, + { + "epoch": 17.79097909790979, + "grad_norm": 0.05276387929916382, + "learning_rate": 1.835357812014743e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34129376, + "step": 161720 + }, + { + "epoch": 17.79152915291529, + "grad_norm": 2.962211847305298, + "learning_rate": 1.8344552923677705e-06, + "loss": 0.0175, + "num_input_tokens_seen": 34130432, + "step": 161725 + }, + { + "epoch": 17.792079207920793, + "grad_norm": 0.34155362844467163, + "learning_rate": 1.8335529862264973e-06, + "loss": 0.0149, + "num_input_tokens_seen": 34131520, + "step": 161730 + }, + { + "epoch": 17.792629262926294, + "grad_norm": 0.30879393219947815, + "learning_rate": 1.8326508935992609e-06, + "loss": 0.005, + "num_input_tokens_seen": 34132576, + "step": 161735 + }, + { + "epoch": 17.793179317931795, + "grad_norm": 2.821711301803589, + "learning_rate": 1.8317490144943633e-06, + "loss": 0.1004, + "num_input_tokens_seen": 34133632, + "step": 161740 + }, + { + "epoch": 17.793729372937293, + "grad_norm": 0.008190367370843887, + "learning_rate": 1.8308473489201144e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34134720, + "step": 161745 + }, + { + "epoch": 17.794279427942794, + "grad_norm": 0.03144611790776253, + "learning_rate": 1.8299458968848327e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34135744, + "step": 161750 + }, + { + "epoch": 17.794829482948295, + "grad_norm": 0.21654923260211945, + "learning_rate": 1.829044658396817e-06, + "loss": 0.0041, + "num_input_tokens_seen": 34136896, + "step": 161755 + }, + { + "epoch": 17.795379537953796, + "grad_norm": 0.009296517819166183, + "learning_rate": 1.828143633464377e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34137952, + "step": 161760 + }, + { + "epoch": 17.795929592959297, + "grad_norm": 0.011205929331481457, + "learning_rate": 1.827242822095823e-06, + "loss": 0.0752, + "num_input_tokens_seen": 34139008, + "step": 161765 + }, + { + "epoch": 17.796479647964798, + "grad_norm": 0.04741840064525604, + "learning_rate": 1.8263422242994461e-06, + "loss": 0.0422, + "num_input_tokens_seen": 34140032, + "step": 161770 + }, + { + "epoch": 17.797029702970296, + "grad_norm": 0.03492430970072746, + "learning_rate": 1.825441840083561e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34141152, + "step": 161775 + }, + { + "epoch": 17.797579757975797, + "grad_norm": 0.3210889399051666, + "learning_rate": 1.824541669456456e-06, + "loss": 0.0036, + "num_input_tokens_seen": 34142176, + "step": 161780 + }, + { + "epoch": 17.798129812981298, + "grad_norm": 0.011112921871244907, + "learning_rate": 1.8236417124264272e-06, + "loss": 0.0597, + "num_input_tokens_seen": 34143296, + "step": 161785 + }, + { + "epoch": 17.7986798679868, + "grad_norm": 0.30273616313934326, + "learning_rate": 1.8227419690017705e-06, + "loss": 0.0066, + "num_input_tokens_seen": 34144352, + "step": 161790 + }, + { + "epoch": 17.7992299229923, + "grad_norm": 0.007592504844069481, + "learning_rate": 1.8218424391907795e-06, + "loss": 0.0154, + "num_input_tokens_seen": 34145408, + "step": 161795 + }, + { + "epoch": 17.7997799779978, + "grad_norm": 0.003437979845330119, + "learning_rate": 1.8209431230017477e-06, + "loss": 0.0027, + "num_input_tokens_seen": 34146464, + "step": 161800 + }, + { + "epoch": 17.8003300330033, + "grad_norm": 0.0035017835907638073, + "learning_rate": 1.8200440204429626e-06, + "loss": 0.0043, + "num_input_tokens_seen": 34147520, + "step": 161805 + }, + { + "epoch": 17.8008800880088, + "grad_norm": 0.006110135465860367, + "learning_rate": 1.8191451315227066e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34148544, + "step": 161810 + }, + { + "epoch": 17.8014301430143, + "grad_norm": 0.031006643548607826, + "learning_rate": 1.8182464562492619e-06, + "loss": 0.0007, + "num_input_tokens_seen": 34149600, + "step": 161815 + }, + { + "epoch": 17.801980198019802, + "grad_norm": 0.0035004375968128443, + "learning_rate": 1.817347994630919e-06, + "loss": 0.0323, + "num_input_tokens_seen": 34150656, + "step": 161820 + }, + { + "epoch": 17.802530253025303, + "grad_norm": 0.12693580985069275, + "learning_rate": 1.8164497466759606e-06, + "loss": 0.0255, + "num_input_tokens_seen": 34151744, + "step": 161825 + }, + { + "epoch": 17.803080308030804, + "grad_norm": 0.005826445762068033, + "learning_rate": 1.8155517123926574e-06, + "loss": 0.0056, + "num_input_tokens_seen": 34152768, + "step": 161830 + }, + { + "epoch": 17.803630363036305, + "grad_norm": 0.06795299798250198, + "learning_rate": 1.8146538917892864e-06, + "loss": 0.0264, + "num_input_tokens_seen": 34153760, + "step": 161835 + }, + { + "epoch": 17.804180418041803, + "grad_norm": 0.004458845127373934, + "learning_rate": 1.8137562848741297e-06, + "loss": 0.0375, + "num_input_tokens_seen": 34154816, + "step": 161840 + }, + { + "epoch": 17.804730473047304, + "grad_norm": 0.011569458059966564, + "learning_rate": 1.81285889165545e-06, + "loss": 0.0785, + "num_input_tokens_seen": 34155840, + "step": 161845 + }, + { + "epoch": 17.805280528052805, + "grad_norm": 0.05227983742952347, + "learning_rate": 1.8119617121415244e-06, + "loss": 0.0049, + "num_input_tokens_seen": 34156928, + "step": 161850 + }, + { + "epoch": 17.805830583058306, + "grad_norm": 0.03588598594069481, + "learning_rate": 1.8110647463406266e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34157984, + "step": 161855 + }, + { + "epoch": 17.806380638063807, + "grad_norm": 0.0379270538687706, + "learning_rate": 1.810167994261011e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34159072, + "step": 161860 + }, + { + "epoch": 17.806930693069308, + "grad_norm": 0.02206970565021038, + "learning_rate": 1.8092714559109576e-06, + "loss": 0.0296, + "num_input_tokens_seen": 34160096, + "step": 161865 + }, + { + "epoch": 17.80748074807481, + "grad_norm": 0.7432577610015869, + "learning_rate": 1.808375131298712e-06, + "loss": 0.0056, + "num_input_tokens_seen": 34161152, + "step": 161870 + }, + { + "epoch": 17.808030803080307, + "grad_norm": 0.38601842522621155, + "learning_rate": 1.8074790204325481e-06, + "loss": 0.0054, + "num_input_tokens_seen": 34162176, + "step": 161875 + }, + { + "epoch": 17.808580858085808, + "grad_norm": 0.024144791066646576, + "learning_rate": 1.8065831233207236e-06, + "loss": 0.0088, + "num_input_tokens_seen": 34163232, + "step": 161880 + }, + { + "epoch": 17.80913091309131, + "grad_norm": 0.20335930585861206, + "learning_rate": 1.8056874399714902e-06, + "loss": 0.0102, + "num_input_tokens_seen": 34164320, + "step": 161885 + }, + { + "epoch": 17.80968096809681, + "grad_norm": 0.06772469729185104, + "learning_rate": 1.8047919703931077e-06, + "loss": 0.007, + "num_input_tokens_seen": 34165440, + "step": 161890 + }, + { + "epoch": 17.81023102310231, + "grad_norm": 0.011209139600396156, + "learning_rate": 1.803896714593825e-06, + "loss": 0.0596, + "num_input_tokens_seen": 34166528, + "step": 161895 + }, + { + "epoch": 17.810781078107812, + "grad_norm": 0.025005917996168137, + "learning_rate": 1.8030016725818998e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34167552, + "step": 161900 + }, + { + "epoch": 17.81133113311331, + "grad_norm": 0.007974853739142418, + "learning_rate": 1.8021068443655753e-06, + "loss": 0.0227, + "num_input_tokens_seen": 34168544, + "step": 161905 + }, + { + "epoch": 17.81188118811881, + "grad_norm": 0.2395184487104416, + "learning_rate": 1.8012122299530976e-06, + "loss": 0.0256, + "num_input_tokens_seen": 34169632, + "step": 161910 + }, + { + "epoch": 17.812431243124312, + "grad_norm": 0.010558201000094414, + "learning_rate": 1.8003178293527212e-06, + "loss": 0.0279, + "num_input_tokens_seen": 34170688, + "step": 161915 + }, + { + "epoch": 17.812981298129813, + "grad_norm": 0.08507677167654037, + "learning_rate": 1.7994236425726757e-06, + "loss": 0.0119, + "num_input_tokens_seen": 34171776, + "step": 161920 + }, + { + "epoch": 17.813531353135314, + "grad_norm": 0.0060233669355511665, + "learning_rate": 1.7985296696212156e-06, + "loss": 0.0091, + "num_input_tokens_seen": 34172896, + "step": 161925 + }, + { + "epoch": 17.814081408140815, + "grad_norm": 0.13105669617652893, + "learning_rate": 1.7976359105065704e-06, + "loss": 0.0045, + "num_input_tokens_seen": 34174048, + "step": 161930 + }, + { + "epoch": 17.814631463146316, + "grad_norm": 0.05563744902610779, + "learning_rate": 1.7967423652369837e-06, + "loss": 0.0095, + "num_input_tokens_seen": 34175136, + "step": 161935 + }, + { + "epoch": 17.815181518151814, + "grad_norm": 0.04512513428926468, + "learning_rate": 1.7958490338206901e-06, + "loss": 0.1222, + "num_input_tokens_seen": 34176128, + "step": 161940 + }, + { + "epoch": 17.815731573157315, + "grad_norm": 0.6151639819145203, + "learning_rate": 1.7949559162659168e-06, + "loss": 0.0139, + "num_input_tokens_seen": 34177184, + "step": 161945 + }, + { + "epoch": 17.816281628162816, + "grad_norm": 0.028585780411958694, + "learning_rate": 1.7940630125809043e-06, + "loss": 0.0441, + "num_input_tokens_seen": 34178272, + "step": 161950 + }, + { + "epoch": 17.816831683168317, + "grad_norm": 0.006127891596406698, + "learning_rate": 1.7931703227738793e-06, + "loss": 0.0462, + "num_input_tokens_seen": 34179328, + "step": 161955 + }, + { + "epoch": 17.817381738173818, + "grad_norm": 0.07385256886482239, + "learning_rate": 1.7922778468530633e-06, + "loss": 0.0014, + "num_input_tokens_seen": 34180384, + "step": 161960 + }, + { + "epoch": 17.81793179317932, + "grad_norm": 0.02728770300745964, + "learning_rate": 1.7913855848266852e-06, + "loss": 0.0119, + "num_input_tokens_seen": 34181472, + "step": 161965 + }, + { + "epoch": 17.818481848184817, + "grad_norm": 0.7106223702430725, + "learning_rate": 1.7904935367029752e-06, + "loss": 0.0227, + "num_input_tokens_seen": 34182560, + "step": 161970 + }, + { + "epoch": 17.819031903190318, + "grad_norm": 0.010698674246668816, + "learning_rate": 1.7896017024901428e-06, + "loss": 0.0215, + "num_input_tokens_seen": 34183680, + "step": 161975 + }, + { + "epoch": 17.81958195819582, + "grad_norm": 0.08865123987197876, + "learning_rate": 1.7887100821964209e-06, + "loss": 0.0428, + "num_input_tokens_seen": 34184768, + "step": 161980 + }, + { + "epoch": 17.82013201320132, + "grad_norm": 0.008454951457679272, + "learning_rate": 1.7878186758300137e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34185824, + "step": 161985 + }, + { + "epoch": 17.82068206820682, + "grad_norm": 0.022593718022108078, + "learning_rate": 1.786927483399145e-06, + "loss": 0.0572, + "num_input_tokens_seen": 34186912, + "step": 161990 + }, + { + "epoch": 17.821232123212322, + "grad_norm": 0.045551225543022156, + "learning_rate": 1.7860365049120309e-06, + "loss": 0.0409, + "num_input_tokens_seen": 34188096, + "step": 161995 + }, + { + "epoch": 17.821782178217823, + "grad_norm": 3.5919201374053955, + "learning_rate": 1.7851457403768756e-06, + "loss": 0.1632, + "num_input_tokens_seen": 34189184, + "step": 162000 + }, + { + "epoch": 17.82233223322332, + "grad_norm": 0.023004349321126938, + "learning_rate": 1.784255189801895e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34190304, + "step": 162005 + }, + { + "epoch": 17.822882288228822, + "grad_norm": 0.013403677381575108, + "learning_rate": 1.7833648531952907e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34191360, + "step": 162010 + }, + { + "epoch": 17.823432343234323, + "grad_norm": 0.030315492302179337, + "learning_rate": 1.7824747305652783e-06, + "loss": 0.068, + "num_input_tokens_seen": 34192384, + "step": 162015 + }, + { + "epoch": 17.823982398239824, + "grad_norm": 0.006262298673391342, + "learning_rate": 1.7815848219200488e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34193408, + "step": 162020 + }, + { + "epoch": 17.824532453245325, + "grad_norm": 0.04620454087853432, + "learning_rate": 1.7806951272678119e-06, + "loss": 0.0114, + "num_input_tokens_seen": 34194464, + "step": 162025 + }, + { + "epoch": 17.825082508250826, + "grad_norm": 0.01729518733918667, + "learning_rate": 1.7798056466167696e-06, + "loss": 0.1364, + "num_input_tokens_seen": 34195552, + "step": 162030 + }, + { + "epoch": 17.825632563256324, + "grad_norm": 0.04050806909799576, + "learning_rate": 1.7789163799751124e-06, + "loss": 0.1618, + "num_input_tokens_seen": 34196608, + "step": 162035 + }, + { + "epoch": 17.826182618261825, + "grad_norm": 2.022118330001831, + "learning_rate": 1.778027327351045e-06, + "loss": 0.0797, + "num_input_tokens_seen": 34197792, + "step": 162040 + }, + { + "epoch": 17.826732673267326, + "grad_norm": 0.009427502751350403, + "learning_rate": 1.7771384887527526e-06, + "loss": 0.0115, + "num_input_tokens_seen": 34198816, + "step": 162045 + }, + { + "epoch": 17.827282728272827, + "grad_norm": 0.6136928200721741, + "learning_rate": 1.7762498641884312e-06, + "loss": 0.0038, + "num_input_tokens_seen": 34199904, + "step": 162050 + }, + { + "epoch": 17.82783278327833, + "grad_norm": 0.011371553875505924, + "learning_rate": 1.7753614536662772e-06, + "loss": 0.0053, + "num_input_tokens_seen": 34200896, + "step": 162055 + }, + { + "epoch": 17.82838283828383, + "grad_norm": 0.023725952953100204, + "learning_rate": 1.7744732571944644e-06, + "loss": 0.0922, + "num_input_tokens_seen": 34201984, + "step": 162060 + }, + { + "epoch": 17.82893289328933, + "grad_norm": 1.7996631860733032, + "learning_rate": 1.7735852747811948e-06, + "loss": 0.0958, + "num_input_tokens_seen": 34203040, + "step": 162065 + }, + { + "epoch": 17.829482948294828, + "grad_norm": 0.008050881326198578, + "learning_rate": 1.7726975064346423e-06, + "loss": 0.0678, + "num_input_tokens_seen": 34204096, + "step": 162070 + }, + { + "epoch": 17.83003300330033, + "grad_norm": 0.31688255071640015, + "learning_rate": 1.7718099521629865e-06, + "loss": 0.0055, + "num_input_tokens_seen": 34205216, + "step": 162075 + }, + { + "epoch": 17.83058305830583, + "grad_norm": 0.10551699250936508, + "learning_rate": 1.7709226119744155e-06, + "loss": 0.0112, + "num_input_tokens_seen": 34206208, + "step": 162080 + }, + { + "epoch": 17.83113311331133, + "grad_norm": 1.9650079011917114, + "learning_rate": 1.7700354858771001e-06, + "loss": 0.0738, + "num_input_tokens_seen": 34207296, + "step": 162085 + }, + { + "epoch": 17.831683168316832, + "grad_norm": 0.00692647323012352, + "learning_rate": 1.7691485738792285e-06, + "loss": 0.0045, + "num_input_tokens_seen": 34208320, + "step": 162090 + }, + { + "epoch": 17.832233223322334, + "grad_norm": 0.0419023372232914, + "learning_rate": 1.7682618759889664e-06, + "loss": 0.0089, + "num_input_tokens_seen": 34209408, + "step": 162095 + }, + { + "epoch": 17.83278327832783, + "grad_norm": 0.00743825314566493, + "learning_rate": 1.767375392214482e-06, + "loss": 0.0006, + "num_input_tokens_seen": 34210400, + "step": 162100 + }, + { + "epoch": 17.833333333333332, + "grad_norm": 0.5936022996902466, + "learning_rate": 1.7664891225639495e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34211456, + "step": 162105 + }, + { + "epoch": 17.833883388338833, + "grad_norm": 0.06450346112251282, + "learning_rate": 1.7656030670455404e-06, + "loss": 0.0437, + "num_input_tokens_seen": 34212480, + "step": 162110 + }, + { + "epoch": 17.834433443344334, + "grad_norm": 0.014841875061392784, + "learning_rate": 1.7647172256674227e-06, + "loss": 0.0465, + "num_input_tokens_seen": 34213568, + "step": 162115 + }, + { + "epoch": 17.834983498349835, + "grad_norm": 0.36493387818336487, + "learning_rate": 1.7638315984377568e-06, + "loss": 0.0061, + "num_input_tokens_seen": 34214560, + "step": 162120 + }, + { + "epoch": 17.835533553355337, + "grad_norm": 0.07018500566482544, + "learning_rate": 1.7629461853647e-06, + "loss": 0.0059, + "num_input_tokens_seen": 34215616, + "step": 162125 + }, + { + "epoch": 17.836083608360838, + "grad_norm": 2.099379301071167, + "learning_rate": 1.7620609864564237e-06, + "loss": 0.0651, + "num_input_tokens_seen": 34216640, + "step": 162130 + }, + { + "epoch": 17.836633663366335, + "grad_norm": 0.2860388457775116, + "learning_rate": 1.7611760017210767e-06, + "loss": 0.0575, + "num_input_tokens_seen": 34217696, + "step": 162135 + }, + { + "epoch": 17.837183718371836, + "grad_norm": 0.35545387864112854, + "learning_rate": 1.760291231166819e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34218752, + "step": 162140 + }, + { + "epoch": 17.837733773377337, + "grad_norm": 0.019669009372591972, + "learning_rate": 1.7594066748018085e-06, + "loss": 0.0127, + "num_input_tokens_seen": 34219808, + "step": 162145 + }, + { + "epoch": 17.83828382838284, + "grad_norm": 0.03343212231993675, + "learning_rate": 1.758522332634191e-06, + "loss": 0.0009, + "num_input_tokens_seen": 34220864, + "step": 162150 + }, + { + "epoch": 17.83883388338834, + "grad_norm": 0.012260637246072292, + "learning_rate": 1.757638204672124e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34221888, + "step": 162155 + }, + { + "epoch": 17.83938393839384, + "grad_norm": 0.07463577389717102, + "learning_rate": 1.756754290923751e-06, + "loss": 0.0472, + "num_input_tokens_seen": 34223008, + "step": 162160 + }, + { + "epoch": 17.83993399339934, + "grad_norm": 0.11683574318885803, + "learning_rate": 1.7558705913972211e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34224000, + "step": 162165 + }, + { + "epoch": 17.84048404840484, + "grad_norm": 0.15087734162807465, + "learning_rate": 1.7549871061006806e-06, + "loss": 0.0748, + "num_input_tokens_seen": 34225056, + "step": 162170 + }, + { + "epoch": 17.84103410341034, + "grad_norm": 2.641690492630005, + "learning_rate": 1.7541038350422645e-06, + "loss": 0.0355, + "num_input_tokens_seen": 34226080, + "step": 162175 + }, + { + "epoch": 17.84158415841584, + "grad_norm": 0.395844042301178, + "learning_rate": 1.7532207782301247e-06, + "loss": 0.0073, + "num_input_tokens_seen": 34227168, + "step": 162180 + }, + { + "epoch": 17.842134213421343, + "grad_norm": 0.07189520448446274, + "learning_rate": 1.7523379356723934e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34228160, + "step": 162185 + }, + { + "epoch": 17.842684268426844, + "grad_norm": 2.91760516166687, + "learning_rate": 1.751455307377206e-06, + "loss": 0.0096, + "num_input_tokens_seen": 34229216, + "step": 162190 + }, + { + "epoch": 17.843234323432345, + "grad_norm": 1.4206242561340332, + "learning_rate": 1.7505728933526978e-06, + "loss": 0.0905, + "num_input_tokens_seen": 34230304, + "step": 162195 + }, + { + "epoch": 17.843784378437842, + "grad_norm": 0.07462162524461746, + "learning_rate": 1.7496906936070035e-06, + "loss": 0.0368, + "num_input_tokens_seen": 34231360, + "step": 162200 + }, + { + "epoch": 17.844334433443343, + "grad_norm": 0.038794610649347305, + "learning_rate": 1.7488087081482557e-06, + "loss": 0.0031, + "num_input_tokens_seen": 34232384, + "step": 162205 + }, + { + "epoch": 17.844884488448844, + "grad_norm": 1.3805198669433594, + "learning_rate": 1.747926936984584e-06, + "loss": 0.0268, + "num_input_tokens_seen": 34233376, + "step": 162210 + }, + { + "epoch": 17.845434543454346, + "grad_norm": 0.03487243875861168, + "learning_rate": 1.747045380124107e-06, + "loss": 0.0606, + "num_input_tokens_seen": 34234432, + "step": 162215 + }, + { + "epoch": 17.845984598459847, + "grad_norm": 0.0026413036976009607, + "learning_rate": 1.7461640375749543e-06, + "loss": 0.0029, + "num_input_tokens_seen": 34235456, + "step": 162220 + }, + { + "epoch": 17.846534653465348, + "grad_norm": 0.1531987339258194, + "learning_rate": 1.74528290934525e-06, + "loss": 0.0047, + "num_input_tokens_seen": 34236512, + "step": 162225 + }, + { + "epoch": 17.847084708470845, + "grad_norm": 0.15117420256137848, + "learning_rate": 1.7444019954431178e-06, + "loss": 0.004, + "num_input_tokens_seen": 34237600, + "step": 162230 + }, + { + "epoch": 17.847634763476346, + "grad_norm": 0.018408458679914474, + "learning_rate": 1.743521295876674e-06, + "loss": 0.0072, + "num_input_tokens_seen": 34238688, + "step": 162235 + }, + { + "epoch": 17.848184818481847, + "grad_norm": 0.016383588314056396, + "learning_rate": 1.742640810654028e-06, + "loss": 0.0511, + "num_input_tokens_seen": 34239744, + "step": 162240 + }, + { + "epoch": 17.84873487348735, + "grad_norm": 1.8580375909805298, + "learning_rate": 1.7417605397833103e-06, + "loss": 0.189, + "num_input_tokens_seen": 34240800, + "step": 162245 + }, + { + "epoch": 17.84928492849285, + "grad_norm": 0.03248032182455063, + "learning_rate": 1.7408804832726194e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34241856, + "step": 162250 + }, + { + "epoch": 17.84983498349835, + "grad_norm": 0.06063692644238472, + "learning_rate": 1.7400006411300712e-06, + "loss": 0.0048, + "num_input_tokens_seen": 34242912, + "step": 162255 + }, + { + "epoch": 17.850385038503852, + "grad_norm": 0.007368071470409632, + "learning_rate": 1.7391210133637815e-06, + "loss": 0.0022, + "num_input_tokens_seen": 34243904, + "step": 162260 + }, + { + "epoch": 17.85093509350935, + "grad_norm": 0.06316454708576202, + "learning_rate": 1.7382415999818462e-06, + "loss": 0.0022, + "num_input_tokens_seen": 34244928, + "step": 162265 + }, + { + "epoch": 17.85148514851485, + "grad_norm": 0.7318447828292847, + "learning_rate": 1.7373624009923817e-06, + "loss": 0.0147, + "num_input_tokens_seen": 34245984, + "step": 162270 + }, + { + "epoch": 17.85203520352035, + "grad_norm": 0.08058450371026993, + "learning_rate": 1.7364834164034782e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34246976, + "step": 162275 + }, + { + "epoch": 17.852585258525853, + "grad_norm": 0.6786344051361084, + "learning_rate": 1.735604646223249e-06, + "loss": 0.0083, + "num_input_tokens_seen": 34248064, + "step": 162280 + }, + { + "epoch": 17.853135313531354, + "grad_norm": 0.02391563355922699, + "learning_rate": 1.73472609045979e-06, + "loss": 0.0381, + "num_input_tokens_seen": 34249088, + "step": 162285 + }, + { + "epoch": 17.853685368536855, + "grad_norm": 6.546896934509277, + "learning_rate": 1.7338477491211923e-06, + "loss": 0.0727, + "num_input_tokens_seen": 34250208, + "step": 162290 + }, + { + "epoch": 17.854235423542356, + "grad_norm": 0.010134940035641193, + "learning_rate": 1.7329696222155607e-06, + "loss": 0.0007, + "num_input_tokens_seen": 34251200, + "step": 162295 + }, + { + "epoch": 17.854785478547853, + "grad_norm": 0.01945057138800621, + "learning_rate": 1.732091709750977e-06, + "loss": 0.0103, + "num_input_tokens_seen": 34252288, + "step": 162300 + }, + { + "epoch": 17.855335533553355, + "grad_norm": 0.15855644643306732, + "learning_rate": 1.7312140117355464e-06, + "loss": 0.0087, + "num_input_tokens_seen": 34253312, + "step": 162305 + }, + { + "epoch": 17.855885588558856, + "grad_norm": 3.0203213691711426, + "learning_rate": 1.7303365281773453e-06, + "loss": 0.1539, + "num_input_tokens_seen": 34254304, + "step": 162310 + }, + { + "epoch": 17.856435643564357, + "grad_norm": 1.421280860900879, + "learning_rate": 1.729459259084465e-06, + "loss": 0.0987, + "num_input_tokens_seen": 34255328, + "step": 162315 + }, + { + "epoch": 17.856985698569858, + "grad_norm": 0.0408187061548233, + "learning_rate": 1.7285822044649985e-06, + "loss": 0.0022, + "num_input_tokens_seen": 34256480, + "step": 162320 + }, + { + "epoch": 17.85753575357536, + "grad_norm": 0.04860377311706543, + "learning_rate": 1.7277053643270203e-06, + "loss": 0.0051, + "num_input_tokens_seen": 34257536, + "step": 162325 + }, + { + "epoch": 17.858085808580856, + "grad_norm": 0.004048035945743322, + "learning_rate": 1.7268287386786153e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34258560, + "step": 162330 + }, + { + "epoch": 17.858635863586358, + "grad_norm": 0.020687993615865707, + "learning_rate": 1.7259523275278634e-06, + "loss": 0.0939, + "num_input_tokens_seen": 34259616, + "step": 162335 + }, + { + "epoch": 17.85918591859186, + "grad_norm": 0.007646922022104263, + "learning_rate": 1.7250761308828356e-06, + "loss": 0.0031, + "num_input_tokens_seen": 34260672, + "step": 162340 + }, + { + "epoch": 17.85973597359736, + "grad_norm": 0.007109489291906357, + "learning_rate": 1.7242001487516175e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34261792, + "step": 162345 + }, + { + "epoch": 17.86028602860286, + "grad_norm": 0.04608738422393799, + "learning_rate": 1.72332438114228e-06, + "loss": 0.0161, + "num_input_tokens_seen": 34262816, + "step": 162350 + }, + { + "epoch": 17.860836083608362, + "grad_norm": 0.12326496094465256, + "learning_rate": 1.7224488280628864e-06, + "loss": 0.0036, + "num_input_tokens_seen": 34263840, + "step": 162355 + }, + { + "epoch": 17.861386138613863, + "grad_norm": 0.007248086389154196, + "learning_rate": 1.7215734895215163e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34264896, + "step": 162360 + }, + { + "epoch": 17.86193619361936, + "grad_norm": 0.01476180087774992, + "learning_rate": 1.7206983655262298e-06, + "loss": 0.0009, + "num_input_tokens_seen": 34265920, + "step": 162365 + }, + { + "epoch": 17.86248624862486, + "grad_norm": 0.016640182584524155, + "learning_rate": 1.7198234560850984e-06, + "loss": 0.0458, + "num_input_tokens_seen": 34267008, + "step": 162370 + }, + { + "epoch": 17.863036303630363, + "grad_norm": 0.13509289920330048, + "learning_rate": 1.718948761206185e-06, + "loss": 0.0194, + "num_input_tokens_seen": 34268096, + "step": 162375 + }, + { + "epoch": 17.863586358635864, + "grad_norm": 0.07115458697080612, + "learning_rate": 1.7180742808975443e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34269184, + "step": 162380 + }, + { + "epoch": 17.864136413641365, + "grad_norm": 0.6361761093139648, + "learning_rate": 1.7172000151672473e-06, + "loss": 0.0084, + "num_input_tokens_seen": 34270208, + "step": 162385 + }, + { + "epoch": 17.864686468646866, + "grad_norm": 3.580472469329834, + "learning_rate": 1.716325964023341e-06, + "loss": 0.0799, + "num_input_tokens_seen": 34271168, + "step": 162390 + }, + { + "epoch": 17.865236523652364, + "grad_norm": 0.09634851664304733, + "learning_rate": 1.7154521274738849e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34272192, + "step": 162395 + }, + { + "epoch": 17.865786578657865, + "grad_norm": 0.06150886416435242, + "learning_rate": 1.7145785055269397e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34273184, + "step": 162400 + }, + { + "epoch": 17.866336633663366, + "grad_norm": 0.6717583537101746, + "learning_rate": 1.7137050981905433e-06, + "loss": 0.0067, + "num_input_tokens_seen": 34274272, + "step": 162405 + }, + { + "epoch": 17.866886688668867, + "grad_norm": 0.03759857639670372, + "learning_rate": 1.7128319054727614e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34275296, + "step": 162410 + }, + { + "epoch": 17.867436743674368, + "grad_norm": 0.27145522832870483, + "learning_rate": 1.7119589273816267e-06, + "loss": 0.0038, + "num_input_tokens_seen": 34276320, + "step": 162415 + }, + { + "epoch": 17.86798679867987, + "grad_norm": 0.024357685819268227, + "learning_rate": 1.7110861639251962e-06, + "loss": 0.0498, + "num_input_tokens_seen": 34277344, + "step": 162420 + }, + { + "epoch": 17.86853685368537, + "grad_norm": 0.06463979929685593, + "learning_rate": 1.7102136151115056e-06, + "loss": 0.0467, + "num_input_tokens_seen": 34278432, + "step": 162425 + }, + { + "epoch": 17.869086908690868, + "grad_norm": 0.005051227752119303, + "learning_rate": 1.709341280948601e-06, + "loss": 0.0014, + "num_input_tokens_seen": 34279424, + "step": 162430 + }, + { + "epoch": 17.86963696369637, + "grad_norm": 2.1123886108398438, + "learning_rate": 1.708469161444526e-06, + "loss": 0.3175, + "num_input_tokens_seen": 34280512, + "step": 162435 + }, + { + "epoch": 17.87018701870187, + "grad_norm": 0.00485136266797781, + "learning_rate": 1.70759725660731e-06, + "loss": 0.003, + "num_input_tokens_seen": 34281568, + "step": 162440 + }, + { + "epoch": 17.87073707370737, + "grad_norm": 0.004736354574561119, + "learning_rate": 1.7067255664449999e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34282656, + "step": 162445 + }, + { + "epoch": 17.871287128712872, + "grad_norm": 0.012985206209123135, + "learning_rate": 1.7058540909656196e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34283744, + "step": 162450 + }, + { + "epoch": 17.871837183718373, + "grad_norm": 0.03484009578824043, + "learning_rate": 1.7049828301771986e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34284800, + "step": 162455 + }, + { + "epoch": 17.87238723872387, + "grad_norm": 0.5076724886894226, + "learning_rate": 1.704111784087778e-06, + "loss": 0.0049, + "num_input_tokens_seen": 34285792, + "step": 162460 + }, + { + "epoch": 17.872937293729372, + "grad_norm": 2.0932374000549316, + "learning_rate": 1.703240952705379e-06, + "loss": 0.0121, + "num_input_tokens_seen": 34286816, + "step": 162465 + }, + { + "epoch": 17.873487348734873, + "grad_norm": 0.02329309657216072, + "learning_rate": 1.7023703360380339e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34287872, + "step": 162470 + }, + { + "epoch": 17.874037403740374, + "grad_norm": 0.06949500739574432, + "learning_rate": 1.7014999340937588e-06, + "loss": 0.0561, + "num_input_tokens_seen": 34288864, + "step": 162475 + }, + { + "epoch": 17.874587458745875, + "grad_norm": 0.08503668010234833, + "learning_rate": 1.7006297468805776e-06, + "loss": 0.0111, + "num_input_tokens_seen": 34289952, + "step": 162480 + }, + { + "epoch": 17.875137513751376, + "grad_norm": 0.22046373784542084, + "learning_rate": 1.6997597744065119e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34291008, + "step": 162485 + }, + { + "epoch": 17.875687568756877, + "grad_norm": 1.4508007764816284, + "learning_rate": 1.6988900166795802e-06, + "loss": 0.0122, + "num_input_tokens_seen": 34292096, + "step": 162490 + }, + { + "epoch": 17.876237623762375, + "grad_norm": 0.006774222943931818, + "learning_rate": 1.698020473707801e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34293216, + "step": 162495 + }, + { + "epoch": 17.876787678767876, + "grad_norm": 0.07397475093603134, + "learning_rate": 1.6971511454991874e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34294272, + "step": 162500 + }, + { + "epoch": 17.877337733773377, + "grad_norm": 0.011884561739861965, + "learning_rate": 1.696282032061744e-06, + "loss": 0.0278, + "num_input_tokens_seen": 34295360, + "step": 162505 + }, + { + "epoch": 17.877887788778878, + "grad_norm": 0.06150508299469948, + "learning_rate": 1.6954131334034922e-06, + "loss": 0.004, + "num_input_tokens_seen": 34296416, + "step": 162510 + }, + { + "epoch": 17.87843784378438, + "grad_norm": 0.0319514237344265, + "learning_rate": 1.6945444495324259e-06, + "loss": 0.0045, + "num_input_tokens_seen": 34297504, + "step": 162515 + }, + { + "epoch": 17.87898789878988, + "grad_norm": 1.2645288705825806, + "learning_rate": 1.6936759804565688e-06, + "loss": 0.0357, + "num_input_tokens_seen": 34298528, + "step": 162520 + }, + { + "epoch": 17.879537953795378, + "grad_norm": 0.03681229054927826, + "learning_rate": 1.6928077261839148e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34299552, + "step": 162525 + }, + { + "epoch": 17.88008800880088, + "grad_norm": 0.07402101159095764, + "learning_rate": 1.6919396867224658e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34300704, + "step": 162530 + }, + { + "epoch": 17.88063806380638, + "grad_norm": 3.586107015609741, + "learning_rate": 1.691071862080229e-06, + "loss": 0.2054, + "num_input_tokens_seen": 34301792, + "step": 162535 + }, + { + "epoch": 17.88118811881188, + "grad_norm": 0.22948984801769257, + "learning_rate": 1.6902042522651955e-06, + "loss": 0.0134, + "num_input_tokens_seen": 34302816, + "step": 162540 + }, + { + "epoch": 17.881738173817382, + "grad_norm": 0.015251962468028069, + "learning_rate": 1.6893368572853618e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34303840, + "step": 162545 + }, + { + "epoch": 17.882288228822883, + "grad_norm": 0.020972473546862602, + "learning_rate": 1.6884696771487297e-06, + "loss": 0.0036, + "num_input_tokens_seen": 34304896, + "step": 162550 + }, + { + "epoch": 17.882838283828384, + "grad_norm": 0.0363457091152668, + "learning_rate": 1.6876027118632815e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34305952, + "step": 162555 + }, + { + "epoch": 17.883388338833882, + "grad_norm": 0.051129214465618134, + "learning_rate": 1.6867359614370192e-06, + "loss": 0.0462, + "num_input_tokens_seen": 34306976, + "step": 162560 + }, + { + "epoch": 17.883938393839383, + "grad_norm": 0.032384518533945084, + "learning_rate": 1.685869425877923e-06, + "loss": 0.06, + "num_input_tokens_seen": 34308096, + "step": 162565 + }, + { + "epoch": 17.884488448844884, + "grad_norm": 2.1075549125671387, + "learning_rate": 1.6850031051939774e-06, + "loss": 0.0643, + "num_input_tokens_seen": 34309120, + "step": 162570 + }, + { + "epoch": 17.885038503850385, + "grad_norm": 0.007253974210470915, + "learning_rate": 1.6841369993931683e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34310112, + "step": 162575 + }, + { + "epoch": 17.885588558855886, + "grad_norm": 0.5135143399238586, + "learning_rate": 1.6832711084834835e-06, + "loss": 0.0093, + "num_input_tokens_seen": 34311136, + "step": 162580 + }, + { + "epoch": 17.886138613861387, + "grad_norm": 0.04880190268158913, + "learning_rate": 1.6824054324729027e-06, + "loss": 0.0007, + "num_input_tokens_seen": 34312192, + "step": 162585 + }, + { + "epoch": 17.88668866886689, + "grad_norm": 0.024862008169293404, + "learning_rate": 1.681539971369403e-06, + "loss": 0.0009, + "num_input_tokens_seen": 34313216, + "step": 162590 + }, + { + "epoch": 17.887238723872386, + "grad_norm": 0.03159738704562187, + "learning_rate": 1.6806747251809584e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34314272, + "step": 162595 + }, + { + "epoch": 17.887788778877887, + "grad_norm": 0.11691258102655411, + "learning_rate": 1.6798096939155404e-06, + "loss": 0.1435, + "num_input_tokens_seen": 34315296, + "step": 162600 + }, + { + "epoch": 17.888338833883388, + "grad_norm": 0.06600785255432129, + "learning_rate": 1.678944877581129e-06, + "loss": 0.0009, + "num_input_tokens_seen": 34316384, + "step": 162605 + }, + { + "epoch": 17.88888888888889, + "grad_norm": 1.9682848453521729, + "learning_rate": 1.6780802761856979e-06, + "loss": 0.0382, + "num_input_tokens_seen": 34317536, + "step": 162610 + }, + { + "epoch": 17.88943894389439, + "grad_norm": 0.009533128701150417, + "learning_rate": 1.6772158897372077e-06, + "loss": 0.0274, + "num_input_tokens_seen": 34318592, + "step": 162615 + }, + { + "epoch": 17.88998899889989, + "grad_norm": 0.016676949337124825, + "learning_rate": 1.676351718243624e-06, + "loss": 0.0669, + "num_input_tokens_seen": 34319616, + "step": 162620 + }, + { + "epoch": 17.89053905390539, + "grad_norm": 0.032699890434741974, + "learning_rate": 1.6754877617129183e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34320768, + "step": 162625 + }, + { + "epoch": 17.89108910891089, + "grad_norm": 0.0033362817484885454, + "learning_rate": 1.6746240201530456e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34321920, + "step": 162630 + }, + { + "epoch": 17.89163916391639, + "grad_norm": 0.03231455013155937, + "learning_rate": 1.6737604935719686e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34322944, + "step": 162635 + }, + { + "epoch": 17.892189218921892, + "grad_norm": 0.018674883991479874, + "learning_rate": 1.6728971819776562e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34324000, + "step": 162640 + }, + { + "epoch": 17.892739273927393, + "grad_norm": 0.20410150289535522, + "learning_rate": 1.6720340853780491e-06, + "loss": 0.0048, + "num_input_tokens_seen": 34325120, + "step": 162645 + }, + { + "epoch": 17.893289328932894, + "grad_norm": 0.17374135553836823, + "learning_rate": 1.6711712037811162e-06, + "loss": 0.0324, + "num_input_tokens_seen": 34326240, + "step": 162650 + }, + { + "epoch": 17.893839383938392, + "grad_norm": 0.05986079201102257, + "learning_rate": 1.6703085371948007e-06, + "loss": 0.0543, + "num_input_tokens_seen": 34327264, + "step": 162655 + }, + { + "epoch": 17.894389438943893, + "grad_norm": 0.11373056471347809, + "learning_rate": 1.669446085627055e-06, + "loss": 0.0038, + "num_input_tokens_seen": 34328256, + "step": 162660 + }, + { + "epoch": 17.894939493949394, + "grad_norm": 0.09172850102186203, + "learning_rate": 1.6685838490858364e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34329344, + "step": 162665 + }, + { + "epoch": 17.895489548954895, + "grad_norm": 6.81296968460083, + "learning_rate": 1.6677218275790773e-06, + "loss": 0.0585, + "num_input_tokens_seen": 34330464, + "step": 162670 + }, + { + "epoch": 17.896039603960396, + "grad_norm": 0.3617149591445923, + "learning_rate": 1.6668600211147385e-06, + "loss": 0.0111, + "num_input_tokens_seen": 34331616, + "step": 162675 + }, + { + "epoch": 17.896589658965897, + "grad_norm": 0.08702284842729568, + "learning_rate": 1.6659984297007465e-06, + "loss": 0.0058, + "num_input_tokens_seen": 34332704, + "step": 162680 + }, + { + "epoch": 17.8971397139714, + "grad_norm": 0.03123229369521141, + "learning_rate": 1.665137053345056e-06, + "loss": 0.058, + "num_input_tokens_seen": 34333728, + "step": 162685 + }, + { + "epoch": 17.897689768976896, + "grad_norm": 1.3660932779312134, + "learning_rate": 1.6642758920555972e-06, + "loss": 0.0819, + "num_input_tokens_seen": 34334752, + "step": 162690 + }, + { + "epoch": 17.898239823982397, + "grad_norm": 3.977926254272461, + "learning_rate": 1.663414945840308e-06, + "loss": 0.0379, + "num_input_tokens_seen": 34335776, + "step": 162695 + }, + { + "epoch": 17.8987898789879, + "grad_norm": 0.03860209137201309, + "learning_rate": 1.6625542147071293e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34336832, + "step": 162700 + }, + { + "epoch": 17.8993399339934, + "grad_norm": 0.03339875862002373, + "learning_rate": 1.661693698663988e-06, + "loss": 0.1256, + "num_input_tokens_seen": 34337888, + "step": 162705 + }, + { + "epoch": 17.8998899889989, + "grad_norm": 0.18356889486312866, + "learning_rate": 1.6608333977188194e-06, + "loss": 0.0203, + "num_input_tokens_seen": 34338880, + "step": 162710 + }, + { + "epoch": 17.9004400440044, + "grad_norm": 0.009275618940591812, + "learning_rate": 1.659973311879545e-06, + "loss": 0.0365, + "num_input_tokens_seen": 34339936, + "step": 162715 + }, + { + "epoch": 17.900990099009903, + "grad_norm": 0.00794192124158144, + "learning_rate": 1.6591134411541004e-06, + "loss": 0.075, + "num_input_tokens_seen": 34340992, + "step": 162720 + }, + { + "epoch": 17.9015401540154, + "grad_norm": 2.5037038326263428, + "learning_rate": 1.6582537855504065e-06, + "loss": 0.0607, + "num_input_tokens_seen": 34342048, + "step": 162725 + }, + { + "epoch": 17.9020902090209, + "grad_norm": 0.0049437955021858215, + "learning_rate": 1.6573943450763907e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34343104, + "step": 162730 + }, + { + "epoch": 17.902640264026402, + "grad_norm": 1.519610047340393, + "learning_rate": 1.6565351197399658e-06, + "loss": 0.1132, + "num_input_tokens_seen": 34344096, + "step": 162735 + }, + { + "epoch": 17.903190319031903, + "grad_norm": 9.307713508605957, + "learning_rate": 1.6556761095490592e-06, + "loss": 0.0791, + "num_input_tokens_seen": 34345152, + "step": 162740 + }, + { + "epoch": 17.903740374037405, + "grad_norm": 0.05176049843430519, + "learning_rate": 1.654817314511581e-06, + "loss": 0.001, + "num_input_tokens_seen": 34346208, + "step": 162745 + }, + { + "epoch": 17.904290429042906, + "grad_norm": 0.03568553179502487, + "learning_rate": 1.6539587346354469e-06, + "loss": 0.0191, + "num_input_tokens_seen": 34347296, + "step": 162750 + }, + { + "epoch": 17.904840484048403, + "grad_norm": 1.3678346872329712, + "learning_rate": 1.6531003699285786e-06, + "loss": 0.0081, + "num_input_tokens_seen": 34348416, + "step": 162755 + }, + { + "epoch": 17.905390539053904, + "grad_norm": 0.0301025602966547, + "learning_rate": 1.6522422203988753e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34349408, + "step": 162760 + }, + { + "epoch": 17.905940594059405, + "grad_norm": 0.035822171717882156, + "learning_rate": 1.6513842860542583e-06, + "loss": 0.0562, + "num_input_tokens_seen": 34350432, + "step": 162765 + }, + { + "epoch": 17.906490649064907, + "grad_norm": 0.029809650033712387, + "learning_rate": 1.6505265669026243e-06, + "loss": 0.0977, + "num_input_tokens_seen": 34351424, + "step": 162770 + }, + { + "epoch": 17.907040704070408, + "grad_norm": 0.04092414304614067, + "learning_rate": 1.6496690629518834e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34352480, + "step": 162775 + }, + { + "epoch": 17.90759075907591, + "grad_norm": 0.10385202616453171, + "learning_rate": 1.6488117742099406e-06, + "loss": 0.0072, + "num_input_tokens_seen": 34353568, + "step": 162780 + }, + { + "epoch": 17.90814081408141, + "grad_norm": 0.01284661702811718, + "learning_rate": 1.6479547006846925e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34354592, + "step": 162785 + }, + { + "epoch": 17.908690869086907, + "grad_norm": 0.009584885090589523, + "learning_rate": 1.6470978423840406e-06, + "loss": 0.0006, + "num_input_tokens_seen": 34355648, + "step": 162790 + }, + { + "epoch": 17.90924092409241, + "grad_norm": 2.2707762718200684, + "learning_rate": 1.6462411993158815e-06, + "loss": 0.017, + "num_input_tokens_seen": 34356736, + "step": 162795 + }, + { + "epoch": 17.90979097909791, + "grad_norm": 0.011800101026892662, + "learning_rate": 1.6453847714881149e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34357696, + "step": 162800 + }, + { + "epoch": 17.91034103410341, + "grad_norm": 0.011384635232388973, + "learning_rate": 1.6445285589086257e-06, + "loss": 0.0249, + "num_input_tokens_seen": 34358752, + "step": 162805 + }, + { + "epoch": 17.91089108910891, + "grad_norm": 0.26815298199653625, + "learning_rate": 1.6436725615853105e-06, + "loss": 0.0115, + "num_input_tokens_seen": 34359808, + "step": 162810 + }, + { + "epoch": 17.911441144114413, + "grad_norm": 0.09348130226135254, + "learning_rate": 1.6428167795260601e-06, + "loss": 0.0152, + "num_input_tokens_seen": 34360864, + "step": 162815 + }, + { + "epoch": 17.91199119911991, + "grad_norm": 0.3026757836341858, + "learning_rate": 1.6419612127387574e-06, + "loss": 0.0038, + "num_input_tokens_seen": 34361952, + "step": 162820 + }, + { + "epoch": 17.91254125412541, + "grad_norm": 0.48837342858314514, + "learning_rate": 1.641105861231293e-06, + "loss": 0.0045, + "num_input_tokens_seen": 34363008, + "step": 162825 + }, + { + "epoch": 17.913091309130913, + "grad_norm": 0.017671581357717514, + "learning_rate": 1.6402507250115411e-06, + "loss": 0.0156, + "num_input_tokens_seen": 34364032, + "step": 162830 + }, + { + "epoch": 17.913641364136414, + "grad_norm": 0.002160718897357583, + "learning_rate": 1.639395804087393e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34365088, + "step": 162835 + }, + { + "epoch": 17.914191419141915, + "grad_norm": 0.0820283517241478, + "learning_rate": 1.6385410984667254e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34366208, + "step": 162840 + }, + { + "epoch": 17.914741474147416, + "grad_norm": 0.06639721989631653, + "learning_rate": 1.6376866081574127e-06, + "loss": 0.0043, + "num_input_tokens_seen": 34367232, + "step": 162845 + }, + { + "epoch": 17.915291529152917, + "grad_norm": 0.08245112746953964, + "learning_rate": 1.6368323331673346e-06, + "loss": 0.0062, + "num_input_tokens_seen": 34368320, + "step": 162850 + }, + { + "epoch": 17.915841584158414, + "grad_norm": 0.34361207485198975, + "learning_rate": 1.6359782735043627e-06, + "loss": 0.0413, + "num_input_tokens_seen": 34369344, + "step": 162855 + }, + { + "epoch": 17.916391639163916, + "grad_norm": 0.19921119511127472, + "learning_rate": 1.6351244291763629e-06, + "loss": 0.0075, + "num_input_tokens_seen": 34370400, + "step": 162860 + }, + { + "epoch": 17.916941694169417, + "grad_norm": 0.06950516998767853, + "learning_rate": 1.634270800191212e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34371456, + "step": 162865 + }, + { + "epoch": 17.917491749174918, + "grad_norm": 0.01914861984550953, + "learning_rate": 1.6334173865567737e-06, + "loss": 0.001, + "num_input_tokens_seen": 34372512, + "step": 162870 + }, + { + "epoch": 17.91804180418042, + "grad_norm": 0.01510967593640089, + "learning_rate": 1.6325641882809188e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34373472, + "step": 162875 + }, + { + "epoch": 17.91859185918592, + "grad_norm": 0.10745670646429062, + "learning_rate": 1.6317112053715083e-06, + "loss": 0.0074, + "num_input_tokens_seen": 34374528, + "step": 162880 + }, + { + "epoch": 17.919141914191417, + "grad_norm": 0.08792299032211304, + "learning_rate": 1.6308584378363968e-06, + "loss": 0.0993, + "num_input_tokens_seen": 34375552, + "step": 162885 + }, + { + "epoch": 17.91969196919692, + "grad_norm": 0.03671792894601822, + "learning_rate": 1.6300058856834498e-06, + "loss": 0.0447, + "num_input_tokens_seen": 34376672, + "step": 162890 + }, + { + "epoch": 17.92024202420242, + "grad_norm": 0.03241094946861267, + "learning_rate": 1.6291535489205256e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34377760, + "step": 162895 + }, + { + "epoch": 17.92079207920792, + "grad_norm": 0.1571619212627411, + "learning_rate": 1.6283014275554814e-06, + "loss": 0.0076, + "num_input_tokens_seen": 34378816, + "step": 162900 + }, + { + "epoch": 17.921342134213422, + "grad_norm": 0.07000058144330978, + "learning_rate": 1.6274495215961694e-06, + "loss": 0.0274, + "num_input_tokens_seen": 34379840, + "step": 162905 + }, + { + "epoch": 17.921892189218923, + "grad_norm": 0.010643582791090012, + "learning_rate": 1.626597831050436e-06, + "loss": 0.0528, + "num_input_tokens_seen": 34380896, + "step": 162910 + }, + { + "epoch": 17.922442244224424, + "grad_norm": 2.014930486679077, + "learning_rate": 1.625746355926136e-06, + "loss": 0.0471, + "num_input_tokens_seen": 34382016, + "step": 162915 + }, + { + "epoch": 17.92299229922992, + "grad_norm": 0.05424008145928383, + "learning_rate": 1.6248950962311133e-06, + "loss": 0.0094, + "num_input_tokens_seen": 34383072, + "step": 162920 + }, + { + "epoch": 17.923542354235423, + "grad_norm": 0.23440763354301453, + "learning_rate": 1.624044051973217e-06, + "loss": 0.0058, + "num_input_tokens_seen": 34384160, + "step": 162925 + }, + { + "epoch": 17.924092409240924, + "grad_norm": 0.1410609483718872, + "learning_rate": 1.6231932231602936e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34385216, + "step": 162930 + }, + { + "epoch": 17.924642464246425, + "grad_norm": 0.009078803472220898, + "learning_rate": 1.622342609800176e-06, + "loss": 0.0014, + "num_input_tokens_seen": 34386208, + "step": 162935 + }, + { + "epoch": 17.925192519251926, + "grad_norm": 0.02150527946650982, + "learning_rate": 1.6214922119007159e-06, + "loss": 0.0185, + "num_input_tokens_seen": 34387328, + "step": 162940 + }, + { + "epoch": 17.925742574257427, + "grad_norm": 0.025834539905190468, + "learning_rate": 1.6206420294697377e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34388416, + "step": 162945 + }, + { + "epoch": 17.926292629262925, + "grad_norm": 0.23323994874954224, + "learning_rate": 1.6197920625150826e-06, + "loss": 0.1294, + "num_input_tokens_seen": 34389472, + "step": 162950 + }, + { + "epoch": 17.926842684268426, + "grad_norm": 0.019070619717240334, + "learning_rate": 1.6189423110445884e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34390496, + "step": 162955 + }, + { + "epoch": 17.927392739273927, + "grad_norm": 0.5230884552001953, + "learning_rate": 1.6180927750660823e-06, + "loss": 0.0096, + "num_input_tokens_seen": 34391488, + "step": 162960 + }, + { + "epoch": 17.927942794279428, + "grad_norm": 0.43492820858955383, + "learning_rate": 1.617243454587397e-06, + "loss": 0.0043, + "num_input_tokens_seen": 34392576, + "step": 162965 + }, + { + "epoch": 17.92849284928493, + "grad_norm": 0.010407079011201859, + "learning_rate": 1.6163943496163597e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34393600, + "step": 162970 + }, + { + "epoch": 17.92904290429043, + "grad_norm": 0.5349811911582947, + "learning_rate": 1.6155454601607916e-06, + "loss": 0.0037, + "num_input_tokens_seen": 34394624, + "step": 162975 + }, + { + "epoch": 17.92959295929593, + "grad_norm": 0.8996835947036743, + "learning_rate": 1.6146967862285201e-06, + "loss": 0.0097, + "num_input_tokens_seen": 34395680, + "step": 162980 + }, + { + "epoch": 17.93014301430143, + "grad_norm": 0.012519260868430138, + "learning_rate": 1.6138483278273663e-06, + "loss": 0.0039, + "num_input_tokens_seen": 34396704, + "step": 162985 + }, + { + "epoch": 17.93069306930693, + "grad_norm": 0.023192651569843292, + "learning_rate": 1.613000084965155e-06, + "loss": 0.0307, + "num_input_tokens_seen": 34397760, + "step": 162990 + }, + { + "epoch": 17.93124312431243, + "grad_norm": 0.024550508707761765, + "learning_rate": 1.612152057649699e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34398816, + "step": 162995 + }, + { + "epoch": 17.931793179317932, + "grad_norm": 0.03809484839439392, + "learning_rate": 1.6113042458888118e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34399872, + "step": 163000 + }, + { + "epoch": 17.932343234323433, + "grad_norm": 0.035765573382377625, + "learning_rate": 1.6104566496903118e-06, + "loss": 0.0008, + "num_input_tokens_seen": 34400992, + "step": 163005 + }, + { + "epoch": 17.932893289328934, + "grad_norm": 0.004834462888538837, + "learning_rate": 1.609609269062007e-06, + "loss": 0.0004, + "num_input_tokens_seen": 34402080, + "step": 163010 + }, + { + "epoch": 17.933443344334435, + "grad_norm": 0.33846917748451233, + "learning_rate": 1.6087621040117157e-06, + "loss": 0.1349, + "num_input_tokens_seen": 34403168, + "step": 163015 + }, + { + "epoch": 17.933993399339933, + "grad_norm": 0.06953129172325134, + "learning_rate": 1.6079151545472377e-06, + "loss": 0.0328, + "num_input_tokens_seen": 34404224, + "step": 163020 + }, + { + "epoch": 17.934543454345434, + "grad_norm": 0.5012115240097046, + "learning_rate": 1.6070684206763775e-06, + "loss": 0.0054, + "num_input_tokens_seen": 34405344, + "step": 163025 + }, + { + "epoch": 17.935093509350935, + "grad_norm": 0.015454967506229877, + "learning_rate": 1.6062219024069485e-06, + "loss": 0.0046, + "num_input_tokens_seen": 34406432, + "step": 163030 + }, + { + "epoch": 17.935643564356436, + "grad_norm": 0.06437978893518448, + "learning_rate": 1.6053755997467417e-06, + "loss": 0.0031, + "num_input_tokens_seen": 34407488, + "step": 163035 + }, + { + "epoch": 17.936193619361937, + "grad_norm": 0.37320682406425476, + "learning_rate": 1.604529512703562e-06, + "loss": 0.0083, + "num_input_tokens_seen": 34408448, + "step": 163040 + }, + { + "epoch": 17.936743674367438, + "grad_norm": 0.08324738591909409, + "learning_rate": 1.6036836412852113e-06, + "loss": 0.0014, + "num_input_tokens_seen": 34409440, + "step": 163045 + }, + { + "epoch": 17.937293729372936, + "grad_norm": 0.16514864563941956, + "learning_rate": 1.602837985499478e-06, + "loss": 0.0029, + "num_input_tokens_seen": 34410432, + "step": 163050 + }, + { + "epoch": 17.937843784378437, + "grad_norm": 0.10881945490837097, + "learning_rate": 1.601992545354164e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34411488, + "step": 163055 + }, + { + "epoch": 17.938393839383938, + "grad_norm": 0.028009872883558273, + "learning_rate": 1.601147320857052e-06, + "loss": 0.0071, + "num_input_tokens_seen": 34412512, + "step": 163060 + }, + { + "epoch": 17.93894389438944, + "grad_norm": 0.06015462428331375, + "learning_rate": 1.600302312015939e-06, + "loss": 0.0064, + "num_input_tokens_seen": 34413664, + "step": 163065 + }, + { + "epoch": 17.93949394939494, + "grad_norm": 0.17653888463974, + "learning_rate": 1.599457518838615e-06, + "loss": 0.0658, + "num_input_tokens_seen": 34414784, + "step": 163070 + }, + { + "epoch": 17.94004400440044, + "grad_norm": 0.16308428347110748, + "learning_rate": 1.5986129413328582e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34415904, + "step": 163075 + }, + { + "epoch": 17.94059405940594, + "grad_norm": 0.016674915328621864, + "learning_rate": 1.5977685795064618e-06, + "loss": 0.1081, + "num_input_tokens_seen": 34416960, + "step": 163080 + }, + { + "epoch": 17.94114411441144, + "grad_norm": 0.02838549204170704, + "learning_rate": 1.5969244333671974e-06, + "loss": 0.0739, + "num_input_tokens_seen": 34417952, + "step": 163085 + }, + { + "epoch": 17.94169416941694, + "grad_norm": 0.005513285286724567, + "learning_rate": 1.596080502922856e-06, + "loss": 0.001, + "num_input_tokens_seen": 34418944, + "step": 163090 + }, + { + "epoch": 17.942244224422442, + "grad_norm": 0.1034141406416893, + "learning_rate": 1.5952367881812064e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34420000, + "step": 163095 + }, + { + "epoch": 17.942794279427943, + "grad_norm": 1.3483986854553223, + "learning_rate": 1.5943932891500257e-06, + "loss": 0.0275, + "num_input_tokens_seen": 34421088, + "step": 163100 + }, + { + "epoch": 17.943344334433444, + "grad_norm": 0.02040186896920204, + "learning_rate": 1.5935500058370995e-06, + "loss": 0.0014, + "num_input_tokens_seen": 34422176, + "step": 163105 + }, + { + "epoch": 17.943894389438945, + "grad_norm": 0.012191429734230042, + "learning_rate": 1.592706938250188e-06, + "loss": 0.0342, + "num_input_tokens_seen": 34423200, + "step": 163110 + }, + { + "epoch": 17.944444444444443, + "grad_norm": 0.01699776016175747, + "learning_rate": 1.5918640863970602e-06, + "loss": 0.078, + "num_input_tokens_seen": 34424224, + "step": 163115 + }, + { + "epoch": 17.944994499449944, + "grad_norm": 0.022148190066218376, + "learning_rate": 1.5910214502854958e-06, + "loss": 0.1259, + "num_input_tokens_seen": 34425280, + "step": 163120 + }, + { + "epoch": 17.945544554455445, + "grad_norm": 0.007966679520905018, + "learning_rate": 1.5901790299232444e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34426304, + "step": 163125 + }, + { + "epoch": 17.946094609460946, + "grad_norm": 2.555974006652832, + "learning_rate": 1.5893368253180884e-06, + "loss": 0.0582, + "num_input_tokens_seen": 34427392, + "step": 163130 + }, + { + "epoch": 17.946644664466447, + "grad_norm": 0.03883155807852745, + "learning_rate": 1.5884948364777802e-06, + "loss": 0.0209, + "num_input_tokens_seen": 34428448, + "step": 163135 + }, + { + "epoch": 17.94719471947195, + "grad_norm": 0.027140818536281586, + "learning_rate": 1.5876530634100772e-06, + "loss": 0.0004, + "num_input_tokens_seen": 34429472, + "step": 163140 + }, + { + "epoch": 17.94774477447745, + "grad_norm": 0.3616408407688141, + "learning_rate": 1.5868115061227457e-06, + "loss": 0.005, + "num_input_tokens_seen": 34430528, + "step": 163145 + }, + { + "epoch": 17.948294829482947, + "grad_norm": 0.34836113452911377, + "learning_rate": 1.5859701646235347e-06, + "loss": 0.05, + "num_input_tokens_seen": 34431616, + "step": 163150 + }, + { + "epoch": 17.948844884488448, + "grad_norm": 0.001927295234054327, + "learning_rate": 1.5851290389202022e-06, + "loss": 0.0845, + "num_input_tokens_seen": 34432768, + "step": 163155 + }, + { + "epoch": 17.94939493949395, + "grad_norm": 0.05917596071958542, + "learning_rate": 1.5842881290205031e-06, + "loss": 0.018, + "num_input_tokens_seen": 34433728, + "step": 163160 + }, + { + "epoch": 17.94994499449945, + "grad_norm": 0.009788885712623596, + "learning_rate": 1.5834474349321783e-06, + "loss": 0.0145, + "num_input_tokens_seen": 34434784, + "step": 163165 + }, + { + "epoch": 17.95049504950495, + "grad_norm": 2.4264423847198486, + "learning_rate": 1.582606956662988e-06, + "loss": 0.1653, + "num_input_tokens_seen": 34435808, + "step": 163170 + }, + { + "epoch": 17.951045104510452, + "grad_norm": 0.04282022640109062, + "learning_rate": 1.5817666942206683e-06, + "loss": 0.048, + "num_input_tokens_seen": 34436832, + "step": 163175 + }, + { + "epoch": 17.95159515951595, + "grad_norm": 4.750308513641357, + "learning_rate": 1.580926647612968e-06, + "loss": 0.1539, + "num_input_tokens_seen": 34437856, + "step": 163180 + }, + { + "epoch": 17.95214521452145, + "grad_norm": 0.024048443883657455, + "learning_rate": 1.580086816847634e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34438880, + "step": 163185 + }, + { + "epoch": 17.952695269526952, + "grad_norm": 0.03768853098154068, + "learning_rate": 1.5792472019323962e-06, + "loss": 0.0576, + "num_input_tokens_seen": 34439936, + "step": 163190 + }, + { + "epoch": 17.953245324532453, + "grad_norm": 1.5624924898147583, + "learning_rate": 1.5784078028750038e-06, + "loss": 0.0829, + "num_input_tokens_seen": 34440992, + "step": 163195 + }, + { + "epoch": 17.953795379537954, + "grad_norm": 0.025258535519242287, + "learning_rate": 1.577568619683184e-06, + "loss": 0.0051, + "num_input_tokens_seen": 34442016, + "step": 163200 + }, + { + "epoch": 17.954345434543455, + "grad_norm": 0.21516694128513336, + "learning_rate": 1.576729652364678e-06, + "loss": 0.0716, + "num_input_tokens_seen": 34443136, + "step": 163205 + }, + { + "epoch": 17.954895489548957, + "grad_norm": 0.019145630300045013, + "learning_rate": 1.5758909009272127e-06, + "loss": 0.003, + "num_input_tokens_seen": 34444192, + "step": 163210 + }, + { + "epoch": 17.955445544554454, + "grad_norm": 0.03333601355552673, + "learning_rate": 1.5750523653785181e-06, + "loss": 0.0338, + "num_input_tokens_seen": 34445248, + "step": 163215 + }, + { + "epoch": 17.955995599559955, + "grad_norm": 0.36657458543777466, + "learning_rate": 1.5742140457263328e-06, + "loss": 0.01, + "num_input_tokens_seen": 34446336, + "step": 163220 + }, + { + "epoch": 17.956545654565456, + "grad_norm": 1.4075182676315308, + "learning_rate": 1.5733759419783696e-06, + "loss": 0.0506, + "num_input_tokens_seen": 34447392, + "step": 163225 + }, + { + "epoch": 17.957095709570957, + "grad_norm": 0.015659170225262642, + "learning_rate": 1.572538054142364e-06, + "loss": 0.0027, + "num_input_tokens_seen": 34448416, + "step": 163230 + }, + { + "epoch": 17.95764576457646, + "grad_norm": 0.0054986970499157906, + "learning_rate": 1.5717003822260323e-06, + "loss": 0.048, + "num_input_tokens_seen": 34449472, + "step": 163235 + }, + { + "epoch": 17.95819581958196, + "grad_norm": 0.054982975125312805, + "learning_rate": 1.5708629262370905e-06, + "loss": 0.1549, + "num_input_tokens_seen": 34450592, + "step": 163240 + }, + { + "epoch": 17.958745874587457, + "grad_norm": 0.48652884364128113, + "learning_rate": 1.5700256861832713e-06, + "loss": 0.0793, + "num_input_tokens_seen": 34451552, + "step": 163245 + }, + { + "epoch": 17.959295929592958, + "grad_norm": 0.008032887242734432, + "learning_rate": 1.5691886620722768e-06, + "loss": 0.0537, + "num_input_tokens_seen": 34452576, + "step": 163250 + }, + { + "epoch": 17.95984598459846, + "grad_norm": 3.6316609382629395, + "learning_rate": 1.5683518539118314e-06, + "loss": 0.099, + "num_input_tokens_seen": 34453568, + "step": 163255 + }, + { + "epoch": 17.96039603960396, + "grad_norm": 0.016163840889930725, + "learning_rate": 1.5675152617096427e-06, + "loss": 0.1025, + "num_input_tokens_seen": 34454560, + "step": 163260 + }, + { + "epoch": 17.96094609460946, + "grad_norm": 0.012319442816078663, + "learning_rate": 1.5666788854734187e-06, + "loss": 0.0008, + "num_input_tokens_seen": 34455616, + "step": 163265 + }, + { + "epoch": 17.961496149614963, + "grad_norm": 0.035401709377765656, + "learning_rate": 1.5658427252108699e-06, + "loss": 0.0117, + "num_input_tokens_seen": 34456704, + "step": 163270 + }, + { + "epoch": 17.962046204620464, + "grad_norm": 0.07156863808631897, + "learning_rate": 1.5650067809297037e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34457728, + "step": 163275 + }, + { + "epoch": 17.96259625962596, + "grad_norm": 0.09298273175954819, + "learning_rate": 1.5641710526376307e-06, + "loss": 0.0038, + "num_input_tokens_seen": 34458816, + "step": 163280 + }, + { + "epoch": 17.963146314631462, + "grad_norm": 0.017842138186097145, + "learning_rate": 1.5633355403423477e-06, + "loss": 0.0166, + "num_input_tokens_seen": 34459840, + "step": 163285 + }, + { + "epoch": 17.963696369636963, + "grad_norm": 0.01930079236626625, + "learning_rate": 1.5625002440515484e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34460864, + "step": 163290 + }, + { + "epoch": 17.964246424642464, + "grad_norm": 0.012828153558075428, + "learning_rate": 1.5616651637729436e-06, + "loss": 0.0074, + "num_input_tokens_seen": 34461856, + "step": 163295 + }, + { + "epoch": 17.964796479647966, + "grad_norm": 2.9823379516601562, + "learning_rate": 1.560830299514221e-06, + "loss": 0.0344, + "num_input_tokens_seen": 34462912, + "step": 163300 + }, + { + "epoch": 17.965346534653467, + "grad_norm": 2.7111172676086426, + "learning_rate": 1.5599956512830777e-06, + "loss": 0.1803, + "num_input_tokens_seen": 34463936, + "step": 163305 + }, + { + "epoch": 17.965896589658964, + "grad_norm": 0.022314375266432762, + "learning_rate": 1.5591612190872102e-06, + "loss": 0.0904, + "num_input_tokens_seen": 34464960, + "step": 163310 + }, + { + "epoch": 17.966446644664465, + "grad_norm": 0.04156296327710152, + "learning_rate": 1.5583270029343011e-06, + "loss": 0.0846, + "num_input_tokens_seen": 34466016, + "step": 163315 + }, + { + "epoch": 17.966996699669966, + "grad_norm": 0.020967448130249977, + "learning_rate": 1.55749300283205e-06, + "loss": 0.006, + "num_input_tokens_seen": 34467104, + "step": 163320 + }, + { + "epoch": 17.967546754675467, + "grad_norm": 0.026017215102910995, + "learning_rate": 1.5566592187881313e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34468192, + "step": 163325 + }, + { + "epoch": 17.96809680968097, + "grad_norm": 0.015394153073430061, + "learning_rate": 1.5558256508102358e-06, + "loss": 0.0006, + "num_input_tokens_seen": 34469248, + "step": 163330 + }, + { + "epoch": 17.96864686468647, + "grad_norm": 0.07645756751298904, + "learning_rate": 1.5549922989060495e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34470336, + "step": 163335 + }, + { + "epoch": 17.96919691969197, + "grad_norm": 0.021134914830327034, + "learning_rate": 1.5541591630832465e-06, + "loss": 0.0321, + "num_input_tokens_seen": 34471392, + "step": 163340 + }, + { + "epoch": 17.96974697469747, + "grad_norm": 0.11095938831567764, + "learning_rate": 1.5533262433495093e-06, + "loss": 0.0125, + "num_input_tokens_seen": 34472448, + "step": 163345 + }, + { + "epoch": 17.97029702970297, + "grad_norm": 1.463618516921997, + "learning_rate": 1.5524935397125156e-06, + "loss": 0.0374, + "num_input_tokens_seen": 34473504, + "step": 163350 + }, + { + "epoch": 17.97084708470847, + "grad_norm": 0.06564954668283463, + "learning_rate": 1.5516610521799313e-06, + "loss": 0.007, + "num_input_tokens_seen": 34474560, + "step": 163355 + }, + { + "epoch": 17.97139713971397, + "grad_norm": 0.037960484623909, + "learning_rate": 1.550828780759439e-06, + "loss": 0.0349, + "num_input_tokens_seen": 34475584, + "step": 163360 + }, + { + "epoch": 17.971947194719473, + "grad_norm": 1.2250319719314575, + "learning_rate": 1.549996725458705e-06, + "loss": 0.0591, + "num_input_tokens_seen": 34476672, + "step": 163365 + }, + { + "epoch": 17.972497249724974, + "grad_norm": 7.693031311035156, + "learning_rate": 1.5491648862854008e-06, + "loss": 0.0283, + "num_input_tokens_seen": 34477696, + "step": 163370 + }, + { + "epoch": 17.97304730473047, + "grad_norm": 0.04028688371181488, + "learning_rate": 1.5483332632471925e-06, + "loss": 0.0061, + "num_input_tokens_seen": 34478720, + "step": 163375 + }, + { + "epoch": 17.973597359735972, + "grad_norm": 0.031694915145635605, + "learning_rate": 1.547501856351738e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34479808, + "step": 163380 + }, + { + "epoch": 17.974147414741473, + "grad_norm": 0.5570192337036133, + "learning_rate": 1.546670665606706e-06, + "loss": 0.0064, + "num_input_tokens_seen": 34480864, + "step": 163385 + }, + { + "epoch": 17.974697469746975, + "grad_norm": 0.09567795693874359, + "learning_rate": 1.545839691019757e-06, + "loss": 0.0185, + "num_input_tokens_seen": 34481856, + "step": 163390 + }, + { + "epoch": 17.975247524752476, + "grad_norm": 0.09899092465639114, + "learning_rate": 1.5450089325985516e-06, + "loss": 0.0054, + "num_input_tokens_seen": 34482880, + "step": 163395 + }, + { + "epoch": 17.975797579757977, + "grad_norm": 2.1231346130371094, + "learning_rate": 1.544178390350745e-06, + "loss": 0.0834, + "num_input_tokens_seen": 34483936, + "step": 163400 + }, + { + "epoch": 17.976347634763478, + "grad_norm": 0.012312456034123898, + "learning_rate": 1.5433480642839892e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34484992, + "step": 163405 + }, + { + "epoch": 17.976897689768975, + "grad_norm": 0.18704350292682648, + "learning_rate": 1.5425179544059392e-06, + "loss": 0.0835, + "num_input_tokens_seen": 34486016, + "step": 163410 + }, + { + "epoch": 17.977447744774476, + "grad_norm": 0.029804810881614685, + "learning_rate": 1.5416880607242446e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34487104, + "step": 163415 + }, + { + "epoch": 17.977997799779978, + "grad_norm": 0.00918524619191885, + "learning_rate": 1.5408583832465517e-06, + "loss": 0.0126, + "num_input_tokens_seen": 34488128, + "step": 163420 + }, + { + "epoch": 17.97854785478548, + "grad_norm": 0.7323847413063049, + "learning_rate": 1.5400289219805159e-06, + "loss": 0.1136, + "num_input_tokens_seen": 34489152, + "step": 163425 + }, + { + "epoch": 17.97909790979098, + "grad_norm": 0.09884901344776154, + "learning_rate": 1.5391996769337724e-06, + "loss": 0.0134, + "num_input_tokens_seen": 34490304, + "step": 163430 + }, + { + "epoch": 17.97964796479648, + "grad_norm": 0.3651660978794098, + "learning_rate": 1.5383706481139708e-06, + "loss": 0.0039, + "num_input_tokens_seen": 34491392, + "step": 163435 + }, + { + "epoch": 17.980198019801982, + "grad_norm": 0.015249207615852356, + "learning_rate": 1.537541835528747e-06, + "loss": 0.0419, + "num_input_tokens_seen": 34492480, + "step": 163440 + }, + { + "epoch": 17.98074807480748, + "grad_norm": 0.4189896583557129, + "learning_rate": 1.5367132391857385e-06, + "loss": 0.0037, + "num_input_tokens_seen": 34493568, + "step": 163445 + }, + { + "epoch": 17.98129812981298, + "grad_norm": 0.1815458983182907, + "learning_rate": 1.53588485909259e-06, + "loss": 0.0074, + "num_input_tokens_seen": 34494656, + "step": 163450 + }, + { + "epoch": 17.98184818481848, + "grad_norm": 0.4619535505771637, + "learning_rate": 1.5350566952569312e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34495680, + "step": 163455 + }, + { + "epoch": 17.982398239823983, + "grad_norm": 0.02967044897377491, + "learning_rate": 1.534228747686395e-06, + "loss": 0.0031, + "num_input_tokens_seen": 34496736, + "step": 163460 + }, + { + "epoch": 17.982948294829484, + "grad_norm": 0.15470048785209656, + "learning_rate": 1.533401016388611e-06, + "loss": 0.0538, + "num_input_tokens_seen": 34497824, + "step": 163465 + }, + { + "epoch": 17.983498349834985, + "grad_norm": 0.05177130922675133, + "learning_rate": 1.5325735013712123e-06, + "loss": 0.0053, + "num_input_tokens_seen": 34498848, + "step": 163470 + }, + { + "epoch": 17.984048404840483, + "grad_norm": 2.5927484035491943, + "learning_rate": 1.5317462026418206e-06, + "loss": 0.0132, + "num_input_tokens_seen": 34499904, + "step": 163475 + }, + { + "epoch": 17.984598459845984, + "grad_norm": 1.3788425922393799, + "learning_rate": 1.5309191202080602e-06, + "loss": 0.0729, + "num_input_tokens_seen": 34500960, + "step": 163480 + }, + { + "epoch": 17.985148514851485, + "grad_norm": 0.024899665266275406, + "learning_rate": 1.5300922540775614e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34501984, + "step": 163485 + }, + { + "epoch": 17.985698569856986, + "grad_norm": 0.01762731932103634, + "learning_rate": 1.5292656042579373e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34503040, + "step": 163490 + }, + { + "epoch": 17.986248624862487, + "grad_norm": 0.0032867095433175564, + "learning_rate": 1.528439170756815e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34504160, + "step": 163495 + }, + { + "epoch": 17.986798679867988, + "grad_norm": 0.41181236505508423, + "learning_rate": 1.5276129535818001e-06, + "loss": 0.0045, + "num_input_tokens_seen": 34505152, + "step": 163500 + }, + { + "epoch": 17.98734873487349, + "grad_norm": 0.01676108129322529, + "learning_rate": 1.5267869527405137e-06, + "loss": 0.0058, + "num_input_tokens_seen": 34506176, + "step": 163505 + }, + { + "epoch": 17.987898789878987, + "grad_norm": 0.5155669450759888, + "learning_rate": 1.5259611682405722e-06, + "loss": 0.0819, + "num_input_tokens_seen": 34507168, + "step": 163510 + }, + { + "epoch": 17.988448844884488, + "grad_norm": 0.029653271660208702, + "learning_rate": 1.5251356000895832e-06, + "loss": 0.0504, + "num_input_tokens_seen": 34508224, + "step": 163515 + }, + { + "epoch": 17.98899889988999, + "grad_norm": 0.03782951086759567, + "learning_rate": 1.524310248295152e-06, + "loss": 0.0063, + "num_input_tokens_seen": 34509312, + "step": 163520 + }, + { + "epoch": 17.98954895489549, + "grad_norm": 0.0298142246901989, + "learning_rate": 1.5234851128648919e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34510400, + "step": 163525 + }, + { + "epoch": 17.99009900990099, + "grad_norm": 0.00778444018214941, + "learning_rate": 1.5226601938063995e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34511392, + "step": 163530 + }, + { + "epoch": 17.990649064906492, + "grad_norm": 0.7302140593528748, + "learning_rate": 1.5218354911272825e-06, + "loss": 0.121, + "num_input_tokens_seen": 34512448, + "step": 163535 + }, + { + "epoch": 17.99119911991199, + "grad_norm": 0.006764795631170273, + "learning_rate": 1.5210110048351461e-06, + "loss": 0.0089, + "num_input_tokens_seen": 34513504, + "step": 163540 + }, + { + "epoch": 17.99174917491749, + "grad_norm": 0.009236638434231281, + "learning_rate": 1.5201867349375814e-06, + "loss": 0.0029, + "num_input_tokens_seen": 34514528, + "step": 163545 + }, + { + "epoch": 17.992299229922992, + "grad_norm": 0.2849055826663971, + "learning_rate": 1.5193626814421936e-06, + "loss": 0.0571, + "num_input_tokens_seen": 34515616, + "step": 163550 + }, + { + "epoch": 17.992849284928493, + "grad_norm": 0.017016926780343056, + "learning_rate": 1.5185388443565679e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34516672, + "step": 163555 + }, + { + "epoch": 17.993399339933994, + "grad_norm": 0.027041908353567123, + "learning_rate": 1.5177152236883012e-06, + "loss": 0.1451, + "num_input_tokens_seen": 34517760, + "step": 163560 + }, + { + "epoch": 17.993949394939495, + "grad_norm": 0.03875133395195007, + "learning_rate": 1.5168918194449906e-06, + "loss": 0.0081, + "num_input_tokens_seen": 34518816, + "step": 163565 + }, + { + "epoch": 17.994499449944996, + "grad_norm": 0.024668782949447632, + "learning_rate": 1.5160686316342155e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34519904, + "step": 163570 + }, + { + "epoch": 17.995049504950494, + "grad_norm": 1.0231499671936035, + "learning_rate": 1.5152456602635729e-06, + "loss": 0.0158, + "num_input_tokens_seen": 34520928, + "step": 163575 + }, + { + "epoch": 17.995599559955995, + "grad_norm": 0.013665138743817806, + "learning_rate": 1.5144229053406373e-06, + "loss": 0.0568, + "num_input_tokens_seen": 34521984, + "step": 163580 + }, + { + "epoch": 17.996149614961496, + "grad_norm": 0.37283241748809814, + "learning_rate": 1.513600366872997e-06, + "loss": 0.0621, + "num_input_tokens_seen": 34523040, + "step": 163585 + }, + { + "epoch": 17.996699669966997, + "grad_norm": 0.02041139081120491, + "learning_rate": 1.5127780448682322e-06, + "loss": 0.0764, + "num_input_tokens_seen": 34524128, + "step": 163590 + }, + { + "epoch": 17.997249724972498, + "grad_norm": 0.0948132798075676, + "learning_rate": 1.51195593933392e-06, + "loss": 0.0985, + "num_input_tokens_seen": 34525184, + "step": 163595 + }, + { + "epoch": 17.997799779978, + "grad_norm": 0.05410280451178551, + "learning_rate": 1.5111340502776433e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34526176, + "step": 163600 + }, + { + "epoch": 17.998349834983497, + "grad_norm": 0.04286143183708191, + "learning_rate": 1.5103123777069683e-06, + "loss": 0.0047, + "num_input_tokens_seen": 34527232, + "step": 163605 + }, + { + "epoch": 17.998899889988998, + "grad_norm": 1.4189900159835815, + "learning_rate": 1.5094909216294777e-06, + "loss": 0.0384, + "num_input_tokens_seen": 34528224, + "step": 163610 + }, + { + "epoch": 17.9994499449945, + "grad_norm": 0.00626932829618454, + "learning_rate": 1.5086696820527352e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34529184, + "step": 163615 + }, + { + "epoch": 18.0, + "grad_norm": 0.0031000529415905476, + "learning_rate": 1.5078486589843093e-06, + "loss": 0.0958, + "num_input_tokens_seen": 34530176, + "step": 163620 + }, + { + "epoch": 18.0, + "eval_loss": 0.0780656486749649, + "eval_runtime": 37.0143, + "eval_samples_per_second": 109.147, + "eval_steps_per_second": 27.287, + "num_input_tokens_seen": 34530176, + "step": 163620 + }, + { + "epoch": 18.0005500550055, + "grad_norm": 0.1571768969297409, + "learning_rate": 1.507027852431775e-06, + "loss": 0.005, + "num_input_tokens_seen": 34531232, + "step": 163625 + }, + { + "epoch": 18.001100110011002, + "grad_norm": 0.049295905977487564, + "learning_rate": 1.5062072624026868e-06, + "loss": 0.0041, + "num_input_tokens_seen": 34532256, + "step": 163630 + }, + { + "epoch": 18.001650165016503, + "grad_norm": 0.06783606112003326, + "learning_rate": 1.5053868889046169e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34533344, + "step": 163635 + }, + { + "epoch": 18.002200220022, + "grad_norm": 6.527153968811035, + "learning_rate": 1.5045667319451228e-06, + "loss": 0.0498, + "num_input_tokens_seen": 34534400, + "step": 163640 + }, + { + "epoch": 18.002750275027502, + "grad_norm": 0.038892004638910294, + "learning_rate": 1.50374679153176e-06, + "loss": 0.004, + "num_input_tokens_seen": 34535424, + "step": 163645 + }, + { + "epoch": 18.003300330033003, + "grad_norm": 2.2228519916534424, + "learning_rate": 1.5029270676720885e-06, + "loss": 0.0562, + "num_input_tokens_seen": 34536480, + "step": 163650 + }, + { + "epoch": 18.003850385038504, + "grad_norm": 0.005954378750175238, + "learning_rate": 1.5021075603736613e-06, + "loss": 0.0027, + "num_input_tokens_seen": 34537504, + "step": 163655 + }, + { + "epoch": 18.004400440044005, + "grad_norm": 1.7274757623672485, + "learning_rate": 1.5012882696440384e-06, + "loss": 0.0257, + "num_input_tokens_seen": 34538624, + "step": 163660 + }, + { + "epoch": 18.004950495049506, + "grad_norm": 0.007251658942550421, + "learning_rate": 1.500469195490764e-06, + "loss": 0.0061, + "num_input_tokens_seen": 34539616, + "step": 163665 + }, + { + "epoch": 18.005500550055004, + "grad_norm": 1.5310828685760498, + "learning_rate": 1.4996503379213878e-06, + "loss": 0.0653, + "num_input_tokens_seen": 34540672, + "step": 163670 + }, + { + "epoch": 18.006050605060505, + "grad_norm": 0.07165779173374176, + "learning_rate": 1.4988316969434564e-06, + "loss": 0.0031, + "num_input_tokens_seen": 34541760, + "step": 163675 + }, + { + "epoch": 18.006600660066006, + "grad_norm": 0.011142410337924957, + "learning_rate": 1.4980132725645163e-06, + "loss": 0.0006, + "num_input_tokens_seen": 34542816, + "step": 163680 + }, + { + "epoch": 18.007150715071507, + "grad_norm": 0.028476344421505928, + "learning_rate": 1.4971950647921146e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34543840, + "step": 163685 + }, + { + "epoch": 18.007700770077008, + "grad_norm": 1.1131446361541748, + "learning_rate": 1.4963770736337896e-06, + "loss": 0.007, + "num_input_tokens_seen": 34544896, + "step": 163690 + }, + { + "epoch": 18.00825082508251, + "grad_norm": 0.05606638267636299, + "learning_rate": 1.4955592990970712e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34546016, + "step": 163695 + }, + { + "epoch": 18.00880088008801, + "grad_norm": 0.005963836796581745, + "learning_rate": 1.494741741189512e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34547040, + "step": 163700 + }, + { + "epoch": 18.009350935093508, + "grad_norm": 0.02228737249970436, + "learning_rate": 1.4939243999186336e-06, + "loss": 0.0274, + "num_input_tokens_seen": 34548064, + "step": 163705 + }, + { + "epoch": 18.00990099009901, + "grad_norm": 0.07936880737543106, + "learning_rate": 1.4931072752919746e-06, + "loss": 0.0049, + "num_input_tokens_seen": 34549152, + "step": 163710 + }, + { + "epoch": 18.01045104510451, + "grad_norm": 0.1641170084476471, + "learning_rate": 1.4922903673170703e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34550240, + "step": 163715 + }, + { + "epoch": 18.01100110011001, + "grad_norm": 0.002857352839782834, + "learning_rate": 1.4914736760014425e-06, + "loss": 0.0373, + "num_input_tokens_seen": 34551264, + "step": 163720 + }, + { + "epoch": 18.011551155115512, + "grad_norm": 0.8169304728507996, + "learning_rate": 1.4906572013526243e-06, + "loss": 0.0123, + "num_input_tokens_seen": 34552352, + "step": 163725 + }, + { + "epoch": 18.012101210121013, + "grad_norm": 0.3550679385662079, + "learning_rate": 1.4898409433781347e-06, + "loss": 0.0066, + "num_input_tokens_seen": 34553440, + "step": 163730 + }, + { + "epoch": 18.01265126512651, + "grad_norm": 0.025235261768102646, + "learning_rate": 1.4890249020855007e-06, + "loss": 0.0009, + "num_input_tokens_seen": 34554528, + "step": 163735 + }, + { + "epoch": 18.013201320132012, + "grad_norm": 0.03155888244509697, + "learning_rate": 1.4882090774822444e-06, + "loss": 0.0352, + "num_input_tokens_seen": 34555616, + "step": 163740 + }, + { + "epoch": 18.013751375137513, + "grad_norm": 0.02179444581270218, + "learning_rate": 1.487393469575879e-06, + "loss": 0.1538, + "num_input_tokens_seen": 34556672, + "step": 163745 + }, + { + "epoch": 18.014301430143014, + "grad_norm": 0.08535356819629669, + "learning_rate": 1.4865780783739292e-06, + "loss": 0.0315, + "num_input_tokens_seen": 34557696, + "step": 163750 + }, + { + "epoch": 18.014851485148515, + "grad_norm": 0.009476137347519398, + "learning_rate": 1.4857629038839083e-06, + "loss": 0.0198, + "num_input_tokens_seen": 34558752, + "step": 163755 + }, + { + "epoch": 18.015401540154016, + "grad_norm": 0.013614031486213207, + "learning_rate": 1.4849479461133214e-06, + "loss": 0.0797, + "num_input_tokens_seen": 34559808, + "step": 163760 + }, + { + "epoch": 18.015951595159517, + "grad_norm": 0.011729325167834759, + "learning_rate": 1.4841332050696877e-06, + "loss": 0.0005, + "num_input_tokens_seen": 34560768, + "step": 163765 + }, + { + "epoch": 18.016501650165015, + "grad_norm": 0.16961967945098877, + "learning_rate": 1.483318680760515e-06, + "loss": 0.076, + "num_input_tokens_seen": 34561824, + "step": 163770 + }, + { + "epoch": 18.017051705170516, + "grad_norm": 0.013606159016489983, + "learning_rate": 1.4825043731933108e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34562976, + "step": 163775 + }, + { + "epoch": 18.017601760176017, + "grad_norm": 0.008014489896595478, + "learning_rate": 1.4816902823755808e-06, + "loss": 0.0099, + "num_input_tokens_seen": 34564064, + "step": 163780 + }, + { + "epoch": 18.01815181518152, + "grad_norm": 0.7539157867431641, + "learning_rate": 1.4808764083148212e-06, + "loss": 0.048, + "num_input_tokens_seen": 34565056, + "step": 163785 + }, + { + "epoch": 18.01870187018702, + "grad_norm": 0.0073305112309753895, + "learning_rate": 1.4800627510185379e-06, + "loss": 0.052, + "num_input_tokens_seen": 34566144, + "step": 163790 + }, + { + "epoch": 18.01925192519252, + "grad_norm": 0.02753070741891861, + "learning_rate": 1.4792493104942324e-06, + "loss": 0.0802, + "num_input_tokens_seen": 34567136, + "step": 163795 + }, + { + "epoch": 18.019801980198018, + "grad_norm": 0.1362370401620865, + "learning_rate": 1.4784360867494017e-06, + "loss": 0.0934, + "num_input_tokens_seen": 34568160, + "step": 163800 + }, + { + "epoch": 18.02035203520352, + "grad_norm": 0.002265094080939889, + "learning_rate": 1.4776230797915402e-06, + "loss": 0.0027, + "num_input_tokens_seen": 34569184, + "step": 163805 + }, + { + "epoch": 18.02090209020902, + "grad_norm": 0.030761761590838432, + "learning_rate": 1.4768102896281333e-06, + "loss": 0.0041, + "num_input_tokens_seen": 34570240, + "step": 163810 + }, + { + "epoch": 18.02145214521452, + "grad_norm": 1.747368335723877, + "learning_rate": 1.4759977162666861e-06, + "loss": 0.2229, + "num_input_tokens_seen": 34571360, + "step": 163815 + }, + { + "epoch": 18.022002200220022, + "grad_norm": 0.06978137791156769, + "learning_rate": 1.475185359714673e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34572448, + "step": 163820 + }, + { + "epoch": 18.022552255225524, + "grad_norm": 0.38148587942123413, + "learning_rate": 1.4743732199795885e-06, + "loss": 0.0609, + "num_input_tokens_seen": 34573440, + "step": 163825 + }, + { + "epoch": 18.023102310231025, + "grad_norm": 3.698143720626831, + "learning_rate": 1.4735612970689233e-06, + "loss": 0.0806, + "num_input_tokens_seen": 34574496, + "step": 163830 + }, + { + "epoch": 18.023652365236522, + "grad_norm": 0.011685153469443321, + "learning_rate": 1.4727495909901495e-06, + "loss": 0.001, + "num_input_tokens_seen": 34575520, + "step": 163835 + }, + { + "epoch": 18.024202420242023, + "grad_norm": 0.04977903142571449, + "learning_rate": 1.4719381017507555e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34576576, + "step": 163840 + }, + { + "epoch": 18.024752475247524, + "grad_norm": 0.05557410046458244, + "learning_rate": 1.471126829358216e-06, + "loss": 0.0056, + "num_input_tokens_seen": 34577632, + "step": 163845 + }, + { + "epoch": 18.025302530253025, + "grad_norm": 0.01353904977440834, + "learning_rate": 1.4703157738200108e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34578656, + "step": 163850 + }, + { + "epoch": 18.025852585258527, + "grad_norm": 0.013128932565450668, + "learning_rate": 1.4695049351436175e-06, + "loss": 0.0411, + "num_input_tokens_seen": 34579648, + "step": 163855 + }, + { + "epoch": 18.026402640264028, + "grad_norm": 0.023498941212892532, + "learning_rate": 1.4686943133365023e-06, + "loss": 0.0275, + "num_input_tokens_seen": 34580672, + "step": 163860 + }, + { + "epoch": 18.02695269526953, + "grad_norm": 0.01787073351442814, + "learning_rate": 1.4678839084061424e-06, + "loss": 0.0302, + "num_input_tokens_seen": 34581760, + "step": 163865 + }, + { + "epoch": 18.027502750275026, + "grad_norm": 0.06669086962938309, + "learning_rate": 1.4670737203600043e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34582752, + "step": 163870 + }, + { + "epoch": 18.028052805280527, + "grad_norm": 0.0163679588586092, + "learning_rate": 1.466263749205557e-06, + "loss": 0.0022, + "num_input_tokens_seen": 34583872, + "step": 163875 + }, + { + "epoch": 18.02860286028603, + "grad_norm": 0.1376543492078781, + "learning_rate": 1.465453994950261e-06, + "loss": 0.0031, + "num_input_tokens_seen": 34584832, + "step": 163880 + }, + { + "epoch": 18.02915291529153, + "grad_norm": 0.03269076719880104, + "learning_rate": 1.4646444576015828e-06, + "loss": 0.0444, + "num_input_tokens_seen": 34585888, + "step": 163885 + }, + { + "epoch": 18.02970297029703, + "grad_norm": 0.015165291726589203, + "learning_rate": 1.4638351371669857e-06, + "loss": 0.1081, + "num_input_tokens_seen": 34586944, + "step": 163890 + }, + { + "epoch": 18.03025302530253, + "grad_norm": 0.027494307607412338, + "learning_rate": 1.4630260336539275e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34587936, + "step": 163895 + }, + { + "epoch": 18.03080308030803, + "grad_norm": 0.23643207550048828, + "learning_rate": 1.462217147069861e-06, + "loss": 0.0114, + "num_input_tokens_seen": 34589024, + "step": 163900 + }, + { + "epoch": 18.03135313531353, + "grad_norm": 0.0096452496945858, + "learning_rate": 1.4614084774222464e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34590112, + "step": 163905 + }, + { + "epoch": 18.03190319031903, + "grad_norm": 0.27464067935943604, + "learning_rate": 1.460600024718528e-06, + "loss": 0.0294, + "num_input_tokens_seen": 34591104, + "step": 163910 + }, + { + "epoch": 18.032453245324533, + "grad_norm": 0.013040488585829735, + "learning_rate": 1.4597917889661722e-06, + "loss": 0.0009, + "num_input_tokens_seen": 34592160, + "step": 163915 + }, + { + "epoch": 18.033003300330034, + "grad_norm": 0.03974749520421028, + "learning_rate": 1.45898377017262e-06, + "loss": 0.0437, + "num_input_tokens_seen": 34593216, + "step": 163920 + }, + { + "epoch": 18.033553355335535, + "grad_norm": 0.09949537366628647, + "learning_rate": 1.4581759683453156e-06, + "loss": 0.1176, + "num_input_tokens_seen": 34594304, + "step": 163925 + }, + { + "epoch": 18.034103410341036, + "grad_norm": 0.5231912732124329, + "learning_rate": 1.4573683834917085e-06, + "loss": 0.0059, + "num_input_tokens_seen": 34595360, + "step": 163930 + }, + { + "epoch": 18.034653465346533, + "grad_norm": 0.0696643814444542, + "learning_rate": 1.456561015619237e-06, + "loss": 0.0297, + "num_input_tokens_seen": 34596384, + "step": 163935 + }, + { + "epoch": 18.035203520352034, + "grad_norm": 0.8701348304748535, + "learning_rate": 1.4557538647353458e-06, + "loss": 0.0056, + "num_input_tokens_seen": 34597472, + "step": 163940 + }, + { + "epoch": 18.035753575357536, + "grad_norm": 4.164614200592041, + "learning_rate": 1.4549469308474755e-06, + "loss": 0.1183, + "num_input_tokens_seen": 34598528, + "step": 163945 + }, + { + "epoch": 18.036303630363037, + "grad_norm": 0.016342155635356903, + "learning_rate": 1.4541402139630594e-06, + "loss": 0.0036, + "num_input_tokens_seen": 34599584, + "step": 163950 + }, + { + "epoch": 18.036853685368538, + "grad_norm": 0.030579933896660805, + "learning_rate": 1.4533337140895386e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34600704, + "step": 163955 + }, + { + "epoch": 18.03740374037404, + "grad_norm": 0.04267895221710205, + "learning_rate": 1.452527431234338e-06, + "loss": 0.0339, + "num_input_tokens_seen": 34601792, + "step": 163960 + }, + { + "epoch": 18.037953795379536, + "grad_norm": 0.4173809885978699, + "learning_rate": 1.4517213654048929e-06, + "loss": 0.0077, + "num_input_tokens_seen": 34602848, + "step": 163965 + }, + { + "epoch": 18.038503850385037, + "grad_norm": 1.9227678775787354, + "learning_rate": 1.4509155166086363e-06, + "loss": 0.0485, + "num_input_tokens_seen": 34603872, + "step": 163970 + }, + { + "epoch": 18.03905390539054, + "grad_norm": 0.07144738733768463, + "learning_rate": 1.4501098848529875e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34604864, + "step": 163975 + }, + { + "epoch": 18.03960396039604, + "grad_norm": 0.047062478959560394, + "learning_rate": 1.4493044701453766e-06, + "loss": 0.0014, + "num_input_tokens_seen": 34605888, + "step": 163980 + }, + { + "epoch": 18.04015401540154, + "grad_norm": 0.13381102681159973, + "learning_rate": 1.4484992724932257e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34606880, + "step": 163985 + }, + { + "epoch": 18.040704070407042, + "grad_norm": 0.03501543775200844, + "learning_rate": 1.4476942919039587e-06, + "loss": 0.0098, + "num_input_tokens_seen": 34607968, + "step": 163990 + }, + { + "epoch": 18.041254125412543, + "grad_norm": 0.0038400711491703987, + "learning_rate": 1.446889528384987e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34609024, + "step": 163995 + }, + { + "epoch": 18.04180418041804, + "grad_norm": 0.01998189091682434, + "learning_rate": 1.446084981943735e-06, + "loss": 0.0009, + "num_input_tokens_seen": 34610080, + "step": 164000 + }, + { + "epoch": 18.04235423542354, + "grad_norm": 0.10135583579540253, + "learning_rate": 1.445280652587616e-06, + "loss": 0.0027, + "num_input_tokens_seen": 34611168, + "step": 164005 + }, + { + "epoch": 18.042904290429043, + "grad_norm": 0.18617761135101318, + "learning_rate": 1.4444765403240413e-06, + "loss": 0.004, + "num_input_tokens_seen": 34612256, + "step": 164010 + }, + { + "epoch": 18.043454345434544, + "grad_norm": 0.08843056857585907, + "learning_rate": 1.4436726451604266e-06, + "loss": 0.0052, + "num_input_tokens_seen": 34613376, + "step": 164015 + }, + { + "epoch": 18.044004400440045, + "grad_norm": 0.31746143102645874, + "learning_rate": 1.4428689671041774e-06, + "loss": 0.0052, + "num_input_tokens_seen": 34614464, + "step": 164020 + }, + { + "epoch": 18.044554455445546, + "grad_norm": 0.06501135230064392, + "learning_rate": 1.4420655061626932e-06, + "loss": 0.0258, + "num_input_tokens_seen": 34615456, + "step": 164025 + }, + { + "epoch": 18.045104510451043, + "grad_norm": 0.010541434399783611, + "learning_rate": 1.4412622623433958e-06, + "loss": 0.0006, + "num_input_tokens_seen": 34616448, + "step": 164030 + }, + { + "epoch": 18.045654565456545, + "grad_norm": 0.369708389043808, + "learning_rate": 1.440459235653674e-06, + "loss": 0.0061, + "num_input_tokens_seen": 34617472, + "step": 164035 + }, + { + "epoch": 18.046204620462046, + "grad_norm": 0.2156660407781601, + "learning_rate": 1.4396564261009382e-06, + "loss": 0.001, + "num_input_tokens_seen": 34618624, + "step": 164040 + }, + { + "epoch": 18.046754675467547, + "grad_norm": 0.022735802456736565, + "learning_rate": 1.4388538336925856e-06, + "loss": 0.0062, + "num_input_tokens_seen": 34619680, + "step": 164045 + }, + { + "epoch": 18.047304730473048, + "grad_norm": 0.05547074228525162, + "learning_rate": 1.4380514584360072e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34620800, + "step": 164050 + }, + { + "epoch": 18.04785478547855, + "grad_norm": 2.205071449279785, + "learning_rate": 1.4372493003386028e-06, + "loss": 0.0946, + "num_input_tokens_seen": 34621856, + "step": 164055 + }, + { + "epoch": 18.04840484048405, + "grad_norm": 0.05277260020375252, + "learning_rate": 1.4364473594077639e-06, + "loss": 0.0638, + "num_input_tokens_seen": 34622944, + "step": 164060 + }, + { + "epoch": 18.048954895489548, + "grad_norm": 0.008133924566209316, + "learning_rate": 1.435645635650887e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34624000, + "step": 164065 + }, + { + "epoch": 18.04950495049505, + "grad_norm": 0.01314831804484129, + "learning_rate": 1.4348441290753551e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34624992, + "step": 164070 + }, + { + "epoch": 18.05005500550055, + "grad_norm": 0.018357373774051666, + "learning_rate": 1.434042839688557e-06, + "loss": 0.0139, + "num_input_tokens_seen": 34626016, + "step": 164075 + }, + { + "epoch": 18.05060506050605, + "grad_norm": 0.01787177473306656, + "learning_rate": 1.4332417674978782e-06, + "loss": 0.0384, + "num_input_tokens_seen": 34627104, + "step": 164080 + }, + { + "epoch": 18.051155115511552, + "grad_norm": 0.042506616562604904, + "learning_rate": 1.4324409125107018e-06, + "loss": 0.0043, + "num_input_tokens_seen": 34628160, + "step": 164085 + }, + { + "epoch": 18.051705170517053, + "grad_norm": 0.5715176463127136, + "learning_rate": 1.4316402747344055e-06, + "loss": 0.0072, + "num_input_tokens_seen": 34629248, + "step": 164090 + }, + { + "epoch": 18.05225522552255, + "grad_norm": 0.032346196472644806, + "learning_rate": 1.4308398541763773e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34630368, + "step": 164095 + }, + { + "epoch": 18.05280528052805, + "grad_norm": 0.025119544938206673, + "learning_rate": 1.430039650843984e-06, + "loss": 0.0004, + "num_input_tokens_seen": 34631424, + "step": 164100 + }, + { + "epoch": 18.053355335533553, + "grad_norm": 0.04053569957613945, + "learning_rate": 1.4292396647446081e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34632512, + "step": 164105 + }, + { + "epoch": 18.053905390539054, + "grad_norm": 0.04980635270476341, + "learning_rate": 1.4284398958856165e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34633504, + "step": 164110 + }, + { + "epoch": 18.054455445544555, + "grad_norm": 0.04281248897314072, + "learning_rate": 1.4276403442743835e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34634560, + "step": 164115 + }, + { + "epoch": 18.055005500550056, + "grad_norm": 0.02509734220802784, + "learning_rate": 1.426841009918284e-06, + "loss": 0.1036, + "num_input_tokens_seen": 34635584, + "step": 164120 + }, + { + "epoch": 18.055555555555557, + "grad_norm": 0.002542249858379364, + "learning_rate": 1.4260418928246727e-06, + "loss": 0.1003, + "num_input_tokens_seen": 34636640, + "step": 164125 + }, + { + "epoch": 18.056105610561055, + "grad_norm": 0.018689345568418503, + "learning_rate": 1.4252429930009276e-06, + "loss": 0.0005, + "num_input_tokens_seen": 34637696, + "step": 164130 + }, + { + "epoch": 18.056655665566556, + "grad_norm": 0.011841030791401863, + "learning_rate": 1.4244443104544064e-06, + "loss": 0.012, + "num_input_tokens_seen": 34638784, + "step": 164135 + }, + { + "epoch": 18.057205720572057, + "grad_norm": 0.005831110756844282, + "learning_rate": 1.4236458451924617e-06, + "loss": 0.0359, + "num_input_tokens_seen": 34639840, + "step": 164140 + }, + { + "epoch": 18.057755775577558, + "grad_norm": 0.04521019384264946, + "learning_rate": 1.4228475972224625e-06, + "loss": 0.0044, + "num_input_tokens_seen": 34640832, + "step": 164145 + }, + { + "epoch": 18.05830583058306, + "grad_norm": 0.015085697174072266, + "learning_rate": 1.4220495665517642e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34641920, + "step": 164150 + }, + { + "epoch": 18.05885588558856, + "grad_norm": 0.6913699507713318, + "learning_rate": 1.4212517531877246e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34642944, + "step": 164155 + }, + { + "epoch": 18.059405940594058, + "grad_norm": 0.014115162193775177, + "learning_rate": 1.4204541571376933e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34643936, + "step": 164160 + }, + { + "epoch": 18.05995599559956, + "grad_norm": 0.5888715386390686, + "learning_rate": 1.4196567784090177e-06, + "loss": 0.0086, + "num_input_tokens_seen": 34644992, + "step": 164165 + }, + { + "epoch": 18.06050605060506, + "grad_norm": 0.05859772861003876, + "learning_rate": 1.4188596170090495e-06, + "loss": 0.005, + "num_input_tokens_seen": 34646048, + "step": 164170 + }, + { + "epoch": 18.06105610561056, + "grad_norm": 3.6954853534698486, + "learning_rate": 1.418062672945139e-06, + "loss": 0.0849, + "num_input_tokens_seen": 34647072, + "step": 164175 + }, + { + "epoch": 18.061606160616062, + "grad_norm": 1.278615117073059, + "learning_rate": 1.41726594622463e-06, + "loss": 0.0753, + "num_input_tokens_seen": 34648096, + "step": 164180 + }, + { + "epoch": 18.062156215621563, + "grad_norm": 0.05085943639278412, + "learning_rate": 1.416469436854867e-06, + "loss": 0.019, + "num_input_tokens_seen": 34649056, + "step": 164185 + }, + { + "epoch": 18.062706270627064, + "grad_norm": 0.24300681054592133, + "learning_rate": 1.4156731448431853e-06, + "loss": 0.0048, + "num_input_tokens_seen": 34650112, + "step": 164190 + }, + { + "epoch": 18.063256325632562, + "grad_norm": 0.022627277299761772, + "learning_rate": 1.4148770701969295e-06, + "loss": 0.034, + "num_input_tokens_seen": 34651200, + "step": 164195 + }, + { + "epoch": 18.063806380638063, + "grad_norm": 0.030838822945952415, + "learning_rate": 1.4140812129234321e-06, + "loss": 0.0057, + "num_input_tokens_seen": 34652192, + "step": 164200 + }, + { + "epoch": 18.064356435643564, + "grad_norm": 0.05639466270804405, + "learning_rate": 1.4132855730300294e-06, + "loss": 0.0051, + "num_input_tokens_seen": 34653216, + "step": 164205 + }, + { + "epoch": 18.064906490649065, + "grad_norm": 0.015061833895742893, + "learning_rate": 1.4124901505240596e-06, + "loss": 0.1021, + "num_input_tokens_seen": 34654304, + "step": 164210 + }, + { + "epoch": 18.065456545654566, + "grad_norm": 0.015692131593823433, + "learning_rate": 1.4116949454128476e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34655360, + "step": 164215 + }, + { + "epoch": 18.066006600660067, + "grad_norm": 0.013121824711561203, + "learning_rate": 1.4108999577037263e-06, + "loss": 0.0064, + "num_input_tokens_seen": 34656416, + "step": 164220 + }, + { + "epoch": 18.066556655665565, + "grad_norm": 0.12194347381591797, + "learning_rate": 1.4101051874040177e-06, + "loss": 0.0312, + "num_input_tokens_seen": 34657472, + "step": 164225 + }, + { + "epoch": 18.067106710671066, + "grad_norm": 0.0402066670358181, + "learning_rate": 1.4093106345210494e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34658528, + "step": 164230 + }, + { + "epoch": 18.067656765676567, + "grad_norm": 0.10548073053359985, + "learning_rate": 1.4085162990621486e-06, + "loss": 0.0811, + "num_input_tokens_seen": 34659520, + "step": 164235 + }, + { + "epoch": 18.068206820682068, + "grad_norm": 0.014146259985864162, + "learning_rate": 1.407722181034632e-06, + "loss": 0.1095, + "num_input_tokens_seen": 34660512, + "step": 164240 + }, + { + "epoch": 18.06875687568757, + "grad_norm": 1.8590235710144043, + "learning_rate": 1.406928280445821e-06, + "loss": 0.064, + "num_input_tokens_seen": 34661664, + "step": 164245 + }, + { + "epoch": 18.06930693069307, + "grad_norm": 0.0062920162454247475, + "learning_rate": 1.406134597303027e-06, + "loss": 0.0004, + "num_input_tokens_seen": 34662688, + "step": 164250 + }, + { + "epoch": 18.06985698569857, + "grad_norm": 0.1865733414888382, + "learning_rate": 1.4053411316135718e-06, + "loss": 0.0055, + "num_input_tokens_seen": 34663744, + "step": 164255 + }, + { + "epoch": 18.07040704070407, + "grad_norm": 0.16654983162879944, + "learning_rate": 1.404547883384763e-06, + "loss": 0.0039, + "num_input_tokens_seen": 34664800, + "step": 164260 + }, + { + "epoch": 18.07095709570957, + "grad_norm": 0.15284976363182068, + "learning_rate": 1.4037548526239147e-06, + "loss": 0.0056, + "num_input_tokens_seen": 34665856, + "step": 164265 + }, + { + "epoch": 18.07150715071507, + "grad_norm": 0.056068576872348785, + "learning_rate": 1.4029620393383376e-06, + "loss": 0.0358, + "num_input_tokens_seen": 34666944, + "step": 164270 + }, + { + "epoch": 18.072057205720572, + "grad_norm": 0.007815780118107796, + "learning_rate": 1.4021694435353393e-06, + "loss": 0.0005, + "num_input_tokens_seen": 34668032, + "step": 164275 + }, + { + "epoch": 18.072607260726073, + "grad_norm": 0.016756197437644005, + "learning_rate": 1.4013770652222146e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34669088, + "step": 164280 + }, + { + "epoch": 18.073157315731574, + "grad_norm": 0.015053221955895424, + "learning_rate": 1.4005849044062769e-06, + "loss": 0.0347, + "num_input_tokens_seen": 34670176, + "step": 164285 + }, + { + "epoch": 18.073707370737075, + "grad_norm": 0.008495467714965343, + "learning_rate": 1.399792961094823e-06, + "loss": 0.1887, + "num_input_tokens_seen": 34671200, + "step": 164290 + }, + { + "epoch": 18.074257425742573, + "grad_norm": 0.008436255156993866, + "learning_rate": 1.3990012352951554e-06, + "loss": 0.0006, + "num_input_tokens_seen": 34672288, + "step": 164295 + }, + { + "epoch": 18.074807480748074, + "grad_norm": 0.11519575119018555, + "learning_rate": 1.3982097270145684e-06, + "loss": 0.0014, + "num_input_tokens_seen": 34673376, + "step": 164300 + }, + { + "epoch": 18.075357535753575, + "grad_norm": 0.019209615886211395, + "learning_rate": 1.397418436260356e-06, + "loss": 0.0297, + "num_input_tokens_seen": 34674400, + "step": 164305 + }, + { + "epoch": 18.075907590759076, + "grad_norm": 0.0445735938847065, + "learning_rate": 1.3966273630398124e-06, + "loss": 0.0301, + "num_input_tokens_seen": 34675392, + "step": 164310 + }, + { + "epoch": 18.076457645764577, + "grad_norm": 2.5270309448242188, + "learning_rate": 1.3958365073602264e-06, + "loss": 0.1255, + "num_input_tokens_seen": 34676512, + "step": 164315 + }, + { + "epoch": 18.07700770077008, + "grad_norm": 0.010162685997784138, + "learning_rate": 1.3950458692288893e-06, + "loss": 0.0454, + "num_input_tokens_seen": 34677568, + "step": 164320 + }, + { + "epoch": 18.077557755775576, + "grad_norm": 0.0029363539069890976, + "learning_rate": 1.3942554486530924e-06, + "loss": 0.0062, + "num_input_tokens_seen": 34678592, + "step": 164325 + }, + { + "epoch": 18.078107810781077, + "grad_norm": 0.04287924990057945, + "learning_rate": 1.3934652456401103e-06, + "loss": 0.0352, + "num_input_tokens_seen": 34679680, + "step": 164330 + }, + { + "epoch": 18.078657865786578, + "grad_norm": 0.07680865377187729, + "learning_rate": 1.3926752601972376e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34680672, + "step": 164335 + }, + { + "epoch": 18.07920792079208, + "grad_norm": 0.09531525522470474, + "learning_rate": 1.391885492331743e-06, + "loss": 0.0022, + "num_input_tokens_seen": 34681760, + "step": 164340 + }, + { + "epoch": 18.07975797579758, + "grad_norm": 1.7426549196243286, + "learning_rate": 1.3910959420509128e-06, + "loss": 0.007, + "num_input_tokens_seen": 34682752, + "step": 164345 + }, + { + "epoch": 18.08030803080308, + "grad_norm": 0.011170675978064537, + "learning_rate": 1.3903066093620242e-06, + "loss": 0.005, + "num_input_tokens_seen": 34683840, + "step": 164350 + }, + { + "epoch": 18.080858085808583, + "grad_norm": 0.1008124053478241, + "learning_rate": 1.3895174942723494e-06, + "loss": 0.0567, + "num_input_tokens_seen": 34684928, + "step": 164355 + }, + { + "epoch": 18.08140814081408, + "grad_norm": 0.013561991974711418, + "learning_rate": 1.3887285967891655e-06, + "loss": 0.0595, + "num_input_tokens_seen": 34685984, + "step": 164360 + }, + { + "epoch": 18.08195819581958, + "grad_norm": 0.01441359706223011, + "learning_rate": 1.3879399169197366e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34687040, + "step": 164365 + }, + { + "epoch": 18.082508250825082, + "grad_norm": 0.009838929399847984, + "learning_rate": 1.387151454671337e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34688096, + "step": 164370 + }, + { + "epoch": 18.083058305830583, + "grad_norm": 0.11329150199890137, + "learning_rate": 1.3863632100512308e-06, + "loss": 0.2023, + "num_input_tokens_seen": 34689088, + "step": 164375 + }, + { + "epoch": 18.083608360836084, + "grad_norm": 0.5944956541061401, + "learning_rate": 1.3855751830666841e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34690144, + "step": 164380 + }, + { + "epoch": 18.084158415841586, + "grad_norm": 0.8492432236671448, + "learning_rate": 1.3847873737249606e-06, + "loss": 0.0076, + "num_input_tokens_seen": 34691136, + "step": 164385 + }, + { + "epoch": 18.084708470847083, + "grad_norm": 0.062256861478090286, + "learning_rate": 1.3839997820333183e-06, + "loss": 0.006, + "num_input_tokens_seen": 34692192, + "step": 164390 + }, + { + "epoch": 18.085258525852584, + "grad_norm": 0.008399829268455505, + "learning_rate": 1.383212407999021e-06, + "loss": 0.0029, + "num_input_tokens_seen": 34693216, + "step": 164395 + }, + { + "epoch": 18.085808580858085, + "grad_norm": 0.12013866752386093, + "learning_rate": 1.3824252516293212e-06, + "loss": 0.018, + "num_input_tokens_seen": 34694368, + "step": 164400 + }, + { + "epoch": 18.086358635863586, + "grad_norm": 0.30309295654296875, + "learning_rate": 1.3816383129314714e-06, + "loss": 0.0155, + "num_input_tokens_seen": 34695392, + "step": 164405 + }, + { + "epoch": 18.086908690869087, + "grad_norm": 0.08005648106336594, + "learning_rate": 1.3808515919127352e-06, + "loss": 0.0991, + "num_input_tokens_seen": 34696544, + "step": 164410 + }, + { + "epoch": 18.08745874587459, + "grad_norm": 0.09606841951608658, + "learning_rate": 1.3800650885803513e-06, + "loss": 0.022, + "num_input_tokens_seen": 34697568, + "step": 164415 + }, + { + "epoch": 18.08800880088009, + "grad_norm": 0.022273199632763863, + "learning_rate": 1.3792788029415775e-06, + "loss": 0.0286, + "num_input_tokens_seen": 34698624, + "step": 164420 + }, + { + "epoch": 18.088558855885587, + "grad_norm": 0.0498291440308094, + "learning_rate": 1.3784927350036558e-06, + "loss": 0.0988, + "num_input_tokens_seen": 34699680, + "step": 164425 + }, + { + "epoch": 18.08910891089109, + "grad_norm": 0.024640653282403946, + "learning_rate": 1.3777068847738272e-06, + "loss": 0.0029, + "num_input_tokens_seen": 34700864, + "step": 164430 + }, + { + "epoch": 18.08965896589659, + "grad_norm": 0.07239100337028503, + "learning_rate": 1.3769212522593417e-06, + "loss": 0.002, + "num_input_tokens_seen": 34702016, + "step": 164435 + }, + { + "epoch": 18.09020902090209, + "grad_norm": 0.04283435642719269, + "learning_rate": 1.3761358374674348e-06, + "loss": 0.026, + "num_input_tokens_seen": 34703104, + "step": 164440 + }, + { + "epoch": 18.09075907590759, + "grad_norm": 0.023291343823075294, + "learning_rate": 1.375350640405354e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34704096, + "step": 164445 + }, + { + "epoch": 18.091309130913093, + "grad_norm": 0.04547366872429848, + "learning_rate": 1.3745656610803292e-06, + "loss": 0.0361, + "num_input_tokens_seen": 34705152, + "step": 164450 + }, + { + "epoch": 18.09185918591859, + "grad_norm": 0.046971872448921204, + "learning_rate": 1.3737808994995937e-06, + "loss": 0.0073, + "num_input_tokens_seen": 34706176, + "step": 164455 + }, + { + "epoch": 18.09240924092409, + "grad_norm": 0.017255840823054314, + "learning_rate": 1.3729963556703807e-06, + "loss": 0.0119, + "num_input_tokens_seen": 34707232, + "step": 164460 + }, + { + "epoch": 18.092959295929592, + "grad_norm": 0.6509055495262146, + "learning_rate": 1.3722120295999285e-06, + "loss": 0.0054, + "num_input_tokens_seen": 34708256, + "step": 164465 + }, + { + "epoch": 18.093509350935093, + "grad_norm": 0.008021188899874687, + "learning_rate": 1.3714279212954538e-06, + "loss": 0.0868, + "num_input_tokens_seen": 34709312, + "step": 164470 + }, + { + "epoch": 18.094059405940595, + "grad_norm": 0.11838340014219284, + "learning_rate": 1.3706440307641954e-06, + "loss": 0.003, + "num_input_tokens_seen": 34710464, + "step": 164475 + }, + { + "epoch": 18.094609460946096, + "grad_norm": 0.02462785318493843, + "learning_rate": 1.3698603580133696e-06, + "loss": 0.0066, + "num_input_tokens_seen": 34711488, + "step": 164480 + }, + { + "epoch": 18.095159515951597, + "grad_norm": 0.014978605322539806, + "learning_rate": 1.3690769030502038e-06, + "loss": 0.0525, + "num_input_tokens_seen": 34712576, + "step": 164485 + }, + { + "epoch": 18.095709570957094, + "grad_norm": 0.010196012444794178, + "learning_rate": 1.3682936658819145e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34713664, + "step": 164490 + }, + { + "epoch": 18.096259625962595, + "grad_norm": 0.05118374526500702, + "learning_rate": 1.367510646515724e-06, + "loss": 0.0589, + "num_input_tokens_seen": 34714720, + "step": 164495 + }, + { + "epoch": 18.096809680968097, + "grad_norm": 2.3206632137298584, + "learning_rate": 1.366727844958851e-06, + "loss": 0.0494, + "num_input_tokens_seen": 34715776, + "step": 164500 + }, + { + "epoch": 18.097359735973598, + "grad_norm": 0.004778970964252949, + "learning_rate": 1.3659452612185013e-06, + "loss": 0.0043, + "num_input_tokens_seen": 34716896, + "step": 164505 + }, + { + "epoch": 18.0979097909791, + "grad_norm": 0.10478512197732925, + "learning_rate": 1.3651628953018997e-06, + "loss": 0.0537, + "num_input_tokens_seen": 34717984, + "step": 164510 + }, + { + "epoch": 18.0984598459846, + "grad_norm": 0.0041451952420175076, + "learning_rate": 1.3643807472162456e-06, + "loss": 0.0197, + "num_input_tokens_seen": 34719040, + "step": 164515 + }, + { + "epoch": 18.099009900990097, + "grad_norm": 0.08301892131567001, + "learning_rate": 1.363598816968753e-06, + "loss": 0.0042, + "num_input_tokens_seen": 34720160, + "step": 164520 + }, + { + "epoch": 18.0995599559956, + "grad_norm": 0.3043779134750366, + "learning_rate": 1.3628171045666327e-06, + "loss": 0.0115, + "num_input_tokens_seen": 34721312, + "step": 164525 + }, + { + "epoch": 18.1001100110011, + "grad_norm": 0.010460129007697105, + "learning_rate": 1.362035610017079e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34722336, + "step": 164530 + }, + { + "epoch": 18.1006600660066, + "grad_norm": 0.07792466133832932, + "learning_rate": 1.3612543333273054e-06, + "loss": 0.0126, + "num_input_tokens_seen": 34723392, + "step": 164535 + }, + { + "epoch": 18.1012101210121, + "grad_norm": 0.4683710038661957, + "learning_rate": 1.360473274504509e-06, + "loss": 0.0126, + "num_input_tokens_seen": 34724448, + "step": 164540 + }, + { + "epoch": 18.101760176017603, + "grad_norm": 0.01214190199971199, + "learning_rate": 1.3596924335558813e-06, + "loss": 0.022, + "num_input_tokens_seen": 34725504, + "step": 164545 + }, + { + "epoch": 18.102310231023104, + "grad_norm": 2.975785732269287, + "learning_rate": 1.358911810488625e-06, + "loss": 0.0689, + "num_input_tokens_seen": 34726528, + "step": 164550 + }, + { + "epoch": 18.1028602860286, + "grad_norm": 3.013786792755127, + "learning_rate": 1.3581314053099343e-06, + "loss": 0.1102, + "num_input_tokens_seen": 34727584, + "step": 164555 + }, + { + "epoch": 18.103410341034103, + "grad_norm": 0.02176934853196144, + "learning_rate": 1.3573512180270087e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34728640, + "step": 164560 + }, + { + "epoch": 18.103960396039604, + "grad_norm": 0.02060626447200775, + "learning_rate": 1.356571248647029e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34729696, + "step": 164565 + }, + { + "epoch": 18.104510451045105, + "grad_norm": 0.01464660745114088, + "learning_rate": 1.3557914971771834e-06, + "loss": 0.0296, + "num_input_tokens_seen": 34730816, + "step": 164570 + }, + { + "epoch": 18.105060506050606, + "grad_norm": 0.01720600761473179, + "learning_rate": 1.355011963624664e-06, + "loss": 0.0022, + "num_input_tokens_seen": 34731808, + "step": 164575 + }, + { + "epoch": 18.105610561056107, + "grad_norm": 0.055857352912425995, + "learning_rate": 1.3542326479966506e-06, + "loss": 0.0064, + "num_input_tokens_seen": 34732896, + "step": 164580 + }, + { + "epoch": 18.106160616061604, + "grad_norm": 5.016173362731934, + "learning_rate": 1.353453550300335e-06, + "loss": 0.0253, + "num_input_tokens_seen": 34733952, + "step": 164585 + }, + { + "epoch": 18.106710671067106, + "grad_norm": 0.016976993530988693, + "learning_rate": 1.3526746705428888e-06, + "loss": 0.0008, + "num_input_tokens_seen": 34735008, + "step": 164590 + }, + { + "epoch": 18.107260726072607, + "grad_norm": 1.3053979873657227, + "learning_rate": 1.3518960087314903e-06, + "loss": 0.1051, + "num_input_tokens_seen": 34736064, + "step": 164595 + }, + { + "epoch": 18.107810781078108, + "grad_norm": 0.011420474387705326, + "learning_rate": 1.3511175648733221e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34737088, + "step": 164600 + }, + { + "epoch": 18.10836083608361, + "grad_norm": 0.01942838542163372, + "learning_rate": 1.3503393389755537e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34738144, + "step": 164605 + }, + { + "epoch": 18.10891089108911, + "grad_norm": 0.28547465801239014, + "learning_rate": 1.349561331045357e-06, + "loss": 0.0105, + "num_input_tokens_seen": 34739264, + "step": 164610 + }, + { + "epoch": 18.10946094609461, + "grad_norm": 3.543348550796509, + "learning_rate": 1.3487835410899096e-06, + "loss": 0.0927, + "num_input_tokens_seen": 34740256, + "step": 164615 + }, + { + "epoch": 18.11001100110011, + "grad_norm": 0.007659757975488901, + "learning_rate": 1.3480059691163726e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34741376, + "step": 164620 + }, + { + "epoch": 18.11056105610561, + "grad_norm": 0.018606478348374367, + "learning_rate": 1.3472286151319181e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34742432, + "step": 164625 + }, + { + "epoch": 18.11111111111111, + "grad_norm": 0.11412414908409119, + "learning_rate": 1.3464514791437038e-06, + "loss": 0.0355, + "num_input_tokens_seen": 34743520, + "step": 164630 + }, + { + "epoch": 18.111661166116612, + "grad_norm": 0.2809253931045532, + "learning_rate": 1.3456745611588939e-06, + "loss": 0.1018, + "num_input_tokens_seen": 34744672, + "step": 164635 + }, + { + "epoch": 18.112211221122113, + "grad_norm": 0.3001595735549927, + "learning_rate": 1.3448978611846574e-06, + "loss": 0.0129, + "num_input_tokens_seen": 34745728, + "step": 164640 + }, + { + "epoch": 18.112761276127614, + "grad_norm": 3.155247211456299, + "learning_rate": 1.3441213792281442e-06, + "loss": 0.115, + "num_input_tokens_seen": 34746816, + "step": 164645 + }, + { + "epoch": 18.11331133113311, + "grad_norm": 0.07592083513736725, + "learning_rate": 1.3433451152965155e-06, + "loss": 0.0413, + "num_input_tokens_seen": 34747872, + "step": 164650 + }, + { + "epoch": 18.113861386138613, + "grad_norm": 0.042602039873600006, + "learning_rate": 1.3425690693969234e-06, + "loss": 0.0248, + "num_input_tokens_seen": 34748960, + "step": 164655 + }, + { + "epoch": 18.114411441144114, + "grad_norm": 0.02804557979106903, + "learning_rate": 1.3417932415365153e-06, + "loss": 0.003, + "num_input_tokens_seen": 34750080, + "step": 164660 + }, + { + "epoch": 18.114961496149615, + "grad_norm": 0.10823163390159607, + "learning_rate": 1.3410176317224465e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34751104, + "step": 164665 + }, + { + "epoch": 18.115511551155116, + "grad_norm": 0.02923933044075966, + "learning_rate": 1.340242239961867e-06, + "loss": 0.001, + "num_input_tokens_seen": 34752128, + "step": 164670 + }, + { + "epoch": 18.116061606160617, + "grad_norm": 0.030076518654823303, + "learning_rate": 1.3394670662619236e-06, + "loss": 0.0473, + "num_input_tokens_seen": 34753120, + "step": 164675 + }, + { + "epoch": 18.116611661166118, + "grad_norm": 1.9680938720703125, + "learning_rate": 1.3386921106297607e-06, + "loss": 0.0832, + "num_input_tokens_seen": 34754208, + "step": 164680 + }, + { + "epoch": 18.117161716171616, + "grad_norm": 0.06354740262031555, + "learning_rate": 1.3379173730725142e-06, + "loss": 0.0519, + "num_input_tokens_seen": 34755264, + "step": 164685 + }, + { + "epoch": 18.117711771177117, + "grad_norm": 0.03289812430739403, + "learning_rate": 1.337142853597334e-06, + "loss": 0.1136, + "num_input_tokens_seen": 34756352, + "step": 164690 + }, + { + "epoch": 18.118261826182618, + "grad_norm": 0.17084555327892303, + "learning_rate": 1.336368552211345e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34757504, + "step": 164695 + }, + { + "epoch": 18.11881188118812, + "grad_norm": 0.02251717820763588, + "learning_rate": 1.3355944689216998e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34758496, + "step": 164700 + }, + { + "epoch": 18.11936193619362, + "grad_norm": 0.008721895515918732, + "learning_rate": 1.334820603735526e-06, + "loss": 0.012, + "num_input_tokens_seen": 34759520, + "step": 164705 + }, + { + "epoch": 18.11991199119912, + "grad_norm": 0.051150087267160416, + "learning_rate": 1.3340469566599483e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34760640, + "step": 164710 + }, + { + "epoch": 18.120462046204622, + "grad_norm": 0.026433710008859634, + "learning_rate": 1.3332735277021113e-06, + "loss": 0.0379, + "num_input_tokens_seen": 34761760, + "step": 164715 + }, + { + "epoch": 18.12101210121012, + "grad_norm": 0.010475408285856247, + "learning_rate": 1.3325003168691314e-06, + "loss": 0.0014, + "num_input_tokens_seen": 34762752, + "step": 164720 + }, + { + "epoch": 18.12156215621562, + "grad_norm": 0.017865709960460663, + "learning_rate": 1.3317273241681362e-06, + "loss": 0.1468, + "num_input_tokens_seen": 34763840, + "step": 164725 + }, + { + "epoch": 18.122112211221122, + "grad_norm": 2.126912832260132, + "learning_rate": 1.3309545496062586e-06, + "loss": 0.191, + "num_input_tokens_seen": 34764832, + "step": 164730 + }, + { + "epoch": 18.122662266226623, + "grad_norm": 0.019493527710437775, + "learning_rate": 1.330181993190613e-06, + "loss": 0.0127, + "num_input_tokens_seen": 34765824, + "step": 164735 + }, + { + "epoch": 18.123212321232124, + "grad_norm": 0.0337538905441761, + "learning_rate": 1.3294096549283236e-06, + "loss": 0.0032, + "num_input_tokens_seen": 34766880, + "step": 164740 + }, + { + "epoch": 18.123762376237625, + "grad_norm": 0.21248848736286163, + "learning_rate": 1.3286375348265045e-06, + "loss": 0.0931, + "num_input_tokens_seen": 34767968, + "step": 164745 + }, + { + "epoch": 18.124312431243123, + "grad_norm": 0.05847956985235214, + "learning_rate": 1.3278656328922807e-06, + "loss": 0.0036, + "num_input_tokens_seen": 34768992, + "step": 164750 + }, + { + "epoch": 18.124862486248624, + "grad_norm": 0.750139594078064, + "learning_rate": 1.3270939491327543e-06, + "loss": 0.0041, + "num_input_tokens_seen": 34770016, + "step": 164755 + }, + { + "epoch": 18.125412541254125, + "grad_norm": 0.09076036512851715, + "learning_rate": 1.3263224835550425e-06, + "loss": 0.0041, + "num_input_tokens_seen": 34771008, + "step": 164760 + }, + { + "epoch": 18.125962596259626, + "grad_norm": 0.31720733642578125, + "learning_rate": 1.3255512361662613e-06, + "loss": 0.0045, + "num_input_tokens_seen": 34772096, + "step": 164765 + }, + { + "epoch": 18.126512651265127, + "grad_norm": 0.07263855636119843, + "learning_rate": 1.324780206973511e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34773152, + "step": 164770 + }, + { + "epoch": 18.127062706270628, + "grad_norm": 0.06377490609884262, + "learning_rate": 1.324009395983905e-06, + "loss": 0.0303, + "num_input_tokens_seen": 34774272, + "step": 164775 + }, + { + "epoch": 18.12761276127613, + "grad_norm": 0.011858400888741016, + "learning_rate": 1.3232388032045406e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34775424, + "step": 164780 + }, + { + "epoch": 18.128162816281627, + "grad_norm": 0.013435754925012589, + "learning_rate": 1.3224684286425203e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34776448, + "step": 164785 + }, + { + "epoch": 18.128712871287128, + "grad_norm": 4.813085556030273, + "learning_rate": 1.3216982723049527e-06, + "loss": 0.0381, + "num_input_tokens_seen": 34777536, + "step": 164790 + }, + { + "epoch": 18.12926292629263, + "grad_norm": 0.0035357337910681963, + "learning_rate": 1.320928334198926e-06, + "loss": 0.0196, + "num_input_tokens_seen": 34778656, + "step": 164795 + }, + { + "epoch": 18.12981298129813, + "grad_norm": 0.01747603714466095, + "learning_rate": 1.3201586143315432e-06, + "loss": 0.003, + "num_input_tokens_seen": 34779776, + "step": 164800 + }, + { + "epoch": 18.13036303630363, + "grad_norm": 0.13211683928966522, + "learning_rate": 1.3193891127098984e-06, + "loss": 0.1168, + "num_input_tokens_seen": 34780832, + "step": 164805 + }, + { + "epoch": 18.130913091309132, + "grad_norm": 0.04016947001218796, + "learning_rate": 1.3186198293410722e-06, + "loss": 0.0029, + "num_input_tokens_seen": 34781856, + "step": 164810 + }, + { + "epoch": 18.13146314631463, + "grad_norm": 0.003397751599550247, + "learning_rate": 1.3178507642321703e-06, + "loss": 0.0824, + "num_input_tokens_seen": 34782848, + "step": 164815 + }, + { + "epoch": 18.13201320132013, + "grad_norm": 2.2441585063934326, + "learning_rate": 1.3170819173902698e-06, + "loss": 0.0396, + "num_input_tokens_seen": 34783808, + "step": 164820 + }, + { + "epoch": 18.132563256325632, + "grad_norm": 0.47131219506263733, + "learning_rate": 1.3163132888224655e-06, + "loss": 0.0051, + "num_input_tokens_seen": 34784864, + "step": 164825 + }, + { + "epoch": 18.133113311331133, + "grad_norm": 0.012738281860947609, + "learning_rate": 1.3155448785358376e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34786016, + "step": 164830 + }, + { + "epoch": 18.133663366336634, + "grad_norm": 0.1256534606218338, + "learning_rate": 1.3147766865374638e-06, + "loss": 0.0317, + "num_input_tokens_seen": 34787104, + "step": 164835 + }, + { + "epoch": 18.134213421342135, + "grad_norm": 0.02233641967177391, + "learning_rate": 1.3140087128344275e-06, + "loss": 0.0066, + "num_input_tokens_seen": 34788160, + "step": 164840 + }, + { + "epoch": 18.134763476347636, + "grad_norm": 0.0360325425863266, + "learning_rate": 1.3132409574338117e-06, + "loss": 0.002, + "num_input_tokens_seen": 34789216, + "step": 164845 + }, + { + "epoch": 18.135313531353134, + "grad_norm": 2.806912899017334, + "learning_rate": 1.3124734203426831e-06, + "loss": 0.134, + "num_input_tokens_seen": 34790336, + "step": 164850 + }, + { + "epoch": 18.135863586358635, + "grad_norm": 0.03827635571360588, + "learning_rate": 1.3117061015681275e-06, + "loss": 0.004, + "num_input_tokens_seen": 34791392, + "step": 164855 + }, + { + "epoch": 18.136413641364136, + "grad_norm": 0.11803673952817917, + "learning_rate": 1.3109390011172035e-06, + "loss": 0.0038, + "num_input_tokens_seen": 34792448, + "step": 164860 + }, + { + "epoch": 18.136963696369637, + "grad_norm": 0.5672522187232971, + "learning_rate": 1.3101721189969913e-06, + "loss": 0.0672, + "num_input_tokens_seen": 34793504, + "step": 164865 + }, + { + "epoch": 18.13751375137514, + "grad_norm": 0.027173595502972603, + "learning_rate": 1.3094054552145519e-06, + "loss": 0.0008, + "num_input_tokens_seen": 34794464, + "step": 164870 + }, + { + "epoch": 18.13806380638064, + "grad_norm": 0.008297300897538662, + "learning_rate": 1.3086390097769547e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34795520, + "step": 164875 + }, + { + "epoch": 18.138613861386137, + "grad_norm": 0.13877372443675995, + "learning_rate": 1.3078727826912663e-06, + "loss": 0.0031, + "num_input_tokens_seen": 34796704, + "step": 164880 + }, + { + "epoch": 18.139163916391638, + "grad_norm": 0.05461229011416435, + "learning_rate": 1.3071067739645448e-06, + "loss": 0.0717, + "num_input_tokens_seen": 34797760, + "step": 164885 + }, + { + "epoch": 18.13971397139714, + "grad_norm": 0.09061554074287415, + "learning_rate": 1.3063409836038515e-06, + "loss": 0.0045, + "num_input_tokens_seen": 34798848, + "step": 164890 + }, + { + "epoch": 18.14026402640264, + "grad_norm": 2.1271138191223145, + "learning_rate": 1.3055754116162416e-06, + "loss": 0.0137, + "num_input_tokens_seen": 34799936, + "step": 164895 + }, + { + "epoch": 18.14081408140814, + "grad_norm": 0.18563055992126465, + "learning_rate": 1.3048100580087764e-06, + "loss": 0.0061, + "num_input_tokens_seen": 34801024, + "step": 164900 + }, + { + "epoch": 18.141364136413642, + "grad_norm": 0.9959508180618286, + "learning_rate": 1.3040449227885053e-06, + "loss": 0.0728, + "num_input_tokens_seen": 34802016, + "step": 164905 + }, + { + "epoch": 18.141914191419144, + "grad_norm": 0.16108712553977966, + "learning_rate": 1.3032800059624817e-06, + "loss": 0.0637, + "num_input_tokens_seen": 34803008, + "step": 164910 + }, + { + "epoch": 18.14246424642464, + "grad_norm": 0.021710434928536415, + "learning_rate": 1.3025153075377577e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34804032, + "step": 164915 + }, + { + "epoch": 18.143014301430142, + "grad_norm": 0.015216812491416931, + "learning_rate": 1.3017508275213808e-06, + "loss": 0.0602, + "num_input_tokens_seen": 34805120, + "step": 164920 + }, + { + "epoch": 18.143564356435643, + "grad_norm": 0.2311839461326599, + "learning_rate": 1.3009865659203896e-06, + "loss": 0.0066, + "num_input_tokens_seen": 34806240, + "step": 164925 + }, + { + "epoch": 18.144114411441144, + "grad_norm": 1.8542101383209229, + "learning_rate": 1.300222522741834e-06, + "loss": 0.0735, + "num_input_tokens_seen": 34807232, + "step": 164930 + }, + { + "epoch": 18.144664466446645, + "grad_norm": 0.015257452614605427, + "learning_rate": 1.2994586979927559e-06, + "loss": 0.001, + "num_input_tokens_seen": 34808256, + "step": 164935 + }, + { + "epoch": 18.145214521452147, + "grad_norm": 1.2675930261611938, + "learning_rate": 1.2986950916801965e-06, + "loss": 0.0546, + "num_input_tokens_seen": 34809248, + "step": 164940 + }, + { + "epoch": 18.145764576457644, + "grad_norm": 0.007393213454633951, + "learning_rate": 1.297931703811192e-06, + "loss": 0.0588, + "num_input_tokens_seen": 34810304, + "step": 164945 + }, + { + "epoch": 18.146314631463145, + "grad_norm": 0.09477227181196213, + "learning_rate": 1.297168534392773e-06, + "loss": 0.0161, + "num_input_tokens_seen": 34811360, + "step": 164950 + }, + { + "epoch": 18.146864686468646, + "grad_norm": 0.012158864177763462, + "learning_rate": 1.2964055834319784e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34812416, + "step": 164955 + }, + { + "epoch": 18.147414741474147, + "grad_norm": 0.08394602686166763, + "learning_rate": 1.295642850935841e-06, + "loss": 0.0074, + "num_input_tokens_seen": 34813408, + "step": 164960 + }, + { + "epoch": 18.14796479647965, + "grad_norm": 0.15021151304244995, + "learning_rate": 1.2948803369113915e-06, + "loss": 0.1017, + "num_input_tokens_seen": 34814464, + "step": 164965 + }, + { + "epoch": 18.14851485148515, + "grad_norm": 0.4932628273963928, + "learning_rate": 1.2941180413656552e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34815456, + "step": 164970 + }, + { + "epoch": 18.14906490649065, + "grad_norm": 0.10009128600358963, + "learning_rate": 1.2933559643056536e-06, + "loss": 0.0109, + "num_input_tokens_seen": 34816544, + "step": 164975 + }, + { + "epoch": 18.149614961496148, + "grad_norm": 0.0020123275462538004, + "learning_rate": 1.2925941057384177e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34817632, + "step": 164980 + }, + { + "epoch": 18.15016501650165, + "grad_norm": 0.1095796525478363, + "learning_rate": 1.291832465670964e-06, + "loss": 0.0107, + "num_input_tokens_seen": 34818720, + "step": 164985 + }, + { + "epoch": 18.15071507150715, + "grad_norm": 0.08352735638618469, + "learning_rate": 1.2910710441103119e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34819744, + "step": 164990 + }, + { + "epoch": 18.15126512651265, + "grad_norm": 0.020521188154816628, + "learning_rate": 1.2903098410634861e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34820800, + "step": 164995 + }, + { + "epoch": 18.151815181518153, + "grad_norm": 0.009287296794354916, + "learning_rate": 1.2895488565374952e-06, + "loss": 0.0122, + "num_input_tokens_seen": 34821888, + "step": 165000 + }, + { + "epoch": 18.152365236523654, + "grad_norm": 0.0588693805038929, + "learning_rate": 1.2887880905393584e-06, + "loss": 0.0041, + "num_input_tokens_seen": 34822944, + "step": 165005 + }, + { + "epoch": 18.15291529152915, + "grad_norm": 0.1239602342247963, + "learning_rate": 1.2880275430760784e-06, + "loss": 0.0433, + "num_input_tokens_seen": 34823968, + "step": 165010 + }, + { + "epoch": 18.153465346534652, + "grad_norm": 0.057119932025671005, + "learning_rate": 1.287267214154672e-06, + "loss": 0.0036, + "num_input_tokens_seen": 34825024, + "step": 165015 + }, + { + "epoch": 18.154015401540153, + "grad_norm": 0.03140898421406746, + "learning_rate": 1.2865071037821501e-06, + "loss": 0.0016, + "num_input_tokens_seen": 34826080, + "step": 165020 + }, + { + "epoch": 18.154565456545654, + "grad_norm": 1.8842272758483887, + "learning_rate": 1.2857472119655073e-06, + "loss": 0.0639, + "num_input_tokens_seen": 34827136, + "step": 165025 + }, + { + "epoch": 18.155115511551156, + "grad_norm": 0.0132603133097291, + "learning_rate": 1.2849875387117572e-06, + "loss": 0.0039, + "num_input_tokens_seen": 34828288, + "step": 165030 + }, + { + "epoch": 18.155665566556657, + "grad_norm": 3.0508251190185547, + "learning_rate": 1.2842280840278997e-06, + "loss": 0.0372, + "num_input_tokens_seen": 34829376, + "step": 165035 + }, + { + "epoch": 18.156215621562158, + "grad_norm": 0.009016009047627449, + "learning_rate": 1.2834688479209267e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34830464, + "step": 165040 + }, + { + "epoch": 18.156765676567655, + "grad_norm": 0.3003312647342682, + "learning_rate": 1.2827098303978407e-06, + "loss": 0.0196, + "num_input_tokens_seen": 34831552, + "step": 165045 + }, + { + "epoch": 18.157315731573156, + "grad_norm": 0.015089044347405434, + "learning_rate": 1.2819510314656363e-06, + "loss": 0.0678, + "num_input_tokens_seen": 34832608, + "step": 165050 + }, + { + "epoch": 18.157865786578657, + "grad_norm": 0.017755717039108276, + "learning_rate": 1.2811924511313133e-06, + "loss": 0.0412, + "num_input_tokens_seen": 34833696, + "step": 165055 + }, + { + "epoch": 18.15841584158416, + "grad_norm": 2.03322696685791, + "learning_rate": 1.280434089401858e-06, + "loss": 0.1976, + "num_input_tokens_seen": 34834752, + "step": 165060 + }, + { + "epoch": 18.15896589658966, + "grad_norm": 1.8645960092544556, + "learning_rate": 1.2796759462842562e-06, + "loss": 0.0661, + "num_input_tokens_seen": 34835840, + "step": 165065 + }, + { + "epoch": 18.15951595159516, + "grad_norm": 1.538642168045044, + "learning_rate": 1.2789180217854996e-06, + "loss": 0.0183, + "num_input_tokens_seen": 34836928, + "step": 165070 + }, + { + "epoch": 18.16006600660066, + "grad_norm": 0.06339820474386215, + "learning_rate": 1.2781603159125743e-06, + "loss": 0.0054, + "num_input_tokens_seen": 34838016, + "step": 165075 + }, + { + "epoch": 18.16061606160616, + "grad_norm": 0.003570297732949257, + "learning_rate": 1.2774028286724638e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34839040, + "step": 165080 + }, + { + "epoch": 18.16116611661166, + "grad_norm": 0.02074829302728176, + "learning_rate": 1.2766455600721482e-06, + "loss": 0.2039, + "num_input_tokens_seen": 34840064, + "step": 165085 + }, + { + "epoch": 18.16171617161716, + "grad_norm": 0.027656173333525658, + "learning_rate": 1.2758885101186058e-06, + "loss": 0.0038, + "num_input_tokens_seen": 34841152, + "step": 165090 + }, + { + "epoch": 18.162266226622663, + "grad_norm": 0.17575183510780334, + "learning_rate": 1.2751316788188166e-06, + "loss": 0.0221, + "num_input_tokens_seen": 34842208, + "step": 165095 + }, + { + "epoch": 18.162816281628164, + "grad_norm": 1.5782862901687622, + "learning_rate": 1.2743750661797503e-06, + "loss": 0.0455, + "num_input_tokens_seen": 34843232, + "step": 165100 + }, + { + "epoch": 18.163366336633665, + "grad_norm": 0.04871717840433121, + "learning_rate": 1.2736186722083876e-06, + "loss": 0.0214, + "num_input_tokens_seen": 34844256, + "step": 165105 + }, + { + "epoch": 18.163916391639162, + "grad_norm": 0.002390375128015876, + "learning_rate": 1.2728624969116976e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34845376, + "step": 165110 + }, + { + "epoch": 18.164466446644663, + "grad_norm": 1.491787075996399, + "learning_rate": 1.2721065402966441e-06, + "loss": 0.0434, + "num_input_tokens_seen": 34846368, + "step": 165115 + }, + { + "epoch": 18.165016501650165, + "grad_norm": 0.06063425540924072, + "learning_rate": 1.2713508023702053e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34847392, + "step": 165120 + }, + { + "epoch": 18.165566556655666, + "grad_norm": 0.1166495829820633, + "learning_rate": 1.2705952831393336e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34848512, + "step": 165125 + }, + { + "epoch": 18.166116611661167, + "grad_norm": 1.7197909355163574, + "learning_rate": 1.2698399826110014e-06, + "loss": 0.0332, + "num_input_tokens_seen": 34849600, + "step": 165130 + }, + { + "epoch": 18.166666666666668, + "grad_norm": 0.007786355912685394, + "learning_rate": 1.2690849007921695e-06, + "loss": 0.0392, + "num_input_tokens_seen": 34850656, + "step": 165135 + }, + { + "epoch": 18.16721672167217, + "grad_norm": 0.02994205243885517, + "learning_rate": 1.268330037689791e-06, + "loss": 0.0027, + "num_input_tokens_seen": 34851744, + "step": 165140 + }, + { + "epoch": 18.167766776677666, + "grad_norm": 0.013871251605451107, + "learning_rate": 1.2675753933108297e-06, + "loss": 0.0011, + "num_input_tokens_seen": 34852800, + "step": 165145 + }, + { + "epoch": 18.168316831683168, + "grad_norm": 0.0896729826927185, + "learning_rate": 1.2668209676622356e-06, + "loss": 0.0237, + "num_input_tokens_seen": 34853856, + "step": 165150 + }, + { + "epoch": 18.16886688668867, + "grad_norm": 0.06635424494743347, + "learning_rate": 1.2660667607509669e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34854912, + "step": 165155 + }, + { + "epoch": 18.16941694169417, + "grad_norm": 0.01664159446954727, + "learning_rate": 1.265312772583971e-06, + "loss": 0.0057, + "num_input_tokens_seen": 34855968, + "step": 165160 + }, + { + "epoch": 18.16996699669967, + "grad_norm": 0.023624766618013382, + "learning_rate": 1.2645590031681976e-06, + "loss": 0.0784, + "num_input_tokens_seen": 34856928, + "step": 165165 + }, + { + "epoch": 18.170517051705172, + "grad_norm": 0.06119144707918167, + "learning_rate": 1.263805452510597e-06, + "loss": 0.0029, + "num_input_tokens_seen": 34857984, + "step": 165170 + }, + { + "epoch": 18.17106710671067, + "grad_norm": 0.01558113656938076, + "learning_rate": 1.2630521206181079e-06, + "loss": 0.1127, + "num_input_tokens_seen": 34859008, + "step": 165175 + }, + { + "epoch": 18.17161716171617, + "grad_norm": 0.02140810899436474, + "learning_rate": 1.2622990074976804e-06, + "loss": 0.0104, + "num_input_tokens_seen": 34860096, + "step": 165180 + }, + { + "epoch": 18.17216721672167, + "grad_norm": 0.023501789197325706, + "learning_rate": 1.2615461131562507e-06, + "loss": 0.0049, + "num_input_tokens_seen": 34861152, + "step": 165185 + }, + { + "epoch": 18.172717271727173, + "grad_norm": 0.0027087435591965914, + "learning_rate": 1.26079343760076e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34862208, + "step": 165190 + }, + { + "epoch": 18.173267326732674, + "grad_norm": 0.008280550129711628, + "learning_rate": 1.260040980838148e-06, + "loss": 0.0007, + "num_input_tokens_seen": 34863232, + "step": 165195 + }, + { + "epoch": 18.173817381738175, + "grad_norm": 0.07080910354852676, + "learning_rate": 1.2592887428753418e-06, + "loss": 0.0067, + "num_input_tokens_seen": 34864256, + "step": 165200 + }, + { + "epoch": 18.174367436743676, + "grad_norm": 0.013376007787883282, + "learning_rate": 1.2585367237192835e-06, + "loss": 0.0056, + "num_input_tokens_seen": 34865312, + "step": 165205 + }, + { + "epoch": 18.174917491749174, + "grad_norm": 0.04645470902323723, + "learning_rate": 1.2577849233769008e-06, + "loss": 0.0257, + "num_input_tokens_seen": 34866400, + "step": 165210 + }, + { + "epoch": 18.175467546754675, + "grad_norm": 0.04135383665561676, + "learning_rate": 1.2570333418551184e-06, + "loss": 0.0807, + "num_input_tokens_seen": 34867456, + "step": 165215 + }, + { + "epoch": 18.176017601760176, + "grad_norm": 0.0698981061577797, + "learning_rate": 1.2562819791608644e-06, + "loss": 0.0097, + "num_input_tokens_seen": 34868480, + "step": 165220 + }, + { + "epoch": 18.176567656765677, + "grad_norm": 0.07476790249347687, + "learning_rate": 1.2555308353010692e-06, + "loss": 0.0063, + "num_input_tokens_seen": 34869568, + "step": 165225 + }, + { + "epoch": 18.177117711771178, + "grad_norm": 0.10459119826555252, + "learning_rate": 1.2547799102826551e-06, + "loss": 0.0107, + "num_input_tokens_seen": 34870624, + "step": 165230 + }, + { + "epoch": 18.17766776677668, + "grad_norm": 0.03259908780455589, + "learning_rate": 1.254029204112539e-06, + "loss": 0.0855, + "num_input_tokens_seen": 34871680, + "step": 165235 + }, + { + "epoch": 18.178217821782177, + "grad_norm": 0.013871213421225548, + "learning_rate": 1.2532787167976401e-06, + "loss": 0.0772, + "num_input_tokens_seen": 34872736, + "step": 165240 + }, + { + "epoch": 18.178767876787678, + "grad_norm": 1.3018442392349243, + "learning_rate": 1.2525284483448751e-06, + "loss": 0.0592, + "num_input_tokens_seen": 34873856, + "step": 165245 + }, + { + "epoch": 18.17931793179318, + "grad_norm": 0.03275163099169731, + "learning_rate": 1.2517783987611637e-06, + "loss": 0.006, + "num_input_tokens_seen": 34874880, + "step": 165250 + }, + { + "epoch": 18.17986798679868, + "grad_norm": 0.018248867243528366, + "learning_rate": 1.2510285680534112e-06, + "loss": 0.0015, + "num_input_tokens_seen": 34875936, + "step": 165255 + }, + { + "epoch": 18.18041804180418, + "grad_norm": 0.03132755681872368, + "learning_rate": 1.2502789562285371e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34876928, + "step": 165260 + }, + { + "epoch": 18.180968096809682, + "grad_norm": 0.1465490162372589, + "learning_rate": 1.2495295632934417e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34877984, + "step": 165265 + }, + { + "epoch": 18.181518151815183, + "grad_norm": 0.02963683195412159, + "learning_rate": 1.2487803892550388e-06, + "loss": 0.0842, + "num_input_tokens_seen": 34879008, + "step": 165270 + }, + { + "epoch": 18.18206820682068, + "grad_norm": 0.768311619758606, + "learning_rate": 1.2480314341202281e-06, + "loss": 0.0095, + "num_input_tokens_seen": 34880000, + "step": 165275 + }, + { + "epoch": 18.182618261826182, + "grad_norm": 0.007184838410466909, + "learning_rate": 1.2472826978959128e-06, + "loss": 0.0111, + "num_input_tokens_seen": 34881088, + "step": 165280 + }, + { + "epoch": 18.183168316831683, + "grad_norm": 0.07307987660169601, + "learning_rate": 1.2465341805889985e-06, + "loss": 0.001, + "num_input_tokens_seen": 34882176, + "step": 165285 + }, + { + "epoch": 18.183718371837184, + "grad_norm": 0.044096995145082474, + "learning_rate": 1.2457858822063794e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34883200, + "step": 165290 + }, + { + "epoch": 18.184268426842685, + "grad_norm": 0.7789785265922546, + "learning_rate": 1.2450378027549559e-06, + "loss": 0.1491, + "num_input_tokens_seen": 34884256, + "step": 165295 + }, + { + "epoch": 18.184818481848186, + "grad_norm": 0.3771010935306549, + "learning_rate": 1.2442899422416166e-06, + "loss": 0.0049, + "num_input_tokens_seen": 34885312, + "step": 165300 + }, + { + "epoch": 18.185368536853684, + "grad_norm": 0.11383253335952759, + "learning_rate": 1.243542300673256e-06, + "loss": 0.0076, + "num_input_tokens_seen": 34886336, + "step": 165305 + }, + { + "epoch": 18.185918591859185, + "grad_norm": 1.0339059829711914, + "learning_rate": 1.2427948780567716e-06, + "loss": 0.0088, + "num_input_tokens_seen": 34887328, + "step": 165310 + }, + { + "epoch": 18.186468646864686, + "grad_norm": 0.004299220163375139, + "learning_rate": 1.2420476743990439e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34888384, + "step": 165315 + }, + { + "epoch": 18.187018701870187, + "grad_norm": 0.5235437750816345, + "learning_rate": 1.2413006897069673e-06, + "loss": 0.046, + "num_input_tokens_seen": 34889568, + "step": 165320 + }, + { + "epoch": 18.187568756875688, + "grad_norm": 0.01686343364417553, + "learning_rate": 1.2405539239874198e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34890560, + "step": 165325 + }, + { + "epoch": 18.18811881188119, + "grad_norm": 1.7854071855545044, + "learning_rate": 1.239807377247282e-06, + "loss": 0.0627, + "num_input_tokens_seen": 34891584, + "step": 165330 + }, + { + "epoch": 18.18866886688669, + "grad_norm": 1.2816238403320312, + "learning_rate": 1.2390610494934397e-06, + "loss": 0.0436, + "num_input_tokens_seen": 34892704, + "step": 165335 + }, + { + "epoch": 18.189218921892188, + "grad_norm": 0.02358327805995941, + "learning_rate": 1.2383149407327683e-06, + "loss": 0.0013, + "num_input_tokens_seen": 34893728, + "step": 165340 + }, + { + "epoch": 18.18976897689769, + "grad_norm": 0.2559822201728821, + "learning_rate": 1.2375690509721511e-06, + "loss": 0.0028, + "num_input_tokens_seen": 34894720, + "step": 165345 + }, + { + "epoch": 18.19031903190319, + "grad_norm": 2.247418165206909, + "learning_rate": 1.2368233802184575e-06, + "loss": 0.1029, + "num_input_tokens_seen": 34895808, + "step": 165350 + }, + { + "epoch": 18.19086908690869, + "grad_norm": 1.0688992738723755, + "learning_rate": 1.2360779284785574e-06, + "loss": 0.0618, + "num_input_tokens_seen": 34896800, + "step": 165355 + }, + { + "epoch": 18.191419141914192, + "grad_norm": 0.013129986822605133, + "learning_rate": 1.2353326957593224e-06, + "loss": 0.1382, + "num_input_tokens_seen": 34897888, + "step": 165360 + }, + { + "epoch": 18.191969196919693, + "grad_norm": 0.006310020573437214, + "learning_rate": 1.2345876820676227e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34898976, + "step": 165365 + }, + { + "epoch": 18.19251925192519, + "grad_norm": 0.005718757398426533, + "learning_rate": 1.2338428874103303e-06, + "loss": 0.064, + "num_input_tokens_seen": 34899968, + "step": 165370 + }, + { + "epoch": 18.193069306930692, + "grad_norm": 0.06835275143384933, + "learning_rate": 1.2330983117943007e-06, + "loss": 0.065, + "num_input_tokens_seen": 34900960, + "step": 165375 + }, + { + "epoch": 18.193619361936193, + "grad_norm": 0.013792077079415321, + "learning_rate": 1.232353955226398e-06, + "loss": 0.0543, + "num_input_tokens_seen": 34901984, + "step": 165380 + }, + { + "epoch": 18.194169416941694, + "grad_norm": 0.02561250515282154, + "learning_rate": 1.2316098177134888e-06, + "loss": 0.0106, + "num_input_tokens_seen": 34903008, + "step": 165385 + }, + { + "epoch": 18.194719471947195, + "grad_norm": 0.115752674639225, + "learning_rate": 1.2308658992624206e-06, + "loss": 0.0574, + "num_input_tokens_seen": 34904096, + "step": 165390 + }, + { + "epoch": 18.195269526952696, + "grad_norm": 0.042824722826480865, + "learning_rate": 1.2301221998800571e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34905184, + "step": 165395 + }, + { + "epoch": 18.195819581958197, + "grad_norm": 0.007648359518498182, + "learning_rate": 1.2293787195732541e-06, + "loss": 0.0225, + "num_input_tokens_seen": 34906272, + "step": 165400 + }, + { + "epoch": 18.196369636963695, + "grad_norm": 0.03455595299601555, + "learning_rate": 1.228635458348859e-06, + "loss": 0.0821, + "num_input_tokens_seen": 34907328, + "step": 165405 + }, + { + "epoch": 18.196919691969196, + "grad_norm": 0.011861857026815414, + "learning_rate": 1.2278924162137246e-06, + "loss": 0.0021, + "num_input_tokens_seen": 34908480, + "step": 165410 + }, + { + "epoch": 18.197469746974697, + "grad_norm": 0.008910839445888996, + "learning_rate": 1.2271495931746979e-06, + "loss": 0.0018, + "num_input_tokens_seen": 34909600, + "step": 165415 + }, + { + "epoch": 18.198019801980198, + "grad_norm": 0.011244866997003555, + "learning_rate": 1.2264069892386266e-06, + "loss": 0.0006, + "num_input_tokens_seen": 34910688, + "step": 165420 + }, + { + "epoch": 18.1985698569857, + "grad_norm": 0.4472688138484955, + "learning_rate": 1.2256646044123576e-06, + "loss": 0.0591, + "num_input_tokens_seen": 34911712, + "step": 165425 + }, + { + "epoch": 18.1991199119912, + "grad_norm": 0.02837926708161831, + "learning_rate": 1.2249224387027248e-06, + "loss": 0.0049, + "num_input_tokens_seen": 34912768, + "step": 165430 + }, + { + "epoch": 18.199669966996698, + "grad_norm": 0.01939479261636734, + "learning_rate": 1.2241804921165778e-06, + "loss": 0.0008, + "num_input_tokens_seen": 34913856, + "step": 165435 + }, + { + "epoch": 18.2002200220022, + "grad_norm": 0.017335930839180946, + "learning_rate": 1.223438764660753e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34914880, + "step": 165440 + }, + { + "epoch": 18.2007700770077, + "grad_norm": 0.011208076030015945, + "learning_rate": 1.2226972563420784e-06, + "loss": 0.0051, + "num_input_tokens_seen": 34915936, + "step": 165445 + }, + { + "epoch": 18.2013201320132, + "grad_norm": 0.12653151154518127, + "learning_rate": 1.221955967167393e-06, + "loss": 0.0075, + "num_input_tokens_seen": 34917024, + "step": 165450 + }, + { + "epoch": 18.201870187018702, + "grad_norm": 0.018043357878923416, + "learning_rate": 1.22121489714353e-06, + "loss": 0.0317, + "num_input_tokens_seen": 34918048, + "step": 165455 + }, + { + "epoch": 18.202420242024203, + "grad_norm": 0.02264740690588951, + "learning_rate": 1.220474046277323e-06, + "loss": 0.0071, + "num_input_tokens_seen": 34919168, + "step": 165460 + }, + { + "epoch": 18.202970297029704, + "grad_norm": 0.05371817573904991, + "learning_rate": 1.2197334145755968e-06, + "loss": 0.0041, + "num_input_tokens_seen": 34920224, + "step": 165465 + }, + { + "epoch": 18.203520352035202, + "grad_norm": 0.03747791424393654, + "learning_rate": 1.2189930020451711e-06, + "loss": 0.002, + "num_input_tokens_seen": 34921216, + "step": 165470 + }, + { + "epoch": 18.204070407040703, + "grad_norm": 0.014072681777179241, + "learning_rate": 1.2182528086928824e-06, + "loss": 0.0077, + "num_input_tokens_seen": 34922272, + "step": 165475 + }, + { + "epoch": 18.204620462046204, + "grad_norm": 0.03821390122175217, + "learning_rate": 1.217512834525536e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34923296, + "step": 165480 + }, + { + "epoch": 18.205170517051705, + "grad_norm": 1.930784821510315, + "learning_rate": 1.216773079549971e-06, + "loss": 0.0698, + "num_input_tokens_seen": 34924288, + "step": 165485 + }, + { + "epoch": 18.205720572057206, + "grad_norm": 1.1873353719711304, + "learning_rate": 1.216033543772993e-06, + "loss": 0.0156, + "num_input_tokens_seen": 34925408, + "step": 165490 + }, + { + "epoch": 18.206270627062707, + "grad_norm": 0.006839095149189234, + "learning_rate": 1.2152942272014216e-06, + "loss": 0.0102, + "num_input_tokens_seen": 34926528, + "step": 165495 + }, + { + "epoch": 18.206820682068205, + "grad_norm": 0.043644532561302185, + "learning_rate": 1.214555129842071e-06, + "loss": 0.0311, + "num_input_tokens_seen": 34927584, + "step": 165500 + }, + { + "epoch": 18.207370737073706, + "grad_norm": 0.04772377014160156, + "learning_rate": 1.213816251701752e-06, + "loss": 0.0064, + "num_input_tokens_seen": 34928640, + "step": 165505 + }, + { + "epoch": 18.207920792079207, + "grad_norm": 0.0544242337346077, + "learning_rate": 1.2130775927872734e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34929760, + "step": 165510 + }, + { + "epoch": 18.20847084708471, + "grad_norm": 0.056344226002693176, + "learning_rate": 1.2123391531054461e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34930816, + "step": 165515 + }, + { + "epoch": 18.20902090209021, + "grad_norm": 0.010370157659053802, + "learning_rate": 1.2116009326630734e-06, + "loss": 0.0798, + "num_input_tokens_seen": 34931904, + "step": 165520 + }, + { + "epoch": 18.20957095709571, + "grad_norm": 0.03228108584880829, + "learning_rate": 1.2108629314669606e-06, + "loss": 0.0017, + "num_input_tokens_seen": 34932960, + "step": 165525 + }, + { + "epoch": 18.21012101210121, + "grad_norm": 0.005248637869954109, + "learning_rate": 1.210125149523908e-06, + "loss": 0.0942, + "num_input_tokens_seen": 34934048, + "step": 165530 + }, + { + "epoch": 18.21067106710671, + "grad_norm": 0.03590399771928787, + "learning_rate": 1.2093875868407212e-06, + "loss": 0.0135, + "num_input_tokens_seen": 34935104, + "step": 165535 + }, + { + "epoch": 18.21122112211221, + "grad_norm": 0.01522032544016838, + "learning_rate": 1.2086502434241865e-06, + "loss": 0.0728, + "num_input_tokens_seen": 34936224, + "step": 165540 + }, + { + "epoch": 18.21177117711771, + "grad_norm": 0.11720958352088928, + "learning_rate": 1.2079131192811094e-06, + "loss": 0.0097, + "num_input_tokens_seen": 34937248, + "step": 165545 + }, + { + "epoch": 18.212321232123212, + "grad_norm": 0.15634924173355103, + "learning_rate": 1.207176214418282e-06, + "loss": 0.0099, + "num_input_tokens_seen": 34938304, + "step": 165550 + }, + { + "epoch": 18.212871287128714, + "grad_norm": 0.16759584844112396, + "learning_rate": 1.2064395288424901e-06, + "loss": 0.0541, + "num_input_tokens_seen": 34939360, + "step": 165555 + }, + { + "epoch": 18.213421342134215, + "grad_norm": 0.19668784737586975, + "learning_rate": 1.2057030625605343e-06, + "loss": 0.0035, + "num_input_tokens_seen": 34940416, + "step": 165560 + }, + { + "epoch": 18.213971397139716, + "grad_norm": 0.009999250993132591, + "learning_rate": 1.2049668155791894e-06, + "loss": 0.0047, + "num_input_tokens_seen": 34941408, + "step": 165565 + }, + { + "epoch": 18.214521452145213, + "grad_norm": 0.019802479073405266, + "learning_rate": 1.2042307879052472e-06, + "loss": 0.0153, + "num_input_tokens_seen": 34942432, + "step": 165570 + }, + { + "epoch": 18.215071507150714, + "grad_norm": 0.01876864582300186, + "learning_rate": 1.203494979545497e-06, + "loss": 0.0394, + "num_input_tokens_seen": 34943456, + "step": 165575 + }, + { + "epoch": 18.215621562156215, + "grad_norm": 0.018169308081269264, + "learning_rate": 1.2027593905067108e-06, + "loss": 0.0022, + "num_input_tokens_seen": 34944576, + "step": 165580 + }, + { + "epoch": 18.216171617161717, + "grad_norm": 1.730256199836731, + "learning_rate": 1.2020240207956752e-06, + "loss": 0.1428, + "num_input_tokens_seen": 34945632, + "step": 165585 + }, + { + "epoch": 18.216721672167218, + "grad_norm": 0.19291163980960846, + "learning_rate": 1.2012888704191677e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34946624, + "step": 165590 + }, + { + "epoch": 18.21727172717272, + "grad_norm": 0.017030872404575348, + "learning_rate": 1.20055393938395e-06, + "loss": 0.0032, + "num_input_tokens_seen": 34947680, + "step": 165595 + }, + { + "epoch": 18.217821782178216, + "grad_norm": 0.025768538936972618, + "learning_rate": 1.1998192276968162e-06, + "loss": 0.051, + "num_input_tokens_seen": 34948672, + "step": 165600 + }, + { + "epoch": 18.218371837183717, + "grad_norm": 0.034331705421209335, + "learning_rate": 1.1990847353645224e-06, + "loss": 0.0031, + "num_input_tokens_seen": 34949728, + "step": 165605 + }, + { + "epoch": 18.21892189218922, + "grad_norm": 0.10540121048688889, + "learning_rate": 1.198350462393849e-06, + "loss": 0.0784, + "num_input_tokens_seen": 34950752, + "step": 165610 + }, + { + "epoch": 18.21947194719472, + "grad_norm": 1.1407556533813477, + "learning_rate": 1.1976164087915576e-06, + "loss": 0.0135, + "num_input_tokens_seen": 34951840, + "step": 165615 + }, + { + "epoch": 18.22002200220022, + "grad_norm": 0.03894025832414627, + "learning_rate": 1.196882574564412e-06, + "loss": 0.0061, + "num_input_tokens_seen": 34952800, + "step": 165620 + }, + { + "epoch": 18.22057205720572, + "grad_norm": 0.02951471507549286, + "learning_rate": 1.196148959719176e-06, + "loss": 0.0972, + "num_input_tokens_seen": 34953888, + "step": 165625 + }, + { + "epoch": 18.221122112211223, + "grad_norm": 0.046912577003240585, + "learning_rate": 1.195415564262617e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34954976, + "step": 165630 + }, + { + "epoch": 18.22167216721672, + "grad_norm": 0.02032455801963806, + "learning_rate": 1.1946823882014874e-06, + "loss": 0.007, + "num_input_tokens_seen": 34956000, + "step": 165635 + }, + { + "epoch": 18.22222222222222, + "grad_norm": 2.985581159591675, + "learning_rate": 1.1939494315425487e-06, + "loss": 0.0423, + "num_input_tokens_seen": 34957056, + "step": 165640 + }, + { + "epoch": 18.222772277227723, + "grad_norm": 2.531646251678467, + "learning_rate": 1.1932166942925538e-06, + "loss": 0.0311, + "num_input_tokens_seen": 34958112, + "step": 165645 + }, + { + "epoch": 18.223322332233224, + "grad_norm": 0.009944546967744827, + "learning_rate": 1.1924841764582584e-06, + "loss": 0.001, + "num_input_tokens_seen": 34959232, + "step": 165650 + }, + { + "epoch": 18.223872387238725, + "grad_norm": 0.0657830536365509, + "learning_rate": 1.1917518780464126e-06, + "loss": 0.0012, + "num_input_tokens_seen": 34960320, + "step": 165655 + }, + { + "epoch": 18.224422442244226, + "grad_norm": 0.5939368605613708, + "learning_rate": 1.191019799063761e-06, + "loss": 0.0087, + "num_input_tokens_seen": 34961344, + "step": 165660 + }, + { + "epoch": 18.224972497249723, + "grad_norm": 0.024499502032995224, + "learning_rate": 1.1902879395170624e-06, + "loss": 0.0195, + "num_input_tokens_seen": 34962336, + "step": 165665 + }, + { + "epoch": 18.225522552255224, + "grad_norm": 0.0211233701556921, + "learning_rate": 1.1895562994130499e-06, + "loss": 0.0007, + "num_input_tokens_seen": 34963328, + "step": 165670 + }, + { + "epoch": 18.226072607260726, + "grad_norm": 0.203498974442482, + "learning_rate": 1.1888248787584737e-06, + "loss": 0.0265, + "num_input_tokens_seen": 34964352, + "step": 165675 + }, + { + "epoch": 18.226622662266227, + "grad_norm": 0.011656931601464748, + "learning_rate": 1.1880936775600704e-06, + "loss": 0.0034, + "num_input_tokens_seen": 34965440, + "step": 165680 + }, + { + "epoch": 18.227172717271728, + "grad_norm": 0.10910554230213165, + "learning_rate": 1.1873626958245814e-06, + "loss": 0.081, + "num_input_tokens_seen": 34966496, + "step": 165685 + }, + { + "epoch": 18.22772277227723, + "grad_norm": 2.5820398330688477, + "learning_rate": 1.186631933558749e-06, + "loss": 0.0679, + "num_input_tokens_seen": 34967584, + "step": 165690 + }, + { + "epoch": 18.22827282728273, + "grad_norm": 0.14576999843120575, + "learning_rate": 1.1859013907692979e-06, + "loss": 0.0025, + "num_input_tokens_seen": 34968672, + "step": 165695 + }, + { + "epoch": 18.228822882288227, + "grad_norm": 0.008005640469491482, + "learning_rate": 1.18517106746297e-06, + "loss": 0.0073, + "num_input_tokens_seen": 34969728, + "step": 165700 + }, + { + "epoch": 18.22937293729373, + "grad_norm": 0.9979589581489563, + "learning_rate": 1.1844409636464938e-06, + "loss": 0.021, + "num_input_tokens_seen": 34970752, + "step": 165705 + }, + { + "epoch": 18.22992299229923, + "grad_norm": 0.005208726041018963, + "learning_rate": 1.1837110793265937e-06, + "loss": 0.0033, + "num_input_tokens_seen": 34971840, + "step": 165710 + }, + { + "epoch": 18.23047304730473, + "grad_norm": 0.4126425087451935, + "learning_rate": 1.182981414510001e-06, + "loss": 0.0058, + "num_input_tokens_seen": 34972896, + "step": 165715 + }, + { + "epoch": 18.231023102310232, + "grad_norm": 0.00894793588668108, + "learning_rate": 1.1822519692034406e-06, + "loss": 0.0049, + "num_input_tokens_seen": 34973984, + "step": 165720 + }, + { + "epoch": 18.231573157315733, + "grad_norm": 0.07306278496980667, + "learning_rate": 1.1815227434136378e-06, + "loss": 0.0023, + "num_input_tokens_seen": 34975008, + "step": 165725 + }, + { + "epoch": 18.23212321232123, + "grad_norm": 1.821181058883667, + "learning_rate": 1.1807937371473094e-06, + "loss": 0.0463, + "num_input_tokens_seen": 34976064, + "step": 165730 + }, + { + "epoch": 18.23267326732673, + "grad_norm": 0.0367426872253418, + "learning_rate": 1.180064950411172e-06, + "loss": 0.0241, + "num_input_tokens_seen": 34977152, + "step": 165735 + }, + { + "epoch": 18.233223322332233, + "grad_norm": 0.0074700647965073586, + "learning_rate": 1.1793363832119486e-06, + "loss": 0.0117, + "num_input_tokens_seen": 34978144, + "step": 165740 + }, + { + "epoch": 18.233773377337734, + "grad_norm": 0.004879375454038382, + "learning_rate": 1.17860803555635e-06, + "loss": 0.0615, + "num_input_tokens_seen": 34979264, + "step": 165745 + }, + { + "epoch": 18.234323432343235, + "grad_norm": 2.3154287338256836, + "learning_rate": 1.1778799074510932e-06, + "loss": 0.112, + "num_input_tokens_seen": 34980288, + "step": 165750 + }, + { + "epoch": 18.234873487348736, + "grad_norm": 2.4722888469696045, + "learning_rate": 1.1771519989028868e-06, + "loss": 0.1341, + "num_input_tokens_seen": 34981312, + "step": 165755 + }, + { + "epoch": 18.235423542354237, + "grad_norm": 0.39283445477485657, + "learning_rate": 1.1764243099184364e-06, + "loss": 0.0672, + "num_input_tokens_seen": 34982336, + "step": 165760 + }, + { + "epoch": 18.235973597359735, + "grad_norm": 0.17431603372097015, + "learning_rate": 1.1756968405044532e-06, + "loss": 0.0072, + "num_input_tokens_seen": 34983360, + "step": 165765 + }, + { + "epoch": 18.236523652365236, + "grad_norm": 0.0034057649318128824, + "learning_rate": 1.1749695906676377e-06, + "loss": 0.0238, + "num_input_tokens_seen": 34984448, + "step": 165770 + }, + { + "epoch": 18.237073707370737, + "grad_norm": 0.018127435818314552, + "learning_rate": 1.1742425604146956e-06, + "loss": 0.0048, + "num_input_tokens_seen": 34985504, + "step": 165775 + }, + { + "epoch": 18.237623762376238, + "grad_norm": 1.7826842069625854, + "learning_rate": 1.1735157497523325e-06, + "loss": 0.0985, + "num_input_tokens_seen": 34986560, + "step": 165780 + }, + { + "epoch": 18.23817381738174, + "grad_norm": 0.012753013521432877, + "learning_rate": 1.1727891586872348e-06, + "loss": 0.0424, + "num_input_tokens_seen": 34987552, + "step": 165785 + }, + { + "epoch": 18.23872387238724, + "grad_norm": 2.384077548980713, + "learning_rate": 1.1720627872261108e-06, + "loss": 0.0589, + "num_input_tokens_seen": 34988608, + "step": 165790 + }, + { + "epoch": 18.239273927392738, + "grad_norm": 0.014072868973016739, + "learning_rate": 1.1713366353756472e-06, + "loss": 0.0992, + "num_input_tokens_seen": 34989632, + "step": 165795 + }, + { + "epoch": 18.23982398239824, + "grad_norm": 0.11785886436700821, + "learning_rate": 1.1706107031425383e-06, + "loss": 0.1431, + "num_input_tokens_seen": 34990656, + "step": 165800 + }, + { + "epoch": 18.24037403740374, + "grad_norm": 0.014753296971321106, + "learning_rate": 1.1698849905334819e-06, + "loss": 0.0542, + "num_input_tokens_seen": 34991680, + "step": 165805 + }, + { + "epoch": 18.24092409240924, + "grad_norm": 0.0020852277521044016, + "learning_rate": 1.1691594975551557e-06, + "loss": 0.034, + "num_input_tokens_seen": 34992736, + "step": 165810 + }, + { + "epoch": 18.241474147414742, + "grad_norm": 2.0871827602386475, + "learning_rate": 1.1684342242142544e-06, + "loss": 0.1035, + "num_input_tokens_seen": 34993760, + "step": 165815 + }, + { + "epoch": 18.242024202420243, + "grad_norm": 0.13357646763324738, + "learning_rate": 1.1677091705174614e-06, + "loss": 0.0026, + "num_input_tokens_seen": 34994784, + "step": 165820 + }, + { + "epoch": 18.242574257425744, + "grad_norm": 0.022625455632805824, + "learning_rate": 1.1669843364714521e-06, + "loss": 0.0024, + "num_input_tokens_seen": 34995872, + "step": 165825 + }, + { + "epoch": 18.24312431243124, + "grad_norm": 0.00939196441322565, + "learning_rate": 1.1662597220829103e-06, + "loss": 0.003, + "num_input_tokens_seen": 34996960, + "step": 165830 + }, + { + "epoch": 18.243674367436743, + "grad_norm": 0.07520405948162079, + "learning_rate": 1.165535327358519e-06, + "loss": 0.0019, + "num_input_tokens_seen": 34998016, + "step": 165835 + }, + { + "epoch": 18.244224422442244, + "grad_norm": 0.09207062423229218, + "learning_rate": 1.1648111523049537e-06, + "loss": 0.007, + "num_input_tokens_seen": 34999072, + "step": 165840 + }, + { + "epoch": 18.244774477447745, + "grad_norm": 0.04181433841586113, + "learning_rate": 1.164087196928887e-06, + "loss": 0.003, + "num_input_tokens_seen": 35000096, + "step": 165845 + }, + { + "epoch": 18.245324532453246, + "grad_norm": 2.1245615482330322, + "learning_rate": 1.1633634612369882e-06, + "loss": 0.0189, + "num_input_tokens_seen": 35001120, + "step": 165850 + }, + { + "epoch": 18.245874587458747, + "grad_norm": 0.02147597447037697, + "learning_rate": 1.1626399452359299e-06, + "loss": 0.0105, + "num_input_tokens_seen": 35002144, + "step": 165855 + }, + { + "epoch": 18.246424642464245, + "grad_norm": 0.03944634273648262, + "learning_rate": 1.1619166489323818e-06, + "loss": 0.0051, + "num_input_tokens_seen": 35003200, + "step": 165860 + }, + { + "epoch": 18.246974697469746, + "grad_norm": 3.3317503929138184, + "learning_rate": 1.1611935723330108e-06, + "loss": 0.1578, + "num_input_tokens_seen": 35004288, + "step": 165865 + }, + { + "epoch": 18.247524752475247, + "grad_norm": 0.6448934078216553, + "learning_rate": 1.160470715444481e-06, + "loss": 0.1065, + "num_input_tokens_seen": 35005312, + "step": 165870 + }, + { + "epoch": 18.248074807480748, + "grad_norm": 0.028920188546180725, + "learning_rate": 1.159748078273451e-06, + "loss": 0.0258, + "num_input_tokens_seen": 35006400, + "step": 165875 + }, + { + "epoch": 18.24862486248625, + "grad_norm": 0.06173182278871536, + "learning_rate": 1.1590256608265849e-06, + "loss": 0.0032, + "num_input_tokens_seen": 35007424, + "step": 165880 + }, + { + "epoch": 18.24917491749175, + "grad_norm": 0.004795255605131388, + "learning_rate": 1.1583034631105354e-06, + "loss": 0.0008, + "num_input_tokens_seen": 35008480, + "step": 165885 + }, + { + "epoch": 18.24972497249725, + "grad_norm": 4.03104305267334, + "learning_rate": 1.1575814851319644e-06, + "loss": 0.0406, + "num_input_tokens_seen": 35009504, + "step": 165890 + }, + { + "epoch": 18.25027502750275, + "grad_norm": 0.022715553641319275, + "learning_rate": 1.1568597268975272e-06, + "loss": 0.0127, + "num_input_tokens_seen": 35010528, + "step": 165895 + }, + { + "epoch": 18.25082508250825, + "grad_norm": 0.16116963326931, + "learning_rate": 1.1561381884138688e-06, + "loss": 0.0025, + "num_input_tokens_seen": 35011552, + "step": 165900 + }, + { + "epoch": 18.25137513751375, + "grad_norm": 0.033967144787311554, + "learning_rate": 1.155416869687645e-06, + "loss": 0.0479, + "num_input_tokens_seen": 35012640, + "step": 165905 + }, + { + "epoch": 18.251925192519252, + "grad_norm": 0.8873351216316223, + "learning_rate": 1.154695770725503e-06, + "loss": 0.0094, + "num_input_tokens_seen": 35013696, + "step": 165910 + }, + { + "epoch": 18.252475247524753, + "grad_norm": 2.6927056312561035, + "learning_rate": 1.1539748915340847e-06, + "loss": 0.0823, + "num_input_tokens_seen": 35014752, + "step": 165915 + }, + { + "epoch": 18.253025302530254, + "grad_norm": 0.0029862960800528526, + "learning_rate": 1.1532542321200407e-06, + "loss": 0.021, + "num_input_tokens_seen": 35015776, + "step": 165920 + }, + { + "epoch": 18.253575357535752, + "grad_norm": 0.12014643102884293, + "learning_rate": 1.152533792490007e-06, + "loss": 0.0024, + "num_input_tokens_seen": 35016832, + "step": 165925 + }, + { + "epoch": 18.254125412541253, + "grad_norm": 0.020448418334126472, + "learning_rate": 1.1518135726506312e-06, + "loss": 0.0039, + "num_input_tokens_seen": 35017856, + "step": 165930 + }, + { + "epoch": 18.254675467546754, + "grad_norm": 0.10192590951919556, + "learning_rate": 1.1510935726085414e-06, + "loss": 0.001, + "num_input_tokens_seen": 35019008, + "step": 165935 + }, + { + "epoch": 18.255225522552255, + "grad_norm": 0.27921611070632935, + "learning_rate": 1.1503737923703821e-06, + "loss": 0.0303, + "num_input_tokens_seen": 35020096, + "step": 165940 + }, + { + "epoch": 18.255775577557756, + "grad_norm": 0.11503547430038452, + "learning_rate": 1.1496542319427789e-06, + "loss": 0.0061, + "num_input_tokens_seen": 35021152, + "step": 165945 + }, + { + "epoch": 18.256325632563257, + "grad_norm": 0.0062410784885287285, + "learning_rate": 1.1489348913323706e-06, + "loss": 0.0008, + "num_input_tokens_seen": 35022240, + "step": 165950 + }, + { + "epoch": 18.25687568756876, + "grad_norm": 0.01526682823896408, + "learning_rate": 1.1482157705457852e-06, + "loss": 0.0009, + "num_input_tokens_seen": 35023328, + "step": 165955 + }, + { + "epoch": 18.257425742574256, + "grad_norm": 0.06038317829370499, + "learning_rate": 1.1474968695896482e-06, + "loss": 0.0047, + "num_input_tokens_seen": 35024416, + "step": 165960 + }, + { + "epoch": 18.257975797579757, + "grad_norm": 5.054842948913574, + "learning_rate": 1.1467781884705874e-06, + "loss": 0.027, + "num_input_tokens_seen": 35025472, + "step": 165965 + }, + { + "epoch": 18.258525852585258, + "grad_norm": 0.1163124367594719, + "learning_rate": 1.1460597271952257e-06, + "loss": 0.0036, + "num_input_tokens_seen": 35026528, + "step": 165970 + }, + { + "epoch": 18.25907590759076, + "grad_norm": 0.05189651623368263, + "learning_rate": 1.145341485770185e-06, + "loss": 0.0038, + "num_input_tokens_seen": 35027584, + "step": 165975 + }, + { + "epoch": 18.25962596259626, + "grad_norm": 0.004285746719688177, + "learning_rate": 1.144623464202088e-06, + "loss": 0.0079, + "num_input_tokens_seen": 35028576, + "step": 165980 + }, + { + "epoch": 18.26017601760176, + "grad_norm": 0.6299381852149963, + "learning_rate": 1.1439056624975465e-06, + "loss": 0.1114, + "num_input_tokens_seen": 35029600, + "step": 165985 + }, + { + "epoch": 18.260726072607262, + "grad_norm": 0.2223428189754486, + "learning_rate": 1.1431880806631822e-06, + "loss": 0.002, + "num_input_tokens_seen": 35030624, + "step": 165990 + }, + { + "epoch": 18.26127612761276, + "grad_norm": 0.05373813584446907, + "learning_rate": 1.1424707187056072e-06, + "loss": 0.0043, + "num_input_tokens_seen": 35031680, + "step": 165995 + }, + { + "epoch": 18.26182618261826, + "grad_norm": 0.910555362701416, + "learning_rate": 1.1417535766314268e-06, + "loss": 0.1191, + "num_input_tokens_seen": 35032736, + "step": 166000 + }, + { + "epoch": 18.262376237623762, + "grad_norm": 0.008206705562770367, + "learning_rate": 1.1410366544472556e-06, + "loss": 0.0015, + "num_input_tokens_seen": 35033760, + "step": 166005 + }, + { + "epoch": 18.262926292629263, + "grad_norm": 0.001960412133485079, + "learning_rate": 1.1403199521597047e-06, + "loss": 0.0007, + "num_input_tokens_seen": 35034784, + "step": 166010 + }, + { + "epoch": 18.263476347634764, + "grad_norm": 0.1334645301103592, + "learning_rate": 1.1396034697753717e-06, + "loss": 0.0061, + "num_input_tokens_seen": 35035840, + "step": 166015 + }, + { + "epoch": 18.264026402640265, + "grad_norm": 0.043393753468990326, + "learning_rate": 1.1388872073008704e-06, + "loss": 0.0375, + "num_input_tokens_seen": 35036928, + "step": 166020 + }, + { + "epoch": 18.264576457645763, + "grad_norm": 0.03436988964676857, + "learning_rate": 1.1381711647427907e-06, + "loss": 0.0246, + "num_input_tokens_seen": 35038080, + "step": 166025 + }, + { + "epoch": 18.265126512651264, + "grad_norm": 0.03187350183725357, + "learning_rate": 1.1374553421077349e-06, + "loss": 0.0041, + "num_input_tokens_seen": 35039232, + "step": 166030 + }, + { + "epoch": 18.265676567656765, + "grad_norm": 0.08407996594905853, + "learning_rate": 1.1367397394023094e-06, + "loss": 0.0074, + "num_input_tokens_seen": 35040256, + "step": 166035 + }, + { + "epoch": 18.266226622662266, + "grad_norm": 0.014095432125031948, + "learning_rate": 1.1360243566330975e-06, + "loss": 0.0028, + "num_input_tokens_seen": 35041376, + "step": 166040 + }, + { + "epoch": 18.266776677667767, + "grad_norm": 2.113807439804077, + "learning_rate": 1.1353091938067023e-06, + "loss": 0.1133, + "num_input_tokens_seen": 35042432, + "step": 166045 + }, + { + "epoch": 18.26732673267327, + "grad_norm": 0.149189293384552, + "learning_rate": 1.1345942509297076e-06, + "loss": 0.1209, + "num_input_tokens_seen": 35043456, + "step": 166050 + }, + { + "epoch": 18.26787678767877, + "grad_norm": 0.029556533321738243, + "learning_rate": 1.1338795280087106e-06, + "loss": 0.0023, + "num_input_tokens_seen": 35044480, + "step": 166055 + }, + { + "epoch": 18.268426842684267, + "grad_norm": 0.14488829672336578, + "learning_rate": 1.1331650250502895e-06, + "loss": 0.0061, + "num_input_tokens_seen": 35045568, + "step": 166060 + }, + { + "epoch": 18.268976897689768, + "grad_norm": 0.043185170739889145, + "learning_rate": 1.1324507420610336e-06, + "loss": 0.0092, + "num_input_tokens_seen": 35046592, + "step": 166065 + }, + { + "epoch": 18.26952695269527, + "grad_norm": 0.6583673357963562, + "learning_rate": 1.1317366790475292e-06, + "loss": 0.0494, + "num_input_tokens_seen": 35047648, + "step": 166070 + }, + { + "epoch": 18.27007700770077, + "grad_norm": 3.0474212169647217, + "learning_rate": 1.1310228360163544e-06, + "loss": 0.0341, + "num_input_tokens_seen": 35048672, + "step": 166075 + }, + { + "epoch": 18.27062706270627, + "grad_norm": 2.9614620208740234, + "learning_rate": 1.1303092129740872e-06, + "loss": 0.0803, + "num_input_tokens_seen": 35049728, + "step": 166080 + }, + { + "epoch": 18.271177117711773, + "grad_norm": 0.009716280736029148, + "learning_rate": 1.1295958099273057e-06, + "loss": 0.0822, + "num_input_tokens_seen": 35050816, + "step": 166085 + }, + { + "epoch": 18.27172717271727, + "grad_norm": 0.2515307068824768, + "learning_rate": 1.1288826268825852e-06, + "loss": 0.0085, + "num_input_tokens_seen": 35051904, + "step": 166090 + }, + { + "epoch": 18.27227722772277, + "grad_norm": 0.08571527153253555, + "learning_rate": 1.128169663846504e-06, + "loss": 0.0787, + "num_input_tokens_seen": 35053024, + "step": 166095 + }, + { + "epoch": 18.272827282728272, + "grad_norm": 0.7171400785446167, + "learning_rate": 1.1274569208256231e-06, + "loss": 0.0048, + "num_input_tokens_seen": 35054048, + "step": 166100 + }, + { + "epoch": 18.273377337733773, + "grad_norm": 0.13378643989562988, + "learning_rate": 1.1267443978265207e-06, + "loss": 0.0074, + "num_input_tokens_seen": 35055136, + "step": 166105 + }, + { + "epoch": 18.273927392739274, + "grad_norm": 0.06373202055692673, + "learning_rate": 1.1260320948557585e-06, + "loss": 0.0044, + "num_input_tokens_seen": 35056192, + "step": 166110 + }, + { + "epoch": 18.274477447744776, + "grad_norm": 2.3952245712280273, + "learning_rate": 1.1253200119199003e-06, + "loss": 0.0747, + "num_input_tokens_seen": 35057184, + "step": 166115 + }, + { + "epoch": 18.275027502750277, + "grad_norm": 0.13407570123672485, + "learning_rate": 1.1246081490255105e-06, + "loss": 0.0037, + "num_input_tokens_seen": 35058208, + "step": 166120 + }, + { + "epoch": 18.275577557755774, + "grad_norm": 0.12109658867120743, + "learning_rate": 1.1238965061791506e-06, + "loss": 0.082, + "num_input_tokens_seen": 35059296, + "step": 166125 + }, + { + "epoch": 18.276127612761275, + "grad_norm": 0.201150581240654, + "learning_rate": 1.1231850833873847e-06, + "loss": 0.0041, + "num_input_tokens_seen": 35060352, + "step": 166130 + }, + { + "epoch": 18.276677667766776, + "grad_norm": 0.06898123025894165, + "learning_rate": 1.122473880656763e-06, + "loss": 0.0703, + "num_input_tokens_seen": 35061376, + "step": 166135 + }, + { + "epoch": 18.277227722772277, + "grad_norm": 0.1957593709230423, + "learning_rate": 1.1217628979938388e-06, + "loss": 0.0531, + "num_input_tokens_seen": 35062464, + "step": 166140 + }, + { + "epoch": 18.27777777777778, + "grad_norm": 0.02118011750280857, + "learning_rate": 1.1210521354051707e-06, + "loss": 0.0387, + "num_input_tokens_seen": 35063520, + "step": 166145 + }, + { + "epoch": 18.27832783278328, + "grad_norm": 0.15096156299114227, + "learning_rate": 1.1203415928973033e-06, + "loss": 0.0469, + "num_input_tokens_seen": 35064640, + "step": 166150 + }, + { + "epoch": 18.278877887788777, + "grad_norm": 1.3040448427200317, + "learning_rate": 1.1196312704767952e-06, + "loss": 0.0178, + "num_input_tokens_seen": 35065664, + "step": 166155 + }, + { + "epoch": 18.27942794279428, + "grad_norm": 0.03806287795305252, + "learning_rate": 1.118921168150186e-06, + "loss": 0.0141, + "num_input_tokens_seen": 35066720, + "step": 166160 + }, + { + "epoch": 18.27997799779978, + "grad_norm": 0.03903118520975113, + "learning_rate": 1.1182112859240146e-06, + "loss": 0.0023, + "num_input_tokens_seen": 35067808, + "step": 166165 + }, + { + "epoch": 18.28052805280528, + "grad_norm": 0.0348641537129879, + "learning_rate": 1.117501623804837e-06, + "loss": 0.0041, + "num_input_tokens_seen": 35068832, + "step": 166170 + }, + { + "epoch": 18.28107810781078, + "grad_norm": 0.0019984785467386246, + "learning_rate": 1.1167921817991838e-06, + "loss": 0.0016, + "num_input_tokens_seen": 35069920, + "step": 166175 + }, + { + "epoch": 18.281628162816283, + "grad_norm": 0.10930381715297699, + "learning_rate": 1.1160829599135946e-06, + "loss": 0.0038, + "num_input_tokens_seen": 35071008, + "step": 166180 + }, + { + "epoch": 18.282178217821784, + "grad_norm": 0.15375091135501862, + "learning_rate": 1.1153739581546141e-06, + "loss": 0.044, + "num_input_tokens_seen": 35072128, + "step": 166185 + }, + { + "epoch": 18.28272827282728, + "grad_norm": 0.013175563886761665, + "learning_rate": 1.1146651765287648e-06, + "loss": 0.0026, + "num_input_tokens_seen": 35073184, + "step": 166190 + }, + { + "epoch": 18.283278327832782, + "grad_norm": 0.09766548126935959, + "learning_rate": 1.1139566150425885e-06, + "loss": 0.0039, + "num_input_tokens_seen": 35074240, + "step": 166195 + }, + { + "epoch": 18.283828382838283, + "grad_norm": 0.03179269656538963, + "learning_rate": 1.1132482737026162e-06, + "loss": 0.0023, + "num_input_tokens_seen": 35075296, + "step": 166200 + }, + { + "epoch": 18.284378437843785, + "grad_norm": 0.01483872253447771, + "learning_rate": 1.1125401525153623e-06, + "loss": 0.001, + "num_input_tokens_seen": 35076384, + "step": 166205 + }, + { + "epoch": 18.284928492849286, + "grad_norm": 0.06804005801677704, + "learning_rate": 1.1118322514873714e-06, + "loss": 0.004, + "num_input_tokens_seen": 35077472, + "step": 166210 + }, + { + "epoch": 18.285478547854787, + "grad_norm": 0.038243722170591354, + "learning_rate": 1.1111245706251578e-06, + "loss": 0.0144, + "num_input_tokens_seen": 35078528, + "step": 166215 + }, + { + "epoch": 18.286028602860284, + "grad_norm": 0.2821600139141083, + "learning_rate": 1.1104171099352467e-06, + "loss": 0.0055, + "num_input_tokens_seen": 35079552, + "step": 166220 + }, + { + "epoch": 18.286578657865785, + "grad_norm": 0.09699677675962448, + "learning_rate": 1.1097098694241582e-06, + "loss": 0.0025, + "num_input_tokens_seen": 35080608, + "step": 166225 + }, + { + "epoch": 18.287128712871286, + "grad_norm": 0.014948629774153233, + "learning_rate": 1.1090028490984089e-06, + "loss": 0.049, + "num_input_tokens_seen": 35081664, + "step": 166230 + }, + { + "epoch": 18.287678767876788, + "grad_norm": 0.0027460523415356874, + "learning_rate": 1.108296048964516e-06, + "loss": 0.0386, + "num_input_tokens_seen": 35082720, + "step": 166235 + }, + { + "epoch": 18.28822882288229, + "grad_norm": 0.039589881896972656, + "learning_rate": 1.107589469028994e-06, + "loss": 0.0042, + "num_input_tokens_seen": 35083776, + "step": 166240 + }, + { + "epoch": 18.28877887788779, + "grad_norm": 0.021097633987665176, + "learning_rate": 1.1068831092983567e-06, + "loss": 0.0252, + "num_input_tokens_seen": 35084864, + "step": 166245 + }, + { + "epoch": 18.28932893289329, + "grad_norm": 0.3711412847042084, + "learning_rate": 1.106176969779113e-06, + "loss": 0.0028, + "num_input_tokens_seen": 35085888, + "step": 166250 + }, + { + "epoch": 18.28987898789879, + "grad_norm": 0.028116492554545403, + "learning_rate": 1.1054710504777688e-06, + "loss": 0.005, + "num_input_tokens_seen": 35086912, + "step": 166255 + }, + { + "epoch": 18.29042904290429, + "grad_norm": 0.26405033469200134, + "learning_rate": 1.1047653514008355e-06, + "loss": 0.0042, + "num_input_tokens_seen": 35087936, + "step": 166260 + }, + { + "epoch": 18.29097909790979, + "grad_norm": 2.512803316116333, + "learning_rate": 1.1040598725548052e-06, + "loss": 0.066, + "num_input_tokens_seen": 35088992, + "step": 166265 + }, + { + "epoch": 18.29152915291529, + "grad_norm": 0.03882669657468796, + "learning_rate": 1.1033546139461975e-06, + "loss": 0.1157, + "num_input_tokens_seen": 35090144, + "step": 166270 + }, + { + "epoch": 18.292079207920793, + "grad_norm": 0.009045334532856941, + "learning_rate": 1.1026495755815046e-06, + "loss": 0.0033, + "num_input_tokens_seen": 35091232, + "step": 166275 + }, + { + "epoch": 18.292629262926294, + "grad_norm": 0.7341119050979614, + "learning_rate": 1.1019447574672187e-06, + "loss": 0.0092, + "num_input_tokens_seen": 35092224, + "step": 166280 + }, + { + "epoch": 18.293179317931795, + "grad_norm": 0.022209500893950462, + "learning_rate": 1.1012401596098426e-06, + "loss": 0.0043, + "num_input_tokens_seen": 35093216, + "step": 166285 + }, + { + "epoch": 18.293729372937293, + "grad_norm": 0.02571568265557289, + "learning_rate": 1.1005357820158656e-06, + "loss": 0.0018, + "num_input_tokens_seen": 35094336, + "step": 166290 + }, + { + "epoch": 18.294279427942794, + "grad_norm": 0.3247845768928528, + "learning_rate": 1.0998316246917828e-06, + "loss": 0.0541, + "num_input_tokens_seen": 35095360, + "step": 166295 + }, + { + "epoch": 18.294829482948295, + "grad_norm": 0.37910643219947815, + "learning_rate": 1.0991276876440887e-06, + "loss": 0.0036, + "num_input_tokens_seen": 35096448, + "step": 166300 + }, + { + "epoch": 18.295379537953796, + "grad_norm": 0.025256069377064705, + "learning_rate": 1.0984239708792588e-06, + "loss": 0.0709, + "num_input_tokens_seen": 35097504, + "step": 166305 + }, + { + "epoch": 18.295929592959297, + "grad_norm": 0.007233298383653164, + "learning_rate": 1.0977204744037906e-06, + "loss": 0.0119, + "num_input_tokens_seen": 35098560, + "step": 166310 + }, + { + "epoch": 18.296479647964798, + "grad_norm": 0.9127904772758484, + "learning_rate": 1.0970171982241624e-06, + "loss": 0.0181, + "num_input_tokens_seen": 35099680, + "step": 166315 + }, + { + "epoch": 18.297029702970296, + "grad_norm": 0.02753666415810585, + "learning_rate": 1.0963141423468575e-06, + "loss": 0.0029, + "num_input_tokens_seen": 35100736, + "step": 166320 + }, + { + "epoch": 18.297579757975797, + "grad_norm": 0.015338655561208725, + "learning_rate": 1.0956113067783547e-06, + "loss": 0.0316, + "num_input_tokens_seen": 35101760, + "step": 166325 + }, + { + "epoch": 18.298129812981298, + "grad_norm": 0.06680811196565628, + "learning_rate": 1.0949086915251288e-06, + "loss": 0.0024, + "num_input_tokens_seen": 35102816, + "step": 166330 + }, + { + "epoch": 18.2986798679868, + "grad_norm": 0.016967562958598137, + "learning_rate": 1.094206296593664e-06, + "loss": 0.0039, + "num_input_tokens_seen": 35103840, + "step": 166335 + }, + { + "epoch": 18.2992299229923, + "grad_norm": 0.09777634590864182, + "learning_rate": 1.093504121990424e-06, + "loss": 0.0068, + "num_input_tokens_seen": 35104928, + "step": 166340 + }, + { + "epoch": 18.2997799779978, + "grad_norm": 0.0050690555945038795, + "learning_rate": 1.0928021677218903e-06, + "loss": 0.0039, + "num_input_tokens_seen": 35106080, + "step": 166345 + }, + { + "epoch": 18.300330033003302, + "grad_norm": 0.02842104434967041, + "learning_rate": 1.0921004337945212e-06, + "loss": 0.098, + "num_input_tokens_seen": 35107168, + "step": 166350 + }, + { + "epoch": 18.3008800880088, + "grad_norm": 0.16875427961349487, + "learning_rate": 1.0913989202147922e-06, + "loss": 0.0019, + "num_input_tokens_seen": 35108192, + "step": 166355 + }, + { + "epoch": 18.3014301430143, + "grad_norm": 2.507783889770508, + "learning_rate": 1.0906976269891706e-06, + "loss": 0.1694, + "num_input_tokens_seen": 35109248, + "step": 166360 + }, + { + "epoch": 18.301980198019802, + "grad_norm": 0.014002717100083828, + "learning_rate": 1.0899965541241119e-06, + "loss": 0.0015, + "num_input_tokens_seen": 35110304, + "step": 166365 + }, + { + "epoch": 18.302530253025303, + "grad_norm": 0.7831909656524658, + "learning_rate": 1.0892957016260836e-06, + "loss": 0.0116, + "num_input_tokens_seen": 35111328, + "step": 166370 + }, + { + "epoch": 18.303080308030804, + "grad_norm": 0.04411373287439346, + "learning_rate": 1.0885950695015467e-06, + "loss": 0.002, + "num_input_tokens_seen": 35112384, + "step": 166375 + }, + { + "epoch": 18.303630363036305, + "grad_norm": 0.12202242761850357, + "learning_rate": 1.0878946577569465e-06, + "loss": 0.0031, + "num_input_tokens_seen": 35113472, + "step": 166380 + }, + { + "epoch": 18.304180418041803, + "grad_norm": 0.09882645308971405, + "learning_rate": 1.0871944663987526e-06, + "loss": 0.0097, + "num_input_tokens_seen": 35114496, + "step": 166385 + }, + { + "epoch": 18.304730473047304, + "grad_norm": 0.19083045423030853, + "learning_rate": 1.0864944954334155e-06, + "loss": 0.0303, + "num_input_tokens_seen": 35115520, + "step": 166390 + }, + { + "epoch": 18.305280528052805, + "grad_norm": 0.2287643998861313, + "learning_rate": 1.08579474486738e-06, + "loss": 0.006, + "num_input_tokens_seen": 35116608, + "step": 166395 + }, + { + "epoch": 18.305830583058306, + "grad_norm": 0.008700860664248466, + "learning_rate": 1.085095214707102e-06, + "loss": 0.0016, + "num_input_tokens_seen": 35117664, + "step": 166400 + }, + { + "epoch": 18.306380638063807, + "grad_norm": 0.18224206566810608, + "learning_rate": 1.0843959049590208e-06, + "loss": 0.0133, + "num_input_tokens_seen": 35118720, + "step": 166405 + }, + { + "epoch": 18.306930693069308, + "grad_norm": 0.028592156246304512, + "learning_rate": 1.0836968156295895e-06, + "loss": 0.0023, + "num_input_tokens_seen": 35119776, + "step": 166410 + }, + { + "epoch": 18.30748074807481, + "grad_norm": 0.059060610830783844, + "learning_rate": 1.0829979467252504e-06, + "loss": 0.005, + "num_input_tokens_seen": 35120832, + "step": 166415 + }, + { + "epoch": 18.308030803080307, + "grad_norm": 0.01801321655511856, + "learning_rate": 1.08229929825244e-06, + "loss": 0.054, + "num_input_tokens_seen": 35121856, + "step": 166420 + }, + { + "epoch": 18.308580858085808, + "grad_norm": 0.13909125328063965, + "learning_rate": 1.081600870217603e-06, + "loss": 0.0058, + "num_input_tokens_seen": 35122912, + "step": 166425 + }, + { + "epoch": 18.30913091309131, + "grad_norm": 0.054257333278656006, + "learning_rate": 1.0809026626271701e-06, + "loss": 0.1691, + "num_input_tokens_seen": 35123968, + "step": 166430 + }, + { + "epoch": 18.30968096809681, + "grad_norm": 1.6187869310379028, + "learning_rate": 1.0802046754875839e-06, + "loss": 0.1019, + "num_input_tokens_seen": 35125024, + "step": 166435 + }, + { + "epoch": 18.31023102310231, + "grad_norm": 0.01303129643201828, + "learning_rate": 1.0795069088052694e-06, + "loss": 0.0036, + "num_input_tokens_seen": 35126048, + "step": 166440 + }, + { + "epoch": 18.310781078107812, + "grad_norm": 0.5525730848312378, + "learning_rate": 1.0788093625866608e-06, + "loss": 0.0102, + "num_input_tokens_seen": 35127104, + "step": 166445 + }, + { + "epoch": 18.31133113311331, + "grad_norm": 2.3927652835845947, + "learning_rate": 1.0781120368381913e-06, + "loss": 0.1318, + "num_input_tokens_seen": 35128224, + "step": 166450 + }, + { + "epoch": 18.31188118811881, + "grad_norm": 0.046280283480882645, + "learning_rate": 1.077414931566284e-06, + "loss": 0.0029, + "num_input_tokens_seen": 35129280, + "step": 166455 + }, + { + "epoch": 18.312431243124312, + "grad_norm": 0.00712664844468236, + "learning_rate": 1.076718046777364e-06, + "loss": 0.0073, + "num_input_tokens_seen": 35130368, + "step": 166460 + }, + { + "epoch": 18.312981298129813, + "grad_norm": 2.619215250015259, + "learning_rate": 1.0760213824778515e-06, + "loss": 0.0638, + "num_input_tokens_seen": 35131456, + "step": 166465 + }, + { + "epoch": 18.313531353135314, + "grad_norm": 0.018005261197686195, + "learning_rate": 1.0753249386741715e-06, + "loss": 0.0156, + "num_input_tokens_seen": 35132576, + "step": 166470 + }, + { + "epoch": 18.314081408140815, + "grad_norm": 0.03327411785721779, + "learning_rate": 1.0746287153727442e-06, + "loss": 0.0091, + "num_input_tokens_seen": 35133632, + "step": 166475 + }, + { + "epoch": 18.314631463146316, + "grad_norm": 0.23407478630542755, + "learning_rate": 1.073932712579981e-06, + "loss": 0.0031, + "num_input_tokens_seen": 35134656, + "step": 166480 + }, + { + "epoch": 18.315181518151814, + "grad_norm": 0.030719846487045288, + "learning_rate": 1.0732369303023015e-06, + "loss": 0.005, + "num_input_tokens_seen": 35135680, + "step": 166485 + }, + { + "epoch": 18.315731573157315, + "grad_norm": 0.0062553309835493565, + "learning_rate": 1.072541368546115e-06, + "loss": 0.0483, + "num_input_tokens_seen": 35136800, + "step": 166490 + }, + { + "epoch": 18.316281628162816, + "grad_norm": 0.4389220178127289, + "learning_rate": 1.0718460273178299e-06, + "loss": 0.0058, + "num_input_tokens_seen": 35137856, + "step": 166495 + }, + { + "epoch": 18.316831683168317, + "grad_norm": 0.45104798674583435, + "learning_rate": 1.0711509066238607e-06, + "loss": 0.0047, + "num_input_tokens_seen": 35138880, + "step": 166500 + }, + { + "epoch": 18.317381738173818, + "grad_norm": 0.20783191919326782, + "learning_rate": 1.0704560064706105e-06, + "loss": 0.0037, + "num_input_tokens_seen": 35140000, + "step": 166505 + }, + { + "epoch": 18.31793179317932, + "grad_norm": 0.2369833141565323, + "learning_rate": 1.0697613268644852e-06, + "loss": 0.0026, + "num_input_tokens_seen": 35141024, + "step": 166510 + }, + { + "epoch": 18.318481848184817, + "grad_norm": 0.03579409793019295, + "learning_rate": 1.0690668678118909e-06, + "loss": 0.0011, + "num_input_tokens_seen": 35142080, + "step": 166515 + }, + { + "epoch": 18.319031903190318, + "grad_norm": 0.0407845638692379, + "learning_rate": 1.068372629319217e-06, + "loss": 0.0112, + "num_input_tokens_seen": 35143072, + "step": 166520 + }, + { + "epoch": 18.31958195819582, + "grad_norm": 0.006243912968784571, + "learning_rate": 1.067678611392872e-06, + "loss": 0.0349, + "num_input_tokens_seen": 35144096, + "step": 166525 + }, + { + "epoch": 18.32013201320132, + "grad_norm": 0.7222011685371399, + "learning_rate": 1.0669848140392486e-06, + "loss": 0.0765, + "num_input_tokens_seen": 35145120, + "step": 166530 + }, + { + "epoch": 18.32068206820682, + "grad_norm": 0.3166934847831726, + "learning_rate": 1.0662912372647437e-06, + "loss": 0.0044, + "num_input_tokens_seen": 35146144, + "step": 166535 + }, + { + "epoch": 18.321232123212322, + "grad_norm": 0.24714817106723785, + "learning_rate": 1.06559788107575e-06, + "loss": 0.0025, + "num_input_tokens_seen": 35147232, + "step": 166540 + }, + { + "epoch": 18.321782178217823, + "grad_norm": 0.019614964723587036, + "learning_rate": 1.0649047454786538e-06, + "loss": 0.1319, + "num_input_tokens_seen": 35148288, + "step": 166545 + }, + { + "epoch": 18.32233223322332, + "grad_norm": 0.0075294263660907745, + "learning_rate": 1.0642118304798442e-06, + "loss": 0.013, + "num_input_tokens_seen": 35149408, + "step": 166550 + }, + { + "epoch": 18.322882288228822, + "grad_norm": 0.019651127979159355, + "learning_rate": 1.0635191360857083e-06, + "loss": 0.0371, + "num_input_tokens_seen": 35150496, + "step": 166555 + }, + { + "epoch": 18.323432343234323, + "grad_norm": 0.10427361726760864, + "learning_rate": 1.0628266623026323e-06, + "loss": 0.0253, + "num_input_tokens_seen": 35151520, + "step": 166560 + }, + { + "epoch": 18.323982398239824, + "grad_norm": 0.010099577717483044, + "learning_rate": 1.0621344091369972e-06, + "loss": 0.0887, + "num_input_tokens_seen": 35152576, + "step": 166565 + }, + { + "epoch": 18.324532453245325, + "grad_norm": 0.10029688477516174, + "learning_rate": 1.0614423765951786e-06, + "loss": 0.0061, + "num_input_tokens_seen": 35153632, + "step": 166570 + }, + { + "epoch": 18.325082508250826, + "grad_norm": 0.017958134412765503, + "learning_rate": 1.0607505646835659e-06, + "loss": 0.0006, + "num_input_tokens_seen": 35154720, + "step": 166575 + }, + { + "epoch": 18.325632563256324, + "grad_norm": 0.016213351860642433, + "learning_rate": 1.060058973408523e-06, + "loss": 0.0541, + "num_input_tokens_seen": 35155744, + "step": 166580 + }, + { + "epoch": 18.326182618261825, + "grad_norm": 0.0963866263628006, + "learning_rate": 1.0593676027764287e-06, + "loss": 0.0022, + "num_input_tokens_seen": 35156768, + "step": 166585 + }, + { + "epoch": 18.326732673267326, + "grad_norm": 0.02714371494948864, + "learning_rate": 1.0586764527936583e-06, + "loss": 0.0017, + "num_input_tokens_seen": 35157824, + "step": 166590 + }, + { + "epoch": 18.327282728272827, + "grad_norm": 0.049843594431877136, + "learning_rate": 1.0579855234665758e-06, + "loss": 0.0024, + "num_input_tokens_seen": 35158880, + "step": 166595 + }, + { + "epoch": 18.32783278327833, + "grad_norm": 0.021809719502925873, + "learning_rate": 1.0572948148015543e-06, + "loss": 0.0428, + "num_input_tokens_seen": 35159936, + "step": 166600 + }, + { + "epoch": 18.32838283828383, + "grad_norm": 0.026486987248063087, + "learning_rate": 1.0566043268049608e-06, + "loss": 0.0043, + "num_input_tokens_seen": 35160992, + "step": 166605 + }, + { + "epoch": 18.32893289328933, + "grad_norm": 0.32089003920555115, + "learning_rate": 1.0559140594831512e-06, + "loss": 0.0084, + "num_input_tokens_seen": 35162016, + "step": 166610 + }, + { + "epoch": 18.329482948294828, + "grad_norm": 0.041565556079149246, + "learning_rate": 1.05522401284249e-06, + "loss": 0.0008, + "num_input_tokens_seen": 35163040, + "step": 166615 + }, + { + "epoch": 18.33003300330033, + "grad_norm": 0.03678906336426735, + "learning_rate": 1.0545341868893416e-06, + "loss": 0.0059, + "num_input_tokens_seen": 35164096, + "step": 166620 + }, + { + "epoch": 18.33058305830583, + "grad_norm": 2.8005199432373047, + "learning_rate": 1.0538445816300646e-06, + "loss": 0.0948, + "num_input_tokens_seen": 35165216, + "step": 166625 + }, + { + "epoch": 18.33113311331133, + "grad_norm": 0.07427207380533218, + "learning_rate": 1.0531551970710124e-06, + "loss": 0.0263, + "num_input_tokens_seen": 35166272, + "step": 166630 + }, + { + "epoch": 18.331683168316832, + "grad_norm": 0.020344754680991173, + "learning_rate": 1.0524660332185326e-06, + "loss": 0.0013, + "num_input_tokens_seen": 35167328, + "step": 166635 + }, + { + "epoch": 18.332233223322334, + "grad_norm": 0.016901744529604912, + "learning_rate": 1.0517770900789843e-06, + "loss": 0.0846, + "num_input_tokens_seen": 35168416, + "step": 166640 + }, + { + "epoch": 18.33278327832783, + "grad_norm": 0.034462302923202515, + "learning_rate": 1.0510883676587147e-06, + "loss": 0.0459, + "num_input_tokens_seen": 35169472, + "step": 166645 + }, + { + "epoch": 18.333333333333332, + "grad_norm": 0.011323507875204086, + "learning_rate": 1.0503998659640747e-06, + "loss": 0.1463, + "num_input_tokens_seen": 35170496, + "step": 166650 + }, + { + "epoch": 18.333883388338833, + "grad_norm": 0.07027214020490646, + "learning_rate": 1.0497115850014062e-06, + "loss": 0.0269, + "num_input_tokens_seen": 35171520, + "step": 166655 + }, + { + "epoch": 18.334433443344334, + "grad_norm": 0.019805701449513435, + "learning_rate": 1.0490235247770513e-06, + "loss": 0.0022, + "num_input_tokens_seen": 35172544, + "step": 166660 + }, + { + "epoch": 18.334983498349835, + "grad_norm": 0.4705648422241211, + "learning_rate": 1.0483356852973553e-06, + "loss": 0.0576, + "num_input_tokens_seen": 35173632, + "step": 166665 + }, + { + "epoch": 18.335533553355337, + "grad_norm": 0.041349586099386215, + "learning_rate": 1.0476480665686545e-06, + "loss": 0.1589, + "num_input_tokens_seen": 35174624, + "step": 166670 + }, + { + "epoch": 18.336083608360838, + "grad_norm": 0.03279103711247444, + "learning_rate": 1.0469606685972882e-06, + "loss": 0.0118, + "num_input_tokens_seen": 35175712, + "step": 166675 + }, + { + "epoch": 18.336633663366335, + "grad_norm": 0.301395982503891, + "learning_rate": 1.0462734913895934e-06, + "loss": 0.0779, + "num_input_tokens_seen": 35176768, + "step": 166680 + }, + { + "epoch": 18.337183718371836, + "grad_norm": 0.0548107884824276, + "learning_rate": 1.0455865349519006e-06, + "loss": 0.0015, + "num_input_tokens_seen": 35177856, + "step": 166685 + }, + { + "epoch": 18.337733773377337, + "grad_norm": 0.012355681508779526, + "learning_rate": 1.0448997992905441e-06, + "loss": 0.002, + "num_input_tokens_seen": 35178848, + "step": 166690 + }, + { + "epoch": 18.33828382838284, + "grad_norm": 0.2370024472475052, + "learning_rate": 1.044213284411849e-06, + "loss": 0.1, + "num_input_tokens_seen": 35179936, + "step": 166695 + }, + { + "epoch": 18.33883388338834, + "grad_norm": 0.01660301350057125, + "learning_rate": 1.043526990322144e-06, + "loss": 0.002, + "num_input_tokens_seen": 35180992, + "step": 166700 + }, + { + "epoch": 18.33938393839384, + "grad_norm": 0.0084525216370821, + "learning_rate": 1.042840917027757e-06, + "loss": 0.0011, + "num_input_tokens_seen": 35181984, + "step": 166705 + }, + { + "epoch": 18.33993399339934, + "grad_norm": 0.020665843039751053, + "learning_rate": 1.042155064535008e-06, + "loss": 0.1735, + "num_input_tokens_seen": 35182944, + "step": 166710 + }, + { + "epoch": 18.34048404840484, + "grad_norm": 0.03649279847741127, + "learning_rate": 1.0414694328502256e-06, + "loss": 0.0191, + "num_input_tokens_seen": 35184032, + "step": 166715 + }, + { + "epoch": 18.34103410341034, + "grad_norm": 1.7705092430114746, + "learning_rate": 1.0407840219797154e-06, + "loss": 0.0988, + "num_input_tokens_seen": 35185056, + "step": 166720 + }, + { + "epoch": 18.34158415841584, + "grad_norm": 0.038803327828645706, + "learning_rate": 1.040098831929809e-06, + "loss": 0.0461, + "num_input_tokens_seen": 35186112, + "step": 166725 + }, + { + "epoch": 18.342134213421343, + "grad_norm": 0.04206390306353569, + "learning_rate": 1.039413862706809e-06, + "loss": 0.0019, + "num_input_tokens_seen": 35187136, + "step": 166730 + }, + { + "epoch": 18.342684268426844, + "grad_norm": 0.0911054015159607, + "learning_rate": 1.038729114317033e-06, + "loss": 0.0029, + "num_input_tokens_seen": 35188128, + "step": 166735 + }, + { + "epoch": 18.343234323432345, + "grad_norm": 0.8235071301460266, + "learning_rate": 1.0380445867667982e-06, + "loss": 0.0092, + "num_input_tokens_seen": 35189152, + "step": 166740 + }, + { + "epoch": 18.343784378437842, + "grad_norm": 2.7929396629333496, + "learning_rate": 1.0373602800624049e-06, + "loss": 0.0697, + "num_input_tokens_seen": 35190176, + "step": 166745 + }, + { + "epoch": 18.344334433443343, + "grad_norm": 0.2939825654029846, + "learning_rate": 1.0366761942101677e-06, + "loss": 0.1399, + "num_input_tokens_seen": 35191232, + "step": 166750 + }, + { + "epoch": 18.344884488448844, + "grad_norm": 0.27388498187065125, + "learning_rate": 1.0359923292163815e-06, + "loss": 0.0719, + "num_input_tokens_seen": 35192320, + "step": 166755 + }, + { + "epoch": 18.345434543454346, + "grad_norm": 0.12542709708213806, + "learning_rate": 1.0353086850873578e-06, + "loss": 0.0027, + "num_input_tokens_seen": 35193376, + "step": 166760 + }, + { + "epoch": 18.345984598459847, + "grad_norm": 0.003982010297477245, + "learning_rate": 1.0346252618293971e-06, + "loss": 0.0012, + "num_input_tokens_seen": 35194464, + "step": 166765 + }, + { + "epoch": 18.346534653465348, + "grad_norm": 0.0547553189098835, + "learning_rate": 1.033942059448792e-06, + "loss": 0.0046, + "num_input_tokens_seen": 35195552, + "step": 166770 + }, + { + "epoch": 18.34708470847085, + "grad_norm": 0.022698530927300453, + "learning_rate": 1.0332590779518452e-06, + "loss": 0.0009, + "num_input_tokens_seen": 35196576, + "step": 166775 + }, + { + "epoch": 18.347634763476346, + "grad_norm": 0.04130934923887253, + "learning_rate": 1.0325763173448494e-06, + "loss": 0.0199, + "num_input_tokens_seen": 35197632, + "step": 166780 + }, + { + "epoch": 18.348184818481847, + "grad_norm": 0.008270606398582458, + "learning_rate": 1.0318937776340937e-06, + "loss": 0.0011, + "num_input_tokens_seen": 35198720, + "step": 166785 + }, + { + "epoch": 18.34873487348735, + "grad_norm": 0.008896755054593086, + "learning_rate": 1.031211458825873e-06, + "loss": 0.0617, + "num_input_tokens_seen": 35199840, + "step": 166790 + }, + { + "epoch": 18.34928492849285, + "grad_norm": 0.024950269609689713, + "learning_rate": 1.0305293609264771e-06, + "loss": 0.105, + "num_input_tokens_seen": 35200864, + "step": 166795 + }, + { + "epoch": 18.34983498349835, + "grad_norm": 0.02418488636612892, + "learning_rate": 1.0298474839421895e-06, + "loss": 0.037, + "num_input_tokens_seen": 35201920, + "step": 166800 + }, + { + "epoch": 18.350385038503852, + "grad_norm": 0.21608012914657593, + "learning_rate": 1.0291658278792971e-06, + "loss": 0.1164, + "num_input_tokens_seen": 35203040, + "step": 166805 + }, + { + "epoch": 18.35093509350935, + "grad_norm": 0.02408234030008316, + "learning_rate": 1.0284843927440751e-06, + "loss": 0.0036, + "num_input_tokens_seen": 35204096, + "step": 166810 + }, + { + "epoch": 18.35148514851485, + "grad_norm": 0.01654979959130287, + "learning_rate": 1.0278031785428132e-06, + "loss": 0.1165, + "num_input_tokens_seen": 35205216, + "step": 166815 + }, + { + "epoch": 18.35203520352035, + "grad_norm": 0.10002925246953964, + "learning_rate": 1.0271221852817896e-06, + "loss": 0.0167, + "num_input_tokens_seen": 35206272, + "step": 166820 + }, + { + "epoch": 18.352585258525853, + "grad_norm": 0.10956770181655884, + "learning_rate": 1.0264414129672745e-06, + "loss": 0.0016, + "num_input_tokens_seen": 35207424, + "step": 166825 + }, + { + "epoch": 18.353135313531354, + "grad_norm": 0.007209957577288151, + "learning_rate": 1.0257608616055458e-06, + "loss": 0.0023, + "num_input_tokens_seen": 35208448, + "step": 166830 + }, + { + "epoch": 18.353685368536855, + "grad_norm": 0.007830333895981312, + "learning_rate": 1.0250805312028739e-06, + "loss": 0.1072, + "num_input_tokens_seen": 35209504, + "step": 166835 + }, + { + "epoch": 18.354235423542356, + "grad_norm": 0.019316498190164566, + "learning_rate": 1.024400421765534e-06, + "loss": 0.0009, + "num_input_tokens_seen": 35210592, + "step": 166840 + }, + { + "epoch": 18.354785478547853, + "grad_norm": 0.07813102006912231, + "learning_rate": 1.023720533299788e-06, + "loss": 0.0022, + "num_input_tokens_seen": 35211584, + "step": 166845 + }, + { + "epoch": 18.355335533553355, + "grad_norm": 0.02933388389647007, + "learning_rate": 1.0230408658119028e-06, + "loss": 0.0014, + "num_input_tokens_seen": 35212640, + "step": 166850 + }, + { + "epoch": 18.355885588558856, + "grad_norm": 0.708926796913147, + "learning_rate": 1.0223614193081515e-06, + "loss": 0.0869, + "num_input_tokens_seen": 35213696, + "step": 166855 + }, + { + "epoch": 18.356435643564357, + "grad_norm": 0.2043607532978058, + "learning_rate": 1.0216821937947818e-06, + "loss": 0.0119, + "num_input_tokens_seen": 35214752, + "step": 166860 + }, + { + "epoch": 18.356985698569858, + "grad_norm": 0.28261709213256836, + "learning_rate": 1.0210031892780664e-06, + "loss": 0.0886, + "num_input_tokens_seen": 35215744, + "step": 166865 + }, + { + "epoch": 18.35753575357536, + "grad_norm": 0.018582668155431747, + "learning_rate": 1.0203244057642557e-06, + "loss": 0.0015, + "num_input_tokens_seen": 35216800, + "step": 166870 + }, + { + "epoch": 18.358085808580856, + "grad_norm": 0.13941897451877594, + "learning_rate": 1.019645843259609e-06, + "loss": 0.0571, + "num_input_tokens_seen": 35217920, + "step": 166875 + }, + { + "epoch": 18.358635863586358, + "grad_norm": 0.01076181698590517, + "learning_rate": 1.018967501770382e-06, + "loss": 0.008, + "num_input_tokens_seen": 35218880, + "step": 166880 + }, + { + "epoch": 18.35918591859186, + "grad_norm": 0.024830035865306854, + "learning_rate": 1.01828938130282e-06, + "loss": 0.0059, + "num_input_tokens_seen": 35219968, + "step": 166885 + }, + { + "epoch": 18.35973597359736, + "grad_norm": 0.01462364662438631, + "learning_rate": 1.0176114818631815e-06, + "loss": 0.0535, + "num_input_tokens_seen": 35221088, + "step": 166890 + }, + { + "epoch": 18.36028602860286, + "grad_norm": 0.008493665605783463, + "learning_rate": 1.016933803457712e-06, + "loss": 0.0985, + "num_input_tokens_seen": 35222144, + "step": 166895 + }, + { + "epoch": 18.360836083608362, + "grad_norm": 0.05199991166591644, + "learning_rate": 1.0162563460926505e-06, + "loss": 0.0232, + "num_input_tokens_seen": 35223136, + "step": 166900 + }, + { + "epoch": 18.361386138613863, + "grad_norm": 0.5905939936637878, + "learning_rate": 1.015579109774245e-06, + "loss": 0.0217, + "num_input_tokens_seen": 35224192, + "step": 166905 + }, + { + "epoch": 18.36193619361936, + "grad_norm": 0.028999140486121178, + "learning_rate": 1.014902094508738e-06, + "loss": 0.0438, + "num_input_tokens_seen": 35225216, + "step": 166910 + }, + { + "epoch": 18.36248624862486, + "grad_norm": 0.0515727624297142, + "learning_rate": 1.0142253003023739e-06, + "loss": 0.076, + "num_input_tokens_seen": 35226336, + "step": 166915 + }, + { + "epoch": 18.363036303630363, + "grad_norm": 1.8137335777282715, + "learning_rate": 1.013548727161384e-06, + "loss": 0.0248, + "num_input_tokens_seen": 35227392, + "step": 166920 + }, + { + "epoch": 18.363586358635864, + "grad_norm": 0.1666201800107956, + "learning_rate": 1.0128723750920027e-06, + "loss": 0.0026, + "num_input_tokens_seen": 35228416, + "step": 166925 + }, + { + "epoch": 18.364136413641365, + "grad_norm": 0.0426940880715847, + "learning_rate": 1.012196244100469e-06, + "loss": 0.0379, + "num_input_tokens_seen": 35229472, + "step": 166930 + }, + { + "epoch": 18.364686468646866, + "grad_norm": 0.7793228030204773, + "learning_rate": 1.0115203341930113e-06, + "loss": 0.0057, + "num_input_tokens_seen": 35230560, + "step": 166935 + }, + { + "epoch": 18.365236523652364, + "grad_norm": 0.02700096368789673, + "learning_rate": 1.0108446453758607e-06, + "loss": 0.0876, + "num_input_tokens_seen": 35231584, + "step": 166940 + }, + { + "epoch": 18.365786578657865, + "grad_norm": 2.3483517169952393, + "learning_rate": 1.0101691776552458e-06, + "loss": 0.0528, + "num_input_tokens_seen": 35232608, + "step": 166945 + }, + { + "epoch": 18.366336633663366, + "grad_norm": 0.03815969452261925, + "learning_rate": 1.0094939310373863e-06, + "loss": 0.0059, + "num_input_tokens_seen": 35233728, + "step": 166950 + }, + { + "epoch": 18.366886688668867, + "grad_norm": 0.0711260586977005, + "learning_rate": 1.0088189055285135e-06, + "loss": 0.0664, + "num_input_tokens_seen": 35234752, + "step": 166955 + }, + { + "epoch": 18.367436743674368, + "grad_norm": 0.01049906387925148, + "learning_rate": 1.008144101134839e-06, + "loss": 0.0027, + "num_input_tokens_seen": 35235840, + "step": 166960 + }, + { + "epoch": 18.36798679867987, + "grad_norm": 1.5676240921020508, + "learning_rate": 1.0074695178625914e-06, + "loss": 0.0523, + "num_input_tokens_seen": 35236928, + "step": 166965 + }, + { + "epoch": 18.36853685368537, + "grad_norm": 0.09300018846988678, + "learning_rate": 1.006795155717985e-06, + "loss": 0.0225, + "num_input_tokens_seen": 35237984, + "step": 166970 + }, + { + "epoch": 18.369086908690868, + "grad_norm": 0.02830621227622032, + "learning_rate": 1.0061210147072314e-06, + "loss": 0.0024, + "num_input_tokens_seen": 35239008, + "step": 166975 + }, + { + "epoch": 18.36963696369637, + "grad_norm": 0.02397163212299347, + "learning_rate": 1.0054470948365535e-06, + "loss": 0.0011, + "num_input_tokens_seen": 35240096, + "step": 166980 + }, + { + "epoch": 18.37018701870187, + "grad_norm": 0.042611271142959595, + "learning_rate": 1.004773396112152e-06, + "loss": 0.0837, + "num_input_tokens_seen": 35241120, + "step": 166985 + }, + { + "epoch": 18.37073707370737, + "grad_norm": 0.01198519580066204, + "learning_rate": 1.0040999185402355e-06, + "loss": 0.0014, + "num_input_tokens_seen": 35242208, + "step": 166990 + }, + { + "epoch": 18.371287128712872, + "grad_norm": 0.19069159030914307, + "learning_rate": 1.0034266621270216e-06, + "loss": 0.0579, + "num_input_tokens_seen": 35243264, + "step": 166995 + }, + { + "epoch": 18.371837183718373, + "grad_norm": 0.016527293249964714, + "learning_rate": 1.002753626878708e-06, + "loss": 0.0369, + "num_input_tokens_seen": 35244320, + "step": 167000 + }, + { + "epoch": 18.37238723872387, + "grad_norm": 2.6457953453063965, + "learning_rate": 1.0020808128015009e-06, + "loss": 0.0455, + "num_input_tokens_seen": 35245440, + "step": 167005 + }, + { + "epoch": 18.372937293729372, + "grad_norm": 1.59514582157135, + "learning_rate": 1.0014082199016006e-06, + "loss": 0.0598, + "num_input_tokens_seen": 35246496, + "step": 167010 + }, + { + "epoch": 18.373487348734873, + "grad_norm": 0.15123379230499268, + "learning_rate": 1.0007358481851998e-06, + "loss": 0.0066, + "num_input_tokens_seen": 35247552, + "step": 167015 + }, + { + "epoch": 18.374037403740374, + "grad_norm": 0.4875313341617584, + "learning_rate": 1.0000636976585043e-06, + "loss": 0.0054, + "num_input_tokens_seen": 35248608, + "step": 167020 + }, + { + "epoch": 18.374587458745875, + "grad_norm": 1.188515067100525, + "learning_rate": 9.99391768327701e-07, + "loss": 0.0762, + "num_input_tokens_seen": 35249600, + "step": 167025 + }, + { + "epoch": 18.375137513751376, + "grad_norm": 3.1470608711242676, + "learning_rate": 9.987200601989932e-07, + "loss": 0.0684, + "num_input_tokens_seen": 35250624, + "step": 167030 + }, + { + "epoch": 18.375687568756877, + "grad_norm": 0.04783688113093376, + "learning_rate": 9.98048573278565e-07, + "loss": 0.002, + "num_input_tokens_seen": 35251712, + "step": 167035 + }, + { + "epoch": 18.376237623762375, + "grad_norm": 1.771085262298584, + "learning_rate": 9.973773075726029e-07, + "loss": 0.1122, + "num_input_tokens_seen": 35252800, + "step": 167040 + }, + { + "epoch": 18.376787678767876, + "grad_norm": 0.019100923091173172, + "learning_rate": 9.967062630872991e-07, + "loss": 0.008, + "num_input_tokens_seen": 35253824, + "step": 167045 + }, + { + "epoch": 18.377337733773377, + "grad_norm": 0.03398611769080162, + "learning_rate": 9.960354398288296e-07, + "loss": 0.0843, + "num_input_tokens_seen": 35254848, + "step": 167050 + }, + { + "epoch": 18.377887788778878, + "grad_norm": 0.02730836160480976, + "learning_rate": 9.95364837803392e-07, + "loss": 0.0703, + "num_input_tokens_seen": 35255872, + "step": 167055 + }, + { + "epoch": 18.37843784378438, + "grad_norm": 0.05303356423974037, + "learning_rate": 9.946944570171536e-07, + "loss": 0.0051, + "num_input_tokens_seen": 35256864, + "step": 167060 + }, + { + "epoch": 18.37898789878988, + "grad_norm": 0.2095423936843872, + "learning_rate": 9.940242974762981e-07, + "loss": 0.0093, + "num_input_tokens_seen": 35257824, + "step": 167065 + }, + { + "epoch": 18.379537953795378, + "grad_norm": 0.005528084002435207, + "learning_rate": 9.933543591870044e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35258880, + "step": 167070 + }, + { + "epoch": 18.38008800880088, + "grad_norm": 0.009790726937353611, + "learning_rate": 9.926846421554393e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35259872, + "step": 167075 + }, + { + "epoch": 18.38063806380638, + "grad_norm": 0.01400841400027275, + "learning_rate": 9.920151463877785e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35260896, + "step": 167080 + }, + { + "epoch": 18.38118811881188, + "grad_norm": 0.05484892800450325, + "learning_rate": 9.913458718902007e-07, + "loss": 0.0883, + "num_input_tokens_seen": 35261952, + "step": 167085 + }, + { + "epoch": 18.381738173817382, + "grad_norm": 0.1027967780828476, + "learning_rate": 9.90676818668862e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35262944, + "step": 167090 + }, + { + "epoch": 18.382288228822883, + "grad_norm": 0.05214071273803711, + "learning_rate": 9.900079867299379e-07, + "loss": 0.0172, + "num_input_tokens_seen": 35263968, + "step": 167095 + }, + { + "epoch": 18.382838283828384, + "grad_norm": 4.588684558868408, + "learning_rate": 9.893393760795871e-07, + "loss": 0.0417, + "num_input_tokens_seen": 35265056, + "step": 167100 + }, + { + "epoch": 18.383388338833882, + "grad_norm": 0.8792474865913391, + "learning_rate": 9.886709867239774e-07, + "loss": 0.0065, + "num_input_tokens_seen": 35266176, + "step": 167105 + }, + { + "epoch": 18.383938393839383, + "grad_norm": 0.025743581354618073, + "learning_rate": 9.880028186692591e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35267328, + "step": 167110 + }, + { + "epoch": 18.384488448844884, + "grad_norm": 0.46983590722084045, + "learning_rate": 9.873348719215996e-07, + "loss": 0.136, + "num_input_tokens_seen": 35268288, + "step": 167115 + }, + { + "epoch": 18.385038503850385, + "grad_norm": 0.030286109074950218, + "learning_rate": 9.866671464871552e-07, + "loss": 0.0134, + "num_input_tokens_seen": 35269344, + "step": 167120 + }, + { + "epoch": 18.385588558855886, + "grad_norm": 0.01395462267100811, + "learning_rate": 9.859996423720708e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35270368, + "step": 167125 + }, + { + "epoch": 18.386138613861387, + "grad_norm": 0.10499756038188934, + "learning_rate": 9.85332359582511e-07, + "loss": 0.0329, + "num_input_tokens_seen": 35271456, + "step": 167130 + }, + { + "epoch": 18.38668866886689, + "grad_norm": 0.01328956987708807, + "learning_rate": 9.846652981246153e-07, + "loss": 0.0084, + "num_input_tokens_seen": 35272544, + "step": 167135 + }, + { + "epoch": 18.387238723872386, + "grad_norm": 0.009241411462426186, + "learning_rate": 9.839984580045369e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35273568, + "step": 167140 + }, + { + "epoch": 18.387788778877887, + "grad_norm": 0.408208966255188, + "learning_rate": 9.833318392284212e-07, + "loss": 0.0792, + "num_input_tokens_seen": 35274656, + "step": 167145 + }, + { + "epoch": 18.388338833883388, + "grad_norm": 0.01279556192457676, + "learning_rate": 9.826654418024106e-07, + "loss": 0.0066, + "num_input_tokens_seen": 35275744, + "step": 167150 + }, + { + "epoch": 18.38888888888889, + "grad_norm": 0.05180155485868454, + "learning_rate": 9.819992657326526e-07, + "loss": 0.0042, + "num_input_tokens_seen": 35276832, + "step": 167155 + }, + { + "epoch": 18.38943894389439, + "grad_norm": 0.005685299169272184, + "learning_rate": 9.813333110252787e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35277920, + "step": 167160 + }, + { + "epoch": 18.38998899889989, + "grad_norm": 0.021374909207224846, + "learning_rate": 9.806675776864256e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35279072, + "step": 167165 + }, + { + "epoch": 18.39053905390539, + "grad_norm": 0.01649426482617855, + "learning_rate": 9.800020657222408e-07, + "loss": 0.0219, + "num_input_tokens_seen": 35280160, + "step": 167170 + }, + { + "epoch": 18.39108910891089, + "grad_norm": 0.04780012369155884, + "learning_rate": 9.793367751388505e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35281216, + "step": 167175 + }, + { + "epoch": 18.39163916391639, + "grad_norm": 0.058665040880441666, + "learning_rate": 9.786717059423856e-07, + "loss": 0.0281, + "num_input_tokens_seen": 35282272, + "step": 167180 + }, + { + "epoch": 18.392189218921892, + "grad_norm": 0.037010520696640015, + "learning_rate": 9.780068581389772e-07, + "loss": 0.0783, + "num_input_tokens_seen": 35283328, + "step": 167185 + }, + { + "epoch": 18.392739273927393, + "grad_norm": 0.014846800826489925, + "learning_rate": 9.773422317347515e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35284352, + "step": 167190 + }, + { + "epoch": 18.393289328932894, + "grad_norm": 0.006217617075890303, + "learning_rate": 9.766778267358333e-07, + "loss": 0.005, + "num_input_tokens_seen": 35285440, + "step": 167195 + }, + { + "epoch": 18.393839383938396, + "grad_norm": 0.019718023017048836, + "learning_rate": 9.760136431483545e-07, + "loss": 0.0036, + "num_input_tokens_seen": 35286528, + "step": 167200 + }, + { + "epoch": 18.394389438943893, + "grad_norm": 0.044269341975450516, + "learning_rate": 9.75349680978424e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35287584, + "step": 167205 + }, + { + "epoch": 18.394939493949394, + "grad_norm": 0.1029328927397728, + "learning_rate": 9.746859402321728e-07, + "loss": 0.0048, + "num_input_tokens_seen": 35288704, + "step": 167210 + }, + { + "epoch": 18.395489548954895, + "grad_norm": 1.695791244506836, + "learning_rate": 9.740224209157073e-07, + "loss": 0.0449, + "num_input_tokens_seen": 35289696, + "step": 167215 + }, + { + "epoch": 18.396039603960396, + "grad_norm": 0.00913435872644186, + "learning_rate": 9.73359123035153e-07, + "loss": 0.0032, + "num_input_tokens_seen": 35290720, + "step": 167220 + }, + { + "epoch": 18.396589658965897, + "grad_norm": 0.019543012604117393, + "learning_rate": 9.726960465966162e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35291776, + "step": 167225 + }, + { + "epoch": 18.3971397139714, + "grad_norm": 0.04609067365527153, + "learning_rate": 9.720331916062085e-07, + "loss": 0.0028, + "num_input_tokens_seen": 35292800, + "step": 167230 + }, + { + "epoch": 18.397689768976896, + "grad_norm": 0.043397363275289536, + "learning_rate": 9.71370558070045e-07, + "loss": 0.0033, + "num_input_tokens_seen": 35293888, + "step": 167235 + }, + { + "epoch": 18.398239823982397, + "grad_norm": 0.1863345503807068, + "learning_rate": 9.707081459942258e-07, + "loss": 0.0177, + "num_input_tokens_seen": 35294944, + "step": 167240 + }, + { + "epoch": 18.3987898789879, + "grad_norm": 0.016858190298080444, + "learning_rate": 9.70045955384863e-07, + "loss": 0.1347, + "num_input_tokens_seen": 35296032, + "step": 167245 + }, + { + "epoch": 18.3993399339934, + "grad_norm": 0.014402282424271107, + "learning_rate": 9.693839862480514e-07, + "loss": 0.002, + "num_input_tokens_seen": 35297152, + "step": 167250 + }, + { + "epoch": 18.3998899889989, + "grad_norm": 0.009593598544597626, + "learning_rate": 9.687222385898976e-07, + "loss": 0.0702, + "num_input_tokens_seen": 35298208, + "step": 167255 + }, + { + "epoch": 18.4004400440044, + "grad_norm": 0.03080456517636776, + "learning_rate": 9.680607124165048e-07, + "loss": 0.0578, + "num_input_tokens_seen": 35299296, + "step": 167260 + }, + { + "epoch": 18.400990099009903, + "grad_norm": 0.28116828203201294, + "learning_rate": 9.673994077339571e-07, + "loss": 0.0097, + "num_input_tokens_seen": 35300384, + "step": 167265 + }, + { + "epoch": 18.4015401540154, + "grad_norm": 0.02483593113720417, + "learning_rate": 9.667383245483635e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35301440, + "step": 167270 + }, + { + "epoch": 18.4020902090209, + "grad_norm": 0.16485530138015747, + "learning_rate": 9.66077462865811e-07, + "loss": 0.0039, + "num_input_tokens_seen": 35302528, + "step": 167275 + }, + { + "epoch": 18.402640264026402, + "grad_norm": 0.12149412930011749, + "learning_rate": 9.654168226923833e-07, + "loss": 0.0862, + "num_input_tokens_seen": 35303584, + "step": 167280 + }, + { + "epoch": 18.403190319031903, + "grad_norm": 2.8216474056243896, + "learning_rate": 9.647564040341783e-07, + "loss": 0.1426, + "num_input_tokens_seen": 35304576, + "step": 167285 + }, + { + "epoch": 18.403740374037405, + "grad_norm": 0.05782417207956314, + "learning_rate": 9.640962068972802e-07, + "loss": 0.0075, + "num_input_tokens_seen": 35305664, + "step": 167290 + }, + { + "epoch": 18.404290429042906, + "grad_norm": 0.26023685932159424, + "learning_rate": 9.634362312877788e-07, + "loss": 0.0099, + "num_input_tokens_seen": 35306720, + "step": 167295 + }, + { + "epoch": 18.404840484048403, + "grad_norm": 0.031603939831256866, + "learning_rate": 9.627764772117492e-07, + "loss": 0.0108, + "num_input_tokens_seen": 35307808, + "step": 167300 + }, + { + "epoch": 18.405390539053904, + "grad_norm": 0.007788926362991333, + "learning_rate": 9.621169446752705e-07, + "loss": 0.0023, + "num_input_tokens_seen": 35308896, + "step": 167305 + }, + { + "epoch": 18.405940594059405, + "grad_norm": 2.43381667137146, + "learning_rate": 9.614576336844289e-07, + "loss": 0.0816, + "num_input_tokens_seen": 35310016, + "step": 167310 + }, + { + "epoch": 18.406490649064907, + "grad_norm": 0.08742859959602356, + "learning_rate": 9.60798544245295e-07, + "loss": 0.0095, + "num_input_tokens_seen": 35311040, + "step": 167315 + }, + { + "epoch": 18.407040704070408, + "grad_norm": 0.030360162258148193, + "learning_rate": 9.601396763639498e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35312128, + "step": 167320 + }, + { + "epoch": 18.40759075907591, + "grad_norm": 0.0203083585947752, + "learning_rate": 9.594810300464607e-07, + "loss": 0.1011, + "num_input_tokens_seen": 35313152, + "step": 167325 + }, + { + "epoch": 18.40814081408141, + "grad_norm": 0.01688358187675476, + "learning_rate": 9.588226052988952e-07, + "loss": 0.0411, + "num_input_tokens_seen": 35314240, + "step": 167330 + }, + { + "epoch": 18.408690869086907, + "grad_norm": 0.006168496794998646, + "learning_rate": 9.581644021273317e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35315264, + "step": 167335 + }, + { + "epoch": 18.40924092409241, + "grad_norm": 0.12361407279968262, + "learning_rate": 9.575064205378265e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35316320, + "step": 167340 + }, + { + "epoch": 18.40979097909791, + "grad_norm": 0.18072284758090973, + "learning_rate": 9.56848660536447e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35317408, + "step": 167345 + }, + { + "epoch": 18.41034103410341, + "grad_norm": 0.10754731297492981, + "learning_rate": 9.561911221292602e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35318368, + "step": 167350 + }, + { + "epoch": 18.41089108910891, + "grad_norm": 1.9792118072509766, + "learning_rate": 9.555338053223173e-07, + "loss": 0.0241, + "num_input_tokens_seen": 35319424, + "step": 167355 + }, + { + "epoch": 18.411441144114413, + "grad_norm": 0.030954834073781967, + "learning_rate": 9.548767101216883e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35320480, + "step": 167360 + }, + { + "epoch": 18.41199119911991, + "grad_norm": 0.19812177121639252, + "learning_rate": 9.542198365334182e-07, + "loss": 0.07, + "num_input_tokens_seen": 35321536, + "step": 167365 + }, + { + "epoch": 18.41254125412541, + "grad_norm": 0.04093950614333153, + "learning_rate": 9.535631845635635e-07, + "loss": 0.0035, + "num_input_tokens_seen": 35322592, + "step": 167370 + }, + { + "epoch": 18.413091309130913, + "grad_norm": 0.02333315648138523, + "learning_rate": 9.529067542181829e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35323680, + "step": 167375 + }, + { + "epoch": 18.413641364136414, + "grad_norm": 0.18740436434745789, + "learning_rate": 9.522505455033193e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35324704, + "step": 167380 + }, + { + "epoch": 18.414191419141915, + "grad_norm": 0.024823741987347603, + "learning_rate": 9.515945584250258e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35325824, + "step": 167385 + }, + { + "epoch": 18.414741474147416, + "grad_norm": 0.035734277218580246, + "learning_rate": 9.509387929893476e-07, + "loss": 0.0256, + "num_input_tokens_seen": 35326848, + "step": 167390 + }, + { + "epoch": 18.415291529152917, + "grad_norm": 0.18766076862812042, + "learning_rate": 9.502832492023217e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35327968, + "step": 167395 + }, + { + "epoch": 18.415841584158414, + "grad_norm": 0.0048114643432199955, + "learning_rate": 9.496279270699931e-07, + "loss": 0.0709, + "num_input_tokens_seen": 35328992, + "step": 167400 + }, + { + "epoch": 18.416391639163916, + "grad_norm": 0.2605167329311371, + "learning_rate": 9.489728265984071e-07, + "loss": 0.004, + "num_input_tokens_seen": 35330048, + "step": 167405 + }, + { + "epoch": 18.416941694169417, + "grad_norm": 0.045955050736665726, + "learning_rate": 9.483179477935977e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35331104, + "step": 167410 + }, + { + "epoch": 18.417491749174918, + "grad_norm": 0.028208216652274132, + "learning_rate": 9.476632906616017e-07, + "loss": 0.0048, + "num_input_tokens_seen": 35332160, + "step": 167415 + }, + { + "epoch": 18.41804180418042, + "grad_norm": 0.09956665337085724, + "learning_rate": 9.470088552084505e-07, + "loss": 0.0047, + "num_input_tokens_seen": 35333216, + "step": 167420 + }, + { + "epoch": 18.41859185918592, + "grad_norm": 0.3341790735721588, + "learning_rate": 9.463546414401752e-07, + "loss": 0.0482, + "num_input_tokens_seen": 35334304, + "step": 167425 + }, + { + "epoch": 18.419141914191417, + "grad_norm": 3.331737756729126, + "learning_rate": 9.457006493628073e-07, + "loss": 0.0939, + "num_input_tokens_seen": 35335360, + "step": 167430 + }, + { + "epoch": 18.41969196919692, + "grad_norm": 2.2997148036956787, + "learning_rate": 9.45046878982378e-07, + "loss": 0.0366, + "num_input_tokens_seen": 35336416, + "step": 167435 + }, + { + "epoch": 18.42024202420242, + "grad_norm": 0.037626076489686966, + "learning_rate": 9.443933303049074e-07, + "loss": 0.1009, + "num_input_tokens_seen": 35337440, + "step": 167440 + }, + { + "epoch": 18.42079207920792, + "grad_norm": 0.007864675484597683, + "learning_rate": 9.437400033364185e-07, + "loss": 0.0073, + "num_input_tokens_seen": 35338496, + "step": 167445 + }, + { + "epoch": 18.421342134213422, + "grad_norm": 0.015968814492225647, + "learning_rate": 9.430868980829372e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35339488, + "step": 167450 + }, + { + "epoch": 18.421892189218923, + "grad_norm": 0.02101043239235878, + "learning_rate": 9.424340145504778e-07, + "loss": 0.0486, + "num_input_tokens_seen": 35340608, + "step": 167455 + }, + { + "epoch": 18.422442244224424, + "grad_norm": 0.021502569317817688, + "learning_rate": 9.417813527450608e-07, + "loss": 0.1759, + "num_input_tokens_seen": 35341632, + "step": 167460 + }, + { + "epoch": 18.42299229922992, + "grad_norm": 0.16623671352863312, + "learning_rate": 9.411289126727008e-07, + "loss": 0.003, + "num_input_tokens_seen": 35342720, + "step": 167465 + }, + { + "epoch": 18.423542354235423, + "grad_norm": 0.038355108350515366, + "learning_rate": 9.404766943394122e-07, + "loss": 0.0116, + "num_input_tokens_seen": 35343744, + "step": 167470 + }, + { + "epoch": 18.424092409240924, + "grad_norm": 0.012631239369511604, + "learning_rate": 9.398246977512043e-07, + "loss": 0.0987, + "num_input_tokens_seen": 35344768, + "step": 167475 + }, + { + "epoch": 18.424642464246425, + "grad_norm": 0.032854534685611725, + "learning_rate": 9.391729229140861e-07, + "loss": 0.0842, + "num_input_tokens_seen": 35345792, + "step": 167480 + }, + { + "epoch": 18.425192519251926, + "grad_norm": 0.11633280664682388, + "learning_rate": 9.385213698340667e-07, + "loss": 0.0041, + "num_input_tokens_seen": 35346848, + "step": 167485 + }, + { + "epoch": 18.425742574257427, + "grad_norm": 0.006885723676532507, + "learning_rate": 9.378700385171496e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35347936, + "step": 167490 + }, + { + "epoch": 18.426292629262925, + "grad_norm": 0.016250822693109512, + "learning_rate": 9.372189289693384e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35348928, + "step": 167495 + }, + { + "epoch": 18.426842684268426, + "grad_norm": 0.013734596781432629, + "learning_rate": 9.365680411966338e-07, + "loss": 0.0366, + "num_input_tokens_seen": 35349984, + "step": 167500 + }, + { + "epoch": 18.427392739273927, + "grad_norm": 1.2429407835006714, + "learning_rate": 9.359173752050338e-07, + "loss": 0.1175, + "num_input_tokens_seen": 35351040, + "step": 167505 + }, + { + "epoch": 18.427942794279428, + "grad_norm": 0.02129477448761463, + "learning_rate": 9.352669310005391e-07, + "loss": 0.0635, + "num_input_tokens_seen": 35352096, + "step": 167510 + }, + { + "epoch": 18.42849284928493, + "grad_norm": 0.07108860462903976, + "learning_rate": 9.346167085891394e-07, + "loss": 0.0059, + "num_input_tokens_seen": 35353120, + "step": 167515 + }, + { + "epoch": 18.42904290429043, + "grad_norm": 0.015956981107592583, + "learning_rate": 9.3396670797683e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35354144, + "step": 167520 + }, + { + "epoch": 18.42959295929593, + "grad_norm": 0.6645735502243042, + "learning_rate": 9.333169291696031e-07, + "loss": 0.0431, + "num_input_tokens_seen": 35355200, + "step": 167525 + }, + { + "epoch": 18.43014301430143, + "grad_norm": 0.010789756663143635, + "learning_rate": 9.32667372173443e-07, + "loss": 0.0087, + "num_input_tokens_seen": 35356224, + "step": 167530 + }, + { + "epoch": 18.43069306930693, + "grad_norm": 1.5014537572860718, + "learning_rate": 9.320180369943421e-07, + "loss": 0.0252, + "num_input_tokens_seen": 35357280, + "step": 167535 + }, + { + "epoch": 18.43124312431243, + "grad_norm": 0.022205715999007225, + "learning_rate": 9.313689236382788e-07, + "loss": 0.0335, + "num_input_tokens_seen": 35358304, + "step": 167540 + }, + { + "epoch": 18.431793179317932, + "grad_norm": 0.005883360747247934, + "learning_rate": 9.307200321112375e-07, + "loss": 0.0066, + "num_input_tokens_seen": 35359328, + "step": 167545 + }, + { + "epoch": 18.432343234323433, + "grad_norm": 0.3151504397392273, + "learning_rate": 9.300713624192047e-07, + "loss": 0.0073, + "num_input_tokens_seen": 35360288, + "step": 167550 + }, + { + "epoch": 18.432893289328934, + "grad_norm": 0.017107447609305382, + "learning_rate": 9.294229145681565e-07, + "loss": 0.0004, + "num_input_tokens_seen": 35361280, + "step": 167555 + }, + { + "epoch": 18.433443344334435, + "grad_norm": 0.31275370717048645, + "learning_rate": 9.287746885640603e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35362336, + "step": 167560 + }, + { + "epoch": 18.433993399339933, + "grad_norm": 0.32101306319236755, + "learning_rate": 9.281266844129027e-07, + "loss": 0.016, + "num_input_tokens_seen": 35363392, + "step": 167565 + }, + { + "epoch": 18.434543454345434, + "grad_norm": 0.03130386769771576, + "learning_rate": 9.274789021206459e-07, + "loss": 0.0014, + "num_input_tokens_seen": 35364416, + "step": 167570 + }, + { + "epoch": 18.435093509350935, + "grad_norm": 0.020405955612659454, + "learning_rate": 9.268313416932683e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35365504, + "step": 167575 + }, + { + "epoch": 18.435643564356436, + "grad_norm": 0.07645870000123978, + "learning_rate": 9.261840031367347e-07, + "loss": 0.0043, + "num_input_tokens_seen": 35366624, + "step": 167580 + }, + { + "epoch": 18.436193619361937, + "grad_norm": 0.0441175252199173, + "learning_rate": 9.255368864570097e-07, + "loss": 0.0337, + "num_input_tokens_seen": 35367680, + "step": 167585 + }, + { + "epoch": 18.436743674367438, + "grad_norm": 2.1663992404937744, + "learning_rate": 9.248899916600606e-07, + "loss": 0.0158, + "num_input_tokens_seen": 35368768, + "step": 167590 + }, + { + "epoch": 18.437293729372936, + "grad_norm": 0.11618684232234955, + "learning_rate": 9.242433187518468e-07, + "loss": 0.1221, + "num_input_tokens_seen": 35369888, + "step": 167595 + }, + { + "epoch": 18.437843784378437, + "grad_norm": 0.0074779558926820755, + "learning_rate": 9.2359686773833e-07, + "loss": 0.0393, + "num_input_tokens_seen": 35370912, + "step": 167600 + }, + { + "epoch": 18.438393839383938, + "grad_norm": 0.328006774187088, + "learning_rate": 9.229506386254693e-07, + "loss": 0.0066, + "num_input_tokens_seen": 35371936, + "step": 167605 + }, + { + "epoch": 18.43894389438944, + "grad_norm": 5.932077407836914, + "learning_rate": 9.223046314192158e-07, + "loss": 0.0573, + "num_input_tokens_seen": 35372992, + "step": 167610 + }, + { + "epoch": 18.43949394939494, + "grad_norm": 0.10817887634038925, + "learning_rate": 9.21658846125531e-07, + "loss": 0.0061, + "num_input_tokens_seen": 35374016, + "step": 167615 + }, + { + "epoch": 18.44004400440044, + "grad_norm": 0.00863591767847538, + "learning_rate": 9.210132827503604e-07, + "loss": 0.0794, + "num_input_tokens_seen": 35375008, + "step": 167620 + }, + { + "epoch": 18.440594059405942, + "grad_norm": 0.005257712211459875, + "learning_rate": 9.203679412996575e-07, + "loss": 0.0259, + "num_input_tokens_seen": 35376064, + "step": 167625 + }, + { + "epoch": 18.44114411441144, + "grad_norm": 0.06977388262748718, + "learning_rate": 9.197228217793675e-07, + "loss": 0.0808, + "num_input_tokens_seen": 35377184, + "step": 167630 + }, + { + "epoch": 18.44169416941694, + "grad_norm": 0.011326932348310947, + "learning_rate": 9.190779241954356e-07, + "loss": 0.0097, + "num_input_tokens_seen": 35378240, + "step": 167635 + }, + { + "epoch": 18.442244224422442, + "grad_norm": 0.10931827872991562, + "learning_rate": 9.184332485538128e-07, + "loss": 0.0314, + "num_input_tokens_seen": 35379264, + "step": 167640 + }, + { + "epoch": 18.442794279427943, + "grad_norm": 2.1888246536254883, + "learning_rate": 9.177887948604302e-07, + "loss": 0.0518, + "num_input_tokens_seen": 35380320, + "step": 167645 + }, + { + "epoch": 18.443344334433444, + "grad_norm": 0.017265357077121735, + "learning_rate": 9.171445631212361e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35381344, + "step": 167650 + }, + { + "epoch": 18.443894389438945, + "grad_norm": 0.024053867906332016, + "learning_rate": 9.165005533421616e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35382432, + "step": 167655 + }, + { + "epoch": 18.444444444444443, + "grad_norm": 0.030270760878920555, + "learning_rate": 9.158567655291439e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35383456, + "step": 167660 + }, + { + "epoch": 18.444994499449944, + "grad_norm": 0.004590968135744333, + "learning_rate": 9.152131996881224e-07, + "loss": 0.0047, + "num_input_tokens_seen": 35384512, + "step": 167665 + }, + { + "epoch": 18.445544554455445, + "grad_norm": 9.525565147399902, + "learning_rate": 9.145698558250204e-07, + "loss": 0.1146, + "num_input_tokens_seen": 35385568, + "step": 167670 + }, + { + "epoch": 18.446094609460946, + "grad_norm": 0.01742836833000183, + "learning_rate": 9.139267339457747e-07, + "loss": 0.0033, + "num_input_tokens_seen": 35386592, + "step": 167675 + }, + { + "epoch": 18.446644664466447, + "grad_norm": 2.594095468521118, + "learning_rate": 9.132838340563055e-07, + "loss": 0.1027, + "num_input_tokens_seen": 35387616, + "step": 167680 + }, + { + "epoch": 18.44719471947195, + "grad_norm": 0.0073303347453475, + "learning_rate": 9.126411561625414e-07, + "loss": 0.1201, + "num_input_tokens_seen": 35388704, + "step": 167685 + }, + { + "epoch": 18.44774477447745, + "grad_norm": 0.025701183825731277, + "learning_rate": 9.119987002704028e-07, + "loss": 0.1189, + "num_input_tokens_seen": 35389696, + "step": 167690 + }, + { + "epoch": 18.448294829482947, + "grad_norm": 0.005114343483000994, + "learning_rate": 9.113564663858154e-07, + "loss": 0.072, + "num_input_tokens_seen": 35390688, + "step": 167695 + }, + { + "epoch": 18.448844884488448, + "grad_norm": 0.02324514463543892, + "learning_rate": 9.107144545146995e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35391680, + "step": 167700 + }, + { + "epoch": 18.44939493949395, + "grad_norm": 0.3577326238155365, + "learning_rate": 9.100726646629699e-07, + "loss": 0.0044, + "num_input_tokens_seen": 35392864, + "step": 167705 + }, + { + "epoch": 18.44994499449945, + "grad_norm": 0.09544610232114792, + "learning_rate": 9.094310968365383e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35393920, + "step": 167710 + }, + { + "epoch": 18.45049504950495, + "grad_norm": 0.02757168374955654, + "learning_rate": 9.087897510413196e-07, + "loss": 0.0092, + "num_input_tokens_seen": 35394976, + "step": 167715 + }, + { + "epoch": 18.451045104510452, + "grad_norm": 0.028392696753144264, + "learning_rate": 9.081486272832257e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35396064, + "step": 167720 + }, + { + "epoch": 18.45159515951595, + "grad_norm": 0.023017479106783867, + "learning_rate": 9.075077255681686e-07, + "loss": 0.0629, + "num_input_tokens_seen": 35397088, + "step": 167725 + }, + { + "epoch": 18.45214521452145, + "grad_norm": 0.08574952930212021, + "learning_rate": 9.068670459020517e-07, + "loss": 0.0053, + "num_input_tokens_seen": 35398112, + "step": 167730 + }, + { + "epoch": 18.452695269526952, + "grad_norm": 0.031835220754146576, + "learning_rate": 9.062265882907761e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35399232, + "step": 167735 + }, + { + "epoch": 18.453245324532453, + "grad_norm": 0.5838547348976135, + "learning_rate": 9.055863527402536e-07, + "loss": 0.005, + "num_input_tokens_seen": 35400352, + "step": 167740 + }, + { + "epoch": 18.453795379537954, + "grad_norm": 0.24938993155956268, + "learning_rate": 9.049463392563767e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35401408, + "step": 167745 + }, + { + "epoch": 18.454345434543455, + "grad_norm": 0.04891075938940048, + "learning_rate": 9.043065478450491e-07, + "loss": 0.0052, + "num_input_tokens_seen": 35402432, + "step": 167750 + }, + { + "epoch": 18.454895489548957, + "grad_norm": 0.2984946370124817, + "learning_rate": 9.03666978512166e-07, + "loss": 0.0034, + "num_input_tokens_seen": 35403488, + "step": 167755 + }, + { + "epoch": 18.455445544554454, + "grad_norm": 0.05107704922556877, + "learning_rate": 9.030276312636199e-07, + "loss": 0.0546, + "num_input_tokens_seen": 35404544, + "step": 167760 + }, + { + "epoch": 18.455995599559955, + "grad_norm": 0.04093972221016884, + "learning_rate": 9.023885061053089e-07, + "loss": 0.0703, + "num_input_tokens_seen": 35405568, + "step": 167765 + }, + { + "epoch": 18.456545654565456, + "grad_norm": 0.0399591401219368, + "learning_rate": 9.017496030431199e-07, + "loss": 0.0205, + "num_input_tokens_seen": 35406688, + "step": 167770 + }, + { + "epoch": 18.457095709570957, + "grad_norm": 0.047433819621801376, + "learning_rate": 9.011109220829344e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35407712, + "step": 167775 + }, + { + "epoch": 18.45764576457646, + "grad_norm": 0.10632084310054779, + "learning_rate": 9.004724632306505e-07, + "loss": 0.0042, + "num_input_tokens_seen": 35408768, + "step": 167780 + }, + { + "epoch": 18.45819581958196, + "grad_norm": 0.21069537103176117, + "learning_rate": 8.998342264921467e-07, + "loss": 0.0111, + "num_input_tokens_seen": 35409792, + "step": 167785 + }, + { + "epoch": 18.458745874587457, + "grad_norm": 0.05379915609955788, + "learning_rate": 8.991962118733099e-07, + "loss": 0.1416, + "num_input_tokens_seen": 35410816, + "step": 167790 + }, + { + "epoch": 18.459295929592958, + "grad_norm": 0.008323227986693382, + "learning_rate": 8.985584193800161e-07, + "loss": 0.0891, + "num_input_tokens_seen": 35411872, + "step": 167795 + }, + { + "epoch": 18.45984598459846, + "grad_norm": 4.918956756591797, + "learning_rate": 8.979208490181413e-07, + "loss": 0.0624, + "num_input_tokens_seen": 35412992, + "step": 167800 + }, + { + "epoch": 18.46039603960396, + "grad_norm": 2.6756224632263184, + "learning_rate": 8.972835007935637e-07, + "loss": 0.0543, + "num_input_tokens_seen": 35414144, + "step": 167805 + }, + { + "epoch": 18.46094609460946, + "grad_norm": 0.03861837461590767, + "learning_rate": 8.966463747121595e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35415136, + "step": 167810 + }, + { + "epoch": 18.461496149614963, + "grad_norm": 0.184513121843338, + "learning_rate": 8.960094707798045e-07, + "loss": 0.0054, + "num_input_tokens_seen": 35416224, + "step": 167815 + }, + { + "epoch": 18.462046204620464, + "grad_norm": 0.7242891788482666, + "learning_rate": 8.953727890023606e-07, + "loss": 0.0069, + "num_input_tokens_seen": 35417248, + "step": 167820 + }, + { + "epoch": 18.46259625962596, + "grad_norm": 0.28504160046577454, + "learning_rate": 8.947363293856981e-07, + "loss": 0.0471, + "num_input_tokens_seen": 35418304, + "step": 167825 + }, + { + "epoch": 18.463146314631462, + "grad_norm": 0.006929895840585232, + "learning_rate": 8.941000919356873e-07, + "loss": 0.008, + "num_input_tokens_seen": 35419328, + "step": 167830 + }, + { + "epoch": 18.463696369636963, + "grad_norm": 0.04113321378827095, + "learning_rate": 8.93464076658182e-07, + "loss": 0.0134, + "num_input_tokens_seen": 35420416, + "step": 167835 + }, + { + "epoch": 18.464246424642464, + "grad_norm": 0.00939446222037077, + "learning_rate": 8.928282835590579e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35421472, + "step": 167840 + }, + { + "epoch": 18.464796479647966, + "grad_norm": 0.01815972290933132, + "learning_rate": 8.92192712644166e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35422528, + "step": 167845 + }, + { + "epoch": 18.465346534653467, + "grad_norm": 0.05764758214354515, + "learning_rate": 8.915573639193653e-07, + "loss": 0.0138, + "num_input_tokens_seen": 35423584, + "step": 167850 + }, + { + "epoch": 18.465896589658964, + "grad_norm": 0.0300342608243227, + "learning_rate": 8.909222373905124e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35424704, + "step": 167855 + }, + { + "epoch": 18.466446644664465, + "grad_norm": 0.005901598837226629, + "learning_rate": 8.90287333063461e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35425760, + "step": 167860 + }, + { + "epoch": 18.466996699669966, + "grad_norm": 0.061421364545822144, + "learning_rate": 8.896526509440589e-07, + "loss": 0.0087, + "num_input_tokens_seen": 35426816, + "step": 167865 + }, + { + "epoch": 18.467546754675467, + "grad_norm": 0.009434014558792114, + "learning_rate": 8.890181910381629e-07, + "loss": 0.0585, + "num_input_tokens_seen": 35427872, + "step": 167870 + }, + { + "epoch": 18.46809680968097, + "grad_norm": 0.44318482279777527, + "learning_rate": 8.883839533516153e-07, + "loss": 0.0772, + "num_input_tokens_seen": 35428928, + "step": 167875 + }, + { + "epoch": 18.46864686468647, + "grad_norm": 6.083002090454102, + "learning_rate": 8.877499378902643e-07, + "loss": 0.1249, + "num_input_tokens_seen": 35429984, + "step": 167880 + }, + { + "epoch": 18.46919691969197, + "grad_norm": 1.591530442237854, + "learning_rate": 8.871161446599496e-07, + "loss": 0.0204, + "num_input_tokens_seen": 35431072, + "step": 167885 + }, + { + "epoch": 18.46974697469747, + "grad_norm": 0.017232486978173256, + "learning_rate": 8.864825736665167e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35432160, + "step": 167890 + }, + { + "epoch": 18.47029702970297, + "grad_norm": 0.013439934700727463, + "learning_rate": 8.858492249158024e-07, + "loss": 0.0078, + "num_input_tokens_seen": 35433152, + "step": 167895 + }, + { + "epoch": 18.47084708470847, + "grad_norm": 0.05381176620721817, + "learning_rate": 8.85216098413641e-07, + "loss": 0.0659, + "num_input_tokens_seen": 35434272, + "step": 167900 + }, + { + "epoch": 18.47139713971397, + "grad_norm": 1.28571355342865, + "learning_rate": 8.845831941658777e-07, + "loss": 0.007, + "num_input_tokens_seen": 35435296, + "step": 167905 + }, + { + "epoch": 18.471947194719473, + "grad_norm": 0.014195160008966923, + "learning_rate": 8.839505121783359e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35436288, + "step": 167910 + }, + { + "epoch": 18.472497249724974, + "grad_norm": 0.0055548883974552155, + "learning_rate": 8.833180524568524e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35437344, + "step": 167915 + }, + { + "epoch": 18.47304730473047, + "grad_norm": 0.17753373086452484, + "learning_rate": 8.826858150072503e-07, + "loss": 0.0028, + "num_input_tokens_seen": 35438432, + "step": 167920 + }, + { + "epoch": 18.473597359735972, + "grad_norm": 0.01984383724629879, + "learning_rate": 8.820537998353639e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35439456, + "step": 167925 + }, + { + "epoch": 18.474147414741473, + "grad_norm": 0.01779051311314106, + "learning_rate": 8.814220069470136e-07, + "loss": 0.0032, + "num_input_tokens_seen": 35440512, + "step": 167930 + }, + { + "epoch": 18.474697469746975, + "grad_norm": 0.21354639530181885, + "learning_rate": 8.807904363480251e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35441536, + "step": 167935 + }, + { + "epoch": 18.475247524752476, + "grad_norm": 0.03465857356786728, + "learning_rate": 8.801590880442162e-07, + "loss": 0.0375, + "num_input_tokens_seen": 35442624, + "step": 167940 + }, + { + "epoch": 18.475797579757977, + "grad_norm": 0.04194222763180733, + "learning_rate": 8.795279620414071e-07, + "loss": 0.0067, + "num_input_tokens_seen": 35443616, + "step": 167945 + }, + { + "epoch": 18.476347634763478, + "grad_norm": 0.03963132202625275, + "learning_rate": 8.788970583454099e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35444640, + "step": 167950 + }, + { + "epoch": 18.476897689768975, + "grad_norm": 0.010539724491536617, + "learning_rate": 8.782663769620531e-07, + "loss": 0.0067, + "num_input_tokens_seen": 35445728, + "step": 167955 + }, + { + "epoch": 18.477447744774476, + "grad_norm": 0.12431242316961288, + "learning_rate": 8.77635917897135e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35446848, + "step": 167960 + }, + { + "epoch": 18.477997799779978, + "grad_norm": 0.017870374023914337, + "learning_rate": 8.770056811564731e-07, + "loss": 0.0782, + "num_input_tokens_seen": 35447904, + "step": 167965 + }, + { + "epoch": 18.47854785478548, + "grad_norm": 0.0689200758934021, + "learning_rate": 8.763756667458739e-07, + "loss": 0.0036, + "num_input_tokens_seen": 35448992, + "step": 167970 + }, + { + "epoch": 18.47909790979098, + "grad_norm": 0.25527188181877136, + "learning_rate": 8.757458746711439e-07, + "loss": 0.0044, + "num_input_tokens_seen": 35449984, + "step": 167975 + }, + { + "epoch": 18.47964796479648, + "grad_norm": 0.069351427257061, + "learning_rate": 8.751163049380895e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35451072, + "step": 167980 + }, + { + "epoch": 18.480198019801982, + "grad_norm": 0.01528032124042511, + "learning_rate": 8.744869575525116e-07, + "loss": 0.0082, + "num_input_tokens_seen": 35452096, + "step": 167985 + }, + { + "epoch": 18.48074807480748, + "grad_norm": 0.049441706389188766, + "learning_rate": 8.738578325202085e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35453120, + "step": 167990 + }, + { + "epoch": 18.48129812981298, + "grad_norm": 0.1251319944858551, + "learning_rate": 8.732289298469864e-07, + "loss": 0.0082, + "num_input_tokens_seen": 35454176, + "step": 167995 + }, + { + "epoch": 18.48184818481848, + "grad_norm": 0.06976780295372009, + "learning_rate": 8.726002495386298e-07, + "loss": 0.0047, + "num_input_tokens_seen": 35455264, + "step": 168000 + }, + { + "epoch": 18.482398239823983, + "grad_norm": 0.03272020071744919, + "learning_rate": 8.719717916009451e-07, + "loss": 0.0048, + "num_input_tokens_seen": 35456320, + "step": 168005 + }, + { + "epoch": 18.482948294829484, + "grad_norm": 0.03428172320127487, + "learning_rate": 8.713435560397137e-07, + "loss": 0.1095, + "num_input_tokens_seen": 35457376, + "step": 168010 + }, + { + "epoch": 18.483498349834985, + "grad_norm": 0.3463999330997467, + "learning_rate": 8.707155428607311e-07, + "loss": 0.0517, + "num_input_tokens_seen": 35458336, + "step": 168015 + }, + { + "epoch": 18.484048404840483, + "grad_norm": 0.008029012940824032, + "learning_rate": 8.700877520697898e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35459424, + "step": 168020 + }, + { + "epoch": 18.484598459845984, + "grad_norm": 0.14305329322814941, + "learning_rate": 8.694601836726657e-07, + "loss": 0.006, + "num_input_tokens_seen": 35460416, + "step": 168025 + }, + { + "epoch": 18.485148514851485, + "grad_norm": 0.044670622795820236, + "learning_rate": 8.688328376751515e-07, + "loss": 0.0221, + "num_input_tokens_seen": 35461440, + "step": 168030 + }, + { + "epoch": 18.485698569856986, + "grad_norm": 0.079469233751297, + "learning_rate": 8.68205714083023e-07, + "loss": 0.001, + "num_input_tokens_seen": 35462400, + "step": 168035 + }, + { + "epoch": 18.486248624862487, + "grad_norm": 0.006836592685431242, + "learning_rate": 8.675788129020645e-07, + "loss": 0.0005, + "num_input_tokens_seen": 35463520, + "step": 168040 + }, + { + "epoch": 18.486798679867988, + "grad_norm": 0.11319635808467865, + "learning_rate": 8.669521341380549e-07, + "loss": 0.047, + "num_input_tokens_seen": 35464544, + "step": 168045 + }, + { + "epoch": 18.48734873487349, + "grad_norm": 0.033693160861730576, + "learning_rate": 8.663256777967616e-07, + "loss": 0.0014, + "num_input_tokens_seen": 35465568, + "step": 168050 + }, + { + "epoch": 18.487898789878987, + "grad_norm": 0.36806538701057434, + "learning_rate": 8.656994438839688e-07, + "loss": 0.0044, + "num_input_tokens_seen": 35466656, + "step": 168055 + }, + { + "epoch": 18.488448844884488, + "grad_norm": 0.007340597454458475, + "learning_rate": 8.650734324054443e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35467680, + "step": 168060 + }, + { + "epoch": 18.48899889988999, + "grad_norm": 0.008621703833341599, + "learning_rate": 8.64447643366953e-07, + "loss": 0.0609, + "num_input_tokens_seen": 35468768, + "step": 168065 + }, + { + "epoch": 18.48954895489549, + "grad_norm": 1.8589742183685303, + "learning_rate": 8.63822076774265e-07, + "loss": 0.0119, + "num_input_tokens_seen": 35469760, + "step": 168070 + }, + { + "epoch": 18.49009900990099, + "grad_norm": 0.10923263430595398, + "learning_rate": 8.631967326331508e-07, + "loss": 0.0139, + "num_input_tokens_seen": 35470880, + "step": 168075 + }, + { + "epoch": 18.490649064906492, + "grad_norm": 0.09818997234106064, + "learning_rate": 8.625716109493698e-07, + "loss": 0.0122, + "num_input_tokens_seen": 35471968, + "step": 168080 + }, + { + "epoch": 18.49119911991199, + "grad_norm": 0.006778747774660587, + "learning_rate": 8.619467117286867e-07, + "loss": 0.0666, + "num_input_tokens_seen": 35473024, + "step": 168085 + }, + { + "epoch": 18.49174917491749, + "grad_norm": 0.009157123975455761, + "learning_rate": 8.613220349768525e-07, + "loss": 0.0043, + "num_input_tokens_seen": 35474080, + "step": 168090 + }, + { + "epoch": 18.492299229922992, + "grad_norm": 0.39214229583740234, + "learning_rate": 8.60697580699632e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35475072, + "step": 168095 + }, + { + "epoch": 18.492849284928493, + "grad_norm": 0.23646710813045502, + "learning_rate": 8.600733489027791e-07, + "loss": 0.002, + "num_input_tokens_seen": 35476160, + "step": 168100 + }, + { + "epoch": 18.493399339933994, + "grad_norm": 0.006407187320291996, + "learning_rate": 8.594493395920473e-07, + "loss": 0.0796, + "num_input_tokens_seen": 35477152, + "step": 168105 + }, + { + "epoch": 18.493949394939495, + "grad_norm": 0.018575893715023994, + "learning_rate": 8.588255527731903e-07, + "loss": 0.0435, + "num_input_tokens_seen": 35478272, + "step": 168110 + }, + { + "epoch": 18.494499449944996, + "grad_norm": 0.020767685025930405, + "learning_rate": 8.582019884519482e-07, + "loss": 0.0014, + "num_input_tokens_seen": 35479296, + "step": 168115 + }, + { + "epoch": 18.495049504950494, + "grad_norm": 0.024090485647320747, + "learning_rate": 8.575786466340802e-07, + "loss": 0.0369, + "num_input_tokens_seen": 35480320, + "step": 168120 + }, + { + "epoch": 18.495599559955995, + "grad_norm": 0.004494995344430208, + "learning_rate": 8.569555273253205e-07, + "loss": 0.0414, + "num_input_tokens_seen": 35481408, + "step": 168125 + }, + { + "epoch": 18.496149614961496, + "grad_norm": 0.0052026547491550446, + "learning_rate": 8.563326305314146e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35482432, + "step": 168130 + }, + { + "epoch": 18.496699669966997, + "grad_norm": 2.2826404571533203, + "learning_rate": 8.557099562581106e-07, + "loss": 0.0154, + "num_input_tokens_seen": 35483520, + "step": 168135 + }, + { + "epoch": 18.497249724972498, + "grad_norm": 1.7584264278411865, + "learning_rate": 8.550875045111401e-07, + "loss": 0.0368, + "num_input_tokens_seen": 35484576, + "step": 168140 + }, + { + "epoch": 18.497799779978, + "grad_norm": 0.0035264273174107075, + "learning_rate": 8.544652752962456e-07, + "loss": 0.1383, + "num_input_tokens_seen": 35485600, + "step": 168145 + }, + { + "epoch": 18.498349834983497, + "grad_norm": 0.07305215299129486, + "learning_rate": 8.538432686191533e-07, + "loss": 0.19, + "num_input_tokens_seen": 35486656, + "step": 168150 + }, + { + "epoch": 18.498899889988998, + "grad_norm": 0.05376352369785309, + "learning_rate": 8.532214844856029e-07, + "loss": 0.0345, + "num_input_tokens_seen": 35487712, + "step": 168155 + }, + { + "epoch": 18.4994499449945, + "grad_norm": 1.2923181056976318, + "learning_rate": 8.52599922901326e-07, + "loss": 0.0724, + "num_input_tokens_seen": 35488736, + "step": 168160 + }, + { + "epoch": 18.5, + "grad_norm": 0.3562231957912445, + "learning_rate": 8.519785838720457e-07, + "loss": 0.0209, + "num_input_tokens_seen": 35489792, + "step": 168165 + }, + { + "epoch": 18.5005500550055, + "grad_norm": 0.04120389372110367, + "learning_rate": 8.513574674034936e-07, + "loss": 0.0028, + "num_input_tokens_seen": 35490880, + "step": 168170 + }, + { + "epoch": 18.501100110011002, + "grad_norm": 0.05466528981924057, + "learning_rate": 8.507365735013928e-07, + "loss": 0.016, + "num_input_tokens_seen": 35492064, + "step": 168175 + }, + { + "epoch": 18.501650165016503, + "grad_norm": 0.024986373260617256, + "learning_rate": 8.501159021714611e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35493056, + "step": 168180 + }, + { + "epoch": 18.502200220022, + "grad_norm": 0.012497632764279842, + "learning_rate": 8.494954534194216e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35494112, + "step": 168185 + }, + { + "epoch": 18.502750275027502, + "grad_norm": 0.06856207549571991, + "learning_rate": 8.488752272509975e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35495168, + "step": 168190 + }, + { + "epoch": 18.503300330033003, + "grad_norm": 0.17746877670288086, + "learning_rate": 8.482552236719038e-07, + "loss": 0.0029, + "num_input_tokens_seen": 35496224, + "step": 168195 + }, + { + "epoch": 18.503850385038504, + "grad_norm": 0.2758859694004059, + "learning_rate": 8.476354426878497e-07, + "loss": 0.0653, + "num_input_tokens_seen": 35497280, + "step": 168200 + }, + { + "epoch": 18.504400440044005, + "grad_norm": 0.011793076992034912, + "learning_rate": 8.470158843045501e-07, + "loss": 0.0989, + "num_input_tokens_seen": 35498368, + "step": 168205 + }, + { + "epoch": 18.504950495049506, + "grad_norm": 0.025837475433945656, + "learning_rate": 8.463965485277143e-07, + "loss": 0.0904, + "num_input_tokens_seen": 35499456, + "step": 168210 + }, + { + "epoch": 18.505500550055004, + "grad_norm": 0.06378036737442017, + "learning_rate": 8.457774353630516e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35500512, + "step": 168215 + }, + { + "epoch": 18.506050605060505, + "grad_norm": 0.030609773471951485, + "learning_rate": 8.451585448162713e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35501536, + "step": 168220 + }, + { + "epoch": 18.506600660066006, + "grad_norm": 0.051559027284383774, + "learning_rate": 8.445398768930745e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35502560, + "step": 168225 + }, + { + "epoch": 18.507150715071507, + "grad_norm": 0.04429619759321213, + "learning_rate": 8.439214315991595e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35503552, + "step": 168230 + }, + { + "epoch": 18.507700770077008, + "grad_norm": 0.36998701095581055, + "learning_rate": 8.433032089402326e-07, + "loss": 0.049, + "num_input_tokens_seen": 35504576, + "step": 168235 + }, + { + "epoch": 18.50825082508251, + "grad_norm": 0.008635648526251316, + "learning_rate": 8.426852089219838e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35505632, + "step": 168240 + }, + { + "epoch": 18.50880088008801, + "grad_norm": 0.16177067160606384, + "learning_rate": 8.420674315501142e-07, + "loss": 0.1122, + "num_input_tokens_seen": 35506656, + "step": 168245 + }, + { + "epoch": 18.509350935093508, + "grad_norm": 0.05691646784543991, + "learning_rate": 8.414498768303219e-07, + "loss": 0.0033, + "num_input_tokens_seen": 35507680, + "step": 168250 + }, + { + "epoch": 18.50990099009901, + "grad_norm": 0.31856873631477356, + "learning_rate": 8.408325447682886e-07, + "loss": 0.0696, + "num_input_tokens_seen": 35508800, + "step": 168255 + }, + { + "epoch": 18.51045104510451, + "grad_norm": 0.006907645612955093, + "learning_rate": 8.402154353697123e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35509760, + "step": 168260 + }, + { + "epoch": 18.51100110011001, + "grad_norm": 0.3814089000225067, + "learning_rate": 8.395985486402774e-07, + "loss": 0.0237, + "num_input_tokens_seen": 35510848, + "step": 168265 + }, + { + "epoch": 18.511551155115512, + "grad_norm": 0.0752350315451622, + "learning_rate": 8.389818845856656e-07, + "loss": 0.0723, + "num_input_tokens_seen": 35512032, + "step": 168270 + }, + { + "epoch": 18.512101210121013, + "grad_norm": 0.015104066580533981, + "learning_rate": 8.383654432115696e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35513120, + "step": 168275 + }, + { + "epoch": 18.51265126512651, + "grad_norm": 0.011572973802685738, + "learning_rate": 8.377492245236596e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35514112, + "step": 168280 + }, + { + "epoch": 18.513201320132012, + "grad_norm": 1.3707095384597778, + "learning_rate": 8.371332285276257e-07, + "loss": 0.0515, + "num_input_tokens_seen": 35515136, + "step": 168285 + }, + { + "epoch": 18.513751375137513, + "grad_norm": 0.24067722260951996, + "learning_rate": 8.365174552291382e-07, + "loss": 0.0057, + "num_input_tokens_seen": 35516192, + "step": 168290 + }, + { + "epoch": 18.514301430143014, + "grad_norm": 0.041768014430999756, + "learning_rate": 8.359019046338762e-07, + "loss": 0.0654, + "num_input_tokens_seen": 35517216, + "step": 168295 + }, + { + "epoch": 18.514851485148515, + "grad_norm": 0.007275638170540333, + "learning_rate": 8.352865767475071e-07, + "loss": 0.0036, + "num_input_tokens_seen": 35518304, + "step": 168300 + }, + { + "epoch": 18.515401540154016, + "grad_norm": 0.06938132643699646, + "learning_rate": 8.346714715757098e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35519360, + "step": 168305 + }, + { + "epoch": 18.515951595159517, + "grad_norm": 0.3137865364551544, + "learning_rate": 8.340565891241491e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35520416, + "step": 168310 + }, + { + "epoch": 18.516501650165015, + "grad_norm": 0.5581592321395874, + "learning_rate": 8.334419293984957e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35521472, + "step": 168315 + }, + { + "epoch": 18.517051705170516, + "grad_norm": 0.18779772520065308, + "learning_rate": 8.328274924044089e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35522560, + "step": 168320 + }, + { + "epoch": 18.517601760176017, + "grad_norm": 0.0016969015123322606, + "learning_rate": 8.322132781475534e-07, + "loss": 0.0746, + "num_input_tokens_seen": 35523584, + "step": 168325 + }, + { + "epoch": 18.51815181518152, + "grad_norm": 0.8685909509658813, + "learning_rate": 8.315992866335914e-07, + "loss": 0.1865, + "num_input_tokens_seen": 35524608, + "step": 168330 + }, + { + "epoch": 18.51870187018702, + "grad_norm": 0.014319312758743763, + "learning_rate": 8.309855178681825e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35525696, + "step": 168335 + }, + { + "epoch": 18.51925192519252, + "grad_norm": 0.013509519398212433, + "learning_rate": 8.303719718569858e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35526752, + "step": 168340 + }, + { + "epoch": 18.519801980198018, + "grad_norm": 3.0883631706237793, + "learning_rate": 8.297586486056496e-07, + "loss": 0.0831, + "num_input_tokens_seen": 35527776, + "step": 168345 + }, + { + "epoch": 18.52035203520352, + "grad_norm": 0.08729498088359833, + "learning_rate": 8.291455481198307e-07, + "loss": 0.068, + "num_input_tokens_seen": 35528864, + "step": 168350 + }, + { + "epoch": 18.52090209020902, + "grad_norm": 0.03465087711811066, + "learning_rate": 8.285326704051772e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35529856, + "step": 168355 + }, + { + "epoch": 18.52145214521452, + "grad_norm": 0.011534478515386581, + "learning_rate": 8.2792001546734e-07, + "loss": 0.0159, + "num_input_tokens_seen": 35530944, + "step": 168360 + }, + { + "epoch": 18.522002200220022, + "grad_norm": 0.1741400510072708, + "learning_rate": 8.273075833119675e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35532064, + "step": 168365 + }, + { + "epoch": 18.522552255225524, + "grad_norm": 0.007701529189944267, + "learning_rate": 8.266953739446998e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35533120, + "step": 168370 + }, + { + "epoch": 18.523102310231025, + "grad_norm": 0.1072838231921196, + "learning_rate": 8.260833873711849e-07, + "loss": 0.0153, + "num_input_tokens_seen": 35534144, + "step": 168375 + }, + { + "epoch": 18.523652365236522, + "grad_norm": 0.25970396399497986, + "learning_rate": 8.254716235970544e-07, + "loss": 0.0053, + "num_input_tokens_seen": 35535168, + "step": 168380 + }, + { + "epoch": 18.524202420242023, + "grad_norm": 0.025170227512717247, + "learning_rate": 8.24860082627954e-07, + "loss": 0.0219, + "num_input_tokens_seen": 35536224, + "step": 168385 + }, + { + "epoch": 18.524752475247524, + "grad_norm": 0.019980134442448616, + "learning_rate": 8.242487644695179e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35537312, + "step": 168390 + }, + { + "epoch": 18.525302530253025, + "grad_norm": 0.02285464107990265, + "learning_rate": 8.236376691273806e-07, + "loss": 0.1002, + "num_input_tokens_seen": 35538368, + "step": 168395 + }, + { + "epoch": 18.525852585258527, + "grad_norm": 0.008685179054737091, + "learning_rate": 8.230267966071737e-07, + "loss": 0.0122, + "num_input_tokens_seen": 35539456, + "step": 168400 + }, + { + "epoch": 18.526402640264028, + "grad_norm": 0.01727670058608055, + "learning_rate": 8.224161469145286e-07, + "loss": 0.1159, + "num_input_tokens_seen": 35540480, + "step": 168405 + }, + { + "epoch": 18.52695269526953, + "grad_norm": 0.38783958554267883, + "learning_rate": 8.218057200550717e-07, + "loss": 0.0331, + "num_input_tokens_seen": 35541568, + "step": 168410 + }, + { + "epoch": 18.527502750275026, + "grad_norm": 0.2228553146123886, + "learning_rate": 8.211955160344287e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35542656, + "step": 168415 + }, + { + "epoch": 18.528052805280527, + "grad_norm": 0.3397778868675232, + "learning_rate": 8.205855348582231e-07, + "loss": 0.0262, + "num_input_tokens_seen": 35543680, + "step": 168420 + }, + { + "epoch": 18.52860286028603, + "grad_norm": 0.07624336332082748, + "learning_rate": 8.199757765320836e-07, + "loss": 0.0034, + "num_input_tokens_seen": 35544800, + "step": 168425 + }, + { + "epoch": 18.52915291529153, + "grad_norm": 0.027009624987840652, + "learning_rate": 8.193662410616198e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35545888, + "step": 168430 + }, + { + "epoch": 18.52970297029703, + "grad_norm": 0.9672313928604126, + "learning_rate": 8.187569284524576e-07, + "loss": 0.0615, + "num_input_tokens_seen": 35546912, + "step": 168435 + }, + { + "epoch": 18.53025302530253, + "grad_norm": 0.011868124827742577, + "learning_rate": 8.181478387102065e-07, + "loss": 0.0235, + "num_input_tokens_seen": 35547968, + "step": 168440 + }, + { + "epoch": 18.53080308030803, + "grad_norm": 0.48894086480140686, + "learning_rate": 8.175389718404814e-07, + "loss": 0.0907, + "num_input_tokens_seen": 35549056, + "step": 168445 + }, + { + "epoch": 18.53135313531353, + "grad_norm": 0.006490407045930624, + "learning_rate": 8.169303278489026e-07, + "loss": 0.0026, + "num_input_tokens_seen": 35550144, + "step": 168450 + }, + { + "epoch": 18.53190319031903, + "grad_norm": 0.04039774835109711, + "learning_rate": 8.16321906741066e-07, + "loss": 0.0162, + "num_input_tokens_seen": 35551232, + "step": 168455 + }, + { + "epoch": 18.532453245324533, + "grad_norm": 1.4660921096801758, + "learning_rate": 8.157137085225918e-07, + "loss": 0.0259, + "num_input_tokens_seen": 35552320, + "step": 168460 + }, + { + "epoch": 18.533003300330034, + "grad_norm": 0.008798758499324322, + "learning_rate": 8.151057331990786e-07, + "loss": 0.0044, + "num_input_tokens_seen": 35553344, + "step": 168465 + }, + { + "epoch": 18.533553355335535, + "grad_norm": 0.07133424282073975, + "learning_rate": 8.144979807761272e-07, + "loss": 0.0415, + "num_input_tokens_seen": 35554336, + "step": 168470 + }, + { + "epoch": 18.534103410341036, + "grad_norm": 0.11375584453344345, + "learning_rate": 8.138904512593443e-07, + "loss": 0.0359, + "num_input_tokens_seen": 35555328, + "step": 168475 + }, + { + "epoch": 18.534653465346533, + "grad_norm": 0.07871579378843307, + "learning_rate": 8.132831446543254e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35556320, + "step": 168480 + }, + { + "epoch": 18.535203520352034, + "grad_norm": 0.03288499638438225, + "learning_rate": 8.126760609666745e-07, + "loss": 0.0614, + "num_input_tokens_seen": 35557376, + "step": 168485 + }, + { + "epoch": 18.535753575357536, + "grad_norm": 0.07002563029527664, + "learning_rate": 8.120692002019814e-07, + "loss": 0.1441, + "num_input_tokens_seen": 35558400, + "step": 168490 + }, + { + "epoch": 18.536303630363037, + "grad_norm": 0.019345344975590706, + "learning_rate": 8.114625623658362e-07, + "loss": 0.0029, + "num_input_tokens_seen": 35559424, + "step": 168495 + }, + { + "epoch": 18.536853685368538, + "grad_norm": 0.02170403301715851, + "learning_rate": 8.108561474638343e-07, + "loss": 0.1114, + "num_input_tokens_seen": 35560512, + "step": 168500 + }, + { + "epoch": 18.53740374037404, + "grad_norm": 2.3321380615234375, + "learning_rate": 8.10249955501563e-07, + "loss": 0.1148, + "num_input_tokens_seen": 35561568, + "step": 168505 + }, + { + "epoch": 18.537953795379536, + "grad_norm": 0.02053016982972622, + "learning_rate": 8.09643986484615e-07, + "loss": 0.0218, + "num_input_tokens_seen": 35562656, + "step": 168510 + }, + { + "epoch": 18.538503850385037, + "grad_norm": 1.6682286262512207, + "learning_rate": 8.09038240418572e-07, + "loss": 0.0159, + "num_input_tokens_seen": 35563712, + "step": 168515 + }, + { + "epoch": 18.53905390539054, + "grad_norm": 0.0871114581823349, + "learning_rate": 8.084327173090101e-07, + "loss": 0.0131, + "num_input_tokens_seen": 35564704, + "step": 168520 + }, + { + "epoch": 18.53960396039604, + "grad_norm": 0.2882121503353119, + "learning_rate": 8.078274171615191e-07, + "loss": 0.0028, + "num_input_tokens_seen": 35565728, + "step": 168525 + }, + { + "epoch": 18.54015401540154, + "grad_norm": 2.8558638095855713, + "learning_rate": 8.072223399816726e-07, + "loss": 0.0149, + "num_input_tokens_seen": 35566784, + "step": 168530 + }, + { + "epoch": 18.540704070407042, + "grad_norm": 0.009728334844112396, + "learning_rate": 8.066174857750492e-07, + "loss": 0.0632, + "num_input_tokens_seen": 35567808, + "step": 168535 + }, + { + "epoch": 18.541254125412543, + "grad_norm": 0.045617394149303436, + "learning_rate": 8.06012854547225e-07, + "loss": 0.0759, + "num_input_tokens_seen": 35568832, + "step": 168540 + }, + { + "epoch": 18.54180418041804, + "grad_norm": 3.124122381210327, + "learning_rate": 8.054084463037681e-07, + "loss": 0.1508, + "num_input_tokens_seen": 35569888, + "step": 168545 + }, + { + "epoch": 18.54235423542354, + "grad_norm": 0.008721070364117622, + "learning_rate": 8.048042610502543e-07, + "loss": 0.0223, + "num_input_tokens_seen": 35570944, + "step": 168550 + }, + { + "epoch": 18.542904290429043, + "grad_norm": 0.015664778649806976, + "learning_rate": 8.042002987922515e-07, + "loss": 0.053, + "num_input_tokens_seen": 35572000, + "step": 168555 + }, + { + "epoch": 18.543454345434544, + "grad_norm": 0.010526781901717186, + "learning_rate": 8.035965595353162e-07, + "loss": 0.0195, + "num_input_tokens_seen": 35573120, + "step": 168560 + }, + { + "epoch": 18.544004400440045, + "grad_norm": 0.013711120933294296, + "learning_rate": 8.029930432850275e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35574176, + "step": 168565 + }, + { + "epoch": 18.544554455445546, + "grad_norm": 0.00416719913482666, + "learning_rate": 8.023897500469391e-07, + "loss": 0.002, + "num_input_tokens_seen": 35575232, + "step": 168570 + }, + { + "epoch": 18.545104510451043, + "grad_norm": 0.020288435742259026, + "learning_rate": 8.01786679826616e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35576288, + "step": 168575 + }, + { + "epoch": 18.545654565456545, + "grad_norm": 0.016331655904650688, + "learning_rate": 8.011838326296123e-07, + "loss": 0.0118, + "num_input_tokens_seen": 35577312, + "step": 168580 + }, + { + "epoch": 18.546204620462046, + "grad_norm": 0.026186034083366394, + "learning_rate": 8.005812084614844e-07, + "loss": 0.001, + "num_input_tokens_seen": 35578336, + "step": 168585 + }, + { + "epoch": 18.546754675467547, + "grad_norm": 0.07313402742147446, + "learning_rate": 7.999788073277865e-07, + "loss": 0.0082, + "num_input_tokens_seen": 35579360, + "step": 168590 + }, + { + "epoch": 18.547304730473048, + "grad_norm": 0.030292071402072906, + "learning_rate": 7.993766292340721e-07, + "loss": 0.0342, + "num_input_tokens_seen": 35580448, + "step": 168595 + }, + { + "epoch": 18.54785478547855, + "grad_norm": 0.6375836730003357, + "learning_rate": 7.987746741858926e-07, + "loss": 0.0072, + "num_input_tokens_seen": 35581472, + "step": 168600 + }, + { + "epoch": 18.54840484048405, + "grad_norm": 0.02160942181944847, + "learning_rate": 7.981729421887934e-07, + "loss": 0.0806, + "num_input_tokens_seen": 35582592, + "step": 168605 + }, + { + "epoch": 18.548954895489548, + "grad_norm": 3.944190263748169, + "learning_rate": 7.975714332483203e-07, + "loss": 0.0312, + "num_input_tokens_seen": 35583712, + "step": 168610 + }, + { + "epoch": 18.54950495049505, + "grad_norm": 0.06155037134885788, + "learning_rate": 7.969701473700186e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35584800, + "step": 168615 + }, + { + "epoch": 18.55005500550055, + "grad_norm": 2.1840925216674805, + "learning_rate": 7.963690845594258e-07, + "loss": 0.0307, + "num_input_tokens_seen": 35585824, + "step": 168620 + }, + { + "epoch": 18.55060506050605, + "grad_norm": 0.32923176884651184, + "learning_rate": 7.957682448220899e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35586944, + "step": 168625 + }, + { + "epoch": 18.551155115511552, + "grad_norm": 0.8290339112281799, + "learning_rate": 7.951676281635428e-07, + "loss": 0.0372, + "num_input_tokens_seen": 35587968, + "step": 168630 + }, + { + "epoch": 18.551705170517053, + "grad_norm": 0.3924430012702942, + "learning_rate": 7.945672345893191e-07, + "loss": 0.0047, + "num_input_tokens_seen": 35589024, + "step": 168635 + }, + { + "epoch": 18.55225522552255, + "grad_norm": 0.02670927532017231, + "learning_rate": 7.939670641049585e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35590080, + "step": 168640 + }, + { + "epoch": 18.55280528052805, + "grad_norm": 0.017576929181814194, + "learning_rate": 7.933671167159817e-07, + "loss": 0.0114, + "num_input_tokens_seen": 35591104, + "step": 168645 + }, + { + "epoch": 18.553355335533553, + "grad_norm": 0.5570368766784668, + "learning_rate": 7.927673924279288e-07, + "loss": 0.0134, + "num_input_tokens_seen": 35592160, + "step": 168650 + }, + { + "epoch": 18.553905390539054, + "grad_norm": 0.03350791707634926, + "learning_rate": 7.921678912463232e-07, + "loss": 0.003, + "num_input_tokens_seen": 35593184, + "step": 168655 + }, + { + "epoch": 18.554455445544555, + "grad_norm": 0.2158193141222, + "learning_rate": 7.91568613176688e-07, + "loss": 0.0023, + "num_input_tokens_seen": 35594208, + "step": 168660 + }, + { + "epoch": 18.555005500550056, + "grad_norm": 0.01092501450330019, + "learning_rate": 7.909695582245524e-07, + "loss": 0.0766, + "num_input_tokens_seen": 35595328, + "step": 168665 + }, + { + "epoch": 18.555555555555557, + "grad_norm": 0.20932705700397491, + "learning_rate": 7.903707263954285e-07, + "loss": 0.002, + "num_input_tokens_seen": 35596416, + "step": 168670 + }, + { + "epoch": 18.556105610561055, + "grad_norm": 0.009821291081607342, + "learning_rate": 7.897721176948453e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35597536, + "step": 168675 + }, + { + "epoch": 18.556655665566556, + "grad_norm": 0.0540148988366127, + "learning_rate": 7.891737321283149e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35598624, + "step": 168680 + }, + { + "epoch": 18.557205720572057, + "grad_norm": 0.08338631689548492, + "learning_rate": 7.885755697013497e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35599616, + "step": 168685 + }, + { + "epoch": 18.557755775577558, + "grad_norm": 0.3507887125015259, + "learning_rate": 7.879776304194675e-07, + "loss": 0.0046, + "num_input_tokens_seen": 35600640, + "step": 168690 + }, + { + "epoch": 18.55830583058306, + "grad_norm": 0.06142356991767883, + "learning_rate": 7.873799142881777e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35601696, + "step": 168695 + }, + { + "epoch": 18.55885588558856, + "grad_norm": 2.1343390941619873, + "learning_rate": 7.867824213129899e-07, + "loss": 0.0161, + "num_input_tokens_seen": 35602752, + "step": 168700 + }, + { + "epoch": 18.55940594059406, + "grad_norm": 0.43660783767700195, + "learning_rate": 7.861851514994051e-07, + "loss": 0.0155, + "num_input_tokens_seen": 35603776, + "step": 168705 + }, + { + "epoch": 18.55995599559956, + "grad_norm": 0.06071893870830536, + "learning_rate": 7.855881048529357e-07, + "loss": 0.0623, + "num_input_tokens_seen": 35604864, + "step": 168710 + }, + { + "epoch": 18.56050605060506, + "grad_norm": 0.038201577961444855, + "learning_rate": 7.849912813790855e-07, + "loss": 0.002, + "num_input_tokens_seen": 35605920, + "step": 168715 + }, + { + "epoch": 18.56105610561056, + "grad_norm": 0.031445495784282684, + "learning_rate": 7.843946810833474e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35606976, + "step": 168720 + }, + { + "epoch": 18.561606160616062, + "grad_norm": 0.0195319801568985, + "learning_rate": 7.837983039712227e-07, + "loss": 0.0047, + "num_input_tokens_seen": 35608032, + "step": 168725 + }, + { + "epoch": 18.562156215621563, + "grad_norm": 0.05937139689922333, + "learning_rate": 7.832021500482123e-07, + "loss": 0.0707, + "num_input_tokens_seen": 35609088, + "step": 168730 + }, + { + "epoch": 18.562706270627064, + "grad_norm": 1.4804388284683228, + "learning_rate": 7.826062193198036e-07, + "loss": 0.0243, + "num_input_tokens_seen": 35610176, + "step": 168735 + }, + { + "epoch": 18.563256325632562, + "grad_norm": 0.06329183280467987, + "learning_rate": 7.82010511791495e-07, + "loss": 0.0854, + "num_input_tokens_seen": 35611264, + "step": 168740 + }, + { + "epoch": 18.563806380638063, + "grad_norm": 0.08203775435686111, + "learning_rate": 7.814150274687737e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35612320, + "step": 168745 + }, + { + "epoch": 18.564356435643564, + "grad_norm": 0.2144898772239685, + "learning_rate": 7.808197663571299e-07, + "loss": 0.0036, + "num_input_tokens_seen": 35613408, + "step": 168750 + }, + { + "epoch": 18.564906490649065, + "grad_norm": 0.006252865772694349, + "learning_rate": 7.802247284620479e-07, + "loss": 0.0488, + "num_input_tokens_seen": 35614496, + "step": 168755 + }, + { + "epoch": 18.565456545654566, + "grad_norm": 0.04027685523033142, + "learning_rate": 7.796299137890123e-07, + "loss": 0.0335, + "num_input_tokens_seen": 35615552, + "step": 168760 + }, + { + "epoch": 18.566006600660067, + "grad_norm": 0.05793413892388344, + "learning_rate": 7.790353223435076e-07, + "loss": 0.0041, + "num_input_tokens_seen": 35616608, + "step": 168765 + }, + { + "epoch": 18.566556655665565, + "grad_norm": 0.020441658794879913, + "learning_rate": 7.7844095413101e-07, + "loss": 0.0743, + "num_input_tokens_seen": 35617664, + "step": 168770 + }, + { + "epoch": 18.567106710671066, + "grad_norm": 0.1509907841682434, + "learning_rate": 7.778468091570012e-07, + "loss": 0.0983, + "num_input_tokens_seen": 35618656, + "step": 168775 + }, + { + "epoch": 18.567656765676567, + "grad_norm": 0.03283023089170456, + "learning_rate": 7.772528874269547e-07, + "loss": 0.004, + "num_input_tokens_seen": 35619744, + "step": 168780 + }, + { + "epoch": 18.568206820682068, + "grad_norm": 1.6430662870407104, + "learning_rate": 7.766591889463437e-07, + "loss": 0.0591, + "num_input_tokens_seen": 35620800, + "step": 168785 + }, + { + "epoch": 18.56875687568757, + "grad_norm": 0.009037992917001247, + "learning_rate": 7.760657137206445e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35621856, + "step": 168790 + }, + { + "epoch": 18.56930693069307, + "grad_norm": 0.04880750551819801, + "learning_rate": 7.754724617553221e-07, + "loss": 0.0039, + "num_input_tokens_seen": 35622912, + "step": 168795 + }, + { + "epoch": 18.56985698569857, + "grad_norm": 0.014990855008363724, + "learning_rate": 7.748794330558446e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35624000, + "step": 168800 + }, + { + "epoch": 18.57040704070407, + "grad_norm": 0.022867079824209213, + "learning_rate": 7.742866276276823e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35625024, + "step": 168805 + }, + { + "epoch": 18.57095709570957, + "grad_norm": 0.003861854085698724, + "learning_rate": 7.73694045476292e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35626112, + "step": 168810 + }, + { + "epoch": 18.57150715071507, + "grad_norm": 0.008191480301320553, + "learning_rate": 7.731016866071417e-07, + "loss": 0.0451, + "num_input_tokens_seen": 35627200, + "step": 168815 + }, + { + "epoch": 18.572057205720572, + "grad_norm": 2.6213133335113525, + "learning_rate": 7.725095510256853e-07, + "loss": 0.0123, + "num_input_tokens_seen": 35628256, + "step": 168820 + }, + { + "epoch": 18.572607260726073, + "grad_norm": 0.019543413072824478, + "learning_rate": 7.71917638737385e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35629312, + "step": 168825 + }, + { + "epoch": 18.573157315731574, + "grad_norm": 0.020412249490618706, + "learning_rate": 7.713259497476949e-07, + "loss": 0.0812, + "num_input_tokens_seen": 35630368, + "step": 168830 + }, + { + "epoch": 18.573707370737075, + "grad_norm": 0.12908010184764862, + "learning_rate": 7.707344840620634e-07, + "loss": 0.1198, + "num_input_tokens_seen": 35631392, + "step": 168835 + }, + { + "epoch": 18.574257425742573, + "grad_norm": 0.03868547081947327, + "learning_rate": 7.701432416859499e-07, + "loss": 0.0035, + "num_input_tokens_seen": 35632480, + "step": 168840 + }, + { + "epoch": 18.574807480748074, + "grad_norm": 0.007112847175449133, + "learning_rate": 7.695522226248003e-07, + "loss": 0.0042, + "num_input_tokens_seen": 35633568, + "step": 168845 + }, + { + "epoch": 18.575357535753575, + "grad_norm": 0.006668627727776766, + "learning_rate": 7.689614268840573e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35634592, + "step": 168850 + }, + { + "epoch": 18.575907590759076, + "grad_norm": 0.25006577372550964, + "learning_rate": 7.683708544691692e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35635680, + "step": 168855 + }, + { + "epoch": 18.576457645764577, + "grad_norm": 0.057682327926158905, + "learning_rate": 7.677805053855791e-07, + "loss": 0.002, + "num_input_tokens_seen": 35636768, + "step": 168860 + }, + { + "epoch": 18.57700770077008, + "grad_norm": 0.01795320399105549, + "learning_rate": 7.671903796387325e-07, + "loss": 0.0459, + "num_input_tokens_seen": 35637824, + "step": 168865 + }, + { + "epoch": 18.577557755775576, + "grad_norm": 0.237569659948349, + "learning_rate": 7.66600477234064e-07, + "loss": 0.0034, + "num_input_tokens_seen": 35638880, + "step": 168870 + }, + { + "epoch": 18.578107810781077, + "grad_norm": 0.01607133075594902, + "learning_rate": 7.660107981770081e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35639936, + "step": 168875 + }, + { + "epoch": 18.578657865786578, + "grad_norm": 0.04807170107960701, + "learning_rate": 7.654213424730022e-07, + "loss": 0.0152, + "num_input_tokens_seen": 35640960, + "step": 168880 + }, + { + "epoch": 18.57920792079208, + "grad_norm": 0.02421809546649456, + "learning_rate": 7.64832110127478e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35641952, + "step": 168885 + }, + { + "epoch": 18.57975797579758, + "grad_norm": 0.09245274215936661, + "learning_rate": 7.642431011458701e-07, + "loss": 0.0595, + "num_input_tokens_seen": 35643008, + "step": 168890 + }, + { + "epoch": 18.58030803080308, + "grad_norm": 0.09215916693210602, + "learning_rate": 7.636543155336046e-07, + "loss": 0.0043, + "num_input_tokens_seen": 35644032, + "step": 168895 + }, + { + "epoch": 18.580858085808583, + "grad_norm": 4.384836196899414, + "learning_rate": 7.630657532961022e-07, + "loss": 0.0416, + "num_input_tokens_seen": 35645088, + "step": 168900 + }, + { + "epoch": 18.58140814081408, + "grad_norm": 0.03792007640004158, + "learning_rate": 7.624774144387975e-07, + "loss": 0.0534, + "num_input_tokens_seen": 35646080, + "step": 168905 + }, + { + "epoch": 18.58195819581958, + "grad_norm": 0.003317774971947074, + "learning_rate": 7.618892989671056e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35647136, + "step": 168910 + }, + { + "epoch": 18.582508250825082, + "grad_norm": 0.008017556741833687, + "learning_rate": 7.613014068864499e-07, + "loss": 0.0449, + "num_input_tokens_seen": 35648224, + "step": 168915 + }, + { + "epoch": 18.583058305830583, + "grad_norm": 0.005846166517585516, + "learning_rate": 7.607137382022511e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35649280, + "step": 168920 + }, + { + "epoch": 18.583608360836084, + "grad_norm": 0.5554692149162292, + "learning_rate": 7.601262929199188e-07, + "loss": 0.0123, + "num_input_tokens_seen": 35650368, + "step": 168925 + }, + { + "epoch": 18.584158415841586, + "grad_norm": 0.024541571736335754, + "learning_rate": 7.595390710448735e-07, + "loss": 0.0324, + "num_input_tokens_seen": 35651392, + "step": 168930 + }, + { + "epoch": 18.584708470847083, + "grad_norm": 1.2655967473983765, + "learning_rate": 7.589520725825222e-07, + "loss": 0.0318, + "num_input_tokens_seen": 35652448, + "step": 168935 + }, + { + "epoch": 18.585258525852584, + "grad_norm": 0.009247270412743092, + "learning_rate": 7.583652975382799e-07, + "loss": 0.0487, + "num_input_tokens_seen": 35653408, + "step": 168940 + }, + { + "epoch": 18.585808580858085, + "grad_norm": 0.030263831838965416, + "learning_rate": 7.577787459175534e-07, + "loss": 0.0065, + "num_input_tokens_seen": 35654432, + "step": 168945 + }, + { + "epoch": 18.586358635863586, + "grad_norm": 0.004212470725178719, + "learning_rate": 7.571924177257439e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35655456, + "step": 168950 + }, + { + "epoch": 18.586908690869087, + "grad_norm": 0.45136046409606934, + "learning_rate": 7.566063129682638e-07, + "loss": 0.0323, + "num_input_tokens_seen": 35656608, + "step": 168955 + }, + { + "epoch": 18.58745874587459, + "grad_norm": 4.478260517120361, + "learning_rate": 7.560204316505087e-07, + "loss": 0.0733, + "num_input_tokens_seen": 35657696, + "step": 168960 + }, + { + "epoch": 18.58800880088009, + "grad_norm": 0.017908083274960518, + "learning_rate": 7.554347737778772e-07, + "loss": 0.0457, + "num_input_tokens_seen": 35658816, + "step": 168965 + }, + { + "epoch": 18.588558855885587, + "grad_norm": 0.02707696333527565, + "learning_rate": 7.548493393557704e-07, + "loss": 0.0099, + "num_input_tokens_seen": 35659840, + "step": 168970 + }, + { + "epoch": 18.58910891089109, + "grad_norm": 0.014440781436860561, + "learning_rate": 7.54264128389584e-07, + "loss": 0.0148, + "num_input_tokens_seen": 35660864, + "step": 168975 + }, + { + "epoch": 18.58965896589659, + "grad_norm": 0.011523602530360222, + "learning_rate": 7.536791408847139e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35661888, + "step": 168980 + }, + { + "epoch": 18.59020902090209, + "grad_norm": 0.14035311341285706, + "learning_rate": 7.53094376846547e-07, + "loss": 0.0336, + "num_input_tokens_seen": 35662912, + "step": 168985 + }, + { + "epoch": 18.59075907590759, + "grad_norm": 0.23879052698612213, + "learning_rate": 7.525098362804711e-07, + "loss": 0.0077, + "num_input_tokens_seen": 35664032, + "step": 168990 + }, + { + "epoch": 18.591309130913093, + "grad_norm": 0.025402994826436043, + "learning_rate": 7.519255191918789e-07, + "loss": 0.0343, + "num_input_tokens_seen": 35665056, + "step": 168995 + }, + { + "epoch": 18.59185918591859, + "grad_norm": 0.036480844020843506, + "learning_rate": 7.513414255861522e-07, + "loss": 0.0088, + "num_input_tokens_seen": 35666112, + "step": 169000 + }, + { + "epoch": 18.59240924092409, + "grad_norm": 0.025368962436914444, + "learning_rate": 7.507575554686813e-07, + "loss": 0.0523, + "num_input_tokens_seen": 35667168, + "step": 169005 + }, + { + "epoch": 18.592959295929592, + "grad_norm": 0.33816850185394287, + "learning_rate": 7.501739088448395e-07, + "loss": 0.099, + "num_input_tokens_seen": 35668256, + "step": 169010 + }, + { + "epoch": 18.593509350935093, + "grad_norm": 0.03658153489232063, + "learning_rate": 7.49590485720006e-07, + "loss": 0.053, + "num_input_tokens_seen": 35669312, + "step": 169015 + }, + { + "epoch": 18.594059405940595, + "grad_norm": 0.2438105046749115, + "learning_rate": 7.49007286099565e-07, + "loss": 0.0062, + "num_input_tokens_seen": 35670368, + "step": 169020 + }, + { + "epoch": 18.594609460946096, + "grad_norm": 0.006058951839804649, + "learning_rate": 7.484243099888822e-07, + "loss": 0.0007, + "num_input_tokens_seen": 35671456, + "step": 169025 + }, + { + "epoch": 18.595159515951597, + "grad_norm": 0.008542482741177082, + "learning_rate": 7.478415573933362e-07, + "loss": 0.1182, + "num_input_tokens_seen": 35672480, + "step": 169030 + }, + { + "epoch": 18.595709570957094, + "grad_norm": 0.0024043226148933172, + "learning_rate": 7.472590283183006e-07, + "loss": 0.0506, + "num_input_tokens_seen": 35673504, + "step": 169035 + }, + { + "epoch": 18.596259625962595, + "grad_norm": 0.23613135516643524, + "learning_rate": 7.466767227691379e-07, + "loss": 0.004, + "num_input_tokens_seen": 35674560, + "step": 169040 + }, + { + "epoch": 18.596809680968097, + "grad_norm": 0.11910940706729889, + "learning_rate": 7.460946407512215e-07, + "loss": 0.1351, + "num_input_tokens_seen": 35675584, + "step": 169045 + }, + { + "epoch": 18.597359735973598, + "grad_norm": 0.06904305517673492, + "learning_rate": 7.455127822699082e-07, + "loss": 0.1257, + "num_input_tokens_seen": 35676608, + "step": 169050 + }, + { + "epoch": 18.5979097909791, + "grad_norm": 0.007321471814066172, + "learning_rate": 7.449311473305659e-07, + "loss": 0.002, + "num_input_tokens_seen": 35677696, + "step": 169055 + }, + { + "epoch": 18.5984598459846, + "grad_norm": 1.319305419921875, + "learning_rate": 7.443497359385571e-07, + "loss": 0.0413, + "num_input_tokens_seen": 35678720, + "step": 169060 + }, + { + "epoch": 18.599009900990097, + "grad_norm": 1.1374133825302124, + "learning_rate": 7.437685480992356e-07, + "loss": 0.0106, + "num_input_tokens_seen": 35679776, + "step": 169065 + }, + { + "epoch": 18.5995599559956, + "grad_norm": 0.047408442944288254, + "learning_rate": 7.431875838179641e-07, + "loss": 0.0032, + "num_input_tokens_seen": 35680832, + "step": 169070 + }, + { + "epoch": 18.6001100110011, + "grad_norm": 0.869796872138977, + "learning_rate": 7.426068431000882e-07, + "loss": 0.0075, + "num_input_tokens_seen": 35681984, + "step": 169075 + }, + { + "epoch": 18.6006600660066, + "grad_norm": 0.09928860515356064, + "learning_rate": 7.420263259509674e-07, + "loss": 0.0078, + "num_input_tokens_seen": 35683072, + "step": 169080 + }, + { + "epoch": 18.6012101210121, + "grad_norm": 0.04105779528617859, + "learning_rate": 7.414460323759503e-07, + "loss": 0.0039, + "num_input_tokens_seen": 35684064, + "step": 169085 + }, + { + "epoch": 18.601760176017603, + "grad_norm": 0.24651925265789032, + "learning_rate": 7.408659623803827e-07, + "loss": 0.0072, + "num_input_tokens_seen": 35685088, + "step": 169090 + }, + { + "epoch": 18.602310231023104, + "grad_norm": 2.0408742427825928, + "learning_rate": 7.402861159696156e-07, + "loss": 0.1528, + "num_input_tokens_seen": 35686144, + "step": 169095 + }, + { + "epoch": 18.6028602860286, + "grad_norm": 0.027340609580278397, + "learning_rate": 7.397064931489894e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35687168, + "step": 169100 + }, + { + "epoch": 18.603410341034103, + "grad_norm": 0.05824946239590645, + "learning_rate": 7.39127093923847e-07, + "loss": 0.0104, + "num_input_tokens_seen": 35688224, + "step": 169105 + }, + { + "epoch": 18.603960396039604, + "grad_norm": 0.07146446406841278, + "learning_rate": 7.385479182995259e-07, + "loss": 0.082, + "num_input_tokens_seen": 35689248, + "step": 169110 + }, + { + "epoch": 18.604510451045105, + "grad_norm": 3.850640058517456, + "learning_rate": 7.379689662813688e-07, + "loss": 0.0519, + "num_input_tokens_seen": 35690304, + "step": 169115 + }, + { + "epoch": 18.605060506050606, + "grad_norm": 0.010268948040902615, + "learning_rate": 7.373902378747105e-07, + "loss": 0.0023, + "num_input_tokens_seen": 35691360, + "step": 169120 + }, + { + "epoch": 18.605610561056107, + "grad_norm": 0.03038940392434597, + "learning_rate": 7.368117330848856e-07, + "loss": 0.006, + "num_input_tokens_seen": 35692448, + "step": 169125 + }, + { + "epoch": 18.606160616061608, + "grad_norm": 1.0715343952178955, + "learning_rate": 7.362334519172231e-07, + "loss": 0.013, + "num_input_tokens_seen": 35693536, + "step": 169130 + }, + { + "epoch": 18.606710671067106, + "grad_norm": 0.27315953373908997, + "learning_rate": 7.35655394377055e-07, + "loss": 0.0041, + "num_input_tokens_seen": 35694624, + "step": 169135 + }, + { + "epoch": 18.607260726072607, + "grad_norm": 0.020389476791024208, + "learning_rate": 7.350775604697046e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35695680, + "step": 169140 + }, + { + "epoch": 18.607810781078108, + "grad_norm": 0.01903912052512169, + "learning_rate": 7.344999502005012e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35696768, + "step": 169145 + }, + { + "epoch": 18.60836083608361, + "grad_norm": 2.3740341663360596, + "learning_rate": 7.339225635747709e-07, + "loss": 0.0731, + "num_input_tokens_seen": 35697824, + "step": 169150 + }, + { + "epoch": 18.60891089108911, + "grad_norm": 0.02390114963054657, + "learning_rate": 7.333454005978318e-07, + "loss": 0.0053, + "num_input_tokens_seen": 35698912, + "step": 169155 + }, + { + "epoch": 18.60946094609461, + "grad_norm": 0.07589158415794373, + "learning_rate": 7.327684612750046e-07, + "loss": 0.0042, + "num_input_tokens_seen": 35699968, + "step": 169160 + }, + { + "epoch": 18.61001100110011, + "grad_norm": 0.0078080384992063046, + "learning_rate": 7.321917456116018e-07, + "loss": 0.0128, + "num_input_tokens_seen": 35700992, + "step": 169165 + }, + { + "epoch": 18.61056105610561, + "grad_norm": 0.10973145067691803, + "learning_rate": 7.316152536129439e-07, + "loss": 0.0166, + "num_input_tokens_seen": 35702080, + "step": 169170 + }, + { + "epoch": 18.61111111111111, + "grad_norm": 0.011506091803312302, + "learning_rate": 7.310389852843464e-07, + "loss": 0.0081, + "num_input_tokens_seen": 35703104, + "step": 169175 + }, + { + "epoch": 18.611661166116612, + "grad_norm": 0.024088237434625626, + "learning_rate": 7.304629406311158e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35704160, + "step": 169180 + }, + { + "epoch": 18.612211221122113, + "grad_norm": 0.009868272580206394, + "learning_rate": 7.298871196585621e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35705184, + "step": 169185 + }, + { + "epoch": 18.612761276127614, + "grad_norm": 0.032183267176151276, + "learning_rate": 7.293115223719921e-07, + "loss": 0.0221, + "num_input_tokens_seen": 35706208, + "step": 169190 + }, + { + "epoch": 18.61331133113311, + "grad_norm": 0.0027955833356827497, + "learning_rate": 7.287361487767153e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35707264, + "step": 169195 + }, + { + "epoch": 18.613861386138613, + "grad_norm": 0.0027311432640999556, + "learning_rate": 7.281609988780274e-07, + "loss": 0.0466, + "num_input_tokens_seen": 35708320, + "step": 169200 + }, + { + "epoch": 18.614411441144114, + "grad_norm": 0.0200976375490427, + "learning_rate": 7.275860726812301e-07, + "loss": 0.0034, + "num_input_tokens_seen": 35709376, + "step": 169205 + }, + { + "epoch": 18.614961496149615, + "grad_norm": 0.052874211221933365, + "learning_rate": 7.270113701916298e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35710464, + "step": 169210 + }, + { + "epoch": 18.615511551155116, + "grad_norm": 0.06417365372180939, + "learning_rate": 7.26436891414517e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35711488, + "step": 169215 + }, + { + "epoch": 18.616061606160617, + "grad_norm": 0.003246550913900137, + "learning_rate": 7.258626363551901e-07, + "loss": 0.0132, + "num_input_tokens_seen": 35712576, + "step": 169220 + }, + { + "epoch": 18.616611661166118, + "grad_norm": 0.02646464668214321, + "learning_rate": 7.25288605018934e-07, + "loss": 0.005, + "num_input_tokens_seen": 35713568, + "step": 169225 + }, + { + "epoch": 18.617161716171616, + "grad_norm": 0.008548949845135212, + "learning_rate": 7.247147974110469e-07, + "loss": 0.0688, + "num_input_tokens_seen": 35714592, + "step": 169230 + }, + { + "epoch": 18.617711771177117, + "grad_norm": 0.8208214640617371, + "learning_rate": 7.241412135368192e-07, + "loss": 0.0083, + "num_input_tokens_seen": 35715648, + "step": 169235 + }, + { + "epoch": 18.618261826182618, + "grad_norm": 0.3502710461616516, + "learning_rate": 7.235678534015272e-07, + "loss": 0.0365, + "num_input_tokens_seen": 35716672, + "step": 169240 + }, + { + "epoch": 18.61881188118812, + "grad_norm": 0.05189270153641701, + "learning_rate": 7.229947170104667e-07, + "loss": 0.0282, + "num_input_tokens_seen": 35717728, + "step": 169245 + }, + { + "epoch": 18.61936193619362, + "grad_norm": 0.4022012948989868, + "learning_rate": 7.22421804368914e-07, + "loss": 0.0045, + "num_input_tokens_seen": 35718848, + "step": 169250 + }, + { + "epoch": 18.61991199119912, + "grad_norm": 0.1992252916097641, + "learning_rate": 7.218491154821455e-07, + "loss": 0.0033, + "num_input_tokens_seen": 35719872, + "step": 169255 + }, + { + "epoch": 18.620462046204622, + "grad_norm": 0.1534063220024109, + "learning_rate": 7.212766503554458e-07, + "loss": 0.0055, + "num_input_tokens_seen": 35720960, + "step": 169260 + }, + { + "epoch": 18.62101210121012, + "grad_norm": 0.006380569655448198, + "learning_rate": 7.207044089940884e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35721984, + "step": 169265 + }, + { + "epoch": 18.62156215621562, + "grad_norm": 0.058874186128377914, + "learning_rate": 7.201323914033525e-07, + "loss": 0.0372, + "num_input_tokens_seen": 35723072, + "step": 169270 + }, + { + "epoch": 18.622112211221122, + "grad_norm": 2.0345497131347656, + "learning_rate": 7.195605975885033e-07, + "loss": 0.044, + "num_input_tokens_seen": 35724160, + "step": 169275 + }, + { + "epoch": 18.622662266226623, + "grad_norm": 0.018527084961533546, + "learning_rate": 7.189890275548117e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35725248, + "step": 169280 + }, + { + "epoch": 18.623212321232124, + "grad_norm": 0.003854036098346114, + "learning_rate": 7.184176813075483e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35726272, + "step": 169285 + }, + { + "epoch": 18.623762376237625, + "grad_norm": 0.21794791519641876, + "learning_rate": 7.178465588519784e-07, + "loss": 0.0987, + "num_input_tokens_seen": 35727328, + "step": 169290 + }, + { + "epoch": 18.624312431243123, + "grad_norm": 0.03240549564361572, + "learning_rate": 7.172756601933645e-07, + "loss": 0.0804, + "num_input_tokens_seen": 35728384, + "step": 169295 + }, + { + "epoch": 18.624862486248624, + "grad_norm": 0.5316628813743591, + "learning_rate": 7.167049853369689e-07, + "loss": 0.0167, + "num_input_tokens_seen": 35729472, + "step": 169300 + }, + { + "epoch": 18.625412541254125, + "grad_norm": 0.007381673902273178, + "learning_rate": 7.161345342880487e-07, + "loss": 0.0069, + "num_input_tokens_seen": 35730528, + "step": 169305 + }, + { + "epoch": 18.625962596259626, + "grad_norm": 0.03893469646573067, + "learning_rate": 7.155643070518692e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35731520, + "step": 169310 + }, + { + "epoch": 18.626512651265127, + "grad_norm": 0.018498359248042107, + "learning_rate": 7.14994303633676e-07, + "loss": 0.0115, + "num_input_tokens_seen": 35732576, + "step": 169315 + }, + { + "epoch": 18.627062706270628, + "grad_norm": 0.41942134499549866, + "learning_rate": 7.144245240387259e-07, + "loss": 0.021, + "num_input_tokens_seen": 35733568, + "step": 169320 + }, + { + "epoch": 18.62761276127613, + "grad_norm": 0.04784919321537018, + "learning_rate": 7.138549682722762e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35734624, + "step": 169325 + }, + { + "epoch": 18.628162816281627, + "grad_norm": 0.016993969678878784, + "learning_rate": 7.132856363395696e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35735712, + "step": 169330 + }, + { + "epoch": 18.628712871287128, + "grad_norm": 0.00889009702950716, + "learning_rate": 7.127165282458576e-07, + "loss": 0.0867, + "num_input_tokens_seen": 35736768, + "step": 169335 + }, + { + "epoch": 18.62926292629263, + "grad_norm": 0.10837861150503159, + "learning_rate": 7.121476439963831e-07, + "loss": 0.1184, + "num_input_tokens_seen": 35737856, + "step": 169340 + }, + { + "epoch": 18.62981298129813, + "grad_norm": 0.0069599878042936325, + "learning_rate": 7.115789835963837e-07, + "loss": 0.0076, + "num_input_tokens_seen": 35738912, + "step": 169345 + }, + { + "epoch": 18.63036303630363, + "grad_norm": 0.014902533032000065, + "learning_rate": 7.110105470511108e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35739936, + "step": 169350 + }, + { + "epoch": 18.630913091309132, + "grad_norm": 0.006745967548340559, + "learning_rate": 7.104423343657962e-07, + "loss": 0.0038, + "num_input_tokens_seen": 35740960, + "step": 169355 + }, + { + "epoch": 18.63146314631463, + "grad_norm": 0.0271304901689291, + "learning_rate": 7.09874345545683e-07, + "loss": 0.1174, + "num_input_tokens_seen": 35742048, + "step": 169360 + }, + { + "epoch": 18.63201320132013, + "grad_norm": 0.008819273672997952, + "learning_rate": 7.093065805960031e-07, + "loss": 0.0012, + "num_input_tokens_seen": 35743168, + "step": 169365 + }, + { + "epoch": 18.632563256325632, + "grad_norm": 0.028756456449627876, + "learning_rate": 7.087390395219828e-07, + "loss": 0.003, + "num_input_tokens_seen": 35744192, + "step": 169370 + }, + { + "epoch": 18.633113311331133, + "grad_norm": 0.008489192463457584, + "learning_rate": 7.081717223288625e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35745280, + "step": 169375 + }, + { + "epoch": 18.633663366336634, + "grad_norm": 0.02368631213903427, + "learning_rate": 7.076046290218629e-07, + "loss": 0.006, + "num_input_tokens_seen": 35746368, + "step": 169380 + }, + { + "epoch": 18.634213421342135, + "grad_norm": 0.22151519358158112, + "learning_rate": 7.070377596062216e-07, + "loss": 0.0029, + "num_input_tokens_seen": 35747392, + "step": 169385 + }, + { + "epoch": 18.634763476347636, + "grad_norm": 0.013848007656633854, + "learning_rate": 7.064711140871538e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35748448, + "step": 169390 + }, + { + "epoch": 18.635313531353134, + "grad_norm": 0.8737460970878601, + "learning_rate": 7.059046924698831e-07, + "loss": 0.0098, + "num_input_tokens_seen": 35749504, + "step": 169395 + }, + { + "epoch": 18.635863586358635, + "grad_norm": 0.00970539078116417, + "learning_rate": 7.05338494759633e-07, + "loss": 0.0276, + "num_input_tokens_seen": 35750560, + "step": 169400 + }, + { + "epoch": 18.636413641364136, + "grad_norm": 0.6279535293579102, + "learning_rate": 7.047725209616162e-07, + "loss": 0.0056, + "num_input_tokens_seen": 35751680, + "step": 169405 + }, + { + "epoch": 18.636963696369637, + "grad_norm": 0.021120676770806313, + "learning_rate": 7.04206771081059e-07, + "loss": 0.0159, + "num_input_tokens_seen": 35752672, + "step": 169410 + }, + { + "epoch": 18.63751375137514, + "grad_norm": 1.7678831815719604, + "learning_rate": 7.036412451231683e-07, + "loss": 0.0169, + "num_input_tokens_seen": 35753696, + "step": 169415 + }, + { + "epoch": 18.63806380638064, + "grad_norm": 0.03600502759218216, + "learning_rate": 7.030759430931538e-07, + "loss": 0.0033, + "num_input_tokens_seen": 35754752, + "step": 169420 + }, + { + "epoch": 18.638613861386137, + "grad_norm": 0.005168893840163946, + "learning_rate": 7.025108649962336e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35755776, + "step": 169425 + }, + { + "epoch": 18.639163916391638, + "grad_norm": 0.011227903887629509, + "learning_rate": 7.019460108376091e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35756800, + "step": 169430 + }, + { + "epoch": 18.63971397139714, + "grad_norm": 0.07571521401405334, + "learning_rate": 7.0138138062249e-07, + "loss": 0.0039, + "num_input_tokens_seen": 35757792, + "step": 169435 + }, + { + "epoch": 18.64026402640264, + "grad_norm": 0.017113178968429565, + "learning_rate": 7.008169743560806e-07, + "loss": 0.0622, + "num_input_tokens_seen": 35758880, + "step": 169440 + }, + { + "epoch": 18.64081408140814, + "grad_norm": 0.017793213948607445, + "learning_rate": 7.002527920435792e-07, + "loss": 0.0395, + "num_input_tokens_seen": 35759936, + "step": 169445 + }, + { + "epoch": 18.641364136413642, + "grad_norm": 0.015125652775168419, + "learning_rate": 6.996888336901902e-07, + "loss": 0.0675, + "num_input_tokens_seen": 35760960, + "step": 169450 + }, + { + "epoch": 18.641914191419144, + "grad_norm": 0.1266184151172638, + "learning_rate": 6.991250993011039e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35762080, + "step": 169455 + }, + { + "epoch": 18.64246424642464, + "grad_norm": 1.9402508735656738, + "learning_rate": 6.985615888815273e-07, + "loss": 0.0479, + "num_input_tokens_seen": 35763136, + "step": 169460 + }, + { + "epoch": 18.643014301430142, + "grad_norm": 0.05950276181101799, + "learning_rate": 6.979983024366421e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35764128, + "step": 169465 + }, + { + "epoch": 18.643564356435643, + "grad_norm": 0.4239281713962555, + "learning_rate": 6.974352399716472e-07, + "loss": 0.0073, + "num_input_tokens_seen": 35765216, + "step": 169470 + }, + { + "epoch": 18.644114411441144, + "grad_norm": 0.03469965234398842, + "learning_rate": 6.968724014917299e-07, + "loss": 0.0041, + "num_input_tokens_seen": 35766208, + "step": 169475 + }, + { + "epoch": 18.644664466446645, + "grad_norm": 0.0050765457563102245, + "learning_rate": 6.963097870020807e-07, + "loss": 0.0062, + "num_input_tokens_seen": 35767232, + "step": 169480 + }, + { + "epoch": 18.645214521452147, + "grad_norm": 2.7847788333892822, + "learning_rate": 6.957473965078787e-07, + "loss": 0.0305, + "num_input_tokens_seen": 35768320, + "step": 169485 + }, + { + "epoch": 18.645764576457644, + "grad_norm": 1.9174115657806396, + "learning_rate": 6.951852300143114e-07, + "loss": 0.0286, + "num_input_tokens_seen": 35769408, + "step": 169490 + }, + { + "epoch": 18.646314631463145, + "grad_norm": 0.003232046961784363, + "learning_rate": 6.946232875265579e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35770432, + "step": 169495 + }, + { + "epoch": 18.646864686468646, + "grad_norm": 0.15404871106147766, + "learning_rate": 6.940615690498003e-07, + "loss": 0.0029, + "num_input_tokens_seen": 35771488, + "step": 169500 + }, + { + "epoch": 18.647414741474147, + "grad_norm": 0.022331027314066887, + "learning_rate": 6.935000745892179e-07, + "loss": 0.0014, + "num_input_tokens_seen": 35772544, + "step": 169505 + }, + { + "epoch": 18.64796479647965, + "grad_norm": 0.04670441523194313, + "learning_rate": 6.929388041499757e-07, + "loss": 0.0392, + "num_input_tokens_seen": 35773632, + "step": 169510 + }, + { + "epoch": 18.64851485148515, + "grad_norm": 0.01042722538113594, + "learning_rate": 6.923777577372559e-07, + "loss": 0.0084, + "num_input_tokens_seen": 35774688, + "step": 169515 + }, + { + "epoch": 18.64906490649065, + "grad_norm": 0.0870472863316536, + "learning_rate": 6.918169353562265e-07, + "loss": 0.044, + "num_input_tokens_seen": 35775776, + "step": 169520 + }, + { + "epoch": 18.649614961496148, + "grad_norm": 0.14414392411708832, + "learning_rate": 6.912563370120528e-07, + "loss": 0.0036, + "num_input_tokens_seen": 35776864, + "step": 169525 + }, + { + "epoch": 18.65016501650165, + "grad_norm": 0.04367212578654289, + "learning_rate": 6.906959627099113e-07, + "loss": 0.0046, + "num_input_tokens_seen": 35777888, + "step": 169530 + }, + { + "epoch": 18.65071507150715, + "grad_norm": 0.09560573101043701, + "learning_rate": 6.901358124549562e-07, + "loss": 0.0014, + "num_input_tokens_seen": 35778944, + "step": 169535 + }, + { + "epoch": 18.65126512651265, + "grad_norm": 0.006387399509549141, + "learning_rate": 6.895758862523555e-07, + "loss": 0.0799, + "num_input_tokens_seen": 35779936, + "step": 169540 + }, + { + "epoch": 18.651815181518153, + "grad_norm": 0.06971636414527893, + "learning_rate": 6.890161841072662e-07, + "loss": 0.0023, + "num_input_tokens_seen": 35781024, + "step": 169545 + }, + { + "epoch": 18.652365236523654, + "grad_norm": 0.19398096203804016, + "learning_rate": 6.884567060248509e-07, + "loss": 0.0068, + "num_input_tokens_seen": 35782016, + "step": 169550 + }, + { + "epoch": 18.652915291529155, + "grad_norm": 0.05109229311347008, + "learning_rate": 6.878974520102638e-07, + "loss": 0.0035, + "num_input_tokens_seen": 35783040, + "step": 169555 + }, + { + "epoch": 18.653465346534652, + "grad_norm": 0.011618271470069885, + "learning_rate": 6.873384220686619e-07, + "loss": 0.1037, + "num_input_tokens_seen": 35784064, + "step": 169560 + }, + { + "epoch": 18.654015401540153, + "grad_norm": 0.16327004134655, + "learning_rate": 6.867796162051937e-07, + "loss": 0.0043, + "num_input_tokens_seen": 35785120, + "step": 169565 + }, + { + "epoch": 18.654565456545654, + "grad_norm": 0.034433018416166306, + "learning_rate": 6.862210344250108e-07, + "loss": 0.0042, + "num_input_tokens_seen": 35786176, + "step": 169570 + }, + { + "epoch": 18.655115511551156, + "grad_norm": 0.228509321808815, + "learning_rate": 6.856626767332646e-07, + "loss": 0.0044, + "num_input_tokens_seen": 35787232, + "step": 169575 + }, + { + "epoch": 18.655665566556657, + "grad_norm": 0.033219147473573685, + "learning_rate": 6.851045431350927e-07, + "loss": 0.0042, + "num_input_tokens_seen": 35788256, + "step": 169580 + }, + { + "epoch": 18.656215621562158, + "grad_norm": 0.009505385532975197, + "learning_rate": 6.845466336356494e-07, + "loss": 0.0039, + "num_input_tokens_seen": 35789280, + "step": 169585 + }, + { + "epoch": 18.656765676567655, + "grad_norm": 0.077361561357975, + "learning_rate": 6.839889482400719e-07, + "loss": 0.0404, + "num_input_tokens_seen": 35790304, + "step": 169590 + }, + { + "epoch": 18.657315731573156, + "grad_norm": 0.01011402253061533, + "learning_rate": 6.83431486953498e-07, + "loss": 0.085, + "num_input_tokens_seen": 35791328, + "step": 169595 + }, + { + "epoch": 18.657865786578657, + "grad_norm": 0.020685911178588867, + "learning_rate": 6.828742497810708e-07, + "loss": 0.0046, + "num_input_tokens_seen": 35792384, + "step": 169600 + }, + { + "epoch": 18.65841584158416, + "grad_norm": 0.029945172369480133, + "learning_rate": 6.823172367279195e-07, + "loss": 0.0456, + "num_input_tokens_seen": 35793440, + "step": 169605 + }, + { + "epoch": 18.65896589658966, + "grad_norm": 0.01562964916229248, + "learning_rate": 6.817604477991818e-07, + "loss": 0.1188, + "num_input_tokens_seen": 35794464, + "step": 169610 + }, + { + "epoch": 18.65951595159516, + "grad_norm": 0.017087992280721664, + "learning_rate": 6.812038829999923e-07, + "loss": 0.0059, + "num_input_tokens_seen": 35795584, + "step": 169615 + }, + { + "epoch": 18.66006600660066, + "grad_norm": 0.014263315126299858, + "learning_rate": 6.806475423354747e-07, + "loss": 0.0045, + "num_input_tokens_seen": 35796544, + "step": 169620 + }, + { + "epoch": 18.66061606160616, + "grad_norm": 2.338675022125244, + "learning_rate": 6.800914258107611e-07, + "loss": 0.0917, + "num_input_tokens_seen": 35797504, + "step": 169625 + }, + { + "epoch": 18.66116611661166, + "grad_norm": 0.051253098994493484, + "learning_rate": 6.795355334309777e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35798496, + "step": 169630 + }, + { + "epoch": 18.66171617161716, + "grad_norm": 0.03898398205637932, + "learning_rate": 6.789798652012402e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35799520, + "step": 169635 + }, + { + "epoch": 18.662266226622663, + "grad_norm": 0.758538544178009, + "learning_rate": 6.78424421126675e-07, + "loss": 0.0142, + "num_input_tokens_seen": 35800544, + "step": 169640 + }, + { + "epoch": 18.662816281628164, + "grad_norm": 0.008561968803405762, + "learning_rate": 6.778692012124e-07, + "loss": 0.0033, + "num_input_tokens_seen": 35801536, + "step": 169645 + }, + { + "epoch": 18.663366336633665, + "grad_norm": 0.028756603598594666, + "learning_rate": 6.773142054635389e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35802560, + "step": 169650 + }, + { + "epoch": 18.663916391639162, + "grad_norm": 0.020079391077160835, + "learning_rate": 6.76759433885199e-07, + "loss": 0.0091, + "num_input_tokens_seen": 35803552, + "step": 169655 + }, + { + "epoch": 18.664466446644663, + "grad_norm": 0.10414345562458038, + "learning_rate": 6.762048864824955e-07, + "loss": 0.0073, + "num_input_tokens_seen": 35804608, + "step": 169660 + }, + { + "epoch": 18.665016501650165, + "grad_norm": 0.38552942872047424, + "learning_rate": 6.756505632605381e-07, + "loss": 0.0234, + "num_input_tokens_seen": 35805760, + "step": 169665 + }, + { + "epoch": 18.665566556655666, + "grad_norm": 2.419522523880005, + "learning_rate": 6.750964642244423e-07, + "loss": 0.1063, + "num_input_tokens_seen": 35806816, + "step": 169670 + }, + { + "epoch": 18.666116611661167, + "grad_norm": 0.013747058808803558, + "learning_rate": 6.745425893793067e-07, + "loss": 0.014, + "num_input_tokens_seen": 35807936, + "step": 169675 + }, + { + "epoch": 18.666666666666668, + "grad_norm": 2.1719701290130615, + "learning_rate": 6.739889387302411e-07, + "loss": 0.0139, + "num_input_tokens_seen": 35809024, + "step": 169680 + }, + { + "epoch": 18.66721672167217, + "grad_norm": 0.956045389175415, + "learning_rate": 6.734355122823444e-07, + "loss": 0.0299, + "num_input_tokens_seen": 35810080, + "step": 169685 + }, + { + "epoch": 18.667766776677666, + "grad_norm": 0.007631686050444841, + "learning_rate": 6.728823100407234e-07, + "loss": 0.1023, + "num_input_tokens_seen": 35811104, + "step": 169690 + }, + { + "epoch": 18.668316831683168, + "grad_norm": 0.060789547860622406, + "learning_rate": 6.723293320104685e-07, + "loss": 0.0322, + "num_input_tokens_seen": 35812192, + "step": 169695 + }, + { + "epoch": 18.66886688668867, + "grad_norm": 0.09307482838630676, + "learning_rate": 6.717765781966839e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35813248, + "step": 169700 + }, + { + "epoch": 18.66941694169417, + "grad_norm": 0.8004139065742493, + "learning_rate": 6.712240486044602e-07, + "loss": 0.0185, + "num_input_tokens_seen": 35814336, + "step": 169705 + }, + { + "epoch": 18.66996699669967, + "grad_norm": 0.012322350405156612, + "learning_rate": 6.706717432388876e-07, + "loss": 0.0071, + "num_input_tokens_seen": 35815456, + "step": 169710 + }, + { + "epoch": 18.670517051705172, + "grad_norm": 0.011038273572921753, + "learning_rate": 6.701196621050648e-07, + "loss": 0.0194, + "num_input_tokens_seen": 35816576, + "step": 169715 + }, + { + "epoch": 18.67106710671067, + "grad_norm": 0.005088771693408489, + "learning_rate": 6.695678052080684e-07, + "loss": 0.0008, + "num_input_tokens_seen": 35817632, + "step": 169720 + }, + { + "epoch": 18.67161716171617, + "grad_norm": 1.4685887098312378, + "learning_rate": 6.690161725529915e-07, + "loss": 0.0445, + "num_input_tokens_seen": 35818688, + "step": 169725 + }, + { + "epoch": 18.67216721672167, + "grad_norm": 2.6267082691192627, + "learning_rate": 6.684647641449215e-07, + "loss": 0.0956, + "num_input_tokens_seen": 35819712, + "step": 169730 + }, + { + "epoch": 18.672717271727173, + "grad_norm": 0.013674094341695309, + "learning_rate": 6.679135799889325e-07, + "loss": 0.0621, + "num_input_tokens_seen": 35820800, + "step": 169735 + }, + { + "epoch": 18.673267326732674, + "grad_norm": 0.06897997856140137, + "learning_rate": 6.673626200901117e-07, + "loss": 0.0505, + "num_input_tokens_seen": 35821856, + "step": 169740 + }, + { + "epoch": 18.673817381738175, + "grad_norm": 0.05959576740860939, + "learning_rate": 6.668118844535331e-07, + "loss": 0.0062, + "num_input_tokens_seen": 35822880, + "step": 169745 + }, + { + "epoch": 18.674367436743676, + "grad_norm": 4.600268363952637, + "learning_rate": 6.662613730842704e-07, + "loss": 0.0718, + "num_input_tokens_seen": 35823968, + "step": 169750 + }, + { + "epoch": 18.674917491749174, + "grad_norm": 2.1270766258239746, + "learning_rate": 6.657110859874027e-07, + "loss": 0.0909, + "num_input_tokens_seen": 35824992, + "step": 169755 + }, + { + "epoch": 18.675467546754675, + "grad_norm": 0.07944565266370773, + "learning_rate": 6.651610231679956e-07, + "loss": 0.0279, + "num_input_tokens_seen": 35826048, + "step": 169760 + }, + { + "epoch": 18.676017601760176, + "grad_norm": 0.22952096164226532, + "learning_rate": 6.646111846311254e-07, + "loss": 0.0251, + "num_input_tokens_seen": 35827136, + "step": 169765 + }, + { + "epoch": 18.676567656765677, + "grad_norm": 0.005572719499468803, + "learning_rate": 6.640615703818575e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35828192, + "step": 169770 + }, + { + "epoch": 18.677117711771178, + "grad_norm": 0.0791698694229126, + "learning_rate": 6.635121804252548e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35829216, + "step": 169775 + }, + { + "epoch": 18.67766776677668, + "grad_norm": 0.006347703747451305, + "learning_rate": 6.629630147663823e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35830272, + "step": 169780 + }, + { + "epoch": 18.678217821782177, + "grad_norm": 0.019679242745041847, + "learning_rate": 6.624140734103029e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35831360, + "step": 169785 + }, + { + "epoch": 18.678767876787678, + "grad_norm": 3.4348881244659424, + "learning_rate": 6.618653563620736e-07, + "loss": 0.0313, + "num_input_tokens_seen": 35832416, + "step": 169790 + }, + { + "epoch": 18.67931793179318, + "grad_norm": 2.137383222579956, + "learning_rate": 6.613168636267569e-07, + "loss": 0.0651, + "num_input_tokens_seen": 35833504, + "step": 169795 + }, + { + "epoch": 18.67986798679868, + "grad_norm": 0.0030493203084915876, + "learning_rate": 6.607685952093989e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35834624, + "step": 169800 + }, + { + "epoch": 18.68041804180418, + "grad_norm": 0.08729430288076401, + "learning_rate": 6.602205511150594e-07, + "loss": 0.0041, + "num_input_tokens_seen": 35835744, + "step": 169805 + }, + { + "epoch": 18.680968096809682, + "grad_norm": 0.6845901012420654, + "learning_rate": 6.596727313487872e-07, + "loss": 0.0152, + "num_input_tokens_seen": 35836736, + "step": 169810 + }, + { + "epoch": 18.681518151815183, + "grad_norm": 0.04497929662466049, + "learning_rate": 6.591251359156308e-07, + "loss": 0.2011, + "num_input_tokens_seen": 35837824, + "step": 169815 + }, + { + "epoch": 18.68206820682068, + "grad_norm": 1.2280699014663696, + "learning_rate": 6.585777648206392e-07, + "loss": 0.0171, + "num_input_tokens_seen": 35838944, + "step": 169820 + }, + { + "epoch": 18.682618261826182, + "grad_norm": 0.07491226494312286, + "learning_rate": 6.580306180688556e-07, + "loss": 0.0072, + "num_input_tokens_seen": 35840032, + "step": 169825 + }, + { + "epoch": 18.683168316831683, + "grad_norm": 1.706157922744751, + "learning_rate": 6.574836956653257e-07, + "loss": 0.0917, + "num_input_tokens_seen": 35841088, + "step": 169830 + }, + { + "epoch": 18.683718371837184, + "grad_norm": 0.08857034891843796, + "learning_rate": 6.569369976150847e-07, + "loss": 0.0367, + "num_input_tokens_seen": 35842144, + "step": 169835 + }, + { + "epoch": 18.684268426842685, + "grad_norm": 0.008476568385958672, + "learning_rate": 6.563905239231755e-07, + "loss": 0.0408, + "num_input_tokens_seen": 35843200, + "step": 169840 + }, + { + "epoch": 18.684818481848186, + "grad_norm": 0.005050070583820343, + "learning_rate": 6.558442745946358e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35844224, + "step": 169845 + }, + { + "epoch": 18.685368536853684, + "grad_norm": 0.04812481254339218, + "learning_rate": 6.55298249634495e-07, + "loss": 0.0477, + "num_input_tokens_seen": 35845248, + "step": 169850 + }, + { + "epoch": 18.685918591859185, + "grad_norm": 0.032111797481775284, + "learning_rate": 6.547524490477907e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35846240, + "step": 169855 + }, + { + "epoch": 18.686468646864686, + "grad_norm": 0.016291780397295952, + "learning_rate": 6.542068728395523e-07, + "loss": 0.0658, + "num_input_tokens_seen": 35847232, + "step": 169860 + }, + { + "epoch": 18.687018701870187, + "grad_norm": 3.373009204864502, + "learning_rate": 6.536615210148033e-07, + "loss": 0.0988, + "num_input_tokens_seen": 35848288, + "step": 169865 + }, + { + "epoch": 18.687568756875688, + "grad_norm": 0.035124074667692184, + "learning_rate": 6.531163935785734e-07, + "loss": 0.0449, + "num_input_tokens_seen": 35849376, + "step": 169870 + }, + { + "epoch": 18.68811881188119, + "grad_norm": 0.08958996087312698, + "learning_rate": 6.525714905358888e-07, + "loss": 0.0683, + "num_input_tokens_seen": 35850368, + "step": 169875 + }, + { + "epoch": 18.68866886688669, + "grad_norm": 0.3090049624443054, + "learning_rate": 6.520268118917705e-07, + "loss": 0.027, + "num_input_tokens_seen": 35851392, + "step": 169880 + }, + { + "epoch": 18.689218921892188, + "grad_norm": 0.30707013607025146, + "learning_rate": 6.514823576512369e-07, + "loss": 0.0492, + "num_input_tokens_seen": 35852416, + "step": 169885 + }, + { + "epoch": 18.68976897689769, + "grad_norm": 0.0681241974234581, + "learning_rate": 6.509381278193061e-07, + "loss": 0.0496, + "num_input_tokens_seen": 35853504, + "step": 169890 + }, + { + "epoch": 18.69031903190319, + "grad_norm": 0.01725546456873417, + "learning_rate": 6.503941224009963e-07, + "loss": 0.0005, + "num_input_tokens_seen": 35854560, + "step": 169895 + }, + { + "epoch": 18.69086908690869, + "grad_norm": 0.021098393946886063, + "learning_rate": 6.498503414013174e-07, + "loss": 0.0342, + "num_input_tokens_seen": 35855616, + "step": 169900 + }, + { + "epoch": 18.691419141914192, + "grad_norm": 0.12224720418453217, + "learning_rate": 6.493067848252849e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35856640, + "step": 169905 + }, + { + "epoch": 18.691969196919693, + "grad_norm": 0.005862185265868902, + "learning_rate": 6.487634526779085e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35857664, + "step": 169910 + }, + { + "epoch": 18.69251925192519, + "grad_norm": 0.04448104649782181, + "learning_rate": 6.482203449641927e-07, + "loss": 0.001, + "num_input_tokens_seen": 35858656, + "step": 169915 + }, + { + "epoch": 18.693069306930692, + "grad_norm": 0.03040342777967453, + "learning_rate": 6.476774616891473e-07, + "loss": 0.003, + "num_input_tokens_seen": 35859744, + "step": 169920 + }, + { + "epoch": 18.693619361936193, + "grad_norm": 0.006384649313986301, + "learning_rate": 6.471348028577712e-07, + "loss": 0.0014, + "num_input_tokens_seen": 35860736, + "step": 169925 + }, + { + "epoch": 18.694169416941694, + "grad_norm": 0.07601626217365265, + "learning_rate": 6.465923684750658e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35861792, + "step": 169930 + }, + { + "epoch": 18.694719471947195, + "grad_norm": 0.12399717420339584, + "learning_rate": 6.460501585460382e-07, + "loss": 0.0061, + "num_input_tokens_seen": 35862912, + "step": 169935 + }, + { + "epoch": 18.695269526952696, + "grad_norm": 0.004207727964967489, + "learning_rate": 6.455081730756763e-07, + "loss": 0.0713, + "num_input_tokens_seen": 35864000, + "step": 169940 + }, + { + "epoch": 18.695819581958197, + "grad_norm": 0.041717279702425, + "learning_rate": 6.449664120689813e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35865088, + "step": 169945 + }, + { + "epoch": 18.696369636963695, + "grad_norm": 0.09086371213197708, + "learning_rate": 6.444248755309413e-07, + "loss": 0.0641, + "num_input_tokens_seen": 35866112, + "step": 169950 + }, + { + "epoch": 18.696919691969196, + "grad_norm": 0.015155826695263386, + "learning_rate": 6.438835634665491e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35867136, + "step": 169955 + }, + { + "epoch": 18.697469746974697, + "grad_norm": 0.0902685597538948, + "learning_rate": 6.433424758808009e-07, + "loss": 0.0612, + "num_input_tokens_seen": 35868160, + "step": 169960 + }, + { + "epoch": 18.698019801980198, + "grad_norm": 0.07105652987957001, + "learning_rate": 6.428016127786735e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35869152, + "step": 169965 + }, + { + "epoch": 18.6985698569857, + "grad_norm": 0.5061234831809998, + "learning_rate": 6.42260974165157e-07, + "loss": 0.0082, + "num_input_tokens_seen": 35870176, + "step": 169970 + }, + { + "epoch": 18.6991199119912, + "grad_norm": 0.015271464362740517, + "learning_rate": 6.417205600452336e-07, + "loss": 0.1057, + "num_input_tokens_seen": 35871328, + "step": 169975 + }, + { + "epoch": 18.6996699669967, + "grad_norm": 0.01790020987391472, + "learning_rate": 6.411803704238828e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35872288, + "step": 169980 + }, + { + "epoch": 18.7002200220022, + "grad_norm": 0.024604173377156258, + "learning_rate": 6.406404053060838e-07, + "loss": 0.0097, + "num_input_tokens_seen": 35873376, + "step": 169985 + }, + { + "epoch": 18.7007700770077, + "grad_norm": 0.006987424101680517, + "learning_rate": 6.401006646968133e-07, + "loss": 0.0029, + "num_input_tokens_seen": 35874528, + "step": 169990 + }, + { + "epoch": 18.7013201320132, + "grad_norm": 1.8165496587753296, + "learning_rate": 6.395611486010478e-07, + "loss": 0.0364, + "num_input_tokens_seen": 35875552, + "step": 169995 + }, + { + "epoch": 18.701870187018702, + "grad_norm": 0.015507355332374573, + "learning_rate": 6.390218570237527e-07, + "loss": 0.0429, + "num_input_tokens_seen": 35876640, + "step": 170000 + }, + { + "epoch": 18.702420242024203, + "grad_norm": 0.03908082842826843, + "learning_rate": 6.384827899699103e-07, + "loss": 0.2051, + "num_input_tokens_seen": 35877696, + "step": 170005 + }, + { + "epoch": 18.702970297029704, + "grad_norm": 0.05501454696059227, + "learning_rate": 6.379439474444776e-07, + "loss": 0.0465, + "num_input_tokens_seen": 35878720, + "step": 170010 + }, + { + "epoch": 18.703520352035202, + "grad_norm": 0.04475565254688263, + "learning_rate": 6.374053294524257e-07, + "loss": 0.0159, + "num_input_tokens_seen": 35879808, + "step": 170015 + }, + { + "epoch": 18.704070407040703, + "grad_norm": 0.028778890147805214, + "learning_rate": 6.368669359987228e-07, + "loss": 0.0018, + "num_input_tokens_seen": 35880768, + "step": 170020 + }, + { + "epoch": 18.704620462046204, + "grad_norm": 0.5541447401046753, + "learning_rate": 6.363287670883233e-07, + "loss": 0.0087, + "num_input_tokens_seen": 35881824, + "step": 170025 + }, + { + "epoch": 18.705170517051705, + "grad_norm": 0.007600408047437668, + "learning_rate": 6.357908227261928e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35882848, + "step": 170030 + }, + { + "epoch": 18.705720572057206, + "grad_norm": 0.07427014410495758, + "learning_rate": 6.352531029172881e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35883936, + "step": 170035 + }, + { + "epoch": 18.706270627062707, + "grad_norm": 0.11945784091949463, + "learning_rate": 6.347156076665584e-07, + "loss": 0.0074, + "num_input_tokens_seen": 35885056, + "step": 170040 + }, + { + "epoch": 18.706820682068205, + "grad_norm": 0.04625670984387398, + "learning_rate": 6.341783369789661e-07, + "loss": 0.0489, + "num_input_tokens_seen": 35886112, + "step": 170045 + }, + { + "epoch": 18.707370737073706, + "grad_norm": 0.5559850931167603, + "learning_rate": 6.336412908594602e-07, + "loss": 0.1343, + "num_input_tokens_seen": 35887168, + "step": 170050 + }, + { + "epoch": 18.707920792079207, + "grad_norm": 0.07295933365821838, + "learning_rate": 6.331044693129923e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35888224, + "step": 170055 + }, + { + "epoch": 18.70847084708471, + "grad_norm": 0.03515806421637535, + "learning_rate": 6.325678723445083e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35889248, + "step": 170060 + }, + { + "epoch": 18.70902090209021, + "grad_norm": 0.024236056953668594, + "learning_rate": 6.320314999589516e-07, + "loss": 0.1264, + "num_input_tokens_seen": 35890304, + "step": 170065 + }, + { + "epoch": 18.70957095709571, + "grad_norm": 0.015629567205905914, + "learning_rate": 6.314953521612682e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35891360, + "step": 170070 + }, + { + "epoch": 18.71012101210121, + "grad_norm": 0.020609457045793533, + "learning_rate": 6.309594289564014e-07, + "loss": 0.0053, + "num_input_tokens_seen": 35892480, + "step": 170075 + }, + { + "epoch": 18.71067106710671, + "grad_norm": 0.04274972528219223, + "learning_rate": 6.304237303492833e-07, + "loss": 0.0034, + "num_input_tokens_seen": 35893600, + "step": 170080 + }, + { + "epoch": 18.71122112211221, + "grad_norm": 0.021671464666724205, + "learning_rate": 6.298882563448599e-07, + "loss": 0.1199, + "num_input_tokens_seen": 35894656, + "step": 170085 + }, + { + "epoch": 18.71177117711771, + "grad_norm": 0.040332358330488205, + "learning_rate": 6.293530069480607e-07, + "loss": 0.0193, + "num_input_tokens_seen": 35895744, + "step": 170090 + }, + { + "epoch": 18.712321232123212, + "grad_norm": 0.22910580039024353, + "learning_rate": 6.288179821638235e-07, + "loss": 0.0077, + "num_input_tokens_seen": 35896864, + "step": 170095 + }, + { + "epoch": 18.712871287128714, + "grad_norm": 0.0613800548017025, + "learning_rate": 6.282831819970747e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35897984, + "step": 170100 + }, + { + "epoch": 18.713421342134215, + "grad_norm": 2.572958469390869, + "learning_rate": 6.27748606452741e-07, + "loss": 0.0409, + "num_input_tokens_seen": 35899040, + "step": 170105 + }, + { + "epoch": 18.713971397139716, + "grad_norm": 0.008910288102924824, + "learning_rate": 6.272142555357601e-07, + "loss": 0.0618, + "num_input_tokens_seen": 35900064, + "step": 170110 + }, + { + "epoch": 18.714521452145213, + "grad_norm": 0.13553640246391296, + "learning_rate": 6.266801292510449e-07, + "loss": 0.0411, + "num_input_tokens_seen": 35901152, + "step": 170115 + }, + { + "epoch": 18.715071507150714, + "grad_norm": 0.3644474446773529, + "learning_rate": 6.261462276035274e-07, + "loss": 0.1199, + "num_input_tokens_seen": 35902208, + "step": 170120 + }, + { + "epoch": 18.715621562156215, + "grad_norm": 0.014186195097863674, + "learning_rate": 6.256125505981258e-07, + "loss": 0.0178, + "num_input_tokens_seen": 35903264, + "step": 170125 + }, + { + "epoch": 18.716171617161717, + "grad_norm": 0.02875945158302784, + "learning_rate": 6.250790982397503e-07, + "loss": 0.0019, + "num_input_tokens_seen": 35904320, + "step": 170130 + }, + { + "epoch": 18.716721672167218, + "grad_norm": 0.01672171801328659, + "learning_rate": 6.245458705333301e-07, + "loss": 0.1327, + "num_input_tokens_seen": 35905408, + "step": 170135 + }, + { + "epoch": 18.71727172717272, + "grad_norm": 0.029500199481844902, + "learning_rate": 6.240128674837726e-07, + "loss": 0.0049, + "num_input_tokens_seen": 35906464, + "step": 170140 + }, + { + "epoch": 18.717821782178216, + "grad_norm": 2.486039161682129, + "learning_rate": 6.23480089095993e-07, + "loss": 0.0363, + "num_input_tokens_seen": 35907552, + "step": 170145 + }, + { + "epoch": 18.718371837183717, + "grad_norm": 0.0177728570997715, + "learning_rate": 6.229475353749015e-07, + "loss": 0.0243, + "num_input_tokens_seen": 35908672, + "step": 170150 + }, + { + "epoch": 18.71892189218922, + "grad_norm": 0.013742439448833466, + "learning_rate": 6.224152063253996e-07, + "loss": 0.0031, + "num_input_tokens_seen": 35909760, + "step": 170155 + }, + { + "epoch": 18.71947194719472, + "grad_norm": 2.91601300239563, + "learning_rate": 6.218831019524029e-07, + "loss": 0.0721, + "num_input_tokens_seen": 35910848, + "step": 170160 + }, + { + "epoch": 18.72002200220022, + "grad_norm": 0.04680061340332031, + "learning_rate": 6.213512222608103e-07, + "loss": 0.005, + "num_input_tokens_seen": 35911872, + "step": 170165 + }, + { + "epoch": 18.72057205720572, + "grad_norm": 0.016507649794220924, + "learning_rate": 6.20819567255529e-07, + "loss": 0.0032, + "num_input_tokens_seen": 35912896, + "step": 170170 + }, + { + "epoch": 18.721122112211223, + "grad_norm": 0.02492053434252739, + "learning_rate": 6.202881369414549e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35913984, + "step": 170175 + }, + { + "epoch": 18.72167216721672, + "grad_norm": 0.3440885543823242, + "learning_rate": 6.197569313234841e-07, + "loss": 0.0072, + "num_input_tokens_seen": 35915072, + "step": 170180 + }, + { + "epoch": 18.72222222222222, + "grad_norm": 0.156476691365242, + "learning_rate": 6.192259504065156e-07, + "loss": 0.0027, + "num_input_tokens_seen": 35916032, + "step": 170185 + }, + { + "epoch": 18.722772277227723, + "grad_norm": 0.004834736697375774, + "learning_rate": 6.1869519419544e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35917056, + "step": 170190 + }, + { + "epoch": 18.723322332233224, + "grad_norm": 1.5589677095413208, + "learning_rate": 6.181646626951559e-07, + "loss": 0.01, + "num_input_tokens_seen": 35918112, + "step": 170195 + }, + { + "epoch": 18.723872387238725, + "grad_norm": 2.9328267574310303, + "learning_rate": 6.176343559105485e-07, + "loss": 0.1027, + "num_input_tokens_seen": 35919136, + "step": 170200 + }, + { + "epoch": 18.724422442244226, + "grad_norm": 0.00523660983890295, + "learning_rate": 6.171042738465027e-07, + "loss": 0.0107, + "num_input_tokens_seen": 35920192, + "step": 170205 + }, + { + "epoch": 18.724972497249723, + "grad_norm": 1.8718894720077515, + "learning_rate": 6.16574416507909e-07, + "loss": 0.1005, + "num_input_tokens_seen": 35921280, + "step": 170210 + }, + { + "epoch": 18.725522552255224, + "grad_norm": 0.008016962558031082, + "learning_rate": 6.16044783899647e-07, + "loss": 0.0519, + "num_input_tokens_seen": 35922400, + "step": 170215 + }, + { + "epoch": 18.726072607260726, + "grad_norm": 0.13460634648799896, + "learning_rate": 6.155153760265986e-07, + "loss": 0.0032, + "num_input_tokens_seen": 35923456, + "step": 170220 + }, + { + "epoch": 18.726622662266227, + "grad_norm": 0.08214633911848068, + "learning_rate": 6.149861928936463e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35924544, + "step": 170225 + }, + { + "epoch": 18.727172717271728, + "grad_norm": 0.13810347020626068, + "learning_rate": 6.144572345056637e-07, + "loss": 0.0022, + "num_input_tokens_seen": 35925600, + "step": 170230 + }, + { + "epoch": 18.72772277227723, + "grad_norm": 3.692302703857422, + "learning_rate": 6.139285008675305e-07, + "loss": 0.0435, + "num_input_tokens_seen": 35926656, + "step": 170235 + }, + { + "epoch": 18.72827282728273, + "grad_norm": 0.005338442977517843, + "learning_rate": 6.133999919841121e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35927712, + "step": 170240 + }, + { + "epoch": 18.728822882288227, + "grad_norm": 0.013863877393305302, + "learning_rate": 6.12871707860288e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35928736, + "step": 170245 + }, + { + "epoch": 18.72937293729373, + "grad_norm": 0.03011421300470829, + "learning_rate": 6.12343648500921e-07, + "loss": 0.0079, + "num_input_tokens_seen": 35929760, + "step": 170250 + }, + { + "epoch": 18.72992299229923, + "grad_norm": 0.10165253281593323, + "learning_rate": 6.118158139108821e-07, + "loss": 0.0414, + "num_input_tokens_seen": 35930912, + "step": 170255 + }, + { + "epoch": 18.73047304730473, + "grad_norm": 0.024217799305915833, + "learning_rate": 6.112882040950341e-07, + "loss": 0.0084, + "num_input_tokens_seen": 35931936, + "step": 170260 + }, + { + "epoch": 18.731023102310232, + "grad_norm": 0.041163552552461624, + "learning_rate": 6.107608190582398e-07, + "loss": 0.0011, + "num_input_tokens_seen": 35933024, + "step": 170265 + }, + { + "epoch": 18.731573157315733, + "grad_norm": 0.05618252605199814, + "learning_rate": 6.102336588053593e-07, + "loss": 0.0021, + "num_input_tokens_seen": 35934048, + "step": 170270 + }, + { + "epoch": 18.73212321232123, + "grad_norm": 2.1795294284820557, + "learning_rate": 6.097067233412496e-07, + "loss": 0.0156, + "num_input_tokens_seen": 35935104, + "step": 170275 + }, + { + "epoch": 18.73267326732673, + "grad_norm": 0.058115582913160324, + "learning_rate": 6.091800126707708e-07, + "loss": 0.0713, + "num_input_tokens_seen": 35936160, + "step": 170280 + }, + { + "epoch": 18.733223322332233, + "grad_norm": 0.2197757363319397, + "learning_rate": 6.086535267987776e-07, + "loss": 0.0417, + "num_input_tokens_seen": 35937248, + "step": 170285 + }, + { + "epoch": 18.733773377337734, + "grad_norm": 0.09918218851089478, + "learning_rate": 6.081272657301213e-07, + "loss": 0.0028, + "num_input_tokens_seen": 35938304, + "step": 170290 + }, + { + "epoch": 18.734323432343235, + "grad_norm": 0.048772454261779785, + "learning_rate": 6.076012294696509e-07, + "loss": 0.0713, + "num_input_tokens_seen": 35939296, + "step": 170295 + }, + { + "epoch": 18.734873487348736, + "grad_norm": 0.34153902530670166, + "learning_rate": 6.070754180222154e-07, + "loss": 0.0098, + "num_input_tokens_seen": 35940384, + "step": 170300 + }, + { + "epoch": 18.735423542354237, + "grad_norm": 0.24392114579677582, + "learning_rate": 6.065498313926582e-07, + "loss": 0.0026, + "num_input_tokens_seen": 35941408, + "step": 170305 + }, + { + "epoch": 18.735973597359735, + "grad_norm": 0.022268325090408325, + "learning_rate": 6.06024469585828e-07, + "loss": 0.0061, + "num_input_tokens_seen": 35942496, + "step": 170310 + }, + { + "epoch": 18.736523652365236, + "grad_norm": 0.22910235822200775, + "learning_rate": 6.054993326065656e-07, + "loss": 0.0066, + "num_input_tokens_seen": 35943584, + "step": 170315 + }, + { + "epoch": 18.737073707370737, + "grad_norm": 0.0426022931933403, + "learning_rate": 6.049744204597085e-07, + "loss": 0.046, + "num_input_tokens_seen": 35944576, + "step": 170320 + }, + { + "epoch": 18.737623762376238, + "grad_norm": 0.29462704062461853, + "learning_rate": 6.044497331501004e-07, + "loss": 0.0024, + "num_input_tokens_seen": 35945696, + "step": 170325 + }, + { + "epoch": 18.73817381738174, + "grad_norm": 0.006346406880766153, + "learning_rate": 6.039252706825677e-07, + "loss": 0.043, + "num_input_tokens_seen": 35946752, + "step": 170330 + }, + { + "epoch": 18.73872387238724, + "grad_norm": 0.9429675340652466, + "learning_rate": 6.034010330619511e-07, + "loss": 0.0048, + "num_input_tokens_seen": 35947840, + "step": 170335 + }, + { + "epoch": 18.739273927392738, + "grad_norm": 0.6404916048049927, + "learning_rate": 6.028770202930828e-07, + "loss": 0.0768, + "num_input_tokens_seen": 35948896, + "step": 170340 + }, + { + "epoch": 18.73982398239824, + "grad_norm": 0.00923945289105177, + "learning_rate": 6.023532323807867e-07, + "loss": 0.0033, + "num_input_tokens_seen": 35949920, + "step": 170345 + }, + { + "epoch": 18.74037403740374, + "grad_norm": 0.025759678333997726, + "learning_rate": 6.01829669329898e-07, + "loss": 0.002, + "num_input_tokens_seen": 35950912, + "step": 170350 + }, + { + "epoch": 18.74092409240924, + "grad_norm": 0.16534239053726196, + "learning_rate": 6.013063311452321e-07, + "loss": 0.0047, + "num_input_tokens_seen": 35952032, + "step": 170355 + }, + { + "epoch": 18.741474147414742, + "grad_norm": 0.03717787191271782, + "learning_rate": 6.007832178316242e-07, + "loss": 0.0081, + "num_input_tokens_seen": 35953056, + "step": 170360 + }, + { + "epoch": 18.742024202420243, + "grad_norm": 2.114443778991699, + "learning_rate": 6.002603293938841e-07, + "loss": 0.0471, + "num_input_tokens_seen": 35954048, + "step": 170365 + }, + { + "epoch": 18.742574257425744, + "grad_norm": 0.010380770079791546, + "learning_rate": 5.997376658368387e-07, + "loss": 0.0025, + "num_input_tokens_seen": 35955136, + "step": 170370 + }, + { + "epoch": 18.74312431243124, + "grad_norm": 0.07102013379335403, + "learning_rate": 5.992152271653062e-07, + "loss": 0.002, + "num_input_tokens_seen": 35956160, + "step": 170375 + }, + { + "epoch": 18.743674367436743, + "grad_norm": 0.012641890905797482, + "learning_rate": 5.98693013384094e-07, + "loss": 0.0366, + "num_input_tokens_seen": 35957152, + "step": 170380 + }, + { + "epoch": 18.744224422442244, + "grad_norm": 0.1050446480512619, + "learning_rate": 5.981710244980204e-07, + "loss": 0.0058, + "num_input_tokens_seen": 35958240, + "step": 170385 + }, + { + "epoch": 18.744774477447745, + "grad_norm": 0.041417766362428665, + "learning_rate": 5.976492605118955e-07, + "loss": 0.0059, + "num_input_tokens_seen": 35959296, + "step": 170390 + }, + { + "epoch": 18.745324532453246, + "grad_norm": 0.05058848485350609, + "learning_rate": 5.971277214305266e-07, + "loss": 0.062, + "num_input_tokens_seen": 35960352, + "step": 170395 + }, + { + "epoch": 18.745874587458747, + "grad_norm": 0.037520308047533035, + "learning_rate": 5.966064072587235e-07, + "loss": 0.0475, + "num_input_tokens_seen": 35961408, + "step": 170400 + }, + { + "epoch": 18.746424642464248, + "grad_norm": 0.025301450863480568, + "learning_rate": 5.960853180012882e-07, + "loss": 0.0171, + "num_input_tokens_seen": 35962400, + "step": 170405 + }, + { + "epoch": 18.746974697469746, + "grad_norm": 0.008766877464950085, + "learning_rate": 5.955644536630278e-07, + "loss": 0.0065, + "num_input_tokens_seen": 35963424, + "step": 170410 + }, + { + "epoch": 18.747524752475247, + "grad_norm": 0.07063870877027512, + "learning_rate": 5.950438142487386e-07, + "loss": 0.067, + "num_input_tokens_seen": 35964448, + "step": 170415 + }, + { + "epoch": 18.748074807480748, + "grad_norm": 0.9764643311500549, + "learning_rate": 5.945233997632165e-07, + "loss": 0.0096, + "num_input_tokens_seen": 35965504, + "step": 170420 + }, + { + "epoch": 18.74862486248625, + "grad_norm": 0.005849209148436785, + "learning_rate": 5.940032102112636e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35966496, + "step": 170425 + }, + { + "epoch": 18.74917491749175, + "grad_norm": 0.016316451132297516, + "learning_rate": 5.934832455976702e-07, + "loss": 0.0204, + "num_input_tokens_seen": 35967488, + "step": 170430 + }, + { + "epoch": 18.74972497249725, + "grad_norm": 0.05045202374458313, + "learning_rate": 5.929635059272326e-07, + "loss": 0.001, + "num_input_tokens_seen": 35968576, + "step": 170435 + }, + { + "epoch": 18.75027502750275, + "grad_norm": 0.008057446219027042, + "learning_rate": 5.924439912047386e-07, + "loss": 0.0803, + "num_input_tokens_seen": 35969568, + "step": 170440 + }, + { + "epoch": 18.75082508250825, + "grad_norm": 0.08253800868988037, + "learning_rate": 5.919247014349733e-07, + "loss": 0.0612, + "num_input_tokens_seen": 35970656, + "step": 170445 + }, + { + "epoch": 18.75137513751375, + "grad_norm": 0.1383661925792694, + "learning_rate": 5.914056366227272e-07, + "loss": 0.0151, + "num_input_tokens_seen": 35971712, + "step": 170450 + }, + { + "epoch": 18.751925192519252, + "grad_norm": 0.34254351258277893, + "learning_rate": 5.908867967727854e-07, + "loss": 0.0023, + "num_input_tokens_seen": 35972832, + "step": 170455 + }, + { + "epoch": 18.752475247524753, + "grad_norm": 0.3015294373035431, + "learning_rate": 5.903681818899249e-07, + "loss": 0.0641, + "num_input_tokens_seen": 35973856, + "step": 170460 + }, + { + "epoch": 18.753025302530254, + "grad_norm": 0.1609269678592682, + "learning_rate": 5.898497919789303e-07, + "loss": 0.004, + "num_input_tokens_seen": 35974880, + "step": 170465 + }, + { + "epoch": 18.753575357535752, + "grad_norm": 0.05141926929354668, + "learning_rate": 5.893316270445759e-07, + "loss": 0.0805, + "num_input_tokens_seen": 35976032, + "step": 170470 + }, + { + "epoch": 18.754125412541253, + "grad_norm": 0.046873971819877625, + "learning_rate": 5.888136870916383e-07, + "loss": 0.0009, + "num_input_tokens_seen": 35977120, + "step": 170475 + }, + { + "epoch": 18.754675467546754, + "grad_norm": 0.05475952848792076, + "learning_rate": 5.882959721248916e-07, + "loss": 0.003, + "num_input_tokens_seen": 35978176, + "step": 170480 + }, + { + "epoch": 18.755225522552255, + "grad_norm": 0.05044001713395119, + "learning_rate": 5.877784821491067e-07, + "loss": 0.0037, + "num_input_tokens_seen": 35979264, + "step": 170485 + }, + { + "epoch": 18.755775577557756, + "grad_norm": 0.0015222432557493448, + "learning_rate": 5.872612171690522e-07, + "loss": 0.0006, + "num_input_tokens_seen": 35980352, + "step": 170490 + }, + { + "epoch": 18.756325632563257, + "grad_norm": 0.009959167800843716, + "learning_rate": 5.867441771894994e-07, + "loss": 0.0005, + "num_input_tokens_seen": 35981440, + "step": 170495 + }, + { + "epoch": 18.75687568756876, + "grad_norm": 0.015154585242271423, + "learning_rate": 5.862273622152109e-07, + "loss": 0.0385, + "num_input_tokens_seen": 35982464, + "step": 170500 + }, + { + "epoch": 18.757425742574256, + "grad_norm": 6.067567348480225, + "learning_rate": 5.857107722509469e-07, + "loss": 0.1049, + "num_input_tokens_seen": 35983456, + "step": 170505 + }, + { + "epoch": 18.757975797579757, + "grad_norm": 0.007735688239336014, + "learning_rate": 5.851944073014703e-07, + "loss": 0.0817, + "num_input_tokens_seen": 35984448, + "step": 170510 + }, + { + "epoch": 18.758525852585258, + "grad_norm": 0.011089268140494823, + "learning_rate": 5.846782673715467e-07, + "loss": 0.0357, + "num_input_tokens_seen": 35985536, + "step": 170515 + }, + { + "epoch": 18.75907590759076, + "grad_norm": 0.003813265124335885, + "learning_rate": 5.84162352465925e-07, + "loss": 0.0058, + "num_input_tokens_seen": 35986624, + "step": 170520 + }, + { + "epoch": 18.75962596259626, + "grad_norm": 0.12391599267721176, + "learning_rate": 5.836466625893655e-07, + "loss": 0.0038, + "num_input_tokens_seen": 35987712, + "step": 170525 + }, + { + "epoch": 18.76017601760176, + "grad_norm": 4.484974384307861, + "learning_rate": 5.831311977466169e-07, + "loss": 0.1355, + "num_input_tokens_seen": 35988800, + "step": 170530 + }, + { + "epoch": 18.760726072607262, + "grad_norm": 0.007961438968777657, + "learning_rate": 5.826159579424312e-07, + "loss": 0.0061, + "num_input_tokens_seen": 35989824, + "step": 170535 + }, + { + "epoch": 18.76127612761276, + "grad_norm": 0.07362060248851776, + "learning_rate": 5.821009431815572e-07, + "loss": 0.0396, + "num_input_tokens_seen": 35990944, + "step": 170540 + }, + { + "epoch": 18.76182618261826, + "grad_norm": 0.01222789566963911, + "learning_rate": 5.815861534687411e-07, + "loss": 0.0015, + "num_input_tokens_seen": 35991936, + "step": 170545 + }, + { + "epoch": 18.762376237623762, + "grad_norm": 0.06710632145404816, + "learning_rate": 5.81071588808732e-07, + "loss": 0.0126, + "num_input_tokens_seen": 35992960, + "step": 170550 + }, + { + "epoch": 18.762926292629263, + "grad_norm": 0.9719622731208801, + "learning_rate": 5.805572492062678e-07, + "loss": 0.0134, + "num_input_tokens_seen": 35994016, + "step": 170555 + }, + { + "epoch": 18.763476347634764, + "grad_norm": 0.09044191241264343, + "learning_rate": 5.800431346660862e-07, + "loss": 0.0017, + "num_input_tokens_seen": 35995072, + "step": 170560 + }, + { + "epoch": 18.764026402640265, + "grad_norm": 0.040219079703092575, + "learning_rate": 5.795292451929307e-07, + "loss": 0.0013, + "num_input_tokens_seen": 35996160, + "step": 170565 + }, + { + "epoch": 18.764576457645763, + "grad_norm": 0.027617398649454117, + "learning_rate": 5.790155807915365e-07, + "loss": 0.0818, + "num_input_tokens_seen": 35997184, + "step": 170570 + }, + { + "epoch": 18.765126512651264, + "grad_norm": 0.02317154034972191, + "learning_rate": 5.785021414666413e-07, + "loss": 0.0016, + "num_input_tokens_seen": 35998240, + "step": 170575 + }, + { + "epoch": 18.765676567656765, + "grad_norm": 0.023897038772702217, + "learning_rate": 5.779889272229693e-07, + "loss": 0.0535, + "num_input_tokens_seen": 35999264, + "step": 170580 + }, + { + "epoch": 18.766226622662266, + "grad_norm": 0.051905419677495956, + "learning_rate": 5.774759380652556e-07, + "loss": 0.0035, + "num_input_tokens_seen": 36000320, + "step": 170585 + }, + { + "epoch": 18.766776677667767, + "grad_norm": 0.021799420937895775, + "learning_rate": 5.769631739982267e-07, + "loss": 0.0436, + "num_input_tokens_seen": 36001440, + "step": 170590 + }, + { + "epoch": 18.76732673267327, + "grad_norm": 0.0046309796161949635, + "learning_rate": 5.764506350266096e-07, + "loss": 0.1246, + "num_input_tokens_seen": 36002464, + "step": 170595 + }, + { + "epoch": 18.76787678767877, + "grad_norm": 0.09448213875293732, + "learning_rate": 5.759383211551256e-07, + "loss": 0.0124, + "num_input_tokens_seen": 36003520, + "step": 170600 + }, + { + "epoch": 18.768426842684267, + "grad_norm": 0.11242341995239258, + "learning_rate": 5.754262323885012e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36004608, + "step": 170605 + }, + { + "epoch": 18.768976897689768, + "grad_norm": 0.03666531294584274, + "learning_rate": 5.749143687314495e-07, + "loss": 0.0114, + "num_input_tokens_seen": 36005696, + "step": 170610 + }, + { + "epoch": 18.76952695269527, + "grad_norm": 0.01362481527030468, + "learning_rate": 5.744027301886945e-07, + "loss": 0.0643, + "num_input_tokens_seen": 36006656, + "step": 170615 + }, + { + "epoch": 18.77007700770077, + "grad_norm": 0.017803089693188667, + "learning_rate": 5.738913167649462e-07, + "loss": 0.0261, + "num_input_tokens_seen": 36007680, + "step": 170620 + }, + { + "epoch": 18.77062706270627, + "grad_norm": 0.03743284195661545, + "learning_rate": 5.733801284649232e-07, + "loss": 0.0042, + "num_input_tokens_seen": 36008736, + "step": 170625 + }, + { + "epoch": 18.771177117711773, + "grad_norm": 0.04161621630191803, + "learning_rate": 5.728691652933355e-07, + "loss": 0.0242, + "num_input_tokens_seen": 36009824, + "step": 170630 + }, + { + "epoch": 18.77172717271727, + "grad_norm": 0.079261913895607, + "learning_rate": 5.723584272548877e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36010880, + "step": 170635 + }, + { + "epoch": 18.77227722772277, + "grad_norm": 0.03170100972056389, + "learning_rate": 5.718479143542954e-07, + "loss": 0.0045, + "num_input_tokens_seen": 36011968, + "step": 170640 + }, + { + "epoch": 18.772827282728272, + "grad_norm": 0.07474880665540695, + "learning_rate": 5.713376265962578e-07, + "loss": 0.0067, + "num_input_tokens_seen": 36012992, + "step": 170645 + }, + { + "epoch": 18.773377337733773, + "grad_norm": 0.022360417991876602, + "learning_rate": 5.708275639854765e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36014080, + "step": 170650 + }, + { + "epoch": 18.773927392739274, + "grad_norm": 0.064787857234478, + "learning_rate": 5.703177265266563e-07, + "loss": 0.0035, + "num_input_tokens_seen": 36015104, + "step": 170655 + }, + { + "epoch": 18.774477447744776, + "grad_norm": 0.005952088627964258, + "learning_rate": 5.69808114224496e-07, + "loss": 0.0211, + "num_input_tokens_seen": 36016224, + "step": 170660 + }, + { + "epoch": 18.775027502750277, + "grad_norm": 0.0038233923260122538, + "learning_rate": 5.692987270836919e-07, + "loss": 0.001, + "num_input_tokens_seen": 36017216, + "step": 170665 + }, + { + "epoch": 18.775577557755774, + "grad_norm": 0.009395276196300983, + "learning_rate": 5.687895651089403e-07, + "loss": 0.078, + "num_input_tokens_seen": 36018272, + "step": 170670 + }, + { + "epoch": 18.776127612761275, + "grad_norm": 0.10202774405479431, + "learning_rate": 5.682806283049292e-07, + "loss": 0.0901, + "num_input_tokens_seen": 36019392, + "step": 170675 + }, + { + "epoch": 18.776677667766776, + "grad_norm": 2.059675693511963, + "learning_rate": 5.677719166763518e-07, + "loss": 0.0099, + "num_input_tokens_seen": 36020448, + "step": 170680 + }, + { + "epoch": 18.777227722772277, + "grad_norm": 0.032927218824625015, + "learning_rate": 5.672634302278962e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36021504, + "step": 170685 + }, + { + "epoch": 18.77777777777778, + "grad_norm": 2.972606897354126, + "learning_rate": 5.667551689642531e-07, + "loss": 0.048, + "num_input_tokens_seen": 36022592, + "step": 170690 + }, + { + "epoch": 18.77832783278328, + "grad_norm": 0.15662164986133575, + "learning_rate": 5.66247132890102e-07, + "loss": 0.0865, + "num_input_tokens_seen": 36023680, + "step": 170695 + }, + { + "epoch": 18.778877887788777, + "grad_norm": 0.23784323036670685, + "learning_rate": 5.657393220101254e-07, + "loss": 0.0036, + "num_input_tokens_seen": 36024704, + "step": 170700 + }, + { + "epoch": 18.77942794279428, + "grad_norm": 0.03272927552461624, + "learning_rate": 5.652317363290055e-07, + "loss": 0.0125, + "num_input_tokens_seen": 36025792, + "step": 170705 + }, + { + "epoch": 18.77997799779978, + "grad_norm": 0.32747533917427063, + "learning_rate": 5.647243758514193e-07, + "loss": 0.0043, + "num_input_tokens_seen": 36026848, + "step": 170710 + }, + { + "epoch": 18.78052805280528, + "grad_norm": 0.030500127002596855, + "learning_rate": 5.642172405820433e-07, + "loss": 0.0533, + "num_input_tokens_seen": 36027936, + "step": 170715 + }, + { + "epoch": 18.78107810781078, + "grad_norm": 0.04594185948371887, + "learning_rate": 5.63710330525552e-07, + "loss": 0.0047, + "num_input_tokens_seen": 36029056, + "step": 170720 + }, + { + "epoch": 18.781628162816283, + "grad_norm": 1.9945812225341797, + "learning_rate": 5.632036456866163e-07, + "loss": 0.0339, + "num_input_tokens_seen": 36030048, + "step": 170725 + }, + { + "epoch": 18.782178217821784, + "grad_norm": 0.011215309612452984, + "learning_rate": 5.626971860699076e-07, + "loss": 0.0867, + "num_input_tokens_seen": 36031104, + "step": 170730 + }, + { + "epoch": 18.78272827282728, + "grad_norm": 0.054361533373594284, + "learning_rate": 5.621909516800916e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36032128, + "step": 170735 + }, + { + "epoch": 18.783278327832782, + "grad_norm": 0.1867048293352127, + "learning_rate": 5.616849425218367e-07, + "loss": 0.0027, + "num_input_tokens_seen": 36033184, + "step": 170740 + }, + { + "epoch": 18.783828382838283, + "grad_norm": 0.26545634865760803, + "learning_rate": 5.611791585998061e-07, + "loss": 0.0025, + "num_input_tokens_seen": 36034144, + "step": 170745 + }, + { + "epoch": 18.784378437843785, + "grad_norm": 0.09219544380903244, + "learning_rate": 5.606735999186568e-07, + "loss": 0.0027, + "num_input_tokens_seen": 36035200, + "step": 170750 + }, + { + "epoch": 18.784928492849286, + "grad_norm": 0.006631086580455303, + "learning_rate": 5.601682664830549e-07, + "loss": 0.1001, + "num_input_tokens_seen": 36036256, + "step": 170755 + }, + { + "epoch": 18.785478547854787, + "grad_norm": 0.21415366232395172, + "learning_rate": 5.596631582976519e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36037344, + "step": 170760 + }, + { + "epoch": 18.786028602860284, + "grad_norm": 0.03471570461988449, + "learning_rate": 5.591582753671082e-07, + "loss": 0.003, + "num_input_tokens_seen": 36038400, + "step": 170765 + }, + { + "epoch": 18.786578657865785, + "grad_norm": 0.06918325275182724, + "learning_rate": 5.586536176960728e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36039456, + "step": 170770 + }, + { + "epoch": 18.787128712871286, + "grad_norm": 0.01819656975567341, + "learning_rate": 5.581491852892001e-07, + "loss": 0.0969, + "num_input_tokens_seen": 36040544, + "step": 170775 + }, + { + "epoch": 18.787678767876788, + "grad_norm": 0.08354665338993073, + "learning_rate": 5.576449781511394e-07, + "loss": 0.0043, + "num_input_tokens_seen": 36041568, + "step": 170780 + }, + { + "epoch": 18.78822882288229, + "grad_norm": 0.039062805473804474, + "learning_rate": 5.571409962865343e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36042592, + "step": 170785 + }, + { + "epoch": 18.78877887788779, + "grad_norm": 0.005411603953689337, + "learning_rate": 5.566372397000335e-07, + "loss": 0.0343, + "num_input_tokens_seen": 36043648, + "step": 170790 + }, + { + "epoch": 18.78932893289329, + "grad_norm": 0.11453031748533249, + "learning_rate": 5.561337083962781e-07, + "loss": 0.0025, + "num_input_tokens_seen": 36044704, + "step": 170795 + }, + { + "epoch": 18.78987898789879, + "grad_norm": 0.024444445967674255, + "learning_rate": 5.556304023799058e-07, + "loss": 0.1039, + "num_input_tokens_seen": 36045728, + "step": 170800 + }, + { + "epoch": 18.79042904290429, + "grad_norm": 0.37435469031333923, + "learning_rate": 5.55127321655563e-07, + "loss": 0.033, + "num_input_tokens_seen": 36046784, + "step": 170805 + }, + { + "epoch": 18.79097909790979, + "grad_norm": 0.07573948055505753, + "learning_rate": 5.546244662278766e-07, + "loss": 0.0263, + "num_input_tokens_seen": 36047936, + "step": 170810 + }, + { + "epoch": 18.79152915291529, + "grad_norm": 0.9794524312019348, + "learning_rate": 5.541218361014927e-07, + "loss": 0.0066, + "num_input_tokens_seen": 36048928, + "step": 170815 + }, + { + "epoch": 18.792079207920793, + "grad_norm": 0.0922284871339798, + "learning_rate": 5.536194312810355e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36049920, + "step": 170820 + }, + { + "epoch": 18.792629262926294, + "grad_norm": 3.8829946517944336, + "learning_rate": 5.531172517711347e-07, + "loss": 0.1212, + "num_input_tokens_seen": 36050976, + "step": 170825 + }, + { + "epoch": 18.793179317931795, + "grad_norm": 0.23940539360046387, + "learning_rate": 5.526152975764226e-07, + "loss": 0.0086, + "num_input_tokens_seen": 36052064, + "step": 170830 + }, + { + "epoch": 18.793729372937293, + "grad_norm": 0.022465858608484268, + "learning_rate": 5.521135687015261e-07, + "loss": 0.0023, + "num_input_tokens_seen": 36053056, + "step": 170835 + }, + { + "epoch": 18.794279427942794, + "grad_norm": 0.005774301942437887, + "learning_rate": 5.516120651510665e-07, + "loss": 0.0452, + "num_input_tokens_seen": 36054208, + "step": 170840 + }, + { + "epoch": 18.794829482948295, + "grad_norm": 2.0890848636627197, + "learning_rate": 5.511107869296677e-07, + "loss": 0.1028, + "num_input_tokens_seen": 36055232, + "step": 170845 + }, + { + "epoch": 18.795379537953796, + "grad_norm": 0.05408769100904465, + "learning_rate": 5.506097340419458e-07, + "loss": 0.013, + "num_input_tokens_seen": 36056288, + "step": 170850 + }, + { + "epoch": 18.795929592959297, + "grad_norm": 0.18714803457260132, + "learning_rate": 5.501089064925246e-07, + "loss": 0.0037, + "num_input_tokens_seen": 36057312, + "step": 170855 + }, + { + "epoch": 18.796479647964798, + "grad_norm": 0.05770699307322502, + "learning_rate": 5.496083042860201e-07, + "loss": 0.0069, + "num_input_tokens_seen": 36058400, + "step": 170860 + }, + { + "epoch": 18.797029702970296, + "grad_norm": 0.03237457945942879, + "learning_rate": 5.491079274270394e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36059424, + "step": 170865 + }, + { + "epoch": 18.797579757975797, + "grad_norm": 0.0044518448412418365, + "learning_rate": 5.486077759202013e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36060448, + "step": 170870 + }, + { + "epoch": 18.798129812981298, + "grad_norm": 2.2383105754852295, + "learning_rate": 5.481078497701075e-07, + "loss": 0.0289, + "num_input_tokens_seen": 36061440, + "step": 170875 + }, + { + "epoch": 18.7986798679868, + "grad_norm": 0.02049352042376995, + "learning_rate": 5.476081489813767e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36062496, + "step": 170880 + }, + { + "epoch": 18.7992299229923, + "grad_norm": 0.014915459789335728, + "learning_rate": 5.471086735586023e-07, + "loss": 0.0617, + "num_input_tokens_seen": 36063520, + "step": 170885 + }, + { + "epoch": 18.7997799779978, + "grad_norm": 0.061595167964696884, + "learning_rate": 5.466094235063973e-07, + "loss": 0.0457, + "num_input_tokens_seen": 36064576, + "step": 170890 + }, + { + "epoch": 18.8003300330033, + "grad_norm": 0.0027502889279276133, + "learning_rate": 5.461103988293581e-07, + "loss": 0.0081, + "num_input_tokens_seen": 36065632, + "step": 170895 + }, + { + "epoch": 18.8008800880088, + "grad_norm": 0.040395211428403854, + "learning_rate": 5.456115995320866e-07, + "loss": 0.0596, + "num_input_tokens_seen": 36066624, + "step": 170900 + }, + { + "epoch": 18.8014301430143, + "grad_norm": 0.010859117843210697, + "learning_rate": 5.451130256191789e-07, + "loss": 0.0755, + "num_input_tokens_seen": 36067712, + "step": 170905 + }, + { + "epoch": 18.801980198019802, + "grad_norm": 0.6377915143966675, + "learning_rate": 5.446146770952288e-07, + "loss": 0.0224, + "num_input_tokens_seen": 36068736, + "step": 170910 + }, + { + "epoch": 18.802530253025303, + "grad_norm": 0.006329936441034079, + "learning_rate": 5.441165539648269e-07, + "loss": 0.0437, + "num_input_tokens_seen": 36069792, + "step": 170915 + }, + { + "epoch": 18.803080308030804, + "grad_norm": 0.047332316637039185, + "learning_rate": 5.436186562325724e-07, + "loss": 0.002, + "num_input_tokens_seen": 36070816, + "step": 170920 + }, + { + "epoch": 18.803630363036305, + "grad_norm": 0.026736335828900337, + "learning_rate": 5.431209839030477e-07, + "loss": 0.0571, + "num_input_tokens_seen": 36071808, + "step": 170925 + }, + { + "epoch": 18.804180418041803, + "grad_norm": 0.009840895421802998, + "learning_rate": 5.426235369808408e-07, + "loss": 0.053, + "num_input_tokens_seen": 36072864, + "step": 170930 + }, + { + "epoch": 18.804730473047304, + "grad_norm": 0.2877131998538971, + "learning_rate": 5.421263154705397e-07, + "loss": 0.015, + "num_input_tokens_seen": 36073824, + "step": 170935 + }, + { + "epoch": 18.805280528052805, + "grad_norm": 0.039435096085071564, + "learning_rate": 5.416293193767214e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36074848, + "step": 170940 + }, + { + "epoch": 18.805830583058306, + "grad_norm": 0.1164797842502594, + "learning_rate": 5.411325487039709e-07, + "loss": 0.0337, + "num_input_tokens_seen": 36075872, + "step": 170945 + }, + { + "epoch": 18.806380638063807, + "grad_norm": 0.011949100531637669, + "learning_rate": 5.406360034568625e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36076960, + "step": 170950 + }, + { + "epoch": 18.806930693069308, + "grad_norm": 0.06960628926753998, + "learning_rate": 5.401396836399786e-07, + "loss": 0.081, + "num_input_tokens_seen": 36077984, + "step": 170955 + }, + { + "epoch": 18.80748074807481, + "grad_norm": 0.1477135270833969, + "learning_rate": 5.396435892578905e-07, + "loss": 0.0042, + "num_input_tokens_seen": 36079040, + "step": 170960 + }, + { + "epoch": 18.808030803080307, + "grad_norm": 0.031005611643195152, + "learning_rate": 5.391477203151696e-07, + "loss": 0.0457, + "num_input_tokens_seen": 36080128, + "step": 170965 + }, + { + "epoch": 18.808580858085808, + "grad_norm": 0.02433479018509388, + "learning_rate": 5.386520768163872e-07, + "loss": 0.0432, + "num_input_tokens_seen": 36081184, + "step": 170970 + }, + { + "epoch": 18.80913091309131, + "grad_norm": 0.010419760830700397, + "learning_rate": 5.381566587661064e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36082272, + "step": 170975 + }, + { + "epoch": 18.80968096809681, + "grad_norm": 0.0068898978643119335, + "learning_rate": 5.376614661689012e-07, + "loss": 0.0546, + "num_input_tokens_seen": 36083328, + "step": 170980 + }, + { + "epoch": 18.81023102310231, + "grad_norm": 0.03473740443587303, + "learning_rate": 5.371664990293346e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36084448, + "step": 170985 + }, + { + "epoch": 18.810781078107812, + "grad_norm": 0.06878332048654556, + "learning_rate": 5.366717573519614e-07, + "loss": 0.1086, + "num_input_tokens_seen": 36085472, + "step": 170990 + }, + { + "epoch": 18.81133113311331, + "grad_norm": 0.010200172662734985, + "learning_rate": 5.361772411413502e-07, + "loss": 0.0038, + "num_input_tokens_seen": 36086528, + "step": 170995 + }, + { + "epoch": 18.81188118811881, + "grad_norm": 0.07399115711450577, + "learning_rate": 5.3568295040205e-07, + "loss": 0.0097, + "num_input_tokens_seen": 36087552, + "step": 171000 + }, + { + "epoch": 18.812431243124312, + "grad_norm": 0.041861847043037415, + "learning_rate": 5.351888851386211e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36088640, + "step": 171005 + }, + { + "epoch": 18.812981298129813, + "grad_norm": 0.07012435793876648, + "learning_rate": 5.34695045355621e-07, + "loss": 0.0355, + "num_input_tokens_seen": 36089696, + "step": 171010 + }, + { + "epoch": 18.813531353135314, + "grad_norm": 0.09961026906967163, + "learning_rate": 5.342014310575933e-07, + "loss": 0.0852, + "num_input_tokens_seen": 36090816, + "step": 171015 + }, + { + "epoch": 18.814081408140815, + "grad_norm": 0.013337657786905766, + "learning_rate": 5.337080422490954e-07, + "loss": 0.0047, + "num_input_tokens_seen": 36091872, + "step": 171020 + }, + { + "epoch": 18.814631463146316, + "grad_norm": 0.01599816419184208, + "learning_rate": 5.33214878934668e-07, + "loss": 0.001, + "num_input_tokens_seen": 36092960, + "step": 171025 + }, + { + "epoch": 18.815181518151814, + "grad_norm": 0.08711404353380203, + "learning_rate": 5.327219411188577e-07, + "loss": 0.0034, + "num_input_tokens_seen": 36094048, + "step": 171030 + }, + { + "epoch": 18.815731573157315, + "grad_norm": 4.993767738342285, + "learning_rate": 5.322292288062081e-07, + "loss": 0.0168, + "num_input_tokens_seen": 36095072, + "step": 171035 + }, + { + "epoch": 18.816281628162816, + "grad_norm": 0.05522114783525467, + "learning_rate": 5.317367420012625e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36096128, + "step": 171040 + }, + { + "epoch": 18.816831683168317, + "grad_norm": 2.104325294494629, + "learning_rate": 5.312444807085592e-07, + "loss": 0.056, + "num_input_tokens_seen": 36097184, + "step": 171045 + }, + { + "epoch": 18.817381738173818, + "grad_norm": 0.05601634830236435, + "learning_rate": 5.307524449326363e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36098240, + "step": 171050 + }, + { + "epoch": 18.81793179317932, + "grad_norm": 0.9262897372245789, + "learning_rate": 5.302606346780203e-07, + "loss": 0.0062, + "num_input_tokens_seen": 36099296, + "step": 171055 + }, + { + "epoch": 18.818481848184817, + "grad_norm": 0.027026144787669182, + "learning_rate": 5.297690499492525e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36100384, + "step": 171060 + }, + { + "epoch": 18.819031903190318, + "grad_norm": 0.23837660253047943, + "learning_rate": 5.292776907508623e-07, + "loss": 0.0061, + "num_input_tokens_seen": 36101504, + "step": 171065 + }, + { + "epoch": 18.81958195819582, + "grad_norm": 0.01539201382547617, + "learning_rate": 5.287865570873796e-07, + "loss": 0.0739, + "num_input_tokens_seen": 36102656, + "step": 171070 + }, + { + "epoch": 18.82013201320132, + "grad_norm": 0.0345689058303833, + "learning_rate": 5.282956489633284e-07, + "loss": 0.0134, + "num_input_tokens_seen": 36103776, + "step": 171075 + }, + { + "epoch": 18.82068206820682, + "grad_norm": 0.007730761542916298, + "learning_rate": 5.278049663832302e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36104832, + "step": 171080 + }, + { + "epoch": 18.821232123212322, + "grad_norm": 0.015921924263238907, + "learning_rate": 5.273145093516146e-07, + "loss": 0.0182, + "num_input_tokens_seen": 36105888, + "step": 171085 + }, + { + "epoch": 18.821782178217823, + "grad_norm": 0.04158617556095123, + "learning_rate": 5.268242778729948e-07, + "loss": 0.002, + "num_input_tokens_seen": 36106976, + "step": 171090 + }, + { + "epoch": 18.82233223322332, + "grad_norm": 0.0339365154504776, + "learning_rate": 5.263342719518921e-07, + "loss": 0.0301, + "num_input_tokens_seen": 36108032, + "step": 171095 + }, + { + "epoch": 18.822882288228822, + "grad_norm": 0.40896496176719666, + "learning_rate": 5.258444915928252e-07, + "loss": 0.11, + "num_input_tokens_seen": 36109024, + "step": 171100 + }, + { + "epoch": 18.823432343234323, + "grad_norm": 0.015077976509928703, + "learning_rate": 5.253549368003041e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36110048, + "step": 171105 + }, + { + "epoch": 18.823982398239824, + "grad_norm": 0.015189089812338352, + "learning_rate": 5.24865607578845e-07, + "loss": 0.003, + "num_input_tokens_seen": 36111104, + "step": 171110 + }, + { + "epoch": 18.824532453245325, + "grad_norm": 0.3609815835952759, + "learning_rate": 5.243765039329524e-07, + "loss": 0.0053, + "num_input_tokens_seen": 36112096, + "step": 171115 + }, + { + "epoch": 18.825082508250826, + "grad_norm": 1.8256597518920898, + "learning_rate": 5.238876258671366e-07, + "loss": 0.0141, + "num_input_tokens_seen": 36113120, + "step": 171120 + }, + { + "epoch": 18.825632563256324, + "grad_norm": 0.1511034369468689, + "learning_rate": 5.233989733859052e-07, + "loss": 0.0725, + "num_input_tokens_seen": 36114176, + "step": 171125 + }, + { + "epoch": 18.826182618261825, + "grad_norm": 0.02180662751197815, + "learning_rate": 5.2291054649376e-07, + "loss": 0.033, + "num_input_tokens_seen": 36115136, + "step": 171130 + }, + { + "epoch": 18.826732673267326, + "grad_norm": 0.03849030286073685, + "learning_rate": 5.22422345195206e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36116160, + "step": 171135 + }, + { + "epoch": 18.827282728272827, + "grad_norm": 0.06554332375526428, + "learning_rate": 5.219343694947365e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36117248, + "step": 171140 + }, + { + "epoch": 18.82783278327833, + "grad_norm": 0.007181256078183651, + "learning_rate": 5.214466193968537e-07, + "loss": 0.0005, + "num_input_tokens_seen": 36118336, + "step": 171145 + }, + { + "epoch": 18.82838283828383, + "grad_norm": 1.1615867614746094, + "learning_rate": 5.209590949060483e-07, + "loss": 0.0097, + "num_input_tokens_seen": 36119424, + "step": 171150 + }, + { + "epoch": 18.82893289328933, + "grad_norm": 0.01995597966015339, + "learning_rate": 5.204717960268196e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36120480, + "step": 171155 + }, + { + "epoch": 18.829482948294828, + "grad_norm": 0.030761929228901863, + "learning_rate": 5.199847227636556e-07, + "loss": 0.1032, + "num_input_tokens_seen": 36121568, + "step": 171160 + }, + { + "epoch": 18.83003300330033, + "grad_norm": 0.0027751110028475523, + "learning_rate": 5.194978751210444e-07, + "loss": 0.0049, + "num_input_tokens_seen": 36122688, + "step": 171165 + }, + { + "epoch": 18.83058305830583, + "grad_norm": 0.0136237358674407, + "learning_rate": 5.190112531034769e-07, + "loss": 0.0038, + "num_input_tokens_seen": 36123712, + "step": 171170 + }, + { + "epoch": 18.83113311331133, + "grad_norm": 0.01759140007197857, + "learning_rate": 5.185248567154327e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36124832, + "step": 171175 + }, + { + "epoch": 18.831683168316832, + "grad_norm": 0.18348652124404907, + "learning_rate": 5.180386859614001e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36125920, + "step": 171180 + }, + { + "epoch": 18.832233223322334, + "grad_norm": 0.05038623511791229, + "learning_rate": 5.175527408458558e-07, + "loss": 0.0074, + "num_input_tokens_seen": 36126944, + "step": 171185 + }, + { + "epoch": 18.83278327832783, + "grad_norm": 0.23274806141853333, + "learning_rate": 5.170670213732798e-07, + "loss": 0.0061, + "num_input_tokens_seen": 36128032, + "step": 171190 + }, + { + "epoch": 18.833333333333332, + "grad_norm": 0.11230384558439255, + "learning_rate": 5.165815275481517e-07, + "loss": 0.0277, + "num_input_tokens_seen": 36129056, + "step": 171195 + }, + { + "epoch": 18.833883388338833, + "grad_norm": 0.002415176946669817, + "learning_rate": 5.16096259374943e-07, + "loss": 0.0655, + "num_input_tokens_seen": 36130080, + "step": 171200 + }, + { + "epoch": 18.834433443344334, + "grad_norm": 0.03103211149573326, + "learning_rate": 5.156112168581251e-07, + "loss": 0.0005, + "num_input_tokens_seen": 36131072, + "step": 171205 + }, + { + "epoch": 18.834983498349835, + "grad_norm": 0.03625805303454399, + "learning_rate": 5.151264000021666e-07, + "loss": 0.0032, + "num_input_tokens_seen": 36132096, + "step": 171210 + }, + { + "epoch": 18.835533553355337, + "grad_norm": 0.12454285472631454, + "learning_rate": 5.146418088115445e-07, + "loss": 0.0523, + "num_input_tokens_seen": 36133152, + "step": 171215 + }, + { + "epoch": 18.836083608360838, + "grad_norm": 0.007737788371741772, + "learning_rate": 5.141574432907165e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36134208, + "step": 171220 + }, + { + "epoch": 18.836633663366335, + "grad_norm": 0.0256990734487772, + "learning_rate": 5.136733034441538e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36135200, + "step": 171225 + }, + { + "epoch": 18.837183718371836, + "grad_norm": 0.006997105665504932, + "learning_rate": 5.131893892763085e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36136288, + "step": 171230 + }, + { + "epoch": 18.837733773377337, + "grad_norm": 0.11001898348331451, + "learning_rate": 5.127057007916492e-07, + "loss": 0.0544, + "num_input_tokens_seen": 36137376, + "step": 171235 + }, + { + "epoch": 18.83828382838284, + "grad_norm": 0.08249631524085999, + "learning_rate": 5.122222379946334e-07, + "loss": 0.0714, + "num_input_tokens_seen": 36138400, + "step": 171240 + }, + { + "epoch": 18.83883388338834, + "grad_norm": 0.01950235851109028, + "learning_rate": 5.117390008897133e-07, + "loss": 0.1623, + "num_input_tokens_seen": 36139360, + "step": 171245 + }, + { + "epoch": 18.83938393839384, + "grad_norm": 0.274859756231308, + "learning_rate": 5.112559894813434e-07, + "loss": 0.0086, + "num_input_tokens_seen": 36140416, + "step": 171250 + }, + { + "epoch": 18.83993399339934, + "grad_norm": 0.28894683718681335, + "learning_rate": 5.107732037739787e-07, + "loss": 0.0023, + "num_input_tokens_seen": 36141472, + "step": 171255 + }, + { + "epoch": 18.84048404840484, + "grad_norm": 0.00349739333614707, + "learning_rate": 5.102906437720656e-07, + "loss": 0.0078, + "num_input_tokens_seen": 36142528, + "step": 171260 + }, + { + "epoch": 18.84103410341034, + "grad_norm": 0.012083037756383419, + "learning_rate": 5.098083094800505e-07, + "loss": 0.0377, + "num_input_tokens_seen": 36143616, + "step": 171265 + }, + { + "epoch": 18.84158415841584, + "grad_norm": 0.10046937316656113, + "learning_rate": 5.093262009023797e-07, + "loss": 0.0052, + "num_input_tokens_seen": 36144736, + "step": 171270 + }, + { + "epoch": 18.842134213421343, + "grad_norm": 0.07987526804208755, + "learning_rate": 5.088443180434998e-07, + "loss": 0.0352, + "num_input_tokens_seen": 36145760, + "step": 171275 + }, + { + "epoch": 18.842684268426844, + "grad_norm": 0.046946931630373, + "learning_rate": 5.08362660907849e-07, + "loss": 0.0076, + "num_input_tokens_seen": 36146784, + "step": 171280 + }, + { + "epoch": 18.843234323432345, + "grad_norm": 0.07250738143920898, + "learning_rate": 5.078812294998708e-07, + "loss": 0.0176, + "num_input_tokens_seen": 36147872, + "step": 171285 + }, + { + "epoch": 18.843784378437842, + "grad_norm": 0.03713265806436539, + "learning_rate": 5.074000238239924e-07, + "loss": 0.1104, + "num_input_tokens_seen": 36148992, + "step": 171290 + }, + { + "epoch": 18.844334433443343, + "grad_norm": 2.1921567916870117, + "learning_rate": 5.0691904388466e-07, + "loss": 0.0378, + "num_input_tokens_seen": 36150080, + "step": 171295 + }, + { + "epoch": 18.844884488448844, + "grad_norm": 0.007277548313140869, + "learning_rate": 5.064382896863007e-07, + "loss": 0.0762, + "num_input_tokens_seen": 36151168, + "step": 171300 + }, + { + "epoch": 18.845434543454346, + "grad_norm": 0.14915135502815247, + "learning_rate": 5.059577612333443e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36152256, + "step": 171305 + }, + { + "epoch": 18.845984598459847, + "grad_norm": 0.004801280330866575, + "learning_rate": 5.054774585302264e-07, + "loss": 0.0369, + "num_input_tokens_seen": 36153280, + "step": 171310 + }, + { + "epoch": 18.846534653465348, + "grad_norm": 0.7719210386276245, + "learning_rate": 5.04997381581368e-07, + "loss": 0.0118, + "num_input_tokens_seen": 36154304, + "step": 171315 + }, + { + "epoch": 18.847084708470845, + "grad_norm": 0.11746735870838165, + "learning_rate": 5.045175303911937e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36155360, + "step": 171320 + }, + { + "epoch": 18.847634763476346, + "grad_norm": 1.1166706085205078, + "learning_rate": 5.040379049641247e-07, + "loss": 0.0327, + "num_input_tokens_seen": 36156416, + "step": 171325 + }, + { + "epoch": 18.848184818481847, + "grad_norm": 2.0407910346984863, + "learning_rate": 5.035585053045855e-07, + "loss": 0.0248, + "num_input_tokens_seen": 36157440, + "step": 171330 + }, + { + "epoch": 18.84873487348735, + "grad_norm": 0.007410266902297735, + "learning_rate": 5.030793314169946e-07, + "loss": 0.0074, + "num_input_tokens_seen": 36158528, + "step": 171335 + }, + { + "epoch": 18.84928492849285, + "grad_norm": 0.020226482301950455, + "learning_rate": 5.02600383305768e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36159584, + "step": 171340 + }, + { + "epoch": 18.84983498349835, + "grad_norm": 0.0034410215448588133, + "learning_rate": 5.02121660975316e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36160672, + "step": 171345 + }, + { + "epoch": 18.850385038503852, + "grad_norm": 0.36955246329307556, + "learning_rate": 5.016431644300517e-07, + "loss": 0.0036, + "num_input_tokens_seen": 36161728, + "step": 171350 + }, + { + "epoch": 18.85093509350935, + "grad_norm": 0.09238763898611069, + "learning_rate": 5.011648936743885e-07, + "loss": 0.026, + "num_input_tokens_seen": 36162816, + "step": 171355 + }, + { + "epoch": 18.85148514851485, + "grad_norm": 2.530820369720459, + "learning_rate": 5.006868487127364e-07, + "loss": 0.1144, + "num_input_tokens_seen": 36163840, + "step": 171360 + }, + { + "epoch": 18.85203520352035, + "grad_norm": 0.2584715783596039, + "learning_rate": 5.002090295494949e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36164864, + "step": 171365 + }, + { + "epoch": 18.852585258525853, + "grad_norm": 0.036202799528837204, + "learning_rate": 4.997314361890715e-07, + "loss": 0.0041, + "num_input_tokens_seen": 36166016, + "step": 171370 + }, + { + "epoch": 18.853135313531354, + "grad_norm": 0.006297758314758539, + "learning_rate": 4.992540686358682e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36166976, + "step": 171375 + }, + { + "epoch": 18.853685368536855, + "grad_norm": 0.582635760307312, + "learning_rate": 4.987769268942788e-07, + "loss": 0.0352, + "num_input_tokens_seen": 36168000, + "step": 171380 + }, + { + "epoch": 18.854235423542356, + "grad_norm": 0.09053274989128113, + "learning_rate": 4.983000109687081e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36169152, + "step": 171385 + }, + { + "epoch": 18.854785478547853, + "grad_norm": 0.07075855880975723, + "learning_rate": 4.978233208635497e-07, + "loss": 0.0864, + "num_input_tokens_seen": 36170240, + "step": 171390 + }, + { + "epoch": 18.855335533553355, + "grad_norm": 0.005754241719841957, + "learning_rate": 4.973468565831973e-07, + "loss": 0.002, + "num_input_tokens_seen": 36171296, + "step": 171395 + }, + { + "epoch": 18.855885588558856, + "grad_norm": 0.17940902709960938, + "learning_rate": 4.968706181320393e-07, + "loss": 0.005, + "num_input_tokens_seen": 36172320, + "step": 171400 + }, + { + "epoch": 18.856435643564357, + "grad_norm": 4.506389617919922, + "learning_rate": 4.963946055144691e-07, + "loss": 0.0732, + "num_input_tokens_seen": 36173344, + "step": 171405 + }, + { + "epoch": 18.856985698569858, + "grad_norm": 0.08523071557283401, + "learning_rate": 4.959188187348695e-07, + "loss": 0.0499, + "num_input_tokens_seen": 36174368, + "step": 171410 + }, + { + "epoch": 18.85753575357536, + "grad_norm": 0.1200856864452362, + "learning_rate": 4.954432577976315e-07, + "loss": 0.0233, + "num_input_tokens_seen": 36175424, + "step": 171415 + }, + { + "epoch": 18.858085808580856, + "grad_norm": 0.04300343617796898, + "learning_rate": 4.949679227071291e-07, + "loss": 0.0067, + "num_input_tokens_seen": 36176608, + "step": 171420 + }, + { + "epoch": 18.858635863586358, + "grad_norm": 0.014320731163024902, + "learning_rate": 4.944928134677535e-07, + "loss": 0.0085, + "num_input_tokens_seen": 36177664, + "step": 171425 + }, + { + "epoch": 18.85918591859186, + "grad_norm": 0.6701051592826843, + "learning_rate": 4.940179300838788e-07, + "loss": 0.0169, + "num_input_tokens_seen": 36178656, + "step": 171430 + }, + { + "epoch": 18.85973597359736, + "grad_norm": 0.024181129410862923, + "learning_rate": 4.935432725598793e-07, + "loss": 0.0144, + "num_input_tokens_seen": 36179808, + "step": 171435 + }, + { + "epoch": 18.86028602860286, + "grad_norm": 0.059547506272792816, + "learning_rate": 4.930688409001322e-07, + "loss": 0.0838, + "num_input_tokens_seen": 36180896, + "step": 171440 + }, + { + "epoch": 18.860836083608362, + "grad_norm": 0.08976297825574875, + "learning_rate": 4.925946351090116e-07, + "loss": 0.0058, + "num_input_tokens_seen": 36181888, + "step": 171445 + }, + { + "epoch": 18.861386138613863, + "grad_norm": 0.12575384974479675, + "learning_rate": 4.921206551908864e-07, + "loss": 0.187, + "num_input_tokens_seen": 36182976, + "step": 171450 + }, + { + "epoch": 18.86193619361936, + "grad_norm": 0.027745453640818596, + "learning_rate": 4.916469011501251e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36184032, + "step": 171455 + }, + { + "epoch": 18.86248624862486, + "grad_norm": 0.05272693932056427, + "learning_rate": 4.911733729910911e-07, + "loss": 0.0034, + "num_input_tokens_seen": 36185024, + "step": 171460 + }, + { + "epoch": 18.863036303630363, + "grad_norm": 0.2632715106010437, + "learning_rate": 4.90700070718153e-07, + "loss": 0.0115, + "num_input_tokens_seen": 36186048, + "step": 171465 + }, + { + "epoch": 18.863586358635864, + "grad_norm": 4.489149570465088, + "learning_rate": 4.902269943356713e-07, + "loss": 0.0338, + "num_input_tokens_seen": 36187104, + "step": 171470 + }, + { + "epoch": 18.864136413641365, + "grad_norm": 0.003473080927506089, + "learning_rate": 4.897541438480091e-07, + "loss": 0.0102, + "num_input_tokens_seen": 36188128, + "step": 171475 + }, + { + "epoch": 18.864686468646866, + "grad_norm": 0.013536931946873665, + "learning_rate": 4.89281519259524e-07, + "loss": 0.001, + "num_input_tokens_seen": 36189216, + "step": 171480 + }, + { + "epoch": 18.865236523652364, + "grad_norm": 0.012700566090643406, + "learning_rate": 4.888091205745626e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36190208, + "step": 171485 + }, + { + "epoch": 18.865786578657865, + "grad_norm": 3.556178569793701, + "learning_rate": 4.883369477974909e-07, + "loss": 0.0962, + "num_input_tokens_seen": 36191264, + "step": 171490 + }, + { + "epoch": 18.866336633663366, + "grad_norm": 0.020197506994009018, + "learning_rate": 4.878650009326552e-07, + "loss": 0.0092, + "num_input_tokens_seen": 36192320, + "step": 171495 + }, + { + "epoch": 18.866886688668867, + "grad_norm": 0.3959624171257019, + "learning_rate": 4.873932799844022e-07, + "loss": 0.0051, + "num_input_tokens_seen": 36193344, + "step": 171500 + }, + { + "epoch": 18.867436743674368, + "grad_norm": 0.03677162528038025, + "learning_rate": 4.869217849570868e-07, + "loss": 0.0396, + "num_input_tokens_seen": 36194368, + "step": 171505 + }, + { + "epoch": 18.86798679867987, + "grad_norm": 0.07468623667955399, + "learning_rate": 4.864505158550498e-07, + "loss": 0.0044, + "num_input_tokens_seen": 36195488, + "step": 171510 + }, + { + "epoch": 18.86853685368537, + "grad_norm": 0.017986075952649117, + "learning_rate": 4.85979472682635e-07, + "loss": 0.0045, + "num_input_tokens_seen": 36196576, + "step": 171515 + }, + { + "epoch": 18.869086908690868, + "grad_norm": 0.11688901484012604, + "learning_rate": 4.855086554441862e-07, + "loss": 0.0425, + "num_input_tokens_seen": 36197600, + "step": 171520 + }, + { + "epoch": 18.86963696369637, + "grad_norm": 1.113075852394104, + "learning_rate": 4.850380641440388e-07, + "loss": 0.1007, + "num_input_tokens_seen": 36198720, + "step": 171525 + }, + { + "epoch": 18.87018701870187, + "grad_norm": 0.05715010315179825, + "learning_rate": 4.845676987865338e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36199776, + "step": 171530 + }, + { + "epoch": 18.87073707370737, + "grad_norm": 1.087296962738037, + "learning_rate": 4.840975593760011e-07, + "loss": 0.0046, + "num_input_tokens_seen": 36200864, + "step": 171535 + }, + { + "epoch": 18.871287128712872, + "grad_norm": 0.2799840271472931, + "learning_rate": 4.836276459167816e-07, + "loss": 0.0067, + "num_input_tokens_seen": 36201984, + "step": 171540 + }, + { + "epoch": 18.871837183718373, + "grad_norm": 0.14817547798156738, + "learning_rate": 4.831579584131996e-07, + "loss": 0.0183, + "num_input_tokens_seen": 36203008, + "step": 171545 + }, + { + "epoch": 18.87238723872387, + "grad_norm": 2.0055174827575684, + "learning_rate": 4.826884968695905e-07, + "loss": 0.0336, + "num_input_tokens_seen": 36204064, + "step": 171550 + }, + { + "epoch": 18.872937293729372, + "grad_norm": 0.03132397681474686, + "learning_rate": 4.822192612902704e-07, + "loss": 0.089, + "num_input_tokens_seen": 36205056, + "step": 171555 + }, + { + "epoch": 18.873487348734873, + "grad_norm": 4.838985919952393, + "learning_rate": 4.817502516795747e-07, + "loss": 0.0323, + "num_input_tokens_seen": 36206112, + "step": 171560 + }, + { + "epoch": 18.874037403740374, + "grad_norm": 0.012781431898474693, + "learning_rate": 4.812814680418221e-07, + "loss": 0.0794, + "num_input_tokens_seen": 36207072, + "step": 171565 + }, + { + "epoch": 18.874587458745875, + "grad_norm": 0.056012410670518875, + "learning_rate": 4.808129103813313e-07, + "loss": 0.0049, + "num_input_tokens_seen": 36208160, + "step": 171570 + }, + { + "epoch": 18.875137513751376, + "grad_norm": 0.02677934803068638, + "learning_rate": 4.803445787024241e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36209120, + "step": 171575 + }, + { + "epoch": 18.875687568756877, + "grad_norm": 0.06364884972572327, + "learning_rate": 4.798764730094163e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36210112, + "step": 171580 + }, + { + "epoch": 18.876237623762375, + "grad_norm": 0.01787872426211834, + "learning_rate": 4.794085933066156e-07, + "loss": 0.0392, + "num_input_tokens_seen": 36211232, + "step": 171585 + }, + { + "epoch": 18.876787678767876, + "grad_norm": 0.3129260540008545, + "learning_rate": 4.789409395983435e-07, + "loss": 0.0663, + "num_input_tokens_seen": 36212320, + "step": 171590 + }, + { + "epoch": 18.877337733773377, + "grad_norm": 0.05708093196153641, + "learning_rate": 4.784735118889077e-07, + "loss": 0.0302, + "num_input_tokens_seen": 36213440, + "step": 171595 + }, + { + "epoch": 18.877887788778878, + "grad_norm": 0.06282458454370499, + "learning_rate": 4.780063101826132e-07, + "loss": 0.0025, + "num_input_tokens_seen": 36214464, + "step": 171600 + }, + { + "epoch": 18.87843784378438, + "grad_norm": 0.04744873568415642, + "learning_rate": 4.775393344837676e-07, + "loss": 0.0064, + "num_input_tokens_seen": 36215520, + "step": 171605 + }, + { + "epoch": 18.87898789878988, + "grad_norm": 0.008663025684654713, + "learning_rate": 4.770725847966756e-07, + "loss": 0.011, + "num_input_tokens_seen": 36216576, + "step": 171610 + }, + { + "epoch": 18.879537953795378, + "grad_norm": 0.0031951211858540773, + "learning_rate": 4.766060611256368e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36217664, + "step": 171615 + }, + { + "epoch": 18.88008800880088, + "grad_norm": 0.06614504754543304, + "learning_rate": 4.761397634749559e-07, + "loss": 0.0358, + "num_input_tokens_seen": 36218720, + "step": 171620 + }, + { + "epoch": 18.88063806380638, + "grad_norm": 2.102705717086792, + "learning_rate": 4.7567369184892407e-07, + "loss": 0.0343, + "num_input_tokens_seen": 36219840, + "step": 171625 + }, + { + "epoch": 18.88118811881188, + "grad_norm": 0.03146975487470627, + "learning_rate": 4.752078462518406e-07, + "loss": 0.1525, + "num_input_tokens_seen": 36220896, + "step": 171630 + }, + { + "epoch": 18.881738173817382, + "grad_norm": 0.02221653237938881, + "learning_rate": 4.747422266879992e-07, + "loss": 0.0052, + "num_input_tokens_seen": 36221952, + "step": 171635 + }, + { + "epoch": 18.882288228822883, + "grad_norm": 0.040632762014865875, + "learning_rate": 4.7427683316168814e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36222912, + "step": 171640 + }, + { + "epoch": 18.882838283828384, + "grad_norm": 0.032765090465545654, + "learning_rate": 4.7381166567720124e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36223968, + "step": 171645 + }, + { + "epoch": 18.883388338833882, + "grad_norm": 0.00965152122080326, + "learning_rate": 4.733467242388212e-07, + "loss": 0.124, + "num_input_tokens_seen": 36225024, + "step": 171650 + }, + { + "epoch": 18.883938393839383, + "grad_norm": 0.009132696315646172, + "learning_rate": 4.72882008850839e-07, + "loss": 0.0673, + "num_input_tokens_seen": 36226048, + "step": 171655 + }, + { + "epoch": 18.884488448844884, + "grad_norm": 0.01218496449291706, + "learning_rate": 4.7241751951752897e-07, + "loss": 0.0064, + "num_input_tokens_seen": 36227072, + "step": 171660 + }, + { + "epoch": 18.885038503850385, + "grad_norm": 0.03841136768460274, + "learning_rate": 4.719532562431822e-07, + "loss": 0.0423, + "num_input_tokens_seen": 36228160, + "step": 171665 + }, + { + "epoch": 18.885588558855886, + "grad_norm": 2.7394654750823975, + "learning_rate": 4.714892190320702e-07, + "loss": 0.1022, + "num_input_tokens_seen": 36229216, + "step": 171670 + }, + { + "epoch": 18.886138613861387, + "grad_norm": 0.06908787786960602, + "learning_rate": 4.7102540788847013e-07, + "loss": 0.0222, + "num_input_tokens_seen": 36230304, + "step": 171675 + }, + { + "epoch": 18.88668866886689, + "grad_norm": 0.095645971596241, + "learning_rate": 4.7056182281666194e-07, + "loss": 0.0033, + "num_input_tokens_seen": 36231328, + "step": 171680 + }, + { + "epoch": 18.887238723872386, + "grad_norm": 0.6447641849517822, + "learning_rate": 4.7009846382091436e-07, + "loss": 0.0113, + "num_input_tokens_seen": 36232416, + "step": 171685 + }, + { + "epoch": 18.887788778877887, + "grad_norm": 0.016968555748462677, + "learning_rate": 4.6963533090549904e-07, + "loss": 0.0081, + "num_input_tokens_seen": 36233440, + "step": 171690 + }, + { + "epoch": 18.888338833883388, + "grad_norm": 1.5173485279083252, + "learning_rate": 4.6917242407468476e-07, + "loss": 0.1175, + "num_input_tokens_seen": 36234432, + "step": 171695 + }, + { + "epoch": 18.88888888888889, + "grad_norm": 0.020900612697005272, + "learning_rate": 4.68709743332732e-07, + "loss": 0.0369, + "num_input_tokens_seen": 36235520, + "step": 171700 + }, + { + "epoch": 18.88943894389439, + "grad_norm": 0.09057869762182236, + "learning_rate": 4.682472886839151e-07, + "loss": 0.0864, + "num_input_tokens_seen": 36236608, + "step": 171705 + }, + { + "epoch": 18.88998899889989, + "grad_norm": 0.0017234799452126026, + "learning_rate": 4.677850601324918e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36237632, + "step": 171710 + }, + { + "epoch": 18.89053905390539, + "grad_norm": 0.03224805369973183, + "learning_rate": 4.673230576827198e-07, + "loss": 0.0604, + "num_input_tokens_seen": 36238688, + "step": 171715 + }, + { + "epoch": 18.89108910891089, + "grad_norm": 0.05547988787293434, + "learning_rate": 4.6686128133886233e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36239744, + "step": 171720 + }, + { + "epoch": 18.89163916391639, + "grad_norm": 0.005255864933133125, + "learning_rate": 4.6639973110516875e-07, + "loss": 0.192, + "num_input_tokens_seen": 36240800, + "step": 171725 + }, + { + "epoch": 18.892189218921892, + "grad_norm": 1.629980206489563, + "learning_rate": 4.6593840698589684e-07, + "loss": 0.0945, + "num_input_tokens_seen": 36241824, + "step": 171730 + }, + { + "epoch": 18.892739273927393, + "grad_norm": 0.19010774791240692, + "learning_rate": 4.654773089852987e-07, + "loss": 0.0067, + "num_input_tokens_seen": 36242848, + "step": 171735 + }, + { + "epoch": 18.893289328932894, + "grad_norm": 0.01663362979888916, + "learning_rate": 4.6501643710762375e-07, + "loss": 0.019, + "num_input_tokens_seen": 36243904, + "step": 171740 + }, + { + "epoch": 18.893839383938392, + "grad_norm": 0.010598959401249886, + "learning_rate": 4.645557913571214e-07, + "loss": 0.0375, + "num_input_tokens_seen": 36244928, + "step": 171745 + }, + { + "epoch": 18.894389438943893, + "grad_norm": 3.7856194972991943, + "learning_rate": 4.6409537173802973e-07, + "loss": 0.0666, + "num_input_tokens_seen": 36245984, + "step": 171750 + }, + { + "epoch": 18.894939493949394, + "grad_norm": 0.004631728399544954, + "learning_rate": 4.636351782546039e-07, + "loss": 0.001, + "num_input_tokens_seen": 36247040, + "step": 171755 + }, + { + "epoch": 18.895489548954895, + "grad_norm": 0.2376251071691513, + "learning_rate": 4.6317521091107095e-07, + "loss": 0.0876, + "num_input_tokens_seen": 36248032, + "step": 171760 + }, + { + "epoch": 18.896039603960396, + "grad_norm": 0.016433624550700188, + "learning_rate": 4.627154697116859e-07, + "loss": 0.003, + "num_input_tokens_seen": 36249088, + "step": 171765 + }, + { + "epoch": 18.896589658965897, + "grad_norm": 0.007502448745071888, + "learning_rate": 4.6225595466067585e-07, + "loss": 0.0307, + "num_input_tokens_seen": 36250112, + "step": 171770 + }, + { + "epoch": 18.8971397139714, + "grad_norm": 0.009743603877723217, + "learning_rate": 4.6179666576227633e-07, + "loss": 0.0565, + "num_input_tokens_seen": 36251168, + "step": 171775 + }, + { + "epoch": 18.897689768976896, + "grad_norm": 0.17082205414772034, + "learning_rate": 4.613376030207256e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36252224, + "step": 171780 + }, + { + "epoch": 18.898239823982397, + "grad_norm": 0.09484776109457016, + "learning_rate": 4.608787664402481e-07, + "loss": 0.0183, + "num_input_tokens_seen": 36253280, + "step": 171785 + }, + { + "epoch": 18.8987898789879, + "grad_norm": 0.12140490859746933, + "learning_rate": 4.604201560250765e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36254368, + "step": 171790 + }, + { + "epoch": 18.8993399339934, + "grad_norm": 0.09321434050798416, + "learning_rate": 4.59961771779438e-07, + "loss": 0.0035, + "num_input_tokens_seen": 36255392, + "step": 171795 + }, + { + "epoch": 18.8998899889989, + "grad_norm": 0.18193219602108002, + "learning_rate": 4.59503613707557e-07, + "loss": 0.007, + "num_input_tokens_seen": 36256352, + "step": 171800 + }, + { + "epoch": 18.9004400440044, + "grad_norm": 1.2328740358352661, + "learning_rate": 4.5904568181365516e-07, + "loss": 0.016, + "num_input_tokens_seen": 36257440, + "step": 171805 + }, + { + "epoch": 18.900990099009903, + "grad_norm": 0.7750352621078491, + "learning_rate": 4.58587976101954e-07, + "loss": 0.0468, + "num_input_tokens_seen": 36258560, + "step": 171810 + }, + { + "epoch": 18.9015401540154, + "grad_norm": 0.01729699969291687, + "learning_rate": 4.5813049657666976e-07, + "loss": 0.1092, + "num_input_tokens_seen": 36259648, + "step": 171815 + }, + { + "epoch": 18.9020902090209, + "grad_norm": 1.8302069902420044, + "learning_rate": 4.576732432420211e-07, + "loss": 0.1501, + "num_input_tokens_seen": 36260704, + "step": 171820 + }, + { + "epoch": 18.902640264026402, + "grad_norm": 0.009886330924928188, + "learning_rate": 4.5721621610221866e-07, + "loss": 0.0093, + "num_input_tokens_seen": 36261728, + "step": 171825 + }, + { + "epoch": 18.903190319031903, + "grad_norm": 0.021281033754348755, + "learning_rate": 4.5675941516148124e-07, + "loss": 0.0559, + "num_input_tokens_seen": 36262784, + "step": 171830 + }, + { + "epoch": 18.903740374037405, + "grad_norm": 0.013206402771174908, + "learning_rate": 4.563028404240166e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36263776, + "step": 171835 + }, + { + "epoch": 18.904290429042906, + "grad_norm": 0.002980117918923497, + "learning_rate": 4.5584649189402975e-07, + "loss": 0.0109, + "num_input_tokens_seen": 36264864, + "step": 171840 + }, + { + "epoch": 18.904840484048403, + "grad_norm": 0.011288443580269814, + "learning_rate": 4.5539036957572556e-07, + "loss": 0.0084, + "num_input_tokens_seen": 36265920, + "step": 171845 + }, + { + "epoch": 18.905390539053904, + "grad_norm": 2.1130166053771973, + "learning_rate": 4.549344734733119e-07, + "loss": 0.0331, + "num_input_tokens_seen": 36266912, + "step": 171850 + }, + { + "epoch": 18.905940594059405, + "grad_norm": 0.0056273797526955605, + "learning_rate": 4.544788035909936e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36267968, + "step": 171855 + }, + { + "epoch": 18.906490649064907, + "grad_norm": 0.48217135667800903, + "learning_rate": 4.5402335993296465e-07, + "loss": 0.1585, + "num_input_tokens_seen": 36269056, + "step": 171860 + }, + { + "epoch": 18.907040704070408, + "grad_norm": 0.02504628524184227, + "learning_rate": 4.5356814250342163e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36270080, + "step": 171865 + }, + { + "epoch": 18.90759075907591, + "grad_norm": 0.034071799367666245, + "learning_rate": 4.5311315130656394e-07, + "loss": 0.0005, + "num_input_tokens_seen": 36271168, + "step": 171870 + }, + { + "epoch": 18.90814081408141, + "grad_norm": 0.007019681390374899, + "learning_rate": 4.5265838634658275e-07, + "loss": 0.0768, + "num_input_tokens_seen": 36272192, + "step": 171875 + }, + { + "epoch": 18.908690869086907, + "grad_norm": 0.002448584884405136, + "learning_rate": 4.522038476276691e-07, + "loss": 0.0604, + "num_input_tokens_seen": 36273216, + "step": 171880 + }, + { + "epoch": 18.90924092409241, + "grad_norm": 0.30869120359420776, + "learning_rate": 4.5174953515401405e-07, + "loss": 0.0032, + "num_input_tokens_seen": 36274304, + "step": 171885 + }, + { + "epoch": 18.90979097909791, + "grad_norm": 0.024508750066161156, + "learning_rate": 4.5129544892980604e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36275360, + "step": 171890 + }, + { + "epoch": 18.91034103410341, + "grad_norm": 0.003713174955919385, + "learning_rate": 4.5084158895922767e-07, + "loss": 0.0084, + "num_input_tokens_seen": 36276416, + "step": 171895 + }, + { + "epoch": 18.91089108910891, + "grad_norm": 0.059761643409729004, + "learning_rate": 4.503879552464618e-07, + "loss": 0.004, + "num_input_tokens_seen": 36277408, + "step": 171900 + }, + { + "epoch": 18.911441144114413, + "grad_norm": 1.0898432731628418, + "learning_rate": 4.4993454779568844e-07, + "loss": 0.0719, + "num_input_tokens_seen": 36278496, + "step": 171905 + }, + { + "epoch": 18.91199119911991, + "grad_norm": 0.06741944700479507, + "learning_rate": 4.494813666110903e-07, + "loss": 0.0062, + "num_input_tokens_seen": 36279552, + "step": 171910 + }, + { + "epoch": 18.91254125412541, + "grad_norm": 0.6827479600906372, + "learning_rate": 4.4902841169684183e-07, + "loss": 0.0089, + "num_input_tokens_seen": 36280640, + "step": 171915 + }, + { + "epoch": 18.913091309130913, + "grad_norm": 0.7413759827613831, + "learning_rate": 4.485756830571175e-07, + "loss": 0.0057, + "num_input_tokens_seen": 36281696, + "step": 171920 + }, + { + "epoch": 18.913641364136414, + "grad_norm": 0.18287762999534607, + "learning_rate": 4.4812318069609174e-07, + "loss": 0.1209, + "num_input_tokens_seen": 36282688, + "step": 171925 + }, + { + "epoch": 18.914191419141915, + "grad_norm": 0.09288741648197174, + "learning_rate": 4.476709046179306e-07, + "loss": 0.0486, + "num_input_tokens_seen": 36283680, + "step": 171930 + }, + { + "epoch": 18.914741474147416, + "grad_norm": 0.0852225050330162, + "learning_rate": 4.4721885482680583e-07, + "loss": 0.0183, + "num_input_tokens_seen": 36284736, + "step": 171935 + }, + { + "epoch": 18.915291529152917, + "grad_norm": 0.10795392096042633, + "learning_rate": 4.4676703132688346e-07, + "loss": 0.0205, + "num_input_tokens_seen": 36285760, + "step": 171940 + }, + { + "epoch": 18.915841584158414, + "grad_norm": 0.058825910091400146, + "learning_rate": 4.4631543412232966e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36286784, + "step": 171945 + }, + { + "epoch": 18.916391639163916, + "grad_norm": 2.792797565460205, + "learning_rate": 4.4586406321730224e-07, + "loss": 0.11, + "num_input_tokens_seen": 36287840, + "step": 171950 + }, + { + "epoch": 18.916941694169417, + "grad_norm": 0.5832982063293457, + "learning_rate": 4.4541291861596724e-07, + "loss": 0.0081, + "num_input_tokens_seen": 36288896, + "step": 171955 + }, + { + "epoch": 18.917491749174918, + "grad_norm": 3.4012258052825928, + "learning_rate": 4.4496200032247416e-07, + "loss": 0.0639, + "num_input_tokens_seen": 36289952, + "step": 171960 + }, + { + "epoch": 18.91804180418042, + "grad_norm": 0.02766837738454342, + "learning_rate": 4.4451130834098633e-07, + "loss": 0.0834, + "num_input_tokens_seen": 36291040, + "step": 171965 + }, + { + "epoch": 18.91859185918592, + "grad_norm": 0.15793921053409576, + "learning_rate": 4.440608426756532e-07, + "loss": 0.0089, + "num_input_tokens_seen": 36292096, + "step": 171970 + }, + { + "epoch": 18.919141914191417, + "grad_norm": 0.017198940739035606, + "learning_rate": 4.436106033306298e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36293184, + "step": 171975 + }, + { + "epoch": 18.91969196919692, + "grad_norm": 0.007928315550088882, + "learning_rate": 4.431605903100655e-07, + "loss": 0.0477, + "num_input_tokens_seen": 36294208, + "step": 171980 + }, + { + "epoch": 18.92024202420242, + "grad_norm": 0.6076159477233887, + "learning_rate": 4.4271080361810437e-07, + "loss": 0.003, + "num_input_tokens_seen": 36295200, + "step": 171985 + }, + { + "epoch": 18.92079207920792, + "grad_norm": 0.044097062200307846, + "learning_rate": 4.4226124325889297e-07, + "loss": 0.0636, + "num_input_tokens_seen": 36296256, + "step": 171990 + }, + { + "epoch": 18.921342134213422, + "grad_norm": 0.010958710685372353, + "learning_rate": 4.41811909236578e-07, + "loss": 0.0434, + "num_input_tokens_seen": 36297376, + "step": 171995 + }, + { + "epoch": 18.921892189218923, + "grad_norm": 0.03916047886013985, + "learning_rate": 4.4136280155529783e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36298432, + "step": 172000 + }, + { + "epoch": 18.922442244224424, + "grad_norm": 0.04736426845192909, + "learning_rate": 4.4091392021919087e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36299488, + "step": 172005 + }, + { + "epoch": 18.92299229922992, + "grad_norm": 0.4784226715564728, + "learning_rate": 4.4046526523239816e-07, + "loss": 0.0109, + "num_input_tokens_seen": 36300576, + "step": 172010 + }, + { + "epoch": 18.923542354235423, + "grad_norm": 0.12298313528299332, + "learning_rate": 4.400168365990498e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36301632, + "step": 172015 + }, + { + "epoch": 18.924092409240924, + "grad_norm": 0.05140378326177597, + "learning_rate": 4.3956863432328133e-07, + "loss": 0.2372, + "num_input_tokens_seen": 36302688, + "step": 172020 + }, + { + "epoch": 18.924642464246425, + "grad_norm": 1.1879912614822388, + "learning_rate": 4.391206584092256e-07, + "loss": 0.011, + "num_input_tokens_seen": 36303808, + "step": 172025 + }, + { + "epoch": 18.925192519251926, + "grad_norm": 0.006475450936704874, + "learning_rate": 4.3867290886100974e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36304832, + "step": 172030 + }, + { + "epoch": 18.925742574257427, + "grad_norm": 0.010359871201217175, + "learning_rate": 4.382253856827584e-07, + "loss": 0.0066, + "num_input_tokens_seen": 36305856, + "step": 172035 + }, + { + "epoch": 18.926292629262925, + "grad_norm": 0.1728862076997757, + "learning_rate": 4.3777808887859873e-07, + "loss": 0.1673, + "num_input_tokens_seen": 36306880, + "step": 172040 + }, + { + "epoch": 18.926842684268426, + "grad_norm": 0.1521337926387787, + "learning_rate": 4.3733101845265245e-07, + "loss": 0.0041, + "num_input_tokens_seen": 36307968, + "step": 172045 + }, + { + "epoch": 18.927392739273927, + "grad_norm": 0.0097082844004035, + "learning_rate": 4.368841744090385e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36309056, + "step": 172050 + }, + { + "epoch": 18.927942794279428, + "grad_norm": 0.06878902018070221, + "learning_rate": 4.364375567518758e-07, + "loss": 0.0066, + "num_input_tokens_seen": 36310080, + "step": 172055 + }, + { + "epoch": 18.92849284928493, + "grad_norm": 2.0252628326416016, + "learning_rate": 4.359911654852833e-07, + "loss": 0.1275, + "num_input_tokens_seen": 36311136, + "step": 172060 + }, + { + "epoch": 18.92904290429043, + "grad_norm": 0.3375456631183624, + "learning_rate": 4.355450006133716e-07, + "loss": 0.0557, + "num_input_tokens_seen": 36312192, + "step": 172065 + }, + { + "epoch": 18.92959295929593, + "grad_norm": 1.6610344648361206, + "learning_rate": 4.3509906214025676e-07, + "loss": 0.009, + "num_input_tokens_seen": 36313216, + "step": 172070 + }, + { + "epoch": 18.93014301430143, + "grad_norm": 0.03456384688615799, + "learning_rate": 4.3465335007004393e-07, + "loss": 0.0742, + "num_input_tokens_seen": 36314208, + "step": 172075 + }, + { + "epoch": 18.93069306930693, + "grad_norm": 2.0868802070617676, + "learning_rate": 4.342078644068437e-07, + "loss": 0.0079, + "num_input_tokens_seen": 36315296, + "step": 172080 + }, + { + "epoch": 18.93124312431243, + "grad_norm": 0.017733603715896606, + "learning_rate": 4.3376260515476384e-07, + "loss": 0.0732, + "num_input_tokens_seen": 36316352, + "step": 172085 + }, + { + "epoch": 18.931793179317932, + "grad_norm": 4.205083847045898, + "learning_rate": 4.333175723179039e-07, + "loss": 0.0226, + "num_input_tokens_seen": 36317376, + "step": 172090 + }, + { + "epoch": 18.932343234323433, + "grad_norm": 0.012572367675602436, + "learning_rate": 4.3287276590037173e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36318432, + "step": 172095 + }, + { + "epoch": 18.932893289328934, + "grad_norm": 0.02937871776521206, + "learning_rate": 4.324281859062612e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36319488, + "step": 172100 + }, + { + "epoch": 18.933443344334435, + "grad_norm": 0.018103796988725662, + "learning_rate": 4.319838323396691e-07, + "loss": 0.0526, + "num_input_tokens_seen": 36320544, + "step": 172105 + }, + { + "epoch": 18.933993399339933, + "grad_norm": 0.1296844780445099, + "learning_rate": 4.3153970520469213e-07, + "loss": 0.0038, + "num_input_tokens_seen": 36321536, + "step": 172110 + }, + { + "epoch": 18.934543454345434, + "grad_norm": 0.10311290621757507, + "learning_rate": 4.3109580450542696e-07, + "loss": 0.1038, + "num_input_tokens_seen": 36322560, + "step": 172115 + }, + { + "epoch": 18.935093509350935, + "grad_norm": 0.00851526577025652, + "learning_rate": 4.3065213024596207e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36323680, + "step": 172120 + }, + { + "epoch": 18.935643564356436, + "grad_norm": 0.003438649233430624, + "learning_rate": 4.302086824303886e-07, + "loss": 0.0778, + "num_input_tokens_seen": 36324736, + "step": 172125 + }, + { + "epoch": 18.936193619361937, + "grad_norm": 0.018451057374477386, + "learning_rate": 4.2976546106278935e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36325760, + "step": 172130 + }, + { + "epoch": 18.936743674367438, + "grad_norm": 0.01468745805323124, + "learning_rate": 4.293224661472528e-07, + "loss": 0.0033, + "num_input_tokens_seen": 36326816, + "step": 172135 + }, + { + "epoch": 18.937293729372936, + "grad_norm": 0.014977275393903255, + "learning_rate": 4.288796976878617e-07, + "loss": 0.011, + "num_input_tokens_seen": 36327904, + "step": 172140 + }, + { + "epoch": 18.937843784378437, + "grad_norm": 0.032753776758909225, + "learning_rate": 4.284371556886962e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36328960, + "step": 172145 + }, + { + "epoch": 18.938393839383938, + "grad_norm": 0.01398247666656971, + "learning_rate": 4.279948401538336e-07, + "loss": 0.0213, + "num_input_tokens_seen": 36329952, + "step": 172150 + }, + { + "epoch": 18.93894389438944, + "grad_norm": 0.01806420460343361, + "learning_rate": 4.2755275108735113e-07, + "loss": 0.0048, + "num_input_tokens_seen": 36330976, + "step": 172155 + }, + { + "epoch": 18.93949394939494, + "grad_norm": 0.01679457537829876, + "learning_rate": 4.2711088849332894e-07, + "loss": 0.0231, + "num_input_tokens_seen": 36332032, + "step": 172160 + }, + { + "epoch": 18.94004400440044, + "grad_norm": 0.31106066703796387, + "learning_rate": 4.2666925237582755e-07, + "loss": 0.0065, + "num_input_tokens_seen": 36333024, + "step": 172165 + }, + { + "epoch": 18.94059405940594, + "grad_norm": 0.02490834705531597, + "learning_rate": 4.262278427389271e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36334016, + "step": 172170 + }, + { + "epoch": 18.94114411441144, + "grad_norm": 0.4114079475402832, + "learning_rate": 4.2578665958669383e-07, + "loss": 0.004, + "num_input_tokens_seen": 36335040, + "step": 172175 + }, + { + "epoch": 18.94169416941694, + "grad_norm": 0.051474954932928085, + "learning_rate": 4.253457029231911e-07, + "loss": 0.004, + "num_input_tokens_seen": 36336064, + "step": 172180 + }, + { + "epoch": 18.942244224422442, + "grad_norm": 0.11994338780641556, + "learning_rate": 4.2490497275248785e-07, + "loss": 0.0431, + "num_input_tokens_seen": 36337088, + "step": 172185 + }, + { + "epoch": 18.942794279427943, + "grad_norm": 2.2622268199920654, + "learning_rate": 4.24464469078642e-07, + "loss": 0.0703, + "num_input_tokens_seen": 36338144, + "step": 172190 + }, + { + "epoch": 18.943344334433444, + "grad_norm": 0.6461362838745117, + "learning_rate": 4.240241919057142e-07, + "loss": 0.0887, + "num_input_tokens_seen": 36339136, + "step": 172195 + }, + { + "epoch": 18.943894389438945, + "grad_norm": 0.049607615917921066, + "learning_rate": 4.235841412377622e-07, + "loss": 0.005, + "num_input_tokens_seen": 36340224, + "step": 172200 + }, + { + "epoch": 18.944444444444443, + "grad_norm": 0.01893085055053234, + "learning_rate": 4.2314431707884406e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36341280, + "step": 172205 + }, + { + "epoch": 18.944994499449944, + "grad_norm": 0.01383508276194334, + "learning_rate": 4.2270471943300913e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36342368, + "step": 172210 + }, + { + "epoch": 18.945544554455445, + "grad_norm": 0.08151920139789581, + "learning_rate": 4.2226534830431543e-07, + "loss": 0.0351, + "num_input_tokens_seen": 36343392, + "step": 172215 + }, + { + "epoch": 18.946094609460946, + "grad_norm": 0.004400178790092468, + "learning_rate": 4.2182620369680403e-07, + "loss": 0.0111, + "num_input_tokens_seen": 36344416, + "step": 172220 + }, + { + "epoch": 18.946644664466447, + "grad_norm": 0.06305316835641861, + "learning_rate": 4.213872856145273e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36345408, + "step": 172225 + }, + { + "epoch": 18.94719471947195, + "grad_norm": 0.2537473440170288, + "learning_rate": 4.209485940615321e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36346464, + "step": 172230 + }, + { + "epoch": 18.94774477447745, + "grad_norm": 1.8290249109268188, + "learning_rate": 4.205101290418595e-07, + "loss": 0.0108, + "num_input_tokens_seen": 36347488, + "step": 172235 + }, + { + "epoch": 18.948294829482947, + "grad_norm": 0.215135395526886, + "learning_rate": 4.2007189055955076e-07, + "loss": 0.0787, + "num_input_tokens_seen": 36348480, + "step": 172240 + }, + { + "epoch": 18.948844884488448, + "grad_norm": 0.021555496379733086, + "learning_rate": 4.196338786186443e-07, + "loss": 0.0061, + "num_input_tokens_seen": 36349504, + "step": 172245 + }, + { + "epoch": 18.94939493949395, + "grad_norm": 0.02401101402938366, + "learning_rate": 4.191960932231759e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36350624, + "step": 172250 + }, + { + "epoch": 18.94994499449945, + "grad_norm": 0.5946118235588074, + "learning_rate": 4.18758534377181e-07, + "loss": 0.0051, + "num_input_tokens_seen": 36351680, + "step": 172255 + }, + { + "epoch": 18.95049504950495, + "grad_norm": 0.13310888409614563, + "learning_rate": 4.183212020846983e-07, + "loss": 0.0912, + "num_input_tokens_seen": 36352768, + "step": 172260 + }, + { + "epoch": 18.951045104510452, + "grad_norm": 3.1209819316864014, + "learning_rate": 4.178840963497521e-07, + "loss": 0.125, + "num_input_tokens_seen": 36353760, + "step": 172265 + }, + { + "epoch": 18.95159515951595, + "grad_norm": 0.4791029095649719, + "learning_rate": 4.1744721717636993e-07, + "loss": 0.0912, + "num_input_tokens_seen": 36354880, + "step": 172270 + }, + { + "epoch": 18.95214521452145, + "grad_norm": 0.060686927288770676, + "learning_rate": 4.1701056456858456e-07, + "loss": 0.0639, + "num_input_tokens_seen": 36355904, + "step": 172275 + }, + { + "epoch": 18.952695269526952, + "grad_norm": 3.74312424659729, + "learning_rate": 4.1657413853041225e-07, + "loss": 0.0241, + "num_input_tokens_seen": 36356928, + "step": 172280 + }, + { + "epoch": 18.953245324532453, + "grad_norm": 0.013007193803787231, + "learning_rate": 4.161379390658832e-07, + "loss": 0.1035, + "num_input_tokens_seen": 36357952, + "step": 172285 + }, + { + "epoch": 18.953795379537954, + "grad_norm": 0.32123470306396484, + "learning_rate": 4.1570196617901346e-07, + "loss": 0.004, + "num_input_tokens_seen": 36359040, + "step": 172290 + }, + { + "epoch": 18.954345434543455, + "grad_norm": 0.031110292300581932, + "learning_rate": 4.152662198738222e-07, + "loss": 0.0742, + "num_input_tokens_seen": 36360096, + "step": 172295 + }, + { + "epoch": 18.954895489548957, + "grad_norm": 0.27428001165390015, + "learning_rate": 4.1483070015432555e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36361216, + "step": 172300 + }, + { + "epoch": 18.955445544554454, + "grad_norm": 1.9627026319503784, + "learning_rate": 4.1439540702453703e-07, + "loss": 0.0382, + "num_input_tokens_seen": 36362336, + "step": 172305 + }, + { + "epoch": 18.955995599559955, + "grad_norm": 0.004973188508301973, + "learning_rate": 4.1396034048846733e-07, + "loss": 0.0053, + "num_input_tokens_seen": 36363424, + "step": 172310 + }, + { + "epoch": 18.956545654565456, + "grad_norm": 0.40390512347221375, + "learning_rate": 4.1352550055012985e-07, + "loss": 0.003, + "num_input_tokens_seen": 36364416, + "step": 172315 + }, + { + "epoch": 18.957095709570957, + "grad_norm": 0.05812597647309303, + "learning_rate": 4.1309088721352694e-07, + "loss": 0.0483, + "num_input_tokens_seen": 36365504, + "step": 172320 + }, + { + "epoch": 18.95764576457646, + "grad_norm": 0.04931327700614929, + "learning_rate": 4.1265650048267214e-07, + "loss": 0.0583, + "num_input_tokens_seen": 36366592, + "step": 172325 + }, + { + "epoch": 18.95819581958196, + "grad_norm": 0.08020215481519699, + "learning_rate": 4.122223403615594e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36367680, + "step": 172330 + }, + { + "epoch": 18.958745874587457, + "grad_norm": 0.13325464725494385, + "learning_rate": 4.117884068541994e-07, + "loss": 0.0107, + "num_input_tokens_seen": 36368768, + "step": 172335 + }, + { + "epoch": 18.959295929592958, + "grad_norm": 1.6895482540130615, + "learning_rate": 4.113546999645834e-07, + "loss": 0.0321, + "num_input_tokens_seen": 36369760, + "step": 172340 + }, + { + "epoch": 18.95984598459846, + "grad_norm": 0.07524780929088593, + "learning_rate": 4.109212196967138e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36370816, + "step": 172345 + }, + { + "epoch": 18.96039603960396, + "grad_norm": 1.7575223445892334, + "learning_rate": 4.104879660545846e-07, + "loss": 0.026, + "num_input_tokens_seen": 36371840, + "step": 172350 + }, + { + "epoch": 18.96094609460946, + "grad_norm": 2.0826826095581055, + "learning_rate": 4.100549390421871e-07, + "loss": 0.0064, + "num_input_tokens_seen": 36372896, + "step": 172355 + }, + { + "epoch": 18.961496149614963, + "grad_norm": 0.0049304659478366375, + "learning_rate": 4.0962213866351517e-07, + "loss": 0.0161, + "num_input_tokens_seen": 36373888, + "step": 172360 + }, + { + "epoch": 18.962046204620464, + "grad_norm": 0.11889534443616867, + "learning_rate": 4.0918956492255746e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36374976, + "step": 172365 + }, + { + "epoch": 18.96259625962596, + "grad_norm": 1.5843729972839355, + "learning_rate": 4.087572178232968e-07, + "loss": 0.0675, + "num_input_tokens_seen": 36376000, + "step": 172370 + }, + { + "epoch": 18.963146314631462, + "grad_norm": 0.2849724590778351, + "learning_rate": 4.083250973697245e-07, + "loss": 0.0203, + "num_input_tokens_seen": 36376992, + "step": 172375 + }, + { + "epoch": 18.963696369636963, + "grad_norm": 0.008111853152513504, + "learning_rate": 4.0789320356581786e-07, + "loss": 0.02, + "num_input_tokens_seen": 36378048, + "step": 172380 + }, + { + "epoch": 18.964246424642464, + "grad_norm": 0.005230766721069813, + "learning_rate": 4.0746153641555705e-07, + "loss": 0.1393, + "num_input_tokens_seen": 36379040, + "step": 172385 + }, + { + "epoch": 18.964796479647966, + "grad_norm": 0.1057671457529068, + "learning_rate": 4.070300959229251e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36380096, + "step": 172390 + }, + { + "epoch": 18.965346534653467, + "grad_norm": 0.023928722366690636, + "learning_rate": 4.065988820918937e-07, + "loss": 0.0023, + "num_input_tokens_seen": 36381120, + "step": 172395 + }, + { + "epoch": 18.965896589658964, + "grad_norm": 1.8119758367538452, + "learning_rate": 4.0616789492644025e-07, + "loss": 0.0621, + "num_input_tokens_seen": 36382208, + "step": 172400 + }, + { + "epoch": 18.966446644664465, + "grad_norm": 0.7894815802574158, + "learning_rate": 4.057371344305394e-07, + "loss": 0.0105, + "num_input_tokens_seen": 36383264, + "step": 172405 + }, + { + "epoch": 18.966996699669966, + "grad_norm": 0.012549800798296928, + "learning_rate": 4.0530660060815464e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36384352, + "step": 172410 + }, + { + "epoch": 18.967546754675467, + "grad_norm": 0.05706928297877312, + "learning_rate": 4.0487629346325773e-07, + "loss": 0.0025, + "num_input_tokens_seen": 36385376, + "step": 172415 + }, + { + "epoch": 18.96809680968097, + "grad_norm": 0.016168590635061264, + "learning_rate": 4.0444621299981225e-07, + "loss": 0.0771, + "num_input_tokens_seen": 36386432, + "step": 172420 + }, + { + "epoch": 18.96864686468647, + "grad_norm": 0.006982684601098299, + "learning_rate": 4.040163592217872e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36387520, + "step": 172425 + }, + { + "epoch": 18.96919691969197, + "grad_norm": 0.0035944427363574505, + "learning_rate": 4.035867321331405e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36388640, + "step": 172430 + }, + { + "epoch": 18.96974697469747, + "grad_norm": 0.028731638565659523, + "learning_rate": 4.031573317378301e-07, + "loss": 0.0036, + "num_input_tokens_seen": 36389664, + "step": 172435 + }, + { + "epoch": 18.97029702970297, + "grad_norm": 0.011998782865703106, + "learning_rate": 4.0272815803981954e-07, + "loss": 0.0063, + "num_input_tokens_seen": 36390752, + "step": 172440 + }, + { + "epoch": 18.97084708470847, + "grad_norm": 0.054727815091609955, + "learning_rate": 4.022992110430557e-07, + "loss": 0.003, + "num_input_tokens_seen": 36391808, + "step": 172445 + }, + { + "epoch": 18.97139713971397, + "grad_norm": 0.04734938591718674, + "learning_rate": 4.0187049075150194e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36392928, + "step": 172450 + }, + { + "epoch": 18.971947194719473, + "grad_norm": 0.011390252970159054, + "learning_rate": 4.014419971690997e-07, + "loss": 0.0032, + "num_input_tokens_seen": 36393984, + "step": 172455 + }, + { + "epoch": 18.972497249724974, + "grad_norm": 0.014020797796547413, + "learning_rate": 4.0101373029980683e-07, + "loss": 0.0669, + "num_input_tokens_seen": 36394976, + "step": 172460 + }, + { + "epoch": 18.97304730473047, + "grad_norm": 0.021410232409834862, + "learning_rate": 4.005856901475646e-07, + "loss": 0.0448, + "num_input_tokens_seen": 36396096, + "step": 172465 + }, + { + "epoch": 18.973597359735972, + "grad_norm": 0.12941676378250122, + "learning_rate": 4.0015787671632e-07, + "loss": 0.0052, + "num_input_tokens_seen": 36397120, + "step": 172470 + }, + { + "epoch": 18.974147414741473, + "grad_norm": 0.01758471690118313, + "learning_rate": 3.997302900100197e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36398272, + "step": 172475 + }, + { + "epoch": 18.974697469746975, + "grad_norm": 0.3386082947254181, + "learning_rate": 3.9930293003259957e-07, + "loss": 0.0091, + "num_input_tokens_seen": 36399296, + "step": 172480 + }, + { + "epoch": 18.975247524752476, + "grad_norm": 0.07149849086999893, + "learning_rate": 3.9887579678799526e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36400288, + "step": 172485 + }, + { + "epoch": 18.975797579757977, + "grad_norm": 0.008645261637866497, + "learning_rate": 3.984488902801509e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36401312, + "step": 172490 + }, + { + "epoch": 18.976347634763478, + "grad_norm": 2.388061285018921, + "learning_rate": 3.9802221051299935e-07, + "loss": 0.0128, + "num_input_tokens_seen": 36402336, + "step": 172495 + }, + { + "epoch": 18.976897689768975, + "grad_norm": 0.05890457332134247, + "learning_rate": 3.9759575749047096e-07, + "loss": 0.0437, + "num_input_tokens_seen": 36403456, + "step": 172500 + }, + { + "epoch": 18.977447744774476, + "grad_norm": 0.07807456701993942, + "learning_rate": 3.971695312164986e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36404544, + "step": 172505 + }, + { + "epoch": 18.977997799779978, + "grad_norm": 0.2216566652059555, + "learning_rate": 3.967435316950069e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36405632, + "step": 172510 + }, + { + "epoch": 18.97854785478548, + "grad_norm": 0.06452693790197372, + "learning_rate": 3.963177589299233e-07, + "loss": 0.0107, + "num_input_tokens_seen": 36406656, + "step": 172515 + }, + { + "epoch": 18.97909790979098, + "grad_norm": 2.3770811557769775, + "learning_rate": 3.958922129251752e-07, + "loss": 0.0084, + "num_input_tokens_seen": 36407680, + "step": 172520 + }, + { + "epoch": 18.97964796479648, + "grad_norm": 0.012288040481507778, + "learning_rate": 3.954668936846817e-07, + "loss": 0.001, + "num_input_tokens_seen": 36408672, + "step": 172525 + }, + { + "epoch": 18.980198019801982, + "grad_norm": 0.054893866181373596, + "learning_rate": 3.9504180121236465e-07, + "loss": 0.0701, + "num_input_tokens_seen": 36409760, + "step": 172530 + }, + { + "epoch": 18.98074807480748, + "grad_norm": 0.014005156233906746, + "learning_rate": 3.946169355121404e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36410816, + "step": 172535 + }, + { + "epoch": 18.98129812981298, + "grad_norm": 0.03437188267707825, + "learning_rate": 3.9419229658792523e-07, + "loss": 0.0662, + "num_input_tokens_seen": 36411840, + "step": 172540 + }, + { + "epoch": 18.98184818481848, + "grad_norm": 0.4162941873073578, + "learning_rate": 3.937678844436271e-07, + "loss": 0.0406, + "num_input_tokens_seen": 36412928, + "step": 172545 + }, + { + "epoch": 18.982398239823983, + "grad_norm": 0.008126620203256607, + "learning_rate": 3.933436990831707e-07, + "loss": 0.0092, + "num_input_tokens_seen": 36413952, + "step": 172550 + }, + { + "epoch": 18.982948294829484, + "grad_norm": 0.006121553014963865, + "learning_rate": 3.929197405104557e-07, + "loss": 0.0303, + "num_input_tokens_seen": 36415008, + "step": 172555 + }, + { + "epoch": 18.983498349834985, + "grad_norm": 0.008185683749616146, + "learning_rate": 3.9249600872938994e-07, + "loss": 0.0033, + "num_input_tokens_seen": 36416032, + "step": 172560 + }, + { + "epoch": 18.984048404840483, + "grad_norm": 0.19147096574306488, + "learning_rate": 3.920725037438816e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36417088, + "step": 172565 + }, + { + "epoch": 18.984598459845984, + "grad_norm": 0.023080207407474518, + "learning_rate": 3.91649225557833e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36418144, + "step": 172570 + }, + { + "epoch": 18.985148514851485, + "grad_norm": 0.24855245649814606, + "learning_rate": 3.9122617417514384e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36419232, + "step": 172575 + }, + { + "epoch": 18.985698569856986, + "grad_norm": 0.05420324578881264, + "learning_rate": 3.908033495997193e-07, + "loss": 0.0541, + "num_input_tokens_seen": 36420256, + "step": 172580 + }, + { + "epoch": 18.986248624862487, + "grad_norm": 0.010984758846461773, + "learning_rate": 3.903807518354452e-07, + "loss": 0.0523, + "num_input_tokens_seen": 36421344, + "step": 172585 + }, + { + "epoch": 18.986798679867988, + "grad_norm": 0.02036166377365589, + "learning_rate": 3.899583808862295e-07, + "loss": 0.0264, + "num_input_tokens_seen": 36422400, + "step": 172590 + }, + { + "epoch": 18.98734873487349, + "grad_norm": 0.056963663548231125, + "learning_rate": 3.895362367559552e-07, + "loss": 0.0636, + "num_input_tokens_seen": 36423488, + "step": 172595 + }, + { + "epoch": 18.987898789878987, + "grad_norm": 0.04193830117583275, + "learning_rate": 3.891143194485164e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36424544, + "step": 172600 + }, + { + "epoch": 18.988448844884488, + "grad_norm": 0.014171333983540535, + "learning_rate": 3.886926289677989e-07, + "loss": 0.0429, + "num_input_tokens_seen": 36425600, + "step": 172605 + }, + { + "epoch": 18.98899889988999, + "grad_norm": 0.015543612651526928, + "learning_rate": 3.88271165317694e-07, + "loss": 0.039, + "num_input_tokens_seen": 36426752, + "step": 172610 + }, + { + "epoch": 18.98954895489549, + "grad_norm": 0.011529541574418545, + "learning_rate": 3.878499285020848e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36427808, + "step": 172615 + }, + { + "epoch": 18.99009900990099, + "grad_norm": 0.05293436348438263, + "learning_rate": 3.8742891852485143e-07, + "loss": 0.0137, + "num_input_tokens_seen": 36428896, + "step": 172620 + }, + { + "epoch": 18.990649064906492, + "grad_norm": 0.42212170362472534, + "learning_rate": 3.870081353898769e-07, + "loss": 0.0405, + "num_input_tokens_seen": 36429984, + "step": 172625 + }, + { + "epoch": 18.99119911991199, + "grad_norm": 0.008944542147219181, + "learning_rate": 3.865875791010359e-07, + "loss": 0.0057, + "num_input_tokens_seen": 36431040, + "step": 172630 + }, + { + "epoch": 18.99174917491749, + "grad_norm": 0.02847261354327202, + "learning_rate": 3.8616724966220596e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36432032, + "step": 172635 + }, + { + "epoch": 18.992299229922992, + "grad_norm": 1.6159812211990356, + "learning_rate": 3.8574714707726444e-07, + "loss": 0.0198, + "num_input_tokens_seen": 36433024, + "step": 172640 + }, + { + "epoch": 18.992849284928493, + "grad_norm": 0.025205660611391068, + "learning_rate": 3.853272713500805e-07, + "loss": 0.1289, + "num_input_tokens_seen": 36434112, + "step": 172645 + }, + { + "epoch": 18.993399339933994, + "grad_norm": 0.007147409487515688, + "learning_rate": 3.849076224845205e-07, + "loss": 0.02, + "num_input_tokens_seen": 36435136, + "step": 172650 + }, + { + "epoch": 18.993949394939495, + "grad_norm": 0.015375670976936817, + "learning_rate": 3.844882004844591e-07, + "loss": 0.1081, + "num_input_tokens_seen": 36436288, + "step": 172655 + }, + { + "epoch": 18.994499449944996, + "grad_norm": 1.2073782682418823, + "learning_rate": 3.8406900535375713e-07, + "loss": 0.0078, + "num_input_tokens_seen": 36437344, + "step": 172660 + }, + { + "epoch": 18.995049504950494, + "grad_norm": 0.0630141943693161, + "learning_rate": 3.8365003709627535e-07, + "loss": 0.0313, + "num_input_tokens_seen": 36438336, + "step": 172665 + }, + { + "epoch": 18.995599559955995, + "grad_norm": 0.007816381752490997, + "learning_rate": 3.832312957158857e-07, + "loss": 0.035, + "num_input_tokens_seen": 36439360, + "step": 172670 + }, + { + "epoch": 18.996149614961496, + "grad_norm": 0.02230878733098507, + "learning_rate": 3.82812781216435e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36440416, + "step": 172675 + }, + { + "epoch": 18.996699669966997, + "grad_norm": 0.00960113201290369, + "learning_rate": 3.823944936017926e-07, + "loss": 0.0796, + "num_input_tokens_seen": 36441408, + "step": 172680 + }, + { + "epoch": 18.997249724972498, + "grad_norm": 0.5493595004081726, + "learning_rate": 3.8197643287580243e-07, + "loss": 0.1083, + "num_input_tokens_seen": 36442496, + "step": 172685 + }, + { + "epoch": 18.997799779978, + "grad_norm": 0.007620587013661861, + "learning_rate": 3.815585990423226e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36443552, + "step": 172690 + }, + { + "epoch": 18.998349834983497, + "grad_norm": 0.09933406114578247, + "learning_rate": 3.8114099210520836e-07, + "loss": 0.0054, + "num_input_tokens_seen": 36444608, + "step": 172695 + }, + { + "epoch": 18.998899889988998, + "grad_norm": 2.264451742172241, + "learning_rate": 3.8072361206829834e-07, + "loss": 0.0404, + "num_input_tokens_seen": 36445632, + "step": 172700 + }, + { + "epoch": 18.9994499449945, + "grad_norm": 0.03539905697107315, + "learning_rate": 3.803064589354505e-07, + "loss": 0.1746, + "num_input_tokens_seen": 36446624, + "step": 172705 + }, + { + "epoch": 19.0, + "grad_norm": 0.01640779711306095, + "learning_rate": 3.7988953271050065e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36447600, + "step": 172710 + }, + { + "epoch": 19.0, + "eval_loss": 0.07838060706853867, + "eval_runtime": 37.0083, + "eval_samples_per_second": 109.165, + "eval_steps_per_second": 27.291, + "num_input_tokens_seen": 36447600, + "step": 172710 + }, + { + "epoch": 19.0005500550055, + "grad_norm": 0.02266954444348812, + "learning_rate": 3.7947283339729853e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36448624, + "step": 172715 + }, + { + "epoch": 19.001100110011002, + "grad_norm": 3.585195779800415, + "learning_rate": 3.7905636099968e-07, + "loss": 0.0692, + "num_input_tokens_seen": 36449712, + "step": 172720 + }, + { + "epoch": 19.001650165016503, + "grad_norm": 0.02267060987651348, + "learning_rate": 3.786401155214836e-07, + "loss": 0.0051, + "num_input_tokens_seen": 36450768, + "step": 172725 + }, + { + "epoch": 19.002200220022, + "grad_norm": 0.08572787791490555, + "learning_rate": 3.782240969665479e-07, + "loss": 0.0055, + "num_input_tokens_seen": 36451792, + "step": 172730 + }, + { + "epoch": 19.002750275027502, + "grad_norm": 0.0020054078195244074, + "learning_rate": 3.7780830533870604e-07, + "loss": 0.0232, + "num_input_tokens_seen": 36452880, + "step": 172735 + }, + { + "epoch": 19.003300330033003, + "grad_norm": 0.020642684772610664, + "learning_rate": 3.77392740641791e-07, + "loss": 0.0032, + "num_input_tokens_seen": 36453904, + "step": 172740 + }, + { + "epoch": 19.003850385038504, + "grad_norm": 0.05278410762548447, + "learning_rate": 3.7697740287962744e-07, + "loss": 0.0083, + "num_input_tokens_seen": 36454928, + "step": 172745 + }, + { + "epoch": 19.004400440044005, + "grad_norm": 0.07976468652486801, + "learning_rate": 3.7656229205605133e-07, + "loss": 0.0025, + "num_input_tokens_seen": 36455920, + "step": 172750 + }, + { + "epoch": 19.004950495049506, + "grad_norm": 0.08199556171894073, + "learning_rate": 3.761474081748873e-07, + "loss": 0.0044, + "num_input_tokens_seen": 36457008, + "step": 172755 + }, + { + "epoch": 19.005500550055004, + "grad_norm": 0.038277771323919296, + "learning_rate": 3.7573275123995445e-07, + "loss": 0.0242, + "num_input_tokens_seen": 36458096, + "step": 172760 + }, + { + "epoch": 19.006050605060505, + "grad_norm": 0.00859750621020794, + "learning_rate": 3.753183212550776e-07, + "loss": 0.0522, + "num_input_tokens_seen": 36459152, + "step": 172765 + }, + { + "epoch": 19.006600660066006, + "grad_norm": 0.0037173929158598185, + "learning_rate": 3.749041182240759e-07, + "loss": 0.0055, + "num_input_tokens_seen": 36460144, + "step": 172770 + }, + { + "epoch": 19.007150715071507, + "grad_norm": 0.04714616760611534, + "learning_rate": 3.7449014215076297e-07, + "loss": 0.0256, + "num_input_tokens_seen": 36461200, + "step": 172775 + }, + { + "epoch": 19.007700770077008, + "grad_norm": 0.05595744028687477, + "learning_rate": 3.740763930389607e-07, + "loss": 0.0641, + "num_input_tokens_seen": 36462288, + "step": 172780 + }, + { + "epoch": 19.00825082508251, + "grad_norm": 0.19019663333892822, + "learning_rate": 3.7366287089248273e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36463408, + "step": 172785 + }, + { + "epoch": 19.00880088008801, + "grad_norm": 0.3311217129230499, + "learning_rate": 3.7324957571513163e-07, + "loss": 0.0165, + "num_input_tokens_seen": 36464432, + "step": 172790 + }, + { + "epoch": 19.009350935093508, + "grad_norm": 0.1066628247499466, + "learning_rate": 3.7283650751072654e-07, + "loss": 0.0038, + "num_input_tokens_seen": 36465552, + "step": 172795 + }, + { + "epoch": 19.00990099009901, + "grad_norm": 0.015218597836792469, + "learning_rate": 3.7242366628306714e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36466608, + "step": 172800 + }, + { + "epoch": 19.01045104510451, + "grad_norm": 0.021740412339568138, + "learning_rate": 3.7201105203596154e-07, + "loss": 0.0341, + "num_input_tokens_seen": 36467632, + "step": 172805 + }, + { + "epoch": 19.01100110011001, + "grad_norm": 0.12303952872753143, + "learning_rate": 3.71598664773215e-07, + "loss": 0.0073, + "num_input_tokens_seen": 36468656, + "step": 172810 + }, + { + "epoch": 19.011551155115512, + "grad_norm": 0.0968225821852684, + "learning_rate": 3.711865044986246e-07, + "loss": 0.0051, + "num_input_tokens_seen": 36469680, + "step": 172815 + }, + { + "epoch": 19.012101210121013, + "grad_norm": 0.024639906361699104, + "learning_rate": 3.707745712159899e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36470736, + "step": 172820 + }, + { + "epoch": 19.01265126512651, + "grad_norm": 0.031641457229852676, + "learning_rate": 3.70362864929108e-07, + "loss": 0.0658, + "num_input_tokens_seen": 36471760, + "step": 172825 + }, + { + "epoch": 19.013201320132012, + "grad_norm": 0.012455211021006107, + "learning_rate": 3.69951385641773e-07, + "loss": 0.091, + "num_input_tokens_seen": 36472848, + "step": 172830 + }, + { + "epoch": 19.013751375137513, + "grad_norm": 0.018068552017211914, + "learning_rate": 3.6954013335777636e-07, + "loss": 0.1057, + "num_input_tokens_seen": 36473904, + "step": 172835 + }, + { + "epoch": 19.014301430143014, + "grad_norm": 0.04789448902010918, + "learning_rate": 3.691291080809095e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36474960, + "step": 172840 + }, + { + "epoch": 19.014851485148515, + "grad_norm": 0.030962269753217697, + "learning_rate": 3.6871830981496104e-07, + "loss": 0.0068, + "num_input_tokens_seen": 36475952, + "step": 172845 + }, + { + "epoch": 19.015401540154016, + "grad_norm": 0.040290120989084244, + "learning_rate": 3.6830773856371683e-07, + "loss": 0.1246, + "num_input_tokens_seen": 36476976, + "step": 172850 + }, + { + "epoch": 19.015951595159517, + "grad_norm": 0.008053654804825783, + "learning_rate": 3.678973943309627e-07, + "loss": 0.0076, + "num_input_tokens_seen": 36478032, + "step": 172855 + }, + { + "epoch": 19.016501650165015, + "grad_norm": 0.3943670392036438, + "learning_rate": 3.6748727712047624e-07, + "loss": 0.1643, + "num_input_tokens_seen": 36479024, + "step": 172860 + }, + { + "epoch": 19.017051705170516, + "grad_norm": 0.009817955084145069, + "learning_rate": 3.6707738693603776e-07, + "loss": 0.0514, + "num_input_tokens_seen": 36480080, + "step": 172865 + }, + { + "epoch": 19.017601760176017, + "grad_norm": 0.34154847264289856, + "learning_rate": 3.6666772378143034e-07, + "loss": 0.0488, + "num_input_tokens_seen": 36481200, + "step": 172870 + }, + { + "epoch": 19.01815181518152, + "grad_norm": 0.03487737104296684, + "learning_rate": 3.662582876604259e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36482160, + "step": 172875 + }, + { + "epoch": 19.01870187018702, + "grad_norm": 0.03977973759174347, + "learning_rate": 3.6584907857679926e-07, + "loss": 0.0037, + "num_input_tokens_seen": 36483248, + "step": 172880 + }, + { + "epoch": 19.01925192519252, + "grad_norm": 0.16041788458824158, + "learning_rate": 3.6544009653431966e-07, + "loss": 0.0347, + "num_input_tokens_seen": 36484336, + "step": 172885 + }, + { + "epoch": 19.019801980198018, + "grad_norm": 2.304048776626587, + "learning_rate": 3.65031341536759e-07, + "loss": 0.1021, + "num_input_tokens_seen": 36485456, + "step": 172890 + }, + { + "epoch": 19.02035203520352, + "grad_norm": 0.0626910924911499, + "learning_rate": 3.6462281358788097e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36486480, + "step": 172895 + }, + { + "epoch": 19.02090209020902, + "grad_norm": 0.10489057749509811, + "learning_rate": 3.6421451269145477e-07, + "loss": 0.0041, + "num_input_tokens_seen": 36487632, + "step": 172900 + }, + { + "epoch": 19.02145214521452, + "grad_norm": 2.1000771522521973, + "learning_rate": 3.6380643885124135e-07, + "loss": 0.0701, + "num_input_tokens_seen": 36488720, + "step": 172905 + }, + { + "epoch": 19.022002200220022, + "grad_norm": 0.12562675774097443, + "learning_rate": 3.6339859207100424e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36489840, + "step": 172910 + }, + { + "epoch": 19.022552255225524, + "grad_norm": 0.6407726407051086, + "learning_rate": 3.629909723544989e-07, + "loss": 0.0437, + "num_input_tokens_seen": 36490896, + "step": 172915 + }, + { + "epoch": 19.023102310231025, + "grad_norm": 0.19172847270965576, + "learning_rate": 3.625835797054833e-07, + "loss": 0.0788, + "num_input_tokens_seen": 36492016, + "step": 172920 + }, + { + "epoch": 19.023652365236522, + "grad_norm": 0.20335322618484497, + "learning_rate": 3.6217641412771287e-07, + "loss": 0.007, + "num_input_tokens_seen": 36493040, + "step": 172925 + }, + { + "epoch": 19.024202420242023, + "grad_norm": 1.7986034154891968, + "learning_rate": 3.617694756249429e-07, + "loss": 0.0113, + "num_input_tokens_seen": 36494064, + "step": 172930 + }, + { + "epoch": 19.024752475247524, + "grad_norm": 0.021851517260074615, + "learning_rate": 3.613627642009204e-07, + "loss": 0.1038, + "num_input_tokens_seen": 36495184, + "step": 172935 + }, + { + "epoch": 19.025302530253025, + "grad_norm": 0.020047634840011597, + "learning_rate": 3.6095627985938965e-07, + "loss": 0.0349, + "num_input_tokens_seen": 36496208, + "step": 172940 + }, + { + "epoch": 19.025852585258527, + "grad_norm": 0.027390358969569206, + "learning_rate": 3.605500226041086e-07, + "loss": 0.05, + "num_input_tokens_seen": 36497264, + "step": 172945 + }, + { + "epoch": 19.026402640264028, + "grad_norm": 0.030005397275090218, + "learning_rate": 3.601439924388106e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36498288, + "step": 172950 + }, + { + "epoch": 19.02695269526953, + "grad_norm": 0.3025374710559845, + "learning_rate": 3.597381893672425e-07, + "loss": 0.0062, + "num_input_tokens_seen": 36499344, + "step": 172955 + }, + { + "epoch": 19.027502750275026, + "grad_norm": 0.5248759388923645, + "learning_rate": 3.593326133931457e-07, + "loss": 0.0099, + "num_input_tokens_seen": 36500400, + "step": 172960 + }, + { + "epoch": 19.028052805280527, + "grad_norm": 0.26804283261299133, + "learning_rate": 3.589272645202535e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36501424, + "step": 172965 + }, + { + "epoch": 19.02860286028603, + "grad_norm": 0.29471924901008606, + "learning_rate": 3.5852214275230724e-07, + "loss": 0.0044, + "num_input_tokens_seen": 36502480, + "step": 172970 + }, + { + "epoch": 19.02915291529153, + "grad_norm": 0.027561839669942856, + "learning_rate": 3.5811724809303725e-07, + "loss": 0.0184, + "num_input_tokens_seen": 36503568, + "step": 172975 + }, + { + "epoch": 19.02970297029703, + "grad_norm": 0.015473687089979649, + "learning_rate": 3.577125805461767e-07, + "loss": 0.0135, + "num_input_tokens_seen": 36504592, + "step": 172980 + }, + { + "epoch": 19.03025302530253, + "grad_norm": 0.08198602497577667, + "learning_rate": 3.5730814011545323e-07, + "loss": 0.061, + "num_input_tokens_seen": 36505680, + "step": 172985 + }, + { + "epoch": 19.03080308030803, + "grad_norm": 0.10624606162309647, + "learning_rate": 3.5690392680459705e-07, + "loss": 0.1199, + "num_input_tokens_seen": 36506704, + "step": 172990 + }, + { + "epoch": 19.03135313531353, + "grad_norm": 0.02666638046503067, + "learning_rate": 3.5649994061733306e-07, + "loss": 0.0851, + "num_input_tokens_seen": 36507760, + "step": 172995 + }, + { + "epoch": 19.03190319031903, + "grad_norm": 0.32667651772499084, + "learning_rate": 3.56096181557386e-07, + "loss": 0.0738, + "num_input_tokens_seen": 36508912, + "step": 173000 + }, + { + "epoch": 19.032453245324533, + "grad_norm": 1.3873356580734253, + "learning_rate": 3.5569264962847235e-07, + "loss": 0.0693, + "num_input_tokens_seen": 36509968, + "step": 173005 + }, + { + "epoch": 19.033003300330034, + "grad_norm": 0.679017961025238, + "learning_rate": 3.552893448343142e-07, + "loss": 0.0685, + "num_input_tokens_seen": 36511056, + "step": 173010 + }, + { + "epoch": 19.033553355335535, + "grad_norm": 0.03916581720113754, + "learning_rate": 3.5488626717862795e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36512144, + "step": 173015 + }, + { + "epoch": 19.034103410341036, + "grad_norm": 0.010381815023720264, + "learning_rate": 3.5448341666513007e-07, + "loss": 0.0512, + "num_input_tokens_seen": 36513200, + "step": 173020 + }, + { + "epoch": 19.034653465346533, + "grad_norm": 0.021174784749746323, + "learning_rate": 3.540807932975343e-07, + "loss": 0.0023, + "num_input_tokens_seen": 36514288, + "step": 173025 + }, + { + "epoch": 19.035203520352034, + "grad_norm": 0.009035116992890835, + "learning_rate": 3.5367839707954596e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36515408, + "step": 173030 + }, + { + "epoch": 19.035753575357536, + "grad_norm": 0.61809903383255, + "learning_rate": 3.532762280148788e-07, + "loss": 0.0177, + "num_input_tokens_seen": 36516464, + "step": 173035 + }, + { + "epoch": 19.036303630363037, + "grad_norm": 0.5211594104766846, + "learning_rate": 3.528742861072382e-07, + "loss": 0.01, + "num_input_tokens_seen": 36517552, + "step": 173040 + }, + { + "epoch": 19.036853685368538, + "grad_norm": 0.052795857191085815, + "learning_rate": 3.524725713603294e-07, + "loss": 0.0053, + "num_input_tokens_seen": 36518544, + "step": 173045 + }, + { + "epoch": 19.03740374037404, + "grad_norm": 0.13916048407554626, + "learning_rate": 3.5207108377785234e-07, + "loss": 0.0188, + "num_input_tokens_seen": 36519568, + "step": 173050 + }, + { + "epoch": 19.037953795379536, + "grad_norm": 2.9380412101745605, + "learning_rate": 3.516698233635096e-07, + "loss": 0.0887, + "num_input_tokens_seen": 36520624, + "step": 173055 + }, + { + "epoch": 19.038503850385037, + "grad_norm": 0.008101262152194977, + "learning_rate": 3.51268790121001e-07, + "loss": 0.0119, + "num_input_tokens_seen": 36521680, + "step": 173060 + }, + { + "epoch": 19.03905390539054, + "grad_norm": 0.059839095920324326, + "learning_rate": 3.508679840540152e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36522672, + "step": 173065 + }, + { + "epoch": 19.03960396039604, + "grad_norm": 0.007849576883018017, + "learning_rate": 3.504674051662549e-07, + "loss": 0.0046, + "num_input_tokens_seen": 36523696, + "step": 173070 + }, + { + "epoch": 19.04015401540154, + "grad_norm": 0.22430691123008728, + "learning_rate": 3.5006705346140867e-07, + "loss": 0.0039, + "num_input_tokens_seen": 36524848, + "step": 173075 + }, + { + "epoch": 19.040704070407042, + "grad_norm": 0.037771519273519516, + "learning_rate": 3.496669289431653e-07, + "loss": 0.004, + "num_input_tokens_seen": 36525872, + "step": 173080 + }, + { + "epoch": 19.041254125412543, + "grad_norm": 0.5167792439460754, + "learning_rate": 3.4926703161521356e-07, + "loss": 0.0045, + "num_input_tokens_seen": 36526928, + "step": 173085 + }, + { + "epoch": 19.04180418041804, + "grad_norm": 0.03942382335662842, + "learning_rate": 3.4886736148123656e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36527952, + "step": 173090 + }, + { + "epoch": 19.04235423542354, + "grad_norm": 0.012760905548930168, + "learning_rate": 3.48467918544923e-07, + "loss": 0.0006, + "num_input_tokens_seen": 36528976, + "step": 173095 + }, + { + "epoch": 19.042904290429043, + "grad_norm": 0.04101722314953804, + "learning_rate": 3.480687028099533e-07, + "loss": 0.0554, + "num_input_tokens_seen": 36530000, + "step": 173100 + }, + { + "epoch": 19.043454345434544, + "grad_norm": 0.04882259666919708, + "learning_rate": 3.4766971428000226e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36531088, + "step": 173105 + }, + { + "epoch": 19.044004400440045, + "grad_norm": 0.012316642329096794, + "learning_rate": 3.472709529587531e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36532112, + "step": 173110 + }, + { + "epoch": 19.044554455445546, + "grad_norm": 0.9817726612091064, + "learning_rate": 3.468724188498751e-07, + "loss": 0.01, + "num_input_tokens_seen": 36533136, + "step": 173115 + }, + { + "epoch": 19.045104510451043, + "grad_norm": 0.4164978861808777, + "learning_rate": 3.4647411195704584e-07, + "loss": 0.0049, + "num_input_tokens_seen": 36534224, + "step": 173120 + }, + { + "epoch": 19.045654565456545, + "grad_norm": 0.07175278663635254, + "learning_rate": 3.4607603228393184e-07, + "loss": 0.0038, + "num_input_tokens_seen": 36535280, + "step": 173125 + }, + { + "epoch": 19.046204620462046, + "grad_norm": 0.044723477214574814, + "learning_rate": 3.4567817983420794e-07, + "loss": 0.0204, + "num_input_tokens_seen": 36536368, + "step": 173130 + }, + { + "epoch": 19.046754675467547, + "grad_norm": 0.05230758339166641, + "learning_rate": 3.4528055461153786e-07, + "loss": 0.0202, + "num_input_tokens_seen": 36537424, + "step": 173135 + }, + { + "epoch": 19.047304730473048, + "grad_norm": 0.009662849828600883, + "learning_rate": 3.4488315661958815e-07, + "loss": 0.0095, + "num_input_tokens_seen": 36538512, + "step": 173140 + }, + { + "epoch": 19.04785478547855, + "grad_norm": 0.4800417721271515, + "learning_rate": 3.44485985862017e-07, + "loss": 0.0145, + "num_input_tokens_seen": 36539632, + "step": 173145 + }, + { + "epoch": 19.04840484048405, + "grad_norm": 0.0430745854973793, + "learning_rate": 3.4408904234248806e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36540720, + "step": 173150 + }, + { + "epoch": 19.048954895489548, + "grad_norm": 0.03274985030293465, + "learning_rate": 3.4369232606465683e-07, + "loss": 0.0096, + "num_input_tokens_seen": 36541808, + "step": 173155 + }, + { + "epoch": 19.04950495049505, + "grad_norm": 0.08288383483886719, + "learning_rate": 3.4329583703218704e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36542864, + "step": 173160 + }, + { + "epoch": 19.05005500550055, + "grad_norm": 0.01709287241101265, + "learning_rate": 3.4289957524872575e-07, + "loss": 0.0042, + "num_input_tokens_seen": 36543888, + "step": 173165 + }, + { + "epoch": 19.05060506050605, + "grad_norm": 0.25484368205070496, + "learning_rate": 3.425035407179283e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36544912, + "step": 173170 + }, + { + "epoch": 19.051155115511552, + "grad_norm": 0.04600309208035469, + "learning_rate": 3.4210773344344194e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36546032, + "step": 173175 + }, + { + "epoch": 19.051705170517053, + "grad_norm": 0.014710950665175915, + "learning_rate": 3.4171215342891915e-07, + "loss": 0.0116, + "num_input_tokens_seen": 36547024, + "step": 173180 + }, + { + "epoch": 19.05225522552255, + "grad_norm": 2.7342891693115234, + "learning_rate": 3.4131680067799876e-07, + "loss": 0.112, + "num_input_tokens_seen": 36548112, + "step": 173185 + }, + { + "epoch": 19.05280528052805, + "grad_norm": 1.872236967086792, + "learning_rate": 3.409216751943334e-07, + "loss": 0.0068, + "num_input_tokens_seen": 36549200, + "step": 173190 + }, + { + "epoch": 19.053355335533553, + "grad_norm": 0.038996025919914246, + "learning_rate": 3.40526776981559e-07, + "loss": 0.0055, + "num_input_tokens_seen": 36550320, + "step": 173195 + }, + { + "epoch": 19.053905390539054, + "grad_norm": 0.017255501821637154, + "learning_rate": 3.4013210604331714e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36551408, + "step": 173200 + }, + { + "epoch": 19.054455445544555, + "grad_norm": 2.0729620456695557, + "learning_rate": 3.397376623832438e-07, + "loss": 0.0351, + "num_input_tokens_seen": 36552432, + "step": 173205 + }, + { + "epoch": 19.055005500550056, + "grad_norm": 0.07900029420852661, + "learning_rate": 3.393434460049749e-07, + "loss": 0.0143, + "num_input_tokens_seen": 36553456, + "step": 173210 + }, + { + "epoch": 19.055555555555557, + "grad_norm": 0.04650816693902016, + "learning_rate": 3.3894945691214653e-07, + "loss": 0.0572, + "num_input_tokens_seen": 36554480, + "step": 173215 + }, + { + "epoch": 19.056105610561055, + "grad_norm": 0.1971922516822815, + "learning_rate": 3.3855569510838627e-07, + "loss": 0.1165, + "num_input_tokens_seen": 36555568, + "step": 173220 + }, + { + "epoch": 19.056655665566556, + "grad_norm": 3.74603533744812, + "learning_rate": 3.381621605973245e-07, + "loss": 0.021, + "num_input_tokens_seen": 36556656, + "step": 173225 + }, + { + "epoch": 19.057205720572057, + "grad_norm": 0.027300771325826645, + "learning_rate": 3.3776885338258626e-07, + "loss": 0.0551, + "num_input_tokens_seen": 36557744, + "step": 173230 + }, + { + "epoch": 19.057755775577558, + "grad_norm": 0.1381233185529709, + "learning_rate": 3.373757734678018e-07, + "loss": 0.0143, + "num_input_tokens_seen": 36558800, + "step": 173235 + }, + { + "epoch": 19.05830583058306, + "grad_norm": 0.06474245339632034, + "learning_rate": 3.369829208565878e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36559888, + "step": 173240 + }, + { + "epoch": 19.05885588558856, + "grad_norm": 0.16968008875846863, + "learning_rate": 3.36590295552569e-07, + "loss": 0.0207, + "num_input_tokens_seen": 36560944, + "step": 173245 + }, + { + "epoch": 19.059405940594058, + "grad_norm": 0.028933608904480934, + "learning_rate": 3.3619789755936214e-07, + "loss": 0.0052, + "num_input_tokens_seen": 36561968, + "step": 173250 + }, + { + "epoch": 19.05995599559956, + "grad_norm": 0.9435202479362488, + "learning_rate": 3.3580572688058365e-07, + "loss": 0.0075, + "num_input_tokens_seen": 36563056, + "step": 173255 + }, + { + "epoch": 19.06050605060506, + "grad_norm": 0.7890651226043701, + "learning_rate": 3.354137835198501e-07, + "loss": 0.0101, + "num_input_tokens_seen": 36564080, + "step": 173260 + }, + { + "epoch": 19.06105610561056, + "grad_norm": 0.00865618884563446, + "learning_rate": 3.3502206748077524e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36565104, + "step": 173265 + }, + { + "epoch": 19.061606160616062, + "grad_norm": 0.09004504978656769, + "learning_rate": 3.34630578766959e-07, + "loss": 0.0055, + "num_input_tokens_seen": 36566160, + "step": 173270 + }, + { + "epoch": 19.062156215621563, + "grad_norm": 2.3686676025390625, + "learning_rate": 3.342393173820235e-07, + "loss": 0.0699, + "num_input_tokens_seen": 36567184, + "step": 173275 + }, + { + "epoch": 19.062706270627064, + "grad_norm": 2.6576764583587646, + "learning_rate": 3.3384828332956586e-07, + "loss": 0.0831, + "num_input_tokens_seen": 36568272, + "step": 173280 + }, + { + "epoch": 19.063256325632562, + "grad_norm": 0.004373032134026289, + "learning_rate": 3.334574766131943e-07, + "loss": 0.0067, + "num_input_tokens_seen": 36569296, + "step": 173285 + }, + { + "epoch": 19.063806380638063, + "grad_norm": 0.7033944725990295, + "learning_rate": 3.330668972365086e-07, + "loss": 0.0679, + "num_input_tokens_seen": 36570256, + "step": 173290 + }, + { + "epoch": 19.064356435643564, + "grad_norm": 0.023664580658078194, + "learning_rate": 3.3267654520310886e-07, + "loss": 0.1341, + "num_input_tokens_seen": 36571312, + "step": 173295 + }, + { + "epoch": 19.064906490649065, + "grad_norm": 0.03538895025849342, + "learning_rate": 3.322864205165921e-07, + "loss": 0.0436, + "num_input_tokens_seen": 36572432, + "step": 173300 + }, + { + "epoch": 19.065456545654566, + "grad_norm": 0.056502144783735275, + "learning_rate": 3.3189652318055263e-07, + "loss": 0.0188, + "num_input_tokens_seen": 36573424, + "step": 173305 + }, + { + "epoch": 19.066006600660067, + "grad_norm": 0.05755237862467766, + "learning_rate": 3.315068531985904e-07, + "loss": 0.0025, + "num_input_tokens_seen": 36574480, + "step": 173310 + }, + { + "epoch": 19.066556655665565, + "grad_norm": 0.00879280362278223, + "learning_rate": 3.3111741057429145e-07, + "loss": 0.0494, + "num_input_tokens_seen": 36575504, + "step": 173315 + }, + { + "epoch": 19.067106710671066, + "grad_norm": 0.042043495923280716, + "learning_rate": 3.3072819531124455e-07, + "loss": 0.0271, + "num_input_tokens_seen": 36576560, + "step": 173320 + }, + { + "epoch": 19.067656765676567, + "grad_norm": 0.07931994646787643, + "learning_rate": 3.303392074130385e-07, + "loss": 0.0111, + "num_input_tokens_seen": 36577616, + "step": 173325 + }, + { + "epoch": 19.068206820682068, + "grad_norm": 0.015963410958647728, + "learning_rate": 3.299504468832565e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36578704, + "step": 173330 + }, + { + "epoch": 19.06875687568757, + "grad_norm": 0.04399330914020538, + "learning_rate": 3.2956191372548737e-07, + "loss": 0.1447, + "num_input_tokens_seen": 36579760, + "step": 173335 + }, + { + "epoch": 19.06930693069307, + "grad_norm": 0.05630791559815407, + "learning_rate": 3.291736079433061e-07, + "loss": 0.003, + "num_input_tokens_seen": 36580784, + "step": 173340 + }, + { + "epoch": 19.06985698569857, + "grad_norm": 0.011360571719706059, + "learning_rate": 3.2878552954029304e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36581808, + "step": 173345 + }, + { + "epoch": 19.07040704070407, + "grad_norm": 0.07125137746334076, + "learning_rate": 3.283976785200288e-07, + "loss": 0.0327, + "num_input_tokens_seen": 36582832, + "step": 173350 + }, + { + "epoch": 19.07095709570957, + "grad_norm": 1.333513855934143, + "learning_rate": 3.280100548860798e-07, + "loss": 0.036, + "num_input_tokens_seen": 36583920, + "step": 173355 + }, + { + "epoch": 19.07150715071507, + "grad_norm": 0.04901512339711189, + "learning_rate": 3.276226586420239e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36584912, + "step": 173360 + }, + { + "epoch": 19.072057205720572, + "grad_norm": 0.12554137408733368, + "learning_rate": 3.2723548979143313e-07, + "loss": 0.1331, + "num_input_tokens_seen": 36586000, + "step": 173365 + }, + { + "epoch": 19.072607260726073, + "grad_norm": 0.04799801856279373, + "learning_rate": 3.2684854833787416e-07, + "loss": 0.0891, + "num_input_tokens_seen": 36587024, + "step": 173370 + }, + { + "epoch": 19.073157315731574, + "grad_norm": 0.009423930197954178, + "learning_rate": 3.264618342849107e-07, + "loss": 0.0098, + "num_input_tokens_seen": 36588080, + "step": 173375 + }, + { + "epoch": 19.073707370737075, + "grad_norm": 0.049683887511491776, + "learning_rate": 3.2607534763611224e-07, + "loss": 0.0201, + "num_input_tokens_seen": 36589168, + "step": 173380 + }, + { + "epoch": 19.074257425742573, + "grad_norm": 0.048685960471630096, + "learning_rate": 3.256890883950342e-07, + "loss": 0.0143, + "num_input_tokens_seen": 36590224, + "step": 173385 + }, + { + "epoch": 19.074807480748074, + "grad_norm": 8.151731491088867, + "learning_rate": 3.253030565652404e-07, + "loss": 0.0974, + "num_input_tokens_seen": 36591312, + "step": 173390 + }, + { + "epoch": 19.075357535753575, + "grad_norm": 0.0517243854701519, + "learning_rate": 3.249172521502891e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36592368, + "step": 173395 + }, + { + "epoch": 19.075907590759076, + "grad_norm": 0.04041668027639389, + "learning_rate": 3.2453167515373584e-07, + "loss": 0.0039, + "num_input_tokens_seen": 36593392, + "step": 173400 + }, + { + "epoch": 19.076457645764577, + "grad_norm": 0.3761308193206787, + "learning_rate": 3.2414632557913327e-07, + "loss": 0.0037, + "num_input_tokens_seen": 36594448, + "step": 173405 + }, + { + "epoch": 19.07700770077008, + "grad_norm": 0.12116552144289017, + "learning_rate": 3.237612034300314e-07, + "loss": 0.0037, + "num_input_tokens_seen": 36595504, + "step": 173410 + }, + { + "epoch": 19.077557755775576, + "grad_norm": 0.02705220878124237, + "learning_rate": 3.2337630870998283e-07, + "loss": 0.0917, + "num_input_tokens_seen": 36596592, + "step": 173415 + }, + { + "epoch": 19.078107810781077, + "grad_norm": 0.11841192096471786, + "learning_rate": 3.2299164142253204e-07, + "loss": 0.1122, + "num_input_tokens_seen": 36597648, + "step": 173420 + }, + { + "epoch": 19.078657865786578, + "grad_norm": 0.06443856656551361, + "learning_rate": 3.226072015712317e-07, + "loss": 0.0542, + "num_input_tokens_seen": 36598704, + "step": 173425 + }, + { + "epoch": 19.07920792079208, + "grad_norm": 0.02862650342285633, + "learning_rate": 3.222229891596151e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36599760, + "step": 173430 + }, + { + "epoch": 19.07975797579758, + "grad_norm": 1.9396332502365112, + "learning_rate": 3.2183900419122946e-07, + "loss": 0.039, + "num_input_tokens_seen": 36600784, + "step": 173435 + }, + { + "epoch": 19.08030803080308, + "grad_norm": 0.041681550443172455, + "learning_rate": 3.214552466696108e-07, + "loss": 0.0057, + "num_input_tokens_seen": 36601840, + "step": 173440 + }, + { + "epoch": 19.080858085808583, + "grad_norm": 0.03597879037261009, + "learning_rate": 3.2107171659829794e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36602896, + "step": 173445 + }, + { + "epoch": 19.08140814081408, + "grad_norm": 0.12318696081638336, + "learning_rate": 3.2068841398082414e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36603920, + "step": 173450 + }, + { + "epoch": 19.08195819581958, + "grad_norm": 0.01919793151319027, + "learning_rate": 3.203053388207228e-07, + "loss": 0.1408, + "num_input_tokens_seen": 36605008, + "step": 173455 + }, + { + "epoch": 19.082508250825082, + "grad_norm": 4.060215473175049, + "learning_rate": 3.199224911215243e-07, + "loss": 0.0892, + "num_input_tokens_seen": 36606032, + "step": 173460 + }, + { + "epoch": 19.083058305830583, + "grad_norm": 0.037104420363903046, + "learning_rate": 3.195398708867592e-07, + "loss": 0.067, + "num_input_tokens_seen": 36607056, + "step": 173465 + }, + { + "epoch": 19.083608360836084, + "grad_norm": 2.340524911880493, + "learning_rate": 3.1915747811995243e-07, + "loss": 0.0911, + "num_input_tokens_seen": 36608144, + "step": 173470 + }, + { + "epoch": 19.084158415841586, + "grad_norm": 0.007877102121710777, + "learning_rate": 3.1877531282462627e-07, + "loss": 0.163, + "num_input_tokens_seen": 36609264, + "step": 173475 + }, + { + "epoch": 19.084708470847083, + "grad_norm": 0.0696810707449913, + "learning_rate": 3.1839337500430557e-07, + "loss": 0.0703, + "num_input_tokens_seen": 36610320, + "step": 173480 + }, + { + "epoch": 19.085258525852584, + "grad_norm": 0.03503468260169029, + "learning_rate": 3.1801166466250976e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36611344, + "step": 173485 + }, + { + "epoch": 19.085808580858085, + "grad_norm": 0.055374544113874435, + "learning_rate": 3.1763018180275826e-07, + "loss": 0.0461, + "num_input_tokens_seen": 36612400, + "step": 173490 + }, + { + "epoch": 19.086358635863586, + "grad_norm": 0.04841437190771103, + "learning_rate": 3.17248926428565e-07, + "loss": 0.0032, + "num_input_tokens_seen": 36613488, + "step": 173495 + }, + { + "epoch": 19.086908690869087, + "grad_norm": 0.2523476779460907, + "learning_rate": 3.168678985434437e-07, + "loss": 0.0166, + "num_input_tokens_seen": 36614672, + "step": 173500 + }, + { + "epoch": 19.08745874587459, + "grad_norm": 0.029929649084806442, + "learning_rate": 3.1648709815090824e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36615728, + "step": 173505 + }, + { + "epoch": 19.08800880088009, + "grad_norm": 0.032898638397455215, + "learning_rate": 3.1610652525446425e-07, + "loss": 0.0066, + "num_input_tokens_seen": 36616816, + "step": 173510 + }, + { + "epoch": 19.088558855885587, + "grad_norm": 2.6537625789642334, + "learning_rate": 3.1572617985762276e-07, + "loss": 0.0459, + "num_input_tokens_seen": 36617872, + "step": 173515 + }, + { + "epoch": 19.08910891089109, + "grad_norm": 0.01048421673476696, + "learning_rate": 3.153460619638893e-07, + "loss": 0.012, + "num_input_tokens_seen": 36618896, + "step": 173520 + }, + { + "epoch": 19.08965896589659, + "grad_norm": 0.3943819999694824, + "learning_rate": 3.1496617157676655e-07, + "loss": 0.0059, + "num_input_tokens_seen": 36619952, + "step": 173525 + }, + { + "epoch": 19.09020902090209, + "grad_norm": 0.004594702739268541, + "learning_rate": 3.145865086997546e-07, + "loss": 0.0612, + "num_input_tokens_seen": 36621104, + "step": 173530 + }, + { + "epoch": 19.09075907590759, + "grad_norm": 0.007554369978606701, + "learning_rate": 3.1420707333635337e-07, + "loss": 0.0184, + "num_input_tokens_seen": 36622096, + "step": 173535 + }, + { + "epoch": 19.091309130913093, + "grad_norm": 0.023837551474571228, + "learning_rate": 3.138278654900628e-07, + "loss": 0.0048, + "num_input_tokens_seen": 36623248, + "step": 173540 + }, + { + "epoch": 19.09185918591859, + "grad_norm": 0.017523542046546936, + "learning_rate": 3.1344888516437465e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36624336, + "step": 173545 + }, + { + "epoch": 19.09240924092409, + "grad_norm": 0.018693357706069946, + "learning_rate": 3.130701323627805e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36625360, + "step": 173550 + }, + { + "epoch": 19.092959295929592, + "grad_norm": 0.12923204898834229, + "learning_rate": 3.1269160708877475e-07, + "loss": 0.045, + "num_input_tokens_seen": 36626416, + "step": 173555 + }, + { + "epoch": 19.093509350935093, + "grad_norm": 0.10857332497835159, + "learning_rate": 3.123133093458408e-07, + "loss": 0.1528, + "num_input_tokens_seen": 36627408, + "step": 173560 + }, + { + "epoch": 19.094059405940595, + "grad_norm": 0.01385605987161398, + "learning_rate": 3.1193523913747023e-07, + "loss": 0.0441, + "num_input_tokens_seen": 36628464, + "step": 173565 + }, + { + "epoch": 19.094609460946096, + "grad_norm": 0.03216632455587387, + "learning_rate": 3.115573964671492e-07, + "loss": 0.0637, + "num_input_tokens_seen": 36629488, + "step": 173570 + }, + { + "epoch": 19.095159515951597, + "grad_norm": 0.03644147515296936, + "learning_rate": 3.1117978133835267e-07, + "loss": 0.0053, + "num_input_tokens_seen": 36630544, + "step": 173575 + }, + { + "epoch": 19.095709570957094, + "grad_norm": 1.1528451442718506, + "learning_rate": 3.1080239375456956e-07, + "loss": 0.0092, + "num_input_tokens_seen": 36631600, + "step": 173580 + }, + { + "epoch": 19.096259625962595, + "grad_norm": 0.08395534008741379, + "learning_rate": 3.104252337192692e-07, + "loss": 0.0084, + "num_input_tokens_seen": 36632656, + "step": 173585 + }, + { + "epoch": 19.096809680968097, + "grad_norm": 0.32924413681030273, + "learning_rate": 3.100483012359351e-07, + "loss": 0.0114, + "num_input_tokens_seen": 36633680, + "step": 173590 + }, + { + "epoch": 19.097359735973598, + "grad_norm": 0.009541484527289867, + "learning_rate": 3.096715963080393e-07, + "loss": 0.0201, + "num_input_tokens_seen": 36634768, + "step": 173595 + }, + { + "epoch": 19.0979097909791, + "grad_norm": 0.034292615950107574, + "learning_rate": 3.092951189390514e-07, + "loss": 0.0115, + "num_input_tokens_seen": 36635856, + "step": 173600 + }, + { + "epoch": 19.0984598459846, + "grad_norm": 0.09096381068229675, + "learning_rate": 3.089188691324435e-07, + "loss": 0.0539, + "num_input_tokens_seen": 36636944, + "step": 173605 + }, + { + "epoch": 19.099009900990097, + "grad_norm": 0.017261989414691925, + "learning_rate": 3.0854284689167956e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36638032, + "step": 173610 + }, + { + "epoch": 19.0995599559956, + "grad_norm": 0.08691039681434631, + "learning_rate": 3.081670522202318e-07, + "loss": 0.1236, + "num_input_tokens_seen": 36639152, + "step": 173615 + }, + { + "epoch": 19.1001100110011, + "grad_norm": 0.5034329891204834, + "learning_rate": 3.077914851215585e-07, + "loss": 0.0119, + "num_input_tokens_seen": 36640240, + "step": 173620 + }, + { + "epoch": 19.1006600660066, + "grad_norm": 0.009804471395909786, + "learning_rate": 3.074161455991209e-07, + "loss": 0.0046, + "num_input_tokens_seen": 36641232, + "step": 173625 + }, + { + "epoch": 19.1012101210121, + "grad_norm": 0.003705881303176284, + "learning_rate": 3.0704103365638284e-07, + "loss": 0.0032, + "num_input_tokens_seen": 36642288, + "step": 173630 + }, + { + "epoch": 19.101760176017603, + "grad_norm": 0.016467714682221413, + "learning_rate": 3.0666614929679705e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36643344, + "step": 173635 + }, + { + "epoch": 19.102310231023104, + "grad_norm": 0.007293447386473417, + "learning_rate": 3.0629149252382194e-07, + "loss": 0.1895, + "num_input_tokens_seen": 36644368, + "step": 173640 + }, + { + "epoch": 19.1028602860286, + "grad_norm": 0.015643931925296783, + "learning_rate": 3.0591706334090473e-07, + "loss": 0.0042, + "num_input_tokens_seen": 36645424, + "step": 173645 + }, + { + "epoch": 19.103410341034103, + "grad_norm": 0.8355017900466919, + "learning_rate": 3.055428617515038e-07, + "loss": 0.011, + "num_input_tokens_seen": 36646480, + "step": 173650 + }, + { + "epoch": 19.103960396039604, + "grad_norm": 0.015179958194494247, + "learning_rate": 3.0516888775906636e-07, + "loss": 0.0783, + "num_input_tokens_seen": 36647440, + "step": 173655 + }, + { + "epoch": 19.104510451045105, + "grad_norm": 0.06618718802928925, + "learning_rate": 3.047951413670341e-07, + "loss": 0.0123, + "num_input_tokens_seen": 36648432, + "step": 173660 + }, + { + "epoch": 19.105060506050606, + "grad_norm": 0.07392679154872894, + "learning_rate": 3.044216225788571e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36649520, + "step": 173665 + }, + { + "epoch": 19.105610561056107, + "grad_norm": 0.020636990666389465, + "learning_rate": 3.0404833139797695e-07, + "loss": 0.0027, + "num_input_tokens_seen": 36650544, + "step": 173670 + }, + { + "epoch": 19.106160616061604, + "grad_norm": 1.633898377418518, + "learning_rate": 3.0367526782783263e-07, + "loss": 0.0234, + "num_input_tokens_seen": 36651632, + "step": 173675 + }, + { + "epoch": 19.106710671067106, + "grad_norm": 0.040606413036584854, + "learning_rate": 3.033024318718602e-07, + "loss": 0.0038, + "num_input_tokens_seen": 36652688, + "step": 173680 + }, + { + "epoch": 19.107260726072607, + "grad_norm": 1.5622652769088745, + "learning_rate": 3.0292982353349875e-07, + "loss": 0.0494, + "num_input_tokens_seen": 36653744, + "step": 173685 + }, + { + "epoch": 19.107810781078108, + "grad_norm": 0.00835210271179676, + "learning_rate": 3.0255744281618704e-07, + "loss": 0.0337, + "num_input_tokens_seen": 36654768, + "step": 173690 + }, + { + "epoch": 19.10836083608361, + "grad_norm": 0.04804854840040207, + "learning_rate": 3.0218528972335014e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36655824, + "step": 173695 + }, + { + "epoch": 19.10891089108911, + "grad_norm": 0.4054354131221771, + "learning_rate": 3.018133642584187e-07, + "loss": 0.0238, + "num_input_tokens_seen": 36656912, + "step": 173700 + }, + { + "epoch": 19.10946094609461, + "grad_norm": 0.1826387345790863, + "learning_rate": 3.014416664248204e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36658064, + "step": 173705 + }, + { + "epoch": 19.11001100110011, + "grad_norm": 0.03978966921567917, + "learning_rate": 3.0107019622598596e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36659152, + "step": 173710 + }, + { + "epoch": 19.11056105610561, + "grad_norm": 0.0050627198070287704, + "learning_rate": 3.0069895366533485e-07, + "loss": 0.0673, + "num_input_tokens_seen": 36660176, + "step": 173715 + }, + { + "epoch": 19.11111111111111, + "grad_norm": 0.033407386392354965, + "learning_rate": 3.0032793874629206e-07, + "loss": 0.1067, + "num_input_tokens_seen": 36661264, + "step": 173720 + }, + { + "epoch": 19.111661166116612, + "grad_norm": 0.2553313374519348, + "learning_rate": 2.9995715147227157e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36662288, + "step": 173725 + }, + { + "epoch": 19.112211221122113, + "grad_norm": 0.008604663424193859, + "learning_rate": 2.995865918466956e-07, + "loss": 0.0006, + "num_input_tokens_seen": 36663312, + "step": 173730 + }, + { + "epoch": 19.112761276127614, + "grad_norm": 0.012801460921764374, + "learning_rate": 2.9921625987297533e-07, + "loss": 0.0228, + "num_input_tokens_seen": 36664336, + "step": 173735 + }, + { + "epoch": 19.11331133113311, + "grad_norm": 3.602461338043213, + "learning_rate": 2.9884615555452746e-07, + "loss": 0.1089, + "num_input_tokens_seen": 36665456, + "step": 173740 + }, + { + "epoch": 19.113861386138613, + "grad_norm": 0.05436314642429352, + "learning_rate": 2.9847627889476316e-07, + "loss": 0.001, + "num_input_tokens_seen": 36666544, + "step": 173745 + }, + { + "epoch": 19.114411441144114, + "grad_norm": 0.0285421684384346, + "learning_rate": 2.9810662989708806e-07, + "loss": 0.004, + "num_input_tokens_seen": 36667632, + "step": 173750 + }, + { + "epoch": 19.114961496149615, + "grad_norm": 0.45188164710998535, + "learning_rate": 2.9773720856491327e-07, + "loss": 0.0076, + "num_input_tokens_seen": 36668752, + "step": 173755 + }, + { + "epoch": 19.115511551155116, + "grad_norm": 0.23866820335388184, + "learning_rate": 2.973680149016389e-07, + "loss": 0.0045, + "num_input_tokens_seen": 36669712, + "step": 173760 + }, + { + "epoch": 19.116061606160617, + "grad_norm": 0.23241813480854034, + "learning_rate": 2.9699904891067055e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36670768, + "step": 173765 + }, + { + "epoch": 19.116611661166118, + "grad_norm": 0.046318940818309784, + "learning_rate": 2.9663031059540825e-07, + "loss": 0.0048, + "num_input_tokens_seen": 36671728, + "step": 173770 + }, + { + "epoch": 19.117161716171616, + "grad_norm": 0.037068404257297516, + "learning_rate": 2.962617999592493e-07, + "loss": 0.0454, + "num_input_tokens_seen": 36672784, + "step": 173775 + }, + { + "epoch": 19.117711771177117, + "grad_norm": 0.10209354013204575, + "learning_rate": 2.9589351700559374e-07, + "loss": 0.0518, + "num_input_tokens_seen": 36673840, + "step": 173780 + }, + { + "epoch": 19.118261826182618, + "grad_norm": 0.010054854676127434, + "learning_rate": 2.955254617378361e-07, + "loss": 0.0614, + "num_input_tokens_seen": 36674896, + "step": 173785 + }, + { + "epoch": 19.11881188118812, + "grad_norm": 0.005062648095190525, + "learning_rate": 2.951576341593598e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36675984, + "step": 173790 + }, + { + "epoch": 19.11936193619362, + "grad_norm": 0.01353597454726696, + "learning_rate": 2.947900342735649e-07, + "loss": 0.0183, + "num_input_tokens_seen": 36677104, + "step": 173795 + }, + { + "epoch": 19.11991199119912, + "grad_norm": 0.1549951583147049, + "learning_rate": 2.9442266208383194e-07, + "loss": 0.0165, + "num_input_tokens_seen": 36678160, + "step": 173800 + }, + { + "epoch": 19.120462046204622, + "grad_norm": 7.279114246368408, + "learning_rate": 2.9405551759355555e-07, + "loss": 0.0661, + "num_input_tokens_seen": 36679216, + "step": 173805 + }, + { + "epoch": 19.12101210121012, + "grad_norm": 0.04787515848875046, + "learning_rate": 2.936886008061135e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36680272, + "step": 173810 + }, + { + "epoch": 19.12156215621562, + "grad_norm": 0.013516244478523731, + "learning_rate": 2.933219117248837e-07, + "loss": 0.0068, + "num_input_tokens_seen": 36681328, + "step": 173815 + }, + { + "epoch": 19.122112211221122, + "grad_norm": 1.3755673170089722, + "learning_rate": 2.9295545035325224e-07, + "loss": 0.1106, + "num_input_tokens_seen": 36682320, + "step": 173820 + }, + { + "epoch": 19.122662266226623, + "grad_norm": 0.11010434478521347, + "learning_rate": 2.9258921669459705e-07, + "loss": 0.0075, + "num_input_tokens_seen": 36683344, + "step": 173825 + }, + { + "epoch": 19.123212321232124, + "grad_norm": 0.0333714559674263, + "learning_rate": 2.922232107522904e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36684368, + "step": 173830 + }, + { + "epoch": 19.123762376237625, + "grad_norm": 0.10056028515100479, + "learning_rate": 2.9185743252970743e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36685456, + "step": 173835 + }, + { + "epoch": 19.124312431243123, + "grad_norm": 0.07189708203077316, + "learning_rate": 2.914918820302176e-07, + "loss": 0.0036, + "num_input_tokens_seen": 36686512, + "step": 173840 + }, + { + "epoch": 19.124862486248624, + "grad_norm": 0.621846616268158, + "learning_rate": 2.9112655925719044e-07, + "loss": 0.1119, + "num_input_tokens_seen": 36687568, + "step": 173845 + }, + { + "epoch": 19.125412541254125, + "grad_norm": 0.005036631599068642, + "learning_rate": 2.9076146421399277e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36688560, + "step": 173850 + }, + { + "epoch": 19.125962596259626, + "grad_norm": 1.275620937347412, + "learning_rate": 2.9039659690398844e-07, + "loss": 0.0218, + "num_input_tokens_seen": 36689616, + "step": 173855 + }, + { + "epoch": 19.126512651265127, + "grad_norm": 0.04316936433315277, + "learning_rate": 2.9003195733054435e-07, + "loss": 0.0517, + "num_input_tokens_seen": 36690640, + "step": 173860 + }, + { + "epoch": 19.127062706270628, + "grad_norm": 0.01192889641970396, + "learning_rate": 2.896675454970188e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36691696, + "step": 173865 + }, + { + "epoch": 19.12761276127613, + "grad_norm": 4.348042964935303, + "learning_rate": 2.8930336140677025e-07, + "loss": 0.1304, + "num_input_tokens_seen": 36692688, + "step": 173870 + }, + { + "epoch": 19.128162816281627, + "grad_norm": 3.5612008571624756, + "learning_rate": 2.889394050631544e-07, + "loss": 0.0312, + "num_input_tokens_seen": 36693680, + "step": 173875 + }, + { + "epoch": 19.128712871287128, + "grad_norm": 0.13023938238620758, + "learning_rate": 2.8857567646952687e-07, + "loss": 0.0041, + "num_input_tokens_seen": 36694736, + "step": 173880 + }, + { + "epoch": 19.12926292629263, + "grad_norm": 0.016775600612163544, + "learning_rate": 2.882121756292405e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36695760, + "step": 173885 + }, + { + "epoch": 19.12981298129813, + "grad_norm": 0.6608508229255676, + "learning_rate": 2.8784890254564265e-07, + "loss": 0.0166, + "num_input_tokens_seen": 36696816, + "step": 173890 + }, + { + "epoch": 19.13036303630363, + "grad_norm": 0.07789377868175507, + "learning_rate": 2.874858572220862e-07, + "loss": 0.0549, + "num_input_tokens_seen": 36697840, + "step": 173895 + }, + { + "epoch": 19.130913091309132, + "grad_norm": 0.5044024586677551, + "learning_rate": 2.871230396619129e-07, + "loss": 0.0081, + "num_input_tokens_seen": 36698928, + "step": 173900 + }, + { + "epoch": 19.13146314631463, + "grad_norm": 0.18270909786224365, + "learning_rate": 2.8676044986846727e-07, + "loss": 0.0346, + "num_input_tokens_seen": 36699920, + "step": 173905 + }, + { + "epoch": 19.13201320132013, + "grad_norm": 0.14054690301418304, + "learning_rate": 2.863980878450939e-07, + "loss": 0.0357, + "num_input_tokens_seen": 36700976, + "step": 173910 + }, + { + "epoch": 19.132563256325632, + "grad_norm": 0.06361668556928635, + "learning_rate": 2.86035953595129e-07, + "loss": 0.051, + "num_input_tokens_seen": 36702000, + "step": 173915 + }, + { + "epoch": 19.133113311331133, + "grad_norm": 0.09721764177083969, + "learning_rate": 2.856740471219116e-07, + "loss": 0.0045, + "num_input_tokens_seen": 36703088, + "step": 173920 + }, + { + "epoch": 19.133663366336634, + "grad_norm": 0.017406484112143517, + "learning_rate": 2.8531236842878064e-07, + "loss": 0.0821, + "num_input_tokens_seen": 36704144, + "step": 173925 + }, + { + "epoch": 19.134213421342135, + "grad_norm": 0.23815539479255676, + "learning_rate": 2.849509175190612e-07, + "loss": 0.03, + "num_input_tokens_seen": 36705168, + "step": 173930 + }, + { + "epoch": 19.134763476347636, + "grad_norm": 0.004747070372104645, + "learning_rate": 2.845896943960952e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36706128, + "step": 173935 + }, + { + "epoch": 19.135313531353134, + "grad_norm": 0.03179502114653587, + "learning_rate": 2.842286990631993e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36707184, + "step": 173940 + }, + { + "epoch": 19.135863586358635, + "grad_norm": 2.7473747730255127, + "learning_rate": 2.8386793152371526e-07, + "loss": 0.1048, + "num_input_tokens_seen": 36708176, + "step": 173945 + }, + { + "epoch": 19.136413641364136, + "grad_norm": 0.21641503274440765, + "learning_rate": 2.835073917809572e-07, + "loss": 0.0038, + "num_input_tokens_seen": 36709296, + "step": 173950 + }, + { + "epoch": 19.136963696369637, + "grad_norm": 0.2871357798576355, + "learning_rate": 2.8314707983825015e-07, + "loss": 0.0069, + "num_input_tokens_seen": 36710352, + "step": 173955 + }, + { + "epoch": 19.13751375137514, + "grad_norm": 1.3623777627944946, + "learning_rate": 2.827869956989193e-07, + "loss": 0.0562, + "num_input_tokens_seen": 36711408, + "step": 173960 + }, + { + "epoch": 19.13806380638064, + "grad_norm": 0.011938399635255337, + "learning_rate": 2.8242713936627586e-07, + "loss": 0.0075, + "num_input_tokens_seen": 36712400, + "step": 173965 + }, + { + "epoch": 19.138613861386137, + "grad_norm": 0.03161292150616646, + "learning_rate": 2.820675108436449e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36713456, + "step": 173970 + }, + { + "epoch": 19.139163916391638, + "grad_norm": 0.1436520367860794, + "learning_rate": 2.8170811013433495e-07, + "loss": 0.081, + "num_input_tokens_seen": 36714480, + "step": 173975 + }, + { + "epoch": 19.13971397139714, + "grad_norm": 0.015196602791547775, + "learning_rate": 2.8134893724166e-07, + "loss": 0.001, + "num_input_tokens_seen": 36715504, + "step": 173980 + }, + { + "epoch": 19.14026402640264, + "grad_norm": 0.018166232854127884, + "learning_rate": 2.809899921689313e-07, + "loss": 0.0006, + "num_input_tokens_seen": 36716496, + "step": 173985 + }, + { + "epoch": 19.14081408140814, + "grad_norm": 0.0055095539428293705, + "learning_rate": 2.806312749194573e-07, + "loss": 0.0887, + "num_input_tokens_seen": 36717584, + "step": 173990 + }, + { + "epoch": 19.141364136413642, + "grad_norm": 0.060125600546598434, + "learning_rate": 2.802727854965409e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36718608, + "step": 173995 + }, + { + "epoch": 19.141914191419144, + "grad_norm": 0.034147582948207855, + "learning_rate": 2.799145239034934e-07, + "loss": 0.0248, + "num_input_tokens_seen": 36719664, + "step": 174000 + }, + { + "epoch": 19.14246424642464, + "grad_norm": 0.03238635137677193, + "learning_rate": 2.795564901436065e-07, + "loss": 0.1037, + "num_input_tokens_seen": 36720688, + "step": 174005 + }, + { + "epoch": 19.143014301430142, + "grad_norm": 0.8015339970588684, + "learning_rate": 2.791986842201888e-07, + "loss": 0.0092, + "num_input_tokens_seen": 36721808, + "step": 174010 + }, + { + "epoch": 19.143564356435643, + "grad_norm": 0.03419644385576248, + "learning_rate": 2.7884110613653204e-07, + "loss": 0.0671, + "num_input_tokens_seen": 36722832, + "step": 174015 + }, + { + "epoch": 19.144114411441144, + "grad_norm": 4.496432304382324, + "learning_rate": 2.784837558959391e-07, + "loss": 0.1091, + "num_input_tokens_seen": 36723856, + "step": 174020 + }, + { + "epoch": 19.144664466446645, + "grad_norm": 0.010274001397192478, + "learning_rate": 2.7812663350169357e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36724912, + "step": 174025 + }, + { + "epoch": 19.145214521452147, + "grad_norm": 0.13700291514396667, + "learning_rate": 2.777697389570927e-07, + "loss": 0.0058, + "num_input_tokens_seen": 36725904, + "step": 174030 + }, + { + "epoch": 19.145764576457644, + "grad_norm": 0.3138304650783539, + "learning_rate": 2.7741307226542833e-07, + "loss": 0.0083, + "num_input_tokens_seen": 36726960, + "step": 174035 + }, + { + "epoch": 19.146314631463145, + "grad_norm": 0.007424166891723871, + "learning_rate": 2.77056633429984e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36727984, + "step": 174040 + }, + { + "epoch": 19.146864686468646, + "grad_norm": 0.08273253589868546, + "learning_rate": 2.767004224540459e-07, + "loss": 0.0027, + "num_input_tokens_seen": 36729104, + "step": 174045 + }, + { + "epoch": 19.147414741474147, + "grad_norm": 0.005670499987900257, + "learning_rate": 2.763444393408948e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36730160, + "step": 174050 + }, + { + "epoch": 19.14796479647965, + "grad_norm": 4.8460469245910645, + "learning_rate": 2.759886840938114e-07, + "loss": 0.0769, + "num_input_tokens_seen": 36731184, + "step": 174055 + }, + { + "epoch": 19.14851485148515, + "grad_norm": 0.012485221959650517, + "learning_rate": 2.756331567160819e-07, + "loss": 0.0387, + "num_input_tokens_seen": 36732176, + "step": 174060 + }, + { + "epoch": 19.14906490649065, + "grad_norm": 0.054477836936712265, + "learning_rate": 2.752778572109732e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36733200, + "step": 174065 + }, + { + "epoch": 19.149614961496148, + "grad_norm": 0.08014523983001709, + "learning_rate": 2.7492278558176597e-07, + "loss": 0.0208, + "num_input_tokens_seen": 36734224, + "step": 174070 + }, + { + "epoch": 19.15016501650165, + "grad_norm": 0.03756481781601906, + "learning_rate": 2.745679418317326e-07, + "loss": 0.0124, + "num_input_tokens_seen": 36735280, + "step": 174075 + }, + { + "epoch": 19.15071507150715, + "grad_norm": 0.09725916385650635, + "learning_rate": 2.7421332596413997e-07, + "loss": 0.0028, + "num_input_tokens_seen": 36736336, + "step": 174080 + }, + { + "epoch": 19.15126512651265, + "grad_norm": 0.04392757639288902, + "learning_rate": 2.738589379822604e-07, + "loss": 0.0497, + "num_input_tokens_seen": 36737424, + "step": 174085 + }, + { + "epoch": 19.151815181518153, + "grad_norm": 0.28375551104545593, + "learning_rate": 2.7350477788935513e-07, + "loss": 0.0036, + "num_input_tokens_seen": 36738480, + "step": 174090 + }, + { + "epoch": 19.152365236523654, + "grad_norm": 0.07039009034633636, + "learning_rate": 2.731508456886939e-07, + "loss": 0.0061, + "num_input_tokens_seen": 36739568, + "step": 174095 + }, + { + "epoch": 19.15291529152915, + "grad_norm": 1.2295544147491455, + "learning_rate": 2.7279714138353785e-07, + "loss": 0.0857, + "num_input_tokens_seen": 36740656, + "step": 174100 + }, + { + "epoch": 19.153465346534652, + "grad_norm": 0.02565407007932663, + "learning_rate": 2.724436649771428e-07, + "loss": 0.0231, + "num_input_tokens_seen": 36741744, + "step": 174105 + }, + { + "epoch": 19.154015401540153, + "grad_norm": 0.01090642437338829, + "learning_rate": 2.7209041647277e-07, + "loss": 0.0027, + "num_input_tokens_seen": 36742800, + "step": 174110 + }, + { + "epoch": 19.154565456545654, + "grad_norm": 0.0539897158741951, + "learning_rate": 2.717373958736724e-07, + "loss": 0.0042, + "num_input_tokens_seen": 36743856, + "step": 174115 + }, + { + "epoch": 19.155115511551156, + "grad_norm": 0.018105190247297287, + "learning_rate": 2.7138460318310577e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36744912, + "step": 174120 + }, + { + "epoch": 19.155665566556657, + "grad_norm": 0.476526141166687, + "learning_rate": 2.71032038404323e-07, + "loss": 0.0427, + "num_input_tokens_seen": 36745968, + "step": 174125 + }, + { + "epoch": 19.156215621562158, + "grad_norm": 0.010111100971698761, + "learning_rate": 2.706797015405715e-07, + "loss": 0.0176, + "num_input_tokens_seen": 36747088, + "step": 174130 + }, + { + "epoch": 19.156765676567655, + "grad_norm": 0.020958049222826958, + "learning_rate": 2.703275925950988e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36748080, + "step": 174135 + }, + { + "epoch": 19.157315731573156, + "grad_norm": 0.01210523210465908, + "learning_rate": 2.6997571157114933e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36749136, + "step": 174140 + }, + { + "epoch": 19.157865786578657, + "grad_norm": 0.0198122076690197, + "learning_rate": 2.696240584719678e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36750224, + "step": 174145 + }, + { + "epoch": 19.15841584158416, + "grad_norm": 0.06113864481449127, + "learning_rate": 2.692726333007961e-07, + "loss": 0.0653, + "num_input_tokens_seen": 36751280, + "step": 174150 + }, + { + "epoch": 19.15896589658966, + "grad_norm": 1.4402216672897339, + "learning_rate": 2.689214360608677e-07, + "loss": 0.0203, + "num_input_tokens_seen": 36752336, + "step": 174155 + }, + { + "epoch": 19.15951595159516, + "grad_norm": 0.033245816826820374, + "learning_rate": 2.685704667554273e-07, + "loss": 0.1063, + "num_input_tokens_seen": 36753392, + "step": 174160 + }, + { + "epoch": 19.16006600660066, + "grad_norm": 0.04902717471122742, + "learning_rate": 2.6821972538770557e-07, + "loss": 0.0216, + "num_input_tokens_seen": 36754448, + "step": 174165 + }, + { + "epoch": 19.16061606160616, + "grad_norm": 0.12569692730903625, + "learning_rate": 2.678692119609333e-07, + "loss": 0.0355, + "num_input_tokens_seen": 36755504, + "step": 174170 + }, + { + "epoch": 19.16116611661166, + "grad_norm": 0.3088029623031616, + "learning_rate": 2.675189264783412e-07, + "loss": 0.0072, + "num_input_tokens_seen": 36756528, + "step": 174175 + }, + { + "epoch": 19.16171617161716, + "grad_norm": 0.008068256080150604, + "learning_rate": 2.6716886894316007e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36757520, + "step": 174180 + }, + { + "epoch": 19.162266226622663, + "grad_norm": 0.1205294281244278, + "learning_rate": 2.668190393586178e-07, + "loss": 0.0032, + "num_input_tokens_seen": 36758544, + "step": 174185 + }, + { + "epoch": 19.162816281628164, + "grad_norm": 0.052172813564538956, + "learning_rate": 2.66469437727937e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36759504, + "step": 174190 + }, + { + "epoch": 19.163366336633665, + "grad_norm": 0.016996433958411217, + "learning_rate": 2.661200640543371e-07, + "loss": 0.0801, + "num_input_tokens_seen": 36760496, + "step": 174195 + }, + { + "epoch": 19.163916391639162, + "grad_norm": 0.03372829034924507, + "learning_rate": 2.6577091834103783e-07, + "loss": 0.0053, + "num_input_tokens_seen": 36761520, + "step": 174200 + }, + { + "epoch": 19.164466446644663, + "grad_norm": 0.009833160787820816, + "learning_rate": 2.6542200059126167e-07, + "loss": 0.0082, + "num_input_tokens_seen": 36762640, + "step": 174205 + }, + { + "epoch": 19.165016501650165, + "grad_norm": 0.007932541891932487, + "learning_rate": 2.650733108082226e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36763696, + "step": 174210 + }, + { + "epoch": 19.165566556655666, + "grad_norm": 0.01974913477897644, + "learning_rate": 2.647248489951348e-07, + "loss": 0.0052, + "num_input_tokens_seen": 36764848, + "step": 174215 + }, + { + "epoch": 19.166116611661167, + "grad_norm": 0.01316103432327509, + "learning_rate": 2.643766151552068e-07, + "loss": 0.001, + "num_input_tokens_seen": 36765872, + "step": 174220 + }, + { + "epoch": 19.166666666666668, + "grad_norm": 0.07342173904180527, + "learning_rate": 2.6402860929165277e-07, + "loss": 0.006, + "num_input_tokens_seen": 36766928, + "step": 174225 + }, + { + "epoch": 19.16721672167217, + "grad_norm": 0.01673264615237713, + "learning_rate": 2.636808314076755e-07, + "loss": 0.1466, + "num_input_tokens_seen": 36767920, + "step": 174230 + }, + { + "epoch": 19.167766776677666, + "grad_norm": 0.02581222914159298, + "learning_rate": 2.63333281506481e-07, + "loss": 0.0036, + "num_input_tokens_seen": 36768912, + "step": 174235 + }, + { + "epoch": 19.168316831683168, + "grad_norm": 0.023601403459906578, + "learning_rate": 2.629859595912776e-07, + "loss": 0.0172, + "num_input_tokens_seen": 36769936, + "step": 174240 + }, + { + "epoch": 19.16886688668867, + "grad_norm": 0.0433996357023716, + "learning_rate": 2.626388656652601e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36770992, + "step": 174245 + }, + { + "epoch": 19.16941694169417, + "grad_norm": 3.486501455307007, + "learning_rate": 2.6229199973163143e-07, + "loss": 0.0536, + "num_input_tokens_seen": 36771952, + "step": 174250 + }, + { + "epoch": 19.16996699669967, + "grad_norm": 0.04738166928291321, + "learning_rate": 2.619453617935863e-07, + "loss": 0.0309, + "num_input_tokens_seen": 36772976, + "step": 174255 + }, + { + "epoch": 19.170517051705172, + "grad_norm": 1.7266086339950562, + "learning_rate": 2.6159895185431936e-07, + "loss": 0.0433, + "num_input_tokens_seen": 36774000, + "step": 174260 + }, + { + "epoch": 19.17106710671067, + "grad_norm": 0.04242554306983948, + "learning_rate": 2.612527699170253e-07, + "loss": 0.0023, + "num_input_tokens_seen": 36774992, + "step": 174265 + }, + { + "epoch": 19.17161716171617, + "grad_norm": 0.007099652662873268, + "learning_rate": 2.609068159848932e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36776080, + "step": 174270 + }, + { + "epoch": 19.17216721672167, + "grad_norm": 0.037158627063035965, + "learning_rate": 2.605610900611122e-07, + "loss": 0.1297, + "num_input_tokens_seen": 36777072, + "step": 174275 + }, + { + "epoch": 19.172717271727173, + "grad_norm": 0.011534260585904121, + "learning_rate": 2.602155921488686e-07, + "loss": 0.0035, + "num_input_tokens_seen": 36778128, + "step": 174280 + }, + { + "epoch": 19.173267326732674, + "grad_norm": 1.0866219997406006, + "learning_rate": 2.59870322251346e-07, + "loss": 0.0125, + "num_input_tokens_seen": 36779184, + "step": 174285 + }, + { + "epoch": 19.173817381738175, + "grad_norm": 0.29294252395629883, + "learning_rate": 2.5952528037172795e-07, + "loss": 0.0043, + "num_input_tokens_seen": 36780176, + "step": 174290 + }, + { + "epoch": 19.174367436743676, + "grad_norm": 0.0897989496588707, + "learning_rate": 2.591804665131925e-07, + "loss": 0.0039, + "num_input_tokens_seen": 36781200, + "step": 174295 + }, + { + "epoch": 19.174917491749174, + "grad_norm": 0.026208745315670967, + "learning_rate": 2.588358806789204e-07, + "loss": 0.0034, + "num_input_tokens_seen": 36782192, + "step": 174300 + }, + { + "epoch": 19.175467546754675, + "grad_norm": 0.03540021553635597, + "learning_rate": 2.584915228720869e-07, + "loss": 0.0168, + "num_input_tokens_seen": 36783280, + "step": 174305 + }, + { + "epoch": 19.176017601760176, + "grad_norm": 0.03985446318984032, + "learning_rate": 2.5814739309586167e-07, + "loss": 0.0055, + "num_input_tokens_seen": 36784336, + "step": 174310 + }, + { + "epoch": 19.176567656765677, + "grad_norm": 0.014141103252768517, + "learning_rate": 2.5780349135342275e-07, + "loss": 0.001, + "num_input_tokens_seen": 36785360, + "step": 174315 + }, + { + "epoch": 19.177117711771178, + "grad_norm": 0.023615241050720215, + "learning_rate": 2.5745981764793425e-07, + "loss": 0.0034, + "num_input_tokens_seen": 36786448, + "step": 174320 + }, + { + "epoch": 19.17766776677668, + "grad_norm": 0.08117853850126266, + "learning_rate": 2.571163719825687e-07, + "loss": 0.119, + "num_input_tokens_seen": 36787504, + "step": 174325 + }, + { + "epoch": 19.178217821782177, + "grad_norm": 0.03469066321849823, + "learning_rate": 2.567731543604873e-07, + "loss": 0.001, + "num_input_tokens_seen": 36788656, + "step": 174330 + }, + { + "epoch": 19.178767876787678, + "grad_norm": 0.04573046416044235, + "learning_rate": 2.5643016478485436e-07, + "loss": 0.0561, + "num_input_tokens_seen": 36789680, + "step": 174335 + }, + { + "epoch": 19.17931793179318, + "grad_norm": 0.005868969485163689, + "learning_rate": 2.560874032588312e-07, + "loss": 0.0076, + "num_input_tokens_seen": 36790704, + "step": 174340 + }, + { + "epoch": 19.17986798679868, + "grad_norm": 0.009491042234003544, + "learning_rate": 2.5574486978557633e-07, + "loss": 0.0011, + "num_input_tokens_seen": 36791792, + "step": 174345 + }, + { + "epoch": 19.18041804180418, + "grad_norm": 0.030015407130122185, + "learning_rate": 2.554025643682484e-07, + "loss": 0.0431, + "num_input_tokens_seen": 36792816, + "step": 174350 + }, + { + "epoch": 19.180968096809682, + "grad_norm": 2.3470771312713623, + "learning_rate": 2.5506048701000317e-07, + "loss": 0.1311, + "num_input_tokens_seen": 36793936, + "step": 174355 + }, + { + "epoch": 19.181518151815183, + "grad_norm": 0.019784294068813324, + "learning_rate": 2.5471863771399094e-07, + "loss": 0.0179, + "num_input_tokens_seen": 36794992, + "step": 174360 + }, + { + "epoch": 19.18206820682068, + "grad_norm": 0.17384032905101776, + "learning_rate": 2.54377016483362e-07, + "loss": 0.0398, + "num_input_tokens_seen": 36796016, + "step": 174365 + }, + { + "epoch": 19.182618261826182, + "grad_norm": 0.01475182082504034, + "learning_rate": 2.540356233212665e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36797072, + "step": 174370 + }, + { + "epoch": 19.183168316831683, + "grad_norm": 0.013651875779032707, + "learning_rate": 2.5369445823084926e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36798192, + "step": 174375 + }, + { + "epoch": 19.183718371837184, + "grad_norm": 0.01712634414434433, + "learning_rate": 2.533535212152577e-07, + "loss": 0.0483, + "num_input_tokens_seen": 36799376, + "step": 174380 + }, + { + "epoch": 19.184268426842685, + "grad_norm": 0.035473719239234924, + "learning_rate": 2.53012812277631e-07, + "loss": 0.1037, + "num_input_tokens_seen": 36800400, + "step": 174385 + }, + { + "epoch": 19.184818481848186, + "grad_norm": 0.038406968116760254, + "learning_rate": 2.5267233142111104e-07, + "loss": 0.0214, + "num_input_tokens_seen": 36801424, + "step": 174390 + }, + { + "epoch": 19.185368536853684, + "grad_norm": 0.06323916465044022, + "learning_rate": 2.5233207864883704e-07, + "loss": 0.0354, + "num_input_tokens_seen": 36802448, + "step": 174395 + }, + { + "epoch": 19.185918591859185, + "grad_norm": 0.49686336517333984, + "learning_rate": 2.5199205396393975e-07, + "loss": 0.0229, + "num_input_tokens_seen": 36803504, + "step": 174400 + }, + { + "epoch": 19.186468646864686, + "grad_norm": 3.010866403579712, + "learning_rate": 2.5165225736955845e-07, + "loss": 0.1017, + "num_input_tokens_seen": 36804560, + "step": 174405 + }, + { + "epoch": 19.187018701870187, + "grad_norm": 0.6378465294837952, + "learning_rate": 2.5131268886882107e-07, + "loss": 0.0746, + "num_input_tokens_seen": 36805616, + "step": 174410 + }, + { + "epoch": 19.187568756875688, + "grad_norm": 0.12355402112007141, + "learning_rate": 2.509733484648613e-07, + "loss": 0.003, + "num_input_tokens_seen": 36806672, + "step": 174415 + }, + { + "epoch": 19.18811881188119, + "grad_norm": 1.1472409963607788, + "learning_rate": 2.506342361608044e-07, + "loss": 0.1076, + "num_input_tokens_seen": 36807632, + "step": 174420 + }, + { + "epoch": 19.18866886688669, + "grad_norm": 0.013515908271074295, + "learning_rate": 2.502953519597756e-07, + "loss": 0.0704, + "num_input_tokens_seen": 36808752, + "step": 174425 + }, + { + "epoch": 19.189218921892188, + "grad_norm": 0.012207362800836563, + "learning_rate": 2.4995669586489467e-07, + "loss": 0.0484, + "num_input_tokens_seen": 36809776, + "step": 174430 + }, + { + "epoch": 19.18976897689769, + "grad_norm": 0.10207635909318924, + "learning_rate": 2.4961826787928965e-07, + "loss": 0.0029, + "num_input_tokens_seen": 36810832, + "step": 174435 + }, + { + "epoch": 19.19031903190319, + "grad_norm": 0.12338346987962723, + "learning_rate": 2.492800680060775e-07, + "loss": 0.0876, + "num_input_tokens_seen": 36811856, + "step": 174440 + }, + { + "epoch": 19.19086908690869, + "grad_norm": 0.01357986032962799, + "learning_rate": 2.489420962483752e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36812912, + "step": 174445 + }, + { + "epoch": 19.191419141914192, + "grad_norm": 0.005249956622719765, + "learning_rate": 2.4860435260929404e-07, + "loss": 0.002, + "num_input_tokens_seen": 36813968, + "step": 174450 + }, + { + "epoch": 19.191969196919693, + "grad_norm": 0.02811870537698269, + "learning_rate": 2.4826683709195384e-07, + "loss": 0.0352, + "num_input_tokens_seen": 36815088, + "step": 174455 + }, + { + "epoch": 19.19251925192519, + "grad_norm": 0.1923174411058426, + "learning_rate": 2.479295496994577e-07, + "loss": 0.002, + "num_input_tokens_seen": 36816080, + "step": 174460 + }, + { + "epoch": 19.193069306930692, + "grad_norm": 0.3187652826309204, + "learning_rate": 2.4759249043491694e-07, + "loss": 0.0865, + "num_input_tokens_seen": 36817040, + "step": 174465 + }, + { + "epoch": 19.193619361936193, + "grad_norm": 0.07043249160051346, + "learning_rate": 2.472556593014402e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36818096, + "step": 174470 + }, + { + "epoch": 19.194169416941694, + "grad_norm": 2.566622257232666, + "learning_rate": 2.469190563021306e-07, + "loss": 0.073, + "num_input_tokens_seen": 36819184, + "step": 174475 + }, + { + "epoch": 19.194719471947195, + "grad_norm": 0.05302054062485695, + "learning_rate": 2.4658268144009123e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36820208, + "step": 174480 + }, + { + "epoch": 19.195269526952696, + "grad_norm": 0.5105821490287781, + "learning_rate": 2.462465347184195e-07, + "loss": 0.0089, + "num_input_tokens_seen": 36821328, + "step": 174485 + }, + { + "epoch": 19.195819581958197, + "grad_norm": 0.03628935664892197, + "learning_rate": 2.4591061614021584e-07, + "loss": 0.0601, + "num_input_tokens_seen": 36822352, + "step": 174490 + }, + { + "epoch": 19.196369636963695, + "grad_norm": 2.0097599029541016, + "learning_rate": 2.4557492570857765e-07, + "loss": 0.0513, + "num_input_tokens_seen": 36823408, + "step": 174495 + }, + { + "epoch": 19.196919691969196, + "grad_norm": 0.0687098577618599, + "learning_rate": 2.452394634265942e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36824464, + "step": 174500 + }, + { + "epoch": 19.197469746974697, + "grad_norm": 0.007593199610710144, + "learning_rate": 2.44904229297363e-07, + "loss": 0.0033, + "num_input_tokens_seen": 36825584, + "step": 174505 + }, + { + "epoch": 19.198019801980198, + "grad_norm": 2.796727418899536, + "learning_rate": 2.445692233239705e-07, + "loss": 0.0309, + "num_input_tokens_seen": 36826672, + "step": 174510 + }, + { + "epoch": 19.1985698569857, + "grad_norm": 0.051037851721048355, + "learning_rate": 2.44234445509503e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36827632, + "step": 174515 + }, + { + "epoch": 19.1991199119912, + "grad_norm": 0.19679534435272217, + "learning_rate": 2.4389989585704985e-07, + "loss": 0.0627, + "num_input_tokens_seen": 36828720, + "step": 174520 + }, + { + "epoch": 19.199669966996698, + "grad_norm": 0.008328745141625404, + "learning_rate": 2.4356557436968905e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36829808, + "step": 174525 + }, + { + "epoch": 19.2002200220022, + "grad_norm": 0.0580139234662056, + "learning_rate": 2.432314810505071e-07, + "loss": 0.0439, + "num_input_tokens_seen": 36830832, + "step": 174530 + }, + { + "epoch": 19.2007700770077, + "grad_norm": 0.03552119433879852, + "learning_rate": 2.4289761590258197e-07, + "loss": 0.0244, + "num_input_tokens_seen": 36831824, + "step": 174535 + }, + { + "epoch": 19.2013201320132, + "grad_norm": 0.23796896636486053, + "learning_rate": 2.425639789289891e-07, + "loss": 0.0206, + "num_input_tokens_seen": 36832848, + "step": 174540 + }, + { + "epoch": 19.201870187018702, + "grad_norm": 0.21031548082828522, + "learning_rate": 2.422305701328037e-07, + "loss": 0.0084, + "num_input_tokens_seen": 36833904, + "step": 174545 + }, + { + "epoch": 19.202420242024203, + "grad_norm": 2.4100725650787354, + "learning_rate": 2.418973895170984e-07, + "loss": 0.0365, + "num_input_tokens_seen": 36834928, + "step": 174550 + }, + { + "epoch": 19.202970297029704, + "grad_norm": 0.17945201694965363, + "learning_rate": 2.4156443708494846e-07, + "loss": 0.1529, + "num_input_tokens_seen": 36835920, + "step": 174555 + }, + { + "epoch": 19.203520352035202, + "grad_norm": 0.08276360481977463, + "learning_rate": 2.412317128394154e-07, + "loss": 0.0586, + "num_input_tokens_seen": 36836976, + "step": 174560 + }, + { + "epoch": 19.204070407040703, + "grad_norm": 2.4010684490203857, + "learning_rate": 2.4089921678356885e-07, + "loss": 0.1234, + "num_input_tokens_seen": 36837968, + "step": 174565 + }, + { + "epoch": 19.204620462046204, + "grad_norm": 1.5754499435424805, + "learning_rate": 2.4056694892047593e-07, + "loss": 0.0832, + "num_input_tokens_seen": 36838928, + "step": 174570 + }, + { + "epoch": 19.205170517051705, + "grad_norm": 0.02555900439620018, + "learning_rate": 2.4023490925319524e-07, + "loss": 0.002, + "num_input_tokens_seen": 36840016, + "step": 174575 + }, + { + "epoch": 19.205720572057206, + "grad_norm": 0.08938591182231903, + "learning_rate": 2.3990309778478824e-07, + "loss": 0.1106, + "num_input_tokens_seen": 36841072, + "step": 174580 + }, + { + "epoch": 19.206270627062707, + "grad_norm": 0.07417966425418854, + "learning_rate": 2.395715145183136e-07, + "loss": 0.0588, + "num_input_tokens_seen": 36842096, + "step": 174585 + }, + { + "epoch": 19.206820682068205, + "grad_norm": 0.019695257768034935, + "learning_rate": 2.3924015945682997e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36843184, + "step": 174590 + }, + { + "epoch": 19.207370737073706, + "grad_norm": 0.483684241771698, + "learning_rate": 2.389090326033849e-07, + "loss": 0.0086, + "num_input_tokens_seen": 36844304, + "step": 174595 + }, + { + "epoch": 19.207920792079207, + "grad_norm": 0.006123627535998821, + "learning_rate": 2.3857813396103433e-07, + "loss": 0.0538, + "num_input_tokens_seen": 36845360, + "step": 174600 + }, + { + "epoch": 19.20847084708471, + "grad_norm": 0.020114121958613396, + "learning_rate": 2.3824746353282855e-07, + "loss": 0.0335, + "num_input_tokens_seen": 36846416, + "step": 174605 + }, + { + "epoch": 19.20902090209021, + "grad_norm": 0.047764912247657776, + "learning_rate": 2.3791702132181238e-07, + "loss": 0.01, + "num_input_tokens_seen": 36847536, + "step": 174610 + }, + { + "epoch": 19.20957095709571, + "grad_norm": 0.008932188153266907, + "learning_rate": 2.375868073310361e-07, + "loss": 0.0881, + "num_input_tokens_seen": 36848624, + "step": 174615 + }, + { + "epoch": 19.21012101210121, + "grad_norm": 4.62825870513916, + "learning_rate": 2.3725682156353902e-07, + "loss": 0.0576, + "num_input_tokens_seen": 36849712, + "step": 174620 + }, + { + "epoch": 19.21067106710671, + "grad_norm": 0.0472419299185276, + "learning_rate": 2.369270640223603e-07, + "loss": 0.0483, + "num_input_tokens_seen": 36850832, + "step": 174625 + }, + { + "epoch": 19.21122112211221, + "grad_norm": 0.04909389093518257, + "learning_rate": 2.365975347105448e-07, + "loss": 0.0089, + "num_input_tokens_seen": 36851952, + "step": 174630 + }, + { + "epoch": 19.21177117711771, + "grad_norm": 0.04029875248670578, + "learning_rate": 2.3626823363112894e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36853072, + "step": 174635 + }, + { + "epoch": 19.212321232123212, + "grad_norm": 1.0913119316101074, + "learning_rate": 2.3593916078714084e-07, + "loss": 0.0126, + "num_input_tokens_seen": 36854192, + "step": 174640 + }, + { + "epoch": 19.212871287128714, + "grad_norm": 0.002545291557908058, + "learning_rate": 2.3561031618162254e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36855280, + "step": 174645 + }, + { + "epoch": 19.213421342134215, + "grad_norm": 0.04807845503091812, + "learning_rate": 2.352816998175994e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36856400, + "step": 174650 + }, + { + "epoch": 19.213971397139716, + "grad_norm": 0.25121426582336426, + "learning_rate": 2.349533116981051e-07, + "loss": 0.0033, + "num_input_tokens_seen": 36857392, + "step": 174655 + }, + { + "epoch": 19.214521452145213, + "grad_norm": 1.9300216436386108, + "learning_rate": 2.346251518261594e-07, + "loss": 0.0784, + "num_input_tokens_seen": 36858448, + "step": 174660 + }, + { + "epoch": 19.215071507150714, + "grad_norm": 0.7504816651344299, + "learning_rate": 2.3429722020478774e-07, + "loss": 0.0077, + "num_input_tokens_seen": 36859600, + "step": 174665 + }, + { + "epoch": 19.215621562156215, + "grad_norm": 0.05097551643848419, + "learning_rate": 2.3396951683701817e-07, + "loss": 0.0861, + "num_input_tokens_seen": 36860656, + "step": 174670 + }, + { + "epoch": 19.216171617161717, + "grad_norm": 0.08680787682533264, + "learning_rate": 2.3364204172586778e-07, + "loss": 0.0645, + "num_input_tokens_seen": 36861744, + "step": 174675 + }, + { + "epoch": 19.216721672167218, + "grad_norm": 0.03506791964173317, + "learning_rate": 2.333147948743536e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36862768, + "step": 174680 + }, + { + "epoch": 19.21727172717272, + "grad_norm": 0.06550359725952148, + "learning_rate": 2.329877762854954e-07, + "loss": 0.0015, + "num_input_tokens_seen": 36863824, + "step": 174685 + }, + { + "epoch": 19.217821782178216, + "grad_norm": 0.26765382289886475, + "learning_rate": 2.326609859623019e-07, + "loss": 0.0758, + "num_input_tokens_seen": 36864944, + "step": 174690 + }, + { + "epoch": 19.218371837183717, + "grad_norm": 0.016693048179149628, + "learning_rate": 2.3233442390778736e-07, + "loss": 0.022, + "num_input_tokens_seen": 36866000, + "step": 174695 + }, + { + "epoch": 19.21892189218922, + "grad_norm": 0.004643266089260578, + "learning_rate": 2.320080901249605e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36866992, + "step": 174700 + }, + { + "epoch": 19.21947194719472, + "grad_norm": 0.026876816526055336, + "learning_rate": 2.3168198461683278e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36868016, + "step": 174705 + }, + { + "epoch": 19.22002200220022, + "grad_norm": 0.01204950176179409, + "learning_rate": 2.3135610738640734e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36869040, + "step": 174710 + }, + { + "epoch": 19.22057205720572, + "grad_norm": 0.02727244235575199, + "learning_rate": 2.3103045843668735e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36870064, + "step": 174715 + }, + { + "epoch": 19.221122112211223, + "grad_norm": 0.0189772117882967, + "learning_rate": 2.307050377706732e-07, + "loss": 0.0065, + "num_input_tokens_seen": 36871152, + "step": 174720 + }, + { + "epoch": 19.22167216721672, + "grad_norm": 0.01166257169097662, + "learning_rate": 2.303798453913625e-07, + "loss": 0.0303, + "num_input_tokens_seen": 36872272, + "step": 174725 + }, + { + "epoch": 19.22222222222222, + "grad_norm": 0.4727206528186798, + "learning_rate": 2.3005488130176112e-07, + "loss": 0.0228, + "num_input_tokens_seen": 36873328, + "step": 174730 + }, + { + "epoch": 19.222772277227723, + "grad_norm": 0.011401820927858353, + "learning_rate": 2.2973014550485562e-07, + "loss": 0.1331, + "num_input_tokens_seen": 36874384, + "step": 174735 + }, + { + "epoch": 19.223322332233224, + "grad_norm": 0.1071702316403389, + "learning_rate": 2.2940563800364078e-07, + "loss": 0.0039, + "num_input_tokens_seen": 36875472, + "step": 174740 + }, + { + "epoch": 19.223872387238725, + "grad_norm": 0.008641659282147884, + "learning_rate": 2.2908135880110871e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36876528, + "step": 174745 + }, + { + "epoch": 19.224422442244226, + "grad_norm": 0.0037471940740942955, + "learning_rate": 2.2875730790024585e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36877616, + "step": 174750 + }, + { + "epoch": 19.224972497249723, + "grad_norm": 0.0246987696737051, + "learning_rate": 2.2843348530404153e-07, + "loss": 0.0431, + "num_input_tokens_seen": 36878736, + "step": 174755 + }, + { + "epoch": 19.225522552255224, + "grad_norm": 0.04244719818234444, + "learning_rate": 2.2810989101547942e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36879728, + "step": 174760 + }, + { + "epoch": 19.226072607260726, + "grad_norm": 0.014510276727378368, + "learning_rate": 2.2778652503754328e-07, + "loss": 0.0201, + "num_input_tokens_seen": 36880784, + "step": 174765 + }, + { + "epoch": 19.226622662266227, + "grad_norm": 0.04515164718031883, + "learning_rate": 2.2746338737321127e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36881776, + "step": 174770 + }, + { + "epoch": 19.227172717271728, + "grad_norm": 0.028665989637374878, + "learning_rate": 2.271404780254588e-07, + "loss": 0.0046, + "num_input_tokens_seen": 36882832, + "step": 174775 + }, + { + "epoch": 19.22772277227723, + "grad_norm": 0.04978117719292641, + "learning_rate": 2.2681779699726958e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36883920, + "step": 174780 + }, + { + "epoch": 19.22827282728273, + "grad_norm": 1.0601913928985596, + "learning_rate": 2.2649534429161067e-07, + "loss": 0.145, + "num_input_tokens_seen": 36885008, + "step": 174785 + }, + { + "epoch": 19.228822882288227, + "grad_norm": 0.020037934184074402, + "learning_rate": 2.261731199114575e-07, + "loss": 0.0241, + "num_input_tokens_seen": 36886064, + "step": 174790 + }, + { + "epoch": 19.22937293729373, + "grad_norm": 0.09768587350845337, + "learning_rate": 2.2585112385977992e-07, + "loss": 0.0087, + "num_input_tokens_seen": 36887120, + "step": 174795 + }, + { + "epoch": 19.22992299229923, + "grad_norm": 0.037551648914813995, + "learning_rate": 2.2552935613954495e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36888208, + "step": 174800 + }, + { + "epoch": 19.23047304730473, + "grad_norm": 0.02055666409432888, + "learning_rate": 2.2520781675371695e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36889264, + "step": 174805 + }, + { + "epoch": 19.231023102310232, + "grad_norm": 0.034953225404024124, + "learning_rate": 2.2488650570526014e-07, + "loss": 0.005, + "num_input_tokens_seen": 36890256, + "step": 174810 + }, + { + "epoch": 19.231573157315733, + "grad_norm": 0.03848142549395561, + "learning_rate": 2.2456542299713613e-07, + "loss": 0.0313, + "num_input_tokens_seen": 36891280, + "step": 174815 + }, + { + "epoch": 19.23212321232123, + "grad_norm": 0.029178021475672722, + "learning_rate": 2.2424456863230636e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36892304, + "step": 174820 + }, + { + "epoch": 19.23267326732673, + "grad_norm": 0.08693068474531174, + "learning_rate": 2.2392394261372128e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36893424, + "step": 174825 + }, + { + "epoch": 19.233223322332233, + "grad_norm": 0.002558633917942643, + "learning_rate": 2.236035449443452e-07, + "loss": 0.0084, + "num_input_tokens_seen": 36894512, + "step": 174830 + }, + { + "epoch": 19.233773377337734, + "grad_norm": 0.008745817467570305, + "learning_rate": 2.2328337562712576e-07, + "loss": 0.0625, + "num_input_tokens_seen": 36895568, + "step": 174835 + }, + { + "epoch": 19.234323432343235, + "grad_norm": 0.01689988374710083, + "learning_rate": 2.2296343466501057e-07, + "loss": 0.0391, + "num_input_tokens_seen": 36896624, + "step": 174840 + }, + { + "epoch": 19.234873487348736, + "grad_norm": 0.015275629237294197, + "learning_rate": 2.2264372206095563e-07, + "loss": 0.002, + "num_input_tokens_seen": 36897744, + "step": 174845 + }, + { + "epoch": 19.235423542354237, + "grad_norm": 1.5261588096618652, + "learning_rate": 2.2232423781790302e-07, + "loss": 0.1348, + "num_input_tokens_seen": 36898832, + "step": 174850 + }, + { + "epoch": 19.235973597359735, + "grad_norm": 0.0067211901769042015, + "learning_rate": 2.2200498193880038e-07, + "loss": 0.0512, + "num_input_tokens_seen": 36899824, + "step": 174855 + }, + { + "epoch": 19.236523652365236, + "grad_norm": 2.8920581340789795, + "learning_rate": 2.2168595442658703e-07, + "loss": 0.1132, + "num_input_tokens_seen": 36900912, + "step": 174860 + }, + { + "epoch": 19.237073707370737, + "grad_norm": 0.14494144916534424, + "learning_rate": 2.2136715528420227e-07, + "loss": 0.0285, + "num_input_tokens_seen": 36901968, + "step": 174865 + }, + { + "epoch": 19.237623762376238, + "grad_norm": 0.0041460213251411915, + "learning_rate": 2.2104858451458822e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36903024, + "step": 174870 + }, + { + "epoch": 19.23817381738174, + "grad_norm": 2.355684757232666, + "learning_rate": 2.2073024212068139e-07, + "loss": 0.0159, + "num_input_tokens_seen": 36904112, + "step": 174875 + }, + { + "epoch": 19.23872387238724, + "grad_norm": 0.33849042654037476, + "learning_rate": 2.2041212810541e-07, + "loss": 0.0072, + "num_input_tokens_seen": 36905104, + "step": 174880 + }, + { + "epoch": 19.239273927392738, + "grad_norm": 0.05121717229485512, + "learning_rate": 2.200942424717134e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36906192, + "step": 174885 + }, + { + "epoch": 19.23982398239824, + "grad_norm": 0.026316581293940544, + "learning_rate": 2.197765852225142e-07, + "loss": 0.0043, + "num_input_tokens_seen": 36907248, + "step": 174890 + }, + { + "epoch": 19.24037403740374, + "grad_norm": 0.03354320675134659, + "learning_rate": 2.1945915636074621e-07, + "loss": 0.1226, + "num_input_tokens_seen": 36908368, + "step": 174895 + }, + { + "epoch": 19.24092409240924, + "grad_norm": 0.01784500479698181, + "learning_rate": 2.1914195588932929e-07, + "loss": 0.0053, + "num_input_tokens_seen": 36909424, + "step": 174900 + }, + { + "epoch": 19.241474147414742, + "grad_norm": 0.006698834244161844, + "learning_rate": 2.1882498381119164e-07, + "loss": 0.0052, + "num_input_tokens_seen": 36910480, + "step": 174905 + }, + { + "epoch": 19.242024202420243, + "grad_norm": 0.004828437697142363, + "learning_rate": 2.185082401292532e-07, + "loss": 0.0013, + "num_input_tokens_seen": 36911536, + "step": 174910 + }, + { + "epoch": 19.242574257425744, + "grad_norm": 0.0076648881658911705, + "learning_rate": 2.1819172484643103e-07, + "loss": 0.0884, + "num_input_tokens_seen": 36912624, + "step": 174915 + }, + { + "epoch": 19.24312431243124, + "grad_norm": 0.324543833732605, + "learning_rate": 2.178754379656478e-07, + "loss": 0.0055, + "num_input_tokens_seen": 36913744, + "step": 174920 + }, + { + "epoch": 19.243674367436743, + "grad_norm": 0.049196504056453705, + "learning_rate": 2.175593794898123e-07, + "loss": 0.0391, + "num_input_tokens_seen": 36914800, + "step": 174925 + }, + { + "epoch": 19.244224422442244, + "grad_norm": 0.03904476761817932, + "learning_rate": 2.1724354942183889e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36915856, + "step": 174930 + }, + { + "epoch": 19.244774477447745, + "grad_norm": 1.8988382816314697, + "learning_rate": 2.1692794776464463e-07, + "loss": 0.0662, + "num_input_tokens_seen": 36916944, + "step": 174935 + }, + { + "epoch": 19.245324532453246, + "grad_norm": 0.018124397844076157, + "learning_rate": 2.1661257452113003e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36917968, + "step": 174940 + }, + { + "epoch": 19.245874587458747, + "grad_norm": 0.02128497138619423, + "learning_rate": 2.1629742969420662e-07, + "loss": 0.0027, + "num_input_tokens_seen": 36919024, + "step": 174945 + }, + { + "epoch": 19.246424642464245, + "grad_norm": 0.11369503289461136, + "learning_rate": 2.1598251328677488e-07, + "loss": 0.0373, + "num_input_tokens_seen": 36920112, + "step": 174950 + }, + { + "epoch": 19.246974697469746, + "grad_norm": 0.018175747245550156, + "learning_rate": 2.156678253017408e-07, + "loss": 0.0392, + "num_input_tokens_seen": 36921168, + "step": 174955 + }, + { + "epoch": 19.247524752475247, + "grad_norm": 0.004892248660326004, + "learning_rate": 2.1535336574200482e-07, + "loss": 0.1094, + "num_input_tokens_seen": 36922320, + "step": 174960 + }, + { + "epoch": 19.248074807480748, + "grad_norm": 1.8641258478164673, + "learning_rate": 2.1503913461046187e-07, + "loss": 0.1919, + "num_input_tokens_seen": 36923344, + "step": 174965 + }, + { + "epoch": 19.24862486248625, + "grad_norm": 0.04649665206670761, + "learning_rate": 2.147251319100152e-07, + "loss": 0.0628, + "num_input_tokens_seen": 36924400, + "step": 174970 + }, + { + "epoch": 19.24917491749175, + "grad_norm": 0.02554737962782383, + "learning_rate": 2.1441135764355136e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36925456, + "step": 174975 + }, + { + "epoch": 19.24972497249725, + "grad_norm": 0.13845740258693695, + "learning_rate": 2.140978118139625e-07, + "loss": 0.0654, + "num_input_tokens_seen": 36926544, + "step": 174980 + }, + { + "epoch": 19.25027502750275, + "grad_norm": 1.419085144996643, + "learning_rate": 2.1378449442414073e-07, + "loss": 0.0241, + "num_input_tokens_seen": 36927600, + "step": 174985 + }, + { + "epoch": 19.25082508250825, + "grad_norm": 0.6138274073600769, + "learning_rate": 2.1347140547697543e-07, + "loss": 0.0512, + "num_input_tokens_seen": 36928624, + "step": 174990 + }, + { + "epoch": 19.25137513751375, + "grad_norm": 0.055987242609262466, + "learning_rate": 2.1315854497535036e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36929712, + "step": 174995 + }, + { + "epoch": 19.251925192519252, + "grad_norm": 0.0068481056950986385, + "learning_rate": 2.128459129221494e-07, + "loss": 0.0989, + "num_input_tokens_seen": 36930768, + "step": 175000 + }, + { + "epoch": 19.252475247524753, + "grad_norm": 0.011346457526087761, + "learning_rate": 2.125335093202535e-07, + "loss": 0.0046, + "num_input_tokens_seen": 36931888, + "step": 175005 + }, + { + "epoch": 19.253025302530254, + "grad_norm": 1.9118691682815552, + "learning_rate": 2.1222133417254098e-07, + "loss": 0.052, + "num_input_tokens_seen": 36932880, + "step": 175010 + }, + { + "epoch": 19.253575357535752, + "grad_norm": 0.012592912651598454, + "learning_rate": 2.1190938748189004e-07, + "loss": 0.001, + "num_input_tokens_seen": 36933904, + "step": 175015 + }, + { + "epoch": 19.254125412541253, + "grad_norm": 0.005737646017223597, + "learning_rate": 2.115976692511762e-07, + "loss": 0.0021, + "num_input_tokens_seen": 36935088, + "step": 175020 + }, + { + "epoch": 19.254675467546754, + "grad_norm": 0.037640273571014404, + "learning_rate": 2.112861794832749e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36936144, + "step": 175025 + }, + { + "epoch": 19.255225522552255, + "grad_norm": 4.277646541595459, + "learning_rate": 2.1097491818105053e-07, + "loss": 0.0359, + "num_input_tokens_seen": 36937168, + "step": 175030 + }, + { + "epoch": 19.255775577557756, + "grad_norm": 0.0045402562245726585, + "learning_rate": 2.106638853473758e-07, + "loss": 0.0024, + "num_input_tokens_seen": 36938224, + "step": 175035 + }, + { + "epoch": 19.256325632563257, + "grad_norm": 0.009054077789187431, + "learning_rate": 2.1035308098511785e-07, + "loss": 0.0022, + "num_input_tokens_seen": 36939216, + "step": 175040 + }, + { + "epoch": 19.25687568756876, + "grad_norm": 0.03163662552833557, + "learning_rate": 2.100425050971383e-07, + "loss": 0.0311, + "num_input_tokens_seen": 36940336, + "step": 175045 + }, + { + "epoch": 19.257425742574256, + "grad_norm": 0.018297674134373665, + "learning_rate": 2.0973215768630427e-07, + "loss": 0.0007, + "num_input_tokens_seen": 36941360, + "step": 175050 + }, + { + "epoch": 19.257975797579757, + "grad_norm": 0.022106854245066643, + "learning_rate": 2.0942203875547185e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36942384, + "step": 175055 + }, + { + "epoch": 19.258525852585258, + "grad_norm": 0.020965054631233215, + "learning_rate": 2.091121483075026e-07, + "loss": 0.0009, + "num_input_tokens_seen": 36943504, + "step": 175060 + }, + { + "epoch": 19.25907590759076, + "grad_norm": 0.14193123579025269, + "learning_rate": 2.088024863452498e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36944496, + "step": 175065 + }, + { + "epoch": 19.25962596259626, + "grad_norm": 0.00495888339355588, + "learning_rate": 2.084930528715695e-07, + "loss": 0.0026, + "num_input_tokens_seen": 36945552, + "step": 175070 + }, + { + "epoch": 19.26017601760176, + "grad_norm": 0.11161026358604431, + "learning_rate": 2.0818384788930945e-07, + "loss": 0.0085, + "num_input_tokens_seen": 36946608, + "step": 175075 + }, + { + "epoch": 19.260726072607262, + "grad_norm": 0.00620867358520627, + "learning_rate": 2.0787487140132568e-07, + "loss": 0.0034, + "num_input_tokens_seen": 36947664, + "step": 175080 + }, + { + "epoch": 19.26127612761276, + "grad_norm": 0.7217113375663757, + "learning_rate": 2.075661234104631e-07, + "loss": 0.0853, + "num_input_tokens_seen": 36948720, + "step": 175085 + }, + { + "epoch": 19.26182618261826, + "grad_norm": 0.0553266704082489, + "learning_rate": 2.0725760391956396e-07, + "loss": 0.1951, + "num_input_tokens_seen": 36949744, + "step": 175090 + }, + { + "epoch": 19.262376237623762, + "grad_norm": 0.008171943947672844, + "learning_rate": 2.0694931293147591e-07, + "loss": 0.0014, + "num_input_tokens_seen": 36950832, + "step": 175095 + }, + { + "epoch": 19.262926292629263, + "grad_norm": 0.10622067004442215, + "learning_rate": 2.0664125044904113e-07, + "loss": 0.0017, + "num_input_tokens_seen": 36951888, + "step": 175100 + }, + { + "epoch": 19.263476347634764, + "grad_norm": 0.24838797748088837, + "learning_rate": 2.0633341647509353e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36952944, + "step": 175105 + }, + { + "epoch": 19.264026402640265, + "grad_norm": 0.4355107247829437, + "learning_rate": 2.0602581101247798e-07, + "loss": 0.0266, + "num_input_tokens_seen": 36953968, + "step": 175110 + }, + { + "epoch": 19.264576457645763, + "grad_norm": 0.017492633312940598, + "learning_rate": 2.057184340640228e-07, + "loss": 0.0019, + "num_input_tokens_seen": 36955024, + "step": 175115 + }, + { + "epoch": 19.265126512651264, + "grad_norm": 0.20450514554977417, + "learning_rate": 2.0541128563256184e-07, + "loss": 0.0033, + "num_input_tokens_seen": 36956080, + "step": 175120 + }, + { + "epoch": 19.265676567656765, + "grad_norm": 0.3058244585990906, + "learning_rate": 2.0510436572093172e-07, + "loss": 0.0388, + "num_input_tokens_seen": 36957072, + "step": 175125 + }, + { + "epoch": 19.266226622662266, + "grad_norm": 0.28797534108161926, + "learning_rate": 2.047976743319524e-07, + "loss": 0.0039, + "num_input_tokens_seen": 36958128, + "step": 175130 + }, + { + "epoch": 19.266776677667767, + "grad_norm": 3.581085681915283, + "learning_rate": 2.0449121146845774e-07, + "loss": 0.1235, + "num_input_tokens_seen": 36959120, + "step": 175135 + }, + { + "epoch": 19.26732673267327, + "grad_norm": 0.010774869471788406, + "learning_rate": 2.0418497713326768e-07, + "loss": 0.0064, + "num_input_tokens_seen": 36960144, + "step": 175140 + }, + { + "epoch": 19.26787678767877, + "grad_norm": 0.03863111510872841, + "learning_rate": 2.0387897132920774e-07, + "loss": 0.0927, + "num_input_tokens_seen": 36961136, + "step": 175145 + }, + { + "epoch": 19.268426842684267, + "grad_norm": 1.5559614896774292, + "learning_rate": 2.0357319405909792e-07, + "loss": 0.0196, + "num_input_tokens_seen": 36962160, + "step": 175150 + }, + { + "epoch": 19.268976897689768, + "grad_norm": 0.0937264934182167, + "learning_rate": 2.0326764532575537e-07, + "loss": 0.0031, + "num_input_tokens_seen": 36963216, + "step": 175155 + }, + { + "epoch": 19.26952695269527, + "grad_norm": 0.050419051200151443, + "learning_rate": 2.0296232513199452e-07, + "loss": 0.0012, + "num_input_tokens_seen": 36964240, + "step": 175160 + }, + { + "epoch": 19.27007700770077, + "grad_norm": 0.018405117094516754, + "learning_rate": 2.026572334806326e-07, + "loss": 0.0712, + "num_input_tokens_seen": 36965264, + "step": 175165 + }, + { + "epoch": 19.27062706270627, + "grad_norm": 0.10862896591424942, + "learning_rate": 2.023523703744784e-07, + "loss": 0.0016, + "num_input_tokens_seen": 36966352, + "step": 175170 + }, + { + "epoch": 19.271177117711773, + "grad_norm": 0.037827324122190475, + "learning_rate": 2.0204773581634362e-07, + "loss": 0.0941, + "num_input_tokens_seen": 36967440, + "step": 175175 + }, + { + "epoch": 19.27172717271727, + "grad_norm": 0.060309480875730515, + "learning_rate": 2.017433298090371e-07, + "loss": 0.042, + "num_input_tokens_seen": 36968464, + "step": 175180 + }, + { + "epoch": 19.27227722772277, + "grad_norm": 0.031998973339796066, + "learning_rate": 2.0143915235536214e-07, + "loss": 0.1441, + "num_input_tokens_seen": 36969488, + "step": 175185 + }, + { + "epoch": 19.272827282728272, + "grad_norm": 0.12855778634548187, + "learning_rate": 2.0113520345812209e-07, + "loss": 0.0043, + "num_input_tokens_seen": 36970608, + "step": 175190 + }, + { + "epoch": 19.273377337733773, + "grad_norm": 0.043182794004678726, + "learning_rate": 2.0083148312012024e-07, + "loss": 0.0023, + "num_input_tokens_seen": 36971600, + "step": 175195 + }, + { + "epoch": 19.273927392739274, + "grad_norm": 0.020955968648195267, + "learning_rate": 2.0052799134415433e-07, + "loss": 0.0553, + "num_input_tokens_seen": 36972752, + "step": 175200 + }, + { + "epoch": 19.274477447744776, + "grad_norm": 0.0251277144998312, + "learning_rate": 2.0022472813302218e-07, + "loss": 0.0781, + "num_input_tokens_seen": 36973808, + "step": 175205 + }, + { + "epoch": 19.275027502750277, + "grad_norm": 0.8386856913566589, + "learning_rate": 1.999216934895215e-07, + "loss": 0.0775, + "num_input_tokens_seen": 36974832, + "step": 175210 + }, + { + "epoch": 19.275577557755774, + "grad_norm": 0.006236120592802763, + "learning_rate": 1.9961888741643898e-07, + "loss": 0.0092, + "num_input_tokens_seen": 36975920, + "step": 175215 + }, + { + "epoch": 19.276127612761275, + "grad_norm": 0.2126489132642746, + "learning_rate": 1.993163099165668e-07, + "loss": 0.0074, + "num_input_tokens_seen": 36977008, + "step": 175220 + }, + { + "epoch": 19.276677667766776, + "grad_norm": 0.11062343418598175, + "learning_rate": 1.9901396099269998e-07, + "loss": 0.0054, + "num_input_tokens_seen": 36978064, + "step": 175225 + }, + { + "epoch": 19.277227722772277, + "grad_norm": 0.002007989911362529, + "learning_rate": 1.987118406476196e-07, + "loss": 0.0561, + "num_input_tokens_seen": 36979120, + "step": 175230 + }, + { + "epoch": 19.27777777777778, + "grad_norm": 0.4882398545742035, + "learning_rate": 1.9840994888411236e-07, + "loss": 0.0062, + "num_input_tokens_seen": 36980176, + "step": 175235 + }, + { + "epoch": 19.27832783278328, + "grad_norm": 1.715798258781433, + "learning_rate": 1.981082857049593e-07, + "loss": 0.0658, + "num_input_tokens_seen": 36981232, + "step": 175240 + }, + { + "epoch": 19.278877887788777, + "grad_norm": 0.01118361484259367, + "learning_rate": 1.9780685111293883e-07, + "loss": 0.0018, + "num_input_tokens_seen": 36982256, + "step": 175245 + }, + { + "epoch": 19.27942794279428, + "grad_norm": 0.025665301829576492, + "learning_rate": 1.9750564511083202e-07, + "loss": 0.0067, + "num_input_tokens_seen": 36983344, + "step": 175250 + }, + { + "epoch": 19.27997799779978, + "grad_norm": 0.1692492663860321, + "learning_rate": 1.9720466770141444e-07, + "loss": 0.0074, + "num_input_tokens_seen": 36984432, + "step": 175255 + }, + { + "epoch": 19.28052805280528, + "grad_norm": 0.009980502538383007, + "learning_rate": 1.9690391888745886e-07, + "loss": 0.1389, + "num_input_tokens_seen": 36985488, + "step": 175260 + }, + { + "epoch": 19.28107810781078, + "grad_norm": 0.3688068687915802, + "learning_rate": 1.9660339867174082e-07, + "loss": 0.0586, + "num_input_tokens_seen": 36986640, + "step": 175265 + }, + { + "epoch": 19.281628162816283, + "grad_norm": 0.19794908165931702, + "learning_rate": 1.963031070570276e-07, + "loss": 0.0904, + "num_input_tokens_seen": 36987696, + "step": 175270 + }, + { + "epoch": 19.282178217821784, + "grad_norm": 0.09379339218139648, + "learning_rate": 1.960030440460836e-07, + "loss": 0.0809, + "num_input_tokens_seen": 36988784, + "step": 175275 + }, + { + "epoch": 19.28272827282728, + "grad_norm": 0.020759854465723038, + "learning_rate": 1.9570320964167887e-07, + "loss": 0.0577, + "num_input_tokens_seen": 36989904, + "step": 175280 + }, + { + "epoch": 19.283278327832782, + "grad_norm": 0.047046419233083725, + "learning_rate": 1.9540360384657508e-07, + "loss": 0.0044, + "num_input_tokens_seen": 36990992, + "step": 175285 + }, + { + "epoch": 19.283828382838283, + "grad_norm": 0.09488464891910553, + "learning_rate": 1.9510422666353668e-07, + "loss": 0.0043, + "num_input_tokens_seen": 36992080, + "step": 175290 + }, + { + "epoch": 19.284378437843785, + "grad_norm": 0.10760725289583206, + "learning_rate": 1.94805078095317e-07, + "loss": 0.0295, + "num_input_tokens_seen": 36993104, + "step": 175295 + }, + { + "epoch": 19.284928492849286, + "grad_norm": 0.04299599304795265, + "learning_rate": 1.9450615814467775e-07, + "loss": 0.0023, + "num_input_tokens_seen": 36994224, + "step": 175300 + }, + { + "epoch": 19.285478547854787, + "grad_norm": 0.022349663078784943, + "learning_rate": 1.9420746681436951e-07, + "loss": 0.0262, + "num_input_tokens_seen": 36995248, + "step": 175305 + }, + { + "epoch": 19.286028602860284, + "grad_norm": 0.012176021002233028, + "learning_rate": 1.9390900410715118e-07, + "loss": 0.0008, + "num_input_tokens_seen": 36996368, + "step": 175310 + }, + { + "epoch": 19.286578657865785, + "grad_norm": 0.05546221509575844, + "learning_rate": 1.9361077002577054e-07, + "loss": 0.0938, + "num_input_tokens_seen": 36997456, + "step": 175315 + }, + { + "epoch": 19.287128712871286, + "grad_norm": 0.078874371945858, + "learning_rate": 1.9331276457297543e-07, + "loss": 0.0193, + "num_input_tokens_seen": 36998544, + "step": 175320 + }, + { + "epoch": 19.287678767876788, + "grad_norm": 3.1726574897766113, + "learning_rate": 1.9301498775151361e-07, + "loss": 0.0728, + "num_input_tokens_seen": 36999536, + "step": 175325 + }, + { + "epoch": 19.28822882288229, + "grad_norm": 0.06949102133512497, + "learning_rate": 1.9271743956412735e-07, + "loss": 0.0031, + "num_input_tokens_seen": 37000592, + "step": 175330 + }, + { + "epoch": 19.28877887788779, + "grad_norm": 1.515303134918213, + "learning_rate": 1.924201200135617e-07, + "loss": 0.0101, + "num_input_tokens_seen": 37001616, + "step": 175335 + }, + { + "epoch": 19.28932893289329, + "grad_norm": 0.029679417610168457, + "learning_rate": 1.9212302910255608e-07, + "loss": 0.0019, + "num_input_tokens_seen": 37002704, + "step": 175340 + }, + { + "epoch": 19.28987898789879, + "grad_norm": 0.007326523307710886, + "learning_rate": 1.9182616683384724e-07, + "loss": 0.032, + "num_input_tokens_seen": 37003760, + "step": 175345 + }, + { + "epoch": 19.29042904290429, + "grad_norm": 0.02231600508093834, + "learning_rate": 1.9152953321017465e-07, + "loss": 0.0086, + "num_input_tokens_seen": 37004816, + "step": 175350 + }, + { + "epoch": 19.29097909790979, + "grad_norm": 1.697514533996582, + "learning_rate": 1.9123312823426942e-07, + "loss": 0.0655, + "num_input_tokens_seen": 37005872, + "step": 175355 + }, + { + "epoch": 19.29152915291529, + "grad_norm": 0.039504971355199814, + "learning_rate": 1.9093695190886274e-07, + "loss": 0.0179, + "num_input_tokens_seen": 37006896, + "step": 175360 + }, + { + "epoch": 19.292079207920793, + "grad_norm": 0.0803009420633316, + "learning_rate": 1.9064100423668852e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37007984, + "step": 175365 + }, + { + "epoch": 19.292629262926294, + "grad_norm": 0.013473526574671268, + "learning_rate": 1.903452852204668e-07, + "loss": 0.0563, + "num_input_tokens_seen": 37008976, + "step": 175370 + }, + { + "epoch": 19.293179317931795, + "grad_norm": 0.08145412802696228, + "learning_rate": 1.900497948629315e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37010096, + "step": 175375 + }, + { + "epoch": 19.293729372937293, + "grad_norm": 0.027558552101254463, + "learning_rate": 1.897545331668027e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37011184, + "step": 175380 + }, + { + "epoch": 19.294279427942794, + "grad_norm": 0.004663028288632631, + "learning_rate": 1.894595001348004e-07, + "loss": 0.0034, + "num_input_tokens_seen": 37012208, + "step": 175385 + }, + { + "epoch": 19.294829482948295, + "grad_norm": 0.8995954990386963, + "learning_rate": 1.8916469576964192e-07, + "loss": 0.0053, + "num_input_tokens_seen": 37013296, + "step": 175390 + }, + { + "epoch": 19.295379537953796, + "grad_norm": 0.061795223504304886, + "learning_rate": 1.8887012007405004e-07, + "loss": 0.0208, + "num_input_tokens_seen": 37014352, + "step": 175395 + }, + { + "epoch": 19.295929592959297, + "grad_norm": 1.4089868068695068, + "learning_rate": 1.8857577305073648e-07, + "loss": 0.0238, + "num_input_tokens_seen": 37015344, + "step": 175400 + }, + { + "epoch": 19.296479647964798, + "grad_norm": 1.7999852895736694, + "learning_rate": 1.8828165470241576e-07, + "loss": 0.1117, + "num_input_tokens_seen": 37016400, + "step": 175405 + }, + { + "epoch": 19.297029702970296, + "grad_norm": 0.018750477582216263, + "learning_rate": 1.8798776503179404e-07, + "loss": 0.0102, + "num_input_tokens_seen": 37017424, + "step": 175410 + }, + { + "epoch": 19.297579757975797, + "grad_norm": 0.013933415524661541, + "learning_rate": 1.8769410404158582e-07, + "loss": 0.0014, + "num_input_tokens_seen": 37018480, + "step": 175415 + }, + { + "epoch": 19.298129812981298, + "grad_norm": 2.3900272846221924, + "learning_rate": 1.8740067173449173e-07, + "loss": 0.0831, + "num_input_tokens_seen": 37019536, + "step": 175420 + }, + { + "epoch": 19.2986798679868, + "grad_norm": 0.05094137042760849, + "learning_rate": 1.8710746811322066e-07, + "loss": 0.0019, + "num_input_tokens_seen": 37020592, + "step": 175425 + }, + { + "epoch": 19.2992299229923, + "grad_norm": 0.005602238699793816, + "learning_rate": 1.8681449318047605e-07, + "loss": 0.0188, + "num_input_tokens_seen": 37021616, + "step": 175430 + }, + { + "epoch": 19.2997799779978, + "grad_norm": 0.01656026393175125, + "learning_rate": 1.8652174693895298e-07, + "loss": 0.0165, + "num_input_tokens_seen": 37022704, + "step": 175435 + }, + { + "epoch": 19.300330033003302, + "grad_norm": 0.13508152961730957, + "learning_rate": 1.8622922939135202e-07, + "loss": 0.0342, + "num_input_tokens_seen": 37023760, + "step": 175440 + }, + { + "epoch": 19.3008800880088, + "grad_norm": 0.38302695751190186, + "learning_rate": 1.8593694054037102e-07, + "loss": 0.0912, + "num_input_tokens_seen": 37024784, + "step": 175445 + }, + { + "epoch": 19.3014301430143, + "grad_norm": 0.00741580268368125, + "learning_rate": 1.8564488038870231e-07, + "loss": 0.1334, + "num_input_tokens_seen": 37025808, + "step": 175450 + }, + { + "epoch": 19.301980198019802, + "grad_norm": 0.09629593789577484, + "learning_rate": 1.8535304893903816e-07, + "loss": 0.0038, + "num_input_tokens_seen": 37026832, + "step": 175455 + }, + { + "epoch": 19.302530253025303, + "grad_norm": 0.008910398930311203, + "learning_rate": 1.850614461940653e-07, + "loss": 0.0453, + "num_input_tokens_seen": 37027952, + "step": 175460 + }, + { + "epoch": 19.303080308030804, + "grad_norm": 0.04421469196677208, + "learning_rate": 1.8477007215647603e-07, + "loss": 0.0208, + "num_input_tokens_seen": 37029040, + "step": 175465 + }, + { + "epoch": 19.303630363036305, + "grad_norm": 0.048655152320861816, + "learning_rate": 1.844789268289543e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37030064, + "step": 175470 + }, + { + "epoch": 19.304180418041803, + "grad_norm": 0.2097608745098114, + "learning_rate": 1.8418801021417852e-07, + "loss": 0.0048, + "num_input_tokens_seen": 37031120, + "step": 175475 + }, + { + "epoch": 19.304730473047304, + "grad_norm": 0.034895822405815125, + "learning_rate": 1.8389732231483825e-07, + "loss": 0.0101, + "num_input_tokens_seen": 37032240, + "step": 175480 + }, + { + "epoch": 19.305280528052805, + "grad_norm": 0.02091151662170887, + "learning_rate": 1.836068631336063e-07, + "loss": 0.0327, + "num_input_tokens_seen": 37033264, + "step": 175485 + }, + { + "epoch": 19.305830583058306, + "grad_norm": 0.03171311318874359, + "learning_rate": 1.8331663267316113e-07, + "loss": 0.0012, + "num_input_tokens_seen": 37034288, + "step": 175490 + }, + { + "epoch": 19.306380638063807, + "grad_norm": 0.005822432227432728, + "learning_rate": 1.830266309361811e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37035312, + "step": 175495 + }, + { + "epoch": 19.306930693069308, + "grad_norm": 0.4104596674442291, + "learning_rate": 1.8273685792533356e-07, + "loss": 0.0063, + "num_input_tokens_seen": 37036400, + "step": 175500 + }, + { + "epoch": 19.30748074807481, + "grad_norm": 0.0929994061589241, + "learning_rate": 1.8244731364329414e-07, + "loss": 0.0039, + "num_input_tokens_seen": 37037424, + "step": 175505 + }, + { + "epoch": 19.308030803080307, + "grad_norm": 0.30444493889808655, + "learning_rate": 1.8215799809272737e-07, + "loss": 0.0064, + "num_input_tokens_seen": 37038512, + "step": 175510 + }, + { + "epoch": 19.308580858085808, + "grad_norm": 0.08772522211074829, + "learning_rate": 1.8186891127630333e-07, + "loss": 0.0055, + "num_input_tokens_seen": 37039600, + "step": 175515 + }, + { + "epoch": 19.30913091309131, + "grad_norm": 0.2978237271308899, + "learning_rate": 1.815800531966838e-07, + "loss": 0.0031, + "num_input_tokens_seen": 37040656, + "step": 175520 + }, + { + "epoch": 19.30968096809681, + "grad_norm": 0.027038006111979485, + "learning_rate": 1.8129142385653052e-07, + "loss": 0.0114, + "num_input_tokens_seen": 37041744, + "step": 175525 + }, + { + "epoch": 19.31023102310231, + "grad_norm": 0.10199148952960968, + "learning_rate": 1.8100302325850805e-07, + "loss": 0.0287, + "num_input_tokens_seen": 37042800, + "step": 175530 + }, + { + "epoch": 19.310781078107812, + "grad_norm": 0.03624369949102402, + "learning_rate": 1.80714851405267e-07, + "loss": 0.0059, + "num_input_tokens_seen": 37043888, + "step": 175535 + }, + { + "epoch": 19.31133113311331, + "grad_norm": 0.07864214479923248, + "learning_rate": 1.8042690829947195e-07, + "loss": 0.0846, + "num_input_tokens_seen": 37044976, + "step": 175540 + }, + { + "epoch": 19.31188118811881, + "grad_norm": 0.03507888317108154, + "learning_rate": 1.8013919394377076e-07, + "loss": 0.0042, + "num_input_tokens_seen": 37046032, + "step": 175545 + }, + { + "epoch": 19.312431243124312, + "grad_norm": 0.2705015242099762, + "learning_rate": 1.7985170834081688e-07, + "loss": 0.0369, + "num_input_tokens_seen": 37047088, + "step": 175550 + }, + { + "epoch": 19.312981298129813, + "grad_norm": 0.003331294283270836, + "learning_rate": 1.7956445149326096e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37048208, + "step": 175555 + }, + { + "epoch": 19.313531353135314, + "grad_norm": 0.02620864473283291, + "learning_rate": 1.792774234037481e-07, + "loss": 0.0499, + "num_input_tokens_seen": 37049232, + "step": 175560 + }, + { + "epoch": 19.314081408140815, + "grad_norm": 0.007617556490004063, + "learning_rate": 1.7899062407492618e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37050256, + "step": 175565 + }, + { + "epoch": 19.314631463146316, + "grad_norm": 0.018040627241134644, + "learning_rate": 1.7870405350943475e-07, + "loss": 0.0029, + "num_input_tokens_seen": 37051312, + "step": 175570 + }, + { + "epoch": 19.315181518151814, + "grad_norm": 0.026245208457112312, + "learning_rate": 1.7841771170991895e-07, + "loss": 0.009, + "num_input_tokens_seen": 37052336, + "step": 175575 + }, + { + "epoch": 19.315731573157315, + "grad_norm": 0.021313359960913658, + "learning_rate": 1.781315986790183e-07, + "loss": 0.0139, + "num_input_tokens_seen": 37053392, + "step": 175580 + }, + { + "epoch": 19.316281628162816, + "grad_norm": 0.011582656763494015, + "learning_rate": 1.7784571441936682e-07, + "loss": 0.0381, + "num_input_tokens_seen": 37054416, + "step": 175585 + }, + { + "epoch": 19.316831683168317, + "grad_norm": 0.3117642104625702, + "learning_rate": 1.7756005893360128e-07, + "loss": 0.0464, + "num_input_tokens_seen": 37055408, + "step": 175590 + }, + { + "epoch": 19.317381738173818, + "grad_norm": 0.023295575752854347, + "learning_rate": 1.7727463222435292e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37056400, + "step": 175595 + }, + { + "epoch": 19.31793179317932, + "grad_norm": 0.057576268911361694, + "learning_rate": 1.7698943429425296e-07, + "loss": 0.0389, + "num_input_tokens_seen": 37057424, + "step": 175600 + }, + { + "epoch": 19.318481848184817, + "grad_norm": 0.09321672469377518, + "learning_rate": 1.7670446514593264e-07, + "loss": 0.0218, + "num_input_tokens_seen": 37058544, + "step": 175605 + }, + { + "epoch": 19.319031903190318, + "grad_norm": 0.06318436563014984, + "learning_rate": 1.7641972478201207e-07, + "loss": 0.0037, + "num_input_tokens_seen": 37059632, + "step": 175610 + }, + { + "epoch": 19.31958195819582, + "grad_norm": 0.016306398436427116, + "learning_rate": 1.761352132051225e-07, + "loss": 0.0061, + "num_input_tokens_seen": 37060752, + "step": 175615 + }, + { + "epoch": 19.32013201320132, + "grad_norm": 0.03675127401947975, + "learning_rate": 1.7585093041788126e-07, + "loss": 0.0014, + "num_input_tokens_seen": 37061808, + "step": 175620 + }, + { + "epoch": 19.32068206820682, + "grad_norm": 0.02327018231153488, + "learning_rate": 1.755668764229085e-07, + "loss": 0.0032, + "num_input_tokens_seen": 37062800, + "step": 175625 + }, + { + "epoch": 19.321232123212322, + "grad_norm": 0.02379833720624447, + "learning_rate": 1.7528305122282707e-07, + "loss": 0.004, + "num_input_tokens_seen": 37063888, + "step": 175630 + }, + { + "epoch": 19.321782178217823, + "grad_norm": 0.2135225087404251, + "learning_rate": 1.749994548202488e-07, + "loss": 0.0094, + "num_input_tokens_seen": 37064880, + "step": 175635 + }, + { + "epoch": 19.32233223322332, + "grad_norm": 1.9490481615066528, + "learning_rate": 1.747160872177883e-07, + "loss": 0.0928, + "num_input_tokens_seen": 37065968, + "step": 175640 + }, + { + "epoch": 19.322882288228822, + "grad_norm": 1.9431655406951904, + "learning_rate": 1.7443294841805735e-07, + "loss": 0.0627, + "num_input_tokens_seen": 37067056, + "step": 175645 + }, + { + "epoch": 19.323432343234323, + "grad_norm": 0.009242625907063484, + "learning_rate": 1.7415003842366496e-07, + "loss": 0.0211, + "num_input_tokens_seen": 37068144, + "step": 175650 + }, + { + "epoch": 19.323982398239824, + "grad_norm": 1.8311834335327148, + "learning_rate": 1.7386735723722013e-07, + "loss": 0.0093, + "num_input_tokens_seen": 37069168, + "step": 175655 + }, + { + "epoch": 19.324532453245325, + "grad_norm": 1.6444313526153564, + "learning_rate": 1.735849048613264e-07, + "loss": 0.0445, + "num_input_tokens_seen": 37070192, + "step": 175660 + }, + { + "epoch": 19.325082508250826, + "grad_norm": 0.12370791286230087, + "learning_rate": 1.7330268129858717e-07, + "loss": 0.0844, + "num_input_tokens_seen": 37071248, + "step": 175665 + }, + { + "epoch": 19.325632563256324, + "grad_norm": 0.02002282813191414, + "learning_rate": 1.7302068655160596e-07, + "loss": 0.0052, + "num_input_tokens_seen": 37072368, + "step": 175670 + }, + { + "epoch": 19.326182618261825, + "grad_norm": 0.008134102448821068, + "learning_rate": 1.727389206229779e-07, + "loss": 0.0047, + "num_input_tokens_seen": 37073392, + "step": 175675 + }, + { + "epoch": 19.326732673267326, + "grad_norm": 1.2990190982818604, + "learning_rate": 1.7245738351530372e-07, + "loss": 0.0796, + "num_input_tokens_seen": 37074416, + "step": 175680 + }, + { + "epoch": 19.327282728272827, + "grad_norm": 0.5495684742927551, + "learning_rate": 1.7217607523117573e-07, + "loss": 0.1108, + "num_input_tokens_seen": 37075472, + "step": 175685 + }, + { + "epoch": 19.32783278327833, + "grad_norm": 0.01649019494652748, + "learning_rate": 1.7189499577318634e-07, + "loss": 0.1082, + "num_input_tokens_seen": 37076560, + "step": 175690 + }, + { + "epoch": 19.32838283828383, + "grad_norm": 0.07495944201946259, + "learning_rate": 1.7161414514392793e-07, + "loss": 0.0183, + "num_input_tokens_seen": 37077648, + "step": 175695 + }, + { + "epoch": 19.32893289328933, + "grad_norm": 0.16577085852622986, + "learning_rate": 1.713335233459873e-07, + "loss": 0.0061, + "num_input_tokens_seen": 37078672, + "step": 175700 + }, + { + "epoch": 19.329482948294828, + "grad_norm": 0.0645725280046463, + "learning_rate": 1.7105313038195403e-07, + "loss": 0.002, + "num_input_tokens_seen": 37079760, + "step": 175705 + }, + { + "epoch": 19.33003300330033, + "grad_norm": 0.019452625885605812, + "learning_rate": 1.7077296625440664e-07, + "loss": 0.0135, + "num_input_tokens_seen": 37080848, + "step": 175710 + }, + { + "epoch": 19.33058305830583, + "grad_norm": 0.22194696962833405, + "learning_rate": 1.7049303096593193e-07, + "loss": 0.0937, + "num_input_tokens_seen": 37081936, + "step": 175715 + }, + { + "epoch": 19.33113311331133, + "grad_norm": 1.6169240474700928, + "learning_rate": 1.7021332451910843e-07, + "loss": 0.0144, + "num_input_tokens_seen": 37082992, + "step": 175720 + }, + { + "epoch": 19.331683168316832, + "grad_norm": 0.8393735289573669, + "learning_rate": 1.699338469165146e-07, + "loss": 0.0083, + "num_input_tokens_seen": 37084080, + "step": 175725 + }, + { + "epoch": 19.332233223322334, + "grad_norm": 0.05063242092728615, + "learning_rate": 1.6965459816072616e-07, + "loss": 0.002, + "num_input_tokens_seen": 37085104, + "step": 175730 + }, + { + "epoch": 19.33278327832783, + "grad_norm": 0.015131248161196709, + "learning_rate": 1.6937557825431884e-07, + "loss": 0.0114, + "num_input_tokens_seen": 37086160, + "step": 175735 + }, + { + "epoch": 19.333333333333332, + "grad_norm": 0.11179917305707932, + "learning_rate": 1.6909678719985723e-07, + "loss": 0.009, + "num_input_tokens_seen": 37087248, + "step": 175740 + }, + { + "epoch": 19.333883388338833, + "grad_norm": 0.0038389284163713455, + "learning_rate": 1.6881822499991706e-07, + "loss": 0.0055, + "num_input_tokens_seen": 37088304, + "step": 175745 + }, + { + "epoch": 19.334433443344334, + "grad_norm": 0.018131770193576813, + "learning_rate": 1.6853989165706575e-07, + "loss": 0.0038, + "num_input_tokens_seen": 37089392, + "step": 175750 + }, + { + "epoch": 19.334983498349835, + "grad_norm": 1.446636438369751, + "learning_rate": 1.682617871738651e-07, + "loss": 0.0207, + "num_input_tokens_seen": 37090384, + "step": 175755 + }, + { + "epoch": 19.335533553355337, + "grad_norm": 0.01192282885313034, + "learning_rate": 1.6798391155288252e-07, + "loss": 0.0079, + "num_input_tokens_seen": 37091472, + "step": 175760 + }, + { + "epoch": 19.336083608360838, + "grad_norm": 0.0023866272531449795, + "learning_rate": 1.6770626479667428e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37092496, + "step": 175765 + }, + { + "epoch": 19.336633663366335, + "grad_norm": 0.9317359924316406, + "learning_rate": 1.6742884690780225e-07, + "loss": 0.0343, + "num_input_tokens_seen": 37093520, + "step": 175770 + }, + { + "epoch": 19.337183718371836, + "grad_norm": 0.017612196505069733, + "learning_rate": 1.671516578888227e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37094576, + "step": 175775 + }, + { + "epoch": 19.337733773377337, + "grad_norm": 0.3979741930961609, + "learning_rate": 1.6687469774229193e-07, + "loss": 0.0287, + "num_input_tokens_seen": 37095632, + "step": 175780 + }, + { + "epoch": 19.33828382838284, + "grad_norm": 0.22728849947452545, + "learning_rate": 1.6659796647076064e-07, + "loss": 0.0037, + "num_input_tokens_seen": 37096656, + "step": 175785 + }, + { + "epoch": 19.33883388338834, + "grad_norm": 0.9203644394874573, + "learning_rate": 1.6632146407677685e-07, + "loss": 0.0033, + "num_input_tokens_seen": 37097744, + "step": 175790 + }, + { + "epoch": 19.33938393839384, + "grad_norm": 0.1601806879043579, + "learning_rate": 1.6604519056289403e-07, + "loss": 0.0031, + "num_input_tokens_seen": 37098832, + "step": 175795 + }, + { + "epoch": 19.33993399339934, + "grad_norm": 1.8344330787658691, + "learning_rate": 1.657691459316546e-07, + "loss": 0.1593, + "num_input_tokens_seen": 37099888, + "step": 175800 + }, + { + "epoch": 19.34048404840484, + "grad_norm": 0.08729586750268936, + "learning_rate": 1.6549333018560654e-07, + "loss": 0.0243, + "num_input_tokens_seen": 37100976, + "step": 175805 + }, + { + "epoch": 19.34103410341034, + "grad_norm": 0.3368525207042694, + "learning_rate": 1.6521774332728947e-07, + "loss": 0.0028, + "num_input_tokens_seen": 37102032, + "step": 175810 + }, + { + "epoch": 19.34158415841584, + "grad_norm": 0.05442015081644058, + "learning_rate": 1.64942385359243e-07, + "loss": 0.0027, + "num_input_tokens_seen": 37103120, + "step": 175815 + }, + { + "epoch": 19.342134213421343, + "grad_norm": 0.03693090006709099, + "learning_rate": 1.6466725628400404e-07, + "loss": 0.0033, + "num_input_tokens_seen": 37104208, + "step": 175820 + }, + { + "epoch": 19.342684268426844, + "grad_norm": 0.05149935558438301, + "learning_rate": 1.6439235610411218e-07, + "loss": 0.004, + "num_input_tokens_seen": 37105200, + "step": 175825 + }, + { + "epoch": 19.343234323432345, + "grad_norm": 1.6460779905319214, + "learning_rate": 1.6411768482209599e-07, + "loss": 0.066, + "num_input_tokens_seen": 37106256, + "step": 175830 + }, + { + "epoch": 19.343784378437842, + "grad_norm": 0.2985234260559082, + "learning_rate": 1.638432424404923e-07, + "loss": 0.0756, + "num_input_tokens_seen": 37107248, + "step": 175835 + }, + { + "epoch": 19.344334433443343, + "grad_norm": 0.0549238957464695, + "learning_rate": 1.635690289618269e-07, + "loss": 0.054, + "num_input_tokens_seen": 37108304, + "step": 175840 + }, + { + "epoch": 19.344884488448844, + "grad_norm": 0.04915609210729599, + "learning_rate": 1.632950443886283e-07, + "loss": 0.0029, + "num_input_tokens_seen": 37109360, + "step": 175845 + }, + { + "epoch": 19.345434543454346, + "grad_norm": 0.2719837427139282, + "learning_rate": 1.6302128872342225e-07, + "loss": 0.0126, + "num_input_tokens_seen": 37110416, + "step": 175850 + }, + { + "epoch": 19.345984598459847, + "grad_norm": 0.004411733243614435, + "learning_rate": 1.6274776196872898e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37111376, + "step": 175855 + }, + { + "epoch": 19.346534653465348, + "grad_norm": 0.09735545516014099, + "learning_rate": 1.6247446412707423e-07, + "loss": 0.0057, + "num_input_tokens_seen": 37112400, + "step": 175860 + }, + { + "epoch": 19.34708470847085, + "grad_norm": 0.037624187767505646, + "learning_rate": 1.6220139520096988e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37113488, + "step": 175865 + }, + { + "epoch": 19.347634763476346, + "grad_norm": 0.004023811314254999, + "learning_rate": 1.619285551929417e-07, + "loss": 0.0038, + "num_input_tokens_seen": 37114576, + "step": 175870 + }, + { + "epoch": 19.348184818481847, + "grad_norm": 0.0012215328169986606, + "learning_rate": 1.6165594410549878e-07, + "loss": 0.0263, + "num_input_tokens_seen": 37115600, + "step": 175875 + }, + { + "epoch": 19.34873487348735, + "grad_norm": 0.8878464698791504, + "learning_rate": 1.61383561941153e-07, + "loss": 0.0091, + "num_input_tokens_seen": 37116560, + "step": 175880 + }, + { + "epoch": 19.34928492849285, + "grad_norm": 0.20880930125713348, + "learning_rate": 1.6111140870241347e-07, + "loss": 0.0036, + "num_input_tokens_seen": 37117584, + "step": 175885 + }, + { + "epoch": 19.34983498349835, + "grad_norm": 4.35296106338501, + "learning_rate": 1.6083948439179486e-07, + "loss": 0.0181, + "num_input_tokens_seen": 37118608, + "step": 175890 + }, + { + "epoch": 19.350385038503852, + "grad_norm": 0.19365815818309784, + "learning_rate": 1.605677890118007e-07, + "loss": 0.035, + "num_input_tokens_seen": 37119664, + "step": 175895 + }, + { + "epoch": 19.35093509350935, + "grad_norm": 0.07174738496541977, + "learning_rate": 1.6029632256493177e-07, + "loss": 0.0745, + "num_input_tokens_seen": 37120720, + "step": 175900 + }, + { + "epoch": 19.35148514851485, + "grad_norm": 0.21412761509418488, + "learning_rate": 1.6002508505369161e-07, + "loss": 0.0545, + "num_input_tokens_seen": 37121776, + "step": 175905 + }, + { + "epoch": 19.35203520352035, + "grad_norm": 0.25310638546943665, + "learning_rate": 1.5975407648058384e-07, + "loss": 0.2102, + "num_input_tokens_seen": 37122832, + "step": 175910 + }, + { + "epoch": 19.352585258525853, + "grad_norm": 0.22026707231998444, + "learning_rate": 1.594832968481008e-07, + "loss": 0.0975, + "num_input_tokens_seen": 37123856, + "step": 175915 + }, + { + "epoch": 19.353135313531354, + "grad_norm": 0.06674478203058243, + "learning_rate": 1.592127461587406e-07, + "loss": 0.1104, + "num_input_tokens_seen": 37124912, + "step": 175920 + }, + { + "epoch": 19.353685368536855, + "grad_norm": 0.004199302289634943, + "learning_rate": 1.589424244149984e-07, + "loss": 0.0007, + "num_input_tokens_seen": 37126000, + "step": 175925 + }, + { + "epoch": 19.354235423542356, + "grad_norm": 0.0378873273730278, + "learning_rate": 1.586723316193639e-07, + "loss": 0.0377, + "num_input_tokens_seen": 37127088, + "step": 175930 + }, + { + "epoch": 19.354785478547853, + "grad_norm": 0.23485514521598816, + "learning_rate": 1.5840246777432678e-07, + "loss": 0.1127, + "num_input_tokens_seen": 37128208, + "step": 175935 + }, + { + "epoch": 19.355335533553355, + "grad_norm": 0.0329582653939724, + "learning_rate": 1.5813283288237114e-07, + "loss": 0.1098, + "num_input_tokens_seen": 37129264, + "step": 175940 + }, + { + "epoch": 19.355885588558856, + "grad_norm": 0.02262568101286888, + "learning_rate": 1.5786342694598667e-07, + "loss": 0.077, + "num_input_tokens_seen": 37130352, + "step": 175945 + }, + { + "epoch": 19.356435643564357, + "grad_norm": 0.02568219043314457, + "learning_rate": 1.575942499676547e-07, + "loss": 0.0051, + "num_input_tokens_seen": 37131376, + "step": 175950 + }, + { + "epoch": 19.356985698569858, + "grad_norm": 3.681880474090576, + "learning_rate": 1.5732530194985385e-07, + "loss": 0.0183, + "num_input_tokens_seen": 37132432, + "step": 175955 + }, + { + "epoch": 19.35753575357536, + "grad_norm": 1.0363572835922241, + "learning_rate": 1.570565828950682e-07, + "loss": 0.0714, + "num_input_tokens_seen": 37133520, + "step": 175960 + }, + { + "epoch": 19.358085808580856, + "grad_norm": 0.005802803672850132, + "learning_rate": 1.5678809280577077e-07, + "loss": 0.003, + "num_input_tokens_seen": 37134608, + "step": 175965 + }, + { + "epoch": 19.358635863586358, + "grad_norm": 0.025578489527106285, + "learning_rate": 1.565198316844374e-07, + "loss": 0.0675, + "num_input_tokens_seen": 37135760, + "step": 175970 + }, + { + "epoch": 19.35918591859186, + "grad_norm": 2.6130802631378174, + "learning_rate": 1.5625179953353552e-07, + "loss": 0.0244, + "num_input_tokens_seen": 37136816, + "step": 175975 + }, + { + "epoch": 19.35973597359736, + "grad_norm": 0.010635100305080414, + "learning_rate": 1.5598399635554374e-07, + "loss": 0.0093, + "num_input_tokens_seen": 37137840, + "step": 175980 + }, + { + "epoch": 19.36028602860286, + "grad_norm": 0.01415733527392149, + "learning_rate": 1.5571642215292392e-07, + "loss": 0.013, + "num_input_tokens_seen": 37138896, + "step": 175985 + }, + { + "epoch": 19.360836083608362, + "grad_norm": 0.010836085304617882, + "learning_rate": 1.554490769281436e-07, + "loss": 0.0025, + "num_input_tokens_seen": 37139952, + "step": 175990 + }, + { + "epoch": 19.361386138613863, + "grad_norm": 0.0017189305508509278, + "learning_rate": 1.5518196068367018e-07, + "loss": 0.0114, + "num_input_tokens_seen": 37141104, + "step": 175995 + }, + { + "epoch": 19.36193619361936, + "grad_norm": 0.03667745739221573, + "learning_rate": 1.5491507342196286e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37142160, + "step": 176000 + }, + { + "epoch": 19.36248624862486, + "grad_norm": 0.025183910503983498, + "learning_rate": 1.54648415145478e-07, + "loss": 0.0032, + "num_input_tokens_seen": 37143216, + "step": 176005 + }, + { + "epoch": 19.363036303630363, + "grad_norm": 0.05711635574698448, + "learning_rate": 1.543819858566803e-07, + "loss": 0.0018, + "num_input_tokens_seen": 37144240, + "step": 176010 + }, + { + "epoch": 19.363586358635864, + "grad_norm": 0.012403762899339199, + "learning_rate": 1.5411578555802053e-07, + "loss": 0.0028, + "num_input_tokens_seen": 37145264, + "step": 176015 + }, + { + "epoch": 19.364136413641365, + "grad_norm": 0.5770873427391052, + "learning_rate": 1.5384981425195512e-07, + "loss": 0.0252, + "num_input_tokens_seen": 37146256, + "step": 176020 + }, + { + "epoch": 19.364686468646866, + "grad_norm": 1.8717701435089111, + "learning_rate": 1.5358407194093206e-07, + "loss": 0.0128, + "num_input_tokens_seen": 37147376, + "step": 176025 + }, + { + "epoch": 19.365236523652364, + "grad_norm": 0.10245047509670258, + "learning_rate": 1.533185586273994e-07, + "loss": 0.0026, + "num_input_tokens_seen": 37148496, + "step": 176030 + }, + { + "epoch": 19.365786578657865, + "grad_norm": 0.01789303496479988, + "learning_rate": 1.5305327431381078e-07, + "loss": 0.0063, + "num_input_tokens_seen": 37149520, + "step": 176035 + }, + { + "epoch": 19.366336633663366, + "grad_norm": 0.7223193049430847, + "learning_rate": 1.5278821900260588e-07, + "loss": 0.1492, + "num_input_tokens_seen": 37150608, + "step": 176040 + }, + { + "epoch": 19.366886688668867, + "grad_norm": 0.010065131820738316, + "learning_rate": 1.5252339269622994e-07, + "loss": 0.0007, + "num_input_tokens_seen": 37151632, + "step": 176045 + }, + { + "epoch": 19.367436743674368, + "grad_norm": 0.007083242759108543, + "learning_rate": 1.5225879539712273e-07, + "loss": 0.0025, + "num_input_tokens_seen": 37152656, + "step": 176050 + }, + { + "epoch": 19.36798679867987, + "grad_norm": 0.20178160071372986, + "learning_rate": 1.5199442710772117e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37153680, + "step": 176055 + }, + { + "epoch": 19.36853685368537, + "grad_norm": 3.061767578125, + "learning_rate": 1.5173028783046493e-07, + "loss": 0.0902, + "num_input_tokens_seen": 37154736, + "step": 176060 + }, + { + "epoch": 19.369086908690868, + "grad_norm": 0.016443811357021332, + "learning_rate": 1.5146637756778827e-07, + "loss": 0.003, + "num_input_tokens_seen": 37155760, + "step": 176065 + }, + { + "epoch": 19.36963696369637, + "grad_norm": 0.16423173248767853, + "learning_rate": 1.5120269632212246e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37156752, + "step": 176070 + }, + { + "epoch": 19.37018701870187, + "grad_norm": 0.012353695929050446, + "learning_rate": 1.50939244095899e-07, + "loss": 0.0007, + "num_input_tokens_seen": 37157840, + "step": 176075 + }, + { + "epoch": 19.37073707370737, + "grad_norm": 0.047051120549440384, + "learning_rate": 1.5067602089154086e-07, + "loss": 0.0021, + "num_input_tokens_seen": 37158864, + "step": 176080 + }, + { + "epoch": 19.371287128712872, + "grad_norm": 0.022512968629598618, + "learning_rate": 1.5041302671148226e-07, + "loss": 0.0052, + "num_input_tokens_seen": 37159920, + "step": 176085 + }, + { + "epoch": 19.371837183718373, + "grad_norm": 0.07682564109563828, + "learning_rate": 1.501502615581407e-07, + "loss": 0.0047, + "num_input_tokens_seen": 37160976, + "step": 176090 + }, + { + "epoch": 19.37238723872387, + "grad_norm": 0.11796795576810837, + "learning_rate": 1.4988772543393925e-07, + "loss": 0.1323, + "num_input_tokens_seen": 37162064, + "step": 176095 + }, + { + "epoch": 19.372937293729372, + "grad_norm": 0.01486726850271225, + "learning_rate": 1.4962541834129816e-07, + "loss": 0.0158, + "num_input_tokens_seen": 37163152, + "step": 176100 + }, + { + "epoch": 19.373487348734873, + "grad_norm": 0.02726783975958824, + "learning_rate": 1.4936334028263777e-07, + "loss": 0.0028, + "num_input_tokens_seen": 37164176, + "step": 176105 + }, + { + "epoch": 19.374037403740374, + "grad_norm": 0.07605226337909698, + "learning_rate": 1.4910149126036997e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37165328, + "step": 176110 + }, + { + "epoch": 19.374587458745875, + "grad_norm": 0.030756685882806778, + "learning_rate": 1.488398712769068e-07, + "loss": 0.0016, + "num_input_tokens_seen": 37166352, + "step": 176115 + }, + { + "epoch": 19.375137513751376, + "grad_norm": 0.007110011763870716, + "learning_rate": 1.4857848033466292e-07, + "loss": 0.0016, + "num_input_tokens_seen": 37167440, + "step": 176120 + }, + { + "epoch": 19.375687568756877, + "grad_norm": 0.17030954360961914, + "learning_rate": 1.4831731843604756e-07, + "loss": 0.0669, + "num_input_tokens_seen": 37168464, + "step": 176125 + }, + { + "epoch": 19.376237623762375, + "grad_norm": 0.0867978185415268, + "learning_rate": 1.4805638558346714e-07, + "loss": 0.1562, + "num_input_tokens_seen": 37169488, + "step": 176130 + }, + { + "epoch": 19.376787678767876, + "grad_norm": 0.06445488333702087, + "learning_rate": 1.4779568177932523e-07, + "loss": 0.0795, + "num_input_tokens_seen": 37170448, + "step": 176135 + }, + { + "epoch": 19.377337733773377, + "grad_norm": 0.0026214816607534885, + "learning_rate": 1.4753520702602552e-07, + "loss": 0.0012, + "num_input_tokens_seen": 37171536, + "step": 176140 + }, + { + "epoch": 19.377887788778878, + "grad_norm": 0.2446296513080597, + "learning_rate": 1.472749613259661e-07, + "loss": 0.0022, + "num_input_tokens_seen": 37172592, + "step": 176145 + }, + { + "epoch": 19.37843784378438, + "grad_norm": 1.1518830060958862, + "learning_rate": 1.4701494468154775e-07, + "loss": 0.085, + "num_input_tokens_seen": 37173712, + "step": 176150 + }, + { + "epoch": 19.37898789878988, + "grad_norm": 0.08005433529615402, + "learning_rate": 1.4675515709516863e-07, + "loss": 0.0472, + "num_input_tokens_seen": 37174704, + "step": 176155 + }, + { + "epoch": 19.379537953795378, + "grad_norm": 4.707586288452148, + "learning_rate": 1.4649559856922124e-07, + "loss": 0.0506, + "num_input_tokens_seen": 37175760, + "step": 176160 + }, + { + "epoch": 19.38008800880088, + "grad_norm": 0.19134056568145752, + "learning_rate": 1.462362691061009e-07, + "loss": 0.0346, + "num_input_tokens_seen": 37176752, + "step": 176165 + }, + { + "epoch": 19.38063806380638, + "grad_norm": 0.003103590803220868, + "learning_rate": 1.459771687081918e-07, + "loss": 0.0016, + "num_input_tokens_seen": 37177872, + "step": 176170 + }, + { + "epoch": 19.38118811881188, + "grad_norm": 0.016218941658735275, + "learning_rate": 1.4571829737788369e-07, + "loss": 0.0458, + "num_input_tokens_seen": 37178928, + "step": 176175 + }, + { + "epoch": 19.381738173817382, + "grad_norm": 1.0722161531448364, + "learning_rate": 1.4545965511756632e-07, + "loss": 0.0071, + "num_input_tokens_seen": 37179984, + "step": 176180 + }, + { + "epoch": 19.382288228822883, + "grad_norm": 0.017640111967921257, + "learning_rate": 1.4520124192962115e-07, + "loss": 0.0395, + "num_input_tokens_seen": 37181040, + "step": 176185 + }, + { + "epoch": 19.382838283828384, + "grad_norm": 0.03533085063099861, + "learning_rate": 1.449430578164296e-07, + "loss": 0.032, + "num_input_tokens_seen": 37182000, + "step": 176190 + }, + { + "epoch": 19.383388338833882, + "grad_norm": 0.023457402363419533, + "learning_rate": 1.446851027803703e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37183088, + "step": 176195 + }, + { + "epoch": 19.383938393839383, + "grad_norm": 0.01567811705172062, + "learning_rate": 1.444273768238219e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37184144, + "step": 176200 + }, + { + "epoch": 19.384488448844884, + "grad_norm": 3.5105574131011963, + "learning_rate": 1.4416987994915755e-07, + "loss": 0.0646, + "num_input_tokens_seen": 37185200, + "step": 176205 + }, + { + "epoch": 19.385038503850385, + "grad_norm": 0.02902013435959816, + "learning_rate": 1.4391261215875308e-07, + "loss": 0.0025, + "num_input_tokens_seen": 37186224, + "step": 176210 + }, + { + "epoch": 19.385588558855886, + "grad_norm": 0.029924361035227776, + "learning_rate": 1.436555734549816e-07, + "loss": 0.0016, + "num_input_tokens_seen": 37187280, + "step": 176215 + }, + { + "epoch": 19.386138613861387, + "grad_norm": 0.12300996482372284, + "learning_rate": 1.4339876384020789e-07, + "loss": 0.0082, + "num_input_tokens_seen": 37188336, + "step": 176220 + }, + { + "epoch": 19.38668866886689, + "grad_norm": 0.007020898163318634, + "learning_rate": 1.431421833168023e-07, + "loss": 0.0877, + "num_input_tokens_seen": 37189392, + "step": 176225 + }, + { + "epoch": 19.387238723872386, + "grad_norm": 0.11963587999343872, + "learning_rate": 1.428858318871268e-07, + "loss": 0.0049, + "num_input_tokens_seen": 37190480, + "step": 176230 + }, + { + "epoch": 19.387788778877887, + "grad_norm": 0.007829247042536736, + "learning_rate": 1.4262970955354061e-07, + "loss": 0.0045, + "num_input_tokens_seen": 37191504, + "step": 176235 + }, + { + "epoch": 19.388338833883388, + "grad_norm": 0.2598581910133362, + "learning_rate": 1.4237381631841405e-07, + "loss": 0.0043, + "num_input_tokens_seen": 37192528, + "step": 176240 + }, + { + "epoch": 19.38888888888889, + "grad_norm": 0.09615111351013184, + "learning_rate": 1.4211815218409808e-07, + "loss": 0.0029, + "num_input_tokens_seen": 37193584, + "step": 176245 + }, + { + "epoch": 19.38943894389439, + "grad_norm": 0.011116147972643375, + "learning_rate": 1.4186271715295185e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37194608, + "step": 176250 + }, + { + "epoch": 19.38998899889989, + "grad_norm": 0.09890135377645493, + "learning_rate": 1.4160751122732906e-07, + "loss": 0.0059, + "num_input_tokens_seen": 37195664, + "step": 176255 + }, + { + "epoch": 19.39053905390539, + "grad_norm": 0.01484157145023346, + "learning_rate": 1.4135253440958062e-07, + "loss": 0.0092, + "num_input_tokens_seen": 37196720, + "step": 176260 + }, + { + "epoch": 19.39108910891089, + "grad_norm": 0.005483205895870924, + "learning_rate": 1.410977867020574e-07, + "loss": 0.0018, + "num_input_tokens_seen": 37197840, + "step": 176265 + }, + { + "epoch": 19.39163916391639, + "grad_norm": 0.08076201379299164, + "learning_rate": 1.408432681071048e-07, + "loss": 0.0015, + "num_input_tokens_seen": 37198896, + "step": 176270 + }, + { + "epoch": 19.392189218921892, + "grad_norm": 0.0083208242431283, + "learning_rate": 1.4058897862707643e-07, + "loss": 0.0067, + "num_input_tokens_seen": 37199888, + "step": 176275 + }, + { + "epoch": 19.392739273927393, + "grad_norm": 0.00899487640708685, + "learning_rate": 1.4033491826430934e-07, + "loss": 0.0145, + "num_input_tokens_seen": 37200976, + "step": 176280 + }, + { + "epoch": 19.393289328932894, + "grad_norm": 0.014402412809431553, + "learning_rate": 1.400810870211461e-07, + "loss": 0.0096, + "num_input_tokens_seen": 37202064, + "step": 176285 + }, + { + "epoch": 19.393839383938396, + "grad_norm": 0.10241050273180008, + "learning_rate": 1.3982748489992648e-07, + "loss": 0.032, + "num_input_tokens_seen": 37203120, + "step": 176290 + }, + { + "epoch": 19.394389438943893, + "grad_norm": 0.18295037746429443, + "learning_rate": 1.3957411190298475e-07, + "loss": 0.0188, + "num_input_tokens_seen": 37204208, + "step": 176295 + }, + { + "epoch": 19.394939493949394, + "grad_norm": 0.021953126415610313, + "learning_rate": 1.3932096803266347e-07, + "loss": 0.0032, + "num_input_tokens_seen": 37205264, + "step": 176300 + }, + { + "epoch": 19.395489548954895, + "grad_norm": 0.018092429265379906, + "learning_rate": 1.3906805329129137e-07, + "loss": 0.0923, + "num_input_tokens_seen": 37206352, + "step": 176305 + }, + { + "epoch": 19.396039603960396, + "grad_norm": 0.02502666600048542, + "learning_rate": 1.3881536768119708e-07, + "loss": 0.0432, + "num_input_tokens_seen": 37207440, + "step": 176310 + }, + { + "epoch": 19.396589658965897, + "grad_norm": 0.007238054182380438, + "learning_rate": 1.385629112047121e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37208432, + "step": 176315 + }, + { + "epoch": 19.3971397139714, + "grad_norm": 0.007080258801579475, + "learning_rate": 1.3831068386416513e-07, + "loss": 0.0541, + "num_input_tokens_seen": 37209520, + "step": 176320 + }, + { + "epoch": 19.397689768976896, + "grad_norm": 0.047254711389541626, + "learning_rate": 1.3805868566187652e-07, + "loss": 0.0026, + "num_input_tokens_seen": 37210576, + "step": 176325 + }, + { + "epoch": 19.398239823982397, + "grad_norm": 2.9274790287017822, + "learning_rate": 1.3780691660017499e-07, + "loss": 0.0383, + "num_input_tokens_seen": 37211664, + "step": 176330 + }, + { + "epoch": 19.3987898789879, + "grad_norm": 0.05878133326768875, + "learning_rate": 1.3755537668137252e-07, + "loss": 0.0019, + "num_input_tokens_seen": 37212720, + "step": 176335 + }, + { + "epoch": 19.3993399339934, + "grad_norm": 0.18191933631896973, + "learning_rate": 1.373040659077951e-07, + "loss": 0.0048, + "num_input_tokens_seen": 37213808, + "step": 176340 + }, + { + "epoch": 19.3998899889989, + "grad_norm": 0.2247827798128128, + "learning_rate": 1.3705298428175473e-07, + "loss": 0.0034, + "num_input_tokens_seen": 37214864, + "step": 176345 + }, + { + "epoch": 19.4004400440044, + "grad_norm": 0.035458944737911224, + "learning_rate": 1.3680213180556622e-07, + "loss": 0.0247, + "num_input_tokens_seen": 37215952, + "step": 176350 + }, + { + "epoch": 19.400990099009903, + "grad_norm": 3.334731101989746, + "learning_rate": 1.3655150848154162e-07, + "loss": 0.1118, + "num_input_tokens_seen": 37217072, + "step": 176355 + }, + { + "epoch": 19.4015401540154, + "grad_norm": 0.010230574756860733, + "learning_rate": 1.3630111431199023e-07, + "loss": 0.0022, + "num_input_tokens_seen": 37218192, + "step": 176360 + }, + { + "epoch": 19.4020902090209, + "grad_norm": 0.2150629758834839, + "learning_rate": 1.3605094929922403e-07, + "loss": 0.0111, + "num_input_tokens_seen": 37219280, + "step": 176365 + }, + { + "epoch": 19.402640264026402, + "grad_norm": 0.007166565861552954, + "learning_rate": 1.3580101344554397e-07, + "loss": 0.0075, + "num_input_tokens_seen": 37220368, + "step": 176370 + }, + { + "epoch": 19.403190319031903, + "grad_norm": 0.005584055092185736, + "learning_rate": 1.355513067532538e-07, + "loss": 0.001, + "num_input_tokens_seen": 37221424, + "step": 176375 + }, + { + "epoch": 19.403740374037405, + "grad_norm": 2.848790407180786, + "learning_rate": 1.3530182922465716e-07, + "loss": 0.075, + "num_input_tokens_seen": 37222544, + "step": 176380 + }, + { + "epoch": 19.404290429042906, + "grad_norm": 0.027990709990262985, + "learning_rate": 1.350525808620523e-07, + "loss": 0.0053, + "num_input_tokens_seen": 37223600, + "step": 176385 + }, + { + "epoch": 19.404840484048403, + "grad_norm": 0.17970292270183563, + "learning_rate": 1.3480356166773456e-07, + "loss": 0.0028, + "num_input_tokens_seen": 37224656, + "step": 176390 + }, + { + "epoch": 19.405390539053904, + "grad_norm": 0.025725040584802628, + "learning_rate": 1.345547716440021e-07, + "loss": 0.0014, + "num_input_tokens_seen": 37225712, + "step": 176395 + }, + { + "epoch": 19.405940594059405, + "grad_norm": 0.017802400514483452, + "learning_rate": 1.343062107931503e-07, + "loss": 0.0938, + "num_input_tokens_seen": 37226736, + "step": 176400 + }, + { + "epoch": 19.406490649064907, + "grad_norm": 0.07903789728879929, + "learning_rate": 1.340578791174635e-07, + "loss": 0.0028, + "num_input_tokens_seen": 37227728, + "step": 176405 + }, + { + "epoch": 19.407040704070408, + "grad_norm": 0.03254888206720352, + "learning_rate": 1.3380977661923144e-07, + "loss": 0.0042, + "num_input_tokens_seen": 37228784, + "step": 176410 + }, + { + "epoch": 19.40759075907591, + "grad_norm": 0.0050226980820298195, + "learning_rate": 1.3356190330074403e-07, + "loss": 0.0501, + "num_input_tokens_seen": 37229840, + "step": 176415 + }, + { + "epoch": 19.40814081408141, + "grad_norm": 2.8272225856781006, + "learning_rate": 1.3331425916428552e-07, + "loss": 0.0416, + "num_input_tokens_seen": 37230832, + "step": 176420 + }, + { + "epoch": 19.408690869086907, + "grad_norm": 0.17235562205314636, + "learning_rate": 1.3306684421213744e-07, + "loss": 0.005, + "num_input_tokens_seen": 37231888, + "step": 176425 + }, + { + "epoch": 19.40924092409241, + "grad_norm": 0.028937431052327156, + "learning_rate": 1.328196584465785e-07, + "loss": 0.006, + "num_input_tokens_seen": 37232976, + "step": 176430 + }, + { + "epoch": 19.40979097909791, + "grad_norm": 0.0035581381525844336, + "learning_rate": 1.3257270186988746e-07, + "loss": 0.0021, + "num_input_tokens_seen": 37234032, + "step": 176435 + }, + { + "epoch": 19.41034103410341, + "grad_norm": 0.0305650532245636, + "learning_rate": 1.3232597448434302e-07, + "loss": 0.0439, + "num_input_tokens_seen": 37235152, + "step": 176440 + }, + { + "epoch": 19.41089108910891, + "grad_norm": 0.04101315140724182, + "learning_rate": 1.320794762922184e-07, + "loss": 0.0011, + "num_input_tokens_seen": 37236176, + "step": 176445 + }, + { + "epoch": 19.411441144114413, + "grad_norm": 0.00451494287699461, + "learning_rate": 1.318332072957812e-07, + "loss": 0.0019, + "num_input_tokens_seen": 37237200, + "step": 176450 + }, + { + "epoch": 19.41199119911991, + "grad_norm": 0.1692720502614975, + "learning_rate": 1.3158716749730739e-07, + "loss": 0.005, + "num_input_tokens_seen": 37238256, + "step": 176455 + }, + { + "epoch": 19.41254125412541, + "grad_norm": 0.02680056355893612, + "learning_rate": 1.313413568990618e-07, + "loss": 0.0039, + "num_input_tokens_seen": 37239376, + "step": 176460 + }, + { + "epoch": 19.413091309130913, + "grad_norm": 0.18334588408470154, + "learning_rate": 1.310957755033093e-07, + "loss": 0.0052, + "num_input_tokens_seen": 37240464, + "step": 176465 + }, + { + "epoch": 19.413641364136414, + "grad_norm": 0.004313973244279623, + "learning_rate": 1.3085042331231202e-07, + "loss": 0.0078, + "num_input_tokens_seen": 37241552, + "step": 176470 + }, + { + "epoch": 19.414191419141915, + "grad_norm": 0.030161598697304726, + "learning_rate": 1.3060530032833475e-07, + "loss": 0.0072, + "num_input_tokens_seen": 37242640, + "step": 176475 + }, + { + "epoch": 19.414741474147416, + "grad_norm": 0.09148052334785461, + "learning_rate": 1.3036040655363402e-07, + "loss": 0.0262, + "num_input_tokens_seen": 37243696, + "step": 176480 + }, + { + "epoch": 19.415291529152917, + "grad_norm": 1.9562203884124756, + "learning_rate": 1.301157419904664e-07, + "loss": 0.0743, + "num_input_tokens_seen": 37244752, + "step": 176485 + }, + { + "epoch": 19.415841584158414, + "grad_norm": 0.17514079809188843, + "learning_rate": 1.2987130664109116e-07, + "loss": 0.1078, + "num_input_tokens_seen": 37245872, + "step": 176490 + }, + { + "epoch": 19.416391639163916, + "grad_norm": 0.2409237027168274, + "learning_rate": 1.296271005077565e-07, + "loss": 0.0709, + "num_input_tokens_seen": 37246864, + "step": 176495 + }, + { + "epoch": 19.416941694169417, + "grad_norm": 0.04025426134467125, + "learning_rate": 1.293831235927162e-07, + "loss": 0.0043, + "num_input_tokens_seen": 37247920, + "step": 176500 + }, + { + "epoch": 19.417491749174918, + "grad_norm": 1.4978023767471313, + "learning_rate": 1.2913937589821567e-07, + "loss": 0.0372, + "num_input_tokens_seen": 37248912, + "step": 176505 + }, + { + "epoch": 19.41804180418042, + "grad_norm": 0.0480591356754303, + "learning_rate": 1.288958574265059e-07, + "loss": 0.0024, + "num_input_tokens_seen": 37249904, + "step": 176510 + }, + { + "epoch": 19.41859185918592, + "grad_norm": 0.007465198636054993, + "learning_rate": 1.2865256817982674e-07, + "loss": 0.007, + "num_input_tokens_seen": 37251024, + "step": 176515 + }, + { + "epoch": 19.419141914191417, + "grad_norm": 0.0492866076529026, + "learning_rate": 1.2840950816042363e-07, + "loss": 0.0014, + "num_input_tokens_seen": 37252112, + "step": 176520 + }, + { + "epoch": 19.41969196919692, + "grad_norm": 0.016117502003908157, + "learning_rate": 1.2816667737053367e-07, + "loss": 0.0056, + "num_input_tokens_seen": 37253200, + "step": 176525 + }, + { + "epoch": 19.42024202420242, + "grad_norm": 0.07358325272798538, + "learning_rate": 1.2792407581239953e-07, + "loss": 0.0026, + "num_input_tokens_seen": 37254320, + "step": 176530 + }, + { + "epoch": 19.42079207920792, + "grad_norm": 0.009295718744397163, + "learning_rate": 1.2768170348824993e-07, + "loss": 0.1069, + "num_input_tokens_seen": 37255376, + "step": 176535 + }, + { + "epoch": 19.421342134213422, + "grad_norm": 1.821687936782837, + "learning_rate": 1.2743956040032757e-07, + "loss": 0.0224, + "num_input_tokens_seen": 37256400, + "step": 176540 + }, + { + "epoch": 19.421892189218923, + "grad_norm": 0.017853902652859688, + "learning_rate": 1.271976465508584e-07, + "loss": 0.0022, + "num_input_tokens_seen": 37257488, + "step": 176545 + }, + { + "epoch": 19.422442244224424, + "grad_norm": 0.0992545485496521, + "learning_rate": 1.2695596194207405e-07, + "loss": 0.0503, + "num_input_tokens_seen": 37258608, + "step": 176550 + }, + { + "epoch": 19.42299229922992, + "grad_norm": 0.004994812421500683, + "learning_rate": 1.2671450657620042e-07, + "loss": 0.005, + "num_input_tokens_seen": 37259728, + "step": 176555 + }, + { + "epoch": 19.423542354235423, + "grad_norm": 0.03760167583823204, + "learning_rate": 1.2647328045546357e-07, + "loss": 0.0922, + "num_input_tokens_seen": 37260720, + "step": 176560 + }, + { + "epoch": 19.424092409240924, + "grad_norm": 0.7105748653411865, + "learning_rate": 1.2623228358208672e-07, + "loss": 0.006, + "num_input_tokens_seen": 37261744, + "step": 176565 + }, + { + "epoch": 19.424642464246425, + "grad_norm": 0.02168606035411358, + "learning_rate": 1.2599151595829305e-07, + "loss": 0.0865, + "num_input_tokens_seen": 37262736, + "step": 176570 + }, + { + "epoch": 19.425192519251926, + "grad_norm": 0.0378434993326664, + "learning_rate": 1.2575097758629751e-07, + "loss": 0.0679, + "num_input_tokens_seen": 37263792, + "step": 176575 + }, + { + "epoch": 19.425742574257427, + "grad_norm": 0.0014019283698871732, + "learning_rate": 1.2551066846832328e-07, + "loss": 0.0775, + "num_input_tokens_seen": 37264784, + "step": 176580 + }, + { + "epoch": 19.426292629262925, + "grad_norm": 1.139819622039795, + "learning_rate": 1.2527058860657691e-07, + "loss": 0.0981, + "num_input_tokens_seen": 37265872, + "step": 176585 + }, + { + "epoch": 19.426842684268426, + "grad_norm": 0.010034838691353798, + "learning_rate": 1.2503073800327614e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37266992, + "step": 176590 + }, + { + "epoch": 19.427392739273927, + "grad_norm": 0.11110097914934158, + "learning_rate": 1.2479111666063304e-07, + "loss": 0.0066, + "num_input_tokens_seen": 37267984, + "step": 176595 + }, + { + "epoch": 19.427942794279428, + "grad_norm": 0.015368077903985977, + "learning_rate": 1.245517245808514e-07, + "loss": 0.0061, + "num_input_tokens_seen": 37269008, + "step": 176600 + }, + { + "epoch": 19.42849284928493, + "grad_norm": 0.013000636361539364, + "learning_rate": 1.2431256176614337e-07, + "loss": 0.0376, + "num_input_tokens_seen": 37270032, + "step": 176605 + }, + { + "epoch": 19.42904290429043, + "grad_norm": 0.010336427018046379, + "learning_rate": 1.2407362821870717e-07, + "loss": 0.0447, + "num_input_tokens_seen": 37271088, + "step": 176610 + }, + { + "epoch": 19.42959295929593, + "grad_norm": 6.116230010986328, + "learning_rate": 1.2383492394074937e-07, + "loss": 0.083, + "num_input_tokens_seen": 37272176, + "step": 176615 + }, + { + "epoch": 19.43014301430143, + "grad_norm": 0.0762602910399437, + "learning_rate": 1.235964489344682e-07, + "loss": 0.0016, + "num_input_tokens_seen": 37273328, + "step": 176620 + }, + { + "epoch": 19.43069306930693, + "grad_norm": 0.6405605673789978, + "learning_rate": 1.2335820320205916e-07, + "loss": 0.0557, + "num_input_tokens_seen": 37274384, + "step": 176625 + }, + { + "epoch": 19.43124312431243, + "grad_norm": 0.07870340347290039, + "learning_rate": 1.2312018674572322e-07, + "loss": 0.0039, + "num_input_tokens_seen": 37275376, + "step": 176630 + }, + { + "epoch": 19.431793179317932, + "grad_norm": 0.7198119163513184, + "learning_rate": 1.2288239956765313e-07, + "loss": 0.006, + "num_input_tokens_seen": 37276432, + "step": 176635 + }, + { + "epoch": 19.432343234323433, + "grad_norm": 2.309598684310913, + "learning_rate": 1.22644841670036e-07, + "loss": 0.0712, + "num_input_tokens_seen": 37277552, + "step": 176640 + }, + { + "epoch": 19.432893289328934, + "grad_norm": 0.05742933601140976, + "learning_rate": 1.224075130550617e-07, + "loss": 0.0055, + "num_input_tokens_seen": 37278704, + "step": 176645 + }, + { + "epoch": 19.433443344334435, + "grad_norm": 0.249970480799675, + "learning_rate": 1.22170413724923e-07, + "loss": 0.0033, + "num_input_tokens_seen": 37279728, + "step": 176650 + }, + { + "epoch": 19.433993399339933, + "grad_norm": 0.08914307504892349, + "learning_rate": 1.219335436818042e-07, + "loss": 0.012, + "num_input_tokens_seen": 37280720, + "step": 176655 + }, + { + "epoch": 19.434543454345434, + "grad_norm": 0.059493351727724075, + "learning_rate": 1.2169690292788417e-07, + "loss": 0.0017, + "num_input_tokens_seen": 37281744, + "step": 176660 + }, + { + "epoch": 19.435093509350935, + "grad_norm": 0.013011549599468708, + "learning_rate": 1.2146049146534443e-07, + "loss": 0.0037, + "num_input_tokens_seen": 37282832, + "step": 176665 + }, + { + "epoch": 19.435643564356436, + "grad_norm": 0.0649644285440445, + "learning_rate": 1.2122430929636662e-07, + "loss": 0.0291, + "num_input_tokens_seen": 37283856, + "step": 176670 + }, + { + "epoch": 19.436193619361937, + "grad_norm": 0.16080798208713531, + "learning_rate": 1.20988356423124e-07, + "loss": 0.0721, + "num_input_tokens_seen": 37284880, + "step": 176675 + }, + { + "epoch": 19.436743674367438, + "grad_norm": 0.009758302010595798, + "learning_rate": 1.2075263284779815e-07, + "loss": 0.0038, + "num_input_tokens_seen": 37285904, + "step": 176680 + }, + { + "epoch": 19.437293729372936, + "grad_norm": 0.02187921293079853, + "learning_rate": 1.2051713857255398e-07, + "loss": 0.0043, + "num_input_tokens_seen": 37287024, + "step": 176685 + }, + { + "epoch": 19.437843784378437, + "grad_norm": 2.0694048404693604, + "learning_rate": 1.2028187359956754e-07, + "loss": 0.0485, + "num_input_tokens_seen": 37288080, + "step": 176690 + }, + { + "epoch": 19.438393839383938, + "grad_norm": 0.008019122295081615, + "learning_rate": 1.200468379310038e-07, + "loss": 0.0608, + "num_input_tokens_seen": 37289168, + "step": 176695 + }, + { + "epoch": 19.43894389438944, + "grad_norm": 0.012938682921230793, + "learning_rate": 1.1981203156902765e-07, + "loss": 0.0008, + "num_input_tokens_seen": 37290192, + "step": 176700 + }, + { + "epoch": 19.43949394939494, + "grad_norm": 0.02287333272397518, + "learning_rate": 1.1957745451580682e-07, + "loss": 0.0952, + "num_input_tokens_seen": 37291184, + "step": 176705 + }, + { + "epoch": 19.44004400440044, + "grad_norm": 0.008017227984964848, + "learning_rate": 1.193431067735007e-07, + "loss": 0.0074, + "num_input_tokens_seen": 37292240, + "step": 176710 + }, + { + "epoch": 19.440594059405942, + "grad_norm": 0.00570854265242815, + "learning_rate": 1.1910898834427143e-07, + "loss": 0.0275, + "num_input_tokens_seen": 37293328, + "step": 176715 + }, + { + "epoch": 19.44114411441144, + "grad_norm": 0.8339309096336365, + "learning_rate": 1.1887509923027562e-07, + "loss": 0.0159, + "num_input_tokens_seen": 37294352, + "step": 176720 + }, + { + "epoch": 19.44169416941694, + "grad_norm": 0.27506765723228455, + "learning_rate": 1.186414394336699e-07, + "loss": 0.0368, + "num_input_tokens_seen": 37295440, + "step": 176725 + }, + { + "epoch": 19.442244224422442, + "grad_norm": 0.008842498064041138, + "learning_rate": 1.1840800895660531e-07, + "loss": 0.002, + "num_input_tokens_seen": 37296560, + "step": 176730 + }, + { + "epoch": 19.442794279427943, + "grad_norm": 0.008887250907719135, + "learning_rate": 1.1817480780123569e-07, + "loss": 0.0129, + "num_input_tokens_seen": 37297680, + "step": 176735 + }, + { + "epoch": 19.443344334433444, + "grad_norm": 1.9174320697784424, + "learning_rate": 1.1794183596970654e-07, + "loss": 0.0283, + "num_input_tokens_seen": 37298704, + "step": 176740 + }, + { + "epoch": 19.443894389438945, + "grad_norm": 0.21526984870433807, + "learning_rate": 1.1770909346417169e-07, + "loss": 0.0353, + "num_input_tokens_seen": 37299728, + "step": 176745 + }, + { + "epoch": 19.444444444444443, + "grad_norm": 0.06644486635923386, + "learning_rate": 1.1747658028677112e-07, + "loss": 0.0608, + "num_input_tokens_seen": 37300752, + "step": 176750 + }, + { + "epoch": 19.444994499449944, + "grad_norm": 0.1461285948753357, + "learning_rate": 1.1724429643964752e-07, + "loss": 0.0283, + "num_input_tokens_seen": 37301840, + "step": 176755 + }, + { + "epoch": 19.445544554455445, + "grad_norm": 0.039681486785411835, + "learning_rate": 1.1701224192494365e-07, + "loss": 0.0126, + "num_input_tokens_seen": 37302928, + "step": 176760 + }, + { + "epoch": 19.446094609460946, + "grad_norm": 0.1779540479183197, + "learning_rate": 1.1678041674479945e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37303952, + "step": 176765 + }, + { + "epoch": 19.446644664466447, + "grad_norm": 2.0430967807769775, + "learning_rate": 1.1654882090134933e-07, + "loss": 0.0245, + "num_input_tokens_seen": 37305040, + "step": 176770 + }, + { + "epoch": 19.44719471947195, + "grad_norm": 0.005526519846171141, + "learning_rate": 1.1631745439673047e-07, + "loss": 0.0383, + "num_input_tokens_seen": 37306064, + "step": 176775 + }, + { + "epoch": 19.44774477447745, + "grad_norm": 0.0387934148311615, + "learning_rate": 1.1608631723307173e-07, + "loss": 0.0018, + "num_input_tokens_seen": 37307120, + "step": 176780 + }, + { + "epoch": 19.448294829482947, + "grad_norm": 0.813019871711731, + "learning_rate": 1.1585540941250472e-07, + "loss": 0.0082, + "num_input_tokens_seen": 37308112, + "step": 176785 + }, + { + "epoch": 19.448844884488448, + "grad_norm": 0.07243039458990097, + "learning_rate": 1.1562473093715554e-07, + "loss": 0.0053, + "num_input_tokens_seen": 37309168, + "step": 176790 + }, + { + "epoch": 19.44939493949395, + "grad_norm": 3.362898826599121, + "learning_rate": 1.1539428180915579e-07, + "loss": 0.0681, + "num_input_tokens_seen": 37310224, + "step": 176795 + }, + { + "epoch": 19.44994499449945, + "grad_norm": 0.02993941679596901, + "learning_rate": 1.1516406203062602e-07, + "loss": 0.002, + "num_input_tokens_seen": 37311280, + "step": 176800 + }, + { + "epoch": 19.45049504950495, + "grad_norm": 0.013257158920168877, + "learning_rate": 1.1493407160368674e-07, + "loss": 0.0689, + "num_input_tokens_seen": 37312336, + "step": 176805 + }, + { + "epoch": 19.451045104510452, + "grad_norm": 0.0749010294675827, + "learning_rate": 1.1470431053046127e-07, + "loss": 0.1006, + "num_input_tokens_seen": 37313328, + "step": 176810 + }, + { + "epoch": 19.45159515951595, + "grad_norm": 0.0019068304682150483, + "learning_rate": 1.1447477881306179e-07, + "loss": 0.0041, + "num_input_tokens_seen": 37314352, + "step": 176815 + }, + { + "epoch": 19.45214521452145, + "grad_norm": 0.029322654008865356, + "learning_rate": 1.1424547645360884e-07, + "loss": 0.0636, + "num_input_tokens_seen": 37315344, + "step": 176820 + }, + { + "epoch": 19.452695269526952, + "grad_norm": 0.011896803975105286, + "learning_rate": 1.140164034542146e-07, + "loss": 0.0638, + "num_input_tokens_seen": 37316336, + "step": 176825 + }, + { + "epoch": 19.453245324532453, + "grad_norm": 0.09852565824985504, + "learning_rate": 1.1378755981698852e-07, + "loss": 0.0129, + "num_input_tokens_seen": 37317392, + "step": 176830 + }, + { + "epoch": 19.453795379537954, + "grad_norm": 0.01750071346759796, + "learning_rate": 1.1355894554404278e-07, + "loss": 0.0047, + "num_input_tokens_seen": 37318384, + "step": 176835 + }, + { + "epoch": 19.454345434543455, + "grad_norm": 0.014920157380402088, + "learning_rate": 1.133305606374785e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37319376, + "step": 176840 + }, + { + "epoch": 19.454895489548957, + "grad_norm": 0.4473333954811096, + "learning_rate": 1.1310240509940784e-07, + "loss": 0.0569, + "num_input_tokens_seen": 37320400, + "step": 176845 + }, + { + "epoch": 19.455445544554454, + "grad_norm": 1.0118458271026611, + "learning_rate": 1.1287447893192915e-07, + "loss": 0.0135, + "num_input_tokens_seen": 37321456, + "step": 176850 + }, + { + "epoch": 19.455995599559955, + "grad_norm": 0.2867203652858734, + "learning_rate": 1.126467821371463e-07, + "loss": 0.0064, + "num_input_tokens_seen": 37322544, + "step": 176855 + }, + { + "epoch": 19.456545654565456, + "grad_norm": 0.029040075838565826, + "learning_rate": 1.1241931471715483e-07, + "loss": 0.1184, + "num_input_tokens_seen": 37323664, + "step": 176860 + }, + { + "epoch": 19.457095709570957, + "grad_norm": 0.03605801984667778, + "learning_rate": 1.1219207667405307e-07, + "loss": 0.0057, + "num_input_tokens_seen": 37324720, + "step": 176865 + }, + { + "epoch": 19.45764576457646, + "grad_norm": 0.061849359422922134, + "learning_rate": 1.1196506800993378e-07, + "loss": 0.0056, + "num_input_tokens_seen": 37325744, + "step": 176870 + }, + { + "epoch": 19.45819581958196, + "grad_norm": 0.008559207431972027, + "learning_rate": 1.1173828872688974e-07, + "loss": 0.0824, + "num_input_tokens_seen": 37326832, + "step": 176875 + }, + { + "epoch": 19.458745874587457, + "grad_norm": 0.03526010736823082, + "learning_rate": 1.1151173882701093e-07, + "loss": 0.0688, + "num_input_tokens_seen": 37327856, + "step": 176880 + }, + { + "epoch": 19.459295929592958, + "grad_norm": 0.1640169769525528, + "learning_rate": 1.1128541831238737e-07, + "loss": 0.0641, + "num_input_tokens_seen": 37328912, + "step": 176885 + }, + { + "epoch": 19.45984598459846, + "grad_norm": 0.0020252345129847527, + "learning_rate": 1.110593271851007e-07, + "loss": 0.0078, + "num_input_tokens_seen": 37329968, + "step": 176890 + }, + { + "epoch": 19.46039603960396, + "grad_norm": 0.05090494081377983, + "learning_rate": 1.1083346544724093e-07, + "loss": 0.0083, + "num_input_tokens_seen": 37331024, + "step": 176895 + }, + { + "epoch": 19.46094609460946, + "grad_norm": 0.01668599061667919, + "learning_rate": 1.1060783310088419e-07, + "loss": 0.0065, + "num_input_tokens_seen": 37332080, + "step": 176900 + }, + { + "epoch": 19.461496149614963, + "grad_norm": 0.03407417982816696, + "learning_rate": 1.1038243014811212e-07, + "loss": 0.0112, + "num_input_tokens_seen": 37333168, + "step": 176905 + }, + { + "epoch": 19.462046204620464, + "grad_norm": 0.02164563350379467, + "learning_rate": 1.1015725659100084e-07, + "loss": 0.0079, + "num_input_tokens_seen": 37334256, + "step": 176910 + }, + { + "epoch": 19.46259625962596, + "grad_norm": 0.03423323482275009, + "learning_rate": 1.099323124316265e-07, + "loss": 0.0769, + "num_input_tokens_seen": 37335312, + "step": 176915 + }, + { + "epoch": 19.463146314631462, + "grad_norm": 0.1138697937130928, + "learning_rate": 1.0970759767206518e-07, + "loss": 0.0045, + "num_input_tokens_seen": 37336400, + "step": 176920 + }, + { + "epoch": 19.463696369636963, + "grad_norm": 0.05330326780676842, + "learning_rate": 1.0948311231438468e-07, + "loss": 0.0012, + "num_input_tokens_seen": 37337424, + "step": 176925 + }, + { + "epoch": 19.464246424642464, + "grad_norm": 0.016543203964829445, + "learning_rate": 1.0925885636065558e-07, + "loss": 0.0108, + "num_input_tokens_seen": 37338480, + "step": 176930 + }, + { + "epoch": 19.464796479647966, + "grad_norm": 0.06934250891208649, + "learning_rate": 1.0903482981294011e-07, + "loss": 0.0648, + "num_input_tokens_seen": 37339568, + "step": 176935 + }, + { + "epoch": 19.465346534653467, + "grad_norm": 0.030506756156682968, + "learning_rate": 1.0881103267330884e-07, + "loss": 0.0019, + "num_input_tokens_seen": 37340560, + "step": 176940 + }, + { + "epoch": 19.465896589658964, + "grad_norm": 1.0287972688674927, + "learning_rate": 1.0858746494382399e-07, + "loss": 0.0076, + "num_input_tokens_seen": 37341680, + "step": 176945 + }, + { + "epoch": 19.466446644664465, + "grad_norm": 0.09617499262094498, + "learning_rate": 1.0836412662654505e-07, + "loss": 0.0071, + "num_input_tokens_seen": 37342704, + "step": 176950 + }, + { + "epoch": 19.466996699669966, + "grad_norm": 0.009919890202581882, + "learning_rate": 1.0814101772352592e-07, + "loss": 0.0013, + "num_input_tokens_seen": 37343792, + "step": 176955 + }, + { + "epoch": 19.467546754675467, + "grad_norm": 0.006636030040681362, + "learning_rate": 1.0791813823682884e-07, + "loss": 0.0029, + "num_input_tokens_seen": 37344784, + "step": 176960 + }, + { + "epoch": 19.46809680968097, + "grad_norm": 0.43564116954803467, + "learning_rate": 1.0769548816850495e-07, + "loss": 0.0073, + "num_input_tokens_seen": 37345840, + "step": 176965 + }, + { + "epoch": 19.46864686468647, + "grad_norm": 0.049121323972940445, + "learning_rate": 1.0747306752060815e-07, + "loss": 0.0033, + "num_input_tokens_seen": 37346832, + "step": 176970 + }, + { + "epoch": 19.46919691969197, + "grad_norm": 2.7537217140197754, + "learning_rate": 1.0725087629518959e-07, + "loss": 0.1299, + "num_input_tokens_seen": 37347920, + "step": 176975 + }, + { + "epoch": 19.46974697469747, + "grad_norm": 0.11118817329406738, + "learning_rate": 1.0702891449429209e-07, + "loss": 0.0737, + "num_input_tokens_seen": 37348912, + "step": 176980 + }, + { + "epoch": 19.47029702970297, + "grad_norm": 0.004857661202549934, + "learning_rate": 1.0680718211996399e-07, + "loss": 0.1107, + "num_input_tokens_seen": 37350032, + "step": 176985 + }, + { + "epoch": 19.47084708470847, + "grad_norm": 0.06586969643831253, + "learning_rate": 1.0658567917425089e-07, + "loss": 0.002, + "num_input_tokens_seen": 37351088, + "step": 176990 + }, + { + "epoch": 19.47139713971397, + "grad_norm": 0.17187868058681488, + "learning_rate": 1.0636440565919004e-07, + "loss": 0.0019, + "num_input_tokens_seen": 37352176, + "step": 176995 + }, + { + "epoch": 19.471947194719473, + "grad_norm": 0.013406460173428059, + "learning_rate": 1.0614336157682703e-07, + "loss": 0.011, + "num_input_tokens_seen": 37353232, + "step": 177000 + }, + { + "epoch": 19.472497249724974, + "grad_norm": 0.3528887927532196, + "learning_rate": 1.0592254692919356e-07, + "loss": 0.099, + "num_input_tokens_seen": 37354224, + "step": 177005 + }, + { + "epoch": 19.47304730473047, + "grad_norm": 0.06244758144021034, + "learning_rate": 1.0570196171832692e-07, + "loss": 0.0282, + "num_input_tokens_seen": 37355280, + "step": 177010 + }, + { + "epoch": 19.473597359735972, + "grad_norm": 0.03731955215334892, + "learning_rate": 1.0548160594625878e-07, + "loss": 0.001, + "num_input_tokens_seen": 37356336, + "step": 177015 + }, + { + "epoch": 19.474147414741473, + "grad_norm": 0.009014484472572803, + "learning_rate": 1.0526147961502086e-07, + "loss": 0.0013, + "num_input_tokens_seen": 37357456, + "step": 177020 + }, + { + "epoch": 19.474697469746975, + "grad_norm": 1.1854264736175537, + "learning_rate": 1.0504158272664488e-07, + "loss": 0.0077, + "num_input_tokens_seen": 37358512, + "step": 177025 + }, + { + "epoch": 19.475247524752476, + "grad_norm": 0.09066394716501236, + "learning_rate": 1.0482191528315143e-07, + "loss": 0.0056, + "num_input_tokens_seen": 37359568, + "step": 177030 + }, + { + "epoch": 19.475797579757977, + "grad_norm": 0.03001621924340725, + "learning_rate": 1.0460247728657225e-07, + "loss": 0.0275, + "num_input_tokens_seen": 37360624, + "step": 177035 + }, + { + "epoch": 19.476347634763478, + "grad_norm": 0.009795719757676125, + "learning_rate": 1.0438326873892512e-07, + "loss": 0.0066, + "num_input_tokens_seen": 37361680, + "step": 177040 + }, + { + "epoch": 19.476897689768975, + "grad_norm": 0.0038191964849829674, + "learning_rate": 1.041642896422279e-07, + "loss": 0.0009, + "num_input_tokens_seen": 37362736, + "step": 177045 + }, + { + "epoch": 19.477447744774476, + "grad_norm": 0.0034536144230514765, + "learning_rate": 1.0394553999850676e-07, + "loss": 0.0054, + "num_input_tokens_seen": 37363728, + "step": 177050 + }, + { + "epoch": 19.477997799779978, + "grad_norm": 0.03586812689900398, + "learning_rate": 1.037270198097684e-07, + "loss": 0.0051, + "num_input_tokens_seen": 37364816, + "step": 177055 + }, + { + "epoch": 19.47854785478548, + "grad_norm": 0.09387747943401337, + "learning_rate": 1.0350872907803622e-07, + "loss": 0.0131, + "num_input_tokens_seen": 37365872, + "step": 177060 + }, + { + "epoch": 19.47909790979098, + "grad_norm": 0.036005809903144836, + "learning_rate": 1.0329066780531694e-07, + "loss": 0.004, + "num_input_tokens_seen": 37366864, + "step": 177065 + }, + { + "epoch": 19.47964796479648, + "grad_norm": 0.004995939787477255, + "learning_rate": 1.0307283599362006e-07, + "loss": 0.0285, + "num_input_tokens_seen": 37367952, + "step": 177070 + }, + { + "epoch": 19.480198019801982, + "grad_norm": 0.02963818423449993, + "learning_rate": 1.0285523364495508e-07, + "loss": 0.0043, + "num_input_tokens_seen": 37369040, + "step": 177075 + }, + { + "epoch": 19.48074807480748, + "grad_norm": 0.15439659357070923, + "learning_rate": 1.026378607613232e-07, + "loss": 0.1469, + "num_input_tokens_seen": 37370064, + "step": 177080 + }, + { + "epoch": 19.48129812981298, + "grad_norm": 0.7326160073280334, + "learning_rate": 1.024207173447339e-07, + "loss": 0.0351, + "num_input_tokens_seen": 37371184, + "step": 177085 + }, + { + "epoch": 19.48184818481848, + "grad_norm": 0.46375352144241333, + "learning_rate": 1.022038033971856e-07, + "loss": 0.0083, + "num_input_tokens_seen": 37372240, + "step": 177090 + }, + { + "epoch": 19.482398239823983, + "grad_norm": 0.22131353616714478, + "learning_rate": 1.019871189206767e-07, + "loss": 0.0035, + "num_input_tokens_seen": 37373328, + "step": 177095 + }, + { + "epoch": 19.482948294829484, + "grad_norm": 0.46674850583076477, + "learning_rate": 1.0177066391720836e-07, + "loss": 0.0058, + "num_input_tokens_seen": 37374352, + "step": 177100 + }, + { + "epoch": 19.483498349834985, + "grad_norm": 0.023716548457741737, + "learning_rate": 1.0155443838876788e-07, + "loss": 0.0012, + "num_input_tokens_seen": 37375440, + "step": 177105 + }, + { + "epoch": 19.484048404840483, + "grad_norm": 0.44948363304138184, + "learning_rate": 1.0133844233735646e-07, + "loss": 0.0039, + "num_input_tokens_seen": 37376464, + "step": 177110 + }, + { + "epoch": 19.484598459845984, + "grad_norm": 0.04533956199884415, + "learning_rate": 1.011226757649586e-07, + "loss": 0.0025, + "num_input_tokens_seen": 37377552, + "step": 177115 + }, + { + "epoch": 19.485148514851485, + "grad_norm": 0.3714190721511841, + "learning_rate": 1.0090713867356715e-07, + "loss": 0.0044, + "num_input_tokens_seen": 37378608, + "step": 177120 + }, + { + "epoch": 19.485698569856986, + "grad_norm": 0.015859559178352356, + "learning_rate": 1.0069183106516666e-07, + "loss": 0.0048, + "num_input_tokens_seen": 37379632, + "step": 177125 + }, + { + "epoch": 19.486248624862487, + "grad_norm": 0.11667589843273163, + "learning_rate": 1.0047675294174164e-07, + "loss": 0.0128, + "num_input_tokens_seen": 37380624, + "step": 177130 + }, + { + "epoch": 19.486798679867988, + "grad_norm": 0.03854300081729889, + "learning_rate": 1.0026190430527382e-07, + "loss": 0.0017, + "num_input_tokens_seen": 37381712, + "step": 177135 + }, + { + "epoch": 19.48734873487349, + "grad_norm": 0.043602973222732544, + "learning_rate": 1.0004728515774497e-07, + "loss": 0.0577, + "num_input_tokens_seen": 37382768, + "step": 177140 + }, + { + "epoch": 19.487898789878987, + "grad_norm": 1.3349344730377197, + "learning_rate": 9.983289550113129e-08, + "loss": 0.0489, + "num_input_tokens_seen": 37383856, + "step": 177145 + }, + { + "epoch": 19.488448844884488, + "grad_norm": 0.021483007818460464, + "learning_rate": 9.961873533740895e-08, + "loss": 0.0006, + "num_input_tokens_seen": 37384880, + "step": 177150 + }, + { + "epoch": 19.48899889988999, + "grad_norm": 0.10330944508314133, + "learning_rate": 9.940480466855417e-08, + "loss": 0.0038, + "num_input_tokens_seen": 37385968, + "step": 177155 + }, + { + "epoch": 19.48954895489549, + "grad_norm": 0.04755093902349472, + "learning_rate": 9.919110349653482e-08, + "loss": 0.0228, + "num_input_tokens_seen": 37386992, + "step": 177160 + }, + { + "epoch": 19.49009900990099, + "grad_norm": 0.5634294748306274, + "learning_rate": 9.897763182332431e-08, + "loss": 0.0058, + "num_input_tokens_seen": 37388112, + "step": 177165 + }, + { + "epoch": 19.490649064906492, + "grad_norm": 0.05133267864584923, + "learning_rate": 9.876438965088496e-08, + "loss": 0.0032, + "num_input_tokens_seen": 37389200, + "step": 177170 + }, + { + "epoch": 19.49119911991199, + "grad_norm": 0.03288945555686951, + "learning_rate": 9.855137698119022e-08, + "loss": 0.0307, + "num_input_tokens_seen": 37390192, + "step": 177175 + }, + { + "epoch": 19.49174917491749, + "grad_norm": 0.01387069746851921, + "learning_rate": 9.833859381619404e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37391152, + "step": 177180 + }, + { + "epoch": 19.492299229922992, + "grad_norm": 1.8747202157974243, + "learning_rate": 9.812604015786708e-08, + "loss": 0.0434, + "num_input_tokens_seen": 37392208, + "step": 177185 + }, + { + "epoch": 19.492849284928493, + "grad_norm": 0.02369861863553524, + "learning_rate": 9.791371600816057e-08, + "loss": 0.0232, + "num_input_tokens_seen": 37393264, + "step": 177190 + }, + { + "epoch": 19.493399339933994, + "grad_norm": 2.19844126701355, + "learning_rate": 9.770162136903128e-08, + "loss": 0.0116, + "num_input_tokens_seen": 37394288, + "step": 177195 + }, + { + "epoch": 19.493949394939495, + "grad_norm": 0.21669228374958038, + "learning_rate": 9.74897562424415e-08, + "loss": 0.0027, + "num_input_tokens_seen": 37395376, + "step": 177200 + }, + { + "epoch": 19.494499449944996, + "grad_norm": 0.06627824157476425, + "learning_rate": 9.727812063033692e-08, + "loss": 0.0023, + "num_input_tokens_seen": 37396496, + "step": 177205 + }, + { + "epoch": 19.495049504950494, + "grad_norm": 0.2784802317619324, + "learning_rate": 9.706671453467154e-08, + "loss": 0.0031, + "num_input_tokens_seen": 37397584, + "step": 177210 + }, + { + "epoch": 19.495599559955995, + "grad_norm": 0.14268848299980164, + "learning_rate": 9.685553795739378e-08, + "loss": 0.0014, + "num_input_tokens_seen": 37398608, + "step": 177215 + }, + { + "epoch": 19.496149614961496, + "grad_norm": 2.2621469497680664, + "learning_rate": 9.664459090044653e-08, + "loss": 0.0548, + "num_input_tokens_seen": 37399632, + "step": 177220 + }, + { + "epoch": 19.496699669966997, + "grad_norm": 0.07131659984588623, + "learning_rate": 9.643387336577547e-08, + "loss": 0.0321, + "num_input_tokens_seen": 37400592, + "step": 177225 + }, + { + "epoch": 19.497249724972498, + "grad_norm": 0.008937465958297253, + "learning_rate": 9.622338535532627e-08, + "loss": 0.0109, + "num_input_tokens_seen": 37401584, + "step": 177230 + }, + { + "epoch": 19.497799779978, + "grad_norm": 0.005812226794660091, + "learning_rate": 9.601312687103347e-08, + "loss": 0.005, + "num_input_tokens_seen": 37402704, + "step": 177235 + }, + { + "epoch": 19.498349834983497, + "grad_norm": 0.04104674980044365, + "learning_rate": 9.580309791484e-08, + "loss": 0.0071, + "num_input_tokens_seen": 37403728, + "step": 177240 + }, + { + "epoch": 19.498899889988998, + "grad_norm": 0.12881579995155334, + "learning_rate": 9.559329848867759e-08, + "loss": 0.0182, + "num_input_tokens_seen": 37404784, + "step": 177245 + }, + { + "epoch": 19.4994499449945, + "grad_norm": 0.02284514158964157, + "learning_rate": 9.538372859448364e-08, + "loss": 0.0173, + "num_input_tokens_seen": 37405808, + "step": 177250 + }, + { + "epoch": 19.5, + "grad_norm": 0.07033403217792511, + "learning_rate": 9.517438823418434e-08, + "loss": 0.0147, + "num_input_tokens_seen": 37406832, + "step": 177255 + }, + { + "epoch": 19.5005500550055, + "grad_norm": 0.11922775954008102, + "learning_rate": 9.496527740971429e-08, + "loss": 0.0038, + "num_input_tokens_seen": 37407856, + "step": 177260 + }, + { + "epoch": 19.501100110011002, + "grad_norm": 0.019571181386709213, + "learning_rate": 9.475639612299691e-08, + "loss": 0.0822, + "num_input_tokens_seen": 37408912, + "step": 177265 + }, + { + "epoch": 19.501650165016503, + "grad_norm": 0.26265862584114075, + "learning_rate": 9.454774437596126e-08, + "loss": 0.0557, + "num_input_tokens_seen": 37410064, + "step": 177270 + }, + { + "epoch": 19.502200220022, + "grad_norm": 0.05373164638876915, + "learning_rate": 9.4339322170528e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37411120, + "step": 177275 + }, + { + "epoch": 19.502750275027502, + "grad_norm": 0.01303172204643488, + "learning_rate": 9.413112950861502e-08, + "loss": 0.0026, + "num_input_tokens_seen": 37412176, + "step": 177280 + }, + { + "epoch": 19.503300330033003, + "grad_norm": 0.03691648319363594, + "learning_rate": 9.39231663921486e-08, + "loss": 0.0038, + "num_input_tokens_seen": 37413264, + "step": 177285 + }, + { + "epoch": 19.503850385038504, + "grad_norm": 4.037435531616211, + "learning_rate": 9.37154328230383e-08, + "loss": 0.0856, + "num_input_tokens_seen": 37414384, + "step": 177290 + }, + { + "epoch": 19.504400440044005, + "grad_norm": 0.02218356728553772, + "learning_rate": 9.350792880320203e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37415504, + "step": 177295 + }, + { + "epoch": 19.504950495049506, + "grad_norm": 0.01688544638454914, + "learning_rate": 9.330065433455493e-08, + "loss": 0.026, + "num_input_tokens_seen": 37416528, + "step": 177300 + }, + { + "epoch": 19.505500550055004, + "grad_norm": 0.0018327627331018448, + "learning_rate": 9.309360941900103e-08, + "loss": 0.0255, + "num_input_tokens_seen": 37417552, + "step": 177305 + }, + { + "epoch": 19.506050605060505, + "grad_norm": 0.035761509090662, + "learning_rate": 9.288679405845268e-08, + "loss": 0.0027, + "num_input_tokens_seen": 37418576, + "step": 177310 + }, + { + "epoch": 19.506600660066006, + "grad_norm": 0.019446328282356262, + "learning_rate": 9.268020825481671e-08, + "loss": 0.0389, + "num_input_tokens_seen": 37419600, + "step": 177315 + }, + { + "epoch": 19.507150715071507, + "grad_norm": 1.5542230606079102, + "learning_rate": 9.247385200999437e-08, + "loss": 0.0968, + "num_input_tokens_seen": 37420656, + "step": 177320 + }, + { + "epoch": 19.507700770077008, + "grad_norm": 0.23079343140125275, + "learning_rate": 9.226772532588968e-08, + "loss": 0.0044, + "num_input_tokens_seen": 37421680, + "step": 177325 + }, + { + "epoch": 19.50825082508251, + "grad_norm": 0.008915519341826439, + "learning_rate": 9.206182820440113e-08, + "loss": 0.0023, + "num_input_tokens_seen": 37422768, + "step": 177330 + }, + { + "epoch": 19.50880088008801, + "grad_norm": 0.00668494263663888, + "learning_rate": 9.185616064742442e-08, + "loss": 0.0018, + "num_input_tokens_seen": 37423760, + "step": 177335 + }, + { + "epoch": 19.509350935093508, + "grad_norm": 2.0711987018585205, + "learning_rate": 9.165072265685804e-08, + "loss": 0.0804, + "num_input_tokens_seen": 37424784, + "step": 177340 + }, + { + "epoch": 19.50990099009901, + "grad_norm": 0.3802002966403961, + "learning_rate": 9.144551423459491e-08, + "loss": 0.0059, + "num_input_tokens_seen": 37425840, + "step": 177345 + }, + { + "epoch": 19.51045104510451, + "grad_norm": 0.0035836631432175636, + "learning_rate": 9.124053538252796e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37426896, + "step": 177350 + }, + { + "epoch": 19.51100110011001, + "grad_norm": 0.07333219051361084, + "learning_rate": 9.10357861025446e-08, + "loss": 0.0098, + "num_input_tokens_seen": 37427952, + "step": 177355 + }, + { + "epoch": 19.511551155115512, + "grad_norm": 0.049057360738515854, + "learning_rate": 9.083126639653216e-08, + "loss": 0.0051, + "num_input_tokens_seen": 37428976, + "step": 177360 + }, + { + "epoch": 19.512101210121013, + "grad_norm": 0.006633492652326822, + "learning_rate": 9.062697626637251e-08, + "loss": 0.0236, + "num_input_tokens_seen": 37430000, + "step": 177365 + }, + { + "epoch": 19.51265126512651, + "grad_norm": 0.22121095657348633, + "learning_rate": 9.0422915713953e-08, + "loss": 0.0068, + "num_input_tokens_seen": 37431024, + "step": 177370 + }, + { + "epoch": 19.513201320132012, + "grad_norm": 0.035853516310453415, + "learning_rate": 9.02190847411527e-08, + "loss": 0.0021, + "num_input_tokens_seen": 37432016, + "step": 177375 + }, + { + "epoch": 19.513751375137513, + "grad_norm": 0.07740446925163269, + "learning_rate": 9.001548334985066e-08, + "loss": 0.0133, + "num_input_tokens_seen": 37433040, + "step": 177380 + }, + { + "epoch": 19.514301430143014, + "grad_norm": 0.17007490992546082, + "learning_rate": 8.981211154192315e-08, + "loss": 0.0692, + "num_input_tokens_seen": 37434032, + "step": 177385 + }, + { + "epoch": 19.514851485148515, + "grad_norm": 0.42509761452674866, + "learning_rate": 8.960896931924367e-08, + "loss": 0.0027, + "num_input_tokens_seen": 37435088, + "step": 177390 + }, + { + "epoch": 19.515401540154016, + "grad_norm": 0.00594178494066, + "learning_rate": 8.940605668368573e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37436112, + "step": 177395 + }, + { + "epoch": 19.515951595159517, + "grad_norm": 0.009773001074790955, + "learning_rate": 8.920337363711728e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37437168, + "step": 177400 + }, + { + "epoch": 19.516501650165015, + "grad_norm": 0.00925271213054657, + "learning_rate": 8.900092018140904e-08, + "loss": 0.0772, + "num_input_tokens_seen": 37438256, + "step": 177405 + }, + { + "epoch": 19.517051705170516, + "grad_norm": 0.08960472047328949, + "learning_rate": 8.879869631842619e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37439344, + "step": 177410 + }, + { + "epoch": 19.517601760176017, + "grad_norm": 1.2576905488967896, + "learning_rate": 8.859670205003113e-08, + "loss": 0.0177, + "num_input_tokens_seen": 37440400, + "step": 177415 + }, + { + "epoch": 19.51815181518152, + "grad_norm": 0.018961796537041664, + "learning_rate": 8.839493737808901e-08, + "loss": 0.0012, + "num_input_tokens_seen": 37441520, + "step": 177420 + }, + { + "epoch": 19.51870187018702, + "grad_norm": 0.009641101583838463, + "learning_rate": 8.819340230445394e-08, + "loss": 0.0028, + "num_input_tokens_seen": 37442544, + "step": 177425 + }, + { + "epoch": 19.51925192519252, + "grad_norm": 0.017696931958198547, + "learning_rate": 8.79920968309883e-08, + "loss": 0.0021, + "num_input_tokens_seen": 37443664, + "step": 177430 + }, + { + "epoch": 19.519801980198018, + "grad_norm": 0.018174419179558754, + "learning_rate": 8.779102095954616e-08, + "loss": 0.066, + "num_input_tokens_seen": 37444784, + "step": 177435 + }, + { + "epoch": 19.52035203520352, + "grad_norm": 0.08272258192300797, + "learning_rate": 8.759017469197883e-08, + "loss": 0.0391, + "num_input_tokens_seen": 37445840, + "step": 177440 + }, + { + "epoch": 19.52090209020902, + "grad_norm": 0.3543453514575958, + "learning_rate": 8.738955803014037e-08, + "loss": 0.112, + "num_input_tokens_seen": 37446896, + "step": 177445 + }, + { + "epoch": 19.52145214521452, + "grad_norm": 0.337760865688324, + "learning_rate": 8.718917097587653e-08, + "loss": 0.0447, + "num_input_tokens_seen": 37447952, + "step": 177450 + }, + { + "epoch": 19.522002200220022, + "grad_norm": 0.003558623604476452, + "learning_rate": 8.698901353103306e-08, + "loss": 0.007, + "num_input_tokens_seen": 37449040, + "step": 177455 + }, + { + "epoch": 19.522552255225524, + "grad_norm": 1.5354286432266235, + "learning_rate": 8.678908569746125e-08, + "loss": 0.0723, + "num_input_tokens_seen": 37450064, + "step": 177460 + }, + { + "epoch": 19.523102310231025, + "grad_norm": 0.14468981325626373, + "learning_rate": 8.658938747699851e-08, + "loss": 0.0354, + "num_input_tokens_seen": 37451120, + "step": 177465 + }, + { + "epoch": 19.523652365236522, + "grad_norm": 0.00492837792262435, + "learning_rate": 8.638991887148784e-08, + "loss": 0.0063, + "num_input_tokens_seen": 37452208, + "step": 177470 + }, + { + "epoch": 19.524202420242023, + "grad_norm": 0.039918214082717896, + "learning_rate": 8.619067988276663e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37453296, + "step": 177475 + }, + { + "epoch": 19.524752475247524, + "grad_norm": 0.02191690169274807, + "learning_rate": 8.599167051267232e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37454416, + "step": 177480 + }, + { + "epoch": 19.525302530253025, + "grad_norm": 0.013640237040817738, + "learning_rate": 8.579289076303398e-08, + "loss": 0.0723, + "num_input_tokens_seen": 37455440, + "step": 177485 + }, + { + "epoch": 19.525852585258527, + "grad_norm": 1.914851427078247, + "learning_rate": 8.559434063568905e-08, + "loss": 0.0521, + "num_input_tokens_seen": 37456464, + "step": 177490 + }, + { + "epoch": 19.526402640264028, + "grad_norm": 0.05944647639989853, + "learning_rate": 8.539602013246661e-08, + "loss": 0.0319, + "num_input_tokens_seen": 37457488, + "step": 177495 + }, + { + "epoch": 19.52695269526953, + "grad_norm": 0.005620078183710575, + "learning_rate": 8.519792925519577e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37458576, + "step": 177500 + }, + { + "epoch": 19.527502750275026, + "grad_norm": 0.01519205141812563, + "learning_rate": 8.500006800569726e-08, + "loss": 0.0854, + "num_input_tokens_seen": 37459600, + "step": 177505 + }, + { + "epoch": 19.528052805280527, + "grad_norm": 1.098583459854126, + "learning_rate": 8.480243638580021e-08, + "loss": 0.033, + "num_input_tokens_seen": 37460688, + "step": 177510 + }, + { + "epoch": 19.52860286028603, + "grad_norm": 0.09267386794090271, + "learning_rate": 8.460503439731981e-08, + "loss": 0.0075, + "num_input_tokens_seen": 37461712, + "step": 177515 + }, + { + "epoch": 19.52915291529153, + "grad_norm": 0.6610267758369446, + "learning_rate": 8.44078620420824e-08, + "loss": 0.0089, + "num_input_tokens_seen": 37462800, + "step": 177520 + }, + { + "epoch": 19.52970297029703, + "grad_norm": 2.2727270126342773, + "learning_rate": 8.421091932190317e-08, + "loss": 0.0901, + "num_input_tokens_seen": 37463792, + "step": 177525 + }, + { + "epoch": 19.53025302530253, + "grad_norm": 0.24545955657958984, + "learning_rate": 8.401420623859458e-08, + "loss": 0.0296, + "num_input_tokens_seen": 37464880, + "step": 177530 + }, + { + "epoch": 19.53080308030803, + "grad_norm": 0.33046165108680725, + "learning_rate": 8.381772279396905e-08, + "loss": 0.018, + "num_input_tokens_seen": 37465872, + "step": 177535 + }, + { + "epoch": 19.53135313531353, + "grad_norm": 0.6050101518630981, + "learning_rate": 8.36214689898418e-08, + "loss": 0.0083, + "num_input_tokens_seen": 37466896, + "step": 177540 + }, + { + "epoch": 19.53190319031903, + "grad_norm": 0.009550945833325386, + "learning_rate": 8.342544482801972e-08, + "loss": 0.0018, + "num_input_tokens_seen": 37467984, + "step": 177545 + }, + { + "epoch": 19.532453245324533, + "grad_norm": 0.03581802919507027, + "learning_rate": 8.322965031030693e-08, + "loss": 0.0173, + "num_input_tokens_seen": 37469104, + "step": 177550 + }, + { + "epoch": 19.533003300330034, + "grad_norm": 0.02135642059147358, + "learning_rate": 8.30340854385131e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37470096, + "step": 177555 + }, + { + "epoch": 19.533553355335535, + "grad_norm": 0.039523541927337646, + "learning_rate": 8.283875021443676e-08, + "loss": 0.0717, + "num_input_tokens_seen": 37471184, + "step": 177560 + }, + { + "epoch": 19.534103410341036, + "grad_norm": 0.03962972015142441, + "learning_rate": 8.264364463987928e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37472336, + "step": 177565 + }, + { + "epoch": 19.534653465346533, + "grad_norm": 0.04345928505063057, + "learning_rate": 8.24487687166392e-08, + "loss": 0.0459, + "num_input_tokens_seen": 37473456, + "step": 177570 + }, + { + "epoch": 19.535203520352034, + "grad_norm": 0.04142582044005394, + "learning_rate": 8.225412244650954e-08, + "loss": 0.0228, + "num_input_tokens_seen": 37474480, + "step": 177575 + }, + { + "epoch": 19.535753575357536, + "grad_norm": 0.0034984946250915527, + "learning_rate": 8.205970583129163e-08, + "loss": 0.0008, + "num_input_tokens_seen": 37475472, + "step": 177580 + }, + { + "epoch": 19.536303630363037, + "grad_norm": 0.002500402508303523, + "learning_rate": 8.186551887277016e-08, + "loss": 0.006, + "num_input_tokens_seen": 37476560, + "step": 177585 + }, + { + "epoch": 19.536853685368538, + "grad_norm": 0.028620334342122078, + "learning_rate": 8.167156157273537e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37477648, + "step": 177590 + }, + { + "epoch": 19.53740374037404, + "grad_norm": 0.045468878000974655, + "learning_rate": 8.147783393298026e-08, + "loss": 0.034, + "num_input_tokens_seen": 37478704, + "step": 177595 + }, + { + "epoch": 19.537953795379536, + "grad_norm": 0.300148069858551, + "learning_rate": 8.128433595528396e-08, + "loss": 0.114, + "num_input_tokens_seen": 37479792, + "step": 177600 + }, + { + "epoch": 19.538503850385037, + "grad_norm": 0.05899932608008385, + "learning_rate": 8.109106764143392e-08, + "loss": 0.0027, + "num_input_tokens_seen": 37480880, + "step": 177605 + }, + { + "epoch": 19.53905390539054, + "grad_norm": 0.09282491356134415, + "learning_rate": 8.089802899321208e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37482064, + "step": 177610 + }, + { + "epoch": 19.53960396039604, + "grad_norm": 2.608668565750122, + "learning_rate": 8.0705220012392e-08, + "loss": 0.1026, + "num_input_tokens_seen": 37483152, + "step": 177615 + }, + { + "epoch": 19.54015401540154, + "grad_norm": 0.11244003474712372, + "learning_rate": 8.051264070075837e-08, + "loss": 0.0025, + "num_input_tokens_seen": 37484240, + "step": 177620 + }, + { + "epoch": 19.540704070407042, + "grad_norm": 0.00555324787274003, + "learning_rate": 8.032029106007922e-08, + "loss": 0.0022, + "num_input_tokens_seen": 37485264, + "step": 177625 + }, + { + "epoch": 19.541254125412543, + "grad_norm": 0.0020669528748840094, + "learning_rate": 8.012817109213089e-08, + "loss": 0.0156, + "num_input_tokens_seen": 37486256, + "step": 177630 + }, + { + "epoch": 19.54180418041804, + "grad_norm": 4.203485488891602, + "learning_rate": 7.993628079868699e-08, + "loss": 0.0234, + "num_input_tokens_seen": 37487280, + "step": 177635 + }, + { + "epoch": 19.54235423542354, + "grad_norm": 0.010419017635285854, + "learning_rate": 7.974462018150718e-08, + "loss": 0.0038, + "num_input_tokens_seen": 37488368, + "step": 177640 + }, + { + "epoch": 19.542904290429043, + "grad_norm": 0.028443971648812294, + "learning_rate": 7.955318924236787e-08, + "loss": 0.0247, + "num_input_tokens_seen": 37489424, + "step": 177645 + }, + { + "epoch": 19.543454345434544, + "grad_norm": 2.6396143436431885, + "learning_rate": 7.936198798302596e-08, + "loss": 0.1758, + "num_input_tokens_seen": 37490544, + "step": 177650 + }, + { + "epoch": 19.544004400440045, + "grad_norm": 0.02672906592488289, + "learning_rate": 7.917101640524671e-08, + "loss": 0.0755, + "num_input_tokens_seen": 37491600, + "step": 177655 + }, + { + "epoch": 19.544554455445546, + "grad_norm": 0.019860848784446716, + "learning_rate": 7.898027451078982e-08, + "loss": 0.0243, + "num_input_tokens_seen": 37492592, + "step": 177660 + }, + { + "epoch": 19.545104510451043, + "grad_norm": 0.004645756911486387, + "learning_rate": 7.8789762301415e-08, + "loss": 0.0354, + "num_input_tokens_seen": 37493648, + "step": 177665 + }, + { + "epoch": 19.545654565456545, + "grad_norm": 0.009024742990732193, + "learning_rate": 7.859947977887638e-08, + "loss": 0.0076, + "num_input_tokens_seen": 37494672, + "step": 177670 + }, + { + "epoch": 19.546204620462046, + "grad_norm": 0.5039868950843811, + "learning_rate": 7.840942694492814e-08, + "loss": 0.0032, + "num_input_tokens_seen": 37495728, + "step": 177675 + }, + { + "epoch": 19.546754675467547, + "grad_norm": 1.6472328901290894, + "learning_rate": 7.821960380132165e-08, + "loss": 0.0809, + "num_input_tokens_seen": 37496752, + "step": 177680 + }, + { + "epoch": 19.547304730473048, + "grad_norm": 0.007088368758559227, + "learning_rate": 7.803001034980551e-08, + "loss": 0.0014, + "num_input_tokens_seen": 37497776, + "step": 177685 + }, + { + "epoch": 19.54785478547855, + "grad_norm": 0.01774543896317482, + "learning_rate": 7.784064659212553e-08, + "loss": 0.17, + "num_input_tokens_seen": 37498768, + "step": 177690 + }, + { + "epoch": 19.54840484048405, + "grad_norm": 0.046302370727062225, + "learning_rate": 7.76515125300331e-08, + "loss": 0.0051, + "num_input_tokens_seen": 37499888, + "step": 177695 + }, + { + "epoch": 19.548954895489548, + "grad_norm": 0.14005893468856812, + "learning_rate": 7.746260816526851e-08, + "loss": 0.0031, + "num_input_tokens_seen": 37500912, + "step": 177700 + }, + { + "epoch": 19.54950495049505, + "grad_norm": 0.03398722782731056, + "learning_rate": 7.727393349956924e-08, + "loss": 0.0034, + "num_input_tokens_seen": 37501968, + "step": 177705 + }, + { + "epoch": 19.55005500550055, + "grad_norm": 0.20888501405715942, + "learning_rate": 7.708548853467834e-08, + "loss": 0.0847, + "num_input_tokens_seen": 37503088, + "step": 177710 + }, + { + "epoch": 19.55060506050605, + "grad_norm": 0.00266727595590055, + "learning_rate": 7.689727327233054e-08, + "loss": 0.0357, + "num_input_tokens_seen": 37504208, + "step": 177715 + }, + { + "epoch": 19.551155115511552, + "grad_norm": 0.21668654680252075, + "learning_rate": 7.670928771426334e-08, + "loss": 0.0705, + "num_input_tokens_seen": 37505264, + "step": 177720 + }, + { + "epoch": 19.551705170517053, + "grad_norm": 0.0663955882191658, + "learning_rate": 7.65215318622059e-08, + "loss": 0.0509, + "num_input_tokens_seen": 37506320, + "step": 177725 + }, + { + "epoch": 19.55225522552255, + "grad_norm": 0.004427677020430565, + "learning_rate": 7.633400571789018e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37507408, + "step": 177730 + }, + { + "epoch": 19.55280528052805, + "grad_norm": 1.7098534107208252, + "learning_rate": 7.614670928304535e-08, + "loss": 0.1136, + "num_input_tokens_seen": 37508528, + "step": 177735 + }, + { + "epoch": 19.553355335533553, + "grad_norm": 2.4816906452178955, + "learning_rate": 7.595964255939502e-08, + "loss": 0.1543, + "num_input_tokens_seen": 37509648, + "step": 177740 + }, + { + "epoch": 19.553905390539054, + "grad_norm": 0.09892719238996506, + "learning_rate": 7.577280554866561e-08, + "loss": 0.0068, + "num_input_tokens_seen": 37510704, + "step": 177745 + }, + { + "epoch": 19.554455445544555, + "grad_norm": 0.5967760682106018, + "learning_rate": 7.558619825257796e-08, + "loss": 0.0267, + "num_input_tokens_seen": 37511824, + "step": 177750 + }, + { + "epoch": 19.555005500550056, + "grad_norm": 0.03091229684650898, + "learning_rate": 7.53998206728529e-08, + "loss": 0.1112, + "num_input_tokens_seen": 37512912, + "step": 177755 + }, + { + "epoch": 19.555555555555557, + "grad_norm": 0.14321300387382507, + "learning_rate": 7.52136728112085e-08, + "loss": 0.0036, + "num_input_tokens_seen": 37513936, + "step": 177760 + }, + { + "epoch": 19.556105610561055, + "grad_norm": 0.17038452625274658, + "learning_rate": 7.50277546693573e-08, + "loss": 0.0026, + "num_input_tokens_seen": 37515024, + "step": 177765 + }, + { + "epoch": 19.556655665566556, + "grad_norm": 0.0602123886346817, + "learning_rate": 7.484206624901735e-08, + "loss": 0.0037, + "num_input_tokens_seen": 37516016, + "step": 177770 + }, + { + "epoch": 19.557205720572057, + "grad_norm": 0.19747847318649292, + "learning_rate": 7.465660755189563e-08, + "loss": 0.0636, + "num_input_tokens_seen": 37517104, + "step": 177775 + }, + { + "epoch": 19.557755775577558, + "grad_norm": 0.019726205617189407, + "learning_rate": 7.447137857970465e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37518224, + "step": 177780 + }, + { + "epoch": 19.55830583058306, + "grad_norm": 1.7594283819198608, + "learning_rate": 7.42863793341514e-08, + "loss": 0.0109, + "num_input_tokens_seen": 37519312, + "step": 177785 + }, + { + "epoch": 19.55885588558856, + "grad_norm": 0.17332372069358826, + "learning_rate": 7.410160981694003e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37520464, + "step": 177790 + }, + { + "epoch": 19.55940594059406, + "grad_norm": 0.05422146990895271, + "learning_rate": 7.391707002977477e-08, + "loss": 0.0134, + "num_input_tokens_seen": 37521520, + "step": 177795 + }, + { + "epoch": 19.55995599559956, + "grad_norm": 0.022461598739027977, + "learning_rate": 7.373275997435148e-08, + "loss": 0.0025, + "num_input_tokens_seen": 37522608, + "step": 177800 + }, + { + "epoch": 19.56050605060506, + "grad_norm": 0.018424883484840393, + "learning_rate": 7.354867965237433e-08, + "loss": 0.0062, + "num_input_tokens_seen": 37523600, + "step": 177805 + }, + { + "epoch": 19.56105610561056, + "grad_norm": 0.037395402789115906, + "learning_rate": 7.336482906553921e-08, + "loss": 0.007, + "num_input_tokens_seen": 37524656, + "step": 177810 + }, + { + "epoch": 19.561606160616062, + "grad_norm": 0.012355422601103783, + "learning_rate": 7.318120821553919e-08, + "loss": 0.0023, + "num_input_tokens_seen": 37525776, + "step": 177815 + }, + { + "epoch": 19.562156215621563, + "grad_norm": 0.002588218078017235, + "learning_rate": 7.299781710406462e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37526832, + "step": 177820 + }, + { + "epoch": 19.562706270627064, + "grad_norm": 0.2712121605873108, + "learning_rate": 7.281465573281132e-08, + "loss": 0.0038, + "num_input_tokens_seen": 37527888, + "step": 177825 + }, + { + "epoch": 19.563256325632562, + "grad_norm": 0.013732850551605225, + "learning_rate": 7.26317241034613e-08, + "loss": 0.0058, + "num_input_tokens_seen": 37528912, + "step": 177830 + }, + { + "epoch": 19.563806380638063, + "grad_norm": 1.5666979551315308, + "learning_rate": 7.244902221770489e-08, + "loss": 0.0792, + "num_input_tokens_seen": 37529968, + "step": 177835 + }, + { + "epoch": 19.564356435643564, + "grad_norm": 0.015659062191843987, + "learning_rate": 7.226655007722405e-08, + "loss": 0.04, + "num_input_tokens_seen": 37531024, + "step": 177840 + }, + { + "epoch": 19.564906490649065, + "grad_norm": 2.002936840057373, + "learning_rate": 7.208430768370078e-08, + "loss": 0.1382, + "num_input_tokens_seen": 37532080, + "step": 177845 + }, + { + "epoch": 19.565456545654566, + "grad_norm": 0.029268499463796616, + "learning_rate": 7.190229503881707e-08, + "loss": 0.0807, + "num_input_tokens_seen": 37533104, + "step": 177850 + }, + { + "epoch": 19.566006600660067, + "grad_norm": 0.013457500375807285, + "learning_rate": 7.172051214424657e-08, + "loss": 0.0007, + "num_input_tokens_seen": 37534128, + "step": 177855 + }, + { + "epoch": 19.566556655665565, + "grad_norm": 0.015673819929361343, + "learning_rate": 7.153895900166574e-08, + "loss": 0.0008, + "num_input_tokens_seen": 37535248, + "step": 177860 + }, + { + "epoch": 19.567106710671066, + "grad_norm": 1.7482327222824097, + "learning_rate": 7.135763561274822e-08, + "loss": 0.0815, + "num_input_tokens_seen": 37536304, + "step": 177865 + }, + { + "epoch": 19.567656765676567, + "grad_norm": 0.6948390007019043, + "learning_rate": 7.11765419791649e-08, + "loss": 0.0058, + "num_input_tokens_seen": 37537360, + "step": 177870 + }, + { + "epoch": 19.568206820682068, + "grad_norm": 0.24265412986278534, + "learning_rate": 7.099567810258667e-08, + "loss": 0.0028, + "num_input_tokens_seen": 37538352, + "step": 177875 + }, + { + "epoch": 19.56875687568757, + "grad_norm": 0.015105286613106728, + "learning_rate": 7.081504398467886e-08, + "loss": 0.1178, + "num_input_tokens_seen": 37539408, + "step": 177880 + }, + { + "epoch": 19.56930693069307, + "grad_norm": 0.020427025854587555, + "learning_rate": 7.063463962710959e-08, + "loss": 0.0282, + "num_input_tokens_seen": 37540432, + "step": 177885 + }, + { + "epoch": 19.56985698569857, + "grad_norm": 0.07679882645606995, + "learning_rate": 7.045446503153307e-08, + "loss": 0.1395, + "num_input_tokens_seen": 37541392, + "step": 177890 + }, + { + "epoch": 19.57040704070407, + "grad_norm": 0.09421169012784958, + "learning_rate": 7.027452019961744e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37542448, + "step": 177895 + }, + { + "epoch": 19.57095709570957, + "grad_norm": 0.019728681072592735, + "learning_rate": 7.009480513302246e-08, + "loss": 0.148, + "num_input_tokens_seen": 37543472, + "step": 177900 + }, + { + "epoch": 19.57150715071507, + "grad_norm": 0.3209828734397888, + "learning_rate": 6.991531983339684e-08, + "loss": 0.0361, + "num_input_tokens_seen": 37544528, + "step": 177905 + }, + { + "epoch": 19.572057205720572, + "grad_norm": 0.06624974310398102, + "learning_rate": 6.973606430240032e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37545584, + "step": 177910 + }, + { + "epoch": 19.572607260726073, + "grad_norm": 0.49041372537612915, + "learning_rate": 6.955703854168438e-08, + "loss": 0.0045, + "num_input_tokens_seen": 37546640, + "step": 177915 + }, + { + "epoch": 19.573157315731574, + "grad_norm": 0.053173091262578964, + "learning_rate": 6.937824255290048e-08, + "loss": 0.003, + "num_input_tokens_seen": 37547728, + "step": 177920 + }, + { + "epoch": 19.573707370737075, + "grad_norm": 0.01702132262289524, + "learning_rate": 6.919967633769176e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37548816, + "step": 177925 + }, + { + "epoch": 19.574257425742573, + "grad_norm": 3.774214744567871, + "learning_rate": 6.902133989770687e-08, + "loss": 0.1031, + "num_input_tokens_seen": 37549968, + "step": 177930 + }, + { + "epoch": 19.574807480748074, + "grad_norm": 0.015108369290828705, + "learning_rate": 6.884323323458897e-08, + "loss": 0.0379, + "num_input_tokens_seen": 37551088, + "step": 177935 + }, + { + "epoch": 19.575357535753575, + "grad_norm": 0.13854843378067017, + "learning_rate": 6.866535634998117e-08, + "loss": 0.0074, + "num_input_tokens_seen": 37552144, + "step": 177940 + }, + { + "epoch": 19.575907590759076, + "grad_norm": 0.01411434356123209, + "learning_rate": 6.848770924552106e-08, + "loss": 0.0042, + "num_input_tokens_seen": 37553168, + "step": 177945 + }, + { + "epoch": 19.576457645764577, + "grad_norm": 0.11911794543266296, + "learning_rate": 6.831029192284343e-08, + "loss": 0.116, + "num_input_tokens_seen": 37554224, + "step": 177950 + }, + { + "epoch": 19.57700770077008, + "grad_norm": 0.10688773542642593, + "learning_rate": 6.813310438358866e-08, + "loss": 0.1269, + "num_input_tokens_seen": 37555312, + "step": 177955 + }, + { + "epoch": 19.577557755775576, + "grad_norm": 1.953078031539917, + "learning_rate": 6.795614662938876e-08, + "loss": 0.0828, + "num_input_tokens_seen": 37556368, + "step": 177960 + }, + { + "epoch": 19.578107810781077, + "grad_norm": 0.16998620331287384, + "learning_rate": 6.777941866187576e-08, + "loss": 0.0031, + "num_input_tokens_seen": 37557424, + "step": 177965 + }, + { + "epoch": 19.578657865786578, + "grad_norm": 2.5849344730377197, + "learning_rate": 6.760292048267058e-08, + "loss": 0.0465, + "num_input_tokens_seen": 37558544, + "step": 177970 + }, + { + "epoch": 19.57920792079208, + "grad_norm": 0.1551998406648636, + "learning_rate": 6.742665209341081e-08, + "loss": 0.1002, + "num_input_tokens_seen": 37559568, + "step": 177975 + }, + { + "epoch": 19.57975797579758, + "grad_norm": 0.06085231900215149, + "learning_rate": 6.725061349571183e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37560624, + "step": 177980 + }, + { + "epoch": 19.58030803080308, + "grad_norm": 0.05377829074859619, + "learning_rate": 6.707480469120009e-08, + "loss": 0.0045, + "num_input_tokens_seen": 37561648, + "step": 177985 + }, + { + "epoch": 19.580858085808583, + "grad_norm": 0.0556105338037014, + "learning_rate": 6.689922568149931e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37562736, + "step": 177990 + }, + { + "epoch": 19.58140814081408, + "grad_norm": 0.011524940840899944, + "learning_rate": 6.672387646822209e-08, + "loss": 0.0062, + "num_input_tokens_seen": 37563760, + "step": 177995 + }, + { + "epoch": 19.58195819581958, + "grad_norm": 0.030759770423173904, + "learning_rate": 6.654875705298657e-08, + "loss": 0.0028, + "num_input_tokens_seen": 37564816, + "step": 178000 + }, + { + "epoch": 19.582508250825082, + "grad_norm": 0.041672494262456894, + "learning_rate": 6.637386743740537e-08, + "loss": 0.0185, + "num_input_tokens_seen": 37565872, + "step": 178005 + }, + { + "epoch": 19.583058305830583, + "grad_norm": 1.9304355382919312, + "learning_rate": 6.619920762309384e-08, + "loss": 0.0726, + "num_input_tokens_seen": 37566960, + "step": 178010 + }, + { + "epoch": 19.583608360836084, + "grad_norm": 0.016820697113871574, + "learning_rate": 6.602477761166181e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37568016, + "step": 178015 + }, + { + "epoch": 19.584158415841586, + "grad_norm": 0.024702034890651703, + "learning_rate": 6.585057740471079e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37569040, + "step": 178020 + }, + { + "epoch": 19.584708470847083, + "grad_norm": 0.04975790157914162, + "learning_rate": 6.567660700385337e-08, + "loss": 0.0031, + "num_input_tokens_seen": 37570064, + "step": 178025 + }, + { + "epoch": 19.585258525852584, + "grad_norm": 0.0071713803336024284, + "learning_rate": 6.550286641068826e-08, + "loss": 0.0045, + "num_input_tokens_seen": 37571056, + "step": 178030 + }, + { + "epoch": 19.585808580858085, + "grad_norm": 0.009955435991287231, + "learning_rate": 6.532935562681974e-08, + "loss": 0.0517, + "num_input_tokens_seen": 37572112, + "step": 178035 + }, + { + "epoch": 19.586358635863586, + "grad_norm": 0.0041959346272051334, + "learning_rate": 6.515607465384654e-08, + "loss": 0.0021, + "num_input_tokens_seen": 37573136, + "step": 178040 + }, + { + "epoch": 19.586908690869087, + "grad_norm": 3.4987142086029053, + "learning_rate": 6.498302349336461e-08, + "loss": 0.0569, + "num_input_tokens_seen": 37574192, + "step": 178045 + }, + { + "epoch": 19.58745874587459, + "grad_norm": 0.7167109847068787, + "learning_rate": 6.481020214696709e-08, + "loss": 0.0677, + "num_input_tokens_seen": 37575280, + "step": 178050 + }, + { + "epoch": 19.58800880088009, + "grad_norm": 0.02564358524978161, + "learning_rate": 6.463761061624995e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37576272, + "step": 178055 + }, + { + "epoch": 19.588558855885587, + "grad_norm": 0.02235279232263565, + "learning_rate": 6.446524890280358e-08, + "loss": 0.0077, + "num_input_tokens_seen": 37577328, + "step": 178060 + }, + { + "epoch": 19.58910891089109, + "grad_norm": 0.029683351516723633, + "learning_rate": 6.42931170082156e-08, + "loss": 0.0007, + "num_input_tokens_seen": 37578416, + "step": 178065 + }, + { + "epoch": 19.58965896589659, + "grad_norm": 0.006527242250740528, + "learning_rate": 6.412121493407087e-08, + "loss": 0.021, + "num_input_tokens_seen": 37579504, + "step": 178070 + }, + { + "epoch": 19.59020902090209, + "grad_norm": 0.01760685071349144, + "learning_rate": 6.394954268195697e-08, + "loss": 0.0132, + "num_input_tokens_seen": 37580592, + "step": 178075 + }, + { + "epoch": 19.59075907590759, + "grad_norm": 0.4428860545158386, + "learning_rate": 6.37781002534532e-08, + "loss": 0.0038, + "num_input_tokens_seen": 37581616, + "step": 178080 + }, + { + "epoch": 19.591309130913093, + "grad_norm": 0.20191001892089844, + "learning_rate": 6.360688765014445e-08, + "loss": 0.0036, + "num_input_tokens_seen": 37582640, + "step": 178085 + }, + { + "epoch": 19.59185918591859, + "grad_norm": 0.021526776254177094, + "learning_rate": 6.34359048736044e-08, + "loss": 0.0022, + "num_input_tokens_seen": 37583696, + "step": 178090 + }, + { + "epoch": 19.59240924092409, + "grad_norm": 0.014718569815158844, + "learning_rate": 6.326515192540682e-08, + "loss": 0.0055, + "num_input_tokens_seen": 37584720, + "step": 178095 + }, + { + "epoch": 19.592959295929592, + "grad_norm": 0.03182150796055794, + "learning_rate": 6.309462880713102e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37585744, + "step": 178100 + }, + { + "epoch": 19.593509350935093, + "grad_norm": 0.08776838332414627, + "learning_rate": 6.292433552034516e-08, + "loss": 0.0159, + "num_input_tokens_seen": 37586832, + "step": 178105 + }, + { + "epoch": 19.594059405940595, + "grad_norm": 0.07294462621212006, + "learning_rate": 6.275427206662021e-08, + "loss": 0.0586, + "num_input_tokens_seen": 37587856, + "step": 178110 + }, + { + "epoch": 19.594609460946096, + "grad_norm": 0.024456888437271118, + "learning_rate": 6.258443844752438e-08, + "loss": 0.0651, + "num_input_tokens_seen": 37588944, + "step": 178115 + }, + { + "epoch": 19.595159515951597, + "grad_norm": 0.01759270206093788, + "learning_rate": 6.241483466462029e-08, + "loss": 0.0417, + "num_input_tokens_seen": 37589968, + "step": 178120 + }, + { + "epoch": 19.595709570957094, + "grad_norm": 0.3411567509174347, + "learning_rate": 6.224546071947057e-08, + "loss": 0.0935, + "num_input_tokens_seen": 37591024, + "step": 178125 + }, + { + "epoch": 19.596259625962595, + "grad_norm": 1.9198930263519287, + "learning_rate": 6.207631661364066e-08, + "loss": 0.0326, + "num_input_tokens_seen": 37592112, + "step": 178130 + }, + { + "epoch": 19.596809680968097, + "grad_norm": 2.0477488040924072, + "learning_rate": 6.190740234868764e-08, + "loss": 0.0174, + "num_input_tokens_seen": 37593200, + "step": 178135 + }, + { + "epoch": 19.597359735973598, + "grad_norm": 0.025056932121515274, + "learning_rate": 6.173871792616581e-08, + "loss": 0.0033, + "num_input_tokens_seen": 37594224, + "step": 178140 + }, + { + "epoch": 19.5979097909791, + "grad_norm": 6.997714519500732, + "learning_rate": 6.157026334763228e-08, + "loss": 0.1164, + "num_input_tokens_seen": 37595312, + "step": 178145 + }, + { + "epoch": 19.5984598459846, + "grad_norm": 0.04886370897293091, + "learning_rate": 6.140203861463855e-08, + "loss": 0.0997, + "num_input_tokens_seen": 37596336, + "step": 178150 + }, + { + "epoch": 19.599009900990097, + "grad_norm": 1.600899577140808, + "learning_rate": 6.12340437287362e-08, + "loss": 0.0073, + "num_input_tokens_seen": 37597424, + "step": 178155 + }, + { + "epoch": 19.5995599559956, + "grad_norm": 0.06466629356145859, + "learning_rate": 6.106627869147119e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37598448, + "step": 178160 + }, + { + "epoch": 19.6001100110011, + "grad_norm": 0.04264618083834648, + "learning_rate": 6.089874350439506e-08, + "loss": 0.1042, + "num_input_tokens_seen": 37599504, + "step": 178165 + }, + { + "epoch": 19.6006600660066, + "grad_norm": 0.06757411360740662, + "learning_rate": 6.073143816904548e-08, + "loss": 0.0262, + "num_input_tokens_seen": 37600624, + "step": 178170 + }, + { + "epoch": 19.6012101210121, + "grad_norm": 0.07188630104064941, + "learning_rate": 6.056436268696842e-08, + "loss": 0.0081, + "num_input_tokens_seen": 37601616, + "step": 178175 + }, + { + "epoch": 19.601760176017603, + "grad_norm": 0.07057635486125946, + "learning_rate": 6.039751705970153e-08, + "loss": 0.005, + "num_input_tokens_seen": 37602576, + "step": 178180 + }, + { + "epoch": 19.602310231023104, + "grad_norm": 0.02223096787929535, + "learning_rate": 6.02309012887825e-08, + "loss": 0.0677, + "num_input_tokens_seen": 37603632, + "step": 178185 + }, + { + "epoch": 19.6028602860286, + "grad_norm": 0.030207036063075066, + "learning_rate": 6.006451537574898e-08, + "loss": 0.0409, + "num_input_tokens_seen": 37604688, + "step": 178190 + }, + { + "epoch": 19.603410341034103, + "grad_norm": 0.01874883659183979, + "learning_rate": 5.989835932213583e-08, + "loss": 0.0045, + "num_input_tokens_seen": 37605776, + "step": 178195 + }, + { + "epoch": 19.603960396039604, + "grad_norm": 0.13772857189178467, + "learning_rate": 5.973243312946964e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37606832, + "step": 178200 + }, + { + "epoch": 19.604510451045105, + "grad_norm": 0.10415604710578918, + "learning_rate": 5.9566736799282486e-08, + "loss": 0.0073, + "num_input_tokens_seen": 37607920, + "step": 178205 + }, + { + "epoch": 19.605060506050606, + "grad_norm": 0.12645123898983002, + "learning_rate": 5.9401270333100945e-08, + "loss": 0.0528, + "num_input_tokens_seen": 37608912, + "step": 178210 + }, + { + "epoch": 19.605610561056107, + "grad_norm": 0.037833310663700104, + "learning_rate": 5.923603373245157e-08, + "loss": 0.0225, + "num_input_tokens_seen": 37610032, + "step": 178215 + }, + { + "epoch": 19.606160616061608, + "grad_norm": 0.016247455030679703, + "learning_rate": 5.907102699885259e-08, + "loss": 0.076, + "num_input_tokens_seen": 37611120, + "step": 178220 + }, + { + "epoch": 19.606710671067106, + "grad_norm": 0.07478772848844528, + "learning_rate": 5.8906250133830556e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37612240, + "step": 178225 + }, + { + "epoch": 19.607260726072607, + "grad_norm": 0.15773023664951324, + "learning_rate": 5.87417031389037e-08, + "loss": 0.061, + "num_input_tokens_seen": 37613264, + "step": 178230 + }, + { + "epoch": 19.607810781078108, + "grad_norm": 1.6822965145111084, + "learning_rate": 5.857738601558471e-08, + "loss": 0.0796, + "num_input_tokens_seen": 37614352, + "step": 178235 + }, + { + "epoch": 19.60836083608361, + "grad_norm": 0.013376311399042606, + "learning_rate": 5.8413298765389034e-08, + "loss": 0.0051, + "num_input_tokens_seen": 37615440, + "step": 178240 + }, + { + "epoch": 19.60891089108911, + "grad_norm": 0.13324549794197083, + "learning_rate": 5.824944138982935e-08, + "loss": 0.0874, + "num_input_tokens_seen": 37616528, + "step": 178245 + }, + { + "epoch": 19.60946094609461, + "grad_norm": 0.037748780101537704, + "learning_rate": 5.808581389041834e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37617552, + "step": 178250 + }, + { + "epoch": 19.61001100110011, + "grad_norm": 0.007868603803217411, + "learning_rate": 5.792241626866035e-08, + "loss": 0.008, + "num_input_tokens_seen": 37618576, + "step": 178255 + }, + { + "epoch": 19.61056105610561, + "grad_norm": 0.01247186679393053, + "learning_rate": 5.775924852606529e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37619664, + "step": 178260 + }, + { + "epoch": 19.61111111111111, + "grad_norm": 2.5194315910339355, + "learning_rate": 5.759631066413196e-08, + "loss": 0.1052, + "num_input_tokens_seen": 37620848, + "step": 178265 + }, + { + "epoch": 19.611661166116612, + "grad_norm": 0.028880883008241653, + "learning_rate": 5.7433602684367485e-08, + "loss": 0.0388, + "num_input_tokens_seen": 37621872, + "step": 178270 + }, + { + "epoch": 19.612211221122113, + "grad_norm": 0.10087930411100388, + "learning_rate": 5.7271124588267886e-08, + "loss": 0.0044, + "num_input_tokens_seen": 37622928, + "step": 178275 + }, + { + "epoch": 19.612761276127614, + "grad_norm": 0.012416880577802658, + "learning_rate": 5.7108876377331975e-08, + "loss": 0.0027, + "num_input_tokens_seen": 37623984, + "step": 178280 + }, + { + "epoch": 19.61331133113311, + "grad_norm": 0.02680126205086708, + "learning_rate": 5.6946858053053e-08, + "loss": 0.0027, + "num_input_tokens_seen": 37624976, + "step": 178285 + }, + { + "epoch": 19.613861386138613, + "grad_norm": 0.06433066725730896, + "learning_rate": 5.678506961692698e-08, + "loss": 0.0035, + "num_input_tokens_seen": 37626064, + "step": 178290 + }, + { + "epoch": 19.614411441144114, + "grad_norm": 0.02989666908979416, + "learning_rate": 5.662351107044439e-08, + "loss": 0.0034, + "num_input_tokens_seen": 37627120, + "step": 178295 + }, + { + "epoch": 19.614961496149615, + "grad_norm": 0.17174750566482544, + "learning_rate": 5.646218241509571e-08, + "loss": 0.0381, + "num_input_tokens_seen": 37628272, + "step": 178300 + }, + { + "epoch": 19.615511551155116, + "grad_norm": 0.06124813109636307, + "learning_rate": 5.630108365236308e-08, + "loss": 0.0095, + "num_input_tokens_seen": 37629328, + "step": 178305 + }, + { + "epoch": 19.616061606160617, + "grad_norm": 0.01935977302491665, + "learning_rate": 5.614021478373699e-08, + "loss": 0.0008, + "num_input_tokens_seen": 37630352, + "step": 178310 + }, + { + "epoch": 19.616611661166118, + "grad_norm": 0.0052881306037306786, + "learning_rate": 5.597957581069402e-08, + "loss": 0.0438, + "num_input_tokens_seen": 37631472, + "step": 178315 + }, + { + "epoch": 19.617161716171616, + "grad_norm": 1.6995429992675781, + "learning_rate": 5.5819166734719097e-08, + "loss": 0.0437, + "num_input_tokens_seen": 37632528, + "step": 178320 + }, + { + "epoch": 19.617711771177117, + "grad_norm": 2.0543947219848633, + "learning_rate": 5.56589875572916e-08, + "loss": 0.088, + "num_input_tokens_seen": 37633584, + "step": 178325 + }, + { + "epoch": 19.618261826182618, + "grad_norm": 0.04855618625879288, + "learning_rate": 5.549903827988534e-08, + "loss": 0.0423, + "num_input_tokens_seen": 37634640, + "step": 178330 + }, + { + "epoch": 19.61881188118812, + "grad_norm": 0.034716445952653885, + "learning_rate": 5.5339318903971374e-08, + "loss": 0.0024, + "num_input_tokens_seen": 37635696, + "step": 178335 + }, + { + "epoch": 19.61936193619362, + "grad_norm": 0.00896708108484745, + "learning_rate": 5.5179829431029064e-08, + "loss": 0.0497, + "num_input_tokens_seen": 37636784, + "step": 178340 + }, + { + "epoch": 19.61991199119912, + "grad_norm": 0.09180551767349243, + "learning_rate": 5.502056986252391e-08, + "loss": 0.005, + "num_input_tokens_seen": 37637808, + "step": 178345 + }, + { + "epoch": 19.620462046204622, + "grad_norm": 0.09771748632192612, + "learning_rate": 5.486154019992418e-08, + "loss": 0.1424, + "num_input_tokens_seen": 37638896, + "step": 178350 + }, + { + "epoch": 19.62101210121012, + "grad_norm": 0.030266383662819862, + "learning_rate": 5.470274044469537e-08, + "loss": 0.0023, + "num_input_tokens_seen": 37640016, + "step": 178355 + }, + { + "epoch": 19.62156215621562, + "grad_norm": 0.03237679973244667, + "learning_rate": 5.454417059830019e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37640976, + "step": 178360 + }, + { + "epoch": 19.622112211221122, + "grad_norm": 1.9226393699645996, + "learning_rate": 5.438583066220415e-08, + "loss": 0.1521, + "num_input_tokens_seen": 37642032, + "step": 178365 + }, + { + "epoch": 19.622662266226623, + "grad_norm": 0.09523005038499832, + "learning_rate": 5.422772063786163e-08, + "loss": 0.0426, + "num_input_tokens_seen": 37643120, + "step": 178370 + }, + { + "epoch": 19.623212321232124, + "grad_norm": 0.02225571498274803, + "learning_rate": 5.406984052673258e-08, + "loss": 0.0122, + "num_input_tokens_seen": 37644112, + "step": 178375 + }, + { + "epoch": 19.623762376237625, + "grad_norm": 0.08236938714981079, + "learning_rate": 5.3912190330271395e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37645168, + "step": 178380 + }, + { + "epoch": 19.624312431243123, + "grad_norm": 1.2068341970443726, + "learning_rate": 5.375477004993246e-08, + "loss": 0.0411, + "num_input_tokens_seen": 37646256, + "step": 178385 + }, + { + "epoch": 19.624862486248624, + "grad_norm": 0.038746897131204605, + "learning_rate": 5.359757968716461e-08, + "loss": 0.0419, + "num_input_tokens_seen": 37647376, + "step": 178390 + }, + { + "epoch": 19.625412541254125, + "grad_norm": 0.00862161349505186, + "learning_rate": 5.34406192434167e-08, + "loss": 0.0071, + "num_input_tokens_seen": 37648496, + "step": 178395 + }, + { + "epoch": 19.625962596259626, + "grad_norm": 0.004838456399738789, + "learning_rate": 5.3283888720137565e-08, + "loss": 0.0273, + "num_input_tokens_seen": 37649520, + "step": 178400 + }, + { + "epoch": 19.626512651265127, + "grad_norm": 0.0866885632276535, + "learning_rate": 5.312738811877049e-08, + "loss": 0.1355, + "num_input_tokens_seen": 37650576, + "step": 178405 + }, + { + "epoch": 19.627062706270628, + "grad_norm": 0.44792628288269043, + "learning_rate": 5.2971117440756e-08, + "loss": 0.0082, + "num_input_tokens_seen": 37651696, + "step": 178410 + }, + { + "epoch": 19.62761276127613, + "grad_norm": 0.02058558538556099, + "learning_rate": 5.28150766875346e-08, + "loss": 0.0012, + "num_input_tokens_seen": 37652752, + "step": 178415 + }, + { + "epoch": 19.628162816281627, + "grad_norm": 0.011850343085825443, + "learning_rate": 5.265926586054959e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37653808, + "step": 178420 + }, + { + "epoch": 19.628712871287128, + "grad_norm": 0.08322902768850327, + "learning_rate": 5.250368496123037e-08, + "loss": 0.1081, + "num_input_tokens_seen": 37654896, + "step": 178425 + }, + { + "epoch": 19.62926292629263, + "grad_norm": 0.02828024886548519, + "learning_rate": 5.234833399101469e-08, + "loss": 0.0042, + "num_input_tokens_seen": 37655984, + "step": 178430 + }, + { + "epoch": 19.62981298129813, + "grad_norm": 0.07488295435905457, + "learning_rate": 5.219321295133472e-08, + "loss": 0.0021, + "num_input_tokens_seen": 37657008, + "step": 178435 + }, + { + "epoch": 19.63036303630363, + "grad_norm": 0.009094825945794582, + "learning_rate": 5.203832184361712e-08, + "loss": 0.0014, + "num_input_tokens_seen": 37658032, + "step": 178440 + }, + { + "epoch": 19.630913091309132, + "grad_norm": 0.0063477675430476665, + "learning_rate": 5.188366066929129e-08, + "loss": 0.0399, + "num_input_tokens_seen": 37659088, + "step": 178445 + }, + { + "epoch": 19.63146314631463, + "grad_norm": 0.014218756929039955, + "learning_rate": 5.1729229429783865e-08, + "loss": 0.0421, + "num_input_tokens_seen": 37660176, + "step": 178450 + }, + { + "epoch": 19.63201320132013, + "grad_norm": 0.010147632099688053, + "learning_rate": 5.157502812651594e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37661232, + "step": 178455 + }, + { + "epoch": 19.632563256325632, + "grad_norm": 0.022742440924048424, + "learning_rate": 5.142105676090858e-08, + "loss": 0.0056, + "num_input_tokens_seen": 37662256, + "step": 178460 + }, + { + "epoch": 19.633113311331133, + "grad_norm": 0.018809273838996887, + "learning_rate": 5.1267315334382895e-08, + "loss": 0.0006, + "num_input_tokens_seen": 37663280, + "step": 178465 + }, + { + "epoch": 19.633663366336634, + "grad_norm": 0.0071411519311368465, + "learning_rate": 5.11138038483544e-08, + "loss": 0.0012, + "num_input_tokens_seen": 37664368, + "step": 178470 + }, + { + "epoch": 19.634213421342135, + "grad_norm": 1.0139975547790527, + "learning_rate": 5.0960522304235867e-08, + "loss": 0.0694, + "num_input_tokens_seen": 37665520, + "step": 178475 + }, + { + "epoch": 19.634763476347636, + "grad_norm": 1.836769461631775, + "learning_rate": 5.08074707034456e-08, + "loss": 0.0712, + "num_input_tokens_seen": 37666640, + "step": 178480 + }, + { + "epoch": 19.635313531353134, + "grad_norm": 0.3222592771053314, + "learning_rate": 5.0654649047390815e-08, + "loss": 0.003, + "num_input_tokens_seen": 37667696, + "step": 178485 + }, + { + "epoch": 19.635863586358635, + "grad_norm": 0.5389066338539124, + "learning_rate": 5.050205733748148e-08, + "loss": 0.0112, + "num_input_tokens_seen": 37668688, + "step": 178490 + }, + { + "epoch": 19.636413641364136, + "grad_norm": 0.07815048843622208, + "learning_rate": 5.0349695575122033e-08, + "loss": 0.0023, + "num_input_tokens_seen": 37669808, + "step": 178495 + }, + { + "epoch": 19.636963696369637, + "grad_norm": 2.7147908210754395, + "learning_rate": 5.019756376171414e-08, + "loss": 0.0308, + "num_input_tokens_seen": 37670864, + "step": 178500 + }, + { + "epoch": 19.63751375137514, + "grad_norm": 0.06828499585390091, + "learning_rate": 5.0045661898667776e-08, + "loss": 0.0008, + "num_input_tokens_seen": 37671920, + "step": 178505 + }, + { + "epoch": 19.63806380638064, + "grad_norm": 0.014123868197202682, + "learning_rate": 4.989398998737349e-08, + "loss": 0.0031, + "num_input_tokens_seen": 37673104, + "step": 178510 + }, + { + "epoch": 19.638613861386137, + "grad_norm": 0.11467724293470383, + "learning_rate": 4.97425480292385e-08, + "loss": 0.0292, + "num_input_tokens_seen": 37674160, + "step": 178515 + }, + { + "epoch": 19.639163916391638, + "grad_norm": 0.04600655660033226, + "learning_rate": 4.959133602565336e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37675312, + "step": 178520 + }, + { + "epoch": 19.63971397139714, + "grad_norm": 0.2206888496875763, + "learning_rate": 4.944035397801139e-08, + "loss": 0.0034, + "num_input_tokens_seen": 37676400, + "step": 178525 + }, + { + "epoch": 19.64026402640264, + "grad_norm": 0.019807137548923492, + "learning_rate": 4.928960188770593e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37677520, + "step": 178530 + }, + { + "epoch": 19.64081408140814, + "grad_norm": 0.030754584819078445, + "learning_rate": 4.913907975612475e-08, + "loss": 0.0058, + "num_input_tokens_seen": 37678608, + "step": 178535 + }, + { + "epoch": 19.641364136413642, + "grad_norm": 1.5197784900665283, + "learning_rate": 4.8988787584655635e-08, + "loss": 0.0349, + "num_input_tokens_seen": 37679568, + "step": 178540 + }, + { + "epoch": 19.641914191419144, + "grad_norm": 0.05785228684544563, + "learning_rate": 4.883872537468637e-08, + "loss": 0.002, + "num_input_tokens_seen": 37680624, + "step": 178545 + }, + { + "epoch": 19.64246424642464, + "grad_norm": 0.25594526529312134, + "learning_rate": 4.868889312759639e-08, + "loss": 0.0062, + "num_input_tokens_seen": 37681712, + "step": 178550 + }, + { + "epoch": 19.643014301430142, + "grad_norm": 0.04421583190560341, + "learning_rate": 4.8539290844767937e-08, + "loss": 0.0837, + "num_input_tokens_seen": 37682832, + "step": 178555 + }, + { + "epoch": 19.643564356435643, + "grad_norm": 0.06154418736696243, + "learning_rate": 4.838991852758046e-08, + "loss": 0.0012, + "num_input_tokens_seen": 37683792, + "step": 178560 + }, + { + "epoch": 19.644114411441144, + "grad_norm": 0.15247637033462524, + "learning_rate": 4.824077617741063e-08, + "loss": 0.0534, + "num_input_tokens_seen": 37684816, + "step": 178565 + }, + { + "epoch": 19.644664466446645, + "grad_norm": 0.3941591680049896, + "learning_rate": 4.809186379563235e-08, + "loss": 0.0366, + "num_input_tokens_seen": 37685936, + "step": 178570 + }, + { + "epoch": 19.645214521452147, + "grad_norm": 0.016416320577263832, + "learning_rate": 4.794318138361675e-08, + "loss": 0.0044, + "num_input_tokens_seen": 37686960, + "step": 178575 + }, + { + "epoch": 19.645764576457644, + "grad_norm": 0.01584089919924736, + "learning_rate": 4.779472894273773e-08, + "loss": 0.0278, + "num_input_tokens_seen": 37688080, + "step": 178580 + }, + { + "epoch": 19.646314631463145, + "grad_norm": 0.06871243566274643, + "learning_rate": 4.764650647436086e-08, + "loss": 0.0022, + "num_input_tokens_seen": 37689072, + "step": 178585 + }, + { + "epoch": 19.646864686468646, + "grad_norm": 2.919414520263672, + "learning_rate": 4.7498513979851725e-08, + "loss": 0.0111, + "num_input_tokens_seen": 37690096, + "step": 178590 + }, + { + "epoch": 19.647414741474147, + "grad_norm": 0.05570731312036514, + "learning_rate": 4.735075146057588e-08, + "loss": 0.0031, + "num_input_tokens_seen": 37691120, + "step": 178595 + }, + { + "epoch": 19.64796479647965, + "grad_norm": 0.1710079312324524, + "learning_rate": 4.720321891789614e-08, + "loss": 0.0026, + "num_input_tokens_seen": 37692144, + "step": 178600 + }, + { + "epoch": 19.64851485148515, + "grad_norm": 0.002694530878216028, + "learning_rate": 4.7055916353169746e-08, + "loss": 0.0875, + "num_input_tokens_seen": 37693136, + "step": 178605 + }, + { + "epoch": 19.64906490649065, + "grad_norm": 0.26709312200546265, + "learning_rate": 4.6908843767756725e-08, + "loss": 0.0065, + "num_input_tokens_seen": 37694224, + "step": 178610 + }, + { + "epoch": 19.649614961496148, + "grad_norm": 2.2161192893981934, + "learning_rate": 4.6762001163008773e-08, + "loss": 0.1049, + "num_input_tokens_seen": 37695280, + "step": 178615 + }, + { + "epoch": 19.65016501650165, + "grad_norm": 0.021204017102718353, + "learning_rate": 4.661538854028313e-08, + "loss": 0.0257, + "num_input_tokens_seen": 37696368, + "step": 178620 + }, + { + "epoch": 19.65071507150715, + "grad_norm": 0.012741976417601109, + "learning_rate": 4.646900590092873e-08, + "loss": 0.0021, + "num_input_tokens_seen": 37697456, + "step": 178625 + }, + { + "epoch": 19.65126512651265, + "grad_norm": 0.4320608377456665, + "learning_rate": 4.632285324629726e-08, + "loss": 0.0058, + "num_input_tokens_seen": 37698512, + "step": 178630 + }, + { + "epoch": 19.651815181518153, + "grad_norm": 2.1843156814575195, + "learning_rate": 4.617693057773209e-08, + "loss": 0.0505, + "num_input_tokens_seen": 37699568, + "step": 178635 + }, + { + "epoch": 19.652365236523654, + "grad_norm": 0.09425092488527298, + "learning_rate": 4.603123789658215e-08, + "loss": 0.1162, + "num_input_tokens_seen": 37700560, + "step": 178640 + }, + { + "epoch": 19.652915291529155, + "grad_norm": 3.4220688343048096, + "learning_rate": 4.588577520418802e-08, + "loss": 0.0774, + "num_input_tokens_seen": 37701616, + "step": 178645 + }, + { + "epoch": 19.653465346534652, + "grad_norm": 0.06490185856819153, + "learning_rate": 4.574054250188753e-08, + "loss": 0.069, + "num_input_tokens_seen": 37702640, + "step": 178650 + }, + { + "epoch": 19.654015401540153, + "grad_norm": 0.3731832802295685, + "learning_rate": 4.559553979102405e-08, + "loss": 0.006, + "num_input_tokens_seen": 37703696, + "step": 178655 + }, + { + "epoch": 19.654565456545654, + "grad_norm": 0.036373790353536606, + "learning_rate": 4.545076707293538e-08, + "loss": 0.0948, + "num_input_tokens_seen": 37704720, + "step": 178660 + }, + { + "epoch": 19.655115511551156, + "grad_norm": 0.01599918305873871, + "learning_rate": 4.5306224348948266e-08, + "loss": 0.0104, + "num_input_tokens_seen": 37705808, + "step": 178665 + }, + { + "epoch": 19.655665566556657, + "grad_norm": 0.07269170880317688, + "learning_rate": 4.516191162040051e-08, + "loss": 0.0861, + "num_input_tokens_seen": 37706800, + "step": 178670 + }, + { + "epoch": 19.656215621562158, + "grad_norm": 0.06952962279319763, + "learning_rate": 4.501782888862161e-08, + "loss": 0.0248, + "num_input_tokens_seen": 37707824, + "step": 178675 + }, + { + "epoch": 19.656765676567655, + "grad_norm": 0.292525053024292, + "learning_rate": 4.487397615493827e-08, + "loss": 0.0084, + "num_input_tokens_seen": 37708880, + "step": 178680 + }, + { + "epoch": 19.657315731573156, + "grad_norm": 0.021609988063573837, + "learning_rate": 4.473035342067722e-08, + "loss": 0.0726, + "num_input_tokens_seen": 37709968, + "step": 178685 + }, + { + "epoch": 19.657865786578657, + "grad_norm": 0.005779851693660021, + "learning_rate": 4.4586960687162395e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37710992, + "step": 178690 + }, + { + "epoch": 19.65841584158416, + "grad_norm": 0.03713155910372734, + "learning_rate": 4.444379795571496e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37712080, + "step": 178695 + }, + { + "epoch": 19.65896589658966, + "grad_norm": 1.7646466493606567, + "learning_rate": 4.430086522765331e-08, + "loss": 0.0359, + "num_input_tokens_seen": 37713168, + "step": 178700 + }, + { + "epoch": 19.65951595159516, + "grad_norm": 1.41510808467865, + "learning_rate": 4.415816250429583e-08, + "loss": 0.0459, + "num_input_tokens_seen": 37714224, + "step": 178705 + }, + { + "epoch": 19.66006600660066, + "grad_norm": 0.5741239190101624, + "learning_rate": 4.401568978695536e-08, + "loss": 0.0178, + "num_input_tokens_seen": 37715312, + "step": 178710 + }, + { + "epoch": 19.66061606160616, + "grad_norm": 0.03420281782746315, + "learning_rate": 4.387344707695029e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37716368, + "step": 178715 + }, + { + "epoch": 19.66116611661166, + "grad_norm": 0.5570205450057983, + "learning_rate": 4.373143437558791e-08, + "loss": 0.0125, + "num_input_tokens_seen": 37717424, + "step": 178720 + }, + { + "epoch": 19.66171617161716, + "grad_norm": 0.023238740861415863, + "learning_rate": 4.3589651684175504e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37718512, + "step": 178725 + }, + { + "epoch": 19.662266226622663, + "grad_norm": 0.11966051161289215, + "learning_rate": 4.344809900402591e-08, + "loss": 0.0685, + "num_input_tokens_seen": 37719536, + "step": 178730 + }, + { + "epoch": 19.662816281628164, + "grad_norm": 0.12864814698696136, + "learning_rate": 4.330677633643809e-08, + "loss": 0.0056, + "num_input_tokens_seen": 37720528, + "step": 178735 + }, + { + "epoch": 19.663366336633665, + "grad_norm": 0.30269867181777954, + "learning_rate": 4.316568368271379e-08, + "loss": 0.0031, + "num_input_tokens_seen": 37721552, + "step": 178740 + }, + { + "epoch": 19.663916391639162, + "grad_norm": 0.07075747847557068, + "learning_rate": 4.302482104416028e-08, + "loss": 0.1178, + "num_input_tokens_seen": 37722672, + "step": 178745 + }, + { + "epoch": 19.664466446644663, + "grad_norm": 0.04276850447058678, + "learning_rate": 4.288418842206821e-08, + "loss": 0.0164, + "num_input_tokens_seen": 37723728, + "step": 178750 + }, + { + "epoch": 19.665016501650165, + "grad_norm": 0.039981141686439514, + "learning_rate": 4.27437858177393e-08, + "loss": 0.0027, + "num_input_tokens_seen": 37724752, + "step": 178755 + }, + { + "epoch": 19.665566556655666, + "grad_norm": 0.1898193508386612, + "learning_rate": 4.26036132324642e-08, + "loss": 0.0557, + "num_input_tokens_seen": 37725840, + "step": 178760 + }, + { + "epoch": 19.666116611661167, + "grad_norm": 0.04261697456240654, + "learning_rate": 4.246367066753631e-08, + "loss": 0.0056, + "num_input_tokens_seen": 37726960, + "step": 178765 + }, + { + "epoch": 19.666666666666668, + "grad_norm": 1.9836028814315796, + "learning_rate": 4.23239581242435e-08, + "loss": 0.0513, + "num_input_tokens_seen": 37728048, + "step": 178770 + }, + { + "epoch": 19.66721672167217, + "grad_norm": 0.22536130249500275, + "learning_rate": 4.2184475603876395e-08, + "loss": 0.0588, + "num_input_tokens_seen": 37729136, + "step": 178775 + }, + { + "epoch": 19.667766776677666, + "grad_norm": 0.06673930585384369, + "learning_rate": 4.2045223107717304e-08, + "loss": 0.0091, + "num_input_tokens_seen": 37730096, + "step": 178780 + }, + { + "epoch": 19.668316831683168, + "grad_norm": 0.1604677438735962, + "learning_rate": 4.1906200637054084e-08, + "loss": 0.0045, + "num_input_tokens_seen": 37731120, + "step": 178785 + }, + { + "epoch": 19.66886688668867, + "grad_norm": 0.013244716450572014, + "learning_rate": 4.176740819316349e-08, + "loss": 0.1023, + "num_input_tokens_seen": 37732144, + "step": 178790 + }, + { + "epoch": 19.66941694169417, + "grad_norm": 2.2589826583862305, + "learning_rate": 4.162884577732784e-08, + "loss": 0.0161, + "num_input_tokens_seen": 37733264, + "step": 178795 + }, + { + "epoch": 19.66996699669967, + "grad_norm": 0.06006670370697975, + "learning_rate": 4.1490513390821105e-08, + "loss": 0.0165, + "num_input_tokens_seen": 37734320, + "step": 178800 + }, + { + "epoch": 19.670517051705172, + "grad_norm": 0.0028141967486590147, + "learning_rate": 4.1352411034920046e-08, + "loss": 0.0044, + "num_input_tokens_seen": 37735440, + "step": 178805 + }, + { + "epoch": 19.67106710671067, + "grad_norm": 0.06585381180047989, + "learning_rate": 4.121453871089864e-08, + "loss": 0.0027, + "num_input_tokens_seen": 37736496, + "step": 178810 + }, + { + "epoch": 19.67161716171617, + "grad_norm": 0.04312477633357048, + "learning_rate": 4.10768964200281e-08, + "loss": 0.061, + "num_input_tokens_seen": 37737616, + "step": 178815 + }, + { + "epoch": 19.67216721672167, + "grad_norm": 1.2054065465927124, + "learning_rate": 4.093948416357407e-08, + "loss": 0.0784, + "num_input_tokens_seen": 37738640, + "step": 178820 + }, + { + "epoch": 19.672717271727173, + "grad_norm": 0.006133444607257843, + "learning_rate": 4.0802301942802215e-08, + "loss": 0.0008, + "num_input_tokens_seen": 37739728, + "step": 178825 + }, + { + "epoch": 19.673267326732674, + "grad_norm": 0.03695547580718994, + "learning_rate": 4.066534975898095e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37740784, + "step": 178830 + }, + { + "epoch": 19.673817381738175, + "grad_norm": 0.001620865543372929, + "learning_rate": 4.0528627613370394e-08, + "loss": 0.0703, + "num_input_tokens_seen": 37741840, + "step": 178835 + }, + { + "epoch": 19.674367436743676, + "grad_norm": 0.012917534448206425, + "learning_rate": 4.039213550723064e-08, + "loss": 0.1244, + "num_input_tokens_seen": 37742864, + "step": 178840 + }, + { + "epoch": 19.674917491749174, + "grad_norm": 0.0493321567773819, + "learning_rate": 4.0255873441819025e-08, + "loss": 0.0418, + "num_input_tokens_seen": 37743920, + "step": 178845 + }, + { + "epoch": 19.675467546754675, + "grad_norm": 0.026967907324433327, + "learning_rate": 4.011984141839287e-08, + "loss": 0.0174, + "num_input_tokens_seen": 37745008, + "step": 178850 + }, + { + "epoch": 19.676017601760176, + "grad_norm": 0.029677970334887505, + "learning_rate": 3.9984039438203944e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37746096, + "step": 178855 + }, + { + "epoch": 19.676567656765677, + "grad_norm": 0.021546892821788788, + "learning_rate": 3.9848467502506816e-08, + "loss": 0.0041, + "num_input_tokens_seen": 37747216, + "step": 178860 + }, + { + "epoch": 19.677117711771178, + "grad_norm": 0.0349632203578949, + "learning_rate": 3.9713125612547695e-08, + "loss": 0.0079, + "num_input_tokens_seen": 37748176, + "step": 178865 + }, + { + "epoch": 19.67766776677668, + "grad_norm": 0.03567851334810257, + "learning_rate": 3.957801376957837e-08, + "loss": 0.0084, + "num_input_tokens_seen": 37749264, + "step": 178870 + }, + { + "epoch": 19.678217821782177, + "grad_norm": 0.0451892726123333, + "learning_rate": 3.944313197483951e-08, + "loss": 0.069, + "num_input_tokens_seen": 37750320, + "step": 178875 + }, + { + "epoch": 19.678767876787678, + "grad_norm": 2.0257771015167236, + "learning_rate": 3.930848022957734e-08, + "loss": 0.1292, + "num_input_tokens_seen": 37751344, + "step": 178880 + }, + { + "epoch": 19.67931793179318, + "grad_norm": 0.018651023507118225, + "learning_rate": 3.9174058535029753e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37752368, + "step": 178885 + }, + { + "epoch": 19.67986798679868, + "grad_norm": 0.5811254382133484, + "learning_rate": 3.9039866892437435e-08, + "loss": 0.0842, + "num_input_tokens_seen": 37753392, + "step": 178890 + }, + { + "epoch": 19.68041804180418, + "grad_norm": 0.006320763844996691, + "learning_rate": 3.890590530303828e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37754416, + "step": 178895 + }, + { + "epoch": 19.680968096809682, + "grad_norm": 0.017032528296113014, + "learning_rate": 3.877217376806463e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37755472, + "step": 178900 + }, + { + "epoch": 19.681518151815183, + "grad_norm": 0.03671283274888992, + "learning_rate": 3.863867228875162e-08, + "loss": 0.0122, + "num_input_tokens_seen": 37756496, + "step": 178905 + }, + { + "epoch": 19.68206820682068, + "grad_norm": 0.00931769423186779, + "learning_rate": 3.850540086632881e-08, + "loss": 0.0033, + "num_input_tokens_seen": 37757520, + "step": 178910 + }, + { + "epoch": 19.682618261826182, + "grad_norm": 0.10840637236833572, + "learning_rate": 3.8372359502023e-08, + "loss": 0.0216, + "num_input_tokens_seen": 37758576, + "step": 178915 + }, + { + "epoch": 19.683168316831683, + "grad_norm": 0.07952029258012772, + "learning_rate": 3.823954819706099e-08, + "loss": 0.008, + "num_input_tokens_seen": 37759600, + "step": 178920 + }, + { + "epoch": 19.683718371837184, + "grad_norm": 0.011246702633798122, + "learning_rate": 3.810696695266958e-08, + "loss": 0.0274, + "num_input_tokens_seen": 37760656, + "step": 178925 + }, + { + "epoch": 19.684268426842685, + "grad_norm": 0.0733848586678505, + "learning_rate": 3.797461577006445e-08, + "loss": 0.0069, + "num_input_tokens_seen": 37761680, + "step": 178930 + }, + { + "epoch": 19.684818481848186, + "grad_norm": 0.0641474574804306, + "learning_rate": 3.78424946504724e-08, + "loss": 0.0782, + "num_input_tokens_seen": 37762736, + "step": 178935 + }, + { + "epoch": 19.685368536853684, + "grad_norm": 0.024556169286370277, + "learning_rate": 3.7710603595106365e-08, + "loss": 0.0054, + "num_input_tokens_seen": 37763792, + "step": 178940 + }, + { + "epoch": 19.685918591859185, + "grad_norm": 1.1571284532546997, + "learning_rate": 3.7578942605184795e-08, + "loss": 0.0162, + "num_input_tokens_seen": 37764816, + "step": 178945 + }, + { + "epoch": 19.686468646864686, + "grad_norm": 2.3547096252441406, + "learning_rate": 3.744751168192062e-08, + "loss": 0.0362, + "num_input_tokens_seen": 37765808, + "step": 178950 + }, + { + "epoch": 19.687018701870187, + "grad_norm": 0.020143480971455574, + "learning_rate": 3.7316310826523984e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37766832, + "step": 178955 + }, + { + "epoch": 19.687568756875688, + "grad_norm": 0.04654737189412117, + "learning_rate": 3.718534004020224e-08, + "loss": 0.0337, + "num_input_tokens_seen": 37767856, + "step": 178960 + }, + { + "epoch": 19.68811881188119, + "grad_norm": 0.037299998104572296, + "learning_rate": 3.705459932416833e-08, + "loss": 0.0146, + "num_input_tokens_seen": 37768880, + "step": 178965 + }, + { + "epoch": 19.68866886688669, + "grad_norm": 0.021334156394004822, + "learning_rate": 3.6924088679621274e-08, + "loss": 0.0026, + "num_input_tokens_seen": 37769968, + "step": 178970 + }, + { + "epoch": 19.689218921892188, + "grad_norm": 0.049808893352746964, + "learning_rate": 3.679380810776845e-08, + "loss": 0.0007, + "num_input_tokens_seen": 37770992, + "step": 178975 + }, + { + "epoch": 19.68976897689769, + "grad_norm": 0.035632215440273285, + "learning_rate": 3.666375760980611e-08, + "loss": 0.0198, + "num_input_tokens_seen": 37772048, + "step": 178980 + }, + { + "epoch": 19.69031903190319, + "grad_norm": 0.012427164241671562, + "learning_rate": 3.653393718693887e-08, + "loss": 0.0056, + "num_input_tokens_seen": 37773104, + "step": 178985 + }, + { + "epoch": 19.69086908690869, + "grad_norm": 0.03564491495490074, + "learning_rate": 3.640434684035743e-08, + "loss": 0.0052, + "num_input_tokens_seen": 37774288, + "step": 178990 + }, + { + "epoch": 19.691419141914192, + "grad_norm": 0.01305695902556181, + "learning_rate": 3.627498657125805e-08, + "loss": 0.0041, + "num_input_tokens_seen": 37775344, + "step": 178995 + }, + { + "epoch": 19.691969196919693, + "grad_norm": 0.016457397490739822, + "learning_rate": 3.614585638083423e-08, + "loss": 0.0037, + "num_input_tokens_seen": 37776432, + "step": 179000 + }, + { + "epoch": 19.69251925192519, + "grad_norm": 0.00988658145070076, + "learning_rate": 3.60169562702739e-08, + "loss": 0.1094, + "num_input_tokens_seen": 37777456, + "step": 179005 + }, + { + "epoch": 19.693069306930692, + "grad_norm": 0.13285209238529205, + "learning_rate": 3.588828624076779e-08, + "loss": 0.0892, + "num_input_tokens_seen": 37778512, + "step": 179010 + }, + { + "epoch": 19.693619361936193, + "grad_norm": 2.926854133605957, + "learning_rate": 3.575984629350104e-08, + "loss": 0.0333, + "num_input_tokens_seen": 37779568, + "step": 179015 + }, + { + "epoch": 19.694169416941694, + "grad_norm": 0.07074569910764694, + "learning_rate": 3.5631636429656054e-08, + "loss": 0.0339, + "num_input_tokens_seen": 37780624, + "step": 179020 + }, + { + "epoch": 19.694719471947195, + "grad_norm": 1.2313951253890991, + "learning_rate": 3.550365665041522e-08, + "loss": 0.074, + "num_input_tokens_seen": 37781680, + "step": 179025 + }, + { + "epoch": 19.695269526952696, + "grad_norm": 0.301656574010849, + "learning_rate": 3.537590695695536e-08, + "loss": 0.0547, + "num_input_tokens_seen": 37782768, + "step": 179030 + }, + { + "epoch": 19.695819581958197, + "grad_norm": 0.04251265898346901, + "learning_rate": 3.5248387350461655e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37783792, + "step": 179035 + }, + { + "epoch": 19.696369636963695, + "grad_norm": 0.09369444847106934, + "learning_rate": 3.5121097832099824e-08, + "loss": 0.0564, + "num_input_tokens_seen": 37784816, + "step": 179040 + }, + { + "epoch": 19.696919691969196, + "grad_norm": 0.6695687770843506, + "learning_rate": 3.499403840304949e-08, + "loss": 0.103, + "num_input_tokens_seen": 37785872, + "step": 179045 + }, + { + "epoch": 19.697469746974697, + "grad_norm": 0.024436578154563904, + "learning_rate": 3.486720906448193e-08, + "loss": 0.0007, + "num_input_tokens_seen": 37786896, + "step": 179050 + }, + { + "epoch": 19.698019801980198, + "grad_norm": 0.0041028172709047794, + "learning_rate": 3.474060981756011e-08, + "loss": 0.0009, + "num_input_tokens_seen": 37787984, + "step": 179055 + }, + { + "epoch": 19.6985698569857, + "grad_norm": 0.1080002561211586, + "learning_rate": 3.461424066345531e-08, + "loss": 0.0024, + "num_input_tokens_seen": 37789104, + "step": 179060 + }, + { + "epoch": 19.6991199119912, + "grad_norm": 0.005400661379098892, + "learning_rate": 3.448810160333327e-08, + "loss": 0.0064, + "num_input_tokens_seen": 37790128, + "step": 179065 + }, + { + "epoch": 19.6996699669967, + "grad_norm": 2.861971378326416, + "learning_rate": 3.436219263835416e-08, + "loss": 0.0403, + "num_input_tokens_seen": 37791120, + "step": 179070 + }, + { + "epoch": 19.7002200220022, + "grad_norm": 2.851776123046875, + "learning_rate": 3.423651376967818e-08, + "loss": 0.1158, + "num_input_tokens_seen": 37792176, + "step": 179075 + }, + { + "epoch": 19.7007700770077, + "grad_norm": 0.005636957939714193, + "learning_rate": 3.4111064998465505e-08, + "loss": 0.0058, + "num_input_tokens_seen": 37793328, + "step": 179080 + }, + { + "epoch": 19.7013201320132, + "grad_norm": 0.00669652596116066, + "learning_rate": 3.398584632587076e-08, + "loss": 0.0006, + "num_input_tokens_seen": 37794384, + "step": 179085 + }, + { + "epoch": 19.701870187018702, + "grad_norm": 0.01019300427287817, + "learning_rate": 3.386085775304859e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37795440, + "step": 179090 + }, + { + "epoch": 19.702420242024203, + "grad_norm": 0.03678430989384651, + "learning_rate": 3.373609928115085e-08, + "loss": 0.085, + "num_input_tokens_seen": 37796528, + "step": 179095 + }, + { + "epoch": 19.702970297029704, + "grad_norm": 2.499769449234009, + "learning_rate": 3.3611570911326605e-08, + "loss": 0.1073, + "num_input_tokens_seen": 37797616, + "step": 179100 + }, + { + "epoch": 19.703520352035202, + "grad_norm": 0.017388250678777695, + "learning_rate": 3.348727264472773e-08, + "loss": 0.0039, + "num_input_tokens_seen": 37798672, + "step": 179105 + }, + { + "epoch": 19.704070407040703, + "grad_norm": 0.03835860639810562, + "learning_rate": 3.336320448249219e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37799696, + "step": 179110 + }, + { + "epoch": 19.704620462046204, + "grad_norm": 0.02002350240945816, + "learning_rate": 3.323936642577186e-08, + "loss": 0.0048, + "num_input_tokens_seen": 37800720, + "step": 179115 + }, + { + "epoch": 19.705170517051705, + "grad_norm": 0.05082009360194206, + "learning_rate": 3.311575847570192e-08, + "loss": 0.0151, + "num_input_tokens_seen": 37801776, + "step": 179120 + }, + { + "epoch": 19.705720572057206, + "grad_norm": 0.01322823204100132, + "learning_rate": 3.2992380633423133e-08, + "loss": 0.001, + "num_input_tokens_seen": 37802800, + "step": 179125 + }, + { + "epoch": 19.706270627062707, + "grad_norm": 0.7360939979553223, + "learning_rate": 3.2869232900076264e-08, + "loss": 0.0209, + "num_input_tokens_seen": 37803824, + "step": 179130 + }, + { + "epoch": 19.706820682068205, + "grad_norm": 0.3171471357345581, + "learning_rate": 3.274631527679095e-08, + "loss": 0.0061, + "num_input_tokens_seen": 37804944, + "step": 179135 + }, + { + "epoch": 19.707370737073706, + "grad_norm": 0.05373237654566765, + "learning_rate": 3.26236277647024e-08, + "loss": 0.0008, + "num_input_tokens_seen": 37806000, + "step": 179140 + }, + { + "epoch": 19.707920792079207, + "grad_norm": 0.07674982398748398, + "learning_rate": 3.250117036494027e-08, + "loss": 0.0104, + "num_input_tokens_seen": 37807024, + "step": 179145 + }, + { + "epoch": 19.70847084708471, + "grad_norm": 0.05756397172808647, + "learning_rate": 3.2378943078636984e-08, + "loss": 0.0103, + "num_input_tokens_seen": 37808080, + "step": 179150 + }, + { + "epoch": 19.70902090209021, + "grad_norm": 0.025643404573202133, + "learning_rate": 3.225694590691386e-08, + "loss": 0.0047, + "num_input_tokens_seen": 37809104, + "step": 179155 + }, + { + "epoch": 19.70957095709571, + "grad_norm": 0.005730716045945883, + "learning_rate": 3.213517885090056e-08, + "loss": 0.0037, + "num_input_tokens_seen": 37810224, + "step": 179160 + }, + { + "epoch": 19.71012101210121, + "grad_norm": 0.11441927403211594, + "learning_rate": 3.201364191171285e-08, + "loss": 0.0028, + "num_input_tokens_seen": 37811312, + "step": 179165 + }, + { + "epoch": 19.71067106710671, + "grad_norm": 0.005431522615253925, + "learning_rate": 3.189233509047762e-08, + "loss": 0.0018, + "num_input_tokens_seen": 37812336, + "step": 179170 + }, + { + "epoch": 19.71122112211221, + "grad_norm": 2.044726610183716, + "learning_rate": 3.177125838830786e-08, + "loss": 0.2378, + "num_input_tokens_seen": 37813296, + "step": 179175 + }, + { + "epoch": 19.71177117711771, + "grad_norm": 0.006286622956395149, + "learning_rate": 3.1650411806324885e-08, + "loss": 0.0995, + "num_input_tokens_seen": 37814320, + "step": 179180 + }, + { + "epoch": 19.712321232123212, + "grad_norm": 0.07576561719179153, + "learning_rate": 3.152979534563616e-08, + "loss": 0.0034, + "num_input_tokens_seen": 37815312, + "step": 179185 + }, + { + "epoch": 19.712871287128714, + "grad_norm": 0.031082523986697197, + "learning_rate": 3.140940900735745e-08, + "loss": 0.0558, + "num_input_tokens_seen": 37816336, + "step": 179190 + }, + { + "epoch": 19.713421342134215, + "grad_norm": 0.012635797262191772, + "learning_rate": 3.12892527925962e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37817456, + "step": 179195 + }, + { + "epoch": 19.713971397139716, + "grad_norm": 0.03509390354156494, + "learning_rate": 3.1169326702459866e-08, + "loss": 0.0071, + "num_input_tokens_seen": 37818512, + "step": 179200 + }, + { + "epoch": 19.714521452145213, + "grad_norm": 0.017889810726046562, + "learning_rate": 3.104963073805589e-08, + "loss": 0.0039, + "num_input_tokens_seen": 37819568, + "step": 179205 + }, + { + "epoch": 19.715071507150714, + "grad_norm": 0.09684503078460693, + "learning_rate": 3.093016490048617e-08, + "loss": 0.0088, + "num_input_tokens_seen": 37820624, + "step": 179210 + }, + { + "epoch": 19.715621562156215, + "grad_norm": 0.10409144312143326, + "learning_rate": 3.08109291908526e-08, + "loss": 0.0035, + "num_input_tokens_seen": 37821616, + "step": 179215 + }, + { + "epoch": 19.716171617161717, + "grad_norm": 0.008326354436576366, + "learning_rate": 3.069192361025153e-08, + "loss": 0.0042, + "num_input_tokens_seen": 37822672, + "step": 179220 + }, + { + "epoch": 19.716721672167218, + "grad_norm": 0.0615023672580719, + "learning_rate": 3.057314815978207e-08, + "loss": 0.0052, + "num_input_tokens_seen": 37823696, + "step": 179225 + }, + { + "epoch": 19.71727172717272, + "grad_norm": 0.9600670337677002, + "learning_rate": 3.045460284054058e-08, + "loss": 0.0631, + "num_input_tokens_seen": 37824784, + "step": 179230 + }, + { + "epoch": 19.717821782178216, + "grad_norm": 0.004725687671452761, + "learning_rate": 3.033628765361507e-08, + "loss": 0.0478, + "num_input_tokens_seen": 37825904, + "step": 179235 + }, + { + "epoch": 19.718371837183717, + "grad_norm": 0.02503589354455471, + "learning_rate": 3.021820260009911e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37826928, + "step": 179240 + }, + { + "epoch": 19.71892189218922, + "grad_norm": 0.44602566957473755, + "learning_rate": 3.010034768108072e-08, + "loss": 0.0035, + "num_input_tokens_seen": 37827952, + "step": 179245 + }, + { + "epoch": 19.71947194719472, + "grad_norm": 2.4759137630462646, + "learning_rate": 2.9982722897645146e-08, + "loss": 0.1217, + "num_input_tokens_seen": 37829008, + "step": 179250 + }, + { + "epoch": 19.72002200220022, + "grad_norm": 0.29200437664985657, + "learning_rate": 2.986532825087762e-08, + "loss": 0.0061, + "num_input_tokens_seen": 37830064, + "step": 179255 + }, + { + "epoch": 19.72057205720572, + "grad_norm": 0.0316532738506794, + "learning_rate": 2.9748163741857847e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37831088, + "step": 179260 + }, + { + "epoch": 19.721122112211223, + "grad_norm": 0.05831180140376091, + "learning_rate": 2.963122937166829e-08, + "loss": 0.1182, + "num_input_tokens_seen": 37832112, + "step": 179265 + }, + { + "epoch": 19.72167216721672, + "grad_norm": 0.16505877673625946, + "learning_rate": 2.9514525141385863e-08, + "loss": 0.0069, + "num_input_tokens_seen": 37833200, + "step": 179270 + }, + { + "epoch": 19.72222222222222, + "grad_norm": 0.06751678884029388, + "learning_rate": 2.9398051052087483e-08, + "loss": 0.0013, + "num_input_tokens_seen": 37834192, + "step": 179275 + }, + { + "epoch": 19.722772277227723, + "grad_norm": 0.05658567324280739, + "learning_rate": 2.9281807104844516e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37835248, + "step": 179280 + }, + { + "epoch": 19.723322332233224, + "grad_norm": 0.13076123595237732, + "learning_rate": 2.916579330072833e-08, + "loss": 0.0025, + "num_input_tokens_seen": 37836240, + "step": 179285 + }, + { + "epoch": 19.723872387238725, + "grad_norm": 0.011375010944902897, + "learning_rate": 2.9050009640807508e-08, + "loss": 0.002, + "num_input_tokens_seen": 37837296, + "step": 179290 + }, + { + "epoch": 19.724422442244226, + "grad_norm": 0.01821102574467659, + "learning_rate": 2.893445612615342e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37838320, + "step": 179295 + }, + { + "epoch": 19.724972497249723, + "grad_norm": 0.02210153639316559, + "learning_rate": 2.881913275782633e-08, + "loss": 0.0076, + "num_input_tokens_seen": 37839376, + "step": 179300 + }, + { + "epoch": 19.725522552255224, + "grad_norm": 0.004859906621277332, + "learning_rate": 2.8704039536892048e-08, + "loss": 0.0032, + "num_input_tokens_seen": 37840400, + "step": 179305 + }, + { + "epoch": 19.726072607260726, + "grad_norm": 0.11367182433605194, + "learning_rate": 2.8589176464408063e-08, + "loss": 0.0037, + "num_input_tokens_seen": 37841552, + "step": 179310 + }, + { + "epoch": 19.726622662266227, + "grad_norm": 0.05428473278880119, + "learning_rate": 2.8474543541437415e-08, + "loss": 0.002, + "num_input_tokens_seen": 37842608, + "step": 179315 + }, + { + "epoch": 19.727172717271728, + "grad_norm": 0.008672867901623249, + "learning_rate": 2.8360140769034814e-08, + "loss": 0.1956, + "num_input_tokens_seen": 37843600, + "step": 179320 + }, + { + "epoch": 19.72772277227723, + "grad_norm": 0.053466688841581345, + "learning_rate": 2.82459681482522e-08, + "loss": 0.0012, + "num_input_tokens_seen": 37844688, + "step": 179325 + }, + { + "epoch": 19.72827282728273, + "grad_norm": 0.5662920475006104, + "learning_rate": 2.813202568014428e-08, + "loss": 0.0083, + "num_input_tokens_seen": 37845680, + "step": 179330 + }, + { + "epoch": 19.728822882288227, + "grad_norm": 0.010009030811488628, + "learning_rate": 2.8018313365762995e-08, + "loss": 0.004, + "num_input_tokens_seen": 37846768, + "step": 179335 + }, + { + "epoch": 19.72937293729373, + "grad_norm": 0.02083549089729786, + "learning_rate": 2.790483120615195e-08, + "loss": 0.0178, + "num_input_tokens_seen": 37847760, + "step": 179340 + }, + { + "epoch": 19.72992299229923, + "grad_norm": 0.0410335510969162, + "learning_rate": 2.7791579202360307e-08, + "loss": 0.1114, + "num_input_tokens_seen": 37848848, + "step": 179345 + }, + { + "epoch": 19.73047304730473, + "grad_norm": 0.029680727049708366, + "learning_rate": 2.7678557355428902e-08, + "loss": 0.0009, + "num_input_tokens_seen": 37849904, + "step": 179350 + }, + { + "epoch": 19.731023102310232, + "grad_norm": 0.07601051777601242, + "learning_rate": 2.756576566640412e-08, + "loss": 0.0348, + "num_input_tokens_seen": 37850960, + "step": 179355 + }, + { + "epoch": 19.731573157315733, + "grad_norm": 0.008373110555112362, + "learning_rate": 2.745320413632124e-08, + "loss": 0.0004, + "num_input_tokens_seen": 37852080, + "step": 179360 + }, + { + "epoch": 19.73212321232123, + "grad_norm": 0.0024786326102912426, + "learning_rate": 2.734087276621833e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37853136, + "step": 179365 + }, + { + "epoch": 19.73267326732673, + "grad_norm": 0.043659843504428864, + "learning_rate": 2.7228771557133438e-08, + "loss": 0.0521, + "num_input_tokens_seen": 37854192, + "step": 179370 + }, + { + "epoch": 19.733223322332233, + "grad_norm": 0.014000811614096165, + "learning_rate": 2.71169005100963e-08, + "loss": 0.001, + "num_input_tokens_seen": 37855280, + "step": 179375 + }, + { + "epoch": 19.733773377337734, + "grad_norm": 0.012887997552752495, + "learning_rate": 2.7005259626139423e-08, + "loss": 0.0007, + "num_input_tokens_seen": 37856368, + "step": 179380 + }, + { + "epoch": 19.734323432343235, + "grad_norm": 0.026592804118990898, + "learning_rate": 2.689384890629254e-08, + "loss": 0.1157, + "num_input_tokens_seen": 37857456, + "step": 179385 + }, + { + "epoch": 19.734873487348736, + "grad_norm": 0.037323370575904846, + "learning_rate": 2.6782668351582607e-08, + "loss": 0.0047, + "num_input_tokens_seen": 37858480, + "step": 179390 + }, + { + "epoch": 19.735423542354237, + "grad_norm": 0.06456385552883148, + "learning_rate": 2.6671717963033803e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37859472, + "step": 179395 + }, + { + "epoch": 19.735973597359735, + "grad_norm": 2.662351369857788, + "learning_rate": 2.6560997741667536e-08, + "loss": 0.0213, + "num_input_tokens_seen": 37860560, + "step": 179400 + }, + { + "epoch": 19.736523652365236, + "grad_norm": 0.02329031378030777, + "learning_rate": 2.645050768850521e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37861680, + "step": 179405 + }, + { + "epoch": 19.737073707370737, + "grad_norm": 2.3283092975616455, + "learning_rate": 2.6340247804565454e-08, + "loss": 0.1676, + "num_input_tokens_seen": 37862704, + "step": 179410 + }, + { + "epoch": 19.737623762376238, + "grad_norm": 0.022576672956347466, + "learning_rate": 2.6230218090861347e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37863728, + "step": 179415 + }, + { + "epoch": 19.73817381738174, + "grad_norm": 0.33758893609046936, + "learning_rate": 2.6120418548414293e-08, + "loss": 0.0019, + "num_input_tokens_seen": 37864816, + "step": 179420 + }, + { + "epoch": 19.73872387238724, + "grad_norm": 0.036117374897003174, + "learning_rate": 2.6010849178229045e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37865872, + "step": 179425 + }, + { + "epoch": 19.739273927392738, + "grad_norm": 0.1143353059887886, + "learning_rate": 2.5901509981318684e-08, + "loss": 0.0113, + "num_input_tokens_seen": 37866960, + "step": 179430 + }, + { + "epoch": 19.73982398239824, + "grad_norm": 0.16650629043579102, + "learning_rate": 2.5792400958687956e-08, + "loss": 0.0046, + "num_input_tokens_seen": 37867952, + "step": 179435 + }, + { + "epoch": 19.74037403740374, + "grad_norm": 2.2242329120635986, + "learning_rate": 2.568352211134717e-08, + "loss": 0.0667, + "num_input_tokens_seen": 37869040, + "step": 179440 + }, + { + "epoch": 19.74092409240924, + "grad_norm": 0.011929018422961235, + "learning_rate": 2.5574873440298296e-08, + "loss": 0.1301, + "num_input_tokens_seen": 37870064, + "step": 179445 + }, + { + "epoch": 19.741474147414742, + "grad_norm": 0.032311126589775085, + "learning_rate": 2.5466454946540542e-08, + "loss": 0.0081, + "num_input_tokens_seen": 37871056, + "step": 179450 + }, + { + "epoch": 19.742024202420243, + "grad_norm": 0.09854944050312042, + "learning_rate": 2.535826663107588e-08, + "loss": 0.0033, + "num_input_tokens_seen": 37872048, + "step": 179455 + }, + { + "epoch": 19.742574257425744, + "grad_norm": 0.43536433577537537, + "learning_rate": 2.525030849490073e-08, + "loss": 0.0365, + "num_input_tokens_seen": 37873136, + "step": 179460 + }, + { + "epoch": 19.74312431243124, + "grad_norm": 2.6233904361724854, + "learning_rate": 2.514258053900598e-08, + "loss": 0.1295, + "num_input_tokens_seen": 37874256, + "step": 179465 + }, + { + "epoch": 19.743674367436743, + "grad_norm": 0.23016709089279175, + "learning_rate": 2.5035082764390816e-08, + "loss": 0.0049, + "num_input_tokens_seen": 37875344, + "step": 179470 + }, + { + "epoch": 19.744224422442244, + "grad_norm": 0.02586381882429123, + "learning_rate": 2.4927815172040568e-08, + "loss": 0.0298, + "num_input_tokens_seen": 37876432, + "step": 179475 + }, + { + "epoch": 19.744774477447745, + "grad_norm": 0.12765108048915863, + "learning_rate": 2.482077776294889e-08, + "loss": 0.0093, + "num_input_tokens_seen": 37877520, + "step": 179480 + }, + { + "epoch": 19.745324532453246, + "grad_norm": 0.1264466494321823, + "learning_rate": 2.471397053809832e-08, + "loss": 0.0137, + "num_input_tokens_seen": 37878544, + "step": 179485 + }, + { + "epoch": 19.745874587458747, + "grad_norm": 0.011747641488909721, + "learning_rate": 2.460739349847696e-08, + "loss": 0.0025, + "num_input_tokens_seen": 37879536, + "step": 179490 + }, + { + "epoch": 19.746424642464248, + "grad_norm": 0.0604001060128212, + "learning_rate": 2.4501046645064585e-08, + "loss": 0.0016, + "num_input_tokens_seen": 37880592, + "step": 179495 + }, + { + "epoch": 19.746974697469746, + "grad_norm": 0.016708992421627045, + "learning_rate": 2.4394929978838187e-08, + "loss": 0.0917, + "num_input_tokens_seen": 37881616, + "step": 179500 + }, + { + "epoch": 19.747524752475247, + "grad_norm": 0.007857812568545341, + "learning_rate": 2.4289043500783094e-08, + "loss": 0.0503, + "num_input_tokens_seen": 37882704, + "step": 179505 + }, + { + "epoch": 19.748074807480748, + "grad_norm": 0.10839704424142838, + "learning_rate": 2.4183387211870746e-08, + "loss": 0.019, + "num_input_tokens_seen": 37883824, + "step": 179510 + }, + { + "epoch": 19.74862486248625, + "grad_norm": 0.07944491505622864, + "learning_rate": 2.407796111307259e-08, + "loss": 0.0034, + "num_input_tokens_seen": 37884816, + "step": 179515 + }, + { + "epoch": 19.74917491749175, + "grad_norm": 0.20116403698921204, + "learning_rate": 2.3972765205365623e-08, + "loss": 0.1256, + "num_input_tokens_seen": 37885872, + "step": 179520 + }, + { + "epoch": 19.74972497249725, + "grad_norm": 0.0036992160603404045, + "learning_rate": 2.3867799489718512e-08, + "loss": 0.0363, + "num_input_tokens_seen": 37886928, + "step": 179525 + }, + { + "epoch": 19.75027502750275, + "grad_norm": 0.01923498883843422, + "learning_rate": 2.3763063967094378e-08, + "loss": 0.0014, + "num_input_tokens_seen": 37888016, + "step": 179530 + }, + { + "epoch": 19.75082508250825, + "grad_norm": 2.2541520595550537, + "learning_rate": 2.3658558638461892e-08, + "loss": 0.1239, + "num_input_tokens_seen": 37889104, + "step": 179535 + }, + { + "epoch": 19.75137513751375, + "grad_norm": 0.04965491592884064, + "learning_rate": 2.355428350478417e-08, + "loss": 0.0012, + "num_input_tokens_seen": 37890160, + "step": 179540 + }, + { + "epoch": 19.751925192519252, + "grad_norm": 0.03923078253865242, + "learning_rate": 2.3450238567021555e-08, + "loss": 0.0024, + "num_input_tokens_seen": 37891216, + "step": 179545 + }, + { + "epoch": 19.752475247524753, + "grad_norm": 0.03372185304760933, + "learning_rate": 2.334642382613439e-08, + "loss": 0.0012, + "num_input_tokens_seen": 37892272, + "step": 179550 + }, + { + "epoch": 19.753025302530254, + "grad_norm": 0.5958120822906494, + "learning_rate": 2.3242839283077468e-08, + "loss": 0.0189, + "num_input_tokens_seen": 37893328, + "step": 179555 + }, + { + "epoch": 19.753575357535752, + "grad_norm": 2.7330245971679688, + "learning_rate": 2.3139484938805577e-08, + "loss": 0.0123, + "num_input_tokens_seen": 37894416, + "step": 179560 + }, + { + "epoch": 19.754125412541253, + "grad_norm": 0.22486859560012817, + "learning_rate": 2.303636079427074e-08, + "loss": 0.012, + "num_input_tokens_seen": 37895440, + "step": 179565 + }, + { + "epoch": 19.754675467546754, + "grad_norm": 0.032820023596286774, + "learning_rate": 2.2933466850427743e-08, + "loss": 0.0014, + "num_input_tokens_seen": 37896560, + "step": 179570 + }, + { + "epoch": 19.755225522552255, + "grad_norm": 0.011794880032539368, + "learning_rate": 2.283080310822028e-08, + "loss": 0.1318, + "num_input_tokens_seen": 37897616, + "step": 179575 + }, + { + "epoch": 19.755775577557756, + "grad_norm": 0.011087356135249138, + "learning_rate": 2.2728369568597586e-08, + "loss": 0.0005, + "num_input_tokens_seen": 37898672, + "step": 179580 + }, + { + "epoch": 19.756325632563257, + "grad_norm": 0.07121706753969193, + "learning_rate": 2.262616623250058e-08, + "loss": 0.0035, + "num_input_tokens_seen": 37899696, + "step": 179585 + }, + { + "epoch": 19.75687568756876, + "grad_norm": 0.17229989171028137, + "learning_rate": 2.2524193100872948e-08, + "loss": 0.0492, + "num_input_tokens_seen": 37900688, + "step": 179590 + }, + { + "epoch": 19.757425742574256, + "grad_norm": 0.01336115412414074, + "learning_rate": 2.2422450174655608e-08, + "loss": 0.0065, + "num_input_tokens_seen": 37901776, + "step": 179595 + }, + { + "epoch": 19.757975797579757, + "grad_norm": 0.8593603372573853, + "learning_rate": 2.232093745478392e-08, + "loss": 0.0075, + "num_input_tokens_seen": 37902800, + "step": 179600 + }, + { + "epoch": 19.758525852585258, + "grad_norm": 0.004581652581691742, + "learning_rate": 2.2219654942193246e-08, + "loss": 0.067, + "num_input_tokens_seen": 37903824, + "step": 179605 + }, + { + "epoch": 19.75907590759076, + "grad_norm": 0.0172809399664402, + "learning_rate": 2.211860263782173e-08, + "loss": 0.001, + "num_input_tokens_seen": 37904848, + "step": 179610 + }, + { + "epoch": 19.75962596259626, + "grad_norm": 0.08160874992609024, + "learning_rate": 2.20177805425964e-08, + "loss": 0.0028, + "num_input_tokens_seen": 37905968, + "step": 179615 + }, + { + "epoch": 19.76017601760176, + "grad_norm": 0.010102435015141964, + "learning_rate": 2.1917188657447076e-08, + "loss": 0.0044, + "num_input_tokens_seen": 37907088, + "step": 179620 + }, + { + "epoch": 19.760726072607262, + "grad_norm": 0.07219693064689636, + "learning_rate": 2.1816826983300786e-08, + "loss": 0.0225, + "num_input_tokens_seen": 37908208, + "step": 179625 + }, + { + "epoch": 19.76127612761276, + "grad_norm": 0.03218340501189232, + "learning_rate": 2.1716695521084575e-08, + "loss": 0.0188, + "num_input_tokens_seen": 37909328, + "step": 179630 + }, + { + "epoch": 19.76182618261826, + "grad_norm": 5.122978687286377, + "learning_rate": 2.1616794271717143e-08, + "loss": 0.0873, + "num_input_tokens_seen": 37910320, + "step": 179635 + }, + { + "epoch": 19.762376237623762, + "grad_norm": 0.0153534896671772, + "learning_rate": 2.1517123236125535e-08, + "loss": 0.0028, + "num_input_tokens_seen": 37911440, + "step": 179640 + }, + { + "epoch": 19.762926292629263, + "grad_norm": 0.004871939774602652, + "learning_rate": 2.1417682415222908e-08, + "loss": 0.0041, + "num_input_tokens_seen": 37912464, + "step": 179645 + }, + { + "epoch": 19.763476347634764, + "grad_norm": 0.01411815918982029, + "learning_rate": 2.1318471809927965e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37913488, + "step": 179650 + }, + { + "epoch": 19.764026402640265, + "grad_norm": 1.578667402267456, + "learning_rate": 2.121949142115387e-08, + "loss": 0.1127, + "num_input_tokens_seen": 37914576, + "step": 179655 + }, + { + "epoch": 19.764576457645763, + "grad_norm": 1.2766519784927368, + "learning_rate": 2.1120741249816557e-08, + "loss": 0.0134, + "num_input_tokens_seen": 37915696, + "step": 179660 + }, + { + "epoch": 19.765126512651264, + "grad_norm": 0.07965697348117828, + "learning_rate": 2.1022221296820854e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37916784, + "step": 179665 + }, + { + "epoch": 19.765676567656765, + "grad_norm": 0.32694384455680847, + "learning_rate": 2.092393156307715e-08, + "loss": 0.0733, + "num_input_tokens_seen": 37917808, + "step": 179670 + }, + { + "epoch": 19.766226622662266, + "grad_norm": 0.05681495741009712, + "learning_rate": 2.082587204949027e-08, + "loss": 0.0127, + "num_input_tokens_seen": 37918832, + "step": 179675 + }, + { + "epoch": 19.766776677667767, + "grad_norm": 0.024504832923412323, + "learning_rate": 2.0728042756967824e-08, + "loss": 0.0021, + "num_input_tokens_seen": 37919792, + "step": 179680 + }, + { + "epoch": 19.76732673267327, + "grad_norm": 0.019348936155438423, + "learning_rate": 2.0630443686409096e-08, + "loss": 0.0127, + "num_input_tokens_seen": 37920816, + "step": 179685 + }, + { + "epoch": 19.76787678767877, + "grad_norm": 0.24358290433883667, + "learning_rate": 2.0533074838710587e-08, + "loss": 0.0044, + "num_input_tokens_seen": 37921840, + "step": 179690 + }, + { + "epoch": 19.768426842684267, + "grad_norm": 0.002198510803282261, + "learning_rate": 2.0435936214774354e-08, + "loss": 0.0006, + "num_input_tokens_seen": 37922864, + "step": 179695 + }, + { + "epoch": 19.768976897689768, + "grad_norm": 0.06704887002706528, + "learning_rate": 2.033902781549413e-08, + "loss": 0.016, + "num_input_tokens_seen": 37923824, + "step": 179700 + }, + { + "epoch": 19.76952695269527, + "grad_norm": 0.016214601695537567, + "learning_rate": 2.024234964176086e-08, + "loss": 0.1027, + "num_input_tokens_seen": 37924944, + "step": 179705 + }, + { + "epoch": 19.77007700770077, + "grad_norm": 0.0635451227426529, + "learning_rate": 2.0145901694468283e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37925968, + "step": 179710 + }, + { + "epoch": 19.77062706270627, + "grad_norm": 0.18800000846385956, + "learning_rate": 2.0049683974504572e-08, + "loss": 0.0448, + "num_input_tokens_seen": 37926992, + "step": 179715 + }, + { + "epoch": 19.771177117711773, + "grad_norm": 0.013897699303925037, + "learning_rate": 1.9953696482755136e-08, + "loss": 0.0005, + "num_input_tokens_seen": 37928080, + "step": 179720 + }, + { + "epoch": 19.77172717271727, + "grad_norm": 0.09094668179750443, + "learning_rate": 1.9857939220108147e-08, + "loss": 0.0373, + "num_input_tokens_seen": 37929104, + "step": 179725 + }, + { + "epoch": 19.77227722772277, + "grad_norm": 0.06621890515089035, + "learning_rate": 1.976241218744346e-08, + "loss": 0.0018, + "num_input_tokens_seen": 37930128, + "step": 179730 + }, + { + "epoch": 19.772827282728272, + "grad_norm": 0.25381869077682495, + "learning_rate": 1.9667115385640923e-08, + "loss": 0.0721, + "num_input_tokens_seen": 37931152, + "step": 179735 + }, + { + "epoch": 19.773377337733773, + "grad_norm": 0.004798788111656904, + "learning_rate": 1.957204881558039e-08, + "loss": 0.0535, + "num_input_tokens_seen": 37932304, + "step": 179740 + }, + { + "epoch": 19.773927392739274, + "grad_norm": 0.19833214581012726, + "learning_rate": 1.9477212478136163e-08, + "loss": 0.0626, + "num_input_tokens_seen": 37933360, + "step": 179745 + }, + { + "epoch": 19.774477447744776, + "grad_norm": 0.05534818395972252, + "learning_rate": 1.938260637418532e-08, + "loss": 0.0073, + "num_input_tokens_seen": 37934416, + "step": 179750 + }, + { + "epoch": 19.775027502750277, + "grad_norm": 0.07900843769311905, + "learning_rate": 1.9288230504596606e-08, + "loss": 0.0022, + "num_input_tokens_seen": 37935504, + "step": 179755 + }, + { + "epoch": 19.775577557755774, + "grad_norm": 0.010856869630515575, + "learning_rate": 1.9194084870244323e-08, + "loss": 0.0033, + "num_input_tokens_seen": 37936528, + "step": 179760 + }, + { + "epoch": 19.776127612761275, + "grad_norm": 3.52032470703125, + "learning_rate": 1.9100169471988893e-08, + "loss": 0.0855, + "num_input_tokens_seen": 37937584, + "step": 179765 + }, + { + "epoch": 19.776677667766776, + "grad_norm": 0.020256489515304565, + "learning_rate": 1.900648431070462e-08, + "loss": 0.0042, + "num_input_tokens_seen": 37938672, + "step": 179770 + }, + { + "epoch": 19.777227722772277, + "grad_norm": 2.2078194618225098, + "learning_rate": 1.8913029387246372e-08, + "loss": 0.0433, + "num_input_tokens_seen": 37939760, + "step": 179775 + }, + { + "epoch": 19.77777777777778, + "grad_norm": 0.03684943541884422, + "learning_rate": 1.8819804702482903e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37940848, + "step": 179780 + }, + { + "epoch": 19.77832783278328, + "grad_norm": 0.08680517971515656, + "learning_rate": 1.8726810257269077e-08, + "loss": 0.0299, + "num_input_tokens_seen": 37941808, + "step": 179785 + }, + { + "epoch": 19.778877887788777, + "grad_norm": 0.1773860603570938, + "learning_rate": 1.8634046052462552e-08, + "loss": 0.0107, + "num_input_tokens_seen": 37942864, + "step": 179790 + }, + { + "epoch": 19.77942794279428, + "grad_norm": 2.4873950481414795, + "learning_rate": 1.854151208891819e-08, + "loss": 0.0884, + "num_input_tokens_seen": 37943888, + "step": 179795 + }, + { + "epoch": 19.77997799779978, + "grad_norm": 0.04168706014752388, + "learning_rate": 1.8449208367490868e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37944944, + "step": 179800 + }, + { + "epoch": 19.78052805280528, + "grad_norm": 0.31814950704574585, + "learning_rate": 1.8357134889029903e-08, + "loss": 0.0563, + "num_input_tokens_seen": 37946032, + "step": 179805 + }, + { + "epoch": 19.78107810781078, + "grad_norm": 0.05528128892183304, + "learning_rate": 1.826529165438462e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37947088, + "step": 179810 + }, + { + "epoch": 19.781628162816283, + "grad_norm": 1.4433326721191406, + "learning_rate": 1.8173678664398785e-08, + "loss": 0.1108, + "num_input_tokens_seen": 37948112, + "step": 179815 + }, + { + "epoch": 19.782178217821784, + "grad_norm": 0.027098601683974266, + "learning_rate": 1.8082295919918945e-08, + "loss": 0.042, + "num_input_tokens_seen": 37949232, + "step": 179820 + }, + { + "epoch": 19.78272827282728, + "grad_norm": 0.028213759884238243, + "learning_rate": 1.799114342178887e-08, + "loss": 0.0018, + "num_input_tokens_seen": 37950320, + "step": 179825 + }, + { + "epoch": 19.783278327832782, + "grad_norm": 0.0844220370054245, + "learning_rate": 1.7900221170844e-08, + "loss": 0.0015, + "num_input_tokens_seen": 37951344, + "step": 179830 + }, + { + "epoch": 19.783828382838283, + "grad_norm": 0.02240915596485138, + "learning_rate": 1.7809529167928107e-08, + "loss": 0.0033, + "num_input_tokens_seen": 37952432, + "step": 179835 + }, + { + "epoch": 19.784378437843785, + "grad_norm": 0.010059060528874397, + "learning_rate": 1.771906741387108e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37953456, + "step": 179840 + }, + { + "epoch": 19.784928492849286, + "grad_norm": 0.04260673746466637, + "learning_rate": 1.7628835909513918e-08, + "loss": 0.0018, + "num_input_tokens_seen": 37954448, + "step": 179845 + }, + { + "epoch": 19.785478547854787, + "grad_norm": 0.027053112164139748, + "learning_rate": 1.7538834655680957e-08, + "loss": 0.0024, + "num_input_tokens_seen": 37955472, + "step": 179850 + }, + { + "epoch": 19.786028602860284, + "grad_norm": 0.027670763432979584, + "learning_rate": 1.7449063653204866e-08, + "loss": 0.047, + "num_input_tokens_seen": 37956528, + "step": 179855 + }, + { + "epoch": 19.786578657865785, + "grad_norm": 0.12733225524425507, + "learning_rate": 1.7359522902915537e-08, + "loss": 0.0035, + "num_input_tokens_seen": 37957520, + "step": 179860 + }, + { + "epoch": 19.787128712871286, + "grad_norm": 0.02099602483212948, + "learning_rate": 1.7270212405631757e-08, + "loss": 0.0266, + "num_input_tokens_seen": 37958544, + "step": 179865 + }, + { + "epoch": 19.787678767876788, + "grad_norm": 0.002703118836507201, + "learning_rate": 1.7181132162183423e-08, + "loss": 0.1092, + "num_input_tokens_seen": 37959568, + "step": 179870 + }, + { + "epoch": 19.78822882288229, + "grad_norm": 0.015591317787766457, + "learning_rate": 1.709228217338654e-08, + "loss": 0.0305, + "num_input_tokens_seen": 37960624, + "step": 179875 + }, + { + "epoch": 19.78877887788779, + "grad_norm": 0.030151743441820145, + "learning_rate": 1.700366244006546e-08, + "loss": 0.0108, + "num_input_tokens_seen": 37961744, + "step": 179880 + }, + { + "epoch": 19.78932893289329, + "grad_norm": 2.7571656703948975, + "learning_rate": 1.691527296303064e-08, + "loss": 0.0465, + "num_input_tokens_seen": 37962832, + "step": 179885 + }, + { + "epoch": 19.78987898789879, + "grad_norm": 0.06064748018980026, + "learning_rate": 1.6827113743100865e-08, + "loss": 0.0051, + "num_input_tokens_seen": 37963856, + "step": 179890 + }, + { + "epoch": 19.79042904290429, + "grad_norm": 0.018899116665124893, + "learning_rate": 1.6739184781086603e-08, + "loss": 0.0008, + "num_input_tokens_seen": 37964880, + "step": 179895 + }, + { + "epoch": 19.79097909790979, + "grad_norm": 0.11368705332279205, + "learning_rate": 1.6651486077801092e-08, + "loss": 0.0463, + "num_input_tokens_seen": 37965904, + "step": 179900 + }, + { + "epoch": 19.79152915291529, + "grad_norm": 0.01708352565765381, + "learning_rate": 1.6564017634049246e-08, + "loss": 0.0862, + "num_input_tokens_seen": 37966928, + "step": 179905 + }, + { + "epoch": 19.792079207920793, + "grad_norm": 1.4804130792617798, + "learning_rate": 1.6476779450638745e-08, + "loss": 0.0786, + "num_input_tokens_seen": 37968016, + "step": 179910 + }, + { + "epoch": 19.792629262926294, + "grad_norm": 0.007942795753479004, + "learning_rate": 1.6389771528371734e-08, + "loss": 0.0026, + "num_input_tokens_seen": 37969072, + "step": 179915 + }, + { + "epoch": 19.793179317931795, + "grad_norm": 0.14064419269561768, + "learning_rate": 1.6302993868055893e-08, + "loss": 0.0647, + "num_input_tokens_seen": 37970128, + "step": 179920 + }, + { + "epoch": 19.793729372937293, + "grad_norm": 0.014846527948975563, + "learning_rate": 1.621644647048226e-08, + "loss": 0.0025, + "num_input_tokens_seen": 37971184, + "step": 179925 + }, + { + "epoch": 19.794279427942794, + "grad_norm": 0.08068644255399704, + "learning_rate": 1.6130129336455745e-08, + "loss": 0.0063, + "num_input_tokens_seen": 37972208, + "step": 179930 + }, + { + "epoch": 19.794829482948295, + "grad_norm": 0.12096918374300003, + "learning_rate": 1.604404246677016e-08, + "loss": 0.0035, + "num_input_tokens_seen": 37973296, + "step": 179935 + }, + { + "epoch": 19.795379537953796, + "grad_norm": 0.050822366029024124, + "learning_rate": 1.595818586221931e-08, + "loss": 0.0351, + "num_input_tokens_seen": 37974320, + "step": 179940 + }, + { + "epoch": 19.795929592959297, + "grad_norm": 0.040690500289201736, + "learning_rate": 1.587255952359146e-08, + "loss": 0.0018, + "num_input_tokens_seen": 37975408, + "step": 179945 + }, + { + "epoch": 19.796479647964798, + "grad_norm": 0.022905899211764336, + "learning_rate": 1.5787163451677632e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37976464, + "step": 179950 + }, + { + "epoch": 19.797029702970296, + "grad_norm": 0.014340915717184544, + "learning_rate": 1.5701997647263323e-08, + "loss": 0.017, + "num_input_tokens_seen": 37977520, + "step": 179955 + }, + { + "epoch": 19.797579757975797, + "grad_norm": 3.8791587352752686, + "learning_rate": 1.561706211113678e-08, + "loss": 0.103, + "num_input_tokens_seen": 37978576, + "step": 179960 + }, + { + "epoch": 19.798129812981298, + "grad_norm": 2.9607536792755127, + "learning_rate": 1.553235684408072e-08, + "loss": 0.0697, + "num_input_tokens_seen": 37979664, + "step": 179965 + }, + { + "epoch": 19.7986798679868, + "grad_norm": 2.2552988529205322, + "learning_rate": 1.544788184687229e-08, + "loss": 0.0496, + "num_input_tokens_seen": 37980784, + "step": 179970 + }, + { + "epoch": 19.7992299229923, + "grad_norm": 0.07405722141265869, + "learning_rate": 1.5363637120291428e-08, + "loss": 0.0043, + "num_input_tokens_seen": 37981904, + "step": 179975 + }, + { + "epoch": 19.7997799779978, + "grad_norm": 2.39534068107605, + "learning_rate": 1.527962266511529e-08, + "loss": 0.1661, + "num_input_tokens_seen": 37982960, + "step": 179980 + }, + { + "epoch": 19.8003300330033, + "grad_norm": 0.051106177270412445, + "learning_rate": 1.5195838482118253e-08, + "loss": 0.0029, + "num_input_tokens_seen": 37983952, + "step": 179985 + }, + { + "epoch": 19.8008800880088, + "grad_norm": 0.027034500613808632, + "learning_rate": 1.5112284572074698e-08, + "loss": 0.0326, + "num_input_tokens_seen": 37985040, + "step": 179990 + }, + { + "epoch": 19.8014301430143, + "grad_norm": 0.25105413794517517, + "learning_rate": 1.5028960935747905e-08, + "loss": 0.0064, + "num_input_tokens_seen": 37986096, + "step": 179995 + }, + { + "epoch": 19.801980198019802, + "grad_norm": 4.037617206573486, + "learning_rate": 1.494586757391503e-08, + "loss": 0.0196, + "num_input_tokens_seen": 37987056, + "step": 180000 + }, + { + "epoch": 19.802530253025303, + "grad_norm": 0.0038853895384818316, + "learning_rate": 1.4863004487333799e-08, + "loss": 0.002, + "num_input_tokens_seen": 37988080, + "step": 180005 + }, + { + "epoch": 19.803080308030804, + "grad_norm": 0.07362543791532516, + "learning_rate": 1.4780371676773041e-08, + "loss": 0.0017, + "num_input_tokens_seen": 37989168, + "step": 180010 + }, + { + "epoch": 19.803630363036305, + "grad_norm": 0.10748562961816788, + "learning_rate": 1.4697969142990486e-08, + "loss": 0.005, + "num_input_tokens_seen": 37990192, + "step": 180015 + }, + { + "epoch": 19.804180418041803, + "grad_norm": 0.05360531061887741, + "learning_rate": 1.4615796886749412e-08, + "loss": 0.0748, + "num_input_tokens_seen": 37991248, + "step": 180020 + }, + { + "epoch": 19.804730473047304, + "grad_norm": 0.40763774514198303, + "learning_rate": 1.4533854908804767e-08, + "loss": 0.0204, + "num_input_tokens_seen": 37992336, + "step": 180025 + }, + { + "epoch": 19.805280528052805, + "grad_norm": 0.012869061902165413, + "learning_rate": 1.4452143209911507e-08, + "loss": 0.0602, + "num_input_tokens_seen": 37993424, + "step": 180030 + }, + { + "epoch": 19.805830583058306, + "grad_norm": 0.018829409033060074, + "learning_rate": 1.437066179082458e-08, + "loss": 0.0011, + "num_input_tokens_seen": 37994480, + "step": 180035 + }, + { + "epoch": 19.806380638063807, + "grad_norm": 0.006607601419091225, + "learning_rate": 1.4289410652293389e-08, + "loss": 0.0051, + "num_input_tokens_seen": 37995536, + "step": 180040 + }, + { + "epoch": 19.806930693069308, + "grad_norm": 2.8075737953186035, + "learning_rate": 1.4208389795067334e-08, + "loss": 0.0405, + "num_input_tokens_seen": 37996560, + "step": 180045 + }, + { + "epoch": 19.80748074807481, + "grad_norm": 0.18258188664913177, + "learning_rate": 1.4127599219895815e-08, + "loss": 0.0025, + "num_input_tokens_seen": 37997552, + "step": 180050 + }, + { + "epoch": 19.808030803080307, + "grad_norm": 0.009545584209263325, + "learning_rate": 1.4047038927517132e-08, + "loss": 0.0032, + "num_input_tokens_seen": 37998608, + "step": 180055 + }, + { + "epoch": 19.808580858085808, + "grad_norm": 0.2370854914188385, + "learning_rate": 1.3966708918680681e-08, + "loss": 0.0056, + "num_input_tokens_seen": 37999696, + "step": 180060 + }, + { + "epoch": 19.80913091309131, + "grad_norm": 0.012610831297934055, + "learning_rate": 1.3886609194121992e-08, + "loss": 0.0046, + "num_input_tokens_seen": 38000752, + "step": 180065 + }, + { + "epoch": 19.80968096809681, + "grad_norm": 1.6643459796905518, + "learning_rate": 1.3806739754579356e-08, + "loss": 0.0766, + "num_input_tokens_seen": 38001840, + "step": 180070 + }, + { + "epoch": 19.81023102310231, + "grad_norm": 1.0379841327667236, + "learning_rate": 1.3727100600793852e-08, + "loss": 0.1093, + "num_input_tokens_seen": 38002832, + "step": 180075 + }, + { + "epoch": 19.810781078107812, + "grad_norm": 0.16148269176483154, + "learning_rate": 1.3647691733492674e-08, + "loss": 0.0022, + "num_input_tokens_seen": 38003920, + "step": 180080 + }, + { + "epoch": 19.81133113311331, + "grad_norm": 0.028589174151420593, + "learning_rate": 1.356851315341412e-08, + "loss": 0.1453, + "num_input_tokens_seen": 38005008, + "step": 180085 + }, + { + "epoch": 19.81188118811881, + "grad_norm": 0.022465556859970093, + "learning_rate": 1.3489564861282611e-08, + "loss": 0.0634, + "num_input_tokens_seen": 38006000, + "step": 180090 + }, + { + "epoch": 19.812431243124312, + "grad_norm": 2.5295047760009766, + "learning_rate": 1.3410846857830894e-08, + "loss": 0.0798, + "num_input_tokens_seen": 38007152, + "step": 180095 + }, + { + "epoch": 19.812981298129813, + "grad_norm": 0.005718804430216551, + "learning_rate": 1.333235914377784e-08, + "loss": 0.0006, + "num_input_tokens_seen": 38008240, + "step": 180100 + }, + { + "epoch": 19.813531353135314, + "grad_norm": 0.01711699552834034, + "learning_rate": 1.3254101719853418e-08, + "loss": 0.0118, + "num_input_tokens_seen": 38009296, + "step": 180105 + }, + { + "epoch": 19.814081408140815, + "grad_norm": 0.059679064899683, + "learning_rate": 1.3176074586773723e-08, + "loss": 0.0009, + "num_input_tokens_seen": 38010320, + "step": 180110 + }, + { + "epoch": 19.814631463146316, + "grad_norm": 0.011929776519536972, + "learning_rate": 1.3098277745263176e-08, + "loss": 0.0049, + "num_input_tokens_seen": 38011408, + "step": 180115 + }, + { + "epoch": 19.815181518151814, + "grad_norm": 0.1637469381093979, + "learning_rate": 1.3020711196035095e-08, + "loss": 0.0075, + "num_input_tokens_seen": 38012496, + "step": 180120 + }, + { + "epoch": 19.815731573157315, + "grad_norm": 0.012894428335130215, + "learning_rate": 1.2943374939805575e-08, + "loss": 0.0108, + "num_input_tokens_seen": 38013616, + "step": 180125 + }, + { + "epoch": 19.816281628162816, + "grad_norm": 0.02333774045109749, + "learning_rate": 1.2866268977285156e-08, + "loss": 0.0835, + "num_input_tokens_seen": 38014640, + "step": 180130 + }, + { + "epoch": 19.816831683168317, + "grad_norm": 0.20546399056911469, + "learning_rate": 1.2789393309187158e-08, + "loss": 0.1403, + "num_input_tokens_seen": 38015728, + "step": 180135 + }, + { + "epoch": 19.817381738173818, + "grad_norm": 1.9299582242965698, + "learning_rate": 1.2712747936219349e-08, + "loss": 0.0065, + "num_input_tokens_seen": 38016784, + "step": 180140 + }, + { + "epoch": 19.81793179317932, + "grad_norm": 0.7422940731048584, + "learning_rate": 1.2636332859089495e-08, + "loss": 0.0321, + "num_input_tokens_seen": 38017808, + "step": 180145 + }, + { + "epoch": 19.818481848184817, + "grad_norm": 0.018199287354946136, + "learning_rate": 1.2560148078497035e-08, + "loss": 0.0011, + "num_input_tokens_seen": 38018864, + "step": 180150 + }, + { + "epoch": 19.819031903190318, + "grad_norm": 0.03687836602330208, + "learning_rate": 1.2484193595152516e-08, + "loss": 0.0015, + "num_input_tokens_seen": 38019952, + "step": 180155 + }, + { + "epoch": 19.81958195819582, + "grad_norm": 0.027844935655593872, + "learning_rate": 1.2408469409747048e-08, + "loss": 0.0025, + "num_input_tokens_seen": 38021040, + "step": 180160 + }, + { + "epoch": 19.82013201320132, + "grad_norm": 0.05189003422856331, + "learning_rate": 1.2332975522982848e-08, + "loss": 0.0015, + "num_input_tokens_seen": 38022128, + "step": 180165 + }, + { + "epoch": 19.82068206820682, + "grad_norm": 0.0052910977974534035, + "learning_rate": 1.2257711935556582e-08, + "loss": 0.0017, + "num_input_tokens_seen": 38023184, + "step": 180170 + }, + { + "epoch": 19.821232123212322, + "grad_norm": 0.009046093560755253, + "learning_rate": 1.2182678648159363e-08, + "loss": 0.0041, + "num_input_tokens_seen": 38024208, + "step": 180175 + }, + { + "epoch": 19.821782178217823, + "grad_norm": 0.0030799757223576307, + "learning_rate": 1.210787566148508e-08, + "loss": 0.0015, + "num_input_tokens_seen": 38025264, + "step": 180180 + }, + { + "epoch": 19.82233223322332, + "grad_norm": 2.2188215255737305, + "learning_rate": 1.2033302976222071e-08, + "loss": 0.0443, + "num_input_tokens_seen": 38026256, + "step": 180185 + }, + { + "epoch": 19.822882288228822, + "grad_norm": 0.23237557709217072, + "learning_rate": 1.1958960593058676e-08, + "loss": 0.0093, + "num_input_tokens_seen": 38027312, + "step": 180190 + }, + { + "epoch": 19.823432343234323, + "grad_norm": 0.09653429687023163, + "learning_rate": 1.1884848512677683e-08, + "loss": 0.0555, + "num_input_tokens_seen": 38028304, + "step": 180195 + }, + { + "epoch": 19.823982398239824, + "grad_norm": 0.011284726671874523, + "learning_rate": 1.1810966735764651e-08, + "loss": 0.0021, + "num_input_tokens_seen": 38029392, + "step": 180200 + }, + { + "epoch": 19.824532453245325, + "grad_norm": 0.008513159118592739, + "learning_rate": 1.1737315262999593e-08, + "loss": 0.0054, + "num_input_tokens_seen": 38030448, + "step": 180205 + }, + { + "epoch": 19.825082508250826, + "grad_norm": 0.05400293692946434, + "learning_rate": 1.1663894095059747e-08, + "loss": 0.0108, + "num_input_tokens_seen": 38031472, + "step": 180210 + }, + { + "epoch": 19.825632563256324, + "grad_norm": 0.03072608821094036, + "learning_rate": 1.1590703232625122e-08, + "loss": 0.0122, + "num_input_tokens_seen": 38032560, + "step": 180215 + }, + { + "epoch": 19.826182618261825, + "grad_norm": 0.01608935371041298, + "learning_rate": 1.151774267636463e-08, + "loss": 0.0866, + "num_input_tokens_seen": 38033616, + "step": 180220 + }, + { + "epoch": 19.826732673267326, + "grad_norm": 0.21207638084888458, + "learning_rate": 1.144501242695828e-08, + "loss": 0.0069, + "num_input_tokens_seen": 38034736, + "step": 180225 + }, + { + "epoch": 19.827282728272827, + "grad_norm": 0.005056168884038925, + "learning_rate": 1.1372512485072206e-08, + "loss": 0.0016, + "num_input_tokens_seen": 38035760, + "step": 180230 + }, + { + "epoch": 19.82783278327833, + "grad_norm": 2.0265393257141113, + "learning_rate": 1.1300242851372545e-08, + "loss": 0.0953, + "num_input_tokens_seen": 38036848, + "step": 180235 + }, + { + "epoch": 19.82838283828383, + "grad_norm": 0.06788447499275208, + "learning_rate": 1.1228203526530978e-08, + "loss": 0.0814, + "num_input_tokens_seen": 38037872, + "step": 180240 + }, + { + "epoch": 19.82893289328933, + "grad_norm": 0.10646149516105652, + "learning_rate": 1.1156394511205315e-08, + "loss": 0.0157, + "num_input_tokens_seen": 38038928, + "step": 180245 + }, + { + "epoch": 19.829482948294828, + "grad_norm": 0.04120558872818947, + "learning_rate": 1.1084815806061688e-08, + "loss": 0.0064, + "num_input_tokens_seen": 38039952, + "step": 180250 + }, + { + "epoch": 19.83003300330033, + "grad_norm": 0.023973967880010605, + "learning_rate": 1.1013467411757905e-08, + "loss": 0.003, + "num_input_tokens_seen": 38041008, + "step": 180255 + }, + { + "epoch": 19.83058305830583, + "grad_norm": 0.014738621190190315, + "learning_rate": 1.0942349328951773e-08, + "loss": 0.0401, + "num_input_tokens_seen": 38042096, + "step": 180260 + }, + { + "epoch": 19.83113311331133, + "grad_norm": 0.12280084937810898, + "learning_rate": 1.0871461558301099e-08, + "loss": 0.0054, + "num_input_tokens_seen": 38043152, + "step": 180265 + }, + { + "epoch": 19.831683168316832, + "grad_norm": 0.18758970499038696, + "learning_rate": 1.0800804100455364e-08, + "loss": 0.0188, + "num_input_tokens_seen": 38044176, + "step": 180270 + }, + { + "epoch": 19.832233223322334, + "grad_norm": 0.004614821635186672, + "learning_rate": 1.07303769560696e-08, + "loss": 0.0601, + "num_input_tokens_seen": 38045200, + "step": 180275 + }, + { + "epoch": 19.83278327832783, + "grad_norm": 0.029306679964065552, + "learning_rate": 1.0660180125790509e-08, + "loss": 0.0056, + "num_input_tokens_seen": 38046256, + "step": 180280 + }, + { + "epoch": 19.833333333333332, + "grad_norm": 0.018533948808908463, + "learning_rate": 1.0590213610264798e-08, + "loss": 0.0012, + "num_input_tokens_seen": 38047376, + "step": 180285 + }, + { + "epoch": 19.833883388338833, + "grad_norm": 0.05461302772164345, + "learning_rate": 1.0520477410136398e-08, + "loss": 0.0052, + "num_input_tokens_seen": 38048496, + "step": 180290 + }, + { + "epoch": 19.834433443344334, + "grad_norm": 0.3062938153743744, + "learning_rate": 1.0450971526052012e-08, + "loss": 0.0042, + "num_input_tokens_seen": 38049488, + "step": 180295 + }, + { + "epoch": 19.834983498349835, + "grad_norm": 0.018414849415421486, + "learning_rate": 1.0381695958647242e-08, + "loss": 0.0796, + "num_input_tokens_seen": 38050448, + "step": 180300 + }, + { + "epoch": 19.835533553355337, + "grad_norm": 0.063539057970047, + "learning_rate": 1.0312650708566018e-08, + "loss": 0.0055, + "num_input_tokens_seen": 38051504, + "step": 180305 + }, + { + "epoch": 19.836083608360838, + "grad_norm": 0.2328234165906906, + "learning_rate": 1.0243835776441168e-08, + "loss": 0.0101, + "num_input_tokens_seen": 38052560, + "step": 180310 + }, + { + "epoch": 19.836633663366335, + "grad_norm": 1.2046282291412354, + "learning_rate": 1.017525116290552e-08, + "loss": 0.0705, + "num_input_tokens_seen": 38053648, + "step": 180315 + }, + { + "epoch": 19.837183718371836, + "grad_norm": 0.042646363377571106, + "learning_rate": 1.0106896868597449e-08, + "loss": 0.0014, + "num_input_tokens_seen": 38054736, + "step": 180320 + }, + { + "epoch": 19.837733773377337, + "grad_norm": 0.012913791462779045, + "learning_rate": 1.0038772894138681e-08, + "loss": 0.0029, + "num_input_tokens_seen": 38055792, + "step": 180325 + }, + { + "epoch": 19.83828382838284, + "grad_norm": 0.11421441286802292, + "learning_rate": 9.970879240164822e-09, + "loss": 0.005, + "num_input_tokens_seen": 38056816, + "step": 180330 + }, + { + "epoch": 19.83883388338834, + "grad_norm": 0.0367574617266655, + "learning_rate": 9.903215907294817e-09, + "loss": 0.007, + "num_input_tokens_seen": 38057808, + "step": 180335 + }, + { + "epoch": 19.83938393839384, + "grad_norm": 0.463832288980484, + "learning_rate": 9.835782896155943e-09, + "loss": 0.0259, + "num_input_tokens_seen": 38058768, + "step": 180340 + }, + { + "epoch": 19.83993399339934, + "grad_norm": 0.09024159610271454, + "learning_rate": 9.768580207369926e-09, + "loss": 0.0041, + "num_input_tokens_seen": 38059856, + "step": 180345 + }, + { + "epoch": 19.84048404840484, + "grad_norm": 0.03688209876418114, + "learning_rate": 9.701607841555715e-09, + "loss": 0.004, + "num_input_tokens_seen": 38060944, + "step": 180350 + }, + { + "epoch": 19.84103410341034, + "grad_norm": 0.03242967650294304, + "learning_rate": 9.634865799329484e-09, + "loss": 0.004, + "num_input_tokens_seen": 38062032, + "step": 180355 + }, + { + "epoch": 19.84158415841584, + "grad_norm": 0.4432687759399414, + "learning_rate": 9.568354081307406e-09, + "loss": 0.0091, + "num_input_tokens_seen": 38063088, + "step": 180360 + }, + { + "epoch": 19.842134213421343, + "grad_norm": 0.06223051995038986, + "learning_rate": 9.502072688102881e-09, + "loss": 0.0009, + "num_input_tokens_seen": 38064144, + "step": 180365 + }, + { + "epoch": 19.842684268426844, + "grad_norm": 0.07289158552885056, + "learning_rate": 9.436021620326529e-09, + "loss": 0.0025, + "num_input_tokens_seen": 38065200, + "step": 180370 + }, + { + "epoch": 19.843234323432345, + "grad_norm": 0.17227691411972046, + "learning_rate": 9.3702008785862e-09, + "loss": 0.0239, + "num_input_tokens_seen": 38066256, + "step": 180375 + }, + { + "epoch": 19.843784378437842, + "grad_norm": 0.01126967091113329, + "learning_rate": 9.304610463489738e-09, + "loss": 0.0022, + "num_input_tokens_seen": 38067312, + "step": 180380 + }, + { + "epoch": 19.844334433443343, + "grad_norm": 0.019227230921387672, + "learning_rate": 9.239250375639441e-09, + "loss": 0.0018, + "num_input_tokens_seen": 38068432, + "step": 180385 + }, + { + "epoch": 19.844884488448844, + "grad_norm": 0.020353112369775772, + "learning_rate": 9.174120615640381e-09, + "loss": 0.0027, + "num_input_tokens_seen": 38069552, + "step": 180390 + }, + { + "epoch": 19.845434543454346, + "grad_norm": 0.015423459932208061, + "learning_rate": 9.1092211840893e-09, + "loss": 0.0481, + "num_input_tokens_seen": 38070576, + "step": 180395 + }, + { + "epoch": 19.845984598459847, + "grad_norm": 0.022184180095791817, + "learning_rate": 9.044552081588497e-09, + "loss": 0.0255, + "num_input_tokens_seen": 38071600, + "step": 180400 + }, + { + "epoch": 19.846534653465348, + "grad_norm": 0.07957761734724045, + "learning_rate": 8.980113308731942e-09, + "loss": 0.0291, + "num_input_tokens_seen": 38072624, + "step": 180405 + }, + { + "epoch": 19.847084708470845, + "grad_norm": 0.03498510271310806, + "learning_rate": 8.915904866116376e-09, + "loss": 0.0015, + "num_input_tokens_seen": 38073680, + "step": 180410 + }, + { + "epoch": 19.847634763476346, + "grad_norm": 0.02731332555413246, + "learning_rate": 8.851926754327444e-09, + "loss": 0.0016, + "num_input_tokens_seen": 38074704, + "step": 180415 + }, + { + "epoch": 19.848184818481847, + "grad_norm": 0.026018578559160233, + "learning_rate": 8.788178973959115e-09, + "loss": 0.1203, + "num_input_tokens_seen": 38075760, + "step": 180420 + }, + { + "epoch": 19.84873487348735, + "grad_norm": 0.0329865887761116, + "learning_rate": 8.724661525599809e-09, + "loss": 0.003, + "num_input_tokens_seen": 38076816, + "step": 180425 + }, + { + "epoch": 19.84928492849285, + "grad_norm": 0.037872448563575745, + "learning_rate": 8.66137440983239e-09, + "loss": 0.0397, + "num_input_tokens_seen": 38077904, + "step": 180430 + }, + { + "epoch": 19.84983498349835, + "grad_norm": 0.0047975159250199795, + "learning_rate": 8.598317627239727e-09, + "loss": 0.0039, + "num_input_tokens_seen": 38078960, + "step": 180435 + }, + { + "epoch": 19.850385038503852, + "grad_norm": 0.03761908784508705, + "learning_rate": 8.535491178407462e-09, + "loss": 0.0662, + "num_input_tokens_seen": 38080048, + "step": 180440 + }, + { + "epoch": 19.85093509350935, + "grad_norm": 0.006941532250493765, + "learning_rate": 8.472895063910135e-09, + "loss": 0.0027, + "num_input_tokens_seen": 38081072, + "step": 180445 + }, + { + "epoch": 19.85148514851485, + "grad_norm": 0.07089921087026596, + "learning_rate": 8.410529284325064e-09, + "loss": 0.1086, + "num_input_tokens_seen": 38082128, + "step": 180450 + }, + { + "epoch": 19.85203520352035, + "grad_norm": 0.0028366360347718, + "learning_rate": 8.348393840226787e-09, + "loss": 0.0856, + "num_input_tokens_seen": 38083152, + "step": 180455 + }, + { + "epoch": 19.852585258525853, + "grad_norm": 0.06603546440601349, + "learning_rate": 8.286488732192621e-09, + "loss": 0.0145, + "num_input_tokens_seen": 38084240, + "step": 180460 + }, + { + "epoch": 19.853135313531354, + "grad_norm": 0.09880516678094864, + "learning_rate": 8.22481396078878e-09, + "loss": 0.0021, + "num_input_tokens_seen": 38085264, + "step": 180465 + }, + { + "epoch": 19.853685368536855, + "grad_norm": 0.8256062865257263, + "learning_rate": 8.163369526584252e-09, + "loss": 0.0978, + "num_input_tokens_seen": 38086320, + "step": 180470 + }, + { + "epoch": 19.854235423542356, + "grad_norm": 0.1051439419388771, + "learning_rate": 8.102155430145253e-09, + "loss": 0.01, + "num_input_tokens_seen": 38087312, + "step": 180475 + }, + { + "epoch": 19.854785478547853, + "grad_norm": 0.04185781627893448, + "learning_rate": 8.04117167203522e-09, + "loss": 0.0021, + "num_input_tokens_seen": 38088336, + "step": 180480 + }, + { + "epoch": 19.855335533553355, + "grad_norm": 0.07583494484424591, + "learning_rate": 7.98041825281759e-09, + "loss": 0.0048, + "num_input_tokens_seen": 38089328, + "step": 180485 + }, + { + "epoch": 19.855885588558856, + "grad_norm": 0.354543536901474, + "learning_rate": 7.919895173053027e-09, + "loss": 0.0728, + "num_input_tokens_seen": 38090416, + "step": 180490 + }, + { + "epoch": 19.856435643564357, + "grad_norm": 2.1016006469726562, + "learning_rate": 7.859602433299417e-09, + "loss": 0.1015, + "num_input_tokens_seen": 38091472, + "step": 180495 + }, + { + "epoch": 19.856985698569858, + "grad_norm": 0.05474432557821274, + "learning_rate": 7.799540034111874e-09, + "loss": 0.044, + "num_input_tokens_seen": 38092528, + "step": 180500 + }, + { + "epoch": 19.85753575357536, + "grad_norm": 0.0111995292827487, + "learning_rate": 7.739707976042732e-09, + "loss": 0.0068, + "num_input_tokens_seen": 38093552, + "step": 180505 + }, + { + "epoch": 19.858085808580856, + "grad_norm": 0.38770410418510437, + "learning_rate": 7.680106259641551e-09, + "loss": 0.0027, + "num_input_tokens_seen": 38094608, + "step": 180510 + }, + { + "epoch": 19.858635863586358, + "grad_norm": 0.00579464714974165, + "learning_rate": 7.620734885463443e-09, + "loss": 0.0112, + "num_input_tokens_seen": 38095632, + "step": 180515 + }, + { + "epoch": 19.85918591859186, + "grad_norm": 0.011383728124201298, + "learning_rate": 7.561593854052418e-09, + "loss": 0.0055, + "num_input_tokens_seen": 38096752, + "step": 180520 + }, + { + "epoch": 19.85973597359736, + "grad_norm": 0.012630333192646503, + "learning_rate": 7.502683165952484e-09, + "loss": 0.0026, + "num_input_tokens_seen": 38097776, + "step": 180525 + }, + { + "epoch": 19.86028602860286, + "grad_norm": 0.012461751699447632, + "learning_rate": 7.444002821707652e-09, + "loss": 0.0009, + "num_input_tokens_seen": 38098864, + "step": 180530 + }, + { + "epoch": 19.860836083608362, + "grad_norm": 0.08533644676208496, + "learning_rate": 7.385552821859154e-09, + "loss": 0.0569, + "num_input_tokens_seen": 38099920, + "step": 180535 + }, + { + "epoch": 19.861386138613863, + "grad_norm": 0.42065414786338806, + "learning_rate": 7.3273331669454495e-09, + "loss": 0.0031, + "num_input_tokens_seen": 38100944, + "step": 180540 + }, + { + "epoch": 19.86193619361936, + "grad_norm": 0.026150427758693695, + "learning_rate": 7.269343857504996e-09, + "loss": 0.097, + "num_input_tokens_seen": 38102032, + "step": 180545 + }, + { + "epoch": 19.86248624862486, + "grad_norm": 1.8869450092315674, + "learning_rate": 7.211584894067924e-09, + "loss": 0.0791, + "num_input_tokens_seen": 38103088, + "step": 180550 + }, + { + "epoch": 19.863036303630363, + "grad_norm": 0.002037670696154237, + "learning_rate": 7.154056277169918e-09, + "loss": 0.0049, + "num_input_tokens_seen": 38104144, + "step": 180555 + }, + { + "epoch": 19.863586358635864, + "grad_norm": 1.8490118980407715, + "learning_rate": 7.096758007338333e-09, + "loss": 0.0166, + "num_input_tokens_seen": 38105200, + "step": 180560 + }, + { + "epoch": 19.864136413641365, + "grad_norm": 0.12960946559906006, + "learning_rate": 7.039690085106076e-09, + "loss": 0.0019, + "num_input_tokens_seen": 38106192, + "step": 180565 + }, + { + "epoch": 19.864686468646866, + "grad_norm": 0.02515771985054016, + "learning_rate": 6.982852510994953e-09, + "loss": 0.0014, + "num_input_tokens_seen": 38107216, + "step": 180570 + }, + { + "epoch": 19.865236523652364, + "grad_norm": 0.08191465586423874, + "learning_rate": 6.926245285529542e-09, + "loss": 0.0664, + "num_input_tokens_seen": 38108240, + "step": 180575 + }, + { + "epoch": 19.865786578657865, + "grad_norm": 0.04177332669496536, + "learning_rate": 6.86986840923165e-09, + "loss": 0.1021, + "num_input_tokens_seen": 38109264, + "step": 180580 + }, + { + "epoch": 19.866336633663366, + "grad_norm": 0.4298917353153229, + "learning_rate": 6.813721882623081e-09, + "loss": 0.0057, + "num_input_tokens_seen": 38110320, + "step": 180585 + }, + { + "epoch": 19.866886688668867, + "grad_norm": 0.04783080145716667, + "learning_rate": 6.757805706217313e-09, + "loss": 0.0043, + "num_input_tokens_seen": 38111312, + "step": 180590 + }, + { + "epoch": 19.867436743674368, + "grad_norm": 1.4047002792358398, + "learning_rate": 6.702119880533375e-09, + "loss": 0.0728, + "num_input_tokens_seen": 38112368, + "step": 180595 + }, + { + "epoch": 19.86798679867987, + "grad_norm": 0.0071216123178601265, + "learning_rate": 6.646664406084746e-09, + "loss": 0.0068, + "num_input_tokens_seen": 38113392, + "step": 180600 + }, + { + "epoch": 19.86853685368537, + "grad_norm": 0.044964201748371124, + "learning_rate": 6.591439283379353e-09, + "loss": 0.0013, + "num_input_tokens_seen": 38114544, + "step": 180605 + }, + { + "epoch": 19.869086908690868, + "grad_norm": 0.13300856947898865, + "learning_rate": 6.5364445129251214e-09, + "loss": 0.0673, + "num_input_tokens_seen": 38115600, + "step": 180610 + }, + { + "epoch": 19.86963696369637, + "grad_norm": 0.03674536570906639, + "learning_rate": 6.481680095235532e-09, + "loss": 0.0265, + "num_input_tokens_seen": 38116656, + "step": 180615 + }, + { + "epoch": 19.87018701870187, + "grad_norm": 0.10106296092271805, + "learning_rate": 6.427146030807407e-09, + "loss": 0.0682, + "num_input_tokens_seen": 38117776, + "step": 180620 + }, + { + "epoch": 19.87073707370737, + "grad_norm": 0.012422611005604267, + "learning_rate": 6.372842320151451e-09, + "loss": 0.0017, + "num_input_tokens_seen": 38118832, + "step": 180625 + }, + { + "epoch": 19.871287128712872, + "grad_norm": 0.00794962514191866, + "learning_rate": 6.318768963758936e-09, + "loss": 0.0347, + "num_input_tokens_seen": 38119856, + "step": 180630 + }, + { + "epoch": 19.871837183718373, + "grad_norm": 0.01287019345909357, + "learning_rate": 6.264925962137791e-09, + "loss": 0.0121, + "num_input_tokens_seen": 38120912, + "step": 180635 + }, + { + "epoch": 19.87238723872387, + "grad_norm": 0.05582428723573685, + "learning_rate": 6.211313315776512e-09, + "loss": 0.0037, + "num_input_tokens_seen": 38121936, + "step": 180640 + }, + { + "epoch": 19.872937293729372, + "grad_norm": 0.09238170087337494, + "learning_rate": 6.157931025174701e-09, + "loss": 0.0065, + "num_input_tokens_seen": 38122992, + "step": 180645 + }, + { + "epoch": 19.873487348734873, + "grad_norm": 0.010594387538731098, + "learning_rate": 6.104779090820856e-09, + "loss": 0.0178, + "num_input_tokens_seen": 38124112, + "step": 180650 + }, + { + "epoch": 19.874037403740374, + "grad_norm": 0.017024559900164604, + "learning_rate": 6.05185751320625e-09, + "loss": 0.0622, + "num_input_tokens_seen": 38125168, + "step": 180655 + }, + { + "epoch": 19.874587458745875, + "grad_norm": 2.5993504524230957, + "learning_rate": 5.9991662928166045e-09, + "loss": 0.1457, + "num_input_tokens_seen": 38126224, + "step": 180660 + }, + { + "epoch": 19.875137513751376, + "grad_norm": 0.018271764740347862, + "learning_rate": 5.946705430143196e-09, + "loss": 0.0257, + "num_input_tokens_seen": 38127280, + "step": 180665 + }, + { + "epoch": 19.875687568756877, + "grad_norm": 0.002246806863695383, + "learning_rate": 5.894474925663418e-09, + "loss": 0.0029, + "num_input_tokens_seen": 38128304, + "step": 180670 + }, + { + "epoch": 19.876237623762375, + "grad_norm": 1.0197415351867676, + "learning_rate": 5.842474779860219e-09, + "loss": 0.0602, + "num_input_tokens_seen": 38129360, + "step": 180675 + }, + { + "epoch": 19.876787678767876, + "grad_norm": 0.15053339302539825, + "learning_rate": 5.7907049932137695e-09, + "loss": 0.0041, + "num_input_tokens_seen": 38130416, + "step": 180680 + }, + { + "epoch": 19.877337733773377, + "grad_norm": 0.34917551279067993, + "learning_rate": 5.7391655662014655e-09, + "loss": 0.0049, + "num_input_tokens_seen": 38131472, + "step": 180685 + }, + { + "epoch": 19.877887788778878, + "grad_norm": 0.07958274334669113, + "learning_rate": 5.687856499297928e-09, + "loss": 0.0031, + "num_input_tokens_seen": 38132528, + "step": 180690 + }, + { + "epoch": 19.87843784378438, + "grad_norm": 0.06862737238407135, + "learning_rate": 5.6367777929777765e-09, + "loss": 0.0138, + "num_input_tokens_seen": 38133584, + "step": 180695 + }, + { + "epoch": 19.87898789878988, + "grad_norm": 0.1208171546459198, + "learning_rate": 5.585929447707305e-09, + "loss": 0.0037, + "num_input_tokens_seen": 38134608, + "step": 180700 + }, + { + "epoch": 19.879537953795378, + "grad_norm": 3.624837636947632, + "learning_rate": 5.535311463958359e-09, + "loss": 0.1031, + "num_input_tokens_seen": 38135632, + "step": 180705 + }, + { + "epoch": 19.88008800880088, + "grad_norm": 0.021333232522010803, + "learning_rate": 5.4849238421972314e-09, + "loss": 0.006, + "num_input_tokens_seen": 38136752, + "step": 180710 + }, + { + "epoch": 19.88063806380638, + "grad_norm": 0.0070388526655733585, + "learning_rate": 5.434766582887441e-09, + "loss": 0.003, + "num_input_tokens_seen": 38137744, + "step": 180715 + }, + { + "epoch": 19.88118811881188, + "grad_norm": 0.21348372101783752, + "learning_rate": 5.384839686492505e-09, + "loss": 0.0077, + "num_input_tokens_seen": 38138800, + "step": 180720 + }, + { + "epoch": 19.881738173817382, + "grad_norm": 0.022273382171988487, + "learning_rate": 5.3351431534731656e-09, + "loss": 0.0046, + "num_input_tokens_seen": 38139824, + "step": 180725 + }, + { + "epoch": 19.882288228822883, + "grad_norm": 0.044631198048591614, + "learning_rate": 5.285676984284615e-09, + "loss": 0.0602, + "num_input_tokens_seen": 38140816, + "step": 180730 + }, + { + "epoch": 19.882838283828384, + "grad_norm": 0.003910890780389309, + "learning_rate": 5.236441179384821e-09, + "loss": 0.0288, + "num_input_tokens_seen": 38141872, + "step": 180735 + }, + { + "epoch": 19.883388338833882, + "grad_norm": 0.02003576047718525, + "learning_rate": 5.187435739226199e-09, + "loss": 0.0007, + "num_input_tokens_seen": 38142928, + "step": 180740 + }, + { + "epoch": 19.883938393839383, + "grad_norm": 0.07127800583839417, + "learning_rate": 5.138660664263939e-09, + "loss": 0.0691, + "num_input_tokens_seen": 38143984, + "step": 180745 + }, + { + "epoch": 19.884488448844884, + "grad_norm": 0.15958094596862793, + "learning_rate": 5.090115954942132e-09, + "loss": 0.0585, + "num_input_tokens_seen": 38145040, + "step": 180750 + }, + { + "epoch": 19.885038503850385, + "grad_norm": 0.09702275693416595, + "learning_rate": 5.041801611713193e-09, + "loss": 0.0029, + "num_input_tokens_seen": 38146096, + "step": 180755 + }, + { + "epoch": 19.885588558855886, + "grad_norm": 0.011891711503267288, + "learning_rate": 4.993717635018436e-09, + "loss": 0.0008, + "num_input_tokens_seen": 38147152, + "step": 180760 + }, + { + "epoch": 19.886138613861387, + "grad_norm": 0.026430442929267883, + "learning_rate": 4.945864025301949e-09, + "loss": 0.0012, + "num_input_tokens_seen": 38148304, + "step": 180765 + }, + { + "epoch": 19.88668866886689, + "grad_norm": 0.08278828114271164, + "learning_rate": 4.8982407830078235e-09, + "loss": 0.008, + "num_input_tokens_seen": 38149328, + "step": 180770 + }, + { + "epoch": 19.887238723872386, + "grad_norm": 0.9342108368873596, + "learning_rate": 4.850847908571821e-09, + "loss": 0.0056, + "num_input_tokens_seen": 38150352, + "step": 180775 + }, + { + "epoch": 19.887788778877887, + "grad_norm": 0.09575145691633224, + "learning_rate": 4.803685402432479e-09, + "loss": 0.002, + "num_input_tokens_seen": 38151440, + "step": 180780 + }, + { + "epoch": 19.888338833883388, + "grad_norm": 0.005800779443234205, + "learning_rate": 4.756753265022784e-09, + "loss": 0.0227, + "num_input_tokens_seen": 38152496, + "step": 180785 + }, + { + "epoch": 19.88888888888889, + "grad_norm": 0.03509041294455528, + "learning_rate": 4.710051496775725e-09, + "loss": 0.0038, + "num_input_tokens_seen": 38153552, + "step": 180790 + }, + { + "epoch": 19.88943894389439, + "grad_norm": 0.02950676903128624, + "learning_rate": 4.6635800981242875e-09, + "loss": 0.0009, + "num_input_tokens_seen": 38154672, + "step": 180795 + }, + { + "epoch": 19.88998899889989, + "grad_norm": 0.018797263503074646, + "learning_rate": 4.617339069493132e-09, + "loss": 0.0017, + "num_input_tokens_seen": 38155696, + "step": 180800 + }, + { + "epoch": 19.89053905390539, + "grad_norm": 0.1715078204870224, + "learning_rate": 4.5713284113096944e-09, + "loss": 0.0606, + "num_input_tokens_seen": 38156752, + "step": 180805 + }, + { + "epoch": 19.89108910891089, + "grad_norm": 2.613321542739868, + "learning_rate": 4.525548124001411e-09, + "loss": 0.0775, + "num_input_tokens_seen": 38157776, + "step": 180810 + }, + { + "epoch": 19.89163916391639, + "grad_norm": 0.06042969599366188, + "learning_rate": 4.4799982079846145e-09, + "loss": 0.0084, + "num_input_tokens_seen": 38158832, + "step": 180815 + }, + { + "epoch": 19.892189218921892, + "grad_norm": 0.16689451038837433, + "learning_rate": 4.434678663681191e-09, + "loss": 0.005, + "num_input_tokens_seen": 38159856, + "step": 180820 + }, + { + "epoch": 19.892739273927393, + "grad_norm": 0.5445982813835144, + "learning_rate": 4.389589491510249e-09, + "loss": 0.0085, + "num_input_tokens_seen": 38160912, + "step": 180825 + }, + { + "epoch": 19.893289328932894, + "grad_norm": 0.003961287904530764, + "learning_rate": 4.344730691885346e-09, + "loss": 0.0059, + "num_input_tokens_seen": 38162000, + "step": 180830 + }, + { + "epoch": 19.893839383938392, + "grad_norm": 0.009243090637028217, + "learning_rate": 4.3001022652228165e-09, + "loss": 0.001, + "num_input_tokens_seen": 38163056, + "step": 180835 + }, + { + "epoch": 19.894389438943893, + "grad_norm": 0.04971938580274582, + "learning_rate": 4.255704211930667e-09, + "loss": 0.0019, + "num_input_tokens_seen": 38164080, + "step": 180840 + }, + { + "epoch": 19.894939493949394, + "grad_norm": 2.9805006980895996, + "learning_rate": 4.21153653241968e-09, + "loss": 0.1195, + "num_input_tokens_seen": 38165104, + "step": 180845 + }, + { + "epoch": 19.895489548954895, + "grad_norm": 0.02813955768942833, + "learning_rate": 4.167599227097863e-09, + "loss": 0.0048, + "num_input_tokens_seen": 38166160, + "step": 180850 + }, + { + "epoch": 19.896039603960396, + "grad_norm": 1.3415682315826416, + "learning_rate": 4.123892296367671e-09, + "loss": 0.0094, + "num_input_tokens_seen": 38167248, + "step": 180855 + }, + { + "epoch": 19.896589658965897, + "grad_norm": 0.04048829525709152, + "learning_rate": 4.080415740634335e-09, + "loss": 0.0055, + "num_input_tokens_seen": 38168272, + "step": 180860 + }, + { + "epoch": 19.8971397139714, + "grad_norm": 0.14727075397968292, + "learning_rate": 4.037169560297538e-09, + "loss": 0.0075, + "num_input_tokens_seen": 38169232, + "step": 180865 + }, + { + "epoch": 19.897689768976896, + "grad_norm": 0.11772365868091583, + "learning_rate": 3.994153755754182e-09, + "loss": 0.0055, + "num_input_tokens_seen": 38170256, + "step": 180870 + }, + { + "epoch": 19.898239823982397, + "grad_norm": 0.0845704898238182, + "learning_rate": 3.951368327403948e-09, + "loss": 0.1165, + "num_input_tokens_seen": 38171344, + "step": 180875 + }, + { + "epoch": 19.8987898789879, + "grad_norm": 0.009561244398355484, + "learning_rate": 3.908813275640966e-09, + "loss": 0.0819, + "num_input_tokens_seen": 38172496, + "step": 180880 + }, + { + "epoch": 19.8993399339934, + "grad_norm": 0.053931791335344315, + "learning_rate": 3.8664886008538124e-09, + "loss": 0.0272, + "num_input_tokens_seen": 38173584, + "step": 180885 + }, + { + "epoch": 19.8998899889989, + "grad_norm": 0.02227320708334446, + "learning_rate": 3.824394303436618e-09, + "loss": 0.0056, + "num_input_tokens_seen": 38174608, + "step": 180890 + }, + { + "epoch": 19.9004400440044, + "grad_norm": 0.009132704697549343, + "learning_rate": 3.782530383775185e-09, + "loss": 0.0016, + "num_input_tokens_seen": 38175568, + "step": 180895 + }, + { + "epoch": 19.900990099009903, + "grad_norm": 0.03967072442173958, + "learning_rate": 3.740896842255315e-09, + "loss": 0.0111, + "num_input_tokens_seen": 38176624, + "step": 180900 + }, + { + "epoch": 19.9015401540154, + "grad_norm": 0.24042052030563354, + "learning_rate": 3.6994936792600355e-09, + "loss": 0.0085, + "num_input_tokens_seen": 38177616, + "step": 180905 + }, + { + "epoch": 19.9020902090209, + "grad_norm": 3.025522470474243, + "learning_rate": 3.658320895175149e-09, + "loss": 0.0561, + "num_input_tokens_seen": 38178704, + "step": 180910 + }, + { + "epoch": 19.902640264026402, + "grad_norm": 0.10182254016399384, + "learning_rate": 3.6173784903753562e-09, + "loss": 0.0954, + "num_input_tokens_seen": 38179760, + "step": 180915 + }, + { + "epoch": 19.903190319031903, + "grad_norm": 0.012727040797472, + "learning_rate": 3.5766664652409078e-09, + "loss": 0.0015, + "num_input_tokens_seen": 38180848, + "step": 180920 + }, + { + "epoch": 19.903740374037405, + "grad_norm": 0.3710266053676605, + "learning_rate": 3.5361848201437285e-09, + "loss": 0.0372, + "num_input_tokens_seen": 38181904, + "step": 180925 + }, + { + "epoch": 19.904290429042906, + "grad_norm": 0.028582824394106865, + "learning_rate": 3.495933555461295e-09, + "loss": 0.005, + "num_input_tokens_seen": 38182864, + "step": 180930 + }, + { + "epoch": 19.904840484048403, + "grad_norm": 0.42494386434555054, + "learning_rate": 3.45591267155998e-09, + "loss": 0.056, + "num_input_tokens_seen": 38183856, + "step": 180935 + }, + { + "epoch": 19.905390539053904, + "grad_norm": 0.14131586253643036, + "learning_rate": 3.416122168814484e-09, + "loss": 0.0069, + "num_input_tokens_seen": 38184880, + "step": 180940 + }, + { + "epoch": 19.905940594059405, + "grad_norm": 0.003429053584113717, + "learning_rate": 3.3765620475856298e-09, + "loss": 0.0436, + "num_input_tokens_seen": 38185904, + "step": 180945 + }, + { + "epoch": 19.906490649064907, + "grad_norm": 1.7169326543807983, + "learning_rate": 3.3372323082397905e-09, + "loss": 0.0174, + "num_input_tokens_seen": 38186896, + "step": 180950 + }, + { + "epoch": 19.907040704070408, + "grad_norm": 0.09214197099208832, + "learning_rate": 3.2981329511405644e-09, + "loss": 0.0058, + "num_input_tokens_seen": 38187984, + "step": 180955 + }, + { + "epoch": 19.90759075907591, + "grad_norm": 1.8405919075012207, + "learning_rate": 3.2592639766487742e-09, + "loss": 0.1017, + "num_input_tokens_seen": 38189040, + "step": 180960 + }, + { + "epoch": 19.90814081408141, + "grad_norm": 0.0074129351414740086, + "learning_rate": 3.2206253851224665e-09, + "loss": 0.0178, + "num_input_tokens_seen": 38190064, + "step": 180965 + }, + { + "epoch": 19.908690869086907, + "grad_norm": 0.0023225806653499603, + "learning_rate": 3.1822171769141374e-09, + "loss": 0.0041, + "num_input_tokens_seen": 38191120, + "step": 180970 + }, + { + "epoch": 19.90924092409241, + "grad_norm": 0.008588436990976334, + "learning_rate": 3.144039352384609e-09, + "loss": 0.0011, + "num_input_tokens_seen": 38192080, + "step": 180975 + }, + { + "epoch": 19.90979097909791, + "grad_norm": 0.036290399730205536, + "learning_rate": 3.1060919118780507e-09, + "loss": 0.0013, + "num_input_tokens_seen": 38193200, + "step": 180980 + }, + { + "epoch": 19.91034103410341, + "grad_norm": 0.6931440830230713, + "learning_rate": 3.068374855749734e-09, + "loss": 0.062, + "num_input_tokens_seen": 38194288, + "step": 180985 + }, + { + "epoch": 19.91089108910891, + "grad_norm": 0.028018958866596222, + "learning_rate": 3.030888184346603e-09, + "loss": 0.0705, + "num_input_tokens_seen": 38195440, + "step": 180990 + }, + { + "epoch": 19.911441144114413, + "grad_norm": 0.00362533051520586, + "learning_rate": 2.9936318980128275e-09, + "loss": 0.0032, + "num_input_tokens_seen": 38196432, + "step": 180995 + }, + { + "epoch": 19.91199119911991, + "grad_norm": 0.04266282916069031, + "learning_rate": 2.956605997089801e-09, + "loss": 0.0803, + "num_input_tokens_seen": 38197456, + "step": 181000 + }, + { + "epoch": 19.91254125412541, + "grad_norm": 0.01991504617035389, + "learning_rate": 2.919810481921692e-09, + "loss": 0.002, + "num_input_tokens_seen": 38198448, + "step": 181005 + }, + { + "epoch": 19.913091309130913, + "grad_norm": 0.017051909118890762, + "learning_rate": 2.8832453528471194e-09, + "loss": 0.0064, + "num_input_tokens_seen": 38199504, + "step": 181010 + }, + { + "epoch": 19.913641364136414, + "grad_norm": 0.22307056188583374, + "learning_rate": 2.8469106102047004e-09, + "loss": 0.0028, + "num_input_tokens_seen": 38200592, + "step": 181015 + }, + { + "epoch": 19.914191419141915, + "grad_norm": 0.026821637526154518, + "learning_rate": 2.810806254324727e-09, + "loss": 0.0031, + "num_input_tokens_seen": 38201616, + "step": 181020 + }, + { + "epoch": 19.914741474147416, + "grad_norm": 0.049605611711740494, + "learning_rate": 2.774932285545817e-09, + "loss": 0.0007, + "num_input_tokens_seen": 38202576, + "step": 181025 + }, + { + "epoch": 19.915291529152917, + "grad_norm": 0.03178403154015541, + "learning_rate": 2.7392887041927105e-09, + "loss": 0.0006, + "num_input_tokens_seen": 38203696, + "step": 181030 + }, + { + "epoch": 19.915841584158414, + "grad_norm": 2.147088050842285, + "learning_rate": 2.7038755105984747e-09, + "loss": 0.0445, + "num_input_tokens_seen": 38204784, + "step": 181035 + }, + { + "epoch": 19.916391639163916, + "grad_norm": 0.02259857766330242, + "learning_rate": 2.6686927050878497e-09, + "loss": 0.0316, + "num_input_tokens_seen": 38205840, + "step": 181040 + }, + { + "epoch": 19.916941694169417, + "grad_norm": 1.4125927686691284, + "learning_rate": 2.6337402879828e-09, + "loss": 0.019, + "num_input_tokens_seen": 38206896, + "step": 181045 + }, + { + "epoch": 19.917491749174918, + "grad_norm": 0.05347157642245293, + "learning_rate": 2.5990182596080658e-09, + "loss": 0.0099, + "num_input_tokens_seen": 38207888, + "step": 181050 + }, + { + "epoch": 19.91804180418042, + "grad_norm": 0.12550216913223267, + "learning_rate": 2.564526620285612e-09, + "loss": 0.0033, + "num_input_tokens_seen": 38208944, + "step": 181055 + }, + { + "epoch": 19.91859185918592, + "grad_norm": 0.014899970032274723, + "learning_rate": 2.530265370329077e-09, + "loss": 0.0026, + "num_input_tokens_seen": 38209968, + "step": 181060 + }, + { + "epoch": 19.919141914191417, + "grad_norm": 0.014932435005903244, + "learning_rate": 2.496234510057649e-09, + "loss": 0.0498, + "num_input_tokens_seen": 38210992, + "step": 181065 + }, + { + "epoch": 19.91969196919692, + "grad_norm": 0.04447920620441437, + "learning_rate": 2.462434039782191e-09, + "loss": 0.0016, + "num_input_tokens_seen": 38212080, + "step": 181070 + }, + { + "epoch": 19.92024202420242, + "grad_norm": 0.0354950875043869, + "learning_rate": 2.4288639598163408e-09, + "loss": 0.0012, + "num_input_tokens_seen": 38213136, + "step": 181075 + }, + { + "epoch": 19.92079207920792, + "grad_norm": 0.007037981413304806, + "learning_rate": 2.3955242704681857e-09, + "loss": 0.0178, + "num_input_tokens_seen": 38214224, + "step": 181080 + }, + { + "epoch": 19.921342134213422, + "grad_norm": 0.033987052738666534, + "learning_rate": 2.3624149720458124e-09, + "loss": 0.0023, + "num_input_tokens_seen": 38215248, + "step": 181085 + }, + { + "epoch": 19.921892189218923, + "grad_norm": 0.010009386576712132, + "learning_rate": 2.329536064851756e-09, + "loss": 0.0045, + "num_input_tokens_seen": 38216272, + "step": 181090 + }, + { + "epoch": 19.922442244224424, + "grad_norm": 0.055282242596149445, + "learning_rate": 2.2968875491941044e-09, + "loss": 0.0065, + "num_input_tokens_seen": 38217360, + "step": 181095 + }, + { + "epoch": 19.92299229922992, + "grad_norm": 0.1791953444480896, + "learning_rate": 2.264469425369842e-09, + "loss": 0.0714, + "num_input_tokens_seen": 38218416, + "step": 181100 + }, + { + "epoch": 19.923542354235423, + "grad_norm": 0.03756440803408623, + "learning_rate": 2.232281693678728e-09, + "loss": 0.0015, + "num_input_tokens_seen": 38219504, + "step": 181105 + }, + { + "epoch": 19.924092409240924, + "grad_norm": 0.050138942897319794, + "learning_rate": 2.2003243544177486e-09, + "loss": 0.0197, + "num_input_tokens_seen": 38220592, + "step": 181110 + }, + { + "epoch": 19.924642464246425, + "grad_norm": 0.06946473568677902, + "learning_rate": 2.1685974078811123e-09, + "loss": 0.0062, + "num_input_tokens_seen": 38221712, + "step": 181115 + }, + { + "epoch": 19.925192519251926, + "grad_norm": 0.0020545318257063627, + "learning_rate": 2.1371008543630277e-09, + "loss": 0.0013, + "num_input_tokens_seen": 38222736, + "step": 181120 + }, + { + "epoch": 19.925742574257427, + "grad_norm": 0.11466039717197418, + "learning_rate": 2.105834694149378e-09, + "loss": 0.0042, + "num_input_tokens_seen": 38223824, + "step": 181125 + }, + { + "epoch": 19.926292629262925, + "grad_norm": 2.052704095840454, + "learning_rate": 2.0747989275343714e-09, + "loss": 0.0703, + "num_input_tokens_seen": 38224912, + "step": 181130 + }, + { + "epoch": 19.926842684268426, + "grad_norm": 0.009663254953920841, + "learning_rate": 2.0439935547983403e-09, + "loss": 0.0064, + "num_input_tokens_seen": 38225936, + "step": 181135 + }, + { + "epoch": 19.927392739273927, + "grad_norm": 0.22823235392570496, + "learning_rate": 2.013418576227166e-09, + "loss": 0.0125, + "num_input_tokens_seen": 38227056, + "step": 181140 + }, + { + "epoch": 19.927942794279428, + "grad_norm": 0.008926118724048138, + "learning_rate": 1.9830739921039567e-09, + "loss": 0.0019, + "num_input_tokens_seen": 38228080, + "step": 181145 + }, + { + "epoch": 19.92849284928493, + "grad_norm": 0.019937073811888695, + "learning_rate": 1.9529598027090423e-09, + "loss": 0.0493, + "num_input_tokens_seen": 38229104, + "step": 181150 + }, + { + "epoch": 19.92904290429043, + "grad_norm": 0.28696203231811523, + "learning_rate": 1.9230760083172038e-09, + "loss": 0.0023, + "num_input_tokens_seen": 38230160, + "step": 181155 + }, + { + "epoch": 19.92959295929593, + "grad_norm": 0.08025426417589188, + "learning_rate": 1.893422609205997e-09, + "loss": 0.0024, + "num_input_tokens_seen": 38231248, + "step": 181160 + }, + { + "epoch": 19.93014301430143, + "grad_norm": 0.17517726123332977, + "learning_rate": 1.8639996056474264e-09, + "loss": 0.002, + "num_input_tokens_seen": 38232304, + "step": 181165 + }, + { + "epoch": 19.93069306930693, + "grad_norm": 0.05267655849456787, + "learning_rate": 1.8348069979107207e-09, + "loss": 0.0018, + "num_input_tokens_seen": 38233296, + "step": 181170 + }, + { + "epoch": 19.93124312431243, + "grad_norm": 0.048565447330474854, + "learning_rate": 1.8058447862706606e-09, + "loss": 0.0153, + "num_input_tokens_seen": 38234416, + "step": 181175 + }, + { + "epoch": 19.931793179317932, + "grad_norm": 0.13483083248138428, + "learning_rate": 1.7771129709881484e-09, + "loss": 0.0074, + "num_input_tokens_seen": 38235504, + "step": 181180 + }, + { + "epoch": 19.932343234323433, + "grad_norm": 0.006858713459223509, + "learning_rate": 1.748611552332413e-09, + "loss": 0.0661, + "num_input_tokens_seen": 38236688, + "step": 181185 + }, + { + "epoch": 19.932893289328934, + "grad_norm": 0.0045127272605896, + "learning_rate": 1.7203405305643572e-09, + "loss": 0.0039, + "num_input_tokens_seen": 38237712, + "step": 181190 + }, + { + "epoch": 19.933443344334435, + "grad_norm": 0.09362732619047165, + "learning_rate": 1.692299905944883e-09, + "loss": 0.0386, + "num_input_tokens_seen": 38238800, + "step": 181195 + }, + { + "epoch": 19.933993399339933, + "grad_norm": 0.01120590977370739, + "learning_rate": 1.6644896787293417e-09, + "loss": 0.0012, + "num_input_tokens_seen": 38239792, + "step": 181200 + }, + { + "epoch": 19.934543454345434, + "grad_norm": 0.009773052297532558, + "learning_rate": 1.6369098491814116e-09, + "loss": 0.0025, + "num_input_tokens_seen": 38240848, + "step": 181205 + }, + { + "epoch": 19.935093509350935, + "grad_norm": 0.00982698705047369, + "learning_rate": 1.6095604175481171e-09, + "loss": 0.0039, + "num_input_tokens_seen": 38241872, + "step": 181210 + }, + { + "epoch": 19.935643564356436, + "grad_norm": 0.026438910514116287, + "learning_rate": 1.582441384082034e-09, + "loss": 0.0336, + "num_input_tokens_seen": 38242832, + "step": 181215 + }, + { + "epoch": 19.936193619361937, + "grad_norm": 0.05608294904232025, + "learning_rate": 1.5555527490385135e-09, + "loss": 0.0079, + "num_input_tokens_seen": 38243888, + "step": 181220 + }, + { + "epoch": 19.936743674367438, + "grad_norm": 0.03253151476383209, + "learning_rate": 1.5288945126618049e-09, + "loss": 0.0201, + "num_input_tokens_seen": 38244976, + "step": 181225 + }, + { + "epoch": 19.937293729372936, + "grad_norm": 0.0801050215959549, + "learning_rate": 1.5024666751961568e-09, + "loss": 0.0191, + "num_input_tokens_seen": 38246064, + "step": 181230 + }, + { + "epoch": 19.937843784378437, + "grad_norm": 0.017926441505551338, + "learning_rate": 1.4762692368858189e-09, + "loss": 0.0555, + "num_input_tokens_seen": 38247184, + "step": 181235 + }, + { + "epoch": 19.938393839383938, + "grad_norm": 0.12080667912960052, + "learning_rate": 1.4503021979750397e-09, + "loss": 0.0952, + "num_input_tokens_seen": 38248176, + "step": 181240 + }, + { + "epoch": 19.93894389438944, + "grad_norm": 0.03258289769291878, + "learning_rate": 1.4245655587025174e-09, + "loss": 0.0038, + "num_input_tokens_seen": 38249296, + "step": 181245 + }, + { + "epoch": 19.93949394939494, + "grad_norm": 0.015942741185426712, + "learning_rate": 1.3990593193041746e-09, + "loss": 0.001, + "num_input_tokens_seen": 38250320, + "step": 181250 + }, + { + "epoch": 19.94004400440044, + "grad_norm": 0.08424760401248932, + "learning_rate": 1.3737834800131577e-09, + "loss": 0.0065, + "num_input_tokens_seen": 38251408, + "step": 181255 + }, + { + "epoch": 19.94059405940594, + "grad_norm": 0.010391044430434704, + "learning_rate": 1.3487380410653894e-09, + "loss": 0.0288, + "num_input_tokens_seen": 38252496, + "step": 181260 + }, + { + "epoch": 19.94114411441144, + "grad_norm": 0.01381641998887062, + "learning_rate": 1.3239230026912408e-09, + "loss": 0.0572, + "num_input_tokens_seen": 38253552, + "step": 181265 + }, + { + "epoch": 19.94169416941694, + "grad_norm": 0.030981779098510742, + "learning_rate": 1.2993383651183078e-09, + "loss": 0.0401, + "num_input_tokens_seen": 38254544, + "step": 181270 + }, + { + "epoch": 19.942244224422442, + "grad_norm": 0.011223521083593369, + "learning_rate": 1.274984128574186e-09, + "loss": 0.0028, + "num_input_tokens_seen": 38255632, + "step": 181275 + }, + { + "epoch": 19.942794279427943, + "grad_norm": 0.01124102808535099, + "learning_rate": 1.2508602932836955e-09, + "loss": 0.0437, + "num_input_tokens_seen": 38256752, + "step": 181280 + }, + { + "epoch": 19.943344334433444, + "grad_norm": 0.01970576122403145, + "learning_rate": 1.2269668594661054e-09, + "loss": 0.0478, + "num_input_tokens_seen": 38257744, + "step": 181285 + }, + { + "epoch": 19.943894389438945, + "grad_norm": 0.09854497015476227, + "learning_rate": 1.2033038273462361e-09, + "loss": 0.0873, + "num_input_tokens_seen": 38258832, + "step": 181290 + }, + { + "epoch": 19.944444444444443, + "grad_norm": 0.011519292369484901, + "learning_rate": 1.179871197137805e-09, + "loss": 0.1022, + "num_input_tokens_seen": 38259984, + "step": 181295 + }, + { + "epoch": 19.944994499449944, + "grad_norm": 0.03659772872924805, + "learning_rate": 1.1566689690600818e-09, + "loss": 0.0097, + "num_input_tokens_seen": 38261136, + "step": 181300 + }, + { + "epoch": 19.945544554455445, + "grad_norm": 1.6242386102676392, + "learning_rate": 1.133697143326784e-09, + "loss": 0.0548, + "num_input_tokens_seen": 38262160, + "step": 181305 + }, + { + "epoch": 19.946094609460946, + "grad_norm": 0.02718408964574337, + "learning_rate": 1.110955720148854e-09, + "loss": 0.0711, + "num_input_tokens_seen": 38263184, + "step": 181310 + }, + { + "epoch": 19.946644664466447, + "grad_norm": 3.3038058280944824, + "learning_rate": 1.0884446997344588e-09, + "loss": 0.0868, + "num_input_tokens_seen": 38264208, + "step": 181315 + }, + { + "epoch": 19.94719471947195, + "grad_norm": 0.027361435815691948, + "learning_rate": 1.066164082291765e-09, + "loss": 0.0299, + "num_input_tokens_seen": 38265328, + "step": 181320 + }, + { + "epoch": 19.94774477447745, + "grad_norm": 0.18878473341464996, + "learning_rate": 1.0441138680261641e-09, + "loss": 0.008, + "num_input_tokens_seen": 38266384, + "step": 181325 + }, + { + "epoch": 19.948294829482947, + "grad_norm": 0.009830516763031483, + "learning_rate": 1.0222940571430473e-09, + "loss": 0.0045, + "num_input_tokens_seen": 38267472, + "step": 181330 + }, + { + "epoch": 19.948844884488448, + "grad_norm": 0.029991451650857925, + "learning_rate": 1.0007046498422545e-09, + "loss": 0.004, + "num_input_tokens_seen": 38268528, + "step": 181335 + }, + { + "epoch": 19.94939493949395, + "grad_norm": 0.06195143610239029, + "learning_rate": 9.793456463208506e-10, + "loss": 0.1021, + "num_input_tokens_seen": 38269584, + "step": 181340 + }, + { + "epoch": 19.94994499449945, + "grad_norm": 0.09105410426855087, + "learning_rate": 9.582170467759e-10, + "loss": 0.0501, + "num_input_tokens_seen": 38270608, + "step": 181345 + }, + { + "epoch": 19.95049504950495, + "grad_norm": 0.029600132256746292, + "learning_rate": 9.373188514044673e-10, + "loss": 0.0314, + "num_input_tokens_seen": 38271696, + "step": 181350 + }, + { + "epoch": 19.951045104510452, + "grad_norm": 0.009926192462444305, + "learning_rate": 9.16651060398066e-10, + "loss": 0.1984, + "num_input_tokens_seen": 38272720, + "step": 181355 + }, + { + "epoch": 19.95159515951595, + "grad_norm": 0.01778382621705532, + "learning_rate": 8.962136739482096e-10, + "loss": 0.0015, + "num_input_tokens_seen": 38273776, + "step": 181360 + }, + { + "epoch": 19.95214521452145, + "grad_norm": 3.342423439025879, + "learning_rate": 8.760066922408605e-10, + "loss": 0.0693, + "num_input_tokens_seen": 38274864, + "step": 181365 + }, + { + "epoch": 19.952695269526952, + "grad_norm": 0.024188069626688957, + "learning_rate": 8.560301154647566e-10, + "loss": 0.0439, + "num_input_tokens_seen": 38275920, + "step": 181370 + }, + { + "epoch": 19.953245324532453, + "grad_norm": 0.02348725125193596, + "learning_rate": 8.362839438030845e-10, + "loss": 0.0016, + "num_input_tokens_seen": 38276944, + "step": 181375 + }, + { + "epoch": 19.953795379537954, + "grad_norm": 5.89254093170166, + "learning_rate": 8.167681774362557e-10, + "loss": 0.1011, + "num_input_tokens_seen": 38278000, + "step": 181380 + }, + { + "epoch": 19.954345434543455, + "grad_norm": 1.2609471082687378, + "learning_rate": 7.974828165446813e-10, + "loss": 0.0073, + "num_input_tokens_seen": 38279056, + "step": 181385 + }, + { + "epoch": 19.954895489548957, + "grad_norm": 0.03912745416164398, + "learning_rate": 7.784278613087726e-10, + "loss": 0.0014, + "num_input_tokens_seen": 38280080, + "step": 181390 + }, + { + "epoch": 19.955445544554454, + "grad_norm": 2.240929126739502, + "learning_rate": 7.596033119033896e-10, + "loss": 0.0141, + "num_input_tokens_seen": 38281136, + "step": 181395 + }, + { + "epoch": 19.955995599559955, + "grad_norm": 0.04666389152407646, + "learning_rate": 7.410091684978415e-10, + "loss": 0.0007, + "num_input_tokens_seen": 38282192, + "step": 181400 + }, + { + "epoch": 19.956545654565456, + "grad_norm": 0.22625739872455597, + "learning_rate": 7.226454312669884e-10, + "loss": 0.0085, + "num_input_tokens_seen": 38283248, + "step": 181405 + }, + { + "epoch": 19.957095709570957, + "grad_norm": 0.05952238664031029, + "learning_rate": 7.045121003829148e-10, + "loss": 0.1179, + "num_input_tokens_seen": 38284336, + "step": 181410 + }, + { + "epoch": 19.95764576457646, + "grad_norm": 0.07669995725154877, + "learning_rate": 6.866091760066029e-10, + "loss": 0.002, + "num_input_tokens_seen": 38285424, + "step": 181415 + }, + { + "epoch": 19.95819581958196, + "grad_norm": 0.5743154287338257, + "learning_rate": 6.68936658307362e-10, + "loss": 0.0033, + "num_input_tokens_seen": 38286544, + "step": 181420 + }, + { + "epoch": 19.958745874587457, + "grad_norm": 0.009561481885612011, + "learning_rate": 6.514945474461743e-10, + "loss": 0.0027, + "num_input_tokens_seen": 38287600, + "step": 181425 + }, + { + "epoch": 19.959295929592958, + "grad_norm": 0.04955817759037018, + "learning_rate": 6.342828435840221e-10, + "loss": 0.0047, + "num_input_tokens_seen": 38288656, + "step": 181430 + }, + { + "epoch": 19.95984598459846, + "grad_norm": 0.0736289769411087, + "learning_rate": 6.173015468791121e-10, + "loss": 0.002, + "num_input_tokens_seen": 38289712, + "step": 181435 + }, + { + "epoch": 19.96039603960396, + "grad_norm": 0.08971286565065384, + "learning_rate": 6.005506574896513e-10, + "loss": 0.032, + "num_input_tokens_seen": 38290736, + "step": 181440 + }, + { + "epoch": 19.96094609460946, + "grad_norm": 0.052355777472257614, + "learning_rate": 5.840301755682953e-10, + "loss": 0.0669, + "num_input_tokens_seen": 38291856, + "step": 181445 + }, + { + "epoch": 19.961496149614963, + "grad_norm": 5.447445392608643, + "learning_rate": 5.677401012704753e-10, + "loss": 0.0531, + "num_input_tokens_seen": 38292944, + "step": 181450 + }, + { + "epoch": 19.962046204620464, + "grad_norm": 0.27284473180770874, + "learning_rate": 5.516804347405202e-10, + "loss": 0.0334, + "num_input_tokens_seen": 38293968, + "step": 181455 + }, + { + "epoch": 19.96259625962596, + "grad_norm": 0.018396949395537376, + "learning_rate": 5.358511761338614e-10, + "loss": 0.0104, + "num_input_tokens_seen": 38295120, + "step": 181460 + }, + { + "epoch": 19.963146314631462, + "grad_norm": 0.1841559261083603, + "learning_rate": 5.202523255892766e-10, + "loss": 0.0573, + "num_input_tokens_seen": 38296144, + "step": 181465 + }, + { + "epoch": 19.963696369636963, + "grad_norm": 0.03826036676764488, + "learning_rate": 5.048838832538705e-10, + "loss": 0.0036, + "num_input_tokens_seen": 38297200, + "step": 181470 + }, + { + "epoch": 19.964246424642464, + "grad_norm": 0.0370049811899662, + "learning_rate": 4.897458492691964e-10, + "loss": 0.0017, + "num_input_tokens_seen": 38298192, + "step": 181475 + }, + { + "epoch": 19.964796479647966, + "grad_norm": 0.0285929124802351, + "learning_rate": 4.748382237740323e-10, + "loss": 0.0051, + "num_input_tokens_seen": 38299248, + "step": 181480 + }, + { + "epoch": 19.965346534653467, + "grad_norm": 1.9354342222213745, + "learning_rate": 4.60161006907156e-10, + "loss": 0.022, + "num_input_tokens_seen": 38300272, + "step": 181485 + }, + { + "epoch": 19.965896589658964, + "grad_norm": 0.16970111429691315, + "learning_rate": 4.457141988045699e-10, + "loss": 0.0077, + "num_input_tokens_seen": 38301264, + "step": 181490 + }, + { + "epoch": 19.966446644664465, + "grad_norm": 0.007854213006794453, + "learning_rate": 4.314977995967251e-10, + "loss": 0.0011, + "num_input_tokens_seen": 38302352, + "step": 181495 + }, + { + "epoch": 19.966996699669966, + "grad_norm": 0.09945523738861084, + "learning_rate": 4.1751180941407285e-10, + "loss": 0.0023, + "num_input_tokens_seen": 38303408, + "step": 181500 + }, + { + "epoch": 19.967546754675467, + "grad_norm": 0.013778920285403728, + "learning_rate": 4.0375622838983996e-10, + "loss": 0.0005, + "num_input_tokens_seen": 38304496, + "step": 181505 + }, + { + "epoch": 19.96809680968097, + "grad_norm": 0.19540080428123474, + "learning_rate": 3.902310566461509e-10, + "loss": 0.0015, + "num_input_tokens_seen": 38305552, + "step": 181510 + }, + { + "epoch": 19.96864686468647, + "grad_norm": 0.2877771556377411, + "learning_rate": 3.769362943106813e-10, + "loss": 0.0481, + "num_input_tokens_seen": 38306608, + "step": 181515 + }, + { + "epoch": 19.96919691969197, + "grad_norm": 0.01657795161008835, + "learning_rate": 3.638719415055558e-10, + "loss": 0.0085, + "num_input_tokens_seen": 38307600, + "step": 181520 + }, + { + "epoch": 19.96974697469747, + "grad_norm": 0.042646534740924835, + "learning_rate": 3.510379983501233e-10, + "loss": 0.0023, + "num_input_tokens_seen": 38308656, + "step": 181525 + }, + { + "epoch": 19.97029702970297, + "grad_norm": 1.524890422821045, + "learning_rate": 3.384344649609572e-10, + "loss": 0.0552, + "num_input_tokens_seen": 38309648, + "step": 181530 + }, + { + "epoch": 19.97084708470847, + "grad_norm": 0.013381564058363438, + "learning_rate": 3.2606134146018206e-10, + "loss": 0.0425, + "num_input_tokens_seen": 38310704, + "step": 181535 + }, + { + "epoch": 19.97139713971397, + "grad_norm": 0.014400300569832325, + "learning_rate": 3.1391862795604465e-10, + "loss": 0.0183, + "num_input_tokens_seen": 38311792, + "step": 181540 + }, + { + "epoch": 19.971947194719473, + "grad_norm": 0.008648846298456192, + "learning_rate": 3.020063245623428e-10, + "loss": 0.0537, + "num_input_tokens_seen": 38312880, + "step": 181545 + }, + { + "epoch": 19.972497249724974, + "grad_norm": 0.05022032931447029, + "learning_rate": 2.903244313900988e-10, + "loss": 0.0038, + "num_input_tokens_seen": 38313872, + "step": 181550 + }, + { + "epoch": 19.97304730473047, + "grad_norm": 3.579906463623047, + "learning_rate": 2.7887294854478385e-10, + "loss": 0.0912, + "num_input_tokens_seen": 38314992, + "step": 181555 + }, + { + "epoch": 19.973597359735972, + "grad_norm": 0.41672760248184204, + "learning_rate": 2.676518761346447e-10, + "loss": 0.0038, + "num_input_tokens_seen": 38316016, + "step": 181560 + }, + { + "epoch": 19.974147414741473, + "grad_norm": 0.08447561413049698, + "learning_rate": 2.5666121425960144e-10, + "loss": 0.0017, + "num_input_tokens_seen": 38317072, + "step": 181565 + }, + { + "epoch": 19.974697469746975, + "grad_norm": 0.027157681062817574, + "learning_rate": 2.4590096302234966e-10, + "loss": 0.0504, + "num_input_tokens_seen": 38318128, + "step": 181570 + }, + { + "epoch": 19.975247524752476, + "grad_norm": 0.03524479269981384, + "learning_rate": 2.3537112252558503e-10, + "loss": 0.0022, + "num_input_tokens_seen": 38319184, + "step": 181575 + }, + { + "epoch": 19.975797579757977, + "grad_norm": 0.0057966215535998344, + "learning_rate": 2.2507169286090091e-10, + "loss": 0.0062, + "num_input_tokens_seen": 38320208, + "step": 181580 + }, + { + "epoch": 19.976347634763478, + "grad_norm": 0.0032055212650448084, + "learning_rate": 2.150026741282174e-10, + "loss": 0.0416, + "num_input_tokens_seen": 38321296, + "step": 181585 + }, + { + "epoch": 19.976897689768975, + "grad_norm": 0.009118671528995037, + "learning_rate": 2.0516406641635234e-10, + "loss": 0.0035, + "num_input_tokens_seen": 38322320, + "step": 181590 + }, + { + "epoch": 19.977447744774476, + "grad_norm": 0.08493728190660477, + "learning_rate": 1.955558698168991e-10, + "loss": 0.0722, + "num_input_tokens_seen": 38323440, + "step": 181595 + }, + { + "epoch": 19.977997799779978, + "grad_norm": 0.00807883683592081, + "learning_rate": 1.8617808441867556e-10, + "loss": 0.0011, + "num_input_tokens_seen": 38324464, + "step": 181600 + }, + { + "epoch": 19.97854785478548, + "grad_norm": 0.06211993470788002, + "learning_rate": 1.77030710307724e-10, + "loss": 0.0223, + "num_input_tokens_seen": 38325520, + "step": 181605 + }, + { + "epoch": 19.97909790979098, + "grad_norm": 0.004236430395394564, + "learning_rate": 1.681137475700867e-10, + "loss": 0.0045, + "num_input_tokens_seen": 38326576, + "step": 181610 + }, + { + "epoch": 19.97964796479648, + "grad_norm": 0.014502442441880703, + "learning_rate": 1.5942719628903034e-10, + "loss": 0.0039, + "num_input_tokens_seen": 38327664, + "step": 181615 + }, + { + "epoch": 19.980198019801982, + "grad_norm": 0.0689537301659584, + "learning_rate": 1.5097105653949507e-10, + "loss": 0.0011, + "num_input_tokens_seen": 38328688, + "step": 181620 + }, + { + "epoch": 19.98074807480748, + "grad_norm": 0.06757159531116486, + "learning_rate": 1.4274532840197198e-10, + "loss": 0.0033, + "num_input_tokens_seen": 38329744, + "step": 181625 + }, + { + "epoch": 19.98129812981298, + "grad_norm": 0.007636575493961573, + "learning_rate": 1.3475001195417668e-10, + "loss": 0.0044, + "num_input_tokens_seen": 38330832, + "step": 181630 + }, + { + "epoch": 19.98184818481848, + "grad_norm": 0.3697268068790436, + "learning_rate": 1.2698510726827372e-10, + "loss": 0.0078, + "num_input_tokens_seen": 38331856, + "step": 181635 + }, + { + "epoch": 19.982398239823983, + "grad_norm": 0.005251854192465544, + "learning_rate": 1.1945061441642758e-10, + "loss": 0.0036, + "num_input_tokens_seen": 38332880, + "step": 181640 + }, + { + "epoch": 19.982948294829484, + "grad_norm": 0.27651217579841614, + "learning_rate": 1.1214653346525162e-10, + "loss": 0.0031, + "num_input_tokens_seen": 38334032, + "step": 181645 + }, + { + "epoch": 19.983498349834985, + "grad_norm": 0.03079010359942913, + "learning_rate": 1.0507286448691033e-10, + "loss": 0.0045, + "num_input_tokens_seen": 38335024, + "step": 181650 + }, + { + "epoch": 19.984048404840483, + "grad_norm": 0.008370917290449142, + "learning_rate": 9.822960754246602e-11, + "loss": 0.0449, + "num_input_tokens_seen": 38336112, + "step": 181655 + }, + { + "epoch": 19.984598459845984, + "grad_norm": 0.07653186470270157, + "learning_rate": 9.161676269575647e-11, + "loss": 0.1017, + "num_input_tokens_seen": 38337104, + "step": 181660 + }, + { + "epoch": 19.985148514851485, + "grad_norm": 0.051878444850444794, + "learning_rate": 8.523433001061953e-11, + "loss": 0.0129, + "num_input_tokens_seen": 38338128, + "step": 181665 + }, + { + "epoch": 19.985698569856986, + "grad_norm": 0.08800959587097168, + "learning_rate": 7.908230954256634e-11, + "loss": 0.0008, + "num_input_tokens_seen": 38339152, + "step": 181670 + }, + { + "epoch": 19.986248624862487, + "grad_norm": 0.013711527921259403, + "learning_rate": 7.316070134988361e-11, + "loss": 0.0018, + "num_input_tokens_seen": 38340176, + "step": 181675 + }, + { + "epoch": 19.986798679867988, + "grad_norm": 0.010472258552908897, + "learning_rate": 6.74695054880825e-11, + "loss": 0.0015, + "num_input_tokens_seen": 38341264, + "step": 181680 + }, + { + "epoch": 19.98734873487349, + "grad_norm": 2.592087507247925, + "learning_rate": 6.200872200434748e-11, + "loss": 0.1126, + "num_input_tokens_seen": 38342320, + "step": 181685 + }, + { + "epoch": 19.987898789878987, + "grad_norm": 0.07054872810840607, + "learning_rate": 5.6778350956965264e-11, + "loss": 0.0013, + "num_input_tokens_seen": 38343344, + "step": 181690 + }, + { + "epoch": 19.988448844884488, + "grad_norm": 0.05015170946717262, + "learning_rate": 5.177839238756921e-11, + "loss": 0.0024, + "num_input_tokens_seen": 38344400, + "step": 181695 + }, + { + "epoch": 19.98899889988999, + "grad_norm": 0.04092788323760033, + "learning_rate": 4.700884634611935e-11, + "loss": 0.001, + "num_input_tokens_seen": 38345488, + "step": 181700 + }, + { + "epoch": 19.98954895489549, + "grad_norm": 0.03540594130754471, + "learning_rate": 4.2469712874249055e-11, + "loss": 0.0036, + "num_input_tokens_seen": 38346512, + "step": 181705 + }, + { + "epoch": 19.99009900990099, + "grad_norm": 0.018901366740465164, + "learning_rate": 3.8160992016367246e-11, + "loss": 0.0058, + "num_input_tokens_seen": 38347600, + "step": 181710 + }, + { + "epoch": 19.990649064906492, + "grad_norm": 0.040285542607307434, + "learning_rate": 3.4082683808556174e-11, + "loss": 0.0035, + "num_input_tokens_seen": 38348624, + "step": 181715 + }, + { + "epoch": 19.99119911991199, + "grad_norm": 3.6383252143859863, + "learning_rate": 3.023478829244919e-11, + "loss": 0.0501, + "num_input_tokens_seen": 38349744, + "step": 181720 + }, + { + "epoch": 19.99174917491749, + "grad_norm": 0.011348940432071686, + "learning_rate": 2.6617305501352996e-11, + "loss": 0.0451, + "num_input_tokens_seen": 38350864, + "step": 181725 + }, + { + "epoch": 19.992299229922992, + "grad_norm": 0.02065689116716385, + "learning_rate": 2.3230235465798722e-11, + "loss": 0.0348, + "num_input_tokens_seen": 38351888, + "step": 181730 + }, + { + "epoch": 19.992849284928493, + "grad_norm": 0.4706456661224365, + "learning_rate": 2.0073578221868616e-11, + "loss": 0.0804, + "num_input_tokens_seen": 38352880, + "step": 181735 + }, + { + "epoch": 19.993399339933994, + "grad_norm": 0.02487608790397644, + "learning_rate": 1.7147333797318254e-11, + "loss": 0.0016, + "num_input_tokens_seen": 38354000, + "step": 181740 + }, + { + "epoch": 19.993949394939495, + "grad_norm": 0.04501244053244591, + "learning_rate": 1.4451502219903213e-11, + "loss": 0.0396, + "num_input_tokens_seen": 38355088, + "step": 181745 + }, + { + "epoch": 19.994499449944996, + "grad_norm": 2.8428099155426025, + "learning_rate": 1.198608351182795e-11, + "loss": 0.0969, + "num_input_tokens_seen": 38356176, + "step": 181750 + }, + { + "epoch": 19.995049504950494, + "grad_norm": 0.48371806740760803, + "learning_rate": 9.75107769529693e-12, + "loss": 0.0057, + "num_input_tokens_seen": 38357264, + "step": 181755 + }, + { + "epoch": 19.995599559955995, + "grad_norm": 3.817279100418091, + "learning_rate": 7.746484795290165e-12, + "loss": 0.1512, + "num_input_tokens_seen": 38358352, + "step": 181760 + }, + { + "epoch": 19.996149614961496, + "grad_norm": 0.012047474272549152, + "learning_rate": 5.972304825685448e-12, + "loss": 0.003, + "num_input_tokens_seen": 38359440, + "step": 181765 + }, + { + "epoch": 19.996699669966997, + "grad_norm": 0.043859973549842834, + "learning_rate": 4.428537805911681e-12, + "loss": 0.0023, + "num_input_tokens_seen": 38360496, + "step": 181770 + }, + { + "epoch": 19.997249724972498, + "grad_norm": 0.049544792622327805, + "learning_rate": 3.1151837498466507e-12, + "loss": 0.0016, + "num_input_tokens_seen": 38361520, + "step": 181775 + }, + { + "epoch": 19.997799779978, + "grad_norm": 0.00633643614128232, + "learning_rate": 2.0322426685925876e-12, + "loss": 0.0628, + "num_input_tokens_seen": 38362544, + "step": 181780 + }, + { + "epoch": 19.998349834983497, + "grad_norm": 0.03399321064352989, + "learning_rate": 1.1797145704761648e-12, + "loss": 0.1232, + "num_input_tokens_seen": 38363632, + "step": 181785 + }, + { + "epoch": 19.998899889988998, + "grad_norm": 0.19066062569618225, + "learning_rate": 5.575994665996121e-13, + "loss": 0.0026, + "num_input_tokens_seen": 38364656, + "step": 181790 + }, + { + "epoch": 19.9994499449945, + "grad_norm": 0.034065090119838715, + "learning_rate": 1.6589736251404475e-13, + "loss": 0.0619, + "num_input_tokens_seen": 38365680, + "step": 181795 + }, + { + "epoch": 20.0, + "grad_norm": 0.041105449199676514, + "learning_rate": 4.60826099502043e-15, + "loss": 0.0013, + "num_input_tokens_seen": 38366624, + "step": 181800 + }, + { + "epoch": 20.0, + "eval_loss": 0.07789125293493271, + "eval_runtime": 37.0341, + "eval_samples_per_second": 109.089, + "eval_steps_per_second": 27.272, + "num_input_tokens_seen": 38366624, + "step": 181800 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 38366624, + "step": 181800, + "total_flos": 1.7276769973449523e+18, + "train_loss": 0.12781044730437766, + "train_runtime": 25867.5943, + "train_samples_per_second": 28.111, + "train_steps_per_second": 7.028 + } + ], + "logging_steps": 5, + "max_steps": 181800, + "num_input_tokens_seen": 38366624, + "num_train_epochs": 20, + "save_steps": 9090, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.7276769973449523e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}